author    maxim-yurchuk <maxim-yurchuk@yandex-team.com>  2025-02-11 13:26:52 +0300
committer maxim-yurchuk <maxim-yurchuk@yandex-team.com>  2025-02-11 13:57:59 +0300
commit    f895bba65827952ed934b2b46f9a45e30a191fd2 (patch)
tree      03260c906d9ec41cdc03e2a496b15d407459cec0 /contrib/python/pandas/py3
parent    5f7060466f7b9707818c2091e1a25c14f33c3474 (diff)
download  ydb-f895bba65827952ed934b2b46f9a45e30a191fd2.tar.gz
Remove deps on pandas
<https://github.com/ydb-platform/ydb/pull/14418> <https://github.com/ydb-platform/ydb/pull/14419> -- analogous changes in GitHub. I want to merge this bypassing the sync, to see whether pandas gets removed from our GitHub repo via piglet. commit_hash:abca127aa37d4dbb94b07e1e18cdb8eb5b711860
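
For context, "removing a dep on pandas" in this repository normally means dropping the corresponding PEERDIR from the consuming project's ya.make, after which the vendored tree under contrib/python/pandas/py3 (deleted below) is no longer reachable from any build target. The snippet is a hypothetical illustration of such an edit, not a quote from this commit; the module names are placeholders, and the actual edits are in the PRs linked above.

```
 PY3_LIBRARY()

 PEERDIR(
     contrib/python/numpy
-    contrib/python/pandas
 )

 END()
```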
Diffstat (limited to 'contrib/python/pandas/py3')
-rw-r--r--contrib/python/pandas/py3/.dist-info/METADATA331
-rw-r--r--contrib/python/pandas/py3/.dist-info/entry_points.txt2
-rw-r--r--contrib/python/pandas/py3/.dist-info/top_level.txt1
-rw-r--r--contrib/python/pandas/py3/.yandex_meta/yamaker.yaml17
-rw-r--r--contrib/python/pandas/py3/AUTHORS.md56
-rw-r--r--contrib/python/pandas/py3/LICENSE31
-rw-r--r--contrib/python/pandas/py3/LICENSES/DATEUTIL_LICENSE54
-rw-r--r--contrib/python/pandas/py3/LICENSES/HAVEN_LICENSE21
-rw-r--r--contrib/python/pandas/py3/LICENSES/HAVEN_MIT32
-rw-r--r--contrib/python/pandas/py3/LICENSES/KLIB_LICENSE23
-rw-r--r--contrib/python/pandas/py3/LICENSES/MUSL_LICENSE132
-rw-r--r--contrib/python/pandas/py3/LICENSES/NUMPY_LICENSE30
-rw-r--r--contrib/python/pandas/py3/LICENSES/OTHER75
-rw-r--r--contrib/python/pandas/py3/LICENSES/PACKAGING_LICENSE202
-rw-r--r--contrib/python/pandas/py3/LICENSES/PSF_LICENSE279
-rw-r--r--contrib/python/pandas/py3/LICENSES/PYUPGRADE_LICENSE19
-rw-r--r--contrib/python/pandas/py3/LICENSES/SAS7BDAT_LICENSE19
-rw-r--r--contrib/python/pandas/py3/LICENSES/SCIPY_LICENSE31
-rw-r--r--contrib/python/pandas/py3/LICENSES/ULTRAJSON_LICENSE34
-rw-r--r--contrib/python/pandas/py3/LICENSES/XARRAY_LICENSE195
-rw-r--r--contrib/python/pandas/py3/README.md170
-rw-r--r--contrib/python/pandas/py3/pandas/__init__.py346
-rw-r--r--contrib/python/pandas/py3/pandas/_config/__init__.py40
-rw-r--r--contrib/python/pandas/py3/pandas/_config/config.py909
-rw-r--r--contrib/python/pandas/py3/pandas/_config/dates.py25
-rw-r--r--contrib/python/pandas/py3/pandas/_config/display.py62
-rw-r--r--contrib/python/pandas/py3/pandas/_config/localization.py169
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/__init__.py22
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/algos.pxd22
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/algos.pyi420
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/algos.pyx1536
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi99
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in73
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi2162
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in222
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/arrays.pxd11
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/arrays.pyi34
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/arrays.pyx183
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/dtypes.pxd36
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/groupby.pyi191
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/groupby.pyx1884
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashing.pyi9
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashing.pyx194
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashtable.pxd189
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashtable.pyi251
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashtable.pyx125
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi7291
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in1506
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi2755
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in484
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/index.pyi105
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/index.pyx1280
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi381
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in78
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/indexing.pyi17
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/indexing.pyx28
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/internals.pyi102
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/internals.pyx920
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/interval.pyi174
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/interval.pyx650
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi2074
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in434
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/join.pyi78
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/join.pyx897
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/json.pyi23
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/khash.pxd129
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi209
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in44
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/lib.pxd6
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/lib.pyi250
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/lib.pyx3059
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/missing.pxd20
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/missing.pyi17
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/missing.pyx513
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/ops.pyi50
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/ops.pyx310
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyi5
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyx121
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/parsers.pyi75
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/parsers.pyx2127
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/properties.pyi27
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/properties.pyx69
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/reduction.pyi6
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/reduction.pyx33
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/reshape.pyi16
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/reshape.pyx138
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/sparse.pyi49
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/sparse.pyx733
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi5979
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in313
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/headers/portable.h18
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/inline_helper.h27
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/klib/khash.h719
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h446
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/parse_helper.h100
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/parser/io.c107
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/parser/io.h34
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.c2085
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.h236
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/skiplist.h300
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajson.h317
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsondec.c1208
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsonenc.c1207
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/python/JSONtoObj.c520
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c163
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.h39
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/python/objToJSON.c2130
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/python/ujson.c451
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/src/ujson/python/version.h43
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/testing.pyi12
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/testing.pyx219
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslib.pyi32
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslib.pyx715
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/__init__.py85
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/base.pxd5
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/base.pyx12
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pxd20
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyi12
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyx310
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pxd62
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyi14
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyx779
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pxd106
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyi81
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyx438
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyi62
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyx792
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pxd18
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyi132
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyx1245
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pxd132
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyi21
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyx629
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pxd12
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyi279
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyx4595
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pxd14
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyi38
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyx1189
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/period.pxd7
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyi127
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyx2708
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.c1093
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.h102
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c1150
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h111
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pxd4
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyi13
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyx700
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pxd28
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyi163
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyx2171
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pxd36
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyi228
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyx2382
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pxd23
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyi21
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyx448
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pxd39
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyi21
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyx816
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/util.pxd226
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyi43
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyx379
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/util.pxd17
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/window/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyi127
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyx1953
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/window/indexers.pyi12
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/window/indexers.pyx149
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/writers.pyi20
-rw-r--r--contrib/python/pandas/py3/pandas/_libs/writers.pyx173
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/__init__.py1168
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/_hypothesis.py89
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/_io.py435
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/_random.py29
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/_warnings.py216
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/asserters.py1378
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/compat.py24
-rw-r--r--contrib/python/pandas/py3/pandas/_testing/contexts.py219
-rw-r--r--contrib/python/pandas/py3/pandas/_typing.py373
-rw-r--r--contrib/python/pandas/py3/pandas/_version.py21
-rw-r--r--contrib/python/pandas/py3/pandas/api/__init__.py14
-rw-r--r--contrib/python/pandas/py3/pandas/api/extensions/__init__.py33
-rw-r--r--contrib/python/pandas/py3/pandas/api/indexers/__init__.py17
-rw-r--r--contrib/python/pandas/py3/pandas/api/interchange/__init__.py8
-rw-r--r--contrib/python/pandas/py3/pandas/api/types/__init__.py23
-rw-r--r--contrib/python/pandas/py3/pandas/arrays/__init__.py36
-rw-r--r--contrib/python/pandas/py3/pandas/compat/__init__.py169
-rw-r--r--contrib/python/pandas/py3/pandas/compat/_constants.py27
-rw-r--r--contrib/python/pandas/py3/pandas/compat/_optional.py173
-rw-r--r--contrib/python/pandas/py3/pandas/compat/compressors.py69
-rw-r--r--contrib/python/pandas/py3/pandas/compat/numpy/__init__.py36
-rw-r--r--contrib/python/pandas/py3/pandas/compat/numpy/function.py391
-rw-r--r--contrib/python/pandas/py3/pandas/compat/pickle_compat.py249
-rw-r--r--contrib/python/pandas/py3/pandas/compat/pyarrow.py22
-rw-r--r--contrib/python/pandas/py3/pandas/core/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/executor.py59
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/kernels/__init__.py6
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/kernels/mean_.py150
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/kernels/min_max_.py69
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/kernels/shared.py25
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/kernels/sum_.py138
-rw-r--r--contrib/python/pandas/py3/pandas/core/_numba/kernels/var_.py157
-rw-r--r--contrib/python/pandas/py3/pandas/core/accessor.py340
-rw-r--r--contrib/python/pandas/py3/pandas/core/algorithms.py1672
-rw-r--r--contrib/python/pandas/py3/pandas/core/api.py140
-rw-r--r--contrib/python/pandas/py3/pandas/core/apply.py1502
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/__init__.py9
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/datetimelike_accumulations.py67
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/masked_accumulations.py92
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/masked_reductions.py192
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/putmask.py152
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/quantile.py224
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/replace.py150
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/take.py594
-rw-r--r--contrib/python/pandas/py3/pandas/core/array_algos/transforms.py42
-rw-r--r--contrib/python/pandas/py3/pandas/core/arraylike.py527
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/__init__.py43
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/_mixins.py496
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/_ranges.py209
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/arrow/__init__.py4
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/arrow/_arrow_utils.py61
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/arrow/array.py2206
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/arrow/dtype.py312
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/arrow/extension_types.py111
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/base.py1873
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/boolean.py394
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/categorical.py2604
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/datetimelike.py2267
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/datetimes.py2595
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/floating.py159
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/integer.py220
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/interval.py1796
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/masked.py1391
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/numeric.py291
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/numpy_.py476
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/period.py1148
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/sparse/__init__.py21
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/sparse/accessor.py386
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/sparse/array.py1892
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/sparse/dtype.py426
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/sparse/scipy_sparse.py208
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/string_.py608
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/string_arrow.py412
-rw-r--r--contrib/python/pandas/py3/pandas/core/arrays/timedeltas.py1062
-rw-r--r--contrib/python/pandas/py3/pandas/core/base.py1357
-rw-r--r--contrib/python/pandas/py3/pandas/core/common.py653
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/align.py213
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/api.py2
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/check.py12
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/common.py48
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/engines.py143
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/eval.py413
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/expr.py840
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/expressions.py283
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/ops.py620
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/parsing.py195
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/pytables.py641
-rw-r--r--contrib/python/pandas/py3/pandas/core/computation/scope.py357
-rw-r--r--contrib/python/pandas/py3/pandas/core/config_init.py883
-rw-r--r--contrib/python/pandas/py3/pandas/core/construction.py767
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/api.py85
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/astype.py306
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/base.py528
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/cast.py1921
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/common.py1792
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/concat.py323
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/dtypes.py1478
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/generic.py147
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/inference.py431
-rw-r--r--contrib/python/pandas/py3/pandas/core/dtypes/missing.py761
-rw-r--r--contrib/python/pandas/py3/pandas/core/flags.py115
-rw-r--r--contrib/python/pandas/py3/pandas/core/frame.py11620
-rw-r--r--contrib/python/pandas/py3/pandas/core/generic.py12604
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/__init__.py15
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/base.py118
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/categorical.py87
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/generic.py2651
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/groupby.py4292
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/grouper.py1044
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/indexing.py303
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/numba_.py179
-rw-r--r--contrib/python/pandas/py3/pandas/core/groupby/ops.py1278
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexers/__init__.py31
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexers/objects.py390
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexers/utils.py555
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/accessors.py580
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/api.py369
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/base.py7243
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/category.py486
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/datetimelike.py787
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/datetimes.py1064
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/extension.py192
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/frozen.py117
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/interval.py1137
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/multi.py3918
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/period.py547
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/range.py1037
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexes/timedeltas.py315
-rw-r--r--contrib/python/pandas/py3/pandas/core/indexing.py2629
-rw-r--r--contrib/python/pandas/py3/pandas/core/interchange/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/interchange/buffer.py77
-rw-r--r--contrib/python/pandas/py3/pandas/core/interchange/column.py377
-rw-r--r--contrib/python/pandas/py3/pandas/core/interchange/dataframe.py111
-rw-r--r--contrib/python/pandas/py3/pandas/core/interchange/dataframe_protocol.py460
-rw-r--r--contrib/python/pandas/py3/pandas/core/interchange/from_dataframe.py499
-rw-r--r--contrib/python/pandas/py3/pandas/core/interchange/utils.py92
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/__init__.py40
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/api.py97
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/array_manager.py1361
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/base.py224
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/blocks.py2607
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/concat.py791
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/construction.py1069
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/managers.py2343
-rw-r--r--contrib/python/pandas/py3/pandas/core/internals/ops.py147
-rw-r--r--contrib/python/pandas/py3/pandas/core/methods/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/methods/describe.py408
-rw-r--r--contrib/python/pandas/py3/pandas/core/methods/selectn.py262
-rw-r--r--contrib/python/pandas/py3/pandas/core/methods/to_dict.py207
-rw-r--r--contrib/python/pandas/py3/pandas/core/missing.py1030
-rw-r--r--contrib/python/pandas/py3/pandas/core/nanops.py1767
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/__init__.py535
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/array_ops.py544
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/common.py151
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/dispatch.py26
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/docstrings.py765
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/invalid.py58
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/mask_ops.py189
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/methods.py124
-rw-r--r--contrib/python/pandas/py3/pandas/core/ops/missing.py180
-rw-r--r--contrib/python/pandas/py3/pandas/core/resample.py2302
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/api.py41
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/concat.py823
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/encoding.py533
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/melt.py540
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/merge.py2645
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/pivot.py885
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/reshape.py841
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/tile.py651
-rw-r--r--contrib/python/pandas/py3/pandas/core/reshape/util.py82
-rw-r--r--contrib/python/pandas/py3/pandas/core/roperator.py62
-rw-r--r--contrib/python/pandas/py3/pandas/core/sample.py153
-rw-r--r--contrib/python/pandas/py3/pandas/core/series.py6118
-rw-r--r--contrib/python/pandas/py3/pandas/core/shared_docs.py894
-rw-r--r--contrib/python/pandas/py3/pandas/core/sorting.py725
-rw-r--r--contrib/python/pandas/py3/pandas/core/sparse/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/sparse/api.py6
-rw-r--r--contrib/python/pandas/py3/pandas/core/strings/__init__.py28
-rw-r--r--contrib/python/pandas/py3/pandas/core/strings/accessor.py3376
-rw-r--r--contrib/python/pandas/py3/pandas/core/strings/base.py260
-rw-r--r--contrib/python/pandas/py3/pandas/core/strings/object_array.py498
-rw-r--r--contrib/python/pandas/py3/pandas/core/tools/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/tools/datetimes.py1272
-rw-r--r--contrib/python/pandas/py3/pandas/core/tools/numeric.py310
-rw-r--r--contrib/python/pandas/py3/pandas/core/tools/timedeltas.py265
-rw-r--r--contrib/python/pandas/py3/pandas/core/tools/times.py154
-rw-r--r--contrib/python/pandas/py3/pandas/core/util/__init__.py0
-rw-r--r--contrib/python/pandas/py3/pandas/core/util/hashing.py366
-rw-r--r--contrib/python/pandas/py3/pandas/core/util/numba_.py112
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/__init__.py23
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/common.py168
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/doc.py116
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/ewm.py1012
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/expanding.py816
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/numba_.py349
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/online.py118
-rw-r--r--contrib/python/pandas/py3/pandas/core/window/rolling.py2744
-rw-r--r--contrib/python/pandas/py3/pandas/errors/__init__.py637
-rw-r--r--contrib/python/pandas/py3/pandas/io/__init__.py12
-rw-r--r--contrib/python/pandas/py3/pandas/io/_util.py23
-rw-r--r--contrib/python/pandas/py3/pandas/io/api.py65
-rw-r--r--contrib/python/pandas/py3/pandas/io/clipboard/__init__.py678
-rw-r--r--contrib/python/pandas/py3/pandas/io/clipboards.py178
-rw-r--r--contrib/python/pandas/py3/pandas/io/common.py1253
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/__init__.py19
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_base.py1594
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_odfreader.py249
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_odswriter.py337
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_openpyxl.py626
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_pyxlsb.py112
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_util.py332
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_xlrd.py126
-rw-r--r--contrib/python/pandas/py3/pandas/io/excel/_xlsxwriter.py275
-rw-r--r--contrib/python/pandas/py3/pandas/io/feather_format.py162
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/__init__.py8
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/_color_data.py157
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/console.py94
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/css.py418
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/csvs.py319
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/excel.py950
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/format.py2240
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/html.py633
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/info.py1101
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/latex.py831
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/printing.py504
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/string.py207
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/style.py3946
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/style_render.py2342
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/templates/html.tpl16
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/templates/html_style.tpl26
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/templates/html_table.tpl63
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/templates/latex.tpl5
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/templates/latex_longtable.tpl82
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/templates/latex_table.tpl57
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/templates/string.tpl12
-rw-r--r--contrib/python/pandas/py3/pandas/io/formats/xml.py555
-rw-r--r--contrib/python/pandas/py3/pandas/io/gbq.py227
-rw-r--r--contrib/python/pandas/py3/pandas/io/html.py1230
-rw-r--r--contrib/python/pandas/py3/pandas/io/json/__init__.py15
-rw-r--r--contrib/python/pandas/py3/pandas/io/json/_json.py1420
-rw-r--r--contrib/python/pandas/py3/pandas/io/json/_normalize.py536
-rw-r--r--contrib/python/pandas/py3/pandas/io/json/_table_schema.py382
-rw-r--r--contrib/python/pandas/py3/pandas/io/orc.py205
-rw-r--r--contrib/python/pandas/py3/pandas/io/parquet.py516
-rw-r--r--contrib/python/pandas/py3/pandas/io/parsers/__init__.py9
-rw-r--r--contrib/python/pandas/py3/pandas/io/parsers/arrow_parser_wrapper.py164
-rw-r--r--contrib/python/pandas/py3/pandas/io/parsers/base_parser.py1388
-rw-r--r--contrib/python/pandas/py3/pandas/io/parsers/c_parser_wrapper.py423
-rw-r--r--contrib/python/pandas/py3/pandas/io/parsers/python_parser.py1351
-rw-r--r--contrib/python/pandas/py3/pandas/io/parsers/readers.py2127
-rw-r--r--contrib/python/pandas/py3/pandas/io/pickle.py204
-rw-r--r--contrib/python/pandas/py3/pandas/io/pytables.py5289
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/__init__.py3
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/_byteswap.pyi5
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/_sas.pyi7
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/byteswap.pyx93
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/sas.pyx548
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/sas7bdat.py747
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/sas_constants.py310
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/sas_xport.py506
-rw-r--r--contrib/python/pandas/py3/pandas/io/sas/sasreader.py180
-rw-r--r--contrib/python/pandas/py3/pandas/io/spss.py67
-rw-r--r--contrib/python/pandas/py3/pandas/io/sql.py2447
-rw-r--r--contrib/python/pandas/py3/pandas/io/stata.py3721
-rw-r--r--contrib/python/pandas/py3/pandas/io/xml.py1135
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/__init__.py98
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_core.py1864
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/__init__.py93
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/boxplot.py550
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/converter.py1109
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/core.py1877
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/groupby.py139
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/hist.py546
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/misc.py482
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/style.py274
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/timeseries.py335
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_matplotlib/tools.py483
-rw-r--r--contrib/python/pandas/py3/pandas/plotting/_misc.py618
-rw-r--r--contrib/python/pandas/py3/pandas/testing.py18
-rw-r--r--contrib/python/pandas/py3/pandas/tseries/__init__.py11
-rw-r--r--contrib/python/pandas/py3/pandas/tseries/api.py8
-rw-r--r--contrib/python/pandas/py3/pandas/tseries/frequencies.py619
-rw-r--r--contrib/python/pandas/py3/pandas/tseries/holiday.py609
-rw-r--r--contrib/python/pandas/py3/pandas/tseries/offsets.py91
-rw-r--r--contrib/python/pandas/py3/pandas/util/__init__.py11
-rw-r--r--contrib/python/pandas/py3/pandas/util/_decorators.py505
-rw-r--r--contrib/python/pandas/py3/pandas/util/_doctools.py199
-rw-r--r--contrib/python/pandas/py3/pandas/util/_exceptions.py94
-rw-r--r--contrib/python/pandas/py3/pandas/util/_print_versions.py134
-rw-r--r--contrib/python/pandas/py3/pandas/util/_str_methods.py28
-rw-r--r--contrib/python/pandas/py3/pandas/util/_test_decorators.py264
-rw-r--r--contrib/python/pandas/py3/pandas/util/_tester.py38
-rw-r--r--contrib/python/pandas/py3/pandas/util/_validators.py451
-rw-r--r--contrib/python/pandas/py3/pandas/util/version/__init__.py574
-rw-r--r--contrib/python/pandas/py3/patches/01-arcadia.patch33
-rw-r--r--contrib/python/pandas/py3/symbols.cmake180
-rw-r--r--contrib/python/pandas/py3/ya.make465
474 files changed, 0 insertions, 293470 deletions
diff --git a/contrib/python/pandas/py3/.dist-info/METADATA b/contrib/python/pandas/py3/.dist-info/METADATA
deleted file mode 100644
index 0c45c0e1559..00000000000
--- a/contrib/python/pandas/py3/.dist-info/METADATA
+++ /dev/null
@@ -1,331 +0,0 @@
-Metadata-Version: 2.1
-Name: pandas
-Version: 2.0.3
-Summary: Powerful data structures for data analysis, time series, and statistics
-Author-email: The Pandas Development Team <pandas-dev@python.org>
-License: BSD 3-Clause License
-
- Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
- All rights reserved.
-
- Copyright (c) 2011-2023, Open source contributors.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Project-URL: homepage, https://pandas.pydata.org
-Project-URL: documentation, https://pandas.pydata.org/docs/
-Project-URL: repository, https://github.com/pandas-dev/pandas
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: Environment :: Console
-Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: BSD License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Cython
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Topic :: Scientific/Engineering
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-License-File: AUTHORS.md
-Requires-Dist: python-dateutil (>=2.8.2)
-Requires-Dist: pytz (>=2020.1)
-Requires-Dist: tzdata (>=2022.1)
-Requires-Dist: numpy (>=1.20.3) ; python_version < "3.10"
-Requires-Dist: numpy (>=1.21.0) ; python_version >= "3.10"
-Requires-Dist: numpy (>=1.23.2) ; python_version >= "3.11"
-Provides-Extra: all
-Requires-Dist: beautifulsoup4 (>=4.9.3) ; extra == 'all'
-Requires-Dist: bottleneck (>=1.3.2) ; extra == 'all'
-Requires-Dist: brotlipy (>=0.7.0) ; extra == 'all'
-Requires-Dist: fastparquet (>=0.6.3) ; extra == 'all'
-Requires-Dist: fsspec (>=2021.07.0) ; extra == 'all'
-Requires-Dist: gcsfs (>=2021.07.0) ; extra == 'all'
-Requires-Dist: html5lib (>=1.1) ; extra == 'all'
-Requires-Dist: hypothesis (>=6.34.2) ; extra == 'all'
-Requires-Dist: jinja2 (>=3.0.0) ; extra == 'all'
-Requires-Dist: lxml (>=4.6.3) ; extra == 'all'
-Requires-Dist: matplotlib (>=3.6.1) ; extra == 'all'
-Requires-Dist: numba (>=0.53.1) ; extra == 'all'
-Requires-Dist: numexpr (>=2.7.3) ; extra == 'all'
-Requires-Dist: odfpy (>=1.4.1) ; extra == 'all'
-Requires-Dist: openpyxl (>=3.0.7) ; extra == 'all'
-Requires-Dist: pandas-gbq (>=0.15.0) ; extra == 'all'
-Requires-Dist: psycopg2 (>=2.8.6) ; extra == 'all'
-Requires-Dist: pyarrow (>=7.0.0) ; extra == 'all'
-Requires-Dist: pymysql (>=1.0.2) ; extra == 'all'
-Requires-Dist: PyQt5 (>=5.15.1) ; extra == 'all'
-Requires-Dist: pyreadstat (>=1.1.2) ; extra == 'all'
-Requires-Dist: pytest (>=7.3.2) ; extra == 'all'
-Requires-Dist: pytest-xdist (>=2.2.0) ; extra == 'all'
-Requires-Dist: pytest-asyncio (>=0.17.0) ; extra == 'all'
-Requires-Dist: python-snappy (>=0.6.0) ; extra == 'all'
-Requires-Dist: pyxlsb (>=1.0.8) ; extra == 'all'
-Requires-Dist: qtpy (>=2.2.0) ; extra == 'all'
-Requires-Dist: scipy (>=1.7.1) ; extra == 'all'
-Requires-Dist: s3fs (>=2021.08.0) ; extra == 'all'
-Requires-Dist: SQLAlchemy (>=1.4.16) ; extra == 'all'
-Requires-Dist: tables (>=3.6.1) ; extra == 'all'
-Requires-Dist: tabulate (>=0.8.9) ; extra == 'all'
-Requires-Dist: xarray (>=0.21.0) ; extra == 'all'
-Requires-Dist: xlrd (>=2.0.1) ; extra == 'all'
-Requires-Dist: xlsxwriter (>=1.4.3) ; extra == 'all'
-Requires-Dist: zstandard (>=0.15.2) ; extra == 'all'
-Provides-Extra: aws
-Requires-Dist: s3fs (>=2021.08.0) ; extra == 'aws'
-Provides-Extra: clipboard
-Requires-Dist: PyQt5 (>=5.15.1) ; extra == 'clipboard'
-Requires-Dist: qtpy (>=2.2.0) ; extra == 'clipboard'
-Provides-Extra: compression
-Requires-Dist: brotlipy (>=0.7.0) ; extra == 'compression'
-Requires-Dist: python-snappy (>=0.6.0) ; extra == 'compression'
-Requires-Dist: zstandard (>=0.15.2) ; extra == 'compression'
-Provides-Extra: computation
-Requires-Dist: scipy (>=1.7.1) ; extra == 'computation'
-Requires-Dist: xarray (>=0.21.0) ; extra == 'computation'
-Provides-Extra: excel
-Requires-Dist: odfpy (>=1.4.1) ; extra == 'excel'
-Requires-Dist: openpyxl (>=3.0.7) ; extra == 'excel'
-Requires-Dist: pyxlsb (>=1.0.8) ; extra == 'excel'
-Requires-Dist: xlrd (>=2.0.1) ; extra == 'excel'
-Requires-Dist: xlsxwriter (>=1.4.3) ; extra == 'excel'
-Provides-Extra: feather
-Requires-Dist: pyarrow (>=7.0.0) ; extra == 'feather'
-Provides-Extra: fss
-Requires-Dist: fsspec (>=2021.07.0) ; extra == 'fss'
-Provides-Extra: gcp
-Requires-Dist: gcsfs (>=2021.07.0) ; extra == 'gcp'
-Requires-Dist: pandas-gbq (>=0.15.0) ; extra == 'gcp'
-Provides-Extra: hdf5
-Requires-Dist: tables (>=3.6.1) ; extra == 'hdf5'
-Provides-Extra: html
-Requires-Dist: beautifulsoup4 (>=4.9.3) ; extra == 'html'
-Requires-Dist: html5lib (>=1.1) ; extra == 'html'
-Requires-Dist: lxml (>=4.6.3) ; extra == 'html'
-Provides-Extra: mysql
-Requires-Dist: SQLAlchemy (>=1.4.16) ; extra == 'mysql'
-Requires-Dist: pymysql (>=1.0.2) ; extra == 'mysql'
-Provides-Extra: output_formatting
-Requires-Dist: jinja2 (>=3.0.0) ; extra == 'output_formatting'
-Requires-Dist: tabulate (>=0.8.9) ; extra == 'output_formatting'
-Provides-Extra: parquet
-Requires-Dist: pyarrow (>=7.0.0) ; extra == 'parquet'
-Provides-Extra: performance
-Requires-Dist: bottleneck (>=1.3.2) ; extra == 'performance'
-Requires-Dist: numba (>=0.53.1) ; extra == 'performance'
-Requires-Dist: numexpr (>=2.7.1) ; extra == 'performance'
-Provides-Extra: plot
-Requires-Dist: matplotlib (>=3.6.1) ; extra == 'plot'
-Provides-Extra: postgresql
-Requires-Dist: SQLAlchemy (>=1.4.16) ; extra == 'postgresql'
-Requires-Dist: psycopg2 (>=2.8.6) ; extra == 'postgresql'
-Provides-Extra: spss
-Requires-Dist: pyreadstat (>=1.1.2) ; extra == 'spss'
-Provides-Extra: sql-other
-Requires-Dist: SQLAlchemy (>=1.4.16) ; extra == 'sql-other'
-Provides-Extra: test
-Requires-Dist: hypothesis (>=6.34.2) ; extra == 'test'
-Requires-Dist: pytest (>=7.3.2) ; extra == 'test'
-Requires-Dist: pytest-xdist (>=2.2.0) ; extra == 'test'
-Requires-Dist: pytest-asyncio (>=0.17.0) ; extra == 'test'
-Provides-Extra: xml
-Requires-Dist: lxml (>=4.6.3) ; extra == 'xml'
-
-<div align="center">
- <img src="https://pandas.pydata.org/static/img/pandas.svg"><br>
-</div>
-
------------------
-
-# pandas: powerful Python data analysis toolkit
-[![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/)
-[![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/)
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134)
-[![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/)
-[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
-[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas)
-[![Downloads](https://static.pepy.tech/personalized-badge/pandas?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads%20per%20month)](https://pepy.tech/project/pandas)
-[![Slack](https://img.shields.io/badge/join_Slack-information-brightgreen.svg?logo=slack)](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack)
-[![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org)
-[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
-
-## What is it?
-
-**pandas** is a Python package that provides fast, flexible, and expressive data
-structures designed to make working with "relational" or "labeled" data both
-easy and intuitive. It aims to be the fundamental high-level building block for
-doing practical, **real world** data analysis in Python. Additionally, it has
-the broader goal of becoming **the most powerful and flexible open source data
-analysis / manipulation tool available in any language**. It is already well on
-its way towards this goal.
-
-## Main Features
-Here are just a few of the things that pandas does well:
-
- - Easy handling of [**missing data**][missing-data] (represented as
- `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data
- - Size mutability: columns can be [**inserted and
- deleted**][insertion-deletion] from DataFrame and higher dimensional
- objects
- - Automatic and explicit [**data alignment**][alignment]: objects can
- be explicitly aligned to a set of labels, or the user can simply
- ignore the labels and let `Series`, `DataFrame`, etc. automatically
- align the data for you in computations
- - Powerful, flexible [**group by**][groupby] functionality to perform
- split-apply-combine operations on data sets, for both aggregating
- and transforming data
- - Make it [**easy to convert**][conversion] ragged,
- differently-indexed data in other Python and NumPy data structures
- into DataFrame objects
- - Intelligent label-based [**slicing**][slicing], [**fancy
- indexing**][fancy-indexing], and [**subsetting**][subsetting] of
- large data sets
- - Intuitive [**merging**][merging] and [**joining**][joining] data
- sets
- - Flexible [**reshaping**][reshape] and [**pivoting**][pivot-table] of
- data sets
- - [**Hierarchical**][mi] labeling of axes (possible to have multiple
- labels per tick)
- - Robust IO tools for loading data from [**flat files**][flat-files]
- (CSV and delimited), [**Excel files**][excel], [**databases**][db],
- and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
- - [**Time series**][timeseries]-specific functionality: date range
- generation and frequency conversion, moving window statistics,
- date shifting and lagging
-
-
- [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
- [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion
- [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures
- [groupby]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#group-by-split-apply-combine
- [conversion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe
- [slicing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#slicing-ranges
- [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced
- [subsetting]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
- [merging]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging
- [joining]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#joining-on-index
- [reshape]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
- [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
- [mi]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#hierarchical-indexing-multiindex
- [flat-files]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#csv-text-files
- [excel]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#excel-files
- [db]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#sql-queries
- [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables
- [timeseries]: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-series-date-functionality
-
-## Where to get it
-The source code is currently hosted on GitHub at:
-https://github.com/pandas-dev/pandas
-
-Binary installers for the latest released version are available at the [Python
-Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/).
-
-```sh
-# conda
-conda install pandas
-```
-
-```sh
-# or PyPI
-pip install pandas
-```
-
-## Dependencies
-- [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org)
-- [python-dateutil - Provides powerful extensions to the standard datetime module](https://dateutil.readthedocs.io/en/stable/index.html)
-- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://github.com/stub42/pytz)
-
-See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies.
-
-## Installation from sources
-To install pandas from source you need [Cython](https://cython.org/) in addition to the normal
-dependencies above. Cython can be installed from PyPI:
-
-```sh
-pip install cython
-```
-
-In the `pandas` directory (same one where you found this file after
-cloning the git repo), execute:
-
-```sh
-python setup.py install
-```
-
-or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable):
-
-
-```sh
-python -m pip install -e . --no-build-isolation --no-use-pep517
-```
-
-or alternatively
-
-```sh
-python setup.py develop
-```
-
-See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source).
-
-## License
-[BSD 3](LICENSE)
-
-## Documentation
-The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable
-
-## Background
-Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and
-has been under active development since then.
-
-## Getting Help
-
-For usage questions, the best place to go to is [StackOverflow](https://stackoverflow.com/questions/tagged/pandas).
-Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata).
-
-## Discussion and Development
-Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Slack channel](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) is available for quick development related questions.
-
-## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)
-
-All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome.
-
-A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**.
-
-If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
-
-You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).
-
-Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
-
-Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack).
-
-As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/.github/blob/master/CODE_OF_CONDUCT.md)
diff --git a/contrib/python/pandas/py3/.dist-info/entry_points.txt b/contrib/python/pandas/py3/.dist-info/entry_points.txt
deleted file mode 100644
index 69482a55034..00000000000
--- a/contrib/python/pandas/py3/.dist-info/entry_points.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-[pandas_plotting_backends]
-matplotlib = pandas:plotting._matplotlib
diff --git a/contrib/python/pandas/py3/.dist-info/top_level.txt b/contrib/python/pandas/py3/.dist-info/top_level.txt
deleted file mode 100644
index fb6c7ed7ec6..00000000000
--- a/contrib/python/pandas/py3/.dist-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-pandas
diff --git a/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml b/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml
deleted file mode 100644
index b0e75563430..00000000000
--- a/contrib/python/pandas/py3/.yandex_meta/yamaker.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-copy:
- - LICENSES/*
-exclude_from_macros:
- - LICENSES/*
- - symbols.cmake
- - pandas/_libs/src/headers/cmath
-keep:
- - symbols.cmake
-cython_directive:
- - language_level=3
-cython_templates:
- - pandas/*.pxi.in
-mark_as_cython_c:
- - pandas/_libs/[!w]*.pyx
- - pandas/_libs/window/indexers.pyx
- - pandas/_libs/writers.pyx
- - pandas/io/sas/sas.pyx
diff --git a/contrib/python/pandas/py3/AUTHORS.md b/contrib/python/pandas/py3/AUTHORS.md
deleted file mode 100644
index 84fcfe05e30..00000000000
--- a/contrib/python/pandas/py3/AUTHORS.md
+++ /dev/null
@@ -1,56 +0,0 @@
-About the Copyright Holders
-===========================
-
-* Copyright (c) 2008-2011 AQR Capital Management, LLC
-
- AQR Capital Management began pandas development in 2008. Development was
- led by Wes McKinney. AQR released the source under this license in 2009.
-* Copyright (c) 2011-2012, Lambda Foundry, Inc.
-
- Wes is now an employee of Lambda Foundry, and remains the pandas project
- lead.
-* Copyright (c) 2011-2012, PyData Development Team
-
- The PyData Development Team is the collection of developers of the PyData
- project. This includes all of the PyData sub-projects, including pandas. The
- core team that coordinates development on GitHub can be found here:
- https://github.com/pydata.
-
-Full credits for pandas contributors can be found in the documentation.
-
-Our Copyright Policy
-====================
-
-PyData uses a shared copyright model. Each contributor maintains copyright
-over their contributions to PyData. However, it is important to note that
-these contributions are typically only changes to the repositories. Thus,
-the PyData source code, in its entirety, is not the copyright of any single
-person or institution. Instead, it is the collective copyright of the
-entire PyData Development Team. If individual contributors want to maintain
-a record of what changes/contributions they have specific copyright on,
-they should indicate their copyright in the commit message of the change
-when they commit the change to one of the PyData repositories.
-
-With this in mind, the following banner should be used in any source code
-file to indicate the copyright and license terms:
-
-```
-#-----------------------------------------------------------------------------
-# Copyright (c) 2012, PyData Development Team
-# All rights reserved.
-#
-# Distributed under the terms of the BSD Simplified License.
-#
-# The full license is in the LICENSE file, distributed with this software.
-#-----------------------------------------------------------------------------
-```
-
-Other licenses can be found in the LICENSES directory.
-
-License
-=======
-
-pandas is distributed under a 3-clause ("Simplified" or "New") BSD
-license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have
-BSD-compatible licenses, are included. Their licenses follow the pandas
-license.
diff --git a/contrib/python/pandas/py3/LICENSE b/contrib/python/pandas/py3/LICENSE
deleted file mode 100644
index cdfa749dc34..00000000000
--- a/contrib/python/pandas/py3/LICENSE
+++ /dev/null
@@ -1,31 +0,0 @@
-BSD 3-Clause License
-
-Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
-All rights reserved.
-
-Copyright (c) 2011-2023, Open source contributors.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/python/pandas/py3/LICENSES/DATEUTIL_LICENSE b/contrib/python/pandas/py3/LICENSES/DATEUTIL_LICENSE
deleted file mode 100644
index 6053d35cfc6..00000000000
--- a/contrib/python/pandas/py3/LICENSES/DATEUTIL_LICENSE
+++ /dev/null
@@ -1,54 +0,0 @@
-Copyright 2017- Paul Ganssle <paul@ganssle.io>
-Copyright 2017- dateutil contributors (see AUTHORS file)
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
-The above license applies to all contributions after 2017-12-01, as well as
-all contributions that have been re-licensed (see AUTHORS file for the list of
-contributors who have re-licensed their code).
---------------------------------------------------------------------------------
-dateutil - Extensions to the standard Python datetime module.
-
-Copyright (c) 2003-2011 - Gustavo Niemeyer <gustavo@niemeyer.net>
-Copyright (c) 2012-2014 - Tomi Pieviläinen <tomi.pievilainen@iki.fi>
-Copyright (c) 2014-2016 - Yaron de Leeuw <me@jarondl.net>
-Copyright (c) 2015- - Paul Ganssle <paul@ganssle.io>
-Copyright (c) 2015- - dateutil contributors (see AUTHORS file)
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
- * Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-The above BSD License Applies to all code, even that also covered by Apache 2.0.
diff --git a/contrib/python/pandas/py3/LICENSES/HAVEN_LICENSE b/contrib/python/pandas/py3/LICENSES/HAVEN_LICENSE
deleted file mode 100644
index ce1b07b783e..00000000000
--- a/contrib/python/pandas/py3/LICENSES/HAVEN_LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-# MIT License
-
-Copyright (c) 2019 Hadley Wickham; RStudio; and Evan Miller
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/contrib/python/pandas/py3/LICENSES/HAVEN_MIT b/contrib/python/pandas/py3/LICENSES/HAVEN_MIT
deleted file mode 100644
index b03d0e64062..00000000000
--- a/contrib/python/pandas/py3/LICENSES/HAVEN_MIT
+++ /dev/null
@@ -1,32 +0,0 @@
-Based on http://opensource.org/licenses/MIT
-
-This is a template. Complete and ship as file LICENSE the following 2
-lines (only)
-
-YEAR:
-COPYRIGHT HOLDER:
-
-and specify as
-
-License: MIT + file LICENSE
-
-Copyright (c) <YEAR>, <COPYRIGHT HOLDER>
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/contrib/python/pandas/py3/LICENSES/KLIB_LICENSE b/contrib/python/pandas/py3/LICENSES/KLIB_LICENSE
deleted file mode 100644
index 0a996fae336..00000000000
--- a/contrib/python/pandas/py3/LICENSES/KLIB_LICENSE
+++ /dev/null
@@ -1,23 +0,0 @@
-The MIT License
-
-Copyright (c) 2008- Attractive Chaos <attractor@live.co.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/contrib/python/pandas/py3/LICENSES/MUSL_LICENSE b/contrib/python/pandas/py3/LICENSES/MUSL_LICENSE
deleted file mode 100644
index a8833d4bc47..00000000000
--- a/contrib/python/pandas/py3/LICENSES/MUSL_LICENSE
+++ /dev/null
@@ -1,132 +0,0 @@
-musl as a whole is licensed under the following standard MIT license:
-
-----------------------------------------------------------------------
-Copyright © 2005-2014 Rich Felker, et al.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-----------------------------------------------------------------------
-
-Authors/contributors include:
-
-Anthony G. Basile
-Arvid Picciani
-Bobby Bingham
-Boris Brezillon
-Brent Cook
-Chris Spiegel
-Clément Vasseur
-Emil Renner Berthing
-Hiltjo Posthuma
-Isaac Dunham
-Jens Gustedt
-Jeremy Huntwork
-John Spencer
-Justin Cormack
-Luca Barbato
-Luka Perkov
-M Farkas-Dyck (Strake)
-Michael Forney
-Nicholas J. Kain
-orc
-Pascal Cuoq
-Pierre Carrier
-Rich Felker
-Richard Pennington
-sin
-Solar Designer
-Stefan Kristiansson
-Szabolcs Nagy
-Timo Teräs
-Valentin Ochs
-William Haddon
-
-Portions of this software are derived from third-party works licensed
-under terms compatible with the above MIT license:
-
-The TRE regular expression implementation (src/regex/reg* and
-src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed
-under a 2-clause BSD license (license text in the source files). The
-included version has been heavily modified by Rich Felker in 2012, in
-the interests of size, simplicity, and namespace cleanliness.
-
-Much of the math library code (src/math/* and src/complex/*) is
-Copyright © 1993,2004 Sun Microsystems or
-Copyright © 2003-2011 David Schultz or
-Copyright © 2003-2009 Steven G. Kargl or
-Copyright © 2003-2009 Bruce D. Evans or
-Copyright © 2008 Stephen L. Moshier
-and labelled as such in comments in the individual source files. All
-have been licensed under extremely permissive terms.
-
-The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008
-The Android Open Source Project and is licensed under a two-clause BSD
-license. It was taken from Bionic libc, used on Android.
-
-The implementation of DES for crypt (src/misc/crypt_des.c) is
-Copyright © 1994 David Burren. It is licensed under a BSD license.
-
-The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was
-originally written by Solar Designer and placed into the public
-domain. The code also comes with a fallback permissive license for use
-in jurisdictions that may not recognize the public domain.
-
-The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011
-Valentin Ochs and is licensed under an MIT-style license.
-
-The BSD PRNG implementation (src/prng/random.c) and XSI search API
-(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and
-licensed under following terms: "Permission to use, copy, modify,
-and/or distribute this code for any purpose with or without fee is
-hereby granted. There is no warranty."
-
-The x86_64 port was written by Nicholas J. Kain. Several files (crt)
-were released into the public domain; others are licensed under the
-standard MIT license terms at the top of this file. See individual
-files for their copyright status.
-
-The mips and microblaze ports were originally written by Richard
-Pennington for use in the ellcc project. The original code was adapted
-by Rich Felker for build system and code conventions during upstream
-integration. It is licensed under the standard MIT terms.
-
-The powerpc port was also originally written by Richard Pennington,
-and later supplemented and integrated by John Spencer. It is licensed
-under the standard MIT terms.
-
-All other files which have no copyright comments are original works
-produced specifically for use as part of this library, written either
-by Rich Felker, the main author of the library, or by one or more
-contributors listed above. Details on authorship of individual files
-can be found in the git version control history of the project. The
-omission of copyright and license comments in each file is in the
-interest of source tree size.
-
-All public header files (include/* and arch/*/bits/*) should be
-treated as Public Domain as they intentionally contain no content
-which can be covered by copyright. Some source modules may fall in
-this category as well. If you believe that a file is so trivial that
-it should be in the Public Domain, please contact the authors and
-request an explicit statement releasing it from copyright.
-
-The following files are trivial, believed not to be copyrightable in
-the first place, and hereby explicitly released to the Public Domain:
-
-All public headers: include/*, arch/*/bits/*
-Startup files: crt/*
diff --git a/contrib/python/pandas/py3/LICENSES/NUMPY_LICENSE b/contrib/python/pandas/py3/LICENSES/NUMPY_LICENSE
deleted file mode 100644
index 7e972cff807..00000000000
--- a/contrib/python/pandas/py3/LICENSES/NUMPY_LICENSE
+++ /dev/null
@@ -1,30 +0,0 @@
-Copyright (c) 2005-2011, NumPy Developers.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- * Neither the name of the NumPy Developers nor the names of any
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/python/pandas/py3/LICENSES/OTHER b/contrib/python/pandas/py3/LICENSES/OTHER
deleted file mode 100644
index 7446d68eb43..00000000000
--- a/contrib/python/pandas/py3/LICENSES/OTHER
+++ /dev/null
@@ -1,75 +0,0 @@
-Bottleneck license
-------------------
-
-Copyright (c) 2010-2012 Archipel Asset Management AB.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-google-api-python-client license
---------------------------------
-
-Copyright (C) 2012 Google Inc.
-All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-Pyperclip v1.3 license
-----------------------
-
-Copyright (c) 2010, Albert Sweigart
-All rights reserved.
-
-BSD-style license:
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the pyperclip nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY
-EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/python/pandas/py3/LICENSES/PACKAGING_LICENSE b/contrib/python/pandas/py3/LICENSES/PACKAGING_LICENSE
deleted file mode 100644
index 4216ea1ce23..00000000000
--- a/contrib/python/pandas/py3/LICENSES/PACKAGING_LICENSE
+++ /dev/null
@@ -1,202 +0,0 @@
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
-
-Copyright (c) Donald Stufft and individual contributors.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- 1. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/contrib/python/pandas/py3/LICENSES/PSF_LICENSE b/contrib/python/pandas/py3/LICENSES/PSF_LICENSE
deleted file mode 100644
index 5cdb01e8d24..00000000000
--- a/contrib/python/pandas/py3/LICENSES/PSF_LICENSE
+++ /dev/null
@@ -1,279 +0,0 @@
-A. HISTORY OF THE SOFTWARE
-==========================
-
-Python was created in the early 1990s by Guido van Rossum at Stichting
-Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands
-as a successor of a language called ABC. Guido remains Python's
-principal author, although it includes many contributions from others.
-
-In 1995, Guido continued his work on Python at the Corporation for
-National Research Initiatives (CNRI, see http://www.cnri.reston.va.us)
-in Reston, Virginia where he released several versions of the
-software.
-
-In May 2000, Guido and the Python core development team moved to
-BeOpen.com to form the BeOpen PythonLabs team. In October of the same
-year, the PythonLabs team moved to Digital Creations (now Zope
-Corporation, see http://www.zope.com). In 2001, the Python Software
-Foundation (PSF, see http://www.python.org/psf/) was formed, a
-non-profit organization created specifically to own Python-related
-Intellectual Property. Zope Corporation is a sponsoring member of
-the PSF.
-
-All Python releases are Open Source (see http://www.opensource.org for
-the Open Source Definition). Historically, most, but not all, Python
-releases have also been GPL-compatible; the table below summarizes
-the various releases.
-
- Release Derived Year Owner GPL-
- from compatible? (1)
-
- 0.9.0 thru 1.2 1991-1995 CWI yes
- 1.3 thru 1.5.2 1.2 1995-1999 CNRI yes
- 1.6 1.5.2 2000 CNRI no
- 2.0 1.6 2000 BeOpen.com no
- 1.6.1 1.6 2001 CNRI yes (2)
- 2.1 2.0+1.6.1 2001 PSF no
- 2.0.1 2.0+1.6.1 2001 PSF yes
- 2.1.1 2.1+2.0.1 2001 PSF yes
- 2.2 2.1.1 2001 PSF yes
- 2.1.2 2.1.1 2002 PSF yes
- 2.1.3 2.1.2 2002 PSF yes
- 2.2.1 2.2 2002 PSF yes
- 2.2.2 2.2.1 2002 PSF yes
- 2.2.3 2.2.2 2003 PSF yes
- 2.3 2.2.2 2002-2003 PSF yes
- 2.3.1 2.3 2002-2003 PSF yes
- 2.3.2 2.3.1 2002-2003 PSF yes
- 2.3.3 2.3.2 2002-2003 PSF yes
- 2.3.4 2.3.3 2004 PSF yes
- 2.3.5 2.3.4 2005 PSF yes
- 2.4 2.3 2004 PSF yes
- 2.4.1 2.4 2005 PSF yes
- 2.4.2 2.4.1 2005 PSF yes
- 2.4.3 2.4.2 2006 PSF yes
- 2.4.4 2.4.3 2006 PSF yes
- 2.5 2.4 2006 PSF yes
- 2.5.1 2.5 2007 PSF yes
- 2.5.2 2.5.1 2008 PSF yes
- 2.5.3 2.5.2 2008 PSF yes
- 2.6 2.5 2008 PSF yes
- 2.6.1 2.6 2008 PSF yes
- 2.6.2 2.6.1 2009 PSF yes
- 2.6.3 2.6.2 2009 PSF yes
- 2.6.4 2.6.3 2009 PSF yes
- 2.6.5 2.6.4 2010 PSF yes
- 2.7 2.6 2010 PSF yes
-
-Footnotes:
-
-(1) GPL-compatible doesn't mean that we're distributing Python under
- the GPL. All Python licenses, unlike the GPL, let you distribute
- a modified version without making your changes open source. The
- GPL-compatible licenses make it possible to combine Python with
- other software that is released under the GPL; the others don't.
-
-(2) According to Richard Stallman, 1.6.1 is not GPL-compatible,
- because its license has a choice of law clause. According to
- CNRI, however, Stallman's lawyer has told CNRI's lawyer that 1.6.1
- is "not incompatible" with the GPL.
-
-Thanks to the many outside volunteers who have worked under Guido's
-direction to make these releases possible.
-
-
-B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON
-===============================================================
-
-PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
---------------------------------------------
-
-1. This LICENSE AGREEMENT is between the Python Software Foundation
-("PSF"), and the Individual or Organization ("Licensee") accessing and
-otherwise using this software ("Python") in source or binary form and
-its associated documentation.
-
-2. Subject to the terms and conditions of this License Agreement, PSF hereby
-grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
-analyze, test, perform and/or display publicly, prepare derivative works,
-distribute, and otherwise use Python alone or in any derivative version,
-provided, however, that PSF's License Agreement and PSF's notice of copyright,
-i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
-Python Software Foundation; All Rights Reserved" are retained in Python alone or
-in any derivative version prepared by Licensee.
-
-3. In the event Licensee prepares a derivative work that is based on
-or incorporates Python or any part thereof, and wants to make
-the derivative work available to others as provided herein, then
-Licensee hereby agrees to include in any such work a brief summary of
-the changes made to Python.
-
-4. PSF is making Python available to Licensee on an "AS IS"
-basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND
-DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
-FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT
-INFRINGE ANY THIRD PARTY RIGHTS.
-
-5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
-FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
-A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON,
-OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
-
-6. This License Agreement will automatically terminate upon a material
-breach of its terms and conditions.
-
-7. Nothing in this License Agreement shall be deemed to create any
-relationship of agency, partnership, or joint venture between PSF and
-Licensee. This License Agreement does not grant permission to use PSF
-trademarks or trade name in a trademark sense to endorse or promote
-products or services of Licensee, or any third party.
-
-8. By copying, installing or otherwise using Python, Licensee
-agrees to be bound by the terms and conditions of this License
-Agreement.
-
-
-BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0
--------------------------------------------
-
-BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1
-
-1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an
-office at 160 Saratoga Avenue, Santa Clara, CA 95051, and the
-Individual or Organization ("Licensee") accessing and otherwise using
-this software in source or binary form and its associated
-documentation ("the Software").
-
-2. Subject to the terms and conditions of this BeOpen Python License
-Agreement, BeOpen hereby grants Licensee a non-exclusive,
-royalty-free, world-wide license to reproduce, analyze, test, perform
-and/or display publicly, prepare derivative works, distribute, and
-otherwise use the Software alone or in any derivative version,
-provided, however, that the BeOpen Python License is retained in the
-Software, alone or in any derivative version prepared by Licensee.
-
-3. BeOpen is making the Software available to Licensee on an "AS IS"
-basis. BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND
-DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
-FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE WILL NOT
-INFRINGE ANY THIRD PARTY RIGHTS.
-
-4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE
-SOFTWARE FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS
-AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY
-DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
-
-5. This License Agreement will automatically terminate upon a material
-breach of its terms and conditions.
-
-6. This License Agreement shall be governed by and interpreted in all
-respects by the law of the State of California, excluding conflict of
-law provisions. Nothing in this License Agreement shall be deemed to
-create any relationship of agency, partnership, or joint venture
-between BeOpen and Licensee. This License Agreement does not grant
-permission to use BeOpen trademarks or trade names in a trademark
-sense to endorse or promote products or services of Licensee, or any
-third party. As an exception, the "BeOpen Python" logos available at
-http://www.pythonlabs.com/logos.html may be used according to the
-permissions granted on that web page.
-
-7. By copying, installing or otherwise using the software, Licensee
-agrees to be bound by the terms and conditions of this License
-Agreement.
-
-
-CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1
----------------------------------------
-
-1. This LICENSE AGREEMENT is between the Corporation for National
-Research Initiatives, having an office at 1895 Preston White Drive,
-Reston, VA 20191 ("CNRI"), and the Individual or Organization
-("Licensee") accessing and otherwise using Python 1.6.1 software in
-source or binary form and its associated documentation.
-
-2. Subject to the terms and conditions of this License Agreement, CNRI
-hereby grants Licensee a nonexclusive, royalty-free, world-wide
-license to reproduce, analyze, test, perform and/or display publicly,
-prepare derivative works, distribute, and otherwise use Python 1.6.1
-alone or in any derivative version, provided, however, that CNRI's
-License Agreement and CNRI's notice of copyright, i.e., "Copyright (c)
-1995-2001 Corporation for National Research Initiatives; All Rights
-Reserved" are retained in Python 1.6.1 alone or in any derivative
-version prepared by Licensee. Alternately, in lieu of CNRI's License
-Agreement, Licensee may substitute the following text (omitting the
-quotes): "Python 1.6.1 is made available subject to the terms and
-conditions in CNRI's License Agreement. This Agreement together with
-Python 1.6.1 may be located on the Internet using the following
-unique, persistent identifier (known as a handle): 1895.22/1013. This
-Agreement may also be obtained from a proxy server on the Internet
-using the following URL: http://hdl.handle.net/1895.22/1013".
-
-3. In the event Licensee prepares a derivative work that is based on
-or incorporates Python 1.6.1 or any part thereof, and wants to make
-the derivative work available to others as provided herein, then
-Licensee hereby agrees to include in any such work a brief summary of
-the changes made to Python 1.6.1.
-
-4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS"
-basis. CNRI MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR
-IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, CNRI MAKES NO AND
-DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS
-FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON 1.6.1 WILL NOT
-INFRINGE ANY THIRD PARTY RIGHTS.
-
-5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON
-1.6.1 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS
-A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1,
-OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
-
-6. This License Agreement will automatically terminate upon a material
-breach of its terms and conditions.
-
-7. This License Agreement shall be governed by the federal
-intellectual property law of the United States, including without
-limitation the federal copyright law, and, to the extent such
-U.S. federal law does not apply, by the law of the Commonwealth of
-Virginia, excluding Virginia's conflict of law provisions.
-Notwithstanding the foregoing, with regard to derivative works based
-on Python 1.6.1 that incorporate non-separable material that was
-previously distributed under the GNU General Public License (GPL), the
-law of the Commonwealth of Virginia shall govern this License
-Agreement only as to issues arising under or with respect to
-Paragraphs 4, 5, and 7 of this License Agreement. Nothing in this
-License Agreement shall be deemed to create any relationship of
-agency, partnership, or joint venture between CNRI and Licensee. This
-License Agreement does not grant permission to use CNRI trademarks or
-trade name in a trademark sense to endorse or promote products or
-services of Licensee, or any third party.
-
-8. By clicking on the "ACCEPT" button where indicated, or by copying,
-installing or otherwise using Python 1.6.1, Licensee agrees to be
-bound by the terms and conditions of this License Agreement.
-
- ACCEPT
-
-
-CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2
---------------------------------------------------
-
-Copyright (c) 1991 - 1995, Stichting Mathematisch Centrum Amsterdam,
-The Netherlands. All rights reserved.
-
-Permission to use, copy, modify, and distribute this software and its
-documentation for any purpose and without fee is hereby granted,
-provided that the above copyright notice appear in all copies and that
-both that copyright notice and this permission notice appear in
-supporting documentation, and that the name of Stichting Mathematisch
-Centrum or CWI not be used in advertising or publicity pertaining to
-distribution of the software without specific, written prior
-permission.
-
-STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO
-THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE
-FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
-OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/contrib/python/pandas/py3/LICENSES/PYUPGRADE_LICENSE b/contrib/python/pandas/py3/LICENSES/PYUPGRADE_LICENSE
deleted file mode 100644
index 522fbe20b89..00000000000
--- a/contrib/python/pandas/py3/LICENSES/PYUPGRADE_LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright (c) 2017 Anthony Sottile
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/contrib/python/pandas/py3/LICENSES/SAS7BDAT_LICENSE b/contrib/python/pandas/py3/LICENSES/SAS7BDAT_LICENSE
deleted file mode 100644
index 8fbf194013e..00000000000
--- a/contrib/python/pandas/py3/LICENSES/SAS7BDAT_LICENSE
+++ /dev/null
@@ -1,19 +0,0 @@
-Copyright (c) 2015 Jared Hobbs
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/contrib/python/pandas/py3/LICENSES/SCIPY_LICENSE b/contrib/python/pandas/py3/LICENSES/SCIPY_LICENSE
deleted file mode 100644
index d887ce5f989..00000000000
--- a/contrib/python/pandas/py3/LICENSES/SCIPY_LICENSE
+++ /dev/null
@@ -1,31 +0,0 @@
-Copyright (c) 2001, 2002 Enthought, Inc.
-All rights reserved.
-
-Copyright (c) 2003-2012 SciPy Developers.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- a. Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- b. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- c. Neither the name of Enthought nor the names of the SciPy Developers
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGE.
-
diff --git a/contrib/python/pandas/py3/LICENSES/ULTRAJSON_LICENSE b/contrib/python/pandas/py3/LICENSES/ULTRAJSON_LICENSE
deleted file mode 100644
index a905fb017d8..00000000000
--- a/contrib/python/pandas/py3/LICENSES/ULTRAJSON_LICENSE
+++ /dev/null
@@ -1,34 +0,0 @@
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the ESN Social Software AB nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-
-Numeric decoder derived from TCL library
-http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
- * Copyright (c) 1988-1993 The Regents of the University of California.
- * Copyright (c) 1994 Sun Microsystems, Inc.
diff --git a/contrib/python/pandas/py3/LICENSES/XARRAY_LICENSE b/contrib/python/pandas/py3/LICENSES/XARRAY_LICENSE
deleted file mode 100644
index 6bafeb9d3d8..00000000000
--- a/contrib/python/pandas/py3/LICENSES/XARRAY_LICENSE
+++ /dev/null
@@ -1,195 +0,0 @@
-Copyright 2014-2019, xarray Developers
-
---------------------------------------------------------------------------------
-
-Apache License
-Version 2.0, January 2004
-http://www.apache.org/licenses/
-
-TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-1. Definitions.
-
-"License" shall mean the terms and conditions for use, reproduction, and
-distribution as defined by Sections 1 through 9 of this document.
-
-"Licensor" shall mean the copyright owner or entity authorized by the copyright
-owner that is granting the License.
-
-"Legal Entity" shall mean the union of the acting entity and all other entities
-that control, are controlled by, or are under common control with that entity.
-For the purposes of this definition, "control" means (i) the power, direct or
-indirect, to cause the direction or management of such entity, whether by
-contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
-outstanding shares, or (iii) beneficial ownership of such entity.
-
-"You" (or "Your") shall mean an individual or Legal Entity exercising
-permissions granted by this License.
-
-"Source" form shall mean the preferred form for making modifications, including
-but not limited to software source code, documentation source, and configuration
-files.
-
-"Object" form shall mean any form resulting from mechanical transformation or
-translation of a Source form, including but not limited to compiled object code,
-generated documentation, and conversions to other media types.
-
-"Work" shall mean the work of authorship, whether in Source or Object form, made
-available under the License, as indicated by a copyright notice that is included
-in or attached to the work (an example is provided in the Appendix below).
-
-"Derivative Works" shall mean any work, whether in Source or Object form, that
-is based on (or derived from) the Work and for which the editorial revisions,
-annotations, elaborations, or other modifications represent, as a whole, an
-original work of authorship. For the purposes of this License, Derivative Works
-shall not include works that remain separable from, or merely link (or bind by
-name) to the interfaces of, the Work and Derivative Works thereof.
-
-"Contribution" shall mean any work of authorship, including the original version
-of the Work and any modifications or additions to that Work or Derivative Works
-thereof, that is intentionally submitted to Licensor for inclusion in the Work
-by the copyright owner or by an individual or Legal Entity authorized to submit
-on behalf of the copyright owner. For the purposes of this definition,
-"submitted" means any form of electronic, verbal, or written communication sent
-to the Licensor or its representatives, including but not limited to
-communication on electronic mailing lists, source code control systems, and
-issue tracking systems that are managed by, or on behalf of, the Licensor for
-the purpose of discussing and improving the Work, but excluding communication
-that is conspicuously marked or otherwise designated in writing by the copyright
-owner as "Not a Contribution."
-
-"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
-of whom a Contribution has been received by Licensor and subsequently
-incorporated within the Work.
-
-2. Grant of Copyright License.
-
-Subject to the terms and conditions of this License, each Contributor hereby
-grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
-irrevocable copyright license to reproduce, prepare Derivative Works of,
-publicly display, publicly perform, sublicense, and distribute the Work and such
-Derivative Works in Source or Object form.
-
-3. Grant of Patent License.
-
-Subject to the terms and conditions of this License, each Contributor hereby
-grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
-irrevocable (except as stated in this section) patent license to make, have
-made, use, offer to sell, sell, import, and otherwise transfer the Work, where
-such license applies only to those patent claims licensable by such Contributor
-that are necessarily infringed by their Contribution(s) alone or by combination
-of their Contribution(s) with the Work to which such Contribution(s) was
-submitted. If You institute patent litigation against any entity (including a
-cross-claim or counterclaim in a lawsuit) alleging that the Work or a
-Contribution incorporated within the Work constitutes direct or contributory
-patent infringement, then any patent licenses granted to You under this License
-for that Work shall terminate as of the date such litigation is filed.
-
-4. Redistribution.
-
-You may reproduce and distribute copies of the Work or Derivative Works thereof
-in any medium, with or without modifications, and in Source or Object form,
-provided that You meet the following conditions:
-
-You must give any other recipients of the Work or Derivative Works a copy of
-this License; and
-You must cause any modified files to carry prominent notices stating that You
-changed the files; and
-You must retain, in the Source form of any Derivative Works that You distribute,
-all copyright, patent, trademark, and attribution notices from the Source form
-of the Work, excluding those notices that do not pertain to any part of the
-Derivative Works; and
-If the Work includes a "NOTICE" text file as part of its distribution, then any
-Derivative Works that You distribute must include a readable copy of the
-attribution notices contained within such NOTICE file, excluding those notices
-that do not pertain to any part of the Derivative Works, in at least one of the
-following places: within a NOTICE text file distributed as part of the
-Derivative Works; within the Source form or documentation, if provided along
-with the Derivative Works; or, within a display generated by the Derivative
-Works, if and wherever such third-party notices normally appear. The contents of
-the NOTICE file are for informational purposes only and do not modify the
-License. You may add Your own attribution notices within Derivative Works that
-You distribute, alongside or as an addendum to the NOTICE text from the Work,
-provided that such additional attribution notices cannot be construed as
-modifying the License.
-You may add Your own copyright statement to Your modifications and may provide
-additional or different license terms and conditions for use, reproduction, or
-distribution of Your modifications, or for any such Derivative Works as a whole,
-provided Your use, reproduction, and distribution of the Work otherwise complies
-with the conditions stated in this License.
-
-5. Submission of Contributions.
-
-Unless You explicitly state otherwise, any Contribution intentionally submitted
-for inclusion in the Work by You to the Licensor shall be under the terms and
-conditions of this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify the terms of
-any separate license agreement you may have executed with Licensor regarding
-such Contributions.
-
-6. Trademarks.
-
-This License does not grant permission to use the trade names, trademarks,
-service marks, or product names of the Licensor, except as required for
-reasonable and customary use in describing the origin of the Work and
-reproducing the content of the NOTICE file.
-
-7. Disclaimer of Warranty.
-
-Unless required by applicable law or agreed to in writing, Licensor provides the
-Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
-including, without limitation, any warranties or conditions of TITLE,
-NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
-solely responsible for determining the appropriateness of using or
-redistributing the Work and assume any risks associated with Your exercise of
-permissions under this License.
-
-8. Limitation of Liability.
-
-In no event and under no legal theory, whether in tort (including negligence),
-contract, or otherwise, unless required by applicable law (such as deliberate
-and grossly negligent acts) or agreed to in writing, shall any Contributor be
-liable to You for damages, including any direct, indirect, special, incidental,
-or consequential damages of any character arising as a result of this License or
-out of the use or inability to use the Work (including but not limited to
-damages for loss of goodwill, work stoppage, computer failure or malfunction, or
-any and all other commercial damages or losses), even if such Contributor has
-been advised of the possibility of such damages.
-
-9. Accepting Warranty or Additional Liability.
-
-While redistributing the Work or Derivative Works thereof, You may choose to
-offer, and charge a fee for, acceptance of support, warranty, indemnity, or
-other liability obligations and/or rights consistent with this License. However,
-in accepting such obligations, You may act only on Your own behalf and on Your
-sole responsibility, not on behalf of any other Contributor, and only if You
-agree to indemnify, defend, and hold each Contributor harmless for any liability
-incurred by, or claims asserted against, such Contributor by reason of your
-accepting any such warranty or additional liability.
-
-END OF TERMS AND CONDITIONS
-
-APPENDIX: How to apply the Apache License to your work
-
-To apply the Apache License to your work, attach the following boilerplate
-notice, with the fields enclosed by brackets "[]" replaced with your own
-identifying information. (Don't include the brackets!) The text should be
-enclosed in the appropriate comment syntax for the file format. We also
-recommend that a file or class name and description of purpose be included on
-the same "printed page" as the copyright notice for easier identification within
-third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/contrib/python/pandas/py3/README.md b/contrib/python/pandas/py3/README.md
deleted file mode 100644
index 38f4afb3e2f..00000000000
--- a/contrib/python/pandas/py3/README.md
+++ /dev/null
@@ -1,170 +0,0 @@
-<div align="center">
- <img src="https://pandas.pydata.org/static/img/pandas.svg"><br>
-</div>
-
------------------
-
-# pandas: powerful Python data analysis toolkit
-[![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/)
-[![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/)
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134)
-[![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/)
-[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
-[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas)
-[![Downloads](https://static.pepy.tech/personalized-badge/pandas?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads%20per%20month)](https://pepy.tech/project/pandas)
-[![Slack](https://img.shields.io/badge/join_Slack-information-brightgreen.svg?logo=slack)](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack)
-[![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org)
-[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
-
-## What is it?
-
-**pandas** is a Python package that provides fast, flexible, and expressive data
-structures designed to make working with "relational" or "labeled" data both
-easy and intuitive. It aims to be the fundamental high-level building block for
-doing practical, **real world** data analysis in Python. Additionally, it has
-the broader goal of becoming **the most powerful and flexible open source data
-analysis / manipulation tool available in any language**. It is already well on
-its way towards this goal.
-
-## Main Features
-Here are just a few of the things that pandas does well:
-
- - Easy handling of [**missing data**][missing-data] (represented as
- `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data
- - Size mutability: columns can be [**inserted and
- deleted**][insertion-deletion] from DataFrame and higher dimensional
- objects
- - Automatic and explicit [**data alignment**][alignment]: objects can
- be explicitly aligned to a set of labels, or the user can simply
- ignore the labels and let `Series`, `DataFrame`, etc. automatically
- align the data for you in computations
- - Powerful, flexible [**group by**][groupby] functionality to perform
- split-apply-combine operations on data sets, for both aggregating
-    and transforming data (see the short sketch below)
- - Make it [**easy to convert**][conversion] ragged,
- differently-indexed data in other Python and NumPy data structures
- into DataFrame objects
- - Intelligent label-based [**slicing**][slicing], [**fancy
- indexing**][fancy-indexing], and [**subsetting**][subsetting] of
- large data sets
- - Intuitive [**merging**][merging] and [**joining**][joining] data
- sets
- - Flexible [**reshaping**][reshape] and [**pivoting**][pivot-table] of
- data sets
- - [**Hierarchical**][mi] labeling of axes (possible to have multiple
- labels per tick)
- - Robust IO tools for loading data from [**flat files**][flat-files]
- (CSV and delimited), [**Excel files**][excel], [**databases**][db],
- and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
- - [**Time series**][timeseries]-specific functionality: date range
- generation and frequency conversion, moving window statistics,
- date shifting and lagging
-
-
- [missing-data]: https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
- [insertion-deletion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#column-selection-addition-deletion
- [alignment]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html?highlight=alignment#intro-to-data-structures
- [groupby]: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#group-by-split-apply-combine
- [conversion]: https://pandas.pydata.org/pandas-docs/stable/user_guide/dsintro.html#dataframe
- [slicing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#slicing-ranges
- [fancy-indexing]: https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced
- [subsetting]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
- [merging]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#database-style-dataframe-or-named-series-joining-merging
- [joining]: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#joining-on-index
- [reshape]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
- [pivot-table]: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html
- [mi]: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#hierarchical-indexing-multiindex
- [flat-files]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#csv-text-files
- [excel]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#excel-files
- [db]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#sql-queries
- [hdfstore]: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables
- [timeseries]: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-series-date-functionality
-
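-A minimal, illustrative sketch of a few of the features above (missing data
-handling, group by, and merging); the column names and values are invented
-for the example:
-
-```python
-import numpy as np
-import pandas as pd
-
-left = pd.DataFrame({"key": ["a", "b", "b"], "x": [1.0, np.nan, 3.0]})
-right = pd.DataFrame({"key": ["a", "b"], "y": [10, 20]})
-
-merged = pd.merge(left, right, on="key")    # intuitive joining of data sets
-filled = merged.fillna({"x": 0.0})          # explicit missing-data handling
-summary = filled.groupby("key")["x"].sum()  # split-apply-combine
-print(summary)
-```
-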
-## Where to get it
-The source code is currently hosted on GitHub at:
-https://github.com/pandas-dev/pandas
-
-Binary installers for the latest released version are available at the [Python
-Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/).
-
-```sh
-# conda
-conda install pandas
-```
-
-```sh
-# or PyPI
-pip install pandas
-```
-
-## Dependencies
-- [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org)
-- [python-dateutil - Provides powerful extensions to the standard datetime module](https://dateutil.readthedocs.io/en/stable/index.html)
-- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://github.com/stub42/pytz)
-
-See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies.
-
-## Installation from sources
-To install pandas from source you need [Cython](https://cython.org/) in addition to the normal
-dependencies above. Cython can be installed from PyPI:
-
-```sh
-pip install cython
-```
-
-In the `pandas` directory (same one where you found this file after
-cloning the git repo), execute:
-
-```sh
-python setup.py install
-```
-
-or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable):
-
-
-```sh
-python -m pip install -e . --no-build-isolation --no-use-pep517
-```
-
-or alternatively
-
-```sh
-python setup.py develop
-```
-
-See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source).
-
-## License
-[BSD 3](LICENSE)
-
-## Documentation
-The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable
-
-## Background
-Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and
-has been under active development since then.
-
-## Getting Help
-
-For usage questions, the best place to go to is [StackOverflow](https://stackoverflow.com/questions/tagged/pandas).
-Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata).
-
-## Discussion and Development
-Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Slack channel](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) is available for quick development related questions.
-
-## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)
-
-All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome.
-
-A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**.
-
-If you are simply looking to start working with the pandas codebase, navigate to the [GitHub "issues" tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out.
-
-You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).
-
-Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
-
-Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack).
-
-As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/.github/blob/master/CODE_OF_CONDUCT.md)
diff --git a/contrib/python/pandas/py3/pandas/__init__.py b/contrib/python/pandas/py3/pandas/__init__.py
deleted file mode 100644
index 1a549c09d22..00000000000
--- a/contrib/python/pandas/py3/pandas/__init__.py
+++ /dev/null
@@ -1,346 +0,0 @@
-from __future__ import annotations
-
-__docformat__ = "restructuredtext"
-
-# Let users know if they're missing any of our hard dependencies
-_hard_dependencies = ("numpy", "pytz", "dateutil")
-_missing_dependencies = []
-
-for _dependency in _hard_dependencies:
- try:
- __import__(_dependency)
- except ImportError as _e: # pragma: no cover
- _missing_dependencies.append(f"{_dependency}: {_e}")
-
-if _missing_dependencies: # pragma: no cover
- raise ImportError(
- "Unable to import required dependencies:\n" + "\n".join(_missing_dependencies)
- )
-del _hard_dependencies, _dependency, _missing_dependencies
-
-# numpy compat
-from pandas.compat import is_numpy_dev as _is_numpy_dev # pyright: ignore # noqa:F401
-
-try:
- from pandas._libs import hashtable as _hashtable, lib as _lib, tslib as _tslib
-except ImportError as _err: # pragma: no cover
- _module = _err.name
- raise ImportError(
- f"C extension: {_module} not built. If you want to import "
- "pandas from the source directory, you may need to run "
- "'python setup.py build_ext --force' to build the C extensions first."
- ) from _err
-else:
- del _tslib, _lib, _hashtable
-
-from pandas._config import (
- get_option,
- set_option,
- reset_option,
- describe_option,
- option_context,
- options,
-)
-
-# let init-time option registration happen
-import pandas.core.config_init # pyright: ignore # noqa:F401
-
-from pandas.core.api import (
- # dtype
- ArrowDtype,
- Int8Dtype,
- Int16Dtype,
- Int32Dtype,
- Int64Dtype,
- UInt8Dtype,
- UInt16Dtype,
- UInt32Dtype,
- UInt64Dtype,
- Float32Dtype,
- Float64Dtype,
- CategoricalDtype,
- PeriodDtype,
- IntervalDtype,
- DatetimeTZDtype,
- StringDtype,
- BooleanDtype,
- # missing
- NA,
- isna,
- isnull,
- notna,
- notnull,
- # indexes
- Index,
- CategoricalIndex,
- RangeIndex,
- MultiIndex,
- IntervalIndex,
- TimedeltaIndex,
- DatetimeIndex,
- PeriodIndex,
- IndexSlice,
- # tseries
- NaT,
- Period,
- period_range,
- Timedelta,
- timedelta_range,
- Timestamp,
- date_range,
- bdate_range,
- Interval,
- interval_range,
- DateOffset,
- # conversion
- to_numeric,
- to_datetime,
- to_timedelta,
- # misc
- Flags,
- Grouper,
- factorize,
- unique,
- value_counts,
- NamedAgg,
- array,
- Categorical,
- set_eng_float_format,
- Series,
- DataFrame,
-)
-
-from pandas.core.arrays.sparse import SparseDtype
-
-from pandas.tseries.api import infer_freq
-from pandas.tseries import offsets
-
-from pandas.core.computation.api import eval
-
-from pandas.core.reshape.api import (
- concat,
- lreshape,
- melt,
- wide_to_long,
- merge,
- merge_asof,
- merge_ordered,
- crosstab,
- pivot,
- pivot_table,
- get_dummies,
- from_dummies,
- cut,
- qcut,
-)
-
-from pandas import api, arrays, errors, io, plotting, tseries
-from pandas import testing
-from pandas.util._print_versions import show_versions
-
-from pandas.io.api import (
- # excel
- ExcelFile,
- ExcelWriter,
- read_excel,
- # parsers
- read_csv,
- read_fwf,
- read_table,
- # pickle
- read_pickle,
- to_pickle,
- # pytables
- HDFStore,
- read_hdf,
- # sql
- read_sql,
- read_sql_query,
- read_sql_table,
- # misc
- read_clipboard,
- read_parquet,
- read_orc,
- read_feather,
- read_gbq,
- read_html,
- read_xml,
- read_json,
- read_stata,
- read_sas,
- read_spss,
-)
-
-from pandas.io.json._normalize import json_normalize
-
-from pandas.util._tester import test
-
-# use the closest tagged version if possible
-from pandas._version import get_versions
-
-v = get_versions()
-__version__ = v.get("closest-tag", v["version"])
-__git_version__ = v.get("full-revisionid")
-del get_versions, v
-
-
-# module level doc-string
-__doc__ = """
-pandas - a powerful data analysis and manipulation library for Python
-=====================================================================
-
-**pandas** is a Python package providing fast, flexible, and expressive data
-structures designed to make working with "relational" or "labeled" data both
-easy and intuitive. It aims to be the fundamental high-level building block for
-doing practical, **real world** data analysis in Python. Additionally, it has
-the broader goal of becoming **the most powerful and flexible open source data
-analysis / manipulation tool available in any language**. It is already well on
-its way toward this goal.
-
-Main Features
--------------
-Here are just a few of the things that pandas does well:
-
- - Easy handling of missing data in floating point as well as non-floating
- point data.
- - Size mutability: columns can be inserted and deleted from DataFrame and
- higher dimensional objects
- - Automatic and explicit data alignment: objects can be explicitly aligned
- to a set of labels, or the user can simply ignore the labels and let
- `Series`, `DataFrame`, etc. automatically align the data for you in
- computations.
- - Powerful, flexible group by functionality to perform split-apply-combine
- operations on data sets, for both aggregating and transforming data.
- - Make it easy to convert ragged, differently-indexed data in other Python
- and NumPy data structures into DataFrame objects.
- - Intelligent label-based slicing, fancy indexing, and subsetting of large
- data sets.
- - Intuitive merging and joining data sets.
- - Flexible reshaping and pivoting of data sets.
- - Hierarchical labeling of axes (possible to have multiple labels per tick).
- - Robust IO tools for loading data from flat files (CSV and delimited),
- Excel files, databases, and saving/loading data from the ultrafast HDF5
- format.
- - Time series-specific functionality: date range generation and frequency
- conversion, moving window statistics, date shifting and lagging.
-"""
-
-# Use __all__ to let type checkers know what is part of the public API.
-# Pandas is not (yet) a py.typed library: the public API is determined
-# based on the documentation.
-__all__ = [
- "ArrowDtype",
- "BooleanDtype",
- "Categorical",
- "CategoricalDtype",
- "CategoricalIndex",
- "DataFrame",
- "DateOffset",
- "DatetimeIndex",
- "DatetimeTZDtype",
- "ExcelFile",
- "ExcelWriter",
- "Flags",
- "Float32Dtype",
- "Float64Dtype",
- "Grouper",
- "HDFStore",
- "Index",
- "IndexSlice",
- "Int16Dtype",
- "Int32Dtype",
- "Int64Dtype",
- "Int8Dtype",
- "Interval",
- "IntervalDtype",
- "IntervalIndex",
- "MultiIndex",
- "NA",
- "NaT",
- "NamedAgg",
- "Period",
- "PeriodDtype",
- "PeriodIndex",
- "RangeIndex",
- "Series",
- "SparseDtype",
- "StringDtype",
- "Timedelta",
- "TimedeltaIndex",
- "Timestamp",
- "UInt16Dtype",
- "UInt32Dtype",
- "UInt64Dtype",
- "UInt8Dtype",
- "api",
- "array",
- "arrays",
- "bdate_range",
- "concat",
- "crosstab",
- "cut",
- "date_range",
- "describe_option",
- "errors",
- "eval",
- "factorize",
- "get_dummies",
- "from_dummies",
- "get_option",
- "infer_freq",
- "interval_range",
- "io",
- "isna",
- "isnull",
- "json_normalize",
- "lreshape",
- "melt",
- "merge",
- "merge_asof",
- "merge_ordered",
- "notna",
- "notnull",
- "offsets",
- "option_context",
- "options",
- "period_range",
- "pivot",
- "pivot_table",
- "plotting",
- "qcut",
- "read_clipboard",
- "read_csv",
- "read_excel",
- "read_feather",
- "read_fwf",
- "read_gbq",
- "read_hdf",
- "read_html",
- "read_json",
- "read_orc",
- "read_parquet",
- "read_pickle",
- "read_sas",
- "read_spss",
- "read_sql",
- "read_sql_query",
- "read_sql_table",
- "read_stata",
- "read_table",
- "read_xml",
- "reset_option",
- "set_eng_float_format",
- "set_option",
- "show_versions",
- "test",
- "testing",
- "timedelta_range",
- "to_datetime",
- "to_numeric",
- "to_pickle",
- "to_timedelta",
- "tseries",
- "unique",
- "value_counts",
- "wide_to_long",
-]
diff --git a/contrib/python/pandas/py3/pandas/_config/__init__.py b/contrib/python/pandas/py3/pandas/_config/__init__.py
deleted file mode 100644
index d12dd3b4cb8..00000000000
--- a/contrib/python/pandas/py3/pandas/_config/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""
-pandas._config is considered explicitly upstream of everything else in pandas,
-and should have no intra-pandas dependencies.
-
-importing `dates` and `display` ensures that keys needed by _libs
-are initialized.
-"""
-__all__ = [
- "config",
- "detect_console_encoding",
- "get_option",
- "set_option",
- "reset_option",
- "describe_option",
- "option_context",
- "options",
- "using_copy_on_write",
-]
-from pandas._config import config
-from pandas._config import dates # pyright: ignore # noqa:F401
-from pandas._config.config import (
- _global_config,
- describe_option,
- get_option,
- option_context,
- options,
- reset_option,
- set_option,
-)
-from pandas._config.display import detect_console_encoding
-
-
-def using_copy_on_write():
- _mode_options = _global_config["mode"]
- return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block"
-
-
-def using_nullable_dtypes():
- _mode_options = _global_config["mode"]
- return _mode_options["nullable_dtypes"]
diff --git a/contrib/python/pandas/py3/pandas/_config/config.py b/contrib/python/pandas/py3/pandas/_config/config.py
deleted file mode 100644
index 4d87e8dca6d..00000000000
--- a/contrib/python/pandas/py3/pandas/_config/config.py
+++ /dev/null
@@ -1,909 +0,0 @@
-"""
-The config module holds package-wide configurables and provides
-a uniform API for working with them.
-
-Overview
-========
-
-This module supports the following requirements:
-- options are referenced using keys in dot.notation, e.g. "x.y.option - z".
-- keys are case-insensitive.
-- functions should accept partial/regex keys, when unambiguous.
-- options can be registered by modules at import time.
-- options can be registered at init-time (via core.config_init)
-- options have a default value, and (optionally) a description and
- validation function associated with them.
-- options can be deprecated, in which case referencing them
- should produce a warning.
-- deprecated options can optionally be rerouted to a replacement
- so that accessing a deprecated option reroutes to a differently
- named option.
-- options can be reset to their default value.
-- all options can be reset to their default value at once.
-- all options in a certain sub-namespace can be reset at once.
-- the user can set / get / reset or ask for the description of an option.
-- a developer can register and mark an option as deprecated.
-- you can register a callback to be invoked when the option value
- is set or reset. Changing the stored value is considered misuse, but
- is not verboten.
-
-Implementation
-==============
-
-- Data is stored using nested dictionaries, and should be accessed
- through the provided API.
-
-- "Registered options" and "Deprecated options" have metadata associated
- with them, which are stored in auxiliary dictionaries keyed on the
- fully-qualified key, e.g. "x.y.z.option".
-
-- the config_init module is imported by the package's __init__.py file.
- placing any register_option() calls there will ensure those options
- are available as soon as pandas is loaded. If you use register_option
- in a module, it will only be available after that module is imported,
- which you should be aware of.
-
-- `config_prefix` is a context_manager (for use with the `with` keyword)
- which can save developers some typing, see the docstring.
-
-"""
-
-from __future__ import annotations
-
-from contextlib import (
- ContextDecorator,
- contextmanager,
-)
-import re
-from typing import (
- Any,
- Callable,
- Generator,
- Generic,
- Iterable,
- NamedTuple,
- cast,
-)
-import warnings
-
-from pandas._typing import (
- F,
- T,
-)
-from pandas.util._exceptions import find_stack_level
-
-
-class DeprecatedOption(NamedTuple):
- key: str
- msg: str | None
- rkey: str | None
- removal_ver: str | None
-
-
-class RegisteredOption(NamedTuple):
- key: str
- defval: object
- doc: str
- validator: Callable[[object], Any] | None
- cb: Callable[[str], Any] | None
-
-
-# holds deprecated option metadata
-_deprecated_options: dict[str, DeprecatedOption] = {}
-
-# holds registered option metadata
-_registered_options: dict[str, RegisteredOption] = {}
-
-# holds the current values for registered options
-_global_config: dict[str, Any] = {}
-
-# keys which have a special meaning
-_reserved_keys: list[str] = ["all"]
-
-
-class OptionError(AttributeError, KeyError):
- """
- Exception raised for pandas.options.
-
- Backwards compatible with KeyError checks.
- """
-
-
-#
-# User API
-
-
-def _get_single_key(pat: str, silent: bool) -> str:
- keys = _select_options(pat)
- if len(keys) == 0:
- if not silent:
- _warn_if_deprecated(pat)
- raise OptionError(f"No such keys(s): {repr(pat)}")
- if len(keys) > 1:
- raise OptionError("Pattern matched multiple keys")
- key = keys[0]
-
- if not silent:
- _warn_if_deprecated(key)
-
- key = _translate_key(key)
-
- return key
-
-
-def _get_option(pat: str, silent: bool = False) -> Any:
- key = _get_single_key(pat, silent)
-
- # walk the nested dict
- root, k = _get_root(key)
- return root[k]
-
-
-def _set_option(*args, **kwargs) -> None:
-    # must have at least 1 arg; deal with constraints later
- nargs = len(args)
- if not nargs or nargs % 2 != 0:
- raise ValueError("Must provide an even number of non-keyword arguments")
-
- # default to false
- silent = kwargs.pop("silent", False)
-
- if kwargs:
- kwarg = list(kwargs.keys())[0]
- raise TypeError(f'_set_option() got an unexpected keyword argument "{kwarg}"')
-
- for k, v in zip(args[::2], args[1::2]):
- key = _get_single_key(k, silent)
-
- o = _get_registered_option(key)
- if o and o.validator:
- o.validator(v)
-
- # walk the nested dict
- root, k = _get_root(key)
- root[k] = v
-
- if o.cb:
- if silent:
- with warnings.catch_warnings(record=True):
- o.cb(key)
- else:
- o.cb(key)
-
-
-def _describe_option(pat: str = "", _print_desc: bool = True) -> str | None:
- keys = _select_options(pat)
- if len(keys) == 0:
- raise OptionError("No such keys(s)")
-
- s = "\n".join([_build_option_description(k) for k in keys])
-
- if _print_desc:
- print(s)
- return None
- return s
-
-
-def _reset_option(pat: str, silent: bool = False) -> None:
- keys = _select_options(pat)
-
- if len(keys) == 0:
- raise OptionError("No such keys(s)")
-
- if len(keys) > 1 and len(pat) < 4 and pat != "all":
- raise ValueError(
- "You must specify at least 4 characters when "
- "resetting multiple keys, use the special keyword "
- '"all" to reset all the options to their default value'
- )
-
- for k in keys:
- _set_option(k, _registered_options[k].defval, silent=silent)
-
-
-def get_default_val(pat: str):
- key = _get_single_key(pat, silent=True)
- return _get_registered_option(key).defval
-
-
-class DictWrapper:
- """provide attribute-style access to a nested dict"""
-
- def __init__(self, d: dict[str, Any], prefix: str = "") -> None:
- object.__setattr__(self, "d", d)
- object.__setattr__(self, "prefix", prefix)
-
- def __setattr__(self, key: str, val: Any) -> None:
- prefix = object.__getattribute__(self, "prefix")
- if prefix:
- prefix += "."
- prefix += key
- # you can't set new keys
-        # and you can't overwrite subtrees
- if key in self.d and not isinstance(self.d[key], dict):
- _set_option(prefix, val)
- else:
- raise OptionError("You can only set the value of existing options")
-
- def __getattr__(self, key: str):
- prefix = object.__getattribute__(self, "prefix")
- if prefix:
- prefix += "."
- prefix += key
- try:
- v = object.__getattribute__(self, "d")[key]
- except KeyError as err:
- raise OptionError("No such option") from err
- if isinstance(v, dict):
- return DictWrapper(v, prefix)
- else:
- return _get_option(prefix)
-
- def __dir__(self) -> Iterable[str]:
- return list(self.d.keys())
-
-
-# For user convenience, we'd like to have the available options described
-# in the docstring. For dev convenience we'd like to generate the docstrings
-# dynamically instead of maintaining them by hand. To this end, we use the
-# class below, which wraps functions inside a callable and converts
-# __doc__ into a property function. The docstrings below are templates
-# using the py2.6+ advanced formatting syntax to plug in a concise list
-# of options, and option descriptions.
-
-
-class CallableDynamicDoc(Generic[T]):
- def __init__(self, func: Callable[..., T], doc_tmpl: str) -> None:
- self.__doc_tmpl__ = doc_tmpl
- self.__func__ = func
-
- def __call__(self, *args, **kwds) -> T:
- return self.__func__(*args, **kwds)
-
- # error: Signature of "__doc__" incompatible with supertype "object"
- @property
- def __doc__(self) -> str: # type: ignore[override]
- opts_desc = _describe_option("all", _print_desc=False)
- opts_list = pp_options_list(list(_registered_options.keys()))
- return self.__doc_tmpl__.format(opts_desc=opts_desc, opts_list=opts_list)
-
-
-_get_option_tmpl = """
-get_option(pat)
-
-Retrieves the value of the specified option.
-
-Available options:
-
-{opts_list}
-
-Parameters
-----------
-pat : str
- Regexp which should match a single option.
- Note: partial matches are supported for convenience, but unless you use the
- full option name (e.g. x.y.z.option_name), your code may break in future
- versions if new options with similar names are introduced.
-
-Returns
--------
-result : the value of the option
-
-Raises
-------
-OptionError : if no such option exists
-
-Notes
------
-Please reference the :ref:`User Guide <options>` for more information.
-
-The available options with their descriptions:
-
-{opts_desc}
-"""
-
-_set_option_tmpl = """
-set_option(pat, value)
-
-Sets the value of the specified option.
-
-Available options:
-
-{opts_list}
-
-Parameters
-----------
-pat : str
- Regexp which should match a single option.
- Note: partial matches are supported for convenience, but unless you use the
- full option name (e.g. x.y.z.option_name), your code may break in future
- versions if new options with similar names are introduced.
-value : object
- New value of option.
-
-Returns
--------
-None
-
-Raises
-------
-OptionError if no such option exists
-
-Notes
------
-Please reference the :ref:`User Guide <options>` for more information.
-
-The available options with their descriptions:
-
-{opts_desc}
-"""
-
-_describe_option_tmpl = """
-describe_option(pat, _print_desc=True)
-
-Prints the description for one or more registered options.
-
-Call with no arguments to get a listing for all registered options.
-
-Available options:
-
-{opts_list}
-
-Parameters
-----------
-pat : str
- Regexp pattern. All matching keys will have their description displayed.
-_print_desc : bool, default True
- If True (default) the description(s) will be printed to stdout.
- Otherwise, the description(s) will be returned as a unicode string
- (for testing).
-
-Returns
--------
-None by default, the description(s) as a unicode string if _print_desc
-is False
-
-Notes
------
-Please reference the :ref:`User Guide <options>` for more information.
-
-The available options with their descriptions:
-
-{opts_desc}
-"""
-
-_reset_option_tmpl = """
-reset_option(pat)
-
-Reset one or more options to their default value.
-
-Pass "all" as argument to reset all options.
-
-Available options:
-
-{opts_list}
-
-Parameters
-----------
-pat : str/regex
- If specified only options matching `prefix*` will be reset.
- Note: partial matches are supported for convenience, but unless you
- use the full option name (e.g. x.y.z.option_name), your code may break
- in future versions if new options with similar names are introduced.
-
-Returns
--------
-None
-
-Notes
------
-Please reference the :ref:`User Guide <options>` for more information.
-
-The available options with their descriptions:
-
-{opts_desc}
-"""
-
-# bind the functions with their docstrings into a Callable
-# and use that as the functions exposed in pd.api
-get_option = CallableDynamicDoc(_get_option, _get_option_tmpl)
-set_option = CallableDynamicDoc(_set_option, _set_option_tmpl)
-reset_option = CallableDynamicDoc(_reset_option, _reset_option_tmpl)
-describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl)
-options = DictWrapper(_global_config)
-
-#
-# Functions for use by pandas developers, in addition to the user API
-
-
-class option_context(ContextDecorator):
- """
- Context manager to temporarily set options in the `with` statement context.
-
- You need to invoke as ``option_context(pat, val, [(pat, val), ...])``.
-
- Examples
- --------
- >>> from pandas import option_context
- >>> with option_context('display.max_rows', 10, 'display.max_columns', 5):
- ... pass
- """
-
- def __init__(self, *args) -> None:
- if len(args) % 2 != 0 or len(args) < 2:
- raise ValueError(
- "Need to invoke as option_context(pat, val, [(pat, val), ...])."
- )
-
- self.ops = list(zip(args[::2], args[1::2]))
-
- def __enter__(self) -> None:
- self.undo = [(pat, _get_option(pat, silent=True)) for pat, val in self.ops]
-
- for pat, val in self.ops:
- _set_option(pat, val, silent=True)
-
- def __exit__(self, *args) -> None:
- if self.undo:
- for pat, val in self.undo:
- _set_option(pat, val, silent=True)
-
-
-def register_option(
- key: str,
- defval: object,
- doc: str = "",
- validator: Callable[[object], Any] | None = None,
- cb: Callable[[str], Any] | None = None,
-) -> None:
- """
- Register an option in the package-wide pandas config object
-
- Parameters
- ----------
- key : str
- Fully-qualified key, e.g. "x.y.option - z".
- defval : object
- Default value of the option.
- doc : str
- Description of the option.
- validator : Callable, optional
- Function of a single argument, should raise `ValueError` if
- called with a value which is not a legal value for the option.
- cb
- a function of a single argument "key", which is called
- immediately after an option value is set/reset. key is
- the full name of the option.
-
- Raises
- ------
- ValueError if `validator` is specified and `defval` is not a valid value.
-
- """
- import keyword
- import tokenize
-
- key = key.lower()
-
- if key in _registered_options:
- raise OptionError(f"Option '{key}' has already been registered")
- if key in _reserved_keys:
- raise OptionError(f"Option '{key}' is a reserved key")
-
- # the default value should be legal
- if validator:
- validator(defval)
-
- # walk the nested dict, creating dicts as needed along the path
- path = key.split(".")
-
- for k in path:
- if not re.match("^" + tokenize.Name + "$", k):
- raise ValueError(f"{k} is not a valid identifier")
- if keyword.iskeyword(k):
- raise ValueError(f"{k} is a python keyword")
-
- cursor = _global_config
- msg = "Path prefix to option '{option}' is already an option"
-
- for i, p in enumerate(path[:-1]):
- if not isinstance(cursor, dict):
- raise OptionError(msg.format(option=".".join(path[:i])))
- if p not in cursor:
- cursor[p] = {}
- cursor = cursor[p]
-
- if not isinstance(cursor, dict):
- raise OptionError(msg.format(option=".".join(path[:-1])))
-
- cursor[path[-1]] = defval # initialize
-
- # save the option metadata
- _registered_options[key] = RegisteredOption(
- key=key, defval=defval, doc=doc, validator=validator, cb=cb
- )
-
-
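-# The sketch below is not part of the original module: it shows, using a
-# hypothetical "illustration.verbosity" key, how register_option and the
-# set_option / get_option / reset_option accessors bound above fit together.
-# The helper is illustrative only and is never called here.
-def _illustrate_option_round_trip() -> None:
-    def require_int(value: object) -> None:
-        # validator: raise ValueError for anything that is not an int
-        if not isinstance(value, int):
-            raise ValueError("Value must be an integer")
-
-    register_option(
-        "illustration.verbosity",
-        0,
-        "hypothetical demo option (illustration only)",
-        validator=require_int,
-    )
-    set_option("illustration.verbosity", 2)
-    assert get_option("illustration.verbosity") == 2
-    reset_option("illustration.verbosity")
-    assert get_option("illustration.verbosity") == 0
-
-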
-def deprecate_option(
- key: str,
- msg: str | None = None,
- rkey: str | None = None,
- removal_ver: str | None = None,
-) -> None:
- """
-    Mark option `key` as deprecated. If code attempts to access this option,
-    a warning will be produced, using `msg` if given, or a default message
-    if not.
-    If `rkey` is given, any access to the key will be re-routed to `rkey`.
-
-    Neither the existence of `key` nor that of `rkey` is checked. If they
-    do not exist, any subsequent access will fail as usual, after the
-    deprecation warning is given.
-
- Parameters
- ----------
- key : str
- Name of the option to be deprecated.
-        Must be a fully-qualified option name (e.g. "x.y.z.option_name").
- msg : str, optional
- Warning message to output when the key is referenced.
- if no message is given a default message will be emitted.
- rkey : str, optional
- Name of an option to reroute access to.
- If specified, any referenced `key` will be
- re-routed to `rkey` including set/get/reset.
- rkey must be a fully-qualified option name (e.g "x.y.z.rkey").
- used by the default message if no `msg` is specified.
- removal_ver : str, optional
- Specifies the version in which this option will
- be removed. used by the default message if no `msg` is specified.
-
- Raises
- ------
- OptionError
- If the specified key has already been deprecated.
- """
- key = key.lower()
-
- if key in _deprecated_options:
- raise OptionError(f"Option '{key}' has already been defined as deprecated.")
-
- _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)
-
-
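-# Another illustrative sketch, not part of the original module; the key names
-# "illustration.old_name" and "illustration.new_name" are hypothetical. It
-# shows one way to exercise the rerouting machinery above: register both keys,
-# deprecate the old one with rkey pointing at the new one, and let accesses to
-# the old key emit a FutureWarning while being redirected. Never called here.
-def _illustrate_option_deprecation() -> None:
-    register_option("illustration.new_name", "fast", "hypothetical new option")
-    register_option("illustration.old_name", "fast", "hypothetical old option")
-    deprecate_option(
-        "illustration.old_name",
-        rkey="illustration.new_name",
-        removal_ver="2.0",
-    )
-    # warns with FutureWarning and is rerouted to "illustration.new_name"
-    set_option("illustration.old_name", "slow")
-    assert get_option("illustration.new_name", silent=True) == "slow"
-
-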
-#
-# functions internal to the module
-
-
-def _select_options(pat: str) -> list[str]:
- """
- returns a list of keys matching `pat`
-
- if pat=="all", returns all registered options
- """
- # short-circuit for exact key
- if pat in _registered_options:
- return [pat]
-
- # else look through all of them
- keys = sorted(_registered_options.keys())
- if pat == "all": # reserved key
- return keys
-
- return [k for k in keys if re.search(pat, k, re.I)]
-
-
-def _get_root(key: str) -> tuple[dict[str, Any], str]:
- path = key.split(".")
- cursor = _global_config
- for p in path[:-1]:
- cursor = cursor[p]
- return cursor, path[-1]
-
-
-def _is_deprecated(key: str) -> bool:
- """Returns True if the given option has been deprecated"""
- key = key.lower()
- return key in _deprecated_options
-
-
-def _get_deprecated_option(key: str):
- """
- Retrieves the metadata for a deprecated option, if `key` is deprecated.
-
- Returns
- -------
- DeprecatedOption (namedtuple) if key is deprecated, None otherwise
- """
- try:
- d = _deprecated_options[key]
- except KeyError:
- return None
- else:
- return d
-
-
-def _get_registered_option(key: str):
- """
- Retrieves the option metadata if `key` is a registered option.
-
- Returns
- -------
-    RegisteredOption (namedtuple) if key is a registered option, None otherwise
- """
- return _registered_options.get(key)
-
-
-def _translate_key(key: str) -> str:
- """
-    If key is deprecated and a replacement key is defined, return the
-    replacement key; otherwise return `key` as-is.
- """
- d = _get_deprecated_option(key)
- if d:
- return d.rkey or key
- else:
- return key
-
-
-def _warn_if_deprecated(key: str) -> bool:
- """
-    Checks if `key` is a deprecated option and if so, issues a FutureWarning.
-
- Returns
- -------
- bool - True if `key` is deprecated, False otherwise.
- """
- d = _get_deprecated_option(key)
- if d:
- if d.msg:
- warnings.warn(
- d.msg,
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- else:
- msg = f"'{key}' is deprecated"
- if d.removal_ver:
- msg += f" and will be removed in {d.removal_ver}"
- if d.rkey:
- msg += f", please use '{d.rkey}' instead."
- else:
- msg += ", please refrain from using it."
-
- warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
- return True
- return False
-
-
-def _build_option_description(k: str) -> str:
- """Builds a formatted description of a registered option and prints it"""
- o = _get_registered_option(k)
- d = _get_deprecated_option(k)
-
- s = f"{k} "
-
- if o.doc:
- s += "\n".join(o.doc.strip().split("\n"))
- else:
- s += "No description available."
-
- if o:
- s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]"
-
- if d:
- rkey = d.rkey or ""
- s += "\n (Deprecated"
- s += f", use `{rkey}` instead."
- s += ")"
-
- return s
-
-
-def pp_options_list(keys: Iterable[str], width: int = 80, _print: bool = False):
- """Builds a concise listing of available options, grouped by prefix"""
- from itertools import groupby
- from textwrap import wrap
-
- def pp(name: str, ks: Iterable[str]) -> list[str]:
- pfx = "- " + name + ".[" if name else ""
- ls = wrap(
- ", ".join(ks),
- width,
- initial_indent=pfx,
- subsequent_indent=" ",
- break_long_words=False,
- )
- if ls and ls[-1] and name:
- ls[-1] = ls[-1] + "]"
- return ls
-
- ls: list[str] = []
- singles = [x for x in sorted(keys) if x.find(".") < 0]
- if singles:
- ls += pp("", singles)
- keys = [x for x in keys if x.find(".") >= 0]
-
- for k, g in groupby(sorted(keys), lambda x: x[: x.rfind(".")]):
- ks = [x[len(k) + 1 :] for x in list(g)]
- ls += pp(k, ks)
- s = "\n".join(ls)
- if _print:
- print(s)
- else:
- return s
-
-
-#
-# helpers
-
-
-@contextmanager
-def config_prefix(prefix) -> Generator[None, None, None]:
- """
- contextmanager for multiple invocations of API with a common prefix
-
-    supported API functions: register_option, get_option, set_option
-
-    Warning: This is not thread-safe, and won't work properly if you import
-    the API functions into your module using the "from x import y" construct.
-
- Example
- -------
- import pandas._config.config as cf
- with cf.config_prefix("display.font"):
- cf.register_option("color", "red")
- cf.register_option("size", " 5 pt")
-        cf.set_option("size", " 6 pt")
-        cf.get_option("size")
- ...
-
-    etc.
-
- will register options "display.font.color", "display.font.size", set the
- value of "display.font.size"... and so on.
- """
- # Note: reset_option relies on set_option, and on key directly
- # it does not fit in to this monkey-patching scheme
-
- global register_option, get_option, set_option
-
- def wrap(func: F) -> F:
- def inner(key: str, *args, **kwds):
- pkey = f"{prefix}.{key}"
- return func(pkey, *args, **kwds)
-
- return cast(F, inner)
-
- _register_option = register_option
- _get_option = get_option
- _set_option = set_option
- set_option = wrap(set_option)
- get_option = wrap(get_option)
- register_option = wrap(register_option)
- try:
- yield
- finally:
- set_option = _set_option
- get_option = _get_option
- register_option = _register_option
-
-
-# These factories and methods are handy for use as the validator
-# arg in register_option
-
-
-def is_type_factory(_type: type[Any]) -> Callable[[Any], None]:
- """
-
- Parameters
- ----------
- `_type` - a type to be compared against (e.g. type(x) == `_type`)
-
- Returns
- -------
-    validator - a function of a single argument x, which raises
- ValueError if type(x) is not equal to `_type`
-
- """
-
- def inner(x) -> None:
- if type(x) != _type:
- raise ValueError(f"Value must have type '{_type}'")
-
- return inner
-
-
-def is_instance_factory(_type) -> Callable[[Any], None]:
- """
-
- Parameters
- ----------
- `_type` - the type to be checked against
-
- Returns
- -------
-    validator - a function of a single argument x, which raises
- ValueError if x is not an instance of `_type`
-
- """
- if isinstance(_type, (tuple, list)):
- _type = tuple(_type)
- type_repr = "|".join(map(str, _type))
- else:
- type_repr = f"'{_type}'"
-
- def inner(x) -> None:
- if not isinstance(x, _type):
- raise ValueError(f"Value must be an instance of {type_repr}")
-
- return inner
-
-
-def is_one_of_factory(legal_values) -> Callable[[Any], None]:
- callables = [c for c in legal_values if callable(c)]
- legal_values = [c for c in legal_values if not callable(c)]
-
- def inner(x) -> None:
- if x not in legal_values:
- if not any(c(x) for c in callables):
- uvals = [str(lval) for lval in legal_values]
- pp_values = "|".join(uvals)
- msg = f"Value must be one of {pp_values}"
- if len(callables):
- msg += " or a callable"
- raise ValueError(msg)
-
- return inner
-
-
-def is_nonnegative_int(value: object) -> None:
- """
-    Verify that value is None or a nonnegative int.
-
- Parameters
- ----------
- value : None or int
- The `value` to be checked.
-
- Raises
- ------
- ValueError
-        When the value is neither None nor a nonnegative integer
- """
- if value is None:
- return
-
- elif isinstance(value, int):
- if value >= 0:
- return
-
- msg = "Value must be a nonnegative integer or None"
- raise ValueError(msg)
-
-
-# common type validators, for convenience
-# usage: register_option(... , validator = is_int)
-is_int = is_type_factory(int)
-is_bool = is_type_factory(bool)
-is_float = is_type_factory(float)
-is_str = is_type_factory(str)
-is_text = is_instance_factory((str, bytes))
-
-
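-# Illustrative sketch, not part of the original module; the option keys are
-# hypothetical. It shows how the validator factories above plug into
-# register_option, and that invalid values are rejected before being stored.
-# Never called here.
-def _illustrate_validators() -> None:
-    register_option(
-        "illustration.style",
-        "plain",
-        "hypothetical demo option",
-        validator=is_one_of_factory(["plain", "fancy"]),
-    )
-    register_option(
-        "illustration.max_items",
-        None,
-        "hypothetical demo option",
-        validator=is_nonnegative_int,
-    )
-    try:
-        set_option("illustration.style", "bogus")
-    except ValueError:
-        pass  # rejected by the is_one_of_factory validator; value unchanged
-    set_option("illustration.max_items", 10)  # accepted: nonnegative int
-
-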
-def is_callable(obj) -> bool:
- """
-
- Parameters
- ----------
- `obj` - the object to be checked
-
- Returns
- -------
-    bool
-        True if object is callable; raises ValueError otherwise.
-
- """
- if not callable(obj):
- raise ValueError("Value must be a callable")
- return True
diff --git a/contrib/python/pandas/py3/pandas/_config/dates.py b/contrib/python/pandas/py3/pandas/_config/dates.py
deleted file mode 100644
index b37831f96eb..00000000000
--- a/contrib/python/pandas/py3/pandas/_config/dates.py
+++ /dev/null
@@ -1,25 +0,0 @@
-"""
-config for datetime formatting
-"""
-from __future__ import annotations
-
-from pandas._config import config as cf
-
-pc_date_dayfirst_doc = """
-: boolean
-    When True, prints and parses dates with the day first, e.g. 20/01/2005
-"""
-
-pc_date_yearfirst_doc = """
-: boolean
-    When True, prints and parses dates with the year first, e.g. 2005/01/20
-"""
-
-with cf.config_prefix("display"):
- # Needed upstream of `_libs` because these are used in tslibs.parsing
- cf.register_option(
- "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool
- )
- cf.register_option(
- "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool
- )
diff --git a/contrib/python/pandas/py3/pandas/_config/display.py b/contrib/python/pandas/py3/pandas/_config/display.py
deleted file mode 100644
index df2c3ad36c8..00000000000
--- a/contrib/python/pandas/py3/pandas/_config/display.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""
-Unopinionated display configuration.
-"""
-
-from __future__ import annotations
-
-import locale
-import sys
-
-from pandas._config import config as cf
-
-# -----------------------------------------------------------------------------
-# Global formatting options
-_initial_defencoding: str | None = None
-
-
-def detect_console_encoding() -> str:
- """
- Try to find the most capable encoding supported by the console.
-    Slightly modified from the way IPython handles the same issue.
- """
- global _initial_defencoding
-
- encoding = None
- try:
- encoding = sys.stdout.encoding or sys.stdin.encoding
- except (AttributeError, OSError):
- pass
-
- # try again for something better
- if not encoding or "ascii" in encoding.lower():
- try:
- encoding = locale.getpreferredencoding()
- except locale.Error:
- # can be raised by locale.setlocale(), which is
- # called by getpreferredencoding
- # (on some systems, see stdlib locale docs)
- pass
-
- # when all else fails. this will usually be "ascii"
- if not encoding or "ascii" in encoding.lower():
- encoding = sys.getdefaultencoding()
-
- # GH#3360, save the reported defencoding at import time
- # MPL backends may change it. Make available for debugging.
- if not _initial_defencoding:
- _initial_defencoding = sys.getdefaultencoding()
-
- return encoding
-
-
-pc_encoding_doc = """
-: str/unicode
- Defaults to the detected encoding of the console.
-    Specifies the encoding to be used for strings returned by to_string;
-    these are generally strings meant to be displayed on the console.
-"""
-
-with cf.config_prefix("display"):
- cf.register_option(
- "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text
- )
diff --git a/contrib/python/pandas/py3/pandas/_config/localization.py b/contrib/python/pandas/py3/pandas/_config/localization.py
deleted file mode 100644
index 4e9a0142af3..00000000000
--- a/contrib/python/pandas/py3/pandas/_config/localization.py
+++ /dev/null
@@ -1,169 +0,0 @@
-"""
-Helpers for configuring locale settings.
-
-Name `localization` is chosen to avoid overlap with builtin `locale` module.
-"""
-from __future__ import annotations
-
-from contextlib import contextmanager
-import locale
-import platform
-import re
-import subprocess
-from typing import Generator
-
-from pandas._config.config import options
-
-
-@contextmanager
-def set_locale(
- new_locale: str | tuple[str, str], lc_var: int = locale.LC_ALL
-) -> Generator[str | tuple[str, str], None, None]:
- """
- Context manager for temporarily setting a locale.
-
- Parameters
- ----------
- new_locale : str or tuple
- A string of the form <language_country>.<encoding>. For example to set
- the current locale to US English with a UTF8 encoding, you would pass
- "en_US.UTF-8".
- lc_var : int, default `locale.LC_ALL`
- The category of the locale being set.
-
- Notes
- -----
- This is useful when you want to run a particular block of code under a
- particular locale, without globally setting the locale. This probably isn't
- thread-safe.
- """
- # getlocale is not always compliant with setlocale, use setlocale. GH#46595
- current_locale = locale.setlocale(lc_var)
-
- try:
- locale.setlocale(lc_var, new_locale)
- normalized_code, normalized_encoding = locale.getlocale()
- if normalized_code is not None and normalized_encoding is not None:
- yield f"{normalized_code}.{normalized_encoding}"
- else:
- yield new_locale
- finally:
- locale.setlocale(lc_var, current_locale)
-
-
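-# Illustrative sketch, not part of the original module: "de_DE.UTF-8" is an
-# assumed locale name that may not be installed, so the example leans on the
-# can_set_locale helper defined just below. Never called here.
-def _illustrate_set_locale() -> None:
-    import datetime
-
-    if can_set_locale("de_DE.UTF-8"):
-        with set_locale("de_DE.UTF-8", locale.LC_TIME):
-            # month names now follow the German locale, e.g. "Januar"
-            print(datetime.date(2005, 1, 20).strftime("%B"))
-
-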
-def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool:
- """
- Check to see if we can set a locale, and subsequently get the locale,
- without raising an Exception.
-
- Parameters
- ----------
- lc : str
- The locale to attempt to set.
- lc_var : int, default `locale.LC_ALL`
- The category of the locale being set.
-
- Returns
- -------
- bool
- Whether the passed locale can be set
- """
- try:
- with set_locale(lc, lc_var=lc_var):
- pass
- except (ValueError, locale.Error):
-        # horrible name for an Exception subclass
- return False
- else:
- return True
-
-
-def _valid_locales(locales: list[str] | str, normalize: bool) -> list[str]:
- """
- Return a list of normalized locales that do not throw an ``Exception``
- when set.
-
- Parameters
- ----------
- locales : str
- A string where each locale is separated by a newline.
- normalize : bool
- Whether to call ``locale.normalize`` on each locale.
-
- Returns
- -------
- valid_locales : list
- A list of valid locales.
- """
- return [
- loc
- for loc in (
- locale.normalize(loc.strip()) if normalize else loc.strip()
- for loc in locales
- )
- if can_set_locale(loc)
- ]
-
-
-def get_locales(
- prefix: str | None = None,
- normalize: bool = True,
-) -> list[str]:
- """
- Get all the locales that are available on the system.
-
- Parameters
- ----------
- prefix : str
- If not ``None`` then return only those locales with the prefix
- provided. For example to get all English language locales (those that
- start with ``"en"``), pass ``prefix="en"``.
- normalize : bool
- Call ``locale.normalize`` on the resulting list of available locales.
- If ``True``, only locales that can be set without throwing an
- ``Exception`` are returned.
-
- Returns
- -------
- locales : list of strings
- A list of locale strings that can be set with ``locale.setlocale()``.
- For example::
-
- locale.setlocale(locale.LC_ALL, locale_string)
-
- On error will return an empty list (no locale available, e.g. Windows)
-
- """
- if platform.system() in ("Linux", "Darwin"):
- raw_locales = subprocess.check_output(["locale", "-a"])
- else:
-        # Other platforms, e.g. Windows, don't define "locale -a"
- # Note: is_platform_windows causes circular import here
- return []
-
- try:
- # raw_locales is "\n" separated list of locales
- # it may contain non-decodable parts, so split
- # extract what we can and then rejoin.
- split_raw_locales = raw_locales.split(b"\n")
- out_locales = []
- for x in split_raw_locales:
- try:
- out_locales.append(str(x, encoding=options.display.encoding))
- except UnicodeError:
-                # 'locale -a' is used to populate 'raw_locales' and on
- # Redhat 7 Linux (and maybe others) prints locale names
- # using windows-1252 encoding. Bug only triggered by
- # a few special characters and when there is an
- # extensive list of installed locales.
- out_locales.append(str(x, encoding="windows-1252"))
-
- except TypeError:
- pass
-
- if prefix is None:
- return _valid_locales(out_locales, normalize)
-
- pattern = re.compile(f"{prefix}.*")
- found = pattern.findall("\n".join(out_locales))
- return _valid_locales(found, normalize)
diff --git a/contrib/python/pandas/py3/pandas/_libs/__init__.py b/contrib/python/pandas/py3/pandas/_libs/__init__.py
deleted file mode 100644
index f119e280f58..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-__all__ = [
- "NaT",
- "NaTType",
- "OutOfBoundsDatetime",
- "Period",
- "Timedelta",
- "Timestamp",
- "iNaT",
- "Interval",
-]
-
-
-from pandas._libs.interval import Interval
-from pandas._libs.tslibs import (
- NaT,
- NaTType,
- OutOfBoundsDatetime,
- Period,
- Timedelta,
- Timestamp,
- iNaT,
-)
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos.pxd b/contrib/python/pandas/py3/pandas/_libs/algos.pxd
deleted file mode 100644
index c3b83b9bd40..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/algos.pxd
+++ /dev/null
@@ -1,22 +0,0 @@
-from pandas._libs.dtypes cimport (
- numeric_object_t,
- numeric_t,
-)
-
-
-cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil
-
-cdef enum TiebreakEnumType:
- TIEBREAK_AVERAGE
- TIEBREAK_MIN,
- TIEBREAK_MAX
- TIEBREAK_FIRST
- TIEBREAK_FIRST_DESCENDING
- TIEBREAK_DENSE
-
-
-cdef numeric_object_t get_rank_nan_fill_val(
- bint rank_nans_highest,
- numeric_object_t val,
- bint is_datetimelike=*,
-)
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos.pyi b/contrib/python/pandas/py3/pandas/_libs/algos.pyi
deleted file mode 100644
index 20a805533e8..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/algos.pyi
+++ /dev/null
@@ -1,420 +0,0 @@
-from typing import Any
-
-import numpy as np
-
-from pandas._typing import npt
-
-class Infinity:
- """
- Provide a positive Infinity comparison method for ranking.
- """
-
- def __eq__(self, other) -> bool: ...
- def __ne__(self, other) -> bool: ...
- def __lt__(self, other) -> bool: ...
- def __le__(self, other) -> bool: ...
- def __gt__(self, other) -> bool: ...
- def __ge__(self, other) -> bool: ...
-
-class NegInfinity:
- """
- Provide a negative Infinity comparison method for ranking.
- """
-
- def __eq__(self, other) -> bool: ...
- def __ne__(self, other) -> bool: ...
- def __lt__(self, other) -> bool: ...
- def __le__(self, other) -> bool: ...
- def __gt__(self, other) -> bool: ...
- def __ge__(self, other) -> bool: ...
-
-def unique_deltas(
- arr: np.ndarray, # const int64_t[:]
-) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1]
-def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ...
-def groupsort_indexer(
- index: np.ndarray, # const int64_t[:]
- ngroups: int,
-) -> tuple[
- np.ndarray, # ndarray[int64_t, ndim=1]
- np.ndarray, # ndarray[int64_t, ndim=1]
-]: ...
-def kth_smallest(
- arr: np.ndarray, # numeric[:]
- k: int,
-) -> Any: ... # numeric
-
-# ----------------------------------------------------------------------
-# Pairwise correlation/covariance
-
-def nancorr(
- mat: npt.NDArray[np.float64], # const float64_t[:, :]
- cov: bool = ...,
- minp: int | None = ...,
-) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2]
-def nancorr_spearman(
- mat: npt.NDArray[np.float64], # ndarray[float64_t, ndim=2]
- minp: int = ...,
-) -> npt.NDArray[np.float64]: ... # ndarray[float64_t, ndim=2]
-
-# ----------------------------------------------------------------------
-
-def validate_limit(nobs: int | None, limit=...) -> int: ...
-def pad(
- old: np.ndarray, # ndarray[numeric_object_t]
- new: np.ndarray, # ndarray[numeric_object_t]
- limit=...,
-) -> npt.NDArray[np.intp]: ... # np.ndarray[np.intp, ndim=1]
-def pad_inplace(
- values: np.ndarray, # numeric_object_t[:]
- mask: np.ndarray, # uint8_t[:]
- limit=...,
-) -> None: ...
-def pad_2d_inplace(
- values: np.ndarray, # numeric_object_t[:, :]
- mask: np.ndarray, # const uint8_t[:, :]
- limit=...,
-) -> None: ...
-def backfill(
- old: np.ndarray, # ndarray[numeric_object_t]
- new: np.ndarray, # ndarray[numeric_object_t]
- limit=...,
-) -> npt.NDArray[np.intp]: ... # np.ndarray[np.intp, ndim=1]
-def backfill_inplace(
- values: np.ndarray, # numeric_object_t[:]
- mask: np.ndarray, # uint8_t[:]
- limit=...,
-) -> None: ...
-def backfill_2d_inplace(
- values: np.ndarray, # numeric_object_t[:, :]
- mask: np.ndarray, # const uint8_t[:, :]
- limit=...,
-) -> None: ...
-def is_monotonic(
- arr: np.ndarray, # ndarray[numeric_object_t, ndim=1]
- timelike: bool,
-) -> tuple[bool, bool, bool]: ...
-
-# ----------------------------------------------------------------------
-# rank_1d, rank_2d
-# ----------------------------------------------------------------------
-
-def rank_1d(
- values: np.ndarray, # ndarray[numeric_object_t, ndim=1]
- labels: np.ndarray | None = ..., # const int64_t[:]=None
- is_datetimelike: bool = ...,
- ties_method=...,
- ascending: bool = ...,
- pct: bool = ...,
- na_option=...,
- mask: npt.NDArray[np.bool_] | None = ...,
-) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1]
-def rank_2d(
- in_arr: np.ndarray, # ndarray[numeric_object_t, ndim=2]
- axis: int = ...,
- is_datetimelike: bool = ...,
- ties_method=...,
- ascending: bool = ...,
- na_option=...,
- pct: bool = ...,
-) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1]
-def diff_2d(
- arr: np.ndarray, # ndarray[diff_t, ndim=2]
- out: np.ndarray, # ndarray[out_t, ndim=2]
- periods: int,
- axis: int,
- datetimelike: bool = ...,
-) -> None: ...
-def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ...
-def ensure_object(arr: object) -> npt.NDArray[np.object_]: ...
-def ensure_float64(arr: object) -> npt.NDArray[np.float64]: ...
-def ensure_int8(arr: object) -> npt.NDArray[np.int8]: ...
-def ensure_int16(arr: object) -> npt.NDArray[np.int16]: ...
-def ensure_int32(arr: object) -> npt.NDArray[np.int32]: ...
-def ensure_int64(arr: object) -> npt.NDArray[np.int64]: ...
-def ensure_uint64(arr: object) -> npt.NDArray[np.uint64]: ...
-def take_1d_int8_int8(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int8_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int8_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int8_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int16_int16(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int16_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int16_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int16_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int32_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int32_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int32_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int64_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_int64_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_float32_float32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_float32_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_float64_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_object_object(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_bool_bool(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_1d_bool_object(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int8_int8(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int8_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int8_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int8_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int16_int16(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int16_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int16_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int16_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int32_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int32_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int32_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int64_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_int64_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_float32_float32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_float32_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_float64_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_object_object(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_bool_bool(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis0_bool_object(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int8_int8(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int8_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int8_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int8_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int16_int16(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int16_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int16_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int16_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int32_int32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int32_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int32_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int64_int64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_int64_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_float32_float32(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_float32_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_float64_float64(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_object_object(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_bool_bool(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_axis1_bool_object(
- values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
-) -> None: ...
-def take_2d_multi_int8_int8(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int8_int32(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int8_int64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int8_float64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int16_int16(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int16_int32(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int16_int64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int16_float64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int32_int32(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int32_int64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int32_float64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int64_float64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_float32_float32(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_float32_float64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_float64_float64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_object_object(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_bool_bool(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_bool_object(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
-def take_2d_multi_int64_int64(
- values: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value=...,
-) -> None: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos.pyx b/contrib/python/pandas/py3/pandas/_libs/algos.pyx
deleted file mode 100644
index 1f701a871ab..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/algos.pyx
+++ /dev/null
@@ -1,1536 +0,0 @@
-cimport cython
-from cython cimport Py_ssize_t
-from libc.math cimport (
- fabs,
- sqrt,
-)
-from libc.stdlib cimport (
- free,
- malloc,
-)
-from libc.string cimport memmove
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- NPY_FLOAT64,
- NPY_INT8,
- NPY_INT16,
- NPY_INT32,
- NPY_INT64,
- NPY_OBJECT,
- NPY_UINT64,
- float32_t,
- float64_t,
- int8_t,
- int16_t,
- int32_t,
- int64_t,
- intp_t,
- ndarray,
- uint8_t,
- uint16_t,
- uint32_t,
- uint64_t,
-)
-
-cnp.import_array()
-
-cimport pandas._libs.util as util
-from pandas._libs.dtypes cimport (
- numeric_object_t,
- numeric_t,
-)
-from pandas._libs.khash cimport (
- kh_destroy_int64,
- kh_get_int64,
- kh_init_int64,
- kh_int64_t,
- kh_put_int64,
- kh_resize_int64,
- khiter_t,
-)
-from pandas._libs.util cimport get_nat
-
-import pandas._libs.missing as missing
-
-cdef:
- float64_t FP_ERR = 1e-13
- float64_t NaN = <float64_t>np.NaN
- int64_t NPY_NAT = get_nat()
-
-
-tiebreakers = {
- "average": TIEBREAK_AVERAGE,
- "min": TIEBREAK_MIN,
- "max": TIEBREAK_MAX,
- "first": TIEBREAK_FIRST,
- "dense": TIEBREAK_DENSE,
-}
-
-
-cdef bint are_diff(object left, object right):
- try:
- return fabs(left - right) > FP_ERR
- except TypeError:
- return left != right
-
-
-class Infinity:
- """
- Provide a positive Infinity comparison method for ranking.
- """
- def __lt__(self, other):
- return False
-
- def __le__(self, other):
- return isinstance(other, Infinity)
-
- def __eq__(self, other):
- return isinstance(other, Infinity)
-
- def __ne__(self, other):
- return not isinstance(other, Infinity)
-
- def __gt__(self, other):
- return (not isinstance(other, Infinity) and
- not missing.checknull(other))
-
- def __ge__(self, other):
- return not missing.checknull(other)
-
-
-class NegInfinity:
- """
- Provide a negative Infinity comparison method for ranking.
- """
- def __lt__(self, other):
- return (not isinstance(other, NegInfinity) and
- not missing.checknull(other))
-
- def __le__(self, other):
- return not missing.checknull(other)
-
- def __eq__(self, other):
- return isinstance(other, NegInfinity)
-
- def __ne__(self, other):
- return not isinstance(other, NegInfinity)
-
- def __gt__(self, other):
- return False
-
- def __ge__(self, other):
- return isinstance(other, NegInfinity)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr):
- """
- Efficiently find the unique first-differences of the given array.
-
- Parameters
- ----------
- arr : ndarray[int64_t]
-
- Returns
- -------
- ndarray[int64_t]
- An ordered ndarray[int64_t]
- """
- cdef:
- Py_ssize_t i, n = len(arr)
- int64_t val
- khiter_t k
- kh_int64_t *table
- int ret = 0
- list uniques = []
- ndarray[int64_t, ndim=1] result
-
- table = kh_init_int64()
- kh_resize_int64(table, 10)
- for i in range(n - 1):
- val = arr[i + 1] - arr[i]
- k = kh_get_int64(table, val)
- if k == table.n_buckets:
- kh_put_int64(table, val, &ret)
- uniques.append(val)
- kh_destroy_int64(table)
-
- result = np.array(uniques, dtype=np.int64)
- result.sort()
- return result
-
-
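In plain NumPy terms the routine above is roughly np.unique applied to the first differences; the khash table only avoids materialising intermediate arrays. A small illustrative sketch:

    import numpy as np

    arr = np.array([1, 3, 5, 6, 8], dtype=np.int64)
    deltas = np.unique(np.diff(arr))   # -> array([1, 2]); rough equivalent of unique_deltas(arr)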
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_lexsorted(list_of_arrays: list) -> bool:
- cdef:
- Py_ssize_t i
- Py_ssize_t n, nlevels
- int64_t k, cur, pre
- ndarray arr
- bint result = True
-
- nlevels = len(list_of_arrays)
- n = len(list_of_arrays[0])
-
- cdef int64_t **vecs = <int64_t**>malloc(nlevels * sizeof(int64_t*))
- for i in range(nlevels):
- arr = list_of_arrays[i]
- assert arr.dtype.name == "int64"
- vecs[i] = <int64_t*>cnp.PyArray_DATA(arr)
-
- # Assume uniqueness??
- with nogil:
- for i in range(1, n):
- for k in range(nlevels):
- cur = vecs[k][i]
- pre = vecs[k][i -1]
- if cur == pre:
- continue
- elif cur > pre:
- break
- else:
- result = False
- break
- if not result:
- break
- free(vecs)
- return result
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
- """
- Compute a 1-d indexer.
-
- The indexer is an ordering of the passed index,
- ordered by the groups.
-
- Parameters
- ----------
-    index : np.ndarray[np.intp]
-        Mappings from group -> position.
-    ngroups : int64
-        Number of groups.
-
- Returns
- -------
- ndarray[intp_t, ndim=1]
- Indexer
- ndarray[intp_t, ndim=1]
- Group Counts
-
- Notes
- -----
- This is a reverse of the label factorization process.
- """
- cdef:
- Py_ssize_t i, label, n
- intp_t[::1] indexer, where, counts
-
- counts = np.zeros(ngroups + 1, dtype=np.intp)
- n = len(index)
- indexer = np.zeros(n, dtype=np.intp)
- where = np.zeros(ngroups + 1, dtype=np.intp)
-
- with nogil:
-
- # count group sizes, location 0 for NA
- for i in range(n):
- counts[index[i] + 1] += 1
-
- # mark the start of each contiguous group of like-indexed data
- for i in range(1, ngroups + 1):
- where[i] = where[i - 1] + counts[i - 1]
-
- # this is our indexer
- for i in range(n):
- label = index[i] + 1
- indexer[where[label]] = i
- where[label] += 1
-
- return indexer.base, counts.base
-
-
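A rough NumPy sketch of what groupsort_indexer returns: a stable sort of positions by group label (label -1, i.e. NA, counted in slot 0) plus the per-group counts. Illustrative only:

    import numpy as np

    index = np.array([1, 0, 1, -1, 0], dtype=np.intp)        # group label per row, -1 == NA
    ngroups = 2
    counts = np.bincount(index + 1, minlength=ngroups + 1)   # [1, 2, 2]: NA, group 0, group 1
    indexer = np.argsort(index, kind="stable")               # [3, 1, 4, 0, 2]
    # taking rows in `indexer` order makes each group contiguous: NA, then group 0, then group 1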
-cdef Py_ssize_t swap(numeric_t *a, numeric_t *b) nogil:
- cdef:
- numeric_t t
-
- # cython doesn't allow pointer dereference so use array syntax
- t = a[0]
- a[0] = b[0]
- b[0] = t
- return 0
-
-
-cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil:
- """
- See kth_smallest.__doc__. The additional parameter n specifies the maximum
- number of elements considered in arr, needed for compatibility with usage
- in groupby.pyx
- """
- cdef:
- Py_ssize_t i, j, left, m
- numeric_t x
-
- left = 0
- m = n - 1
-
- while left < m:
- x = arr[k]
- i = left
- j = m
-
- while 1:
- while arr[i] < x:
- i += 1
- while x < arr[j]:
- j -= 1
- if i <= j:
- swap(&arr[i], &arr[j])
- i += 1
- j -= 1
-
- if i > j:
- break
-
- if j < k:
- left = i
- if k < i:
- m = j
- return arr[k]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t:
- """
- Compute the kth smallest value in arr. Note that the input
- array will be modified.
-
- Parameters
- ----------
- arr : numeric[::1]
- Array to compute the kth smallest value for, must be
- contiguous
- k : Py_ssize_t
-
- Returns
- -------
- numeric
- The kth smallest value in arr
- """
- cdef:
- numeric_t result
-
- with nogil:
- result = kth_smallest_c(&arr[0], k, arr.shape[0])
-
- return result
-
-
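For reference, the selection above behaves like np.partition on a copy of the data (both return the value of rank k without fully sorting); a tiny sketch:

    import numpy as np

    arr = np.array([7.0, 2.0, 9.0, 4.0, 1.0])
    k = 2
    # kth_smallest(arr, k) returns the k-th smallest value (0-based), here 4.0,
    # and partially reorders arr in place; np.partition gives the same value:
    assert np.partition(arr, k)[k] == 4.0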
-# ----------------------------------------------------------------------
-# Pairwise correlation/covariance
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-@cython.cdivision(True)
-def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
- cdef:
- Py_ssize_t i, xi, yi, N, K
- bint minpv
- float64_t[:, ::1] result
- ndarray[uint8_t, ndim=2] mask
- int64_t nobs = 0
- float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
-
- N, K = (<object>mat).shape
-
- if minp is None:
- minpv = 1
- else:
- minpv = <int>minp
-
- result = np.empty((K, K), dtype=np.float64)
- mask = np.isfinite(mat).view(np.uint8)
-
- with nogil:
- for xi in range(K):
- for yi in range(xi + 1):
- # Welford's method for the variance-calculation
- # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
- for i in range(N):
- if mask[i, xi] and mask[i, yi]:
- vx = mat[i, xi]
- vy = mat[i, yi]
- nobs += 1
- dx = vx - meanx
- dy = vy - meany
- meanx += 1. / nobs * dx
- meany += 1. / nobs * dy
- ssqdmx += (vx - meanx) * dx
- ssqdmy += (vy - meany) * dy
- covxy += (vx - meanx) * dy
-
- if nobs < minpv:
- result[xi, yi] = result[yi, xi] = NaN
- else:
- divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
-
- if divisor != 0:
- result[xi, yi] = result[yi, xi] = covxy / divisor
- else:
- result[xi, yi] = result[yi, xi] = NaN
-
- return result.base
-
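The inner loop is a streaming (Welford-style) update of the means and co-/variance sums; a short Python rendering of one update step, reusing the loop's variable names (illustrative sketch only):

    def welford_step(nobs, meanx, meany, ssqdmx, ssqdmy, covxy, vx, vy):
        # fold one pairwise-complete observation (vx, vy) into the running sums
        nobs += 1
        dx = vx - meanx
        dy = vy - meany
        meanx += dx / nobs
        meany += dy / nobs
        ssqdmx += (vx - meanx) * dx   # sum of squared deviations of x
        ssqdmy += (vy - meany) * dy   # sum of squared deviations of y
        covxy += (vx - meanx) * dy    # co-deviation sum
        return nobs, meanx, meany, ssqdmx, ssqdmy, covxy

    # afterwards: covariance = covxy / (nobs - 1); correlation = covxy / sqrt(ssqdmx * ssqdmy)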
-# ----------------------------------------------------------------------
-# Pairwise Spearman correlation
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
- cdef:
- Py_ssize_t i, xi, yi, N, K
- ndarray[float64_t, ndim=2] result
- ndarray[float64_t, ndim=2] ranked_mat
- ndarray[float64_t, ndim=1] rankedx, rankedy
- float64_t[::1] maskedx, maskedy
- ndarray[uint8_t, ndim=2] mask
- int64_t nobs = 0
- bint no_nans
- float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
-
- N, K = (<object>mat).shape
-
- # Handle the edge case where we know all results will be nan
- # to keep conditional logic inside loop simpler
- if N < minp:
- result = np.full((K, K), np.nan, dtype=np.float64)
- return result
-
- result = np.empty((K, K), dtype=np.float64)
- mask = np.isfinite(mat).view(np.uint8)
- no_nans = mask.all()
-
- ranked_mat = np.empty((N, K), dtype=np.float64)
-
- # Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe
- # here since N >= nobs and values are stored contiguously
- maskedx = np.empty(N, dtype=np.float64)
- maskedy = np.empty(N, dtype=np.float64)
- for i in range(K):
- ranked_mat[:, i] = rank_1d(mat[:, i])
-
- with nogil:
- for xi in range(K):
- for yi in range(xi + 1):
- sumx = sumxx = sumyy = 0
-
- # Fastpath for data with no nans/infs, allows avoiding mask checks
- # and array reassignments
- if no_nans:
- mean = (N + 1) / 2.
-
- # now the cov numerator
- for i in range(N):
- vx = ranked_mat[i, xi] - mean
- vy = ranked_mat[i, yi] - mean
-
- sumx += vx * vy
- sumxx += vx * vx
- sumyy += vy * vy
- else:
- nobs = 0
- # Keep track of whether we need to recompute ranks
- all_ranks = True
- for i in range(N):
- all_ranks &= not (mask[i, xi] ^ mask[i, yi])
- if mask[i, xi] and mask[i, yi]:
- maskedx[nobs] = ranked_mat[i, xi]
- maskedy[nobs] = ranked_mat[i, yi]
- nobs += 1
-
- if nobs < minp:
- result[xi, yi] = result[yi, xi] = NaN
- continue
- else:
- if not all_ranks:
- with gil:
- # We need to slice back to nobs because rank_1d will
- # require arrays of nobs length
- rankedx = rank_1d(np.asarray(maskedx)[:nobs])
- rankedy = rank_1d(np.asarray(maskedy)[:nobs])
- for i in range(nobs):
- maskedx[i] = rankedx[i]
- maskedy[i] = rankedy[i]
-
- mean = (nobs + 1) / 2.
-
- # now the cov numerator
- for i in range(nobs):
- vx = maskedx[i] - mean
- vy = maskedy[i] - mean
-
- sumx += vx * vy
- sumxx += vx * vx
- sumyy += vy * vy
-
- divisor = sqrt(sumxx * sumyy)
-
- if divisor != 0:
- result[xi, yi] = result[yi, xi] = sumx / divisor
- else:
- result[xi, yi] = result[yi, xi] = NaN
-
- return result
-
-
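Conceptually this computes the Spearman correlation pairwise: Pearson correlation of average ranks over the pairwise-complete rows of each column pair. A hedged sketch of that definition (scipy is used here purely for illustration; it is not a dependency of the Cython code):

    import numpy as np
    from scipy.stats import rankdata   # average ranks, matching the ranking above

    x = np.array([1.0, 4.0, 2.0, np.nan, 3.0])
    y = np.array([2.0, 5.0, 1.0, 0.0, np.nan])
    ok = np.isfinite(x) & np.isfinite(y)        # pairwise-complete rows
    rx, ry = rankdata(x[ok]), rankdata(y[ok])
    rho = np.corrcoef(rx, ry)[0, 1]             # Spearman rho for this column pair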
-# ----------------------------------------------------------------------
-
-def validate_limit(nobs: int | None, limit=None) -> int:
- """
- Check that the `limit` argument is a positive integer.
-
- Parameters
- ----------
-    nobs : int or None
- limit : object
-
- Returns
- -------
- int
- The limit.
- """
- if limit is None:
- lim = nobs
- else:
- if not util.is_integer_object(limit):
- raise ValueError("Limit must be an integer")
- if limit < 1:
- raise ValueError("Limit must be greater than 0")
- lim = limit
-
- return lim
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad(
- ndarray[numeric_object_t] old,
- ndarray[numeric_object_t] new,
- limit=None
-) -> ndarray:
- # -> ndarray[intp_t, ndim=1]
- cdef:
- Py_ssize_t i, j, nleft, nright
- ndarray[intp_t, ndim=1] indexer
- numeric_object_t cur, next_val
- int lim, fill_count = 0
-
- nleft = len(old)
- nright = len(new)
- indexer = np.empty(nright, dtype=np.intp)
- indexer[:] = -1
-
- lim = validate_limit(nright, limit)
-
- if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
- return indexer
-
- i = j = 0
-
- cur = old[0]
-
- while j <= nright - 1 and new[j] < cur:
- j += 1
-
- while True:
- if j == nright:
- break
-
- if i == nleft - 1:
- while j < nright:
- if new[j] == cur:
- indexer[j] = i
- elif new[j] > cur and fill_count < lim:
- indexer[j] = i
- fill_count += 1
- j += 1
- break
-
- next_val = old[i + 1]
-
- while j < nright and cur <= new[j] < next_val:
- if new[j] == cur:
- indexer[j] = i
- elif fill_count < lim:
- indexer[j] = i
- fill_count += 1
- j += 1
-
- fill_count = 0
- i += 1
- cur = next_val
-
- return indexer
-
-
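For sorted old/new arrays, and ignoring the limit handling, the forward-fill indexer above is roughly a searchsorted lookup: each element of new maps to the last element of old that is <= it, or -1. Sketch:

    import numpy as np

    old = np.array([1, 5, 10])
    new = np.array([0, 1, 4, 5, 7, 12])
    indexer = np.searchsorted(old, new, side="right") - 1   # -> [-1, 0, 0, 1, 1, 2]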
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
- cdef:
- Py_ssize_t i, N
- numeric_object_t val
- uint8_t prev_mask
- int lim, fill_count = 0
-
- N = len(values)
-
- # GH#2778
- if N == 0:
- return
-
- lim = validate_limit(N, limit)
-
- val = values[0]
- prev_mask = mask[0]
- for i in range(N):
- if mask[i]:
- if fill_count >= lim:
- continue
- fill_count += 1
- values[i] = val
- mask[i] = prev_mask
- else:
- fill_count = 0
- val = values[i]
- prev_mask = mask[i]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None):
- cdef:
- Py_ssize_t i, j, N, K
- numeric_object_t val
- int lim, fill_count = 0
-
- K, N = (<object>values).shape
-
- # GH#2778
- if N == 0:
- return
-
- lim = validate_limit(N, limit)
-
- for j in range(K):
- fill_count = 0
- val = values[j, 0]
- for i in range(N):
- if mask[j, i]:
- if fill_count >= lim or i == 0:
- continue
- fill_count += 1
- values[j, i] = val
- mask[j, i] = False
- else:
- fill_count = 0
- val = values[j, i]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill(
- ndarray[numeric_object_t] old,
- ndarray[numeric_object_t] new,
- limit=None
-) -> ndarray: # -> ndarray[intp_t, ndim=1]
- """
- Backfilling logic for generating fill vector
-
- Diagram of what's going on
-
- Old New Fill vector Mask
- . 0 1
- . 0 1
- . 0 1
- A A 0 1
- . 1 1
- . 1 1
- . 1 1
- . 1 1
- . 1 1
- B B 1 1
- . 2 1
- . 2 1
- . 2 1
- C C 2 1
- . 0
- . 0
- D
- """
- cdef:
- Py_ssize_t i, j, nleft, nright
- ndarray[intp_t, ndim=1] indexer
- numeric_object_t cur, prev
- int lim, fill_count = 0
-
- nleft = len(old)
- nright = len(new)
- indexer = np.empty(nright, dtype=np.intp)
- indexer[:] = -1
-
- lim = validate_limit(nright, limit)
-
- if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
- return indexer
-
- i = nleft - 1
- j = nright - 1
-
- cur = old[nleft - 1]
-
- while j >= 0 and new[j] > cur:
- j -= 1
-
- while True:
- if j < 0:
- break
-
- if i == 0:
- while j >= 0:
- if new[j] == cur:
- indexer[j] = i
- elif new[j] < cur and fill_count < lim:
- indexer[j] = i
- fill_count += 1
- j -= 1
- break
-
- prev = old[i - 1]
-
- while j >= 0 and prev < new[j] <= cur:
- if new[j] == cur:
- indexer[j] = i
- elif new[j] < cur and fill_count < lim:
- indexer[j] = i
- fill_count += 1
- j -= 1
-
- fill_count = 0
- i -= 1
- cur = prev
-
- return indexer
-
-
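The mirrored back-fill case (again ignoring limit) maps each element of new to the first element of old that is >= it, or -1 when there is none. Sketch:

    import numpy as np

    old = np.array([1, 5, 10])
    new = np.array([0, 1, 4, 5, 7, 12])
    idx = np.searchsorted(old, new, side="left")    # [0, 0, 1, 1, 2, 3]
    indexer = np.where(idx < len(old), idx, -1)     # -> [0, 0, 1, 1, 2, -1]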
-def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
- pad_inplace(values[::-1], mask[::-1], limit=limit)
-
-
-def backfill_2d_inplace(numeric_object_t[:, :] values,
- uint8_t[:, :] mask,
- limit=None):
- pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike):
- """
- Returns
- -------
- tuple
- is_monotonic_inc : bool
- is_monotonic_dec : bool
- is_unique : bool
- """
- cdef:
- Py_ssize_t i, n
- numeric_object_t prev, cur
- bint is_monotonic_inc = 1
- bint is_monotonic_dec = 1
- bint is_unique = 1
- bint is_strict_monotonic = 1
-
- n = len(arr)
-
- if n == 1:
- if arr[0] != arr[0] or (numeric_object_t is int64_t and timelike and
- arr[0] == NPY_NAT):
- # single value is NaN
- return False, False, True
- else:
- return True, True, True
- elif n < 2:
- return True, True, True
-
- if timelike and <int64_t>arr[0] == NPY_NAT:
- return False, False, True
-
- if numeric_object_t is not object:
- with nogil:
- prev = arr[0]
- for i in range(1, n):
- cur = arr[i]
- if timelike and <int64_t>cur == NPY_NAT:
- is_monotonic_inc = 0
- is_monotonic_dec = 0
- break
- if cur < prev:
- is_monotonic_inc = 0
- elif cur > prev:
- is_monotonic_dec = 0
- elif cur == prev:
- is_unique = 0
- else:
- # cur or prev is NaN
- is_monotonic_inc = 0
- is_monotonic_dec = 0
- break
- if not is_monotonic_inc and not is_monotonic_dec:
- is_monotonic_inc = 0
- is_monotonic_dec = 0
- break
- prev = cur
- else:
- # object-dtype, identical to above except we cannot use `with nogil`
- prev = arr[0]
- for i in range(1, n):
- cur = arr[i]
- if timelike and <int64_t>cur == NPY_NAT:
- is_monotonic_inc = 0
- is_monotonic_dec = 0
- break
- if cur < prev:
- is_monotonic_inc = 0
- elif cur > prev:
- is_monotonic_dec = 0
- elif cur == prev:
- is_unique = 0
- else:
- # cur or prev is NaN
- is_monotonic_inc = 0
- is_monotonic_dec = 0
- break
- if not is_monotonic_inc and not is_monotonic_dec:
- is_monotonic_inc = 0
- is_monotonic_dec = 0
- break
- prev = cur
-
- is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec)
- return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic
-
-
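Setting aside the NaN/NaT handling, the three returned flags reduce to pairwise comparisons of adjacent elements; an illustrative NumPy sketch:

    import numpy as np

    arr = np.array([1, 2, 2, 3])
    inc = bool(np.all(arr[1:] >= arr[:-1]))      # is_monotonic_inc       -> True
    dec = bool(np.all(arr[1:] <= arr[:-1]))      # is_monotonic_dec       -> False
    unique = bool(np.all(arr[1:] != arr[:-1]))   # no adjacent duplicates -> False
    strict = unique and (inc or dec)             # third returned flag    -> False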
-# ----------------------------------------------------------------------
-# rank_1d, rank_2d
-# ----------------------------------------------------------------------
-
-cdef numeric_object_t get_rank_nan_fill_val(
- bint rank_nans_highest,
- numeric_object_t val,
- bint is_datetimelike=False,
-):
- """
-    Return the value used to represent missing values when sorting, depending
-    on whether missing values should end up at the top or the bottom. (The second
-    parameter is unused, but needed for fused type specialization.)
- """
- if numeric_object_t is int64_t and is_datetimelike and not rank_nans_highest:
- return NPY_NAT + 1
-
- if rank_nans_highest:
- if numeric_object_t is object:
- return Infinity()
- elif numeric_object_t is int64_t:
- return util.INT64_MAX
- elif numeric_object_t is int32_t:
- return util.INT32_MAX
- elif numeric_object_t is int16_t:
- return util.INT16_MAX
- elif numeric_object_t is int8_t:
- return util.INT8_MAX
- elif numeric_object_t is uint64_t:
- return util.UINT64_MAX
- elif numeric_object_t is uint32_t:
- return util.UINT32_MAX
- elif numeric_object_t is uint16_t:
- return util.UINT16_MAX
- elif numeric_object_t is uint8_t:
- return util.UINT8_MAX
- else:
- return np.inf
- else:
- if numeric_object_t is object:
- return NegInfinity()
- elif numeric_object_t is int64_t:
- # Note(jbrockmendel) 2022-03-15 for reasons unknown, using util.INT64_MIN
- # instead of NPY_NAT here causes build warnings and failure in
- # test_cummax_i8_at_implementation_bound
- return NPY_NAT
- elif numeric_object_t is int32_t:
- return util.INT32_MIN
- elif numeric_object_t is int16_t:
- return util.INT16_MIN
- elif numeric_object_t is int8_t:
- return util.INT8_MIN
- elif numeric_object_t is uint64_t:
- return 0
- elif numeric_object_t is uint32_t:
- return 0
- elif numeric_object_t is uint16_t:
- return 0
- elif numeric_object_t is uint8_t:
- return 0
- else:
- return -np.inf
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def rank_1d(
- ndarray[numeric_object_t, ndim=1] values,
- const intp_t[:] labels=None,
- bint is_datetimelike=False,
- ties_method="average",
- bint ascending=True,
- bint pct=False,
- na_option="keep",
- const uint8_t[:] mask=None,
-):
- """
- Fast NaN-friendly version of ``scipy.stats.rankdata``.
-
- Parameters
- ----------
- values : array of numeric_object_t values to be ranked
- labels : np.ndarray[np.intp] or None
- Array containing unique label for each group, with its ordering
- matching up to the corresponding record in `values`. If not called
- from a groupby operation, will be None.
- is_datetimelike : bool, default False
- True if `values` contains datetime-like entries.
-    ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
-        * average: average rank of group
-        * min: lowest rank in group
-        * max: highest rank in group
-        * first: ranks assigned in the order they appear in the array
-        * dense: like 'min', but rank always increases by 1 between groups
-    ascending : bool, default True
-        False ranks from high (1) to low (N)
-    pct : bool, default False
-        Compute percentage rank of data within each group
-    na_option : {'keep', 'top', 'bottom'}, default 'keep'
-        * keep: leave NA values where they are
-        * top: smallest rank if ascending
-        * bottom: smallest rank if descending
- mask : np.ndarray[bool], optional, default None
- Specify locations to be treated as NA, for e.g. Categorical.
- """
- cdef:
- TiebreakEnumType tiebreak
- Py_ssize_t N
- int64_t[::1] grp_sizes
- intp_t[:] lexsort_indexer
- float64_t[::1] out
- ndarray[numeric_object_t, ndim=1] masked_vals
- numeric_object_t[:] masked_vals_memview
- bint keep_na, nans_rank_highest, check_labels, check_mask
- numeric_object_t nan_fill_val
-
- tiebreak = tiebreakers[ties_method]
- if tiebreak == TIEBREAK_FIRST:
- if not ascending:
- tiebreak = TIEBREAK_FIRST_DESCENDING
-
- keep_na = na_option == "keep"
-
- N = len(values)
- if labels is not None:
- # TODO(cython3): cast won't be necessary (#2992)
- assert <Py_ssize_t>len(labels) == N
- out = np.empty(N)
- grp_sizes = np.ones(N, dtype=np.int64)
-
- # If we don't care about labels, can short-circuit later label
- # comparisons
- check_labels = labels is not None
-
- # For cases where a mask is not possible, we can avoid mask checks
- check_mask = (
- numeric_object_t is float32_t
- or numeric_object_t is float64_t
- or numeric_object_t is object
- or (numeric_object_t is int64_t and is_datetimelike)
- )
- check_mask = check_mask or mask is not None
-
- # Copy values into new array in order to fill missing data
- # with mask, without obfuscating location of missing data
- # in values array
- if numeric_object_t is object and values.dtype != np.object_:
- masked_vals = values.astype("O")
- else:
- masked_vals = values.copy()
-
- if mask is not None:
- pass
- elif numeric_object_t is object:
- mask = missing.isnaobj(masked_vals)
- elif numeric_object_t is int64_t and is_datetimelike:
- mask = (masked_vals == NPY_NAT).astype(np.uint8)
- elif numeric_object_t is float64_t or numeric_object_t is float32_t:
- mask = np.isnan(masked_vals).astype(np.uint8)
- else:
- mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)
-
- # If `na_option == 'top'`, we want to assign the lowest rank
- # to NaN regardless of ascending/descending. So if ascending,
- # fill with lowest value of type to end up with lowest rank.
- # If descending, fill with highest value since descending
- # will flip the ordering to still end up with lowest rank.
- # Symmetric logic applies to `na_option == 'bottom'`
- nans_rank_highest = ascending ^ (na_option == "top")
- nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
- if nans_rank_highest:
- order = [masked_vals, mask]
- else:
- order = [masked_vals, ~(np.asarray(mask))]
-
- if check_labels:
- order.append(labels)
-
- np.putmask(masked_vals, mask, nan_fill_val)
- # putmask doesn't accept a memoryview, so we assign as a separate step
- masked_vals_memview = masked_vals
-
- # lexsort using labels, then mask, then actual values
- # each label corresponds to a different group value,
- # the mask helps you differentiate missing values before
- # performing sort on the actual values
- lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)
-
- if not ascending:
- lexsort_indexer = lexsort_indexer[::-1]
-
- with nogil:
- rank_sorted_1d(
- out,
- grp_sizes,
- lexsort_indexer,
- masked_vals_memview,
- mask,
- check_mask=check_mask,
- N=N,
- tiebreak=tiebreak,
- keep_na=keep_na,
- pct=pct,
- labels=labels,
- )
-
- return np.asarray(out)
-
-
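A worked example of the tie-break modes documented above, for values = [2, 2, 5, 4]; these should match pandas.Series([2, 2, 5, 4]).rank(method=...):

    expected = {
        "average": [1.5, 1.5, 4.0, 3.0],  # tied 2s share the mean of ranks 1 and 2
        "min":     [1.0, 1.0, 4.0, 3.0],
        "max":     [2.0, 2.0, 4.0, 3.0],
        "first":   [1.0, 2.0, 4.0, 3.0],  # ties broken by order of appearance
        "dense":   [1.0, 1.0, 3.0, 2.0],  # rank advances by exactly 1 per distinct value
    }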
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void rank_sorted_1d(
- float64_t[::1] out,
- int64_t[::1] grp_sizes,
- const intp_t[:] sort_indexer,
- # TODO(cython3): make const (https://github.com/cython/cython/issues/3222)
- numeric_object_t[:] masked_vals,
- const uint8_t[:] mask,
- bint check_mask,
- Py_ssize_t N,
- TiebreakEnumType tiebreak=TIEBREAK_AVERAGE,
- bint keep_na=True,
- bint pct=False,
- # https://github.com/cython/cython/issues/1630, only trailing arguments can
- # currently be omitted for cdef functions, which is why we keep this at the end
- const intp_t[:] labels=None,
-) nogil:
- """
- See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
- be handled in the caller. Note that `out` and `grp_sizes` are modified inplace.
-
- Parameters
- ----------
- out : float64_t[::1]
- Array to store computed ranks
- grp_sizes : int64_t[::1]
- Array to store group counts, only used if pct=True. Should only be None
- if labels is None.
- sort_indexer : intp_t[:]
- Array of indices which sorts masked_vals
- masked_vals : numeric_object_t[:]
- The values input to rank_1d, with missing values replaced by fill values
- mask : uint8_t[:]
- Array where entries are True if the value is missing, False otherwise.
- check_mask : bool
- If False, assumes the mask is all False to skip mask indexing
- N : Py_ssize_t
- The number of elements to rank. Note: it is not always true that
- N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
- tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE
- See rank_1d.__doc__ for the different modes
- keep_na : bool, default True
- Whether or not to keep nulls
- pct : bool, default False
- Compute percentage rank of data within each group
- labels : See rank_1d.__doc__, default None. None implies all labels are the same.
- """
-
- cdef:
- Py_ssize_t i, j, dups=0, sum_ranks=0,
- Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0
- bint at_end, next_val_diff, group_changed, check_labels
- int64_t grp_size
-
- check_labels = labels is not None
-
- # Loop over the length of the value array
- # each incremental i value can be looked up in the lexsort_indexer
- # array that we sorted previously, which gives us the location of
- # that sorted value for retrieval back from the original
- # values / masked_vals arrays
- # TODO(cython3): de-duplicate once cython supports conditional nogil
- if numeric_object_t is object:
- with gil:
- for i in range(N):
- at_end = i == N - 1
-
- # dups and sum_ranks will be incremented each loop where
- # the value / group remains the same, and should be reset
- # when either of those change. Used to calculate tiebreakers
- dups += 1
- sum_ranks += i - grp_start + 1
-
- next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
- masked_vals[sort_indexer[i+1]])
-
- # We'll need this check later anyway to determine group size, so just
- # compute it here since shortcircuiting won't help
- group_changed = at_end or (check_labels and
- (labels[sort_indexer[i]]
- != labels[sort_indexer[i+1]]))
-
- # Update out only when there is a transition of values or labels.
-                # When a new value or group is encountered, go back #dups steps
-                # (the number of occurrences of the current value) and assign the ranks
- # based on the starting index of the current group (grp_start)
- # and the current index
- if (next_val_diff or group_changed or (check_mask and
- (mask[sort_indexer[i]]
- ^ mask[sort_indexer[i+1]]))):
-
- # If keep_na, check for missing values and assign back
- # to the result where appropriate
- if keep_na and check_mask and mask[sort_indexer[i]]:
- grp_na_count = dups
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = NaN
- elif tiebreak == TIEBREAK_AVERAGE:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = sum_ranks / <float64_t>dups
- elif tiebreak == TIEBREAK_MIN:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = i - grp_start - dups + 2
- elif tiebreak == TIEBREAK_MAX:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = i - grp_start + 1
-
- # With n as the previous rank in the group and m as the number
- # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
- # then rankings should be n + 1, n + 2 ... n + m
- elif tiebreak == TIEBREAK_FIRST:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = j + 1 - grp_start
-
- # If TIEBREAK_FIRST and descending, the ranking should be
- # n + m, n + (m - 1) ... n + 1. This is equivalent to
- # (i - dups + 1) + (i - j + 1) - grp_start
- elif tiebreak == TIEBREAK_FIRST_DESCENDING:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
- elif tiebreak == TIEBREAK_DENSE:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = grp_vals_seen
-
- # Look forward to the next value (using the sorting in
- # lexsort_indexer). If the value does not equal the current
- # value then we need to reset the dups and sum_ranks, knowing
- # that a new value is coming up. The conditional also needs
- # to handle nan equality and the end of iteration. If group
- # changes we do not record seeing a new value in the group
- if not group_changed and (next_val_diff or (check_mask and
- (mask[sort_indexer[i]]
- ^ mask[sort_indexer[i+1]]))):
- dups = sum_ranks = 0
- grp_vals_seen += 1
-
- # Similar to the previous conditional, check now if we are
- # moving to a new group. If so, keep track of the index where
- # the new group occurs, so the tiebreaker calculations can
- # decrement that from their position. Fill in the size of each
- # group encountered (used by pct calculations later). Also be
- # sure to reset any of the items helping to calculate dups
- if group_changed:
-
- # If not dense tiebreak, group size used to compute
- # percentile will be # of non-null elements in group
- if tiebreak != TIEBREAK_DENSE:
- grp_size = i - grp_start + 1 - grp_na_count
-
- # Otherwise, it will be the number of distinct values
- # in the group, subtracting 1 if NaNs are present
- # since that is a distinct value we shouldn't count
- else:
- grp_size = grp_vals_seen - (grp_na_count > 0)
-
- for j in range(grp_start, i + 1):
- grp_sizes[sort_indexer[j]] = grp_size
-
- dups = sum_ranks = 0
- grp_na_count = 0
- grp_start = i + 1
- grp_vals_seen = 1
- else:
- for i in range(N):
- at_end = i == N - 1
-
- # dups and sum_ranks will be incremented each loop where
- # the value / group remains the same, and should be reset
- # when either of those change. Used to calculate tiebreakers
- dups += 1
- sum_ranks += i - grp_start + 1
-
- next_val_diff = at_end or (masked_vals[sort_indexer[i]]
- != masked_vals[sort_indexer[i+1]])
-
- # We'll need this check later anyway to determine group size, so just
- # compute it here since shortcircuiting won't help
- group_changed = at_end or (check_labels and
- (labels[sort_indexer[i]]
- != labels[sort_indexer[i+1]]))
-
- # Update out only when there is a transition of values or labels.
-            # When a new value or group is encountered, go back #dups steps
-            # (the number of occurrences of the current value) and assign the ranks
- # based on the starting index of the current group (grp_start)
- # and the current index
- if (next_val_diff or group_changed
- or (check_mask and
- (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))):
-
- # If keep_na, check for missing values and assign back
- # to the result where appropriate
- if keep_na and check_mask and mask[sort_indexer[i]]:
- grp_na_count = dups
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = NaN
- elif tiebreak == TIEBREAK_AVERAGE:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = sum_ranks / <float64_t>dups
- elif tiebreak == TIEBREAK_MIN:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = i - grp_start - dups + 2
- elif tiebreak == TIEBREAK_MAX:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = i - grp_start + 1
-
- # With n as the previous rank in the group and m as the number
- # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
- # then rankings should be n + 1, n + 2 ... n + m
- elif tiebreak == TIEBREAK_FIRST:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = j + 1 - grp_start
-
- # If TIEBREAK_FIRST and descending, the ranking should be
- # n + m, n + (m - 1) ... n + 1. This is equivalent to
- # (i - dups + 1) + (i - j + 1) - grp_start
- elif tiebreak == TIEBREAK_FIRST_DESCENDING:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
- elif tiebreak == TIEBREAK_DENSE:
- for j in range(i - dups + 1, i + 1):
- out[sort_indexer[j]] = grp_vals_seen
-
- # Look forward to the next value (using the sorting in
- # lexsort_indexer). If the value does not equal the current
- # value then we need to reset the dups and sum_ranks, knowing
- # that a new value is coming up. The conditional also needs
- # to handle nan equality and the end of iteration. If group
- # changes we do not record seeing a new value in the group
- if not group_changed and (next_val_diff
- or (check_mask and
- (mask[sort_indexer[i]]
- ^ mask[sort_indexer[i+1]]))):
- dups = sum_ranks = 0
- grp_vals_seen += 1
-
- # Similar to the previous conditional, check now if we are
- # moving to a new group. If so, keep track of the index where
- # the new group occurs, so the tiebreaker calculations can
- # decrement that from their position. Fill in the size of each
- # group encountered (used by pct calculations later). Also be
- # sure to reset any of the items helping to calculate dups
- if group_changed:
-
- # If not dense tiebreak, group size used to compute
- # percentile will be # of non-null elements in group
- if tiebreak != TIEBREAK_DENSE:
- grp_size = i - grp_start + 1 - grp_na_count
-
- # Otherwise, it will be the number of distinct values
- # in the group, subtracting 1 if NaNs are present
- # since that is a distinct value we shouldn't count
- else:
- grp_size = grp_vals_seen - (grp_na_count > 0)
-
- for j in range(grp_start, i + 1):
- grp_sizes[sort_indexer[j]] = grp_size
-
- dups = sum_ranks = 0
- grp_na_count = 0
- grp_start = i + 1
- grp_vals_seen = 1
-
- if pct:
- for i in range(N):
- if grp_sizes[i] != 0:
- out[i] = out[i] / grp_sizes[i]
-
-
-def rank_2d(
- ndarray[numeric_object_t, ndim=2] in_arr,
- int axis=0,
- bint is_datetimelike=False,
- ties_method="average",
- bint ascending=True,
- na_option="keep",
- bint pct=False,
-):
- """
- Fast NaN-friendly version of ``scipy.stats.rankdata``.
- """
- cdef:
- Py_ssize_t k, n, col
- float64_t[::1, :] out # Column-major so columns are contiguous
- int64_t[::1] grp_sizes
- ndarray[numeric_object_t, ndim=2] values
- numeric_object_t[:, :] masked_vals
- intp_t[:, :] sort_indexer
- uint8_t[:, :] mask
- TiebreakEnumType tiebreak
- bint check_mask, keep_na, nans_rank_highest
- numeric_object_t nan_fill_val
-
- tiebreak = tiebreakers[ties_method]
- if tiebreak == TIEBREAK_FIRST:
- if not ascending:
- tiebreak = TIEBREAK_FIRST_DESCENDING
-
- keep_na = na_option == "keep"
-
- # For cases where a mask is not possible, we can avoid mask checks
- check_mask = (
- numeric_object_t is float32_t
- or numeric_object_t is float64_t
- or numeric_object_t is object
- or (numeric_object_t is int64_t and is_datetimelike)
- )
-
- if axis == 1:
- values = np.asarray(in_arr).T.copy()
- else:
- values = np.asarray(in_arr).copy()
-
- if numeric_object_t is object:
- if values.dtype != np.object_:
- values = values.astype("O")
-
- nans_rank_highest = ascending ^ (na_option == "top")
- if check_mask:
- nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
-
- if numeric_object_t is object:
- mask = missing.isnaobj(values).view(np.uint8)
- elif numeric_object_t is float64_t or numeric_object_t is float32_t:
- mask = np.isnan(values).view(np.uint8)
- else:
- # i.e. int64 and datetimelike
- mask = (values == NPY_NAT).view(np.uint8)
- np.putmask(values, mask, nan_fill_val)
- else:
- mask = np.zeros_like(values, dtype=np.uint8)
-
- if nans_rank_highest:
- order = (values, mask)
- else:
- order = (values, ~np.asarray(mask))
-
- n, k = (<object>values).shape
- out = np.empty((n, k), dtype="f8", order="F")
- grp_sizes = np.ones(n, dtype=np.int64)
-
- # lexsort is slower, so only use if we need to worry about the mask
- if check_mask:
- sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False)
- else:
- kind = "stable" if ties_method == "first" else None
- sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False)
-
- if not ascending:
- sort_indexer = sort_indexer[::-1, :]
-
- # putmask doesn't accept a memoryview, so we assign in a separate step
- masked_vals = values
- with nogil:
- for col in range(k):
- rank_sorted_1d(
- out[:, col],
- grp_sizes,
- sort_indexer[:, col],
- masked_vals[:, col],
- mask[:, col],
- check_mask=check_mask,
- N=n,
- tiebreak=tiebreak,
- keep_na=keep_na,
- pct=pct,
- )
-
- if axis == 1:
- return np.asarray(out.T)
- else:
- return np.asarray(out)
-
-
-ctypedef fused diff_t:
- float64_t
- float32_t
- int8_t
- int16_t
- int32_t
- int64_t
-
-ctypedef fused out_t:
- float32_t
- float64_t
- int64_t
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d(
- ndarray[diff_t, ndim=2] arr, # TODO(cython3) update to "const diff_t[:, :] arr"
- ndarray[out_t, ndim=2] out,
- Py_ssize_t periods,
- int axis,
- bint datetimelike=False,
-):
- cdef:
- Py_ssize_t i, j, sx, sy, start, stop
- bint f_contig = arr.flags.f_contiguous
- # bint f_contig = arr.is_f_contig() # TODO(cython3)
- diff_t left, right
-
- # Disable for unsupported dtype combinations,
- # see https://github.com/cython/cython/issues/2646
- if (out_t is float32_t
- and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
- raise NotImplementedError # pragma: no cover
- elif (out_t is float64_t
- and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
- raise NotImplementedError # pragma: no cover
- elif out_t is int64_t and diff_t is not int64_t:
- # We only have out_t of int64_t if we have datetimelike
- raise NotImplementedError # pragma: no cover
- else:
- # We put this inside an indented else block to avoid cython build
- # warnings about unreachable code
- sx, sy = (<object>arr).shape
- with nogil:
- if f_contig:
- if axis == 0:
- if periods >= 0:
- start, stop = periods, sx
- else:
- start, stop = 0, sx + periods
- for j in range(sy):
- for i in range(start, stop):
- left = arr[i, j]
- right = arr[i - periods, j]
- if out_t is int64_t and datetimelike:
- if left == NPY_NAT or right == NPY_NAT:
- out[i, j] = NPY_NAT
- else:
- out[i, j] = left - right
- else:
- out[i, j] = left - right
- else:
- if periods >= 0:
- start, stop = periods, sy
- else:
- start, stop = 0, sy + periods
- for j in range(start, stop):
- for i in range(sx):
- left = arr[i, j]
- right = arr[i, j - periods]
- if out_t is int64_t and datetimelike:
- if left == NPY_NAT or right == NPY_NAT:
- out[i, j] = NPY_NAT
- else:
- out[i, j] = left - right
- else:
- out[i, j] = left - right
- else:
- if axis == 0:
- if periods >= 0:
- start, stop = periods, sx
- else:
- start, stop = 0, sx + periods
- for i in range(start, stop):
- for j in range(sy):
- left = arr[i, j]
- right = arr[i - periods, j]
- if out_t is int64_t and datetimelike:
- if left == NPY_NAT or right == NPY_NAT:
- out[i, j] = NPY_NAT
- else:
- out[i, j] = left - right
- else:
- out[i, j] = left - right
- else:
- if periods >= 0:
- start, stop = periods, sy
- else:
- start, stop = 0, sy + periods
- for i in range(sx):
- for j in range(start, stop):
- left = arr[i, j]
- right = arr[i, j - periods]
- if out_t is int64_t and datetimelike:
- if left == NPY_NAT or right == NPY_NAT:
- out[i, j] = NPY_NAT
- else:
- out[i, j] = left - right
- else:
- out[i, j] = left - right
-
-
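Ignoring the NaT handling and the memory-layout branches, diff_2d fills out with the discrete difference along the chosen axis; a sketch of the axis-0 case for positive periods, assuming the caller has pre-filled the skipped rows (pandas fills them with NaN):

    import numpy as np

    arr = np.arange(12, dtype=np.float64).reshape(4, 3)
    periods = 1
    out = np.full_like(arr, np.nan)
    out[periods:, :] = arr[periods:, :] - arr[:-periods, :]   # rough equivalent of diff_2d(..., axis=0)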
-# generated from template
-include "algos_common_helper.pxi"
-include "algos_take_helper.pxi"
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi b/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi
deleted file mode 100644
index 0f11639775a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi
+++ /dev/null
@@ -1,99 +0,0 @@
-"""
-Template for each `dtype` helper function using 1-d template
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# ensure_dtype
-# ----------------------------------------------------------------------
-
-
-def ensure_platform_int(object arr):
- # GH3033, GH1392
- # platform int is the size of the int pointer, e.g. np.intp
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == cnp.NPY_INTP:
- return arr
- else:
- # equiv: arr.astype(np.intp)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INTP)
- else:
- return np.array(arr, dtype=np.intp)
-
-
-def ensure_object(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_OBJECT:
- return arr
- else:
- # equiv: arr.astype(object)
- return cnp.PyArray_Cast(<ndarray>arr, NPY_OBJECT)
- else:
- return np.array(arr, dtype=np.object_)
-
-
-def ensure_float64(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_FLOAT64:
- return arr
- else:
- # equiv: arr.astype(np.float64)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_FLOAT64)
- else:
- return np.asarray(arr, dtype=np.float64)
-
-
-def ensure_int8(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_INT8:
- return arr
- else:
- # equiv: arr.astype(np.int8)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INT8)
- else:
- return np.asarray(arr, dtype=np.int8)
-
-
-def ensure_int16(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_INT16:
- return arr
- else:
- # equiv: arr.astype(np.int16)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INT16)
- else:
- return np.asarray(arr, dtype=np.int16)
-
-
-def ensure_int32(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_INT32:
- return arr
- else:
- # equiv: arr.astype(np.int32)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INT32)
- else:
- return np.asarray(arr, dtype=np.int32)
-
-
-def ensure_int64(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_INT64:
- return arr
- else:
- # equiv: arr.astype(np.int64)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INT64)
- else:
- return np.asarray(arr, dtype=np.int64)
-
-
-def ensure_uint64(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_UINT64:
- return arr
- else:
- # equiv: arr.astype(np.uint64)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_UINT64)
- else:
- return np.asarray(arr, dtype=np.uint64)
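All the ensure_* helpers deleted above follow one pattern: return the input untouched if it is already an ndarray of the target dtype, cast if it is an ndarray of another dtype, and otherwise build a new array. A rough pure-NumPy sketch of that pattern, using the hypothetical name ensure_dtype (not a pandas API):

import numpy as np


def ensure_dtype(arr, dtype=np.int64):
    """Rough Python equivalent of the deleted ensure_* helpers."""
    if isinstance(arr, np.ndarray):
        if arr.dtype == dtype:
            return arr                    # already the target dtype: passed through
        return arr.astype(dtype)          # other ndarray: cast
    return np.asarray(arr, dtype=dtype)   # not an ndarray: build one

# ensure_dtype([1, 2, 3]) returns an int64 ndarray; an existing int64 array
# is returned as-is without copying.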
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in
deleted file mode 100644
index ee815b8bbf7..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/algos_common_helper.pxi.in
+++ /dev/null
@@ -1,73 +0,0 @@
-"""
-Template for each `dtype` helper function using 1-d template
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# ensure_dtype
-# ----------------------------------------------------------------------
-
-
-def ensure_platform_int(object arr):
- # GH3033, GH1392
- # platform int is the size of the int pointer, e.g. np.intp
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == cnp.NPY_INTP:
- return arr
- else:
- # equiv: arr.astype(np.intp)
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_INTP)
- else:
- return np.array(arr, dtype=np.intp)
-
-
-def ensure_object(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_OBJECT:
- return arr
- else:
- # equiv: arr.astype(object)
- return cnp.PyArray_Cast(<ndarray>arr, NPY_OBJECT)
- else:
- return np.array(arr, dtype=np.object_)
-
-{{py:
-
-# name, c_type, dtype
-dtypes = [('float64', 'FLOAT64', 'float64'),
- # ('float32', 'FLOAT32', 'float32'), # disabling bc unused
- ('int8', 'INT8', 'int8'),
- ('int16', 'INT16', 'int16'),
- ('int32', 'INT32', 'int32'),
- ('int64', 'INT64', 'int64'),
- ('uint64', 'UINT64', 'uint64'),
- # Disabling uint and complex dtypes because we do not use them
- # (and compiling them increases wheel size) (except uint64)
- # ('uint8', 'UINT8', 'uint8'),
- # ('uint16', 'UINT16', 'uint16'),
- # ('uint32', 'UINT32', 'uint32'),
- # ('complex64', 'COMPLEX64', 'complex64'),
- # ('complex128', 'COMPLEX128', 'complex128')
-]
-
-def get_dispatch(dtypes):
-
- for name, c_type, dtype in dtypes:
- yield name, c_type, dtype
-}}
-
-{{for name, c_type, dtype in get_dispatch(dtypes)}}
-
-
-def ensure_{{name}}(object arr):
- if util.is_array(arr):
- if (<ndarray>arr).descr.type_num == NPY_{{c_type}}:
- return arr
- else:
- # equiv: arr.astype(np.{{dtype}})
- return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_{{c_type}})
- else:
- return np.asarray(arr, dtype=np.{{dtype}})
-
-{{endfor}}
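The {{py: ...}} and {{for ...}} blocks above are Tempita template directives: at build time each (name, c_type, dtype) tuple is expanded into a concrete ensure_<name> function in the generated .pxi seen in the previous file. A rough sketch of that code-generation step using plain Python string formatting (the generator function and its output handling are assumptions for illustration, not the actual build tooling):

# Sketch of the expansion performed by the .pxi.in template above.
TEMPLATE = '''
def ensure_{name}(object arr):
    if util.is_array(arr):
        if (<ndarray>arr).descr.type_num == NPY_{c_type}:
            return arr
        else:
            return cnp.PyArray_Cast(<ndarray>arr, cnp.NPY_{c_type})
    else:
        return np.asarray(arr, dtype=np.{dtype})
'''

DTYPES = [
    ("float64", "FLOAT64", "float64"),
    ("int8", "INT8", "int8"),
    ("int16", "INT16", "int16"),
    ("int32", "INT32", "int32"),
    ("int64", "INT64", "int64"),
    ("uint64", "UINT64", "uint64"),
]


def render_ensure_helpers():
    """Emit one ensure_<name> definition per dtype, like the Tempita loop does."""
    return "\n".join(
        TEMPLATE.format(name=n, c_type=c, dtype=d) for n, c, d in DTYPES
    )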
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi b/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi
deleted file mode 100644
index cbd9fff2316..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi
+++ /dev/null
@@ -1,2162 +0,0 @@
-"""
-Template for each `dtype` helper function for take
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# take_1d, take_2d
-# ----------------------------------------------------------------------
-
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_bool_bool(const uint8_t[:] values,
- const intp_t[:] indexer,
- uint8_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- uint8_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_bool_bool(const uint8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- uint8_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- uint8_t fv
- const uint8_t *v
- uint8_t *o
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof(uint8_t) and
- sizeof(uint8_t) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof(uint8_t) * k))
- return
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_bool_bool(const uint8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- uint8_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- uint8_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values,
- indexer,
- ndarray[uint8_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- uint8_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_bool_object(const uint8_t[:] values,
- const intp_t[:] indexer,
- object[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- object fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- if True:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = True if values[idx] > 0 else False
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_bool_object(const uint8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- object[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- object fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = True if values[idx, j] > 0 else False
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_bool_object(const uint8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- object[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- object fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = True if values[i, idx] > 0 else False
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values,
- indexer,
- ndarray[object, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- object fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = True if values[idx, idx1[j]] > 0 else False
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_int8(const int8_t[:] values,
- const intp_t[:] indexer,
- int8_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int8_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_int8(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int8_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int8_t fv
- const int8_t *v
- int8_t *o
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof(int8_t) and
- sizeof(int8_t) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof(int8_t) * k))
- return
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_int8(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int8_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int8_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values,
- indexer,
- ndarray[int8_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int8_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_int32(const int8_t[:] values,
- const intp_t[:] indexer,
- int32_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int32_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_int32(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int32_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int32_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_int32(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int32_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int32_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values,
- indexer,
- ndarray[int32_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int32_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_int64(const int8_t[:] values,
- const intp_t[:] indexer,
- int64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_int64(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_int64(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values,
- indexer,
- ndarray[int64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_float64(const int8_t[:] values,
- const intp_t[:] indexer,
- float64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- float64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_float64(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_float64(const int8_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values,
- indexer,
- ndarray[float64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- float64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_int16(const int16_t[:] values,
- const intp_t[:] indexer,
- int16_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int16_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_int16(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int16_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int16_t fv
- const int16_t *v
- int16_t *o
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof(int16_t) and
- sizeof(int16_t) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof(int16_t) * k))
- return
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_int16(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int16_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int16_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values,
- indexer,
- ndarray[int16_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int16_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_int32(const int16_t[:] values,
- const intp_t[:] indexer,
- int32_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int32_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_int32(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int32_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int32_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_int32(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int32_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int32_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values,
- indexer,
- ndarray[int32_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int32_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_int64(const int16_t[:] values,
- const intp_t[:] indexer,
- int64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_int64(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_int64(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values,
- indexer,
- ndarray[int64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_float64(const int16_t[:] values,
- const intp_t[:] indexer,
- float64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- float64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_float64(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_float64(const int16_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values,
- indexer,
- ndarray[float64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- float64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int32_int32(const int32_t[:] values,
- const intp_t[:] indexer,
- int32_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int32_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int32_int32(const int32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int32_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int32_t fv
- const int32_t *v
- int32_t *o
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof(int32_t) and
- sizeof(int32_t) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof(int32_t) * k))
- return
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int32_int32(const int32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int32_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int32_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values,
- indexer,
- ndarray[int32_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int32_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int32_int64(const int32_t[:] values,
- const intp_t[:] indexer,
- int64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int32_int64(const int32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int32_int64(const int32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values,
- indexer,
- ndarray[int64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int32_float64(const int32_t[:] values,
- const intp_t[:] indexer,
- float64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- float64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int32_float64(const int32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int32_float64(const int32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values,
- indexer,
- ndarray[float64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- float64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int64_int64(const int64_t[:] values,
- const intp_t[:] indexer,
- int64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- int64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int64_int64(const int64_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
- const int64_t *v
- int64_t *o
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof(int64_t) and
- sizeof(int64_t) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof(int64_t) * k))
- return
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int64_int64(const int64_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- int64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- int64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values,
- indexer,
- ndarray[int64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- int64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int64_float64(const int64_t[:] values,
- const intp_t[:] indexer,
- float64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- float64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int64_float64(const int64_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int64_float64(const int64_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values,
- indexer,
- ndarray[float64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- float64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_float32_float32(const float32_t[:] values,
- const intp_t[:] indexer,
- float32_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- float32_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_float32_float32(const float32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float32_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- float32_t fv
- const float32_t *v
- float32_t *o
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof(float32_t) and
- sizeof(float32_t) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof(float32_t) * k))
- return
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_float32_float32(const float32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float32_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- float32_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values,
- indexer,
- ndarray[float32_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- float32_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_float32_float64(const float32_t[:] values,
- const intp_t[:] indexer,
- float64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- float64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_float32_float64(const float32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_float32_float64(const float32_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values,
- indexer,
- ndarray[float64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- float64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_float64_float64(const float64_t[:] values,
- const intp_t[:] indexer,
- float64_t[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- float64_t fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- with nogil:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_float64_float64(const float64_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
- const float64_t *v
- float64_t *o
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof(float64_t) and
- sizeof(float64_t) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof(float64_t) * k))
- return
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_float64_float64(const float64_t[:, :] values,
- ndarray[intp_t, ndim=1] indexer,
- float64_t[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- float64_t fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values,
- indexer,
- ndarray[float64_t, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- float64_t fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_object_object(ndarray[object, ndim=1] values,
- const intp_t[:] indexer,
- object[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- object fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- if True:
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_object_object(ndarray[object, ndim=2] values,
- ndarray[intp_t, ndim=1] indexer,
- object[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- object fv
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_object_object(ndarray[object, ndim=2] values,
- ndarray[intp_t, ndim=1] indexer,
- object[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- object fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_object_object(ndarray[object, ndim=2] values,
- indexer,
- ndarray[object, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- object fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- out[i, j] = values[idx, idx1[j]]
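Every take_* specialization deleted above implements the same contract: copy values[indexer] into a preallocated out, writing fill_value wherever the indexer is -1. A compact NumPy sketch of the 1-D case (the function name is illustrative; the real helpers are dtype-specialized, loop explicitly, and release the GIL for non-object dtypes):

import numpy as np


def take_1d(values, indexer, out, fill_value=np.nan):
    """Illustrative version of the deleted take_1d_* helpers (out must be able
    to hold fill_value; the real helpers are dtype-specialized and nogil)."""
    indexer = np.asarray(indexer, dtype=np.intp)
    mask = indexer == -1
    out[:] = values[indexer]      # -1 briefly reads values[-1] ...
    out[mask] = fill_value        # ... and is then replaced by fill_value

vals = np.array([10.0, 20.0, 30.0])
idx = np.array([2, -1, 0], dtype=np.intp)
res = np.empty(3)
take_1d(vals, idx, res)           # res == [30.0, nan, 10.0]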
diff --git a/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in
deleted file mode 100644
index 2a3858674af..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/algos_take_helper.pxi.in
+++ /dev/null
@@ -1,222 +0,0 @@
-"""
-Template for each `dtype` helper function for take
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# take_1d, take_2d
-# ----------------------------------------------------------------------
-
-
-{{py:
-
-# c_type_in, c_type_out
-dtypes = [
- ('uint8_t', 'uint8_t'),
- ('uint8_t', 'object'),
- ('int8_t', 'int8_t'),
- ('int8_t', 'int32_t'),
- ('int8_t', 'int64_t'),
- ('int8_t', 'float64_t'),
- ('int16_t', 'int16_t'),
- ('int16_t', 'int32_t'),
- ('int16_t', 'int64_t'),
- ('int16_t', 'float64_t'),
- ('int32_t', 'int32_t'),
- ('int32_t', 'int64_t'),
- ('int32_t', 'float64_t'),
- ('int64_t', 'int64_t'),
- ('int64_t', 'float64_t'),
- ('float32_t', 'float32_t'),
- ('float32_t', 'float64_t'),
- ('float64_t', 'float64_t'),
- ('object', 'object'),
-]
-
-
-def get_dispatch(dtypes):
-
- for (c_type_in, c_type_out) in dtypes:
-
- def get_name(dtype_name):
- if dtype_name == "object":
- return "object"
- if dtype_name == "uint8_t":
- return "bool"
- return dtype_name[:-2]
-
- name = get_name(c_type_in)
- dest = get_name(c_type_out)
-
- args = dict(name=name, dest=dest, c_type_in=c_type_in,
- c_type_out=c_type_out)
-
- yield (name, dest, c_type_in, c_type_out)
-
-}}
-
-
-{{for name, dest, c_type_in, c_type_out in get_dispatch(dtypes)}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-{{if c_type_in != "object"}}
-def take_1d_{{name}}_{{dest}}(const {{c_type_in}}[:] values,
-{{else}}
-def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
-{{endif}}
- const intp_t[:] indexer,
- {{c_type_out}}[:] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, n, idx
- {{c_type_out}} fv
-
- n = indexer.shape[0]
-
- fv = fill_value
-
- {{if c_type_out != "object"}}
- with nogil:
- {{else}}
- if True:
- {{endif}}
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- out[i] = fv
- else:
- {{if c_type_in == "uint8_t" and c_type_out == "object"}}
- out[i] = True if values[idx] > 0 else False
- {{else}}
- out[i] = values[idx]
- {{endif}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-{{if c_type_in != "object"}}
-def take_2d_axis0_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
-{{else}}
-def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
-{{endif}}
- ndarray[intp_t, ndim=1] indexer,
- {{c_type_out}}[:, :] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- {{c_type_out}} fv
- {{if c_type_in == c_type_out != "object"}}
- const {{c_type_out}} *v
- {{c_type_out}} *o
- {{endif}}
-
- n = len(indexer)
- k = values.shape[1]
-
- fv = fill_value
-
- {{if c_type_in == c_type_out != "object"}}
- # GH#3130
- if (values.strides[1] == out.strides[1] and
- values.strides[1] == sizeof({{c_type_out}}) and
- sizeof({{c_type_out}}) * n >= 256):
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- v = &values[idx, 0]
- o = &out[i, 0]
- memmove(o, v, <size_t>(sizeof({{c_type_out}}) * k))
- return
- {{endif}}
-
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- {{if c_type_in == "uint8_t" and c_type_out == "object"}}
- out[i, j] = True if values[idx, j] > 0 else False
- {{else}}
- out[i, j] = values[idx, j]
- {{endif}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-{{if c_type_in != "object"}}
-def take_2d_axis1_{{name}}_{{dest}}(const {{c_type_in}}[:, :] values,
-{{else}}
-def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
-{{endif}}
- ndarray[intp_t, ndim=1] indexer,
- {{c_type_out}}[:, :] out,
- fill_value=np.nan):
-
- cdef:
- Py_ssize_t i, j, k, n, idx
- {{c_type_out}} fv
-
- n = len(values)
- k = len(indexer)
-
- if n == 0 or k == 0:
- return
-
- fv = fill_value
-
- for i in range(n):
- for j in range(k):
- idx = indexer[j]
- if idx == -1:
- out[i, j] = fv
- else:
- {{if c_type_in == "uint8_t" and c_type_out == "object"}}
- out[i, j] = True if values[i, idx] > 0 else False
- {{else}}
- out[i, j] = values[i, idx]
- {{endif}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
- indexer,
- ndarray[{{c_type_out}}, ndim=2] out,
- fill_value=np.nan):
- cdef:
- Py_ssize_t i, j, k, n, idx
- ndarray[intp_t, ndim=1] idx0 = indexer[0]
- ndarray[intp_t, ndim=1] idx1 = indexer[1]
- {{c_type_out}} fv
-
- n = len(idx0)
- k = len(idx1)
-
- fv = fill_value
- for i in range(n):
- idx = idx0[i]
- if idx == -1:
- for j in range(k):
- out[i, j] = fv
- else:
- for j in range(k):
- if idx1[j] == -1:
- out[i, j] = fv
- else:
- {{if c_type_in == "uint8_t" and c_type_out == "object"}}
- out[i, j] = True if values[idx, idx1[j]] > 0 else False
- {{else}}
- out[i, j] = values[idx, idx1[j]]
- {{endif}}
-
-{{endfor}}
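This .pxi.in is the template that generated the long .pxi above: one (c_type_in, c_type_out) pair per specialization, with an object branch that cannot release the GIL and, for same-dtype contiguous rows, a memmove fast path (GH#3130) in the axis-0 variant. A small NumPy sketch of that axis-0 row-wise take, as an illustration only:

import numpy as np


def take_2d_axis0(values, indexer, out, fill_value=np.nan):
    """Row-wise take along axis 0, as in the template above.

    The deleted Cython code adds a fast path (GH#3130): when input and output
    share a dtype and rows are contiguous, whole rows are copied via memmove.
    NumPy's slice assignment below plays that role here.
    """
    for i, idx in enumerate(np.asarray(indexer, dtype=np.intp)):
        if idx == -1:
            out[i, :] = fill_value
        else:
            out[i, :] = values[idx, :]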
diff --git a/contrib/python/pandas/py3/pandas/_libs/arrays.pxd b/contrib/python/pandas/py3/pandas/_libs/arrays.pxd
deleted file mode 100644
index 737da29da46..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/arrays.pxd
+++ /dev/null
@@ -1,11 +0,0 @@
-
-from numpy cimport ndarray
-
-
-cdef class NDArrayBacked:
- cdef:
- readonly ndarray _ndarray
- readonly object _dtype
-
- cpdef NDArrayBacked _from_backing_data(self, ndarray values)
- cpdef __setstate__(self, state)
diff --git a/contrib/python/pandas/py3/pandas/_libs/arrays.pyi b/contrib/python/pandas/py3/pandas/_libs/arrays.pyi
deleted file mode 100644
index c9350ed9b8a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/arrays.pyi
+++ /dev/null
@@ -1,34 +0,0 @@
-from typing import Sequence
-
-import numpy as np
-
-from pandas._typing import (
- DtypeObj,
- Shape,
-)
-
-class NDArrayBacked:
- _dtype: DtypeObj
- _ndarray: np.ndarray
- def __init__(self, values: np.ndarray, dtype: DtypeObj) -> None: ...
- @classmethod
- def _simple_new(cls, values: np.ndarray, dtype: DtypeObj): ...
- def _from_backing_data(self, values: np.ndarray): ...
- def __setstate__(self, state): ...
- def __len__(self) -> int: ...
- @property
- def shape(self) -> Shape: ...
- @property
- def ndim(self) -> int: ...
- @property
- def size(self) -> int: ...
- @property
- def nbytes(self) -> int: ...
- def copy(self): ...
- def delete(self, loc, axis=...): ...
- def swapaxes(self, axis1, axis2): ...
- def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ...
- def reshape(self, *args, **kwargs): ...
- def ravel(self, order=...): ...
- @property
- def T(self): ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/arrays.pyx b/contrib/python/pandas/py3/pandas/_libs/arrays.pyx
deleted file mode 100644
index f63d16e819c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/arrays.pyx
+++ /dev/null
@@ -1,183 +0,0 @@
-"""
-Cython implementations for internal ExtensionArrays.
-"""
-cimport cython
-
-import numpy as np
-
-cimport numpy as cnp
-from cpython cimport PyErr_Clear
-from numpy cimport ndarray
-
-cnp.import_array()
-
-
-@cython.freelist(16)
-cdef class NDArrayBacked:
- """
- Implementing these methods in cython improves performance quite a bit.
-
- import pandas as pd
-
- from pandas._libs.arrays import NDArrayBacked as cls
-
- dti = pd.date_range("2016-01-01", periods=3)
- dta = dti._data
- arr = dta._ndarray
-
- obj = cls._simple_new(arr, arr.dtype)
-
- # for foo in [arr, dta, obj]: ...
-
- %timeit foo.copy()
- 299 ns ± 30 ns per loop # <-- arr underlying ndarray (for reference)
- 530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked
- 1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked
- 328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__
- 371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simple_new
-
- %timeit foo.T
- 125 ns ± 6.27 ns per loop # <-- arr underlying ndarray (for reference)
- 226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked
- 911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked
- 215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simple_new
-
- """
- # TODO: implement take in terms of cnp.PyArray_TakeFrom
- # TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate
-
- # cdef:
- # readonly ndarray _ndarray
- # readonly object _dtype
-
- def __init__(self, ndarray values, object dtype):
- self._ndarray = values
- self._dtype = dtype
-
- @classmethod
- def _simple_new(cls, ndarray values, object dtype):
- cdef:
- NDArrayBacked obj
- obj = NDArrayBacked.__new__(cls)
- obj._ndarray = values
- obj._dtype = dtype
- return obj
-
- cpdef NDArrayBacked _from_backing_data(self, ndarray values):
- """
- Construct a new ExtensionArray `new_array` with `arr` as its _ndarray.
-
- This should round-trip:
- self == self._from_backing_data(self._ndarray)
- """
- # TODO: re-reuse simple_new if/when it can be cpdef
- cdef:
- NDArrayBacked obj
- obj = NDArrayBacked.__new__(type(self))
- obj._ndarray = values
- obj._dtype = self._dtype
- return obj
-
- cpdef __setstate__(self, state):
- if isinstance(state, dict):
- if "_data" in state:
- data = state.pop("_data")
- elif "_ndarray" in state:
- data = state.pop("_ndarray")
- else:
- raise ValueError # pragma: no cover
- self._ndarray = data
- self._dtype = state.pop("_dtype")
-
- for key, val in state.items():
- setattr(self, key, val)
- elif isinstance(state, tuple):
- if len(state) != 3:
- if len(state) == 1 and isinstance(state[0], dict):
- self.__setstate__(state[0])
- return
- raise NotImplementedError(state) # pragma: no cover
-
- data, dtype = state[:2]
- if isinstance(dtype, np.ndarray):
- dtype, data = data, dtype
- self._ndarray = data
- self._dtype = dtype
-
- if isinstance(state[2], dict):
- for key, val in state[2].items():
- setattr(self, key, val)
- else:
- raise NotImplementedError(state) # pragma: no cover
- else:
- raise NotImplementedError(state) # pragma: no cover
-
- def __len__(self) -> int:
- return len(self._ndarray)
-
- @property
- def shape(self):
- # object cast bc _ndarray.shape is npy_intp*
- return (<object>(self._ndarray)).shape
-
- @property
- def ndim(self) -> int:
- return self._ndarray.ndim
-
- @property
- def size(self) -> int:
- return self._ndarray.size
-
- @property
- def nbytes(self) -> int:
- return self._ndarray.nbytes
-
- def copy(self, order="C"):
- cdef:
- cnp.NPY_ORDER order_code
- int success
-
- success = cnp.PyArray_OrderConverter(order, &order_code)
- if not success:
- # clear exception so that we don't get a SystemError
- PyErr_Clear()
- # same message used by numpy
- msg = f"order must be one of 'C', 'F', 'A', or 'K' (got '{order}')"
- raise ValueError(msg)
-
- res_values = cnp.PyArray_NewCopy(self._ndarray, order_code)
- return self._from_backing_data(res_values)
-
- def delete(self, loc, axis=0):
- res_values = np.delete(self._ndarray, loc, axis=axis)
- return self._from_backing_data(res_values)
-
- def swapaxes(self, axis1, axis2):
- res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2)
- return self._from_backing_data(res_values)
-
- # TODO: pass NPY_MAXDIMS equiv to axis=None?
- def repeat(self, repeats, axis: int | np.integer = 0):
- if axis is None:
- axis = 0
- res_values = cnp.PyArray_Repeat(self._ndarray, repeats, <int>axis)
- return self._from_backing_data(res_values)
-
- def reshape(self, *args, **kwargs):
- res_values = self._ndarray.reshape(*args, **kwargs)
- return self._from_backing_data(res_values)
-
- def ravel(self, order="C"):
- # cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order)
- # res_values = cnp.PyArray_Ravel(self._ndarray, order)
- res_values = self._ndarray.ravel(order)
- return self._from_backing_data(res_values)
-
- @property
- def T(self):
- res_values = self._ndarray.T
- return self._from_backing_data(res_values)
-
- def transpose(self, *axes):
- res_values = self._ndarray.transpose(*axes)
- return self._from_backing_data(res_values)
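arrays.pyx above backs ExtensionArrays with a plain ndarray plus a dtype and routes every ndarray-returning operation back through _from_backing_data so the result stays wrapped. A minimal pure-Python sketch of that pattern (method names follow the deleted file; the simplification to a plain Python class, and the class name, are ours):

import numpy as np


class NDArrayBackedSketch:
    """Toy pure-Python version of the deleted NDArrayBacked."""

    def __init__(self, values, dtype):
        self._ndarray = values        # backing ndarray
        self._dtype = dtype           # dtype the wrapper exposes

    def _from_backing_data(self, values):
        # Wrap a new backing array in the same type with the same dtype;
        # self._from_backing_data(self._ndarray) should round-trip.
        return type(self)(values, self._dtype)

    def copy(self):
        return self._from_backing_data(self._ndarray.copy())

    def repeat(self, repeats, axis=0):
        return self._from_backing_data(np.repeat(self._ndarray, repeats, axis=axis))

    @property
    def T(self):
        return self._from_backing_data(self._ndarray.T)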
diff --git a/contrib/python/pandas/py3/pandas/_libs/dtypes.pxd b/contrib/python/pandas/py3/pandas/_libs/dtypes.pxd
deleted file mode 100644
index ccfb2d2ef4a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/dtypes.pxd
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
-Common location for shared fused types
-"""
-
-from numpy cimport (
- float32_t,
- float64_t,
- int8_t,
- int16_t,
- int32_t,
- int64_t,
- uint8_t,
- uint16_t,
- uint32_t,
- uint64_t,
-)
-
-# All numeric types except complex
-ctypedef fused numeric_t:
- int8_t
- int16_t
- int32_t
- int64_t
-
- uint8_t
- uint16_t
- uint32_t
- uint64_t
-
- float32_t
- float64_t
-
-# All numeric types + object, doesn't include complex
-ctypedef fused numeric_object_t:
- numeric_t
- object
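dtypes.pxd above declares Cython fused types, so a single function body compiles into one specialization per listed dtype. There is no direct Python analogue; as a loose illustration only, the set of dtypes admitted by numeric_t can be expressed as a runtime check:

import numpy as np

# Dtypes covered by the numeric_t fused type declared above.
NUMERIC_DTYPES = tuple(
    np.dtype(t) for t in (
        np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64,
        np.float32, np.float64,
    )
)


def is_numeric_t(arr):
    """Runtime check mirroring what numeric_t admits at compile time."""
    return arr.dtype in NUMERIC_DTYPES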
diff --git a/contrib/python/pandas/py3/pandas/_libs/groupby.pyi b/contrib/python/pandas/py3/pandas/_libs/groupby.pyi
deleted file mode 100644
index e3ca9c44d56..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/groupby.pyi
+++ /dev/null
@@ -1,191 +0,0 @@
-from typing import Literal
-
-import numpy as np
-
-from pandas._typing import npt
-
-def group_median_float64(
- out: np.ndarray, # ndarray[float64_t, ndim=2]
- counts: npt.NDArray[np.int64],
- values: np.ndarray, # ndarray[float64_t, ndim=2]
- labels: npt.NDArray[np.int64],
- min_count: int = ..., # Py_ssize_t
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_cumprod(
- out: np.ndarray, # float64_t[:, ::1]
- values: np.ndarray, # const float64_t[:, :]
- labels: np.ndarray, # const int64_t[:]
- ngroups: int,
- is_datetimelike: bool,
- skipna: bool = ...,
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_cumsum(
- out: np.ndarray, # int64float_t[:, ::1]
- values: np.ndarray, # ndarray[int64float_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- ngroups: int,
- is_datetimelike: bool,
- skipna: bool = ...,
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_shift_indexer(
- out: np.ndarray, # int64_t[::1]
- labels: np.ndarray, # const int64_t[:]
- ngroups: int,
- periods: int,
-) -> None: ...
-def group_fillna_indexer(
- out: np.ndarray, # ndarray[intp_t]
- labels: np.ndarray, # ndarray[int64_t]
- sorted_labels: npt.NDArray[np.intp],
- mask: npt.NDArray[np.uint8],
- direction: Literal["ffill", "bfill"],
- limit: int, # int64_t
- dropna: bool,
-) -> None: ...
-def group_any_all(
- out: np.ndarray, # uint8_t[::1]
- values: np.ndarray, # const uint8_t[::1]
- labels: np.ndarray, # const int64_t[:]
- mask: np.ndarray, # const uint8_t[::1]
- val_test: Literal["any", "all"],
- skipna: bool,
- nullable: bool,
-) -> None: ...
-def group_sum(
- out: np.ndarray, # complexfloatingintuint_t[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[complexfloatingintuint_t, ndim=2]
- labels: np.ndarray, # const intp_t[:]
- mask: np.ndarray | None,
- result_mask: np.ndarray | None = ...,
- min_count: int = ...,
- is_datetimelike: bool = ...,
-) -> None: ...
-def group_prod(
- out: np.ndarray, # int64float_t[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[int64float_t, ndim=2]
- labels: np.ndarray, # const intp_t[:]
- mask: np.ndarray | None,
- result_mask: np.ndarray | None = ...,
- min_count: int = ...,
-) -> None: ...
-def group_var(
- out: np.ndarray, # floating[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[floating, ndim=2]
- labels: np.ndarray, # const intp_t[:]
- min_count: int = ..., # Py_ssize_t
- ddof: int = ..., # int64_t
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
- is_datetimelike: bool = ...,
-) -> None: ...
-def group_mean(
- out: np.ndarray, # floating[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[floating, ndim=2]
- labels: np.ndarray, # const intp_t[:]
- min_count: int = ..., # Py_ssize_t
- is_datetimelike: bool = ..., # bint
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_ohlc(
- out: np.ndarray, # floatingintuint_t[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[floatingintuint_t, ndim=2]
- labels: np.ndarray, # const intp_t[:]
- min_count: int = ...,
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_quantile(
- out: npt.NDArray[np.float64],
- values: np.ndarray, # ndarray[numeric, ndim=1]
- labels: npt.NDArray[np.intp],
- mask: npt.NDArray[np.uint8],
- sort_indexer: npt.NDArray[np.intp], # const
- qs: npt.NDArray[np.float64], # const
- interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_last(
- out: np.ndarray, # rank_t[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[rank_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- mask: npt.NDArray[np.bool_] | None,
- result_mask: npt.NDArray[np.bool_] | None = ...,
- min_count: int = ..., # Py_ssize_t
- is_datetimelike: bool = ...,
-) -> None: ...
-def group_nth(
- out: np.ndarray, # rank_t[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[rank_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- mask: npt.NDArray[np.bool_] | None,
- result_mask: npt.NDArray[np.bool_] | None = ...,
- min_count: int = ..., # int64_t
- rank: int = ..., # int64_t
- is_datetimelike: bool = ...,
-) -> None: ...
-def group_rank(
- out: np.ndarray, # float64_t[:, ::1]
- values: np.ndarray, # ndarray[rank_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- ngroups: int,
- is_datetimelike: bool,
- ties_method: Literal["average", "min", "max", "first", "dense"] = ...,
- ascending: bool = ...,
- pct: bool = ...,
- na_option: Literal["keep", "top", "bottom"] = ...,
- mask: npt.NDArray[np.bool_] | None = ...,
-) -> None: ...
-def group_max(
- out: np.ndarray, # groupby_t[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[groupby_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- min_count: int = ...,
- is_datetimelike: bool = ...,
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_min(
- out: np.ndarray, # groupby_t[:, ::1]
- counts: np.ndarray, # int64_t[::1]
- values: np.ndarray, # ndarray[groupby_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- min_count: int = ...,
- is_datetimelike: bool = ...,
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
-) -> None: ...
-def group_cummin(
- out: np.ndarray, # groupby_t[:, ::1]
- values: np.ndarray, # ndarray[groupby_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- ngroups: int,
- is_datetimelike: bool,
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
- skipna: bool = ...,
-) -> None: ...
-def group_cummax(
- out: np.ndarray, # groupby_t[:, ::1]
- values: np.ndarray, # ndarray[groupby_t, ndim=2]
- labels: np.ndarray, # const int64_t[:]
- ngroups: int,
- is_datetimelike: bool,
- mask: np.ndarray | None = ...,
- result_mask: np.ndarray | None = ...,
- skipna: bool = ...,
-) -> None: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/groupby.pyx b/contrib/python/pandas/py3/pandas/_libs/groupby.pyx
deleted file mode 100644
index 0c378acbc6d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/groupby.pyx
+++ /dev/null
@@ -1,1884 +0,0 @@
-cimport cython
-from cython cimport (
- Py_ssize_t,
- floating,
-)
-from libc.stdlib cimport (
- free,
- malloc,
-)
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- complex64_t,
- complex128_t,
- float32_t,
- float64_t,
- int8_t,
- int64_t,
- intp_t,
- ndarray,
- uint8_t,
- uint64_t,
-)
-from numpy.math cimport NAN
-
-cnp.import_array()
-
-from pandas._libs cimport util
-from pandas._libs.algos cimport (
- get_rank_nan_fill_val,
- kth_smallest_c,
-)
-
-from pandas._libs.algos import (
- groupsort_indexer,
- rank_1d,
- take_2d_axis1_bool_bool,
- take_2d_axis1_float64_float64,
-)
-
-from pandas._libs.dtypes cimport (
- numeric_object_t,
- numeric_t,
-)
-from pandas._libs.missing cimport checknull
-
-
-cdef int64_t NPY_NAT = util.get_nat()
-_int64_max = np.iinfo(np.int64).max
-
-cdef float64_t NaN = <float64_t>np.NaN
-
-cdef enum InterpolationEnumType:
- INTERPOLATION_LINEAR,
- INTERPOLATION_LOWER,
- INTERPOLATION_HIGHER,
- INTERPOLATION_NEAREST,
- INTERPOLATION_MIDPOINT
-
-
-cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil:
- cdef:
- int i, j, na_count = 0
- float64_t* tmp
- float64_t result
-
- if n == 0:
- return NaN
-
- # count NAs
- for i in range(n):
- if mask[i]:
- na_count += 1
-
- if na_count:
- if na_count == n:
- return NaN
-
- tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
-
- j = 0
- for i in range(n):
- if not mask[i]:
- tmp[j] = a[i]
- j += 1
-
- a = tmp
- n -= na_count
-
- result = calc_median_linear(a, n, na_count)
-
- if na_count:
- free(a)
-
- return result
-
-
-cdef float64_t median_linear(float64_t* a, int n) nogil:
- cdef:
- int i, j, na_count = 0
- float64_t* tmp
- float64_t result
-
- if n == 0:
- return NaN
-
- # count NAs
- for i in range(n):
- if a[i] != a[i]:
- na_count += 1
-
- if na_count:
- if na_count == n:
- return NaN
-
- tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
-
- j = 0
- for i in range(n):
- if a[i] == a[i]:
- tmp[j] = a[i]
- j += 1
-
- a = tmp
- n -= na_count
-
- result = calc_median_linear(a, n, na_count)
-
- if na_count:
- free(a)
-
- return result
-
-
-cdef float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil:
- cdef:
- float64_t result
-
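-    # kth_smallest_c selects the k-th order statistic; for even n, the median
-    # is the average of the two middle values.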
- if n % 2:
- result = kth_smallest_c(a, n // 2, n)
- else:
- result = (kth_smallest_c(a, n // 2, n) +
- kth_smallest_c(a, n // 2 - 1, n)) / 2
-
- return result
-
-
-ctypedef fused int64float_t:
- int64_t
- uint64_t
- float32_t
- float64_t
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_median_float64(
- ndarray[float64_t, ndim=2] out,
- ndarray[int64_t] counts,
- ndarray[float64_t, ndim=2] values,
- ndarray[intp_t] labels,
- Py_ssize_t min_count=-1,
- const uint8_t[:, :] mask=None,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """
- Only aggregates on axis=0
- """
- cdef:
- Py_ssize_t i, j, N, K, ngroups, size
- ndarray[intp_t] _counts
- ndarray[float64_t, ndim=2] data
- ndarray[uint8_t, ndim=2] data_mask
- ndarray[intp_t] indexer
- float64_t* ptr
- uint8_t* ptr_mask
- float64_t result
- bint uses_mask = mask is not None
-
- assert min_count == -1, "'min_count' only used in sum and prod"
-
- ngroups = len(counts)
- N, K = (<object>values).shape
-
- indexer, _counts = groupsort_indexer(labels, ngroups)
- counts[:] = _counts[1:]
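-    # groupsort_indexer sorts row positions by group label so each group's
-    # values become contiguous; _counts[0] is the size of the NA (-1) group,
-    # which is skipped below when advancing `ptr`.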
-
- data = np.empty((K, N), dtype=np.float64)
- ptr = <float64_t*>cnp.PyArray_DATA(data)
-
- take_2d_axis1_float64_float64(values.T, indexer, out=data)
-
- if uses_mask:
- data_mask = np.empty((K, N), dtype=np.uint8)
- ptr_mask = <uint8_t *>cnp.PyArray_DATA(data_mask)
-
- take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1)
-
- with nogil:
-
- for i in range(K):
- # exclude NA group
- ptr += _counts[0]
- ptr_mask += _counts[0]
-
- for j in range(ngroups):
- size = _counts[j + 1]
- result = median_linear_mask(ptr, size, ptr_mask)
- out[j, i] = result
-
- if result != result:
- result_mask[j, i] = 1
- ptr += size
- ptr_mask += size
-
- else:
- with nogil:
- for i in range(K):
- # exclude NA group
- ptr += _counts[0]
- for j in range(ngroups):
- size = _counts[j + 1]
- out[j, i] = median_linear(ptr, size)
- ptr += size
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cumprod(
- int64float_t[:, ::1] out,
- ndarray[int64float_t, ndim=2] values,
- const intp_t[::1] labels,
- int ngroups,
- bint is_datetimelike,
- bint skipna=True,
- const uint8_t[:, :] mask=None,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """
- Cumulative product of columns of `values`, in row groups `labels`.
-
- Parameters
- ----------
- out : np.ndarray[np.float64, ndim=2]
- Array to store cumprod in.
- values : np.ndarray[np.float64, ndim=2]
- Values to take cumprod of.
- labels : np.ndarray[np.intp]
- Labels to group by.
- ngroups : int
- Number of groups, larger than all entries of `labels`.
- is_datetimelike : bool
-        Always false; `values` is never datetime-like.
- skipna : bool
- If true, ignore nans in `values`.
-    mask : np.ndarray[uint8], optional
-        Mask of values.
-    result_mask : np.ndarray[uint8], optional
-        Mask of the out array.
-
- Notes
- -----
- This method modifies the `out` parameter, rather than returning an object.
- """
- cdef:
- Py_ssize_t i, j, N, K
- int64float_t val, na_val
- int64float_t[:, ::1] accum
- intp_t lab
- uint8_t[:, ::1] accum_mask
- bint isna_entry, isna_prev = False
- bint uses_mask = mask is not None
-
- N, K = (<object>values).shape
- accum = np.ones((ngroups, K), dtype=(<object>values).dtype)
- na_val = _get_na_val(<int64float_t>0, is_datetimelike)
- accum_mask = np.zeros((ngroups, K), dtype="uint8")
-
- with nogil:
- for i in range(N):
- lab = labels[i]
-
- if lab < 0:
- continue
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, False)
-
- if not isna_entry:
- isna_prev = accum_mask[lab, j]
- if isna_prev:
- out[i, j] = na_val
- if uses_mask:
- result_mask[i, j] = True
-
- else:
- accum[lab, j] *= val
- out[i, j] = accum[lab, j]
-
- else:
- if uses_mask:
- result_mask[i, j] = True
- out[i, j] = 0
- else:
- out[i, j] = na_val
-
- if not skipna:
- accum[lab, j] = na_val
- accum_mask[lab, j] = True
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cumsum(
- int64float_t[:, ::1] out,
- ndarray[int64float_t, ndim=2] values,
- const intp_t[::1] labels,
- int ngroups,
- bint is_datetimelike,
- bint skipna=True,
- const uint8_t[:, :] mask=None,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """
- Cumulative sum of columns of `values`, in row groups `labels`.
-
- Parameters
- ----------
- out : np.ndarray[ndim=2]
- Array to store cumsum in.
- values : np.ndarray[ndim=2]
- Values to take cumsum of.
- labels : np.ndarray[np.intp]
- Labels to group by.
- ngroups : int
- Number of groups, larger than all entries of `labels`.
- is_datetimelike : bool
- True if `values` contains datetime-like entries.
- skipna : bool
- If true, ignore nans in `values`.
-    mask : np.ndarray[uint8], optional
-        Mask of values.
-    result_mask : np.ndarray[uint8], optional
-        Mask of the out array.
-
- Notes
- -----
- This method modifies the `out` parameter, rather than returning an object.
- """
- cdef:
- Py_ssize_t i, j, N, K
- int64float_t val, y, t, na_val
- int64float_t[:, ::1] accum, compensation
- uint8_t[:, ::1] accum_mask
- intp_t lab
- bint isna_entry, isna_prev = False
- bint uses_mask = mask is not None
-
- N, K = (<object>values).shape
-
- if uses_mask:
- accum_mask = np.zeros((ngroups, K), dtype="uint8")
-
- accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
- compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
-
- na_val = _get_na_val(<int64float_t>0, is_datetimelike)
-
- with nogil:
- for i in range(N):
- lab = labels[i]
-
- if lab < 0:
- continue
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not skipna:
- if uses_mask:
- isna_prev = accum_mask[lab, j]
- else:
- isna_prev = _treat_as_na(accum[lab, j], is_datetimelike)
-
- if isna_prev:
- if uses_mask:
- result_mask[i, j] = True
- # Be deterministic, out was initialized as empty
- out[i, j] = 0
- else:
- out[i, j] = na_val
- continue
-
- if isna_entry:
-
- if uses_mask:
- result_mask[i, j] = True
- # Be deterministic, out was initialized as empty
- out[i, j] = 0
- else:
- out[i, j] = na_val
-
- if not skipna:
- if uses_mask:
- accum_mask[lab, j] = True
- else:
- accum[lab, j] = na_val
-
- else:
- # For floats, use Kahan summation to reduce floating-point
- # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
- if int64float_t == float32_t or int64float_t == float64_t:
- y = val - compensation[lab, j]
- t = accum[lab, j] + y
- compensation[lab, j] = t - accum[lab, j] - y
- else:
- t = val + accum[lab, j]
-
- accum[lab, j] = t
- out[i, j] = t
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_shift_indexer(
- int64_t[::1] out,
- const intp_t[::1] labels,
- int ngroups,
- int periods,
-) -> None:
- cdef:
- Py_ssize_t N, i, ii, lab
- int offset = 0, sign
- int64_t idxer, idxer_slot
- int64_t[::1] label_seen = np.zeros(ngroups, dtype=np.int64)
- int64_t[:, ::1] label_indexer
-
- N, = (<object>labels).shape
-
- if periods < 0:
- periods = -periods
- offset = N - 1
- sign = -1
- elif periods > 0:
- offset = 0
- sign = 1
-
- if periods == 0:
- with nogil:
- for i in range(N):
- out[i] = i
- else:
- # array of each previous indexer seen
- label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
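-        # label_indexer is a per-group circular buffer of the last `periods`
-        # row positions, indexed by label_seen[lab] % periods.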
- with nogil:
- for i in range(N):
- # reverse iterator if shifting backwards
- ii = offset + sign * i
- lab = labels[ii]
-
- # Skip null keys
- if lab == -1:
- out[ii] = -1
- continue
-
- label_seen[lab] += 1
-
- idxer_slot = label_seen[lab] % periods
- idxer = label_indexer[lab, idxer_slot]
-
- if label_seen[lab] > periods:
- out[ii] = idxer
- else:
- out[ii] = -1
-
- label_indexer[lab, idxer_slot] = ii
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_fillna_indexer(
- ndarray[intp_t] out,
- ndarray[intp_t] labels,
- ndarray[intp_t] sorted_labels,
- ndarray[uint8_t] mask,
- str direction,
- int64_t limit,
- bint dropna,
-) -> None:
- """
- Indexes how to fill values forwards or backwards within a group.
-
- Parameters
- ----------
- out : np.ndarray[np.intp]
- Values into which this method will write its results.
- labels : np.ndarray[np.intp]
- Array containing unique label for each group, with its ordering
- matching up to the corresponding record in `values`.
- sorted_labels : np.ndarray[np.intp]
-        Obtained by `np.argsort(labels, kind="mergesort")`; reversed if
-        direction == "bfill".
- mask : np.ndarray[np.uint8]
- Indicating whether a value is na or not.
- direction : {'ffill', 'bfill'}
- Direction for fill to be applied (forwards or backwards, respectively)
-    limit : int64_t
-        Consecutive values to fill before stopping, or -1 for no limit.
-    dropna : bool
-        Flag to indicate if NaN groups should return all NaN values.
-
- Notes
- -----
- This method modifies the `out` parameter rather than returning an object
- """
- cdef:
- Py_ssize_t i, N, idx
- intp_t curr_fill_idx=-1
- int64_t filled_vals = 0
-
- N = len(out)
-
- # Make sure all arrays are the same size
- assert N == len(labels) == len(mask)
-
- with nogil:
- for i in range(N):
- idx = sorted_labels[i]
- if dropna and labels[idx] == -1: # nan-group gets nan-values
- curr_fill_idx = -1
- elif mask[idx] == 1: # is missing
- # Stop filling once we've hit the limit
- if filled_vals >= limit and limit != -1:
- curr_fill_idx = -1
- filled_vals += 1
- else: # reset items when not missing
- filled_vals = 0
- curr_fill_idx = idx
-
- out[idx] = curr_fill_idx
-
- # If we move to the next group, reset
- # the fill_idx and counter
- if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
- curr_fill_idx = -1
- filled_vals = 0
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_any_all(
- int8_t[:, ::1] out,
- const int8_t[:, :] values,
- const intp_t[::1] labels,
- const uint8_t[:, :] mask,
- str val_test,
- bint skipna,
- bint nullable,
-) -> None:
- """
-    Aggregate boolean values to show the truthfulness of group elements. If the
- input is a nullable type (nullable=True), the result will be computed
- using Kleene logic.
-
- Parameters
- ----------
- out : np.ndarray[np.int8]
- Values into which this method will write its results.
- labels : np.ndarray[np.intp]
- Array containing unique label for each group, with its
- ordering matching up to the corresponding record in `values`
- values : np.ndarray[np.int8]
- Containing the truth value of each element.
- mask : np.ndarray[np.uint8]
- Indicating whether a value is na or not.
- val_test : {'any', 'all'}
- String object dictating whether to use any or all truth testing
- skipna : bool
- Flag to ignore nan values during truth testing
- nullable : bool
- Whether or not the input is a nullable type. If True, the
- result will be computed using Kleene logic
-
- Notes
- -----
- This method modifies the `out` parameter rather than returning an object.
-    The values written to `out` will be 0 or 1 (False or True, respectively), or
- -1 to signify a masked position in the case of a nullable input.
- """
- cdef:
- Py_ssize_t i, j, N = len(labels), K = out.shape[1]
- intp_t lab
- int8_t flag_val, val
-
- if val_test == "all":
- # Because the 'all' value of an empty iterable in Python is True we can
- # start with an array full of ones and set to zero when a False value
- # is encountered
- flag_val = 0
- elif val_test == "any":
- # Because the 'any' value of an empty iterable in Python is False we
- # can start with an array full of zeros and set to one only if any
- # value encountered is True
- flag_val = 1
- else:
- raise ValueError("'bool_func' must be either 'any' or 'all'!")
-
- out[:] = 1 - flag_val
-
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- for j in range(K):
- if skipna and mask[i, j]:
- continue
-
- if nullable and mask[i, j]:
- # Set the position as masked if `out[lab] != flag_val`, which
- # would indicate True/False has not yet been seen for any/all,
- # so by Kleene logic the result is currently unknown
- if out[lab, j] != flag_val:
- out[lab, j] = -1
- continue
-
- val = values[i, j]
-
- # If True and 'any' or False and 'all', the result is
- # already determined
- if val == flag_val:
- out[lab, j] = flag_val
-
-
-# ----------------------------------------------------------------------
-# group_sum, group_prod, group_var, group_mean, group_ohlc
-# ----------------------------------------------------------------------
-
-ctypedef fused mean_t:
- float64_t
- float32_t
- complex64_t
- complex128_t
-
-ctypedef fused sum_t:
- mean_t
- int64_t
- uint64_t
- object
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_sum(
- sum_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[sum_t, ndim=2] values,
- const intp_t[::1] labels,
- const uint8_t[:, :] mask,
- uint8_t[:, ::1] result_mask=None,
- Py_ssize_t min_count=0,
- bint is_datetimelike=False,
-) -> None:
- """
- Only aggregates on axis=0 using Kahan summation
- """
- cdef:
- Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
- sum_t val, t, y
- sum_t[:, ::1] sumx, compensation
- int64_t[:, ::1] nobs
- Py_ssize_t len_values = len(values), len_labels = len(labels)
- bint uses_mask = mask is not None
- bint isna_entry
-
- if len_values != len_labels:
- raise ValueError("len(index) != len(labels)")
-
- nobs = np.zeros((<object>out).shape, dtype=np.int64)
- # the below is equivalent to `np.zeros_like(out)` but faster
- sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
- compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
-
- N, K = (<object>values).shape
-
- if sum_t is object:
- # NB: this does not use 'compensation' like the non-object track does.
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- # not nan
- if not checknull(val):
- nobs[lab, j] += 1
-
- if nobs[lab, j] == 1:
- # i.e. we haven't added anything yet; avoid TypeError
- # if e.g. val is a str and sumx[lab, j] is 0
- t = val
- else:
- t = sumx[lab, j] + val
- sumx[lab, j] = t
-
- for i in range(ncounts):
- for j in range(K):
- if nobs[i, j] < min_count:
- out[i, j] = None
-
- else:
- out[i, j] = sumx[i, j]
- else:
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- y = val - compensation[lab, j]
- t = sumx[lab, j] + y
- compensation[lab, j] = t - sumx[lab, j] - y
- sumx[lab, j] = t
-
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
- )
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_prod(
- int64float_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[int64float_t, ndim=2] values,
- const intp_t[::1] labels,
- const uint8_t[:, ::1] mask,
- uint8_t[:, ::1] result_mask=None,
- Py_ssize_t min_count=0,
-) -> None:
- """
- Only aggregates on axis=0
- """
- cdef:
- Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
- int64float_t val
- int64float_t[:, ::1] prodx
- int64_t[:, ::1] nobs
- Py_ssize_t len_values = len(values), len_labels = len(labels)
- bint isna_entry, uses_mask = mask is not None
-
- if len_values != len_labels:
- raise ValueError("len(index) != len(labels)")
-
- nobs = np.zeros((<object>out).shape, dtype=np.int64)
- prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)
-
- N, K = (<object>values).shape
-
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, False)
-
- if not isna_entry:
- nobs[lab, j] += 1
- prodx[lab, j] *= val
-
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
- )
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-@cython.cdivision(True)
-def group_var(
- floating[:, ::1] out,
- int64_t[::1] counts,
- ndarray[floating, ndim=2] values,
- const intp_t[::1] labels,
- Py_ssize_t min_count=-1,
- int64_t ddof=1,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
- bint is_datetimelike=False,
-) -> None:
- cdef:
- Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
- floating val, ct, oldmean
- floating[:, ::1] mean
- int64_t[:, ::1] nobs
- Py_ssize_t len_values = len(values), len_labels = len(labels)
- bint isna_entry, uses_mask = mask is not None
-
- assert min_count == -1, "'min_count' only used in sum and prod"
-
- if len_values != len_labels:
- raise ValueError("len(index) != len(labels)")
-
- nobs = np.zeros((<object>out).shape, dtype=np.int64)
- mean = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
-
- N, K = (<object>values).shape
-
- out[:, :] = 0.0
-
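-    # Welford's online algorithm: `mean` holds the running per-group means and
-    # `out` accumulates the sum of squared deviations, which is divided by
-    # (count - ddof) at the end.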
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
-
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- elif is_datetimelike:
- # With group_var, we cannot just use _treat_as_na bc
- # datetimelike dtypes get cast to float64 instead of
- # to int64.
- isna_entry = val == NPY_NAT
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- oldmean = mean[lab, j]
- mean[lab, j] += (val - oldmean) / nobs[lab, j]
- out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
-
- for i in range(ncounts):
- for j in range(K):
- ct = nobs[i, j]
- if ct <= ddof:
- if uses_mask:
- result_mask[i, j] = True
- else:
- out[i, j] = NAN
- else:
- out[i, j] /= (ct - ddof)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_mean(
- mean_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[mean_t, ndim=2] values,
- const intp_t[::1] labels,
- Py_ssize_t min_count=-1,
- bint is_datetimelike=False,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """
- Compute the mean per label given a label assignment for each value.
- NaN values are ignored.
-
- Parameters
- ----------
- out : np.ndarray[floating]
- Values into which this method will write its results.
- counts : np.ndarray[int64]
-        A zeroed array of the same shape as labels,
-        populated with group sizes during the algorithm.
- values : np.ndarray[floating]
- 2-d array of the values to find the mean of.
- labels : np.ndarray[np.intp]
- Array containing unique label for each group, with its
- ordering matching up to the corresponding record in `values`.
- min_count : Py_ssize_t
- Only used in sum and prod. Always -1.
- is_datetimelike : bool
- True if `values` contains datetime-like entries.
- mask : ndarray[bool, ndim=2], optional
- Mask of the input values.
- result_mask : ndarray[bool, ndim=2], optional
- Mask of the out array
-
- Notes
- -----
- This method modifies the `out` parameter rather than returning an object.
- `counts` is modified to hold group sizes
- """
-
- cdef:
- Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
- mean_t val, count, y, t, nan_val
- mean_t[:, ::1] sumx, compensation
- int64_t[:, ::1] nobs
- Py_ssize_t len_values = len(values), len_labels = len(labels)
- bint isna_entry, uses_mask = mask is not None
-
- assert min_count == -1, "'min_count' only used in sum and prod"
-
- if len_values != len_labels:
- raise ValueError("len(index) != len(labels)")
-
- # the below is equivalent to `np.zeros_like(out)` but faster
- nobs = np.zeros((<object>out).shape, dtype=np.int64)
- sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
- compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
-
- N, K = (<object>values).shape
- if uses_mask:
- nan_val = 0
- elif is_datetimelike:
- nan_val = NPY_NAT
- else:
- nan_val = NAN
-
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- elif is_datetimelike:
- # With group_mean, we cannot just use _treat_as_na bc
- # datetimelike dtypes get cast to float64 instead of
- # to int64.
- isna_entry = val == NPY_NAT
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- y = val - compensation[lab, j]
- t = sumx[lab, j] + y
- compensation[lab, j] = t - sumx[lab, j] - y
- sumx[lab, j] = t
-
- for i in range(ncounts):
- for j in range(K):
- count = nobs[i, j]
- if nobs[i, j] == 0:
-
- if uses_mask:
- result_mask[i, j] = True
- else:
- out[i, j] = nan_val
-
- else:
- out[i, j] = sumx[i, j] / count
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_ohlc(
- int64float_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[int64float_t, ndim=2] values,
- const intp_t[::1] labels,
- Py_ssize_t min_count=-1,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """
- Only aggregates on axis=0
- """
- cdef:
- Py_ssize_t i, N, K, lab
- int64float_t val
- uint8_t[::1] first_element_set
- bint isna_entry, uses_mask = mask is not None
-
- assert min_count == -1, "'min_count' only used in sum and prod"
-
- if len(labels) == 0:
- return
-
- N, K = (<object>values).shape
-
- if out.shape[1] != 4:
- raise ValueError("Output array must have 4 columns")
-
- if K > 1:
- raise NotImplementedError("Argument 'values' must have only one dimension")
-
- if int64float_t is float32_t or int64float_t is float64_t:
- out[:] = np.nan
- else:
- out[:] = 0
-
- first_element_set = np.zeros((<object>counts).shape, dtype=np.uint8)
- if uses_mask:
- result_mask[:] = True
-
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab == -1:
- continue
-
- counts[lab] += 1
- val = values[i, 0]
-
- if uses_mask:
- isna_entry = mask[i, 0]
- else:
- isna_entry = _treat_as_na(val, False)
-
- if isna_entry:
- continue
-
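-            # Columns of `out` are open, high, low, close: the first non-NA
-            # value in a group seeds all four; later values update high/low
-            # and overwrite close.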
- if not first_element_set[lab]:
- out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
- first_element_set[lab] = True
- if uses_mask:
- result_mask[lab] = False
- else:
- out[lab, 1] = max(out[lab, 1], val)
- out[lab, 2] = min(out[lab, 2], val)
- out[lab, 3] = val
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_quantile(
- ndarray[float64_t, ndim=2] out,
- ndarray[numeric_t, ndim=1] values,
- ndarray[intp_t] labels,
- ndarray[uint8_t] mask,
- const intp_t[:] sort_indexer,
- const float64_t[:] qs,
- str interpolation,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """
- Calculate the quantile per group.
-
- Parameters
- ----------
- out : np.ndarray[np.float64, ndim=2]
- Array of aggregated values that will be written to.
- values : np.ndarray
- Array containing the values to apply the function against.
- labels : ndarray[np.intp]
- Array containing the unique group labels.
- sort_indexer : ndarray[np.intp]
- Indices describing sort order by values and labels.
- qs : ndarray[float64_t]
- The quantile values to search for.
-    interpolation : {'linear', 'lower', 'higher', 'nearest', 'midpoint'}
-
- Notes
- -----
- Rather than explicitly returning a value, this function modifies the
- provided `out` parameter.
- """
- cdef:
- Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz, k, nqs
- Py_ssize_t grp_start=0, idx=0
- intp_t lab
- InterpolationEnumType interp
- float64_t q_val, q_idx, frac, val, next_val
- int64_t[::1] counts, non_na_counts
- bint uses_result_mask = result_mask is not None
-
- assert values.shape[0] == N
-
- if any(not (0 <= q <= 1) for q in qs):
- wrong = [x for x in qs if not (0 <= x <= 1)][0]
- raise ValueError(
- f"Each 'q' must be between 0 and 1. Got '{wrong}' instead"
- )
-
- inter_methods = {
- "linear": INTERPOLATION_LINEAR,
- "lower": INTERPOLATION_LOWER,
- "higher": INTERPOLATION_HIGHER,
- "nearest": INTERPOLATION_NEAREST,
- "midpoint": INTERPOLATION_MIDPOINT,
- }
- interp = inter_methods[interpolation]
-
- nqs = len(qs)
- ngroups = len(out)
- counts = np.zeros(ngroups, dtype=np.int64)
- non_na_counts = np.zeros(ngroups, dtype=np.int64)
-
- # First figure out the size of every group
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab == -1: # NA group label
- continue
-
- counts[lab] += 1
- if not mask[i]:
- non_na_counts[lab] += 1
-
- with nogil:
- for i in range(ngroups):
- # Figure out how many group elements there are
- grp_sz = counts[i]
- non_na_sz = non_na_counts[i]
-
- if non_na_sz == 0:
- for k in range(nqs):
- if uses_result_mask:
- result_mask[i, k] = 1
- else:
- out[i, k] = NaN
- else:
- for k in range(nqs):
- q_val = qs[k]
-
- # Calculate where to retrieve the desired value
- # Casting to int will intentionally truncate result
- idx = grp_start + <int64_t>(q_val * <float64_t>(non_na_sz - 1))
-
- val = values[sort_indexer[idx]]
- # If requested quantile falls evenly on a particular index
- # then write that index's value out. Otherwise interpolate
- q_idx = q_val * (non_na_sz - 1)
- frac = q_idx % 1
-
- if frac == 0.0 or interp == INTERPOLATION_LOWER:
- out[i, k] = val
- else:
- next_val = values[sort_indexer[idx + 1]]
- if interp == INTERPOLATION_LINEAR:
- out[i, k] = val + (next_val - val) * frac
- elif interp == INTERPOLATION_HIGHER:
- out[i, k] = next_val
- elif interp == INTERPOLATION_MIDPOINT:
- out[i, k] = (val + next_val) / 2.0
- elif interp == INTERPOLATION_NEAREST:
- if frac > .5 or (frac == .5 and q_val > .5): # Always OK?
- out[i, k] = next_val
- else:
- out[i, k] = val
-
- # Increment the index reference in sorted_arr for the next group
- grp_start += grp_sz
-
-
-# ----------------------------------------------------------------------
-# group_nth, group_last, group_rank
-# ----------------------------------------------------------------------
-
-ctypedef fused numeric_object_complex_t:
- numeric_object_t
- complex64_t
- complex128_t
-
-
-cdef bint _treat_as_na(numeric_object_complex_t val, bint is_datetimelike) nogil:
- if numeric_object_complex_t is object:
- # Should never be used, but we need to avoid the `val != val` below
- # or else cython will raise about gil acquisition.
- raise NotImplementedError
-
- elif numeric_object_complex_t is int64_t:
- return is_datetimelike and val == NPY_NAT
- elif (
- numeric_object_complex_t is float32_t
- or numeric_object_complex_t is float64_t
- or numeric_object_complex_t is complex64_t
- or numeric_object_complex_t is complex128_t
- ):
- return val != val
- else:
- # non-datetimelike integer
- return False
-
-
-cdef numeric_object_t _get_min_or_max(
- numeric_object_t val,
- bint compute_max,
- bint is_datetimelike,
-):
- """
- Find either the min or the max supported by numeric_object_t; 'val' is a
- placeholder to effectively make numeric_object_t an argument.
- """
- return get_rank_nan_fill_val(
- not compute_max,
- val=val,
- is_datetimelike=is_datetimelike,
- )
-
-
-cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
- cdef:
- numeric_t na_val
-
- if numeric_t == float32_t or numeric_t == float64_t:
- na_val = NaN
- elif numeric_t is int64_t and is_datetimelike:
- na_val = NPY_NAT
- else:
- # Used in case of masks
- na_val = 0
- return na_val
-
-
-ctypedef fused mincount_t:
- numeric_t
- complex64_t
- complex128_t
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline void _check_below_mincount(
- mincount_t[:, ::1] out,
- bint uses_mask,
- uint8_t[:, ::1] result_mask,
- Py_ssize_t ncounts,
- Py_ssize_t K,
- int64_t[:, ::1] nobs,
- int64_t min_count,
- mincount_t[:, ::1] resx,
-) nogil:
- """
- Check if the number of observations for a group is below min_count,
- and if so set the result for that group to the appropriate NA-like value.
- """
- cdef:
- Py_ssize_t i, j
-
- for i in range(ncounts):
- for j in range(K):
-
- if nobs[i, j] < min_count:
- # if we are integer dtype, not is_datetimelike, and
- # not uses_mask, then getting here implies that
- # counts[i] < min_count, which means we will
- # be cast to float64 and masked at the end
- # of WrappedCythonOp._call_cython_op. So we can safely
- # set a placeholder value in out[i, j].
- if uses_mask:
- result_mask[i, j] = True
- # set out[i, j] to 0 to be deterministic, as
- # it was initialized with np.empty. Also ensures
- # we can downcast out if appropriate.
- out[i, j] = 0
- elif (
- mincount_t is float32_t
- or mincount_t is float64_t
- or mincount_t is complex64_t
- or mincount_t is complex128_t
- ):
- out[i, j] = NAN
- elif mincount_t is int64_t:
- # Per above, this is a placeholder in
- # non-is_datetimelike cases.
- out[i, j] = NPY_NAT
- else:
- # placeholder, see above
- out[i, j] = 0
- else:
- out[i, j] = resx[i, j]
-
-
-# TODO(cython3): GH#31710 use memoryviews once Cython 3.0 is released so we can
-# use `const numeric_object_t[:, :] values`
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_last(
- numeric_object_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[numeric_object_t, ndim=2] values,
- const intp_t[::1] labels,
- const uint8_t[:, :] mask,
- uint8_t[:, ::1] result_mask=None,
- Py_ssize_t min_count=-1,
- bint is_datetimelike=False,
-) -> None:
- """
- Only aggregates on axis=0
- """
- cdef:
- Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
- numeric_object_t val
- numeric_object_t[:, ::1] resx
- int64_t[:, ::1] nobs
- bint uses_mask = mask is not None
- bint isna_entry
-
- # TODO(cython3):
- # Instead of `labels.shape[0]` use `len(labels)`
- if not len(values) == labels.shape[0]:
- raise AssertionError("len(index) != len(labels)")
-
- min_count = max(min_count, 1)
- nobs = np.zeros((<object>out).shape, dtype=np.int64)
- if numeric_object_t is object:
- resx = np.empty((<object>out).shape, dtype=object)
- else:
- resx = np.empty_like(out)
-
- N, K = (<object>values).shape
-
- if numeric_object_t is object:
- # TODO(cython3): De-duplicate once conditional-nogil is available
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = checknull(val)
-
- if not isna_entry:
- # TODO(cython3): use _treat_as_na here once
- # conditional-nogil is available.
- nobs[lab, j] += 1
- resx[lab, j] = val
-
- for i in range(ncounts):
- for j in range(K):
- if nobs[i, j] < min_count:
- out[i, j] = None
- else:
- out[i, j] = resx[i, j]
- else:
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- resx[lab, j] = val
-
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
- )
-
-
-# TODO(cython3): GH#31710 use memoryviews once Cython 3.0 is released so we can
-# use `const numeric_object_t[:, :] values`
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_nth(
- numeric_object_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[numeric_object_t, ndim=2] values,
- const intp_t[::1] labels,
- const uint8_t[:, :] mask,
- uint8_t[:, ::1] result_mask=None,
- int64_t min_count=-1,
- int64_t rank=1,
- bint is_datetimelike=False,
-) -> None:
- """
- Only aggregates on axis=0
- """
- cdef:
- Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
- numeric_object_t val
- numeric_object_t[:, ::1] resx
- int64_t[:, ::1] nobs
- bint uses_mask = mask is not None
- bint isna_entry
-
- # TODO(cython3):
- # Instead of `labels.shape[0]` use `len(labels)`
- if not len(values) == labels.shape[0]:
- raise AssertionError("len(index) != len(labels)")
-
- min_count = max(min_count, 1)
- nobs = np.zeros((<object>out).shape, dtype=np.int64)
- if numeric_object_t is object:
- resx = np.empty((<object>out).shape, dtype=object)
- else:
- resx = np.empty_like(out)
-
- N, K = (<object>values).shape
-
- if numeric_object_t is object:
- # TODO(cython3): De-duplicate once conditional-nogil is available
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = checknull(val)
-
- if not isna_entry:
- # TODO(cython3): use _treat_as_na here once
- # conditional-nogil is available.
- nobs[lab, j] += 1
- if nobs[lab, j] == rank:
- resx[lab, j] = val
-
- for i in range(ncounts):
- for j in range(K):
- if nobs[i, j] < min_count:
- out[i, j] = None
- else:
- out[i, j] = resx[i, j]
-
- else:
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- if nobs[lab, j] == rank:
- resx[lab, j] = val
-
- _check_below_mincount(
- out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
- )
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_rank(
- float64_t[:, ::1] out,
- ndarray[numeric_object_t, ndim=2] values,
- const intp_t[::1] labels,
- int ngroups,
- bint is_datetimelike,
- str ties_method="average",
- bint ascending=True,
- bint pct=False,
- str na_option="keep",
- const uint8_t[:, :] mask=None,
-) -> None:
- """
- Provides the rank of values within each group.
-
- Parameters
- ----------
- out : np.ndarray[np.float64, ndim=2]
- Values to which this method will write its results.
- values : np.ndarray of numeric_object_t values to be ranked
- labels : np.ndarray[np.intp]
- Array containing unique label for each group, with its ordering
- matching up to the corresponding record in `values`
- ngroups : int
-        This parameter is not used; it is only needed to match the signatures
-        of other groupby functions.
- is_datetimelike : bool
- True if `values` contains datetime-like entries.
- ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
- * average: average rank of group
- * min: lowest rank in group
- * max: highest rank in group
- * first: ranks assigned in order they appear in the array
- * dense: like 'min', but rank always increases by 1 between groups
- ascending : bool, default True
- False for ranks by high (1) to low (N)
- pct : bool, default False
- Compute percentage rank of data within each group
- na_option : {'keep', 'top', 'bottom'}, default 'keep'
- * keep: leave NA values where they are
- * top: smallest rank if ascending
- * bottom: smallest rank if descending
- mask : np.ndarray[bool] or None, default None
-
- Notes
- -----
- This method modifies the `out` parameter rather than returning an object
- """
- cdef:
- Py_ssize_t i, k, N
- ndarray[float64_t, ndim=1] result
- const uint8_t[:] sub_mask
-
- N = values.shape[1]
-
- for k in range(N):
- if mask is None:
- sub_mask = None
- else:
- sub_mask = mask[:, k]
-
- result = rank_1d(
- values=values[:, k],
- labels=labels,
- is_datetimelike=is_datetimelike,
- ties_method=ties_method,
- ascending=ascending,
- pct=pct,
- na_option=na_option,
- mask=sub_mask,
- )
- for i in range(len(result)):
- if labels[i] >= 0:
- out[i, k] = result[i]
-
-
-# ----------------------------------------------------------------------
-# group_min, group_max
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef group_min_max(
- numeric_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[numeric_t, ndim=2] values,
- const intp_t[::1] labels,
- Py_ssize_t min_count=-1,
- bint is_datetimelike=False,
- bint compute_max=True,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
-):
- """
- Compute minimum/maximum of columns of `values`, in row groups `labels`.
-
- Parameters
- ----------
- out : np.ndarray[numeric_t, ndim=2]
- Array to store result in.
- counts : np.ndarray[int64]
-        Input as a zeroed array, populated with group sizes during the algorithm.
- values : array
- Values to find column-wise min/max of.
- labels : np.ndarray[np.intp]
- Labels to group by.
- min_count : Py_ssize_t, default -1
- The minimum number of non-NA group elements, NA result if threshold
- is not met
- is_datetimelike : bool
- True if `values` contains datetime-like entries.
- compute_max : bint, default True
- True to compute group-wise max, False to compute min
- mask : ndarray[bool, ndim=2], optional
-        If not None, True entries indicate missing values;
-        otherwise the mask is not used.
- result_mask : ndarray[bool, ndim=2], optional
- If not None, these specify locations in the output that are NA.
- Modified in-place.
-
- Notes
- -----
- This method modifies the `out` parameter, rather than returning an object.
- `counts` is modified to hold group sizes
- """
- cdef:
- Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
- numeric_t val
- numeric_t[:, ::1] group_min_or_max
- int64_t[:, ::1] nobs
- bint uses_mask = mask is not None
- bint isna_entry
-
- # TODO(cython3):
- # Instead of `labels.shape[0]` use `len(labels)`
- if not len(values) == labels.shape[0]:
- raise AssertionError("len(index) != len(labels)")
-
- min_count = max(min_count, 1)
- nobs = np.zeros((<object>out).shape, dtype=np.int64)
-
- group_min_or_max = np.empty_like(out)
- group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
-
- N, K = (<object>values).shape
-
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
-
- counts[lab] += 1
- for j in range(K):
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- nobs[lab, j] += 1
- if compute_max:
- if val > group_min_or_max[lab, j]:
- group_min_or_max[lab, j] = val
- else:
- if val < group_min_or_max[lab, j]:
- group_min_or_max[lab, j] = val
-
- _check_below_mincount(
- out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
- )
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_max(
- numeric_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[numeric_t, ndim=2] values,
- const intp_t[::1] labels,
- Py_ssize_t min_count=-1,
- bint is_datetimelike=False,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """See group_min_max.__doc__"""
- group_min_max(
- out,
- counts,
- values,
- labels,
- min_count=min_count,
- is_datetimelike=is_datetimelike,
- compute_max=True,
- mask=mask,
- result_mask=result_mask,
- )
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_min(
- numeric_t[:, ::1] out,
- int64_t[::1] counts,
- ndarray[numeric_t, ndim=2] values,
- const intp_t[::1] labels,
- Py_ssize_t min_count=-1,
- bint is_datetimelike=False,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
-) -> None:
- """See group_min_max.__doc__"""
- group_min_max(
- out,
- counts,
- values,
- labels,
- min_count=min_count,
- is_datetimelike=is_datetimelike,
- compute_max=False,
- mask=mask,
- result_mask=result_mask,
- )
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef group_cummin_max(
- numeric_t[:, ::1] out,
- ndarray[numeric_t, ndim=2] values,
- const uint8_t[:, ::1] mask,
- uint8_t[:, ::1] result_mask,
- const intp_t[::1] labels,
- int ngroups,
- bint is_datetimelike,
- bint skipna,
- bint compute_max,
-):
- """
- Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
-
- Parameters
- ----------
- out : np.ndarray[numeric_t, ndim=2]
- Array to store cummin/max in.
- values : np.ndarray[numeric_t, ndim=2]
- Values to take cummin/max of.
- mask : np.ndarray[bool] or None
-        If not None, True entries indicate missing values;
-        otherwise the mask is not used.
- result_mask : ndarray[bool, ndim=2], optional
- If not None, these specify locations in the output that are NA.
- Modified in-place.
- labels : np.ndarray[np.intp]
- Labels to group by.
- ngroups : int
- Number of groups, larger than all entries of `labels`.
- is_datetimelike : bool
- True if `values` contains datetime-like entries.
- skipna : bool
- If True, ignore nans in `values`.
- compute_max : bool
- True if cumulative maximum should be computed, False
- if cumulative minimum should be computed
-
- Notes
- -----
- This method modifies the `out` parameter, rather than returning an object.
- """
- cdef:
- numeric_t[:, ::1] accum
- Py_ssize_t i, j, N, K
- numeric_t val, mval, na_val
- uint8_t[:, ::1] seen_na
- intp_t lab
- bint na_possible
- bint uses_mask = mask is not None
- bint isna_entry
-
- accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
- accum[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
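-    # accum holds the running min/max per (group, column); it is seeded with
-    # the opposite extreme so the first non-NA value always replaces it.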
-
- na_val = _get_na_val(<numeric_t>0, is_datetimelike)
-
- if uses_mask:
- na_possible = True
- # Will never be used, just to avoid uninitialized warning
- na_val = 0
- elif numeric_t is float64_t or numeric_t is float32_t:
- na_possible = True
- elif is_datetimelike:
- na_possible = True
- else:
- # Will never be used, just to avoid uninitialized warning
- na_possible = False
-
- if na_possible:
- seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
-
- N, K = (<object>values).shape
- with nogil:
- for i in range(N):
- lab = labels[i]
- if lab < 0:
- continue
- for j in range(K):
-
- if not skipna and na_possible and seen_na[lab, j]:
- if uses_mask:
- result_mask[i, j] = 1
- # Set to 0 ensures that we are deterministic and can
- # downcast if appropriate
- out[i, j] = 0
-
- else:
- out[i, j] = na_val
- else:
- val = values[i, j]
-
- if uses_mask:
- isna_entry = mask[i, j]
- else:
- isna_entry = _treat_as_na(val, is_datetimelike)
-
- if not isna_entry:
- mval = accum[lab, j]
- if compute_max:
- if val > mval:
- accum[lab, j] = mval = val
- else:
- if val < mval:
- accum[lab, j] = mval = val
- out[i, j] = mval
- else:
- seen_na[lab, j] = 1
- out[i, j] = val
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cummin(
- numeric_t[:, ::1] out,
- ndarray[numeric_t, ndim=2] values,
- const intp_t[::1] labels,
- int ngroups,
- bint is_datetimelike,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
- bint skipna=True,
-) -> None:
- """See group_cummin_max.__doc__"""
- group_cummin_max(
- out=out,
- values=values,
- mask=mask,
- result_mask=result_mask,
- labels=labels,
- ngroups=ngroups,
- is_datetimelike=is_datetimelike,
- skipna=skipna,
- compute_max=False,
- )
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cummax(
- numeric_t[:, ::1] out,
- ndarray[numeric_t, ndim=2] values,
- const intp_t[::1] labels,
- int ngroups,
- bint is_datetimelike,
- const uint8_t[:, ::1] mask=None,
- uint8_t[:, ::1] result_mask=None,
- bint skipna=True,
-) -> None:
- """See group_cummin_max.__doc__"""
- group_cummin_max(
- out=out,
- values=values,
- mask=mask,
- result_mask=result_mask,
- labels=labels,
- ngroups=ngroups,
- is_datetimelike=is_datetimelike,
- skipna=skipna,
- compute_max=True,
- )
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashing.pyi b/contrib/python/pandas/py3/pandas/_libs/hashing.pyi
deleted file mode 100644
index 8361026e4a8..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashing.pyi
+++ /dev/null
@@ -1,9 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-def hash_object_array(
- arr: npt.NDArray[np.object_],
- key: str,
- encoding: str = ...,
-) -> npt.NDArray[np.uint64]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashing.pyx b/contrib/python/pandas/py3/pandas/_libs/hashing.pyx
deleted file mode 100644
index 197ec99247b..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashing.pyx
+++ /dev/null
@@ -1,194 +0,0 @@
-# Translated from the reference implementation
-# at https://github.com/veorq/SipHash
-
-cimport cython
-from libc.stdlib cimport (
- free,
- malloc,
-)
-
-import numpy as np
-
-from numpy cimport (
- import_array,
- ndarray,
- uint8_t,
- uint64_t,
-)
-
-import_array()
-
-from pandas._libs.util cimport is_nan
-
-
-@cython.boundscheck(False)
-def hash_object_array(
- ndarray[object] arr, str key, str encoding="utf8"
-) -> np.ndarray[np.uint64]:
- """
- Parameters
- ----------
-    arr : 1-d object ndarray of objects
-    key : str
-        Hash key; must be exactly 16 bytes long once encoded.
-    encoding : str, default 'utf8'
-        Encoding for `key` and `arr`.
-
- Returns
- -------
- 1-d uint64 ndarray of hashes.
-
- Raises
- ------
- TypeError
- If the array contains mixed types.
-
- Notes
- -----
-    Allowed values must be strings or nulls;
-    mixed array types will raise TypeError.
- """
- cdef:
- Py_ssize_t i, n
- uint64_t[::1] result
- bytes data, k
- uint8_t *kb
- uint64_t *lens
- char **vecs
- char *cdata
- object val
- list datas = []
-
- k = <bytes>key.encode(encoding)
- kb = <uint8_t *>k
- if len(k) != 16:
- raise ValueError(
- f"key should be a 16-byte string encoded, got {k} (len {len(k)})"
- )
-
- n = len(arr)
-
- # create an array of bytes
- vecs = <char **>malloc(n * sizeof(char *))
- lens = <uint64_t*>malloc(n * sizeof(uint64_t))
-
- for i in range(n):
- val = arr[i]
- if isinstance(val, bytes):
- data = <bytes>val
- elif isinstance(val, str):
- data = <bytes>val.encode(encoding)
- elif val is None or is_nan(val):
- # null, stringify and encode
- data = <bytes>str(val).encode(encoding)
-
- elif isinstance(val, tuple):
- # GH#28969 we could have a tuple, but need to ensure that
- # the tuple entries are themselves hashable before converting
- # to str
- hash(val)
- data = <bytes>str(val).encode(encoding)
- else:
- raise TypeError(
- f"{val} of type {type(val)} is not a valid type for hashing, "
- "must be string or null"
- )
-
- lens[i] = len(data)
- cdata = data
-
- # keep the references alive through the end of the
- # function
- datas.append(data)
- vecs[i] = cdata
-
- result = np.empty(n, dtype=np.uint64)
- with nogil:
- for i in range(n):
- result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
-
- free(vecs)
- free(lens)
- return result.base # .base to retrieve underlying np.ndarray
-
-
-cdef uint64_t _rotl(uint64_t x, uint64_t b) nogil:
- return (x << b) | (x >> (64 - b))
-
-
-cdef uint64_t u8to64_le(uint8_t* p) nogil:
- return (<uint64_t>p[0] |
- <uint64_t>p[1] << 8 |
- <uint64_t>p[2] << 16 |
- <uint64_t>p[3] << 24 |
- <uint64_t>p[4] << 32 |
- <uint64_t>p[5] << 40 |
- <uint64_t>p[6] << 48 |
- <uint64_t>p[7] << 56)
-
-
-cdef void _sipround(uint64_t* v0, uint64_t* v1,
- uint64_t* v2, uint64_t* v3) nogil:
- v0[0] += v1[0]
- v1[0] = _rotl(v1[0], 13)
- v1[0] ^= v0[0]
- v0[0] = _rotl(v0[0], 32)
- v2[0] += v3[0]
- v3[0] = _rotl(v3[0], 16)
- v3[0] ^= v2[0]
- v0[0] += v3[0]
- v3[0] = _rotl(v3[0], 21)
- v3[0] ^= v0[0]
- v2[0] += v1[0]
- v1[0] = _rotl(v1[0], 17)
- v1[0] ^= v2[0]
- v2[0] = _rotl(v2[0], 32)
-
-
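-# SipHash-2-4 core (cROUNDS=2, dROUNDS=4), following the reference
-# implementation linked at the top of this file.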
-@cython.cdivision(True)
-cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
- uint8_t* key) nogil:
- cdef uint64_t v0 = 0x736f6d6570736575ULL
- cdef uint64_t v1 = 0x646f72616e646f6dULL
- cdef uint64_t v2 = 0x6c7967656e657261ULL
- cdef uint64_t v3 = 0x7465646279746573ULL
- cdef uint64_t b
- cdef uint64_t k0 = u8to64_le(key)
- cdef uint64_t k1 = u8to64_le(key + 8)
- cdef uint64_t m
- cdef int i
- cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
- cdef int left = datalen & 7
- cdef int cROUNDS = 2
- cdef int dROUNDS = 4
-
- b = (<uint64_t>datalen) << 56
- v3 ^= k1
- v2 ^= k0
- v1 ^= k1
- v0 ^= k0
-
- while (data != end):
- m = u8to64_le(data)
- v3 ^= m
- for i in range(cROUNDS):
- _sipround(&v0, &v1, &v2, &v3)
- v0 ^= m
-
- data += sizeof(uint64_t)
-
- for i in range(left-1, -1, -1):
- b |= (<uint64_t>data[i]) << (i * 8)
-
- v3 ^= b
-
- for i in range(cROUNDS):
- _sipround(&v0, &v1, &v2, &v3)
-
- v0 ^= b
- v2 ^= 0xff
-
- for i in range(dROUNDS):
- _sipround(&v0, &v1, &v2, &v3)
-
- b = v0 ^ v1 ^ v2 ^ v3
-
- return b
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable.pxd b/contrib/python/pandas/py3/pandas/_libs/hashtable.pxd
deleted file mode 100644
index 6f66884ac82..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashtable.pxd
+++ /dev/null
@@ -1,189 +0,0 @@
-from numpy cimport (
- intp_t,
- ndarray,
-)
-
-from pandas._libs.khash cimport (
- complex64_t,
- complex128_t,
- float32_t,
- float64_t,
- int8_t,
- int16_t,
- int32_t,
- int64_t,
- kh_complex64_t,
- kh_complex128_t,
- kh_float32_t,
- kh_float64_t,
- kh_int8_t,
- kh_int16_t,
- kh_int32_t,
- kh_int64_t,
- kh_pymap_t,
- kh_str_t,
- kh_uint8_t,
- kh_uint16_t,
- kh_uint32_t,
- kh_uint64_t,
- khcomplex64_t,
- khcomplex128_t,
- uint8_t,
- uint16_t,
- uint32_t,
- uint64_t,
-)
-
-# prototypes for sharing
-
-cdef class HashTable:
- pass
-
-cdef class UInt64HashTable(HashTable):
- cdef kh_uint64_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, uint64_t val)
- cpdef set_item(self, uint64_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Int64HashTable(HashTable):
- cdef kh_int64_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, int64_t val)
- cpdef set_item(self, int64_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class UInt32HashTable(HashTable):
- cdef kh_uint32_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, uint32_t val)
- cpdef set_item(self, uint32_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Int32HashTable(HashTable):
- cdef kh_int32_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, int32_t val)
- cpdef set_item(self, int32_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class UInt16HashTable(HashTable):
- cdef kh_uint16_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, uint16_t val)
- cpdef set_item(self, uint16_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Int16HashTable(HashTable):
- cdef kh_int16_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, int16_t val)
- cpdef set_item(self, int16_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class UInt8HashTable(HashTable):
- cdef kh_uint8_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, uint8_t val)
- cpdef set_item(self, uint8_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Int8HashTable(HashTable):
- cdef kh_int8_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, int8_t val)
- cpdef set_item(self, int8_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Float64HashTable(HashTable):
- cdef kh_float64_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, float64_t val)
- cpdef set_item(self, float64_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Float32HashTable(HashTable):
- cdef kh_float32_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, float32_t val)
- cpdef set_item(self, float32_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Complex64HashTable(HashTable):
- cdef kh_complex64_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, complex64_t val)
- cpdef set_item(self, complex64_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class Complex128HashTable(HashTable):
- cdef kh_complex128_t *table
- cdef int64_t na_position
- cdef bint uses_mask
-
- cpdef get_item(self, complex128_t val)
- cpdef set_item(self, complex128_t key, Py_ssize_t val)
- cpdef get_na(self)
- cpdef set_na(self, Py_ssize_t val)
-
-cdef class PyObjectHashTable(HashTable):
- cdef kh_pymap_t *table
-
- cpdef get_item(self, object val)
- cpdef set_item(self, object key, Py_ssize_t val)
-
-
-cdef class StringHashTable(HashTable):
- cdef kh_str_t *table
-
- cpdef get_item(self, str val)
- cpdef set_item(self, str key, Py_ssize_t val)
-
-cdef struct Int64VectorData:
- int64_t *data
- Py_ssize_t n, m
-
-cdef class Vector:
- cdef bint external_view_exists
-
-cdef class Int64Vector(Vector):
- cdef Int64VectorData *data
- cdef ndarray ao
-
- cdef resize(self)
- cpdef ndarray to_array(self)
- cdef void append(self, int64_t x)
- cdef extend(self, int64_t[:] x)
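Editor's note: every dtype-specialised table declared in the deleted .pxd above exposes the same small contract: get_item/set_item map a key to an integer position (KeyError on a miss), while get_na/set_na track a single NA slot through na_position when uses_mask is enabled. A hypothetical dict-backed stand-in, only to show the shape of that contract (Int64Table is not a pandas class):

class Int64Table:
    def __init__(self) -> None:
        self.table: dict[int, int] = {}   # key -> position
        self.na_position: int = -1        # -1 means no NA stored

    def set_item(self, key: int, val: int) -> None:
        self.table[key] = val

    def get_item(self, key: int) -> int:
        return self.table[key]            # KeyError on a missing key, like the Cython table

    def set_na(self, val: int) -> None:
        self.na_position = val

    def get_na(self) -> int:
        if self.na_position == -1:
            raise KeyError("NA")
        return self.na_position

table = Int64Table()
table.set_item(42, 0)
assert table.get_item(42) == 0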
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable.pyi b/contrib/python/pandas/py3/pandas/_libs/hashtable.pyi
deleted file mode 100644
index 2bc6d74fe6a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashtable.pyi
+++ /dev/null
@@ -1,251 +0,0 @@
-from typing import (
- Any,
- Hashable,
- Literal,
-)
-
-import numpy as np
-
-from pandas._typing import npt
-
-def unique_label_indices(
- labels: np.ndarray, # const int64_t[:]
-) -> np.ndarray: ...
-
-class Factorizer:
- count: int
- uniques: Any
- def __init__(self, size_hint: int) -> None: ...
- def get_count(self) -> int: ...
- def factorize(
- self,
- values: np.ndarray,
- sort: bool = ...,
- na_sentinel=...,
- na_value=...,
- mask=...,
- ) -> npt.NDArray[np.intp]: ...
-
-class ObjectFactorizer(Factorizer):
- table: PyObjectHashTable
- uniques: ObjectVector
-
-class Int64Factorizer(Factorizer):
- table: Int64HashTable
- uniques: Int64Vector
-
-class UInt64Factorizer(Factorizer):
- table: UInt64HashTable
- uniques: UInt64Vector
-
-class Int32Factorizer(Factorizer):
- table: Int32HashTable
- uniques: Int32Vector
-
-class UInt32Factorizer(Factorizer):
- table: UInt32HashTable
- uniques: UInt32Vector
-
-class Int16Factorizer(Factorizer):
- table: Int16HashTable
- uniques: Int16Vector
-
-class UInt16Factorizer(Factorizer):
- table: UInt16HashTable
- uniques: UInt16Vector
-
-class Int8Factorizer(Factorizer):
- table: Int8HashTable
- uniques: Int8Vector
-
-class UInt8Factorizer(Factorizer):
- table: UInt8HashTable
- uniques: UInt8Vector
-
-class Float64Factorizer(Factorizer):
- table: Float64HashTable
- uniques: Float64Vector
-
-class Float32Factorizer(Factorizer):
- table: Float32HashTable
- uniques: Float32Vector
-
-class Complex64Factorizer(Factorizer):
- table: Complex64HashTable
- uniques: Complex64Vector
-
-class Complex128Factorizer(Factorizer):
- table: Complex128HashTable
- uniques: Complex128Vector
-
-class Int64Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.int64]: ...
-
-class Int32Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.int32]: ...
-
-class Int16Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.int16]: ...
-
-class Int8Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.int8]: ...
-
-class UInt64Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.uint64]: ...
-
-class UInt32Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.uint32]: ...
-
-class UInt16Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.uint16]: ...
-
-class UInt8Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.uint8]: ...
-
-class Float64Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.float64]: ...
-
-class Float32Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.float32]: ...
-
-class Complex128Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.complex128]: ...
-
-class Complex64Vector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.complex64]: ...
-
-class StringVector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.object_]: ...
-
-class ObjectVector:
- def __init__(self, *args) -> None: ...
- def __len__(self) -> int: ...
- def to_array(self) -> npt.NDArray[np.object_]: ...
-
-class HashTable:
- # NB: The base HashTable class does _not_ actually have these methods;
- # we are putting them here for the sake of mypy to avoid
- # reproducing them in each subclass below.
- def __init__(self, size_hint: int = ..., uses_mask: bool = ...) -> None: ...
- def __len__(self) -> int: ...
- def __contains__(self, key: Hashable) -> bool: ...
- def sizeof(self, deep: bool = ...) -> int: ...
- def get_state(self) -> dict[str, int]: ...
- # TODO: `item` type is subclass-specific
- def get_item(self, item): ... # TODO: return type?
- def set_item(self, item, val) -> None: ...
- def get_na(self): ... # TODO: return type?
- def set_na(self, val) -> None: ...
- def map_locations(
- self,
- values: np.ndarray, # np.ndarray[subclass-specific]
- mask: npt.NDArray[np.bool_] | None = ...,
- ) -> None: ...
- def lookup(
- self,
- values: np.ndarray, # np.ndarray[subclass-specific]
- mask: npt.NDArray[np.bool_] | None = ...,
- ) -> npt.NDArray[np.intp]: ...
- def get_labels(
- self,
- values: np.ndarray, # np.ndarray[subclass-specific]
- uniques, # SubclassTypeVector
- count_prior: int = ...,
- na_sentinel: int = ...,
- na_value: object = ...,
- mask=...,
- ) -> npt.NDArray[np.intp]: ...
- def unique(
- self,
- values: np.ndarray, # np.ndarray[subclass-specific]
- return_inverse: bool = ...,
- ) -> (
- tuple[
- np.ndarray, # np.ndarray[subclass-specific]
- npt.NDArray[np.intp],
- ]
- | np.ndarray
- ): ... # np.ndarray[subclass-specific]
- def factorize(
- self,
- values: np.ndarray, # np.ndarray[subclass-specific]
- na_sentinel: int = ...,
- na_value: object = ...,
- mask=...,
- ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific]
-
-class Complex128HashTable(HashTable): ...
-class Complex64HashTable(HashTable): ...
-class Float64HashTable(HashTable): ...
-class Float32HashTable(HashTable): ...
-
-class Int64HashTable(HashTable):
- # Only Int64HashTable has get_labels_groupby, map_keys_to_values
- def get_labels_groupby(
- self,
- values: npt.NDArray[np.int64], # const int64_t[:]
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]: ...
- def map_keys_to_values(
- self,
- keys: npt.NDArray[np.int64],
- values: npt.NDArray[np.int64], # const int64_t[:]
- ) -> None: ...
-
-class Int32HashTable(HashTable): ...
-class Int16HashTable(HashTable): ...
-class Int8HashTable(HashTable): ...
-class UInt64HashTable(HashTable): ...
-class UInt32HashTable(HashTable): ...
-class UInt16HashTable(HashTable): ...
-class UInt8HashTable(HashTable): ...
-class StringHashTable(HashTable): ...
-class PyObjectHashTable(HashTable): ...
-class IntpHashTable(HashTable): ...
-
-def duplicated(
- values: np.ndarray,
- keep: Literal["last", "first", False] = ...,
- mask: npt.NDArray[np.bool_] | None = ...,
-) -> npt.NDArray[np.bool_]: ...
-def mode(
- values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ...
-) -> np.ndarray: ...
-def value_count(
- values: np.ndarray,
- dropna: bool,
- mask: npt.NDArray[np.bool_] | None = ...,
-) -> tuple[np.ndarray, npt.NDArray[np.int64]]: ... # np.ndarray[same-as-values]
-
-# arr and values should have same dtype
-def ismember(
- arr: np.ndarray,
- values: np.ndarray,
-) -> npt.NDArray[np.bool_]: ...
-def object_hash(obj) -> int: ...
-def objects_are_equal(a, b) -> bool: ...
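Editor's note: of the module-level functions in the deleted stub, duplicated() has the most self-contained semantics: with keep="first" later repeats are flagged, with keep="last" earlier ones are. A rough pure-Python reference under that reading (duplicated_ref is a hypothetical name; the keep=False variant, which flags every occurrence of a duplicated value, is omitted):

import numpy as np

def duplicated_ref(values: np.ndarray, keep: str = "first") -> np.ndarray:
    # Scan forward for keep="first", backward for keep="last"; the first
    # occurrence seen in scan order stays False, repeats become True.
    seen: set = set()
    out = np.zeros(len(values), dtype=bool)
    order = range(len(values)) if keep == "first" else range(len(values) - 1, -1, -1)
    for i in order:
        v = values[i]
        if v in seen:
            out[i] = True
        else:
            seen.add(v)
    return out

print(duplicated_ref(np.array([1, 2, 1, 3, 2])).tolist())  # [False, False, True, False, True]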
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable.pyx b/contrib/python/pandas/py3/pandas/_libs/hashtable.pyx
deleted file mode 100644
index ccac3d0b50d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashtable.pyx
+++ /dev/null
@@ -1,125 +0,0 @@
-cimport cython
-from cpython.mem cimport (
- PyMem_Free,
- PyMem_Malloc,
-)
-from cpython.ref cimport (
- Py_INCREF,
- PyObject,
-)
-from libc.stdlib cimport (
- free,
- malloc,
-)
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport ndarray
-
-cnp.import_array()
-
-
-from pandas._libs cimport util
-from pandas._libs.dtypes cimport numeric_object_t
-from pandas._libs.khash cimport (
- KHASH_TRACE_DOMAIN,
- are_equivalent_float32_t,
- are_equivalent_float64_t,
- are_equivalent_khcomplex64_t,
- are_equivalent_khcomplex128_t,
- kh_needed_n_buckets,
- kh_python_hash_equal,
- kh_python_hash_func,
- khiter_t,
-)
-from pandas._libs.missing cimport checknull
-
-
-def get_hashtable_trace_domain():
- return KHASH_TRACE_DOMAIN
-
-
-def object_hash(obj):
- return kh_python_hash_func(obj)
-
-
-def objects_are_equal(a, b):
- return kh_python_hash_equal(a, b)
-
-
-cdef int64_t NPY_NAT = util.get_nat()
-SIZE_HINT_LIMIT = (1 << 20) + 7
-
-
-cdef Py_ssize_t _INIT_VEC_CAP = 128
-
-include "hashtable_class_helper.pxi"
-include "hashtable_func_helper.pxi"
-
-
-# map derived hash-map types onto basic hash-map types:
-if np.dtype(np.intp) == np.dtype(np.int64):
- IntpHashTable = Int64HashTable
- unique_label_indices = _unique_label_indices_int64
-elif np.dtype(np.intp) == np.dtype(np.int32):
- IntpHashTable = Int32HashTable
- unique_label_indices = _unique_label_indices_int32
-else:
- raise ValueError(np.dtype(np.intp))
-
-
-cdef class Factorizer:
- cdef readonly:
- Py_ssize_t count
-
- def __cinit__(self, size_hint: int):
- self.count = 0
-
- def get_count(self) -> int:
- return self.count
-
- def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray:
- raise NotImplementedError
-
-
-cdef class ObjectFactorizer(Factorizer):
- cdef public:
- PyObjectHashTable table
- ObjectVector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = PyObjectHashTable(size_hint)
- self.uniques = ObjectVector()
-
- def factorize(
- self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None
- ) -> np.ndarray:
- """
-
- Returns
- -------
- np.ndarray[np.intp]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = ObjectFactorizer(3)
- >>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
- array([ 0, 1, 20])
- """
- cdef:
- ndarray[intp_t] labels
-
- if mask is not None:
- raise NotImplementedError("mask not supported for ObjectFactorizer.")
-
- if self.uniques.external_view_exists:
- uniques = ObjectVector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel, na_value)
- self.count = len(self.uniques)
- return labels
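Editor's note: the deleted ObjectFactorizer shows the general Factorizer contract: factorize() returns intp labels that index into a uniques vector, that vector keeps growing across calls (count carries over), and missing values are mapped to na_sentinel. A hypothetical pure-Python sketch of that bookkeeping (ToyFactorizer is not the pandas class; it only handles NaN-style missing values and skips the external-view handling):

import numpy as np

class ToyFactorizer:
    def __init__(self) -> None:
        self.table: dict = {}    # value -> label
        self.uniques: list = []  # first-seen order
        self.count = 0

    def factorize(self, values, na_sentinel: int = -1) -> np.ndarray:
        labels = np.empty(len(values), dtype=np.intp)
        for i, val in enumerate(values):
            if val != val:                    # NaN check, as in the Cython templates
                labels[i] = na_sentinel
                continue
            if val not in self.table:
                self.table[val] = self.count  # a new unique gets the next label
                self.uniques.append(val)
                self.count += 1
            labels[i] = self.table[val]
        return labels

fac = ToyFactorizer()
print(fac.factorize(np.array([1, 2, np.nan], dtype="O")).tolist())  # [0, 1, -1]
print(fac.factorize(np.array([2, 3], dtype="O")).tolist())          # [1, 2]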
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi b/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi
deleted file mode 100644
index 665fbcf42f4..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi
+++ /dev/null
@@ -1,7291 +0,0 @@
-"""
-Template for each `dtype` helper function for hashtable
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-cdef khcomplex64_t to_khcomplex64_t(complex64_t val) nogil:
- cdef khcomplex64_t res
- res.real = val.real
- res.imag = val.imag
- return res
-cdef khcomplex128_t to_khcomplex128_t(complex128_t val) nogil:
- cdef khcomplex128_t res
- res.real = val.real
- res.imag = val.imag
- return res
-
-cdef bint is_nan_khcomplex128_t(khcomplex128_t val) nogil:
- return val.real != val.real or val.imag != val.imag
-# are_equivalent_khcomplex128_t is cimported via khash.pxd
-
-cdef bint is_nan_khcomplex64_t(khcomplex64_t val) nogil:
- return val.real != val.real or val.imag != val.imag
-# are_equivalent_khcomplex64_t is cimported via khash.pxd
-
-cdef bint is_nan_float64_t(float64_t val) nogil:
- return val != val
-# are_equivalent_float64_t is cimported via khash.pxd
-
-cdef bint is_nan_float32_t(float32_t val) nogil:
- return val != val
-# are_equivalent_float32_t is cimported via khash.pxd
-
-cdef bint is_nan_int64_t(int64_t val) nogil:
- return False
-cdef bint are_equivalent_int64_t(int64_t val1, int64_t val2) nogil:
- return val1 == val2
-
-cdef bint is_nan_int32_t(int32_t val) nogil:
- return False
-cdef bint are_equivalent_int32_t(int32_t val1, int32_t val2) nogil:
- return val1 == val2
-
-cdef bint is_nan_int16_t(int16_t val) nogil:
- return False
-cdef bint are_equivalent_int16_t(int16_t val1, int16_t val2) nogil:
- return val1 == val2
-
-cdef bint is_nan_int8_t(int8_t val) nogil:
- return False
-cdef bint are_equivalent_int8_t(int8_t val1, int8_t val2) nogil:
- return val1 == val2
-
-cdef bint is_nan_uint64_t(uint64_t val) nogil:
- return False
-cdef bint are_equivalent_uint64_t(uint64_t val1, uint64_t val2) nogil:
- return val1 == val2
-
-cdef bint is_nan_uint32_t(uint32_t val) nogil:
- return False
-cdef bint are_equivalent_uint32_t(uint32_t val1, uint32_t val2) nogil:
- return val1 == val2
-
-cdef bint is_nan_uint16_t(uint16_t val) nogil:
- return False
-cdef bint are_equivalent_uint16_t(uint16_t val1, uint16_t val2) nogil:
- return val1 == val2
-
-cdef bint is_nan_uint8_t(uint8_t val) nogil:
- return False
-cdef bint are_equivalent_uint8_t(uint8_t val1, uint8_t val2) nogil:
- return val1 == val2
-from pandas._libs.khash cimport (
- kh_destroy_complex64,
- kh_exist_complex64,
- kh_get_complex64,
- kh_init_complex64,
- kh_put_complex64,
- kh_resize_complex64,
-)
-from pandas._libs.khash cimport (
- kh_destroy_complex128,
- kh_exist_complex128,
- kh_get_complex128,
- kh_init_complex128,
- kh_put_complex128,
- kh_resize_complex128,
-)
-from pandas._libs.khash cimport (
- kh_destroy_float32,
- kh_exist_float32,
- kh_get_float32,
- kh_init_float32,
- kh_put_float32,
- kh_resize_float32,
-)
-from pandas._libs.khash cimport (
- kh_destroy_float64,
- kh_exist_float64,
- kh_get_float64,
- kh_init_float64,
- kh_put_float64,
- kh_resize_float64,
-)
-from pandas._libs.khash cimport (
- kh_destroy_int8,
- kh_exist_int8,
- kh_get_int8,
- kh_init_int8,
- kh_put_int8,
- kh_resize_int8,
-)
-from pandas._libs.khash cimport (
- kh_destroy_int16,
- kh_exist_int16,
- kh_get_int16,
- kh_init_int16,
- kh_put_int16,
- kh_resize_int16,
-)
-from pandas._libs.khash cimport (
- kh_destroy_int32,
- kh_exist_int32,
- kh_get_int32,
- kh_init_int32,
- kh_put_int32,
- kh_resize_int32,
-)
-from pandas._libs.khash cimport (
- kh_destroy_int64,
- kh_exist_int64,
- kh_get_int64,
- kh_init_int64,
- kh_put_int64,
- kh_resize_int64,
-)
-from pandas._libs.khash cimport (
- kh_destroy_pymap,
- kh_exist_pymap,
- kh_get_pymap,
- kh_init_pymap,
- kh_put_pymap,
- kh_resize_pymap,
-)
-from pandas._libs.khash cimport (
- kh_destroy_str,
- kh_exist_str,
- kh_get_str,
- kh_init_str,
- kh_put_str,
- kh_resize_str,
-)
-from pandas._libs.khash cimport (
- kh_destroy_strbox,
- kh_exist_strbox,
- kh_get_strbox,
- kh_init_strbox,
- kh_put_strbox,
- kh_resize_strbox,
-)
-from pandas._libs.khash cimport (
- kh_destroy_uint8,
- kh_exist_uint8,
- kh_get_uint8,
- kh_init_uint8,
- kh_put_uint8,
- kh_resize_uint8,
-)
-from pandas._libs.khash cimport (
- kh_destroy_uint16,
- kh_exist_uint16,
- kh_get_uint16,
- kh_init_uint16,
- kh_put_uint16,
- kh_resize_uint16,
-)
-from pandas._libs.khash cimport (
- kh_destroy_uint32,
- kh_exist_uint32,
- kh_get_uint32,
- kh_init_uint32,
- kh_put_uint32,
- kh_resize_uint32,
-)
-from pandas._libs.khash cimport (
- kh_destroy_uint64,
- kh_exist_uint64,
- kh_get_uint64,
- kh_init_uint64,
- kh_put_uint64,
- kh_resize_uint64,
-)
-
-# ----------------------------------------------------------------------
-# VectorData
-# ----------------------------------------------------------------------
-
-from pandas._libs.tslibs.util cimport get_c_string
-from pandas._libs.missing cimport C_NA
-
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct Complex128VectorData:
- khcomplex128_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_complex128(Complex128VectorData *data,
- khcomplex128_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct Complex64VectorData:
- khcomplex64_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_complex64(Complex64VectorData *data,
- khcomplex64_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct Float64VectorData:
- float64_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_float64(Float64VectorData *data,
- float64_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct Float32VectorData:
- float32_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_float32(Float32VectorData *data,
- float32_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_int64(Int64VectorData *data,
- int64_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct Int32VectorData:
- int32_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_int32(Int32VectorData *data,
- int32_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct Int16VectorData:
- int16_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_int16(Int16VectorData *data,
- int16_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct Int8VectorData:
- int8_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_int8(Int8VectorData *data,
- int8_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct StringVectorData:
- char * *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_string(StringVectorData *data,
- char * x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct UInt64VectorData:
- uint64_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_uint64(UInt64VectorData *data,
- uint64_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct UInt32VectorData:
- uint32_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_uint32(UInt32VectorData *data,
- uint32_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct UInt16VectorData:
- uint16_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_uint16(UInt16VectorData *data,
- uint16_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct UInt8VectorData:
- uint8_t *data
- Py_ssize_t n, m
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_uint8(UInt8VectorData *data,
- uint8_t x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-
-ctypedef fused vector_data:
- Int64VectorData
- Int32VectorData
- Int16VectorData
- Int8VectorData
- UInt64VectorData
- UInt32VectorData
- UInt16VectorData
- UInt8VectorData
- Float64VectorData
- Float32VectorData
- Complex128VectorData
- Complex64VectorData
- StringVectorData
-
-cdef bint needs_resize(vector_data *data) nogil:
- return data.n == data.m
-
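Editor's note: needs_resize() above is the trigger for the growth policy shared by all of the *Vector classes that follow: append until length equals capacity, then grow the backing ndarray (capacity times four, never below the initial 128) and keep appending. A hypothetical stand-in showing only that policy (GrowableInt64Vector is not a pandas class and ignores the external_view_exists bookkeeping):

import numpy as np

_INIT_VEC_CAP = 128  # same initial capacity as the Cython code

class GrowableInt64Vector:
    def __init__(self) -> None:
        self.m = _INIT_VEC_CAP                      # capacity
        self.n = 0                                  # current length
        self.ao = np.empty(self.m, dtype=np.int64)  # backing buffer

    def append(self, x: int) -> None:
        if self.n == self.m:                        # needs_resize
            self.m = max(self.m * 4, _INIT_VEC_CAP)
            grown = np.empty(self.m, dtype=np.int64)
            grown[: self.n] = self.ao[: self.n]
            self.ao = grown
        self.ao[self.n] = x
        self.n += 1

    def to_array(self) -> np.ndarray:
        return self.ao[: self.n].copy()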
-# ----------------------------------------------------------------------
-# Vector
-# ----------------------------------------------------------------------
-
-cdef class Vector:
- # cdef readonly:
- # bint external_view_exists
-
- def __cinit__(self):
- self.external_view_exists = False
-
-
-cdef class Complex128Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- Complex128VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <Complex128VectorData *>PyMem_Malloc(
- sizeof(Complex128VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.complex128)
- self.data.data = <khcomplex128_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <khcomplex128_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, khcomplex128_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_complex128(self.data, x)
-
- cdef extend(self, const khcomplex128_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class Complex64Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- Complex64VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <Complex64VectorData *>PyMem_Malloc(
- sizeof(Complex64VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.complex64)
- self.data.data = <khcomplex64_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <khcomplex64_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, khcomplex64_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_complex64(self.data, x)
-
- cdef extend(self, const khcomplex64_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class Float64Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- Float64VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <Float64VectorData *>PyMem_Malloc(
- sizeof(Float64VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.float64)
- self.data.data = <float64_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <float64_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, float64_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_float64(self.data, x)
-
- cdef extend(self, const float64_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class UInt64Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- UInt64VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <UInt64VectorData *>PyMem_Malloc(
- sizeof(UInt64VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.uint64)
- self.data.data = <uint64_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <uint64_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, uint64_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_uint64(self.data, x)
-
- cdef extend(self, const uint64_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class Int64Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
-
- def __cinit__(self):
- self.data = <Int64VectorData *>PyMem_Malloc(
- sizeof(Int64VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.int64)
- self.data.data = <int64_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <int64_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, int64_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_int64(self.data, x)
-
- cdef extend(self, const int64_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class Float32Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- Float32VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <Float32VectorData *>PyMem_Malloc(
- sizeof(Float32VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.float32)
- self.data.data = <float32_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <float32_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, float32_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_float32(self.data, x)
-
- cdef extend(self, const float32_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class UInt32Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- UInt32VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <UInt32VectorData *>PyMem_Malloc(
- sizeof(UInt32VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.uint32)
- self.data.data = <uint32_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <uint32_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, uint32_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_uint32(self.data, x)
-
- cdef extend(self, const uint32_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class Int32Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- Int32VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <Int32VectorData *>PyMem_Malloc(
- sizeof(Int32VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.int32)
- self.data.data = <int32_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <int32_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, int32_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_int32(self.data, x)
-
- cdef extend(self, const int32_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class UInt16Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- UInt16VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <UInt16VectorData *>PyMem_Malloc(
- sizeof(UInt16VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.uint16)
- self.data.data = <uint16_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <uint16_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, uint16_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_uint16(self.data, x)
-
- cdef extend(self, const uint16_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class Int16Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- Int16VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <Int16VectorData *>PyMem_Malloc(
- sizeof(Int16VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.int16)
- self.data.data = <int16_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <int16_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, int16_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_int16(self.data, x)
-
- cdef extend(self, const int16_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class UInt8Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- UInt8VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <UInt8VectorData *>PyMem_Malloc(
- sizeof(UInt8VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.uint8)
- self.data.data = <uint8_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <uint8_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, uint8_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_uint8(self.data, x)
-
- cdef extend(self, const uint8_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class Int8Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- cdef:
- Int8VectorData *data
- ndarray ao
-
- def __cinit__(self):
- self.data = <Int8VectorData *>PyMem_Malloc(
- sizeof(Int8VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.int8)
- self.data.data = <int8_t*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <int8_t*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, int8_t x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_int8(self.data, x)
-
- cdef extend(self, const int8_t[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-cdef class StringVector(Vector):
-
- cdef:
- StringVectorData *data
-
- def __cinit__(self):
- self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.data.data = <char **>malloc(self.data.m * sizeof(char *))
- if not self.data.data:
- raise MemoryError()
-
- cdef resize(self):
- cdef:
- char **orig_data
- Py_ssize_t i, m
-
- m = self.data.m
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
-
- orig_data = self.data.data
- self.data.data = <char **>malloc(self.data.m * sizeof(char *))
- if not self.data.data:
- raise MemoryError()
- for i in range(m):
- self.data.data[i] = orig_data[i]
-
- def __dealloc__(self):
- if self.data is not NULL:
- if self.data.data is not NULL:
- free(self.data.data)
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray[object, ndim=1] to_array(self):
- cdef:
- ndarray ao
- Py_ssize_t n
- object val
-
- ao = np.empty(self.data.n, dtype=object)
- for i in range(self.data.n):
- val = self.data.data[i]
- ao[i] = val
- self.external_view_exists = True
- self.data.m = self.data.n
- return ao
-
- cdef void append(self, char *x):
-
- if needs_resize(self.data):
- self.resize()
-
- append_data_string(self.data, x)
-
- cdef extend(self, ndarray[object] x):
- for i in range(len(x)):
- self.append(x[i])
-
-
-cdef class ObjectVector(Vector):
-
- cdef:
- PyObject **data
- Py_ssize_t n, m
- ndarray ao
-
- def __cinit__(self):
- self.n = 0
- self.m = _INIT_VEC_CAP
- self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
- self.data = <PyObject**>self.ao.data
-
- def __len__(self) -> int:
- return self.n
-
- cdef append(self, object obj):
- if self.n == self.m:
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.m = max(self.m * 2, _INIT_VEC_CAP)
- self.ao.resize(self.m, refcheck=False)
- self.data = <PyObject**>self.ao.data
-
- Py_INCREF(obj)
- self.data[self.n] = <PyObject*>obj
- self.n += 1
-
- cpdef ndarray[object, ndim=1] to_array(self):
- if self.m != self.n:
- if self.external_view_exists:
- raise ValueError("should have raised on append()")
- self.ao.resize(self.n, refcheck=False)
- self.m = self.n
- self.external_view_exists = True
- return self.ao
-
- cdef extend(self, ndarray[object] x):
- for i in range(len(x)):
- self.append(x[i])
-
-# ----------------------------------------------------------------------
-# HashTable
-# ----------------------------------------------------------------------
-
-
-cdef class HashTable:
-
- pass
-
-cdef class Complex128HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_complex128()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_complex128(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_complex128(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- khcomplex128_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = to_khcomplex128_t(key)
- k = kh_get_complex128(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(complex128_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
-        """ returns info about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, complex128_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- khcomplex128_t cval
-
- cval = to_khcomplex128_t(val)
- k = kh_get_complex128(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, complex128_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- khcomplex128_t ckey
-
- ckey = to_khcomplex128_t(key)
- k = kh_put_complex128(self.table, ckey, &ret)
- if kh_exist_complex128(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- khcomplex128_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const complex128_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- khcomplex128_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val= to_khcomplex128_t(values[i])
- k = kh_put_complex128(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val= to_khcomplex128_t(values[i])
- k = kh_put_complex128(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const complex128_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- khcomplex128_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = to_khcomplex128_t(values[i])
- k = kh_get_complex128(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const complex128_t[:] values, Complex128Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[complex128]
- Array of values of which unique will be calculated
- uniques : Complex128Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[complex128]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- khcomplex128_t val, na_value2
- khiter_t k
- Complex128VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = to_khcomplex128_t(na_value)
- else:
- na_value2 = to_khcomplex128_t(0)
-
- with nogil:
- for i in range(n):
- val = to_khcomplex128_t(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_khcomplex128_t(val) or
- (use_na_value and are_equivalent_khcomplex128_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_complex128(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_complex128(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_complex128(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_complex128(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const complex128_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[complex128]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
-            (True = missing, False = valid) instead of `na_value` or
-            condition "val != val".
-
- Returns
- -------
- uniques : ndarray[complex128]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Complex128Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const complex128_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[complex128]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[complex128]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Complex128Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const complex128_t[:] values, Complex128Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
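Editor's note: the unique/factorize/get_labels methods of the class above all funnel into _unique(): one pass that fills a uniques vector and, when return_inverse is requested, an intp label array mapping each input position to its slot in uniques. np.unique with return_inverse=True gives a rough feel for that output pair, but only as an analogy, since np.unique sorts while the hash table keeps first-seen order:

import numpy as np

values = np.array([3.0, 1.0, 3.0, 2.0])
uniques, labels = np.unique(values, return_inverse=True)
print(uniques.tolist())  # [1.0, 2.0, 3.0]  (sorted, unlike the hash-table result)
print(labels.tolist())   # [2, 0, 2, 1]     each entry's position in uniques
assert (uniques[labels] == values).all()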
-
-cdef class Complex128Factorizer(Factorizer):
- cdef public:
- Complex128HashTable table
- Complex128Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Complex128HashTable(size_hint)
- self.uniques = Complex128Vector()
-
- def factorize(self, const khcomplex128_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Complex128Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="complex128"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Complex128Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-cdef class Float64HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_float64()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_float64(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_float64(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- float64_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = (key)
- k = kh_get_float64(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(float64_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns info about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, float64_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested value.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- float64_t cval
-
- cval = (val)
- k = kh_get_float64(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, float64_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- float64_t ckey
-
- ckey = (key)
- k = kh_put_float64(self.table, ckey, &ret)
- if kh_exist_float64(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- float64_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const float64_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- float64_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val= (values[i])
- k = kh_put_float64(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val= (values[i])
- k = kh_put_float64(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const float64_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- float64_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = (values[i])
- k = kh_get_float64(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const float64_t[:] values, Float64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[float64]
- Array of values of which unique will be calculated
- uniques : Float64Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[float64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- float64_t val, na_value2
- khiter_t k
- Float64VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = (na_value)
- else:
- na_value2 = (0)
-
- with nogil:
- for i in range(n):
- val = (values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_float64_t(val) or
- (use_na_value and are_equivalent_float64_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_float64(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_float64(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_float64(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_float64(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const float64_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[float64]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[float64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Float64Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const float64_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[float64]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[float64]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Float64Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const float64_t[:] values, Float64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
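When a mask is passed, unique() above also returns a uint8 result_mask marking which of the returned uniques stands for the masked (missing) entries; only the first masked slot is kept. A short sketch, assuming the module is importable as pandas._libs.hashtable, with expected results noted as comments based on the code above:

    import numpy as np
    from pandas._libs import hashtable as ht  # import path assumed

    table = ht.Float64HashTable()
    values = np.array([1.0, 1.0, 2.0, 0.0], dtype="float64")
    mask = np.array([False, False, False, True])
    uniques, result_mask = table.unique(values, mask=mask)
    # uniques     -> array([1., 2., 0.])   the masked slot keeps whatever payload value it had
    # result_mask -> array([0, 0, 1])      uint8; 1 flags the NA entry among the uniques
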
-cdef class Float64Factorizer(Factorizer):
- cdef public:
- Float64HashTable table
- Float64Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Float64HashTable(size_hint)
- self.uniques = Float64Vector()
-
- def factorize(self, const float64_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Float64Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="float64"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Float64Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-cdef class UInt64HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_uint64()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_uint64(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_uint64(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- uint64_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = (key)
- k = kh_get_uint64(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(uint64_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns info about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, uint64_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- uint64_t cval
-
- cval = (val)
- k = kh_get_uint64(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, uint64_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint64_t ckey
-
- ckey = (key)
- k = kh_put_uint64(self.table, ckey, &ret)
- if kh_exist_uint64(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint64_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const uint64_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint64_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val= (values[i])
- k = kh_put_uint64(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val= (values[i])
- k = kh_put_uint64(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const uint64_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint64_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = (values[i])
- k = kh_get_uint64(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const uint64_t[:] values, UInt64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint64]
- Array of values of which unique will be calculated
- uniques : UInt64Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[uint64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- uint64_t val, na_value2
- khiter_t k
- UInt64VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = (na_value)
- else:
- na_value2 = (0)
-
- with nogil:
- for i in range(n):
- val = (values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_uint64_t(val) or
- (use_na_value and are_equivalent_uint64_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint64(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_uint64(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_uint64(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint64(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const uint64_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint64]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = UInt64Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const uint64_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[uint64]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint64]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = UInt64Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const uint64_t[:] values, UInt64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
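Per the comments in the code above, map_locations() and lookup() are the building blocks used by libindex and IndexEngine.get_indexer: the first records the position of each key, the second translates query keys back into positions, with -1 for absent keys. A small sketch; the import path is assumed and the commented outputs follow from the code above:

    import numpy as np
    from pandas._libs import hashtable as ht  # import path assumed

    table = ht.UInt64HashTable()
    table.map_locations(np.array([10, 20, 30], dtype="uint64"))   # records key -> position (last occurrence wins)
    locs = table.lookup(np.array([20, 30, 40], dtype="uint64"))   # -> array([1, 2, -1]); -1 = key not in table
    assert 20 in table                                            # membership checks go through __contains__
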
-cdef class UInt64Factorizer(Factorizer):
- cdef public:
- UInt64HashTable table
- UInt64Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = UInt64HashTable(size_hint)
- self.uniques = UInt64Vector()
-
- def factorize(self, const uint64_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = UInt64Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="uint64"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = UInt64Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-cdef class Int64HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_int64()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_int64(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_int64(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- int64_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = (key)
- k = kh_get_int64(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(int64_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns info about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, int64_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int64_t cval
-
- cval = (val)
- k = kh_get_int64(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, int64_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int64_t ckey
-
- ckey = (key)
- k = kh_put_int64(self.table, ckey, &ret)
- if kh_exist_int64(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int64_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
- # We only use this for int64, can reduce build size and make .pyi
- # more accurate by only implementing it for int64
- @cython.boundscheck(False)
- def map_keys_to_values(
- self, const int64_t[:] keys, const int64_t[:] values
- ) -> None:
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int64_t key
- khiter_t k
-
- with nogil:
- for i in range(n):
- key = (keys[i])
- k = kh_put_int64(self.table, key, &ret)
- self.table.vals[k] = <Py_ssize_t>values[i]
-
- @cython.boundscheck(False)
- def map_locations(self, const int64_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int64_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val= (values[i])
- k = kh_put_int64(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val= (values[i])
- k = kh_put_int64(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const int64_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int64_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = (values[i])
- k = kh_get_int64(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const int64_t[:] values, Int64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int64]
- Array of values of which unique will be calculated
- uniques : Int64Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[int64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- int64_t val, na_value2
- khiter_t k
- Int64VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = (na_value)
- else:
- na_value2 = (0)
-
- with nogil:
- for i in range(n):
- val = (values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_int64_t(val) or
- (use_na_value and are_equivalent_int64_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int64(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_int64(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_int64(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int64(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const int64_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int64]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
-
- Returns
- -------
- uniques : ndarray[int64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Int64Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const int64_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[int64]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[int64]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Int64Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const int64_t[:] values, Int64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
- @cython.boundscheck(False)
- def get_labels_groupby(
- self, const int64_t[:] values
- ) -> tuple[ndarray, ndarray]:
- # tuple[np.ndarray[np.intp], np.ndarray[int64]]
- cdef:
- Py_ssize_t i, n = len(values)
- intp_t[::1] labels
- Py_ssize_t idx, count = 0
- int ret = 0
- int64_t val
- khiter_t k
- Int64Vector uniques = Int64Vector()
- Int64VectorData *ud
-
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
-
- with nogil:
- for i in range(n):
- val = (values[i])
-
- # specific for groupby
- if val < 0:
- labels[i] = -1
- continue
-
- k = kh_get_int64(self.table, val)
- if k != self.table.n_buckets:
- idx = self.table.vals[k]
- labels[i] = idx
- else:
- k = kh_put_int64(self.table, val, &ret)
- self.table.vals[k] = count
-
- if needs_resize(ud):
- with gil:
- uniques.resize()
- append_data_int64(ud, val)
- labels[i] = count
- count += 1
-
- arr_uniques = uniques.to_array()
-
- return np.asarray(labels), arr_uniques
-
-
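get_labels_groupby() above (marked "specific for groupby" in the code) treats negative codes as missing and labels them -1, while every other code is numbered in order of first appearance. A brief sketch, with an assumed import path and commented results inferred from the code above:

    import numpy as np
    from pandas._libs import hashtable as ht  # import path assumed

    table = ht.Int64HashTable()
    codes = np.array([5, -1, 5, 7], dtype="int64")
    labels, group_ids = table.get_labels_groupby(codes)
    # labels    -> array([0, -1, 0, 1])   negative input codes are treated as missing
    # group_ids -> array([5, 7])          observed codes, in order of first appearance
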
-cdef class Int64Factorizer(Factorizer):
- cdef public:
- Int64HashTable table
- Int64Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Int64HashTable(size_hint)
- self.uniques = Int64Vector()
-
- def factorize(self, const int64_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Int64Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="int64"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Int64Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
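A Factorizer like the one above wraps a hash table plus a uniques vector and keeps both between calls, so codes stay consistent when data arrives in chunks. A minimal sketch of that pattern (pandas._libs.hashtable path assumed; the expected arrays are noted as comments based on the code above):

    import numpy as np
    from pandas._libs import hashtable as ht  # import path assumed

    fac = ht.Int64Factorizer(3)
    codes1 = fac.factorize(np.array([10, 20], dtype="int64"))  # -> array([0, 1])
    codes2 = fac.factorize(np.array([20, 30], dtype="int64"))  # -> array([1, 2]); codes continue across calls
    uniques = fac.uniques.to_array()                           # -> array([10, 20, 30])
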
-cdef class Complex64HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_complex64()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_complex64(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_complex64(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- khcomplex64_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = to_khcomplex64_t(key)
- k = kh_get_complex64(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(complex64_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns info about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, complex64_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested value.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- khcomplex64_t cval
-
- cval = to_khcomplex64_t(val)
- k = kh_get_complex64(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, complex64_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- khcomplex64_t ckey
-
- ckey = to_khcomplex64_t(key)
- k = kh_put_complex64(self.table, ckey, &ret)
- if kh_exist_complex64(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- khcomplex64_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const complex64_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- khcomplex64_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val= to_khcomplex64_t(values[i])
- k = kh_put_complex64(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val= to_khcomplex64_t(values[i])
- k = kh_put_complex64(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const complex64_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- khcomplex64_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = to_khcomplex64_t(values[i])
- k = kh_get_complex64(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const complex64_t[:] values, Complex64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[complex64]
- Array of values of which unique will be calculated
- uniques : Complex64Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[complex64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- khcomplex64_t val, na_value2
- khiter_t k
- Complex64VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = to_khcomplex64_t(na_value)
- else:
- na_value2 = to_khcomplex64_t(0)
-
- with nogil:
- for i in range(n):
- val = to_khcomplex64_t(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_khcomplex64_t(val) or
- (use_na_value and are_equivalent_khcomplex64_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_complex64(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_complex64(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_complex64(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_complex64(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const complex64_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[complex64]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[complex64]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Complex64Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const complex64_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[complex64]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[complex64]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Complex64Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const complex64_t[:] values, Complex64Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
-cdef class Complex64Factorizer(Factorizer):
- cdef public:
- Complex64HashTable table
- Complex64Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Complex64HashTable(size_hint)
- self.uniques = Complex64Vector()
-
- def factorize(self, const khcomplex64_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Complex64Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="complex64"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Complex64Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
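unique(return_inverse=True) above returns both the uniques (in order of first appearance) and the labels mapping every input position back into them, so uniques[labels] reconstructs the input. A short sketch against the Complex64 table; the import path is an assumption and the commented outputs follow from the code above:

    import numpy as np
    from pandas._libs import hashtable as ht  # import path assumed

    table = ht.Complex64HashTable()
    values = np.array([3 + 0j, 1 + 2j, 3 + 0j], dtype="complex64")
    uniques, labels = table.unique(values, return_inverse=True)
    # uniques -> array([3.+0.j, 1.+2.j])   order of first appearance
    # labels  -> array([0, 1, 0])          uniques[labels] reproduces values
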
-cdef class Float32HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_float32()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_float32(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_float32(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- float32_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = (key)
- k = kh_get_float32(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(float32_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns info about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, float32_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested value.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- float32_t cval
-
- cval = (val)
- k = kh_get_float32(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, float32_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- float32_t ckey
-
- ckey = (key)
- k = kh_put_float32(self.table, ckey, &ret)
- if kh_exist_float32(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- float32_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const float32_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- float32_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val= (values[i])
- k = kh_put_float32(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val= (values[i])
- k = kh_put_float32(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const float32_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- float32_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = <float32_t>(values[i])
- k = kh_get_float32(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
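For orientation, a minimal sketch of how the map_locations/lookup pair removed above is typically driven (assuming the extension module is importable as pandas._libs.hashtable; the arrays and expected outputs are illustrative, not taken from the diff):

    import numpy as np
    from pandas._libs.hashtable import Float32HashTable

    vals = np.array([1.5, 2.5, 2.5], dtype="float32")
    table = Float32HashTable()
    table.map_locations(vals)   # records the last position of each key
    locs = table.lookup(np.array([2.5, 9.0], dtype="float32"))
    # locs -> array([2, -1]): 2.5 was last seen at index 2, 9.0 is absent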
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const float32_t[:] values, Float32Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[float32]
- Array of values of which unique will be calculated
- uniques : Float32Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[float32]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- float32_t val, na_value2
- khiter_t k
- Float32VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = <float32_t>(na_value)
- else:
- na_value2 = <float32_t>(0)
-
- with nogil:
- for i in range(n):
- val = <float32_t>(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_float32_t(val) or
- (use_na_value and are_equivalent_float32_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_float32(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_float32(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_float32(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_float32(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const float32_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[float32]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[float32]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Float32Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
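When a mask is passed, unique() above goes through _unique with use_result_mask=True and returns the uniques together with a result mask that flags the single NA slot. A hedged sketch with illustrative values:

    import numpy as np
    from pandas._libs.hashtable import Float32HashTable

    vals = np.array([1.0, 1.0, 2.0], dtype="float32")
    mask = np.array([False, False, True])   # third element counts as missing
    uniques, result_mask = Float32HashTable().unique(vals, mask=mask)
    # uniques holds 1.0 plus one slot for the masked entry; result_mask marks that slot as NA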
- def factorize(self, const float32_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[float32]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[float32]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Float32Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
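factorize() above is the ignore_na=True path: NaNs (and na_value matches) stay out of the uniques and receive the na_sentinel label. A minimal sketch, with illustrative data:

    import numpy as np
    from pandas._libs.hashtable import Float32HashTable

    vals = np.array([3.0, np.nan, 3.0], dtype="float32")
    uniques, labels = Float32HashTable().factorize(vals)
    # uniques -> array([3.], dtype=float32); labels -> array([0, -1, 0])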
- def get_labels(self, const float32_t[:] values, Float32Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
-cdef class Float32Factorizer(Factorizer):
- cdef public:
- Float32HashTable table
- Float32Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Float32HashTable(size_hint)
- self.uniques = Float32Vector()
-
- def factorize(self, const float32_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Float32Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="float32"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Float32Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
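The Factorizer pairs a hash table with a uniques vector so that labels stay consistent across successive chunks of input. A hedged sketch of that incremental use (array contents illustrative):

    import numpy as np
    from pandas._libs.hashtable import Float32Factorizer

    fac = Float32Factorizer(4)
    fac.factorize(np.array([1.0, 2.0], dtype="float32"))   # -> array([0, 1])
    fac.factorize(np.array([2.0, 3.0], dtype="float32"))   # -> array([1, 2]); 2.0 reuses label 1
    # fac.count is now 3, matching the accumulated uniques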
-cdef class UInt32HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_uint32()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_uint32(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_uint32(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- uint32_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = <uint32_t>(key)
- k = kh_get_uint32(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(uint32_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
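sizeof() and get_state() above only expose khash bookkeeping; the exact numbers depend on khash's resize policy, so the values sketched below are indicative only:

    from pandas._libs.hashtable import UInt32HashTable

    table = UInt32HashTable(size_hint=100)
    table.get_state()   # e.g. {'n_buckets': ..., 'size': 0, 'n_occupied': 0, 'upper_bound': ...}
    table.sizeof()      # approximate in-memory footprint in bytes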
- cpdef get_item(self, uint32_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- uint32_t cval
-
- cval = <uint32_t>(val)
- k = kh_get_uint32(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, uint32_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint32_t ckey
-
- ckey = <uint32_t>(key)
- k = kh_put_uint32(self.table, ckey, &ret)
- if kh_exist_uint32(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint32_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const uint32_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint32_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val = <uint32_t>(values[i])
- k = kh_put_uint32(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val = <uint32_t>(values[i])
- k = kh_put_uint32(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const uint32_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint32_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = <uint32_t>(values[i])
- k = kh_get_uint32(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const uint32_t[:] values, UInt32Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint32]
- Array of values of which unique will be calculated
- uniques : UInt32Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[uint32]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- uint32_t val, na_value2
- khiter_t k
- UInt32VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = <uint32_t>(na_value)
- else:
- na_value2 = <uint32_t>(0)
-
- with nogil:
- for i in range(n):
- val = <uint32_t>(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_uint32_t(val) or
- (use_na_value and are_equivalent_uint32_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint32(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_uint32(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_uint32(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint32(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const uint32_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint32]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint32]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = UInt32Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const uint32_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[uint32]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint32]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = UInt32Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const uint32_t[:] values, UInt32Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
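get_labels() is the building block the factorizers use: it appends unseen values to an externally supplied vector and offsets fresh labels by count_prior. A sketch, assuming UInt32Vector is exposed by the same module alongside the hash table:

    import numpy as np
    from pandas._libs.hashtable import UInt32HashTable, UInt32Vector

    table = UInt32HashTable()
    uniques = UInt32Vector()
    table.get_labels(np.array([10, 11], dtype="uint32"), uniques, 0)             # -> array([0, 1])
    table.get_labels(np.array([11, 12], dtype="uint32"), uniques, len(uniques))  # -> array([1, 2])
    # uniques now holds 10, 11, 12; the repeated 11 reuses label 1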
-
-
-cdef class UInt32Factorizer(Factorizer):
- cdef public:
- UInt32HashTable table
- UInt32Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = UInt32HashTable(size_hint)
- self.uniques = UInt32Vector()
-
- def factorize(self, const uint32_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = UInt32Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="uint32"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = UInt32Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-cdef class Int32HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_int32()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_int32(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_int32(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- int32_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = <int32_t>(key)
- k = kh_get_int32(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(int32_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, int32_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int32_t cval
-
- cval = <int32_t>(val)
- k = kh_get_int32(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, int32_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int32_t ckey
-
- ckey = <int32_t>(key)
- k = kh_put_int32(self.table, ckey, &ret)
- if kh_exist_int32(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
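set_item()/get_item() above give direct key -> position access (the comments note libjoin and the index engines as callers); a missing key raises KeyError. Illustrative:

    from pandas._libs.hashtable import Int32HashTable

    table = Int32HashTable()
    table.set_item(7, 0)   # record key 7 at position 0
    table.get_item(7)      # -> 0
    7 in table             # -> True
    table.get_item(8)      # raises KeyError(8)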
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int32_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const int32_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int32_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val = <int32_t>(values[i])
- k = kh_put_int32(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val = <int32_t>(values[i])
- k = kh_put_int32(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const int32_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int32_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = <int32_t>(values[i])
- k = kh_get_int32(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const int32_t[:] values, Int32Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int32]
- Array of values of which unique will be calculated
- uniques : Int32Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[int32]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- int32_t val, na_value2
- khiter_t k
- Int32VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = <int32_t>(na_value)
- else:
- na_value2 = <int32_t>(0)
-
- with nogil:
- for i in range(n):
- val = <int32_t>(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_int32_t(val) or
- (use_na_value and are_equivalent_int32_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int32(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_int32(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_int32(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int32(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const int32_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int32]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[int32]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Int32Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const int32_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[int32]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[int32]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Int32Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const int32_t[:] values, Int32Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
-cdef class Int32Factorizer(Factorizer):
- cdef public:
- Int32HashTable table
- Int32Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Int32HashTable(size_hint)
- self.uniques = Int32Vector()
-
- def factorize(self, const int32_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Int32Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="int32"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Int32Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-cdef class UInt16HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_uint16()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_uint16(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_uint16(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- uint16_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = <uint16_t>(key)
- k = kh_get_uint16(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(uint16_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, uint16_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- uint16_t cval
-
- cval = <uint16_t>(val)
- k = kh_get_uint16(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, uint16_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint16_t ckey
-
- ckey = <uint16_t>(key)
- k = kh_put_uint16(self.table, ckey, &ret)
- if kh_exist_uint16(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint16_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const uint16_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint16_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val = <uint16_t>(values[i])
- k = kh_put_uint16(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val = <uint16_t>(values[i])
- k = kh_put_uint16(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const uint16_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint16_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = <uint16_t>(values[i])
- k = kh_get_uint16(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
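With uses_mask=True the table tracks a single NA position instead of hashing a sentinel value: map_locations records it from the mask, and get_na()/lookup() report it back. A hedged sketch using uint8 masks and illustrative values:

    import numpy as np
    from pandas._libs.hashtable import UInt16HashTable

    table = UInt16HashTable(uses_mask=True)
    vals = np.array([5, 6], dtype="uint16")
    mask = np.array([0, 1], dtype="uint8")   # second element is NA
    table.map_locations(vals, mask)
    table.get_na()             # -> 1, the recorded NA position
    table.lookup(vals, mask)   # -> array([0, 1]); the masked slot maps to the NA position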
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const uint16_t[:] values, UInt16Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint16]
- Array of values of which unique will be calculated
- uniques : UInt16Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[uint16]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- uint16_t val, na_value2
- khiter_t k
- UInt16VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = <uint16_t>(na_value)
- else:
- na_value2 = <uint16_t>(0)
-
- with nogil:
- for i in range(n):
- val = <uint16_t>(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_uint16_t(val) or
- (use_na_value and are_equivalent_uint16_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint16(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_uint16(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_uint16(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint16(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const uint16_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint16]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint16]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = UInt16Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const uint16_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[uint16]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint16]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = UInt16Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const uint16_t[:] values, UInt16Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
-cdef class UInt16Factorizer(Factorizer):
- cdef public:
- UInt16HashTable table
- UInt16Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = UInt16HashTable(size_hint)
- self.uniques = UInt16Vector()
-
- def factorize(self, const uint16_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = UInt16Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="uint16"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = UInt16Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-cdef class Int16HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_int16()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_int16(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_int16(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- int16_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = <int16_t>(key)
- k = kh_get_int16(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(int16_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, int16_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int16_t cval
-
- cval = <int16_t>(val)
- k = kh_get_int16(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, int16_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int16_t ckey
-
- ckey = <int16_t>(key)
- k = kh_put_int16(self.table, ckey, &ret)
- if kh_exist_int16(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int16_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const int16_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int16_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val = <int16_t>(values[i])
- k = kh_put_int16(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val = <int16_t>(values[i])
- k = kh_put_int16(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const int16_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int16_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = <int16_t>(values[i])
- k = kh_get_int16(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const int16_t[:] values, Int16Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int16]
- Array of values of which unique will be calculated
- uniques : Int16Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[int16]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- int16_t val, na_value2
- khiter_t k
- Int16VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = <int16_t>(na_value)
- else:
- na_value2 = <int16_t>(0)
-
- with nogil:
- for i in range(n):
- val = <int16_t>(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_int16_t(val) or
- (use_na_value and are_equivalent_int16_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int16(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_int16(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_int16(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int16(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const int16_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int16]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[int16]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Int16Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
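For readers skimming this removed code, a minimal usage sketch of the `unique` entry point above, assuming the extension module is importable as `pandas._libs.hashtable` (as in a regular pandas build); the expected outputs simply trace the logic shown here:

import numpy as np
from pandas._libs import hashtable

vals = np.array([2, 2, 1, 3, 1], dtype="int16")
table = hashtable.Int16HashTable(len(vals))
uniques, labels = table.unique(vals, return_inverse=True)
# uniques -> array([2, 1, 3], dtype=int16)   insertion order, not sorted
# labels  -> array([0, 0, 1, 2, 1])          each value mapped to its slot in uniques

# With a mask, the masked entry still occupies one slot in `uniques` and the
# returned result_mask flags it; mask and return_inverse cannot be combined.
table2 = hashtable.Int16HashTable(3)
masked_vals = np.array([5, 5, 7], dtype="int16")
mask = np.array([False, True, False])
uniques2, result_mask = table2.unique(masked_vals, mask=mask)
# uniques2    -> array([5, 5, 7], dtype=int16)
# result_mask -> array([0, 1, 0], dtype=uint8)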
- def factorize(self, const int16_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[int16]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[int16]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Int16Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const int16_t[:] values, Int16Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
-cdef class Int16Factorizer(Factorizer):
- cdef public:
- Int16HashTable table
- Int16Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Int16HashTable(size_hint)
- self.uniques = Int16Vector()
-
- def factorize(self, const int16_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Int16Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="int16"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Int16Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
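The Factorizer above keeps its hash table and uniques vector alive between calls, so chunks factorized one after another receive consistent codes. A small sketch, again assuming a normal pandas build exposing pandas._libs.hashtable:

import numpy as np
from pandas._libs import hashtable

fac = hashtable.Int16Factorizer(4)
first = np.array([10, 20, 10], dtype="int16")
second = np.array([20, 30], dtype="int16")
fac.factorize(first)        # -> array([0, 1, 0])
fac.factorize(second)       # -> array([1, 2]); 20 keeps the code it got earlier
fac.uniques.to_array()      # -> array([10, 20, 30], dtype=int16)
fac.count                   # -> 3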
-cdef class UInt8HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_uint8()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_uint8(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_uint8(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible for checking for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- uint8_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = (key)
- k = kh_get_uint8(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(uint8_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, uint8_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- uint8_t cval
-
- cval = (val)
- k = kh_get_uint8(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, uint8_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint8_t ckey
-
- ckey = (key)
- k = kh_put_uint8(self.table, ckey, &ret)
- if kh_exist_uint8(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- uint8_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const uint8_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint8_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val = (values[i])
- k = kh_put_uint8(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val = (values[i])
- k = kh_put_uint8(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const uint8_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- uint8_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = (values[i])
- k = kh_get_uint8(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
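map_locations and lookup together implement a plain value-to-position index: duplicate keys keep the last position seen, and misses come back as -1. A short sketch of the behaviour coded above (same pandas._libs.hashtable assumption as before):

import numpy as np
from pandas._libs import hashtable

values = np.array([3, 1, 3, 2, 1], dtype="uint8")
table = hashtable.UInt8HashTable(len(values))
table.map_locations(values)     # stores key -> last position it occurs at
table.get_item(2)               # -> 3
table.lookup(np.array([1, 2, 9], dtype="uint8"))
# -> array([4, 3, -1])          # 9 was never inserted, so -1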
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const uint8_t[:] values, UInt8Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint8]
- Array of values of which unique will be calculated
- uniques : UInt8Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[uint8]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- uint8_t val, na_value2
- khiter_t k
- UInt8VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = (na_value)
- else:
- na_value2 = (0)
-
- with nogil:
- for i in range(n):
- val = (values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_uint8_t(val) or
- (use_na_value and are_equivalent_uint8_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint8(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_uint8(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_uint8(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_uint8(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const uint8_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[uint8]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint8]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = UInt8Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const uint8_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[uint8]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[uint8]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = UInt8Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const uint8_t[:] values, UInt8Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
-cdef class UInt8Factorizer(Factorizer):
- cdef public:
- UInt8HashTable table
- UInt8Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = UInt8HashTable(size_hint)
- self.uniques = UInt8Vector()
-
- def factorize(self, const uint8_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = UInt8Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="uint8"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = UInt8Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-cdef class Int8HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_int8()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_int8(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_int8(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible for checking for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- int8_t ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = (key)
- k = kh_get_int8(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(int8_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, int8_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int8_t cval
-
- cval = (val)
- k = kh_get_int8(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, int8_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int8_t ckey
-
- ckey = (key)
- k = kh_put_int8(self.table, ckey, &ret)
- if kh_exist_int8(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- int8_t ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
-
- @cython.boundscheck(False)
- def map_locations(self, const int8_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int8_t val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val = (values[i])
- k = kh_put_int8(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val = (values[i])
- k = kh_put_int8(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const int8_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- int8_t val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = (values[i])
- k = kh_get_int8(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const int8_t[:] values, Int8Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int8]
- Array of values of which unique will be calculated
- uniques : Int8Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[int8]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- int8_t val, na_value2
- khiter_t k
- Int8VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = (na_value)
- else:
- na_value2 = (0)
-
- with nogil:
- for i in range(n):
- val = (values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_int8_t(val) or
- (use_na_value and are_equivalent_int8_t(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int8(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_int8(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_int8(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_int8(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const int8_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[int8]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[int8]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = Int8Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const int8_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[int8]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[int8]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = Int8Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const int8_t[:] values, Int8Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
-
-
-cdef class Int8Factorizer(Factorizer):
- cdef public:
- Int8HashTable table
- Int8Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = Int8HashTable(size_hint)
- self.uniques = Int8Vector()
-
- def factorize(self, const int8_t[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = Int8Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="int8"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = Int8Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-
-cdef class StringHashTable(HashTable):
- # these by-definition *must* be strings
- # or a sentinel np.nan / None missing value
- na_string_sentinel = '__nan__'
-
- def __init__(self, int64_t size_hint=1):
- self.table = kh_init_str()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_str(self.table, size_hint)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_str(self.table)
- self.table = NULL
-
- def sizeof(self, deep: bool = False) -> int:
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(char *) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, str val):
- cdef:
- khiter_t k
- const char *v
- v = get_c_string(val)
-
- k = kh_get_str(self.table, v)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef set_item(self, str key, Py_ssize_t val):
- cdef:
- khiter_t k
- int ret = 0
- const char *v
-
- v = get_c_string(key)
-
- k = kh_put_str(self.table, v, &ret)
- if kh_exist_str(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- @cython.boundscheck(False)
- def get_indexer(self, ndarray[object] values) -> ndarray:
- # -> np.ndarray[np.intp]
- cdef:
- Py_ssize_t i, n = len(values)
- ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
- intp_t *resbuf = <intp_t*>labels.data
- khiter_t k
- kh_str_t *table = self.table
- const char *v
- const char **vecs
-
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
- v = get_c_string(val)
- vecs[i] = v
-
- with nogil:
- for i in range(n):
- k = kh_get_str(table, vecs[i])
- if k != table.n_buckets:
- resbuf[i] = table.vals[k]
- else:
- resbuf[i] = -1
-
- free(vecs)
- return labels
-
- @cython.boundscheck(False)
- def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- const char *v
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
-
- # these by-definition *must* be strings
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
-
- if isinstance(val, str):
- # GH#31499 if we have a np.str_ get_c_string won't recognize
- # it as a str, even though isinstance does.
- v = get_c_string(<str>val)
- else:
- v = get_c_string(self.na_string_sentinel)
- vecs[i] = v
-
- with nogil:
- for i in range(n):
- v = vecs[i]
- k = kh_get_str(self.table, v)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- free(vecs)
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- def map_locations(self, ndarray[object] values, object mask = None) -> None:
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- const char *v
- const char **vecs
- khiter_t k
-
- # these by-definition *must* be strings
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
-
- if isinstance(val, str):
- # GH#31499 if we have a np.str_ get_c_string won't recognize
- # it as a str, even though isinstance does.
- v = get_c_string(<str>val)
- else:
- v = get_c_string(self.na_string_sentinel)
- vecs[i] = v
-
- with nogil:
- for i in range(n):
- v = vecs[i]
- k = kh_put_str(self.table, v, &ret)
- self.table.vals[k] = i
- free(vecs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- bint return_inverse=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- uniques : ObjectVector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then any value
- that is not a string is considered missing. If na_value is
- not None, then _additionally_ any value "val" satisfying
- val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int64_t[::1] uindexer
- int ret = 0
- object val
- const char *v
- const char **vecs
- khiter_t k
- bint use_na_value
-
- if return_inverse:
- labels = np.zeros(n, dtype=np.intp)
- uindexer = np.empty(n, dtype=np.int64)
- use_na_value = na_value is not None
-
- # assign pointers and pre-filter out missing (if ignore_na)
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
-
- if (ignore_na
- and (not isinstance(val, str)
- or (use_na_value and val == na_value))):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), we can skip the actual value, and
- # replace the label with na_sentinel directly
- labels[i] = na_sentinel
- else:
- # if ignore_na is False, we also stringify NaN/None/etc.
- try:
- v = get_c_string(<str>val)
- except UnicodeEncodeError:
- v = get_c_string(<str>repr(val))
- vecs[i] = v
-
- # compute
- with nogil:
- for i in range(n):
- if ignore_na and labels[i] == na_sentinel:
- # skip entries for ignored missing values (see above)
- continue
-
- v = vecs[i]
- k = kh_get_str(self.table, v)
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_str(self.table, v, &ret)
- uindexer[count] = i
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- free(vecs)
-
- # uniques
- for i in range(count):
- uniques.append(values[uindexer[i]])
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- return uniques.to_array()
-
- def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- Not yet implemented for StringHashTable
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- """
- uniques = ObjectVector()
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse)
-
- def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then any value
- that is not a string is considered missing. If na_value is
- not None, then _additionally_ any value "val" satisfying
- val == na_value is considered missing.
- mask : ndarray[bool], optional
- Not yet implemented for StringHashTable.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp]
- The labels from values to uniques
- """
- uniques_vector = ObjectVector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na,
- return_inverse=True)
-
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True)
- return labels
-
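StringHashTable hashes the UTF-8 buffers of its keys (via get_c_string) rather than the Python objects themselves, and anything that is not a str is treated as missing when ignore_na is set. A brief sketch of factorize on an object-dtype array of strings, tracing the logic above:

import numpy as np
from pandas._libs import hashtable

sht = hashtable.StringHashTable()
vals = np.array(["a", "b", "a", None], dtype=object)
uniques, labels = sht.factorize(vals)
# uniques -> array(['a', 'b'], dtype=object)
# labels  -> array([ 0, 1, 0, -1])   None is not a str, so it gets na_sentinel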
-
-cdef class PyObjectHashTable(HashTable):
-
- def __init__(self, int64_t size_hint=1):
- self.table = kh_init_pymap()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_pymap(self.table, size_hint)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_pymap(self.table)
- self.table = NULL
-
- def __len__(self) -> int:
- return self.table.size
-
- def __contains__(self, object key) -> bool:
- cdef:
- khiter_t k
- hash(key)
-
- k = kh_get_pymap(self.table, <PyObject*>key)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """
- returns info about the current state of the hashtable, like size,
- number of buckets and so on.
- """
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, object val):
- cdef:
- khiter_t k
-
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef set_item(self, object key, Py_ssize_t val):
- cdef:
- khiter_t k
- int ret = 0
- char* buf
-
- hash(key)
-
- k = kh_put_pymap(self.table, <PyObject*>key, &ret)
- if kh_exist_pymap(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- def map_locations(self, ndarray[object] values, object mask = None) -> None:
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- khiter_t k
-
- for i in range(n):
- val = values[i]
- hash(val)
-
- k = kh_put_pymap(self.table, <PyObject*>val, &ret)
- self.table.vals[k] = i
-
- def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
-
- for i in range(n):
- val = values[i]
- hash(val)
-
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- bint return_inverse=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- uniques : ObjectVector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then None _plus_
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- object val
- khiter_t k
- bint use_na_value
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- use_na_value = na_value is not None
-
- for i in range(n):
- val = values[i]
- hash(val)
-
- if ignore_na and (
- checknull(val)
- or (use_na_value and val == na_value)
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them, and
- # replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
-
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_pymap(self.table, <PyObject*>val, &ret)
- uniques.append(val)
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- return uniques.to_array()
-
- def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- Not yet implemented for PyObjectHashTable
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- """
- uniques = ObjectVector()
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse)
-
- def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then None _plus_
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- Not yet implemented for PyObjectHashTable.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = ObjectVector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na,
- return_inverse=True)
-
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True)
- return labels
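PyObjectHashTable is the fallback for arbitrary hashable Python objects; with ignore_na, missing values are whatever checknull accepts (roughly None, np.nan, NaT, pd.NA). A matching sketch under the same import assumption:

import numpy as np
from pandas._libs import hashtable

pht = hashtable.PyObjectHashTable()
vals = np.array(["x", 3.5, "x", None], dtype=object)
uniques, labels = pht.factorize(vals)
# uniques -> array(['x', 3.5], dtype=object)
# labels  -> array([ 0, 1, 0, -1])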
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in
deleted file mode 100644
index d4d3117a32a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashtable_class_helper.pxi.in
+++ /dev/null
@@ -1,1506 +0,0 @@
-"""
-Template for each `dtype` helper function for hashtable
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-
-{{py:
-
-# name
-complex_types = ['complex64',
- 'complex128']
-}}
-
-{{for name in complex_types}}
-cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
- cdef kh{{name}}_t res
- res.real = val.real
- res.imag = val.imag
- return res
-
-{{endfor}}
-
-
-{{py:
-
-
-# name
-c_types = ['khcomplex128_t',
- 'khcomplex64_t',
- 'float64_t',
- 'float32_t',
- 'int64_t',
- 'int32_t',
- 'int16_t',
- 'int8_t',
- 'uint64_t',
- 'uint32_t',
- 'uint16_t',
- 'uint8_t']
-}}
-
-{{for c_type in c_types}}
-
-cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
- {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
- return val.real != val.real or val.imag != val.imag
- {{elif c_type in {'float64_t', 'float32_t'} }}
- return val != val
- {{else}}
- return False
- {{endif}}
-
-
-{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
-# are_equivalent_{{c_type}} is cimported via khash.pxd
-{{else}}
-cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
- return val1 == val2
-{{endif}}
-
-{{endfor}}
-
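These tempita loops are what produce the per-dtype helpers seen in the generated .pxi above. For illustration, the int64_t instantiation of the two helpers would expand to roughly:

cdef bint is_nan_int64_t(int64_t val) nogil:
    return False

cdef bint are_equivalent_int64_t(int64_t val1, int64_t val2) nogil:
    return val1 == val2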
-
-{{py:
-
-# name
-cimported_types = ['complex64',
- 'complex128',
- 'float32',
- 'float64',
- 'int8',
- 'int16',
- 'int32',
- 'int64',
- 'pymap',
- 'str',
- 'strbox',
- 'uint8',
- 'uint16',
- 'uint32',
- 'uint64']
-}}
-
-{{for name in cimported_types}}
-from pandas._libs.khash cimport (
- kh_destroy_{{name}},
- kh_exist_{{name}},
- kh_get_{{name}},
- kh_init_{{name}},
- kh_put_{{name}},
- kh_resize_{{name}},
-)
-
-{{endfor}}
-
-# ----------------------------------------------------------------------
-# VectorData
-# ----------------------------------------------------------------------
-
-from pandas._libs.tslibs.util cimport get_c_string
-from pandas._libs.missing cimport C_NA
-
-
-{{py:
-
-# name, dtype, c_type
-# the generated StringVector is not actually used
-# but is included for completeness (rather ObjectVector is used
-# for uniques in hashtables)
-
-dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
- ('Complex64', 'complex64', 'khcomplex64_t'),
- ('Float64', 'float64', 'float64_t'),
- ('Float32', 'float32', 'float32_t'),
- ('Int64', 'int64', 'int64_t'),
- ('Int32', 'int32', 'int32_t'),
- ('Int16', 'int16', 'int16_t'),
- ('Int8', 'int8', 'int8_t'),
- ('String', 'string', 'char *'),
- ('UInt64', 'uint64', 'uint64_t'),
- ('UInt32', 'uint32', 'uint32_t'),
- ('UInt16', 'uint16', 'uint16_t'),
- ('UInt8', 'uint8', 'uint8_t')]
-}}
-
-{{for name, dtype, c_type in dtypes}}
-
-
-{{if dtype != 'int64'}}
-# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
-# by IntervalTree
-
-ctypedef struct {{name}}VectorData:
- {{c_type}} *data
- Py_ssize_t n, m
-
-{{endif}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void append_data_{{dtype}}({{name}}VectorData *data,
- {{c_type}} x) nogil:
-
- data.data[data.n] = x
- data.n += 1
-
-{{endfor}}
-
-ctypedef fused vector_data:
- Int64VectorData
- Int32VectorData
- Int16VectorData
- Int8VectorData
- UInt64VectorData
- UInt32VectorData
- UInt16VectorData
- UInt8VectorData
- Float64VectorData
- Float32VectorData
- Complex128VectorData
- Complex64VectorData
- StringVectorData
-
-cdef bint needs_resize(vector_data *data) nogil:
- return data.n == data.m
-
-# ----------------------------------------------------------------------
-# Vector
-# ----------------------------------------------------------------------
-
-cdef class Vector:
- # cdef readonly:
- # bint external_view_exists
-
- def __cinit__(self):
- self.external_view_exists = False
-
-
-{{py:
-
-# name, dtype, c_type
-dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
- ('Complex64', 'complex64', 'khcomplex64_t'),
- ('Float64', 'float64', 'float64_t'),
- ('UInt64', 'uint64', 'uint64_t'),
- ('Int64', 'int64', 'int64_t'),
- ('Float32', 'float32', 'float32_t'),
- ('UInt32', 'uint32', 'uint32_t'),
- ('Int32', 'int32', 'int32_t'),
- ('UInt16', 'uint16', 'uint16_t'),
- ('Int16', 'int16', 'int16_t'),
- ('UInt8', 'uint8', 'uint8_t'),
- ('Int8', 'int8', 'int8_t')]
-
-}}
-
-{{for name, dtype, c_type in dtypes}}
-
-cdef class {{name}}Vector(Vector):
-
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- {{if dtype != 'int64'}}
- cdef:
- {{name}}VectorData *data
- ndarray ao
- {{endif}}
-
- def __cinit__(self):
- self.data = <{{name}}VectorData *>PyMem_Malloc(
- sizeof({{name}}VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
- self.data.data = <{{c_type}}*>self.ao.data
-
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <{{c_type}}*>self.ao.data
-
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
-
- cdef void append(self, {{c_type}} x):
-
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
-
- append_data_{{dtype}}(self.data, x)
-
- cdef extend(self, const {{c_type}}[:] x):
- for i in range(len(x)):
- self.append(x[i])
-
-{{endfor}}
-
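The typed vectors above append into a preallocated ndarray and grow it only when needs_resize fires (n == m), quadrupling the capacity each time and trimming on export. A pure-Python sketch of that policy, purely illustrative (GrowableVector is not part of pandas, and the 128-element initial capacity mirrors the assumed value of _INIT_VEC_CAP):

import numpy as np

class GrowableVector:
    """Illustrative stand-in for the generated {{name}}Vector classes."""

    INIT_CAP = 128          # assumed value of _INIT_VEC_CAP

    def __init__(self, dtype):
        self.n = 0
        self.ao = np.empty(self.INIT_CAP, dtype=dtype)

    def append(self, x):
        if self.n == len(self.ao):                     # needs_resize: n == m
            self.ao.resize(max(len(self.ao) * 4, self.INIT_CAP), refcheck=False)
        self.ao[self.n] = x
        self.n += 1

    def to_array(self):
        self.ao.resize(self.n, refcheck=False)         # trim to the used length
        return self.ao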
-cdef class StringVector(Vector):
-
- cdef:
- StringVectorData *data
-
- def __cinit__(self):
- self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.data.data = <char **>malloc(self.data.m * sizeof(char *))
- if not self.data.data:
- raise MemoryError()
-
- cdef resize(self):
- cdef:
- char **orig_data
- Py_ssize_t i, m
-
- m = self.data.m
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
-
- orig_data = self.data.data
- self.data.data = <char **>malloc(self.data.m * sizeof(char *))
- if not self.data.data:
- raise MemoryError()
- for i in range(m):
- self.data.data[i] = orig_data[i]
-
- def __dealloc__(self):
- if self.data is not NULL:
- if self.data.data is not NULL:
- free(self.data.data)
- PyMem_Free(self.data)
- self.data = NULL
-
- def __len__(self) -> int:
- return self.data.n
-
- cpdef ndarray[object, ndim=1] to_array(self):
- cdef:
- ndarray ao
- Py_ssize_t n
- object val
-
- ao = np.empty(self.data.n, dtype=object)
- for i in range(self.data.n):
- val = self.data.data[i]
- ao[i] = val
- self.external_view_exists = True
- self.data.m = self.data.n
- return ao
-
- cdef void append(self, char *x):
-
- if needs_resize(self.data):
- self.resize()
-
- append_data_string(self.data, x)
-
- cdef extend(self, ndarray[object] x):
- for i in range(len(x)):
- self.append(x[i])
-
-
-cdef class ObjectVector(Vector):
-
- cdef:
- PyObject **data
- Py_ssize_t n, m
- ndarray ao
-
- def __cinit__(self):
- self.n = 0
- self.m = _INIT_VEC_CAP
- self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
- self.data = <PyObject**>self.ao.data
-
- def __len__(self) -> int:
- return self.n
-
- cdef append(self, object obj):
- if self.n == self.m:
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.m = max(self.m * 2, _INIT_VEC_CAP)
- self.ao.resize(self.m, refcheck=False)
- self.data = <PyObject**>self.ao.data
-
- Py_INCREF(obj)
- self.data[self.n] = <PyObject*>obj
- self.n += 1
-
- cpdef ndarray[object, ndim=1] to_array(self):
- if self.m != self.n:
- if self.external_view_exists:
- raise ValueError("should have raised on append()")
- self.ao.resize(self.n, refcheck=False)
- self.m = self.n
- self.external_view_exists = True
- return self.ao
-
- cdef extend(self, ndarray[object] x):
- for i in range(len(x)):
- self.append(x[i])
-
-# ----------------------------------------------------------------------
-# HashTable
-# ----------------------------------------------------------------------
-
-
-cdef class HashTable:
-
- pass
-
-{{py:
-
-# name, dtype, c_type, to_c_type
-dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
- ('Float64', 'float64', 'float64_t', ''),
- ('UInt64', 'uint64', 'uint64_t', ''),
- ('Int64', 'int64', 'int64_t', ''),
- ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
- ('Float32', 'float32', 'float32_t', ''),
- ('UInt32', 'uint32', 'uint32_t', ''),
- ('Int32', 'int32', 'int32_t', ''),
- ('UInt16', 'uint16', 'uint16_t', ''),
- ('Int16', 'int16', 'int16_t', ''),
- ('UInt8', 'uint8', 'uint8_t', ''),
- ('Int8', 'int8', 'int8_t', '')]
-
-}}
-
-
-{{for name, dtype, c_type, to_c_type in dtypes}}
-
-cdef class {{name}}HashTable(HashTable):
-
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_{{dtype}}()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_{{dtype}}(self.table, size_hint)
-
- self.uses_mask = uses_mask
- self.na_position = -1
-
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_{{dtype}}(self.table)
- self.table = NULL
-
- def __contains__(self, object key) -> bool:
- # The caller is responsible for checking for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- {{c_type}} ckey
-
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
-
- ckey = {{to_c_type}}(key)
- k = kh_get_{{dtype}}(self.table, ckey)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, {{dtype}}_t val):
- """Extracts the position of val from the hashtable.
-
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
-
- Returns
- -------
- The position of the requested integer.
- """
-
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- {{c_type}} cval
-
- cval = {{to_c_type}}(val)
- k = kh_get_{{dtype}}(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
-
- Returns
- -------
- The position of the last na value.
- """
-
- if not self.uses_mask:
- raise NotImplementedError
-
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
-
- cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- {{c_type}} ckey
-
- ckey = {{to_c_type}}(key)
- k = kh_put_{{dtype}}(self.table, ckey, &ret)
- if kh_exist_{{dtype}}(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- {{c_type}} ckey
-
- if not self.uses_mask:
- raise NotImplementedError
-
- self.na_position = val
-
- {{if dtype == "int64" }}
- # We only use this for int64; implementing it only for int64 reduces
- # build size and makes the .pyi more accurate
- @cython.boundscheck(False)
- def map_keys_to_values(
- self, const {{dtype}}_t[:] keys, const int64_t[:] values
- ) -> None:
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- {{c_type}} key
- khiter_t k
-
- with nogil:
- for i in range(n):
- key = {{to_c_type}}(keys[i])
- k = kh_put_{{dtype}}(self.table, key, &ret)
- self.table.vals[k] = <Py_ssize_t>values[i]
- {{endif}}
-
- @cython.boundscheck(False)
- def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- {{c_type}} val
- khiter_t k
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val = {{to_c_type}}(values[i])
- k = kh_put_{{dtype}}(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val = {{to_c_type}}(values[i])
- k = kh_put_{{dtype}}(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
-
- @cython.boundscheck(False)
- def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- {{c_type}} val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
-
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
-
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = {{to_c_type}}(values[i])
- k = kh_get_{{dtype}}(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[{{dtype}}]
- Array of values of which unique will be calculated
- uniques : {{name}}Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
-
- Returns
- -------
- uniques : ndarray[{{dtype}}]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- {{c_type}} val, na_value2
- khiter_t k
- {{name}}VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
-
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
-
- result_mask = UInt8Vector()
- rmd = result_mask.data
-
- if use_mask:
- mask_values = mask.view("uint8")
-
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = {{to_c_type}}(na_value)
- else:
- na_value2 = {{to_c_type}}(0)
-
- with nogil:
- for i in range(n):
- val = {{to_c_type}}(values[i])
-
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_{{c_type}}(val) or
- (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
-
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_{{dtype}}(ud, val)
- append_data_uint8(rmd, 1)
- continue
-
- k = kh_get_{{dtype}}(self.table, val)
-
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_{{dtype}}(self.table, val, &ret)
-
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_{{dtype}}(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
-
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
-
- def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[{{dtype}}]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[{{dtype}}]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = {{name}}Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
-
- def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[{{dtype}}]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- uniques : ndarray[{{dtype}}]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = {{name}}Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
-
- def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
-
- {{if dtype == 'int64'}}
- @cython.boundscheck(False)
- def get_labels_groupby(
- self, const {{dtype}}_t[:] values
- ) -> tuple[ndarray, ndarray]:
- # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
- cdef:
- Py_ssize_t i, n = len(values)
- intp_t[::1] labels
- Py_ssize_t idx, count = 0
- int ret = 0
- {{c_type}} val
- khiter_t k
- {{name}}Vector uniques = {{name}}Vector()
- {{name}}VectorData *ud
-
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
-
- with nogil:
- for i in range(n):
- val = {{to_c_type}}(values[i])
-
- # specific for groupby
- if val < 0:
- labels[i] = -1
- continue
-
- k = kh_get_{{dtype}}(self.table, val)
- if k != self.table.n_buckets:
- idx = self.table.vals[k]
- labels[i] = idx
- else:
- k = kh_put_{{dtype}}(self.table, val, &ret)
- self.table.vals[k] = count
-
- if needs_resize(ud):
- with gil:
- uniques.resize()
- append_data_{{dtype}}(ud, val)
- labels[i] = count
- count += 1
-
- arr_uniques = uniques.to_array()
-
- return np.asarray(labels), arr_uniques
- {{endif}}
-
-
-cdef class {{name}}Factorizer(Factorizer):
- cdef public:
- {{name}}HashTable table
- {{name}}Vector uniques
-
- def __cinit__(self, size_hint: int):
- self.table = {{name}}HashTable(size_hint)
- self.uniques = {{name}}Vector()
-
- def factorize(self, const {{c_type}}[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
-
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
-
- >>> fac = {{name}}Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
-
- if self.uniques.external_view_exists:
- uniques = {{name}}Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
-
-{{endfor}}
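
The templated block above generates one HashTable and one Factorizer per entry in the `dtypes` list (Int64HashTable, Int64Factorizer, Float64HashTable, and so on). As a rough illustration of how the generated classes are driven from Python, here is a minimal sketch; it assumes the compiled pandas._libs.hashtable extension built from this tree is importable, and the values in the comments are read off the _unique logic above rather than verified output.

import numpy as np
from pandas._libs import hashtable as ht

values = np.array([3, 1, 3, 2, 1], dtype="int64")

# factorize() returns (uniques, labels); uniques keep first-seen order
table = ht.Int64HashTable(len(values))
uniques, labels = table.factorize(values)
print(uniques)   # expected: [3 1 2]
print(labels)    # expected: [0 1 0 2 1]

# the Factorizer wraps a table plus a uniques vector that grows across calls
fac = ht.Int64Factorizer(len(values))
print(fac.factorize(values))   # expected: [0 1 0 2 1]
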
-
-
-cdef class StringHashTable(HashTable):
- # these by-definition *must* be strings
- # or a sentinel np.nan / None missing value
- na_string_sentinel = '__nan__'
-
- def __init__(self, int64_t size_hint=1):
- self.table = kh_init_str()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_str(self.table, size_hint)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_str(self.table)
- self.table = NULL
-
- def sizeof(self, deep: bool = False) -> int:
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(char *) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, str val):
- cdef:
- khiter_t k
- const char *v
- v = get_c_string(val)
-
- k = kh_get_str(self.table, v)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef set_item(self, str key, Py_ssize_t val):
- cdef:
- khiter_t k
- int ret = 0
- const char *v
-
- v = get_c_string(key)
-
- k = kh_put_str(self.table, v, &ret)
- if kh_exist_str(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- @cython.boundscheck(False)
- def get_indexer(self, ndarray[object] values) -> ndarray:
- # -> np.ndarray[np.intp]
- cdef:
- Py_ssize_t i, n = len(values)
- ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
- intp_t *resbuf = <intp_t*>labels.data
- khiter_t k
- kh_str_t *table = self.table
- const char *v
- const char **vecs
-
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
- v = get_c_string(val)
- vecs[i] = v
-
- with nogil:
- for i in range(n):
- k = kh_get_str(table, vecs[i])
- if k != table.n_buckets:
- resbuf[i] = table.vals[k]
- else:
- resbuf[i] = -1
-
- free(vecs)
- return labels
-
- @cython.boundscheck(False)
- def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- const char *v
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
-
- # these by-definition *must* be strings
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
-
- if isinstance(val, str):
- # GH#31499 if we have a np.str_ get_c_string won't recognize
- # it as a str, even though isinstance does.
- v = get_c_string(<str>val)
- else:
- v = get_c_string(self.na_string_sentinel)
- vecs[i] = v
-
- with nogil:
- for i in range(n):
- v = vecs[i]
- k = kh_get_str(self.table, v)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- free(vecs)
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- def map_locations(self, ndarray[object] values, object mask = None) -> None:
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- const char *v
- const char **vecs
- khiter_t k
-
- # these by-definition *must* be strings
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
-
- if isinstance(val, str):
- # GH#31499 if we have a np.str_ get_c_string won't recognize
- # it as a str, even though isinstance does.
- v = get_c_string(<str>val)
- else:
- v = get_c_string(self.na_string_sentinel)
- vecs[i] = v
-
- with nogil:
- for i in range(n):
- v = vecs[i]
- k = kh_put_str(self.table, v, &ret)
- self.table.vals[k] = i
- free(vecs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- bint return_inverse=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- uniques : ObjectVector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then any value
- that is not a string is considered missing. If na_value is
- not None, then _additionally_ any value "val" satisfying
- val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int64_t[::1] uindexer
- int ret = 0
- object val
- const char *v
- const char **vecs
- khiter_t k
- bint use_na_value
-
- if return_inverse:
- labels = np.zeros(n, dtype=np.intp)
- uindexer = np.empty(n, dtype=np.int64)
- use_na_value = na_value is not None
-
- # assign pointers and pre-filter out missing (if ignore_na)
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
-
- if (ignore_na
- and (not isinstance(val, str)
- or (use_na_value and val == na_value))):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), we can skip the actual value, and
- # replace the label with na_sentinel directly
- labels[i] = na_sentinel
- else:
- # if ignore_na is False, we also stringify NaN/None/etc.
- try:
- v = get_c_string(<str>val)
- except UnicodeEncodeError:
- v = get_c_string(<str>repr(val))
- vecs[i] = v
-
- # compute
- with nogil:
- for i in range(n):
- if ignore_na and labels[i] == na_sentinel:
- # skip entries for ignored missing values (see above)
- continue
-
- v = vecs[i]
- k = kh_get_str(self.table, v)
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_str(self.table, v, &ret)
- uindexer[count] = i
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- free(vecs)
-
- # uniques
- for i in range(count):
- uniques.append(values[uindexer[i]])
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- return uniques.to_array()
-
- def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- Not yet implemented for StringHashTable
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- """
- uniques = ObjectVector()
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse)
-
- def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then any value
- that is not a string is considered missing. If na_value is
- not None, then _additionally_ any value "val" satisfying
- val == na_value is considered missing.
- mask : ndarray[bool], optional
- Not yet implemented for StringHashTable.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp]
- The labels from values to uniques
- """
- uniques_vector = ObjectVector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na,
- return_inverse=True)
-
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True)
- return labels
-
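
StringHashTable differs from the numeric tables above in that missingness is decided by type: per the _unique docstring, anything that is not a str is treated as missing when na_value is None. A minimal sketch of that behaviour, again assuming the compiled pandas._libs.hashtable extension is importable and with expected values inferred from the code above:

import numpy as np
from pandas._libs import hashtable as ht

values = np.array(["a", "b", None, "a"], dtype=object)
table = ht.StringHashTable(len(values))

# factorize() drops missing entries from uniques and marks them with na_sentinel
uniques, labels = table.factorize(values, na_sentinel=-1)
# expected: uniques -> ['a' 'b'], labels -> [ 0  1 -1  0]
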
-
-cdef class PyObjectHashTable(HashTable):
-
- def __init__(self, int64_t size_hint=1):
- self.table = kh_init_pymap()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_pymap(self.table, size_hint)
-
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_pymap(self.table)
- self.table = NULL
-
- def __len__(self) -> int:
- return self.table.size
-
- def __contains__(self, object key) -> bool:
- cdef:
- khiter_t k
- hash(key)
-
- k = kh_get_pymap(self.table, <PyObject*>key)
- return k != self.table.n_buckets
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
-
- def get_state(self) -> dict[str, int]:
- """
- Return info about the current state of the hashtable, such as its size
- and number of buckets.
- """
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
-
- cpdef get_item(self, object val):
- cdef:
- khiter_t k
-
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
-
- cpdef set_item(self, object key, Py_ssize_t val):
- cdef:
- khiter_t k
- int ret = 0
- char* buf
-
- hash(key)
-
- k = kh_put_pymap(self.table, <PyObject*>key, &ret)
- if kh_exist_pymap(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
-
- def map_locations(self, ndarray[object] values, object mask = None) -> None:
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- khiter_t k
-
- for i in range(n):
- val = values[i]
- hash(val)
-
- k = kh_put_pymap(self.table, <PyObject*>val, &ret)
- self.table.vals[k] = i
-
- def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
-
- for i in range(n):
- val = values[i]
- hash(val)
-
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
-
- return np.asarray(locs)
-
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- bint return_inverse=False):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- uniques : ObjectVector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then None _plus_
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- object val
- khiter_t k
- bint use_na_value
-
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- use_na_value = na_value is not None
-
- for i in range(n):
- val = values[i]
- hash(val)
-
- if ignore_na and (
- checknull(val)
- or (use_na_value and val == na_value)
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them, and
- # replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
-
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_pymap(self.table, <PyObject*>val, &ret)
- uniques.append(val)
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
-
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- return uniques.to_array()
-
- def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- Not yet implemented for PyObjectHashTable
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- """
- uniques = ObjectVector()
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse)
-
- def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
-
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
-
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then None _plus_
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- Not yet implemented for PyObjectHashTable.
-
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = ObjectVector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na,
- return_inverse=True)
-
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True)
- return labels
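
PyObjectHashTable closes out this file: it hashes arbitrary Python objects (calling hash() up front so unhashable keys fail early) and backs the object-dtype paths of the methods above. A small usage sketch, with the same caveat that it assumes the compiled pandas._libs.hashtable extension is available and that the commented results follow from the code above:

import numpy as np
from pandas._libs import hashtable as ht

table = ht.PyObjectHashTable(8)
table.map_locations(np.array(["x", 42, ("a", 1)], dtype=object))

print(table.get_item("x"))    # expected: 0
# lookup() returns -1 for keys that were never inserted
print(table.lookup(np.array([42, "missing"], dtype=object)))   # expected: [ 1 -1]
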
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi b/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi
deleted file mode 100644
index d148c62f573..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi
+++ /dev/null
@@ -1,2755 +0,0 @@
-"""
-Template for each `dtype` helper function for hashtable
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_complex128(const complex128_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_complex128_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- khcomplex128_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009);
- # the khash map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Complex128Vector()
- table = kh_init_complex128()
-
- kh_resize_complex128(table, n)
-
- for i in range(n):
- val = to_khcomplex128_t(values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_khcomplex128_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_complex128(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_complex128(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_complex128(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_complex128(table)
-
- return result_keys.to_array(), result_counts.base
-
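
The value_count_* helpers above keep two structures because the khash map is not insertion-ordered: the table maps each key to its count, while result_keys records the order in which keys were first seen (GH39009). In plain Python, where dict already preserves insertion order, the same idea can be sketched as follows; value_count_py is a hypothetical illustration, not part of pandas.

import numpy as np

def value_count_py(values, dropna=True):
    # dict plays both roles: key -> count, and first-seen key order
    counts = {}
    for val in values:
        if dropna and val != val:   # NaN check, mirroring is_nan_*(val)
            continue
        counts[val] = counts.get(val, 0) + 1
    keys = np.array(list(counts))
    return keys, np.fromiter(counts.values(), dtype=np.int64, count=len(counts))

keys, cnts = value_count_py([3.0, 1.0, 3.0, float("nan"), 1.0])
# expected: keys -> [3. 1.], cnts -> [2 2]
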
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_complex128(const complex128_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- khcomplex128_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_complex128_t *table = kh_init_complex128()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_complex128(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = to_khcomplex128_t(values[i])
- kh_put_complex128(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = to_khcomplex128_t(values[i])
- kh_put_complex128(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = to_khcomplex128_t(values[i])
- k = kh_get_complex128(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_complex128(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_complex128(table)
- return out
-
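
duplicated_* implements the three keep modes with a single put per element: ret == 0 from kh_put means the key already existed, so the element is a duplicate. Masked (NA) entries are tracked separately via seen_na / first_na so they follow the same rules without going through the table. Ignoring that mask handling, the keep semantics can be sketched in plain Python; duplicated_py is a hypothetical illustration, not part of pandas.

def duplicated_py(values, keep="first"):
    if keep not in ("first", "last", False):
        raise ValueError('keep must be either "first", "last" or False')
    n = len(values)
    out = [False] * n
    if keep is False:
        # mark every occurrence of any value seen more than once
        first_seen = {}
        for i, val in enumerate(values):
            if val in first_seen:
                out[first_seen[val]] = True
                out[i] = True
            else:
                first_seen[val] = i
        return out
    # keep="first" scans forward, keep="last" scans backward
    order = range(n) if keep == "first" else range(n - 1, -1, -1)
    seen = set()
    for i in order:
        out[i] = values[i] in seen
        seen.add(values[i])
    return out

duplicated_py([1, 2, 1, 3, 1])               # [False, False, True, False, True]
duplicated_py([1, 2, 1, 3, 1], keep="last")  # [True, False, True, False, False]
duplicated_py([1, 2, 1, 3, 1], keep=False)   # [True, False, True, False, True]
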
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_complex128(const complex128_t[:] arr, const complex128_t[:] values):
- """
- Return a boolean array indicating, element by element,
- whether each value of arr is contained in values
-
- Parameters
- ----------
- arr : complex128 ndarray
- values : complex128 ndarray
-
- Returns
- -------
- boolean ndarray of length len(arr)
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- khcomplex128_t val
-
- kh_complex128_t *table = kh_init_complex128()
-
- # construct the table
- n = len(values)
- kh_resize_complex128(table, n)
-
- with nogil:
- for i in range(n):
- val = to_khcomplex128_t(values[i])
- kh_put_complex128(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = to_khcomplex128_t(arr[i])
- k = kh_get_complex128(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_complex128(table)
- return result.view(np.bool_)
-
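
ismember_* is a vectorised membership test: it inserts every element of values into a table once, then probes that table for every element of arr. The equivalent in plain Python/NumPy looks like this; ismember_py is a hypothetical illustration, not part of pandas.

import numpy as np

def ismember_py(arr, values):
    table = set(values)   # "construct the table"
    return np.fromiter((x in table for x in arr), dtype=bool, count=len(arr))

ismember_py(np.array([1.0, 4.0, 2.0]), np.array([2.0, 1.0]))
# expected: array([ True, False,  True])
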
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_complex64(const complex64_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_complex64_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- khcomplex64_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009);
- # the khash map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Complex64Vector()
- table = kh_init_complex64()
-
- kh_resize_complex64(table, n)
-
- for i in range(n):
- val = to_khcomplex64_t(values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_khcomplex64_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_complex64(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_complex64(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_complex64(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_complex64(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_complex64(const complex64_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- khcomplex64_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_complex64_t *table = kh_init_complex64()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_complex64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = to_khcomplex64_t(values[i])
- kh_put_complex64(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = to_khcomplex64_t(values[i])
- kh_put_complex64(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = to_khcomplex64_t(values[i])
- k = kh_get_complex64(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_complex64(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_complex64(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_complex64(const complex64_t[:] arr, const complex64_t[:] values):
- """
- Return a boolean array indicating, element by element,
- whether each value of arr is contained in values
-
- Parameters
- ----------
- arr : complex64 ndarray
- values : complex64 ndarray
-
- Returns
- -------
- boolean ndarray of length len(arr)
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- khcomplex64_t val
-
- kh_complex64_t *table = kh_init_complex64()
-
- # construct the table
- n = len(values)
- kh_resize_complex64(table, n)
-
- with nogil:
- for i in range(n):
- val = to_khcomplex64_t(values[i])
- kh_put_complex64(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = to_khcomplex64_t(arr[i])
- k = kh_get_complex64(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_complex64(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_float64(const float64_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_float64_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- float64_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009);
- # the khash map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Float64Vector()
- table = kh_init_float64()
-
- kh_resize_float64(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_float64_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_float64(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_float64(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_float64(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_float64(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_float64(const float64_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- float64_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_float64_t *table = kh_init_float64()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_float64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_float64(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_float64(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_float64(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_float64(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_float64(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_float64(const float64_t[:] arr, const float64_t[:] values):
- """
- Return a boolean array indicating, element by element,
- whether each value of arr is contained in values
-
- Parameters
- ----------
- arr : float64 ndarray
- values : float64 ndarray
-
- Returns
- -------
- boolean ndarray of length len(arr)
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- float64_t val
-
- kh_float64_t *table = kh_init_float64()
-
- # construct the table
- n = len(values)
- kh_resize_float64(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_float64(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_float64(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_float64(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_float32(const float32_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_float32_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- float32_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009);
- # the khash map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Float32Vector()
- table = kh_init_float32()
-
- kh_resize_float32(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_float32_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_float32(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_float32(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_float32(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_float32(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_float32(const float32_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- float32_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_float32_t *table = kh_init_float32()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_float32(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_float32(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_float32(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_float32(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_float32(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_float32(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_float32(const float32_t[:] arr, const float32_t[:] values):
- """
- Return a boolean array indicating, element by element,
- whether each value of arr is contained in values
-
- Parameters
- ----------
- arr : float32 ndarray
- values : float32 ndarray
-
- Returns
- -------
- boolean ndarray of length len(arr)
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- float32_t val
-
- kh_float32_t *table = kh_init_float32()
-
- # construct the table
- n = len(values)
- kh_resize_float32(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_float32(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_float32(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_float32(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_uint64(const uint64_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_uint64_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- uint64_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009);
- # the khash map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = UInt64Vector()
- table = kh_init_uint64()
-
- kh_resize_uint64(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_uint64_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_uint64(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_uint64(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_uint64(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_uint64(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_uint64(const uint64_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- uint64_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_uint64_t *table = kh_init_uint64()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_uint64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint64(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint64(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_uint64(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_uint64(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_uint64(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_uint64(const uint64_t[:] arr, const uint64_t[:] values):
- """
- Return a boolean array indicating, element by element,
- whether each value of arr is contained in values
-
- Parameters
- ----------
- arr : uint64 ndarray
- values : uint64 ndarray
-
- Returns
- -------
- boolean ndarray of length len(arr)
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- uint64_t val
-
- kh_uint64_t *table = kh_init_uint64()
-
- # construct the table
- n = len(values)
- kh_resize_uint64(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_uint64(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_uint64(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_uint64(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_uint32(const uint32_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_uint32_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- uint32_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009);
- # the khash map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = UInt32Vector()
- table = kh_init_uint32()
-
- kh_resize_uint32(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_uint32_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_uint32(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_uint32(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_uint32(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_uint32(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_uint32(const uint32_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- uint32_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_uint32_t *table = kh_init_uint32()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_uint32(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint32(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint32(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_uint32(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_uint32(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_uint32(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_uint32(const uint32_t[:] arr, const uint32_t[:] values):
- """
- Return a boolean array indicating, element by element,
- whether each value of arr is contained in values
-
- Parameters
- ----------
- arr : uint32 ndarray
- values : uint32 ndarray
-
- Returns
- -------
- boolean ndarray of length len(arr)
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- uint32_t val
-
- kh_uint32_t *table = kh_init_uint32()
-
- # construct the table
- n = len(values)
- kh_resize_uint32(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_uint32(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_uint32(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_uint32(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_uint16(const uint16_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_uint16_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- uint16_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = UInt16Vector()
- table = kh_init_uint16()
-
- kh_resize_uint16(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_uint16_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_uint16(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_uint16(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_uint16(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_uint16(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_uint16(const uint16_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- uint16_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_uint16_t *table = kh_init_uint16()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_uint16(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint16(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint16(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_uint16(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_uint16(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_uint16(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_uint16(const uint16_t[:] arr, const uint16_t[:] values):
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : uint16 ndarray
- values : uint16 ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- uint16_t val
-
- kh_uint16_t *table = kh_init_uint16()
-
- # construct the table
- n = len(values)
- kh_resize_uint16(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_uint16(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_uint16(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_uint16(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_uint8(const uint8_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_uint8_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- uint8_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = UInt8Vector()
- table = kh_init_uint8()
-
- kh_resize_uint8(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_uint8_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_uint8(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_uint8(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_uint8(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_uint8(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_uint8(const uint8_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- uint8_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_uint8_t *table = kh_init_uint8()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_uint8(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint8(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_uint8(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_uint8(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_uint8(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_uint8(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_uint8(const uint8_t[:] arr, const uint8_t[:] values):
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : uint8 ndarray
- values : uint8 ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- uint8_t val
-
- kh_uint8_t *table = kh_init_uint8()
-
- # construct the table
- n = len(values)
- kh_resize_uint8(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_uint8(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_uint8(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_uint8(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_object(ndarray[object] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_pymap_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- object val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = ObjectVector()
- table = kh_init_pymap()
-
- if uses_mask:
- raise NotImplementedError("uses_mask not implemented with object dtype")
-
- kh_resize_pymap(table, n // 10)
-
- for i in range(n):
- val = values[i]
- if not dropna or not checknull(val):
- k = kh_get_pymap(table, <PyObject*>val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_pymap(table, <PyObject*>val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_pymap(table, result_keys.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_pymap(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_object(ndarray[object] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- PyObject* value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_pymap_t *table = kh_init_pymap()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_pymap(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- if True:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = <PyObject*>(values[i])
- kh_put_pymap(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- if True:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = <PyObject*>(values[i])
- kh_put_pymap(table, value, &ret)
- out[i] = ret == 0
-
- else:
- if True:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = <PyObject*>(values[i])
- k = kh_get_pymap(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_pymap(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_pymap(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_object(ndarray[object] arr, ndarray[object] values):
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : object ndarray
- values : object ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- PyObject* val
-
- kh_pymap_t *table = kh_init_pymap()
-
- # construct the table
- n = len(values)
- kh_resize_pymap(table, n)
-
- if True:
- for i in range(n):
- val = <PyObject*>(values[i])
- kh_put_pymap(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- if True:
- for i in range(n):
- val = <PyObject*>(arr[i])
- k = kh_get_pymap(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_pymap(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_int64(const int64_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_int64_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- int64_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Int64Vector()
- table = kh_init_int64()
-
- kh_resize_int64(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_int64_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_int64(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_int64(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_int64(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_int64(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_int64(const int64_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- int64_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_int64_t *table = kh_init_int64()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int64(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int64(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_int64(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_int64(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_int64(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_int64(const int64_t[:] arr, const int64_t[:] values):
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : int64 ndarray
- values : int64 ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- int64_t val
-
- kh_int64_t *table = kh_init_int64()
-
- # construct the table
- n = len(values)
- kh_resize_int64(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_int64(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_int64(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_int64(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_int32(const int32_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_int32_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- int32_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Int32Vector()
- table = kh_init_int32()
-
- kh_resize_int32(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_int32_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_int32(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_int32(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_int32(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_int32(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_int32(const int32_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- int32_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_int32_t *table = kh_init_int32()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_int32(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int32(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int32(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_int32(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_int32(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_int32(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_int32(const int32_t[:] arr, const int32_t[:] values):
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : int32 ndarray
- values : int32 ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- int32_t val
-
- kh_int32_t *table = kh_init_int32()
-
- # construct the table
- n = len(values)
- kh_resize_int32(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_int32(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_int32(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_int32(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_int16(const int16_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_int16_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- int16_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Int16Vector()
- table = kh_init_int16()
-
- kh_resize_int16(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_int16_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_int16(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_int16(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_int16(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_int16(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_int16(const int16_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- int16_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_int16_t *table = kh_init_int16()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_int16(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int16(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int16(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_int16(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_int16(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_int16(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_int16(const int16_t[:] arr, const int16_t[:] values):
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : int16 ndarray
- values : int16 ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- int16_t val
-
- kh_int16_t *table = kh_init_int16()
-
- # construct the table
- n = len(values)
- kh_resize_int16(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_int16(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_int16(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_int16(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef value_count_int8(const int8_t[:] values, bint dropna, const uint8_t[:] mask=None):
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_int8_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- int8_t val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = Int8Vector()
- table = kh_init_int8()
-
- kh_resize_int8(table, n)
-
- for i in range(n):
- val = (values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_int8_t(val)
-
- if not dropna or not isna_entry:
- k = kh_get_int8(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_int8(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- k = kh_get_int8(table, result_keys.data.data[i])
- result_counts[i] = table.vals[k]
-
- kh_destroy_int8(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef duplicated_int8(const int8_t[:] values, object keep='first', const uint8_t[:] mask=None):
- cdef:
- int ret = 0
- int8_t value
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_int8_t *table = kh_init_int8()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_int8(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- if keep == "last":
- with nogil:
- for i in range(n - 1, -1, -1):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int8(table, value, &ret)
- out[i] = ret == 0
- elif keep == "first":
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = (values[i])
- kh_put_int8(table, value, &ret)
- out[i] = ret == 0
-
- else:
- with nogil:
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = (values[i])
- k = kh_get_int8(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_int8(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_int8(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ismember_int8(const int8_t[:] arr, const int8_t[:] values):
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : int8 ndarray
- values : int8 ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- int8_t val
-
- kh_int8_t *table = kh_init_int8()
-
- # construct the table
- n = len(values)
- kh_resize_int8(table, n)
-
- with nogil:
- for i in range(n):
- val = (values[i])
- kh_put_int8(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- with nogil:
- for i in range(n):
- val = (arr[i])
- k = kh_get_int8(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_int8(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-
-ctypedef fused htfunc_t:
- numeric_object_t
- complex128_t
- complex64_t
-
-
-cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
- if htfunc_t is object:
- return value_count_object(values, dropna, mask=mask)
-
- elif htfunc_t is int8_t:
- return value_count_int8(values, dropna, mask=mask)
- elif htfunc_t is int16_t:
- return value_count_int16(values, dropna, mask=mask)
- elif htfunc_t is int32_t:
- return value_count_int32(values, dropna, mask=mask)
- elif htfunc_t is int64_t:
- return value_count_int64(values, dropna, mask=mask)
-
- elif htfunc_t is uint8_t:
- return value_count_uint8(values, dropna, mask=mask)
- elif htfunc_t is uint16_t:
- return value_count_uint16(values, dropna, mask=mask)
- elif htfunc_t is uint32_t:
- return value_count_uint32(values, dropna, mask=mask)
- elif htfunc_t is uint64_t:
- return value_count_uint64(values, dropna, mask=mask)
-
- elif htfunc_t is float64_t:
- return value_count_float64(values, dropna, mask=mask)
- elif htfunc_t is float32_t:
- return value_count_float32(values, dropna, mask=mask)
-
- elif htfunc_t is complex128_t:
- return value_count_complex128(values, dropna, mask=mask)
- elif htfunc_t is complex64_t:
- return value_count_complex64(values, dropna, mask=mask)
-
- else:
- raise TypeError(values.dtype)
-
-
-cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None):
- if htfunc_t is object:
- return duplicated_object(values, keep, mask=mask)
-
- elif htfunc_t is int8_t:
- return duplicated_int8(values, keep, mask=mask)
- elif htfunc_t is int16_t:
- return duplicated_int16(values, keep, mask=mask)
- elif htfunc_t is int32_t:
- return duplicated_int32(values, keep, mask=mask)
- elif htfunc_t is int64_t:
- return duplicated_int64(values, keep, mask=mask)
-
- elif htfunc_t is uint8_t:
- return duplicated_uint8(values, keep, mask=mask)
- elif htfunc_t is uint16_t:
- return duplicated_uint16(values, keep, mask=mask)
- elif htfunc_t is uint32_t:
- return duplicated_uint32(values, keep, mask=mask)
- elif htfunc_t is uint64_t:
- return duplicated_uint64(values, keep, mask=mask)
-
- elif htfunc_t is float64_t:
- return duplicated_float64(values, keep, mask=mask)
- elif htfunc_t is float32_t:
- return duplicated_float32(values, keep, mask=mask)
-
- elif htfunc_t is complex128_t:
- return duplicated_complex128(values, keep, mask=mask)
- elif htfunc_t is complex64_t:
- return duplicated_complex64(values, keep, mask=mask)
-
- else:
- raise TypeError(values.dtype)
-
-
-cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
- if htfunc_t is object:
- return ismember_object(arr, values)
-
- elif htfunc_t is int8_t:
- return ismember_int8(arr, values)
- elif htfunc_t is int16_t:
- return ismember_int16(arr, values)
- elif htfunc_t is int32_t:
- return ismember_int32(arr, values)
- elif htfunc_t is int64_t:
- return ismember_int64(arr, values)
-
- elif htfunc_t is uint8_t:
- return ismember_uint8(arr, values)
- elif htfunc_t is uint16_t:
- return ismember_uint16(arr, values)
- elif htfunc_t is uint32_t:
- return ismember_uint32(arr, values)
- elif htfunc_t is uint64_t:
- return ismember_uint64(arr, values)
-
- elif htfunc_t is float64_t:
- return ismember_float64(arr, values)
- elif htfunc_t is float32_t:
- return ismember_float32(arr, values)
-
- elif htfunc_t is complex128_t:
- return ismember_complex128(arr, values)
- elif htfunc_t is complex64_t:
- return ismember_complex64(arr, values)
-
- else:
- raise TypeError(values.dtype)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
-    # TODO(cython3): use const htfunc_t[:]
-
- cdef:
- ndarray[htfunc_t] keys
- ndarray[htfunc_t] modes
-
- int64_t[::1] counts
- int64_t count, max_count = -1
- Py_ssize_t nkeys, k, j = 0
-
- keys, counts = value_count(values, dropna, mask=mask)
- nkeys = len(keys)
-
- modes = np.empty(nkeys, dtype=values.dtype)
-
- if htfunc_t is not object:
- with nogil:
- for k in range(nkeys):
- count = counts[k]
- if count == max_count:
- j += 1
- elif count > max_count:
- max_count = count
- j = 0
- else:
- continue
-
- modes[j] = keys[k]
- else:
- for k in range(nkeys):
- count = counts[k]
- if count == max_count:
- j += 1
- elif count > max_count:
- max_count = count
- j = 0
- else:
- continue
-
- modes[j] = keys[k]
-
- return modes[:j + 1]
-
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def _unique_label_indices_int64(const int64_t[:] labels) -> ndarray:
- """
- Indices of the first occurrences of the unique labels
-    *excluding* -1. Equivalent to:
- np.unique(labels, return_index=True)[1]
- """
- cdef:
- int ret = 0
- Py_ssize_t i, n = len(labels)
- kh_int64_t *table = kh_init_int64()
- Int64Vector idx = Int64Vector()
- ndarray[int64_t, ndim=1] arr
- Int64VectorData *ud = idx.data
-
- kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- with nogil:
- for i in range(n):
- kh_put_int64(table, labels[i], &ret)
- if ret != 0:
- if needs_resize(ud):
- with gil:
- idx.resize()
- append_data_int64(ud, i)
-
- kh_destroy_int64(table)
-
- arr = idx.to_array()
- arr = arr[np.asarray(labels)[arr].argsort()]
-
- return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def _unique_label_indices_int32(const int32_t[:] labels) -> ndarray:
- """
- Indices of the first occurrences of the unique labels
-    *excluding* -1. Equivalent to:
- np.unique(labels, return_index=True)[1]
- """
- cdef:
- int ret = 0
- Py_ssize_t i, n = len(labels)
- kh_int32_t *table = kh_init_int32()
- Int32Vector idx = Int32Vector()
- ndarray[int32_t, ndim=1] arr
- Int32VectorData *ud = idx.data
-
- kh_resize_int32(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- with nogil:
- for i in range(n):
- kh_put_int32(table, labels[i], &ret)
- if ret != 0:
- if needs_resize(ud):
- with gil:
- idx.resize()
- append_data_int32(ud, i)
-
- kh_destroy_int32(table)
-
- arr = idx.to_array()
- arr = arr[np.asarray(labels)[arr].argsort()]
-
- return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
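For orientation, the cpdef dispatchers deleted above (value_count, duplicated, ismember and mode) are the dtype-dispatching backends that several public pandas operations ultimately call into. Below is a minimal usage sketch in terms of the public API only; the mapping from method to helper is approximate, and the direct _libs.hashtable entry points are internal:

import numpy as np
import pandas as pd

s = pd.Series(np.array([3, 1, 3, 2, 3], dtype=np.int64))

# value_count_int64 is (roughly) the engine behind Series.value_counts;
# keys are tracked in first-seen order (GH39009) before any sorting.
print(s.value_counts(sort=False))

# duplicated_int64 backs Series.duplicated / DataFrame.duplicated.
print(s.duplicated(keep="first").to_numpy())   # [False False  True False  True]

# ismember_int64 backs Series.isin for this dtype.
print(s.isin([2, 3]).to_numpy())               # [ True False  True  True  True]

# mode() dispatches through the htfunc_t fused type shown above.
print(s.mode().to_numpy())                     # [3]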
diff --git a/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in
deleted file mode 100644
index b9cf6011481..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/hashtable_func_helper.pxi.in
+++ /dev/null
@@ -1,484 +0,0 @@
-"""
-Template for each `dtype` helper function for hashtable
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-{{py:
-
-# name, dtype, ttype, c_type, to_c_type
-dtypes = [('Complex128', 'complex128', 'complex128',
- 'khcomplex128_t', 'to_khcomplex128_t'),
- ('Complex64', 'complex64', 'complex64',
- 'khcomplex64_t', 'to_khcomplex64_t'),
- ('Float64', 'float64', 'float64', 'float64_t', ''),
- ('Float32', 'float32', 'float32', 'float32_t', ''),
- ('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
- ('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
- ('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
- ('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
- ('Object', 'object', 'pymap', 'object', '<PyObject*>'),
- ('Int64', 'int64', 'int64', 'int64_t', ''),
- ('Int32', 'int32', 'int32', 'int32_t', ''),
- ('Int16', 'int16', 'int16', 'int16_t', ''),
- ('Int8', 'int8', 'int8', 'int8_t', '')]
-
-}}
-
-{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-{{if dtype == 'object'}}
-cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
-{{else}}
-cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
-{{endif}}
- cdef:
- Py_ssize_t i = 0
- Py_ssize_t n = len(values)
- kh_{{ttype}}_t *table
-
- # Don't use Py_ssize_t, since table.n_buckets is unsigned
- khiter_t k
-
- {{c_type}} val
-
- int ret = 0
- bint uses_mask = mask is not None
- bint isna_entry = False
-
- if uses_mask and not dropna:
- raise NotImplementedError("uses_mask not implemented with dropna=False")
-
- # we track the order in which keys are first seen (GH39009),
- # khash-map isn't insertion-ordered, thus:
- # table maps keys to counts
- # result_keys remembers the original order of keys
-
- result_keys = {{name}}Vector()
- table = kh_init_{{ttype}}()
-
- {{if dtype == 'object'}}
- if uses_mask:
- raise NotImplementedError("uses_mask not implemented with object dtype")
-
- kh_resize_{{ttype}}(table, n // 10)
-
- for i in range(n):
- val = values[i]
- if not dropna or not checknull(val):
- k = kh_get_{{ttype}}(table, {{to_c_type}}val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
- {{else}}
- kh_resize_{{ttype}}(table, n)
-
- for i in range(n):
- val = {{to_c_type}}(values[i])
-
- if dropna:
- if uses_mask:
- isna_entry = mask[i]
- else:
- isna_entry = is_nan_{{c_type}}(val)
-
- if not dropna or not isna_entry:
- k = kh_get_{{ttype}}(table, val)
- if k != table.n_buckets:
- table.vals[k] += 1
- else:
- k = kh_put_{{ttype}}(table, val, &ret)
- table.vals[k] = 1
- result_keys.append(val)
- {{endif}}
-
- # collect counts in the order corresponding to result_keys:
- cdef:
- int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)
-
- for i in range(table.size):
- {{if dtype == 'object'}}
- k = kh_get_{{ttype}}(table, result_keys.data[i])
- {{else}}
- k = kh_get_{{ttype}}(table, result_keys.data.data[i])
- {{endif}}
- result_counts[i] = table.vals[k]
-
- kh_destroy_{{ttype}}(table)
-
- return result_keys.to_array(), result_counts.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-{{if dtype == 'object'}}
-cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
-{{else}}
-cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
-{{endif}}
- cdef:
- int ret = 0
- {{if dtype != 'object'}}
- {{c_type}} value
- {{else}}
- PyObject* value
- {{endif}}
- Py_ssize_t i, n = len(values), first_na = -1
- khiter_t k
- kh_{{ttype}}_t *table = kh_init_{{ttype}}()
- ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
- bint seen_na = False, uses_mask = mask is not None
- bint seen_multiple_na = False
-
- kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- if keep not in ('last', 'first', False):
- raise ValueError('keep must be either "first", "last" or False')
-
- {{for cond, keep in [('if', '"last"'), ('elif', '"first"')]}}
- {{cond}} keep == {{keep}}:
- {{if dtype == 'object'}}
- if True:
- {{else}}
- with nogil:
- {{endif}}
- {{if keep == '"last"'}}
- for i in range(n - 1, -1, -1):
- {{else}}
- for i in range(n):
- {{endif}}
- if uses_mask and mask[i]:
- if seen_na:
- out[i] = True
- else:
- out[i] = False
- seen_na = True
- else:
- value = {{to_c_type}}(values[i])
- kh_put_{{ttype}}(table, value, &ret)
- out[i] = ret == 0
- {{endfor}}
-
- else:
- {{if dtype == 'object'}}
- if True:
- {{else}}
- with nogil:
- {{endif}}
- for i in range(n):
- if uses_mask and mask[i]:
- if not seen_na:
- first_na = i
- seen_na = True
- out[i] = 0
- elif not seen_multiple_na:
- out[i] = 1
- out[first_na] = 1
- seen_multiple_na = True
- else:
- out[i] = 1
-
- else:
- value = {{to_c_type}}(values[i])
- k = kh_get_{{ttype}}(table, value)
- if k != table.n_buckets:
- out[table.vals[k]] = 1
- out[i] = 1
- else:
- k = kh_put_{{ttype}}(table, value, &ret)
- table.vals[k] = i
- out[i] = 0
-
- kh_destroy_{{ttype}}(table)
- return out
-
-
-# ----------------------------------------------------------------------
-# Membership
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-{{if dtype == 'object'}}
-cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
-{{else}}
-cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
-{{endif}}
- """
-    Return a boolean array indicating whether each
-    element of arr is present in values.
-
- Parameters
- ----------
- arr : {{dtype}} ndarray
- values : {{dtype}} ndarray
-
- Returns
- -------
-    boolean ndarray of the same length as arr
- """
- cdef:
- Py_ssize_t i, n
- khiter_t k
- int ret = 0
- ndarray[uint8_t] result
-
- {{if dtype == "object"}}
- PyObject* val
- {{else}}
- {{c_type}} val
- {{endif}}
-
- kh_{{ttype}}_t *table = kh_init_{{ttype}}()
-
- # construct the table
- n = len(values)
- kh_resize_{{ttype}}(table, n)
-
- {{if dtype == 'object'}}
- if True:
- {{else}}
- with nogil:
- {{endif}}
- for i in range(n):
- val = {{to_c_type}}(values[i])
- kh_put_{{ttype}}(table, val, &ret)
-
- # test membership
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
-
- {{if dtype == 'object'}}
- if True:
- {{else}}
- with nogil:
- {{endif}}
- for i in range(n):
- val = {{to_c_type}}(arr[i])
- k = kh_get_{{ttype}}(table, val)
- result[i] = (k != table.n_buckets)
-
- kh_destroy_{{ttype}}(table)
- return result.view(np.bool_)
-
-# ----------------------------------------------------------------------
-# Mode Computations
-# ----------------------------------------------------------------------
-
-{{endfor}}
-
-
-ctypedef fused htfunc_t:
- numeric_object_t
- complex128_t
- complex64_t
-
-
-cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
- if htfunc_t is object:
- return value_count_object(values, dropna, mask=mask)
-
- elif htfunc_t is int8_t:
- return value_count_int8(values, dropna, mask=mask)
- elif htfunc_t is int16_t:
- return value_count_int16(values, dropna, mask=mask)
- elif htfunc_t is int32_t:
- return value_count_int32(values, dropna, mask=mask)
- elif htfunc_t is int64_t:
- return value_count_int64(values, dropna, mask=mask)
-
- elif htfunc_t is uint8_t:
- return value_count_uint8(values, dropna, mask=mask)
- elif htfunc_t is uint16_t:
- return value_count_uint16(values, dropna, mask=mask)
- elif htfunc_t is uint32_t:
- return value_count_uint32(values, dropna, mask=mask)
- elif htfunc_t is uint64_t:
- return value_count_uint64(values, dropna, mask=mask)
-
- elif htfunc_t is float64_t:
- return value_count_float64(values, dropna, mask=mask)
- elif htfunc_t is float32_t:
- return value_count_float32(values, dropna, mask=mask)
-
- elif htfunc_t is complex128_t:
- return value_count_complex128(values, dropna, mask=mask)
- elif htfunc_t is complex64_t:
- return value_count_complex64(values, dropna, mask=mask)
-
- else:
- raise TypeError(values.dtype)
-
-
-cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None):
- if htfunc_t is object:
- return duplicated_object(values, keep, mask=mask)
-
- elif htfunc_t is int8_t:
- return duplicated_int8(values, keep, mask=mask)
- elif htfunc_t is int16_t:
- return duplicated_int16(values, keep, mask=mask)
- elif htfunc_t is int32_t:
- return duplicated_int32(values, keep, mask=mask)
- elif htfunc_t is int64_t:
- return duplicated_int64(values, keep, mask=mask)
-
- elif htfunc_t is uint8_t:
- return duplicated_uint8(values, keep, mask=mask)
- elif htfunc_t is uint16_t:
- return duplicated_uint16(values, keep, mask=mask)
- elif htfunc_t is uint32_t:
- return duplicated_uint32(values, keep, mask=mask)
- elif htfunc_t is uint64_t:
- return duplicated_uint64(values, keep, mask=mask)
-
- elif htfunc_t is float64_t:
- return duplicated_float64(values, keep, mask=mask)
- elif htfunc_t is float32_t:
- return duplicated_float32(values, keep, mask=mask)
-
- elif htfunc_t is complex128_t:
- return duplicated_complex128(values, keep, mask=mask)
- elif htfunc_t is complex64_t:
- return duplicated_complex64(values, keep, mask=mask)
-
- else:
- raise TypeError(values.dtype)
-
-
-cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
- if htfunc_t is object:
- return ismember_object(arr, values)
-
- elif htfunc_t is int8_t:
- return ismember_int8(arr, values)
- elif htfunc_t is int16_t:
- return ismember_int16(arr, values)
- elif htfunc_t is int32_t:
- return ismember_int32(arr, values)
- elif htfunc_t is int64_t:
- return ismember_int64(arr, values)
-
- elif htfunc_t is uint8_t:
- return ismember_uint8(arr, values)
- elif htfunc_t is uint16_t:
- return ismember_uint16(arr, values)
- elif htfunc_t is uint32_t:
- return ismember_uint32(arr, values)
- elif htfunc_t is uint64_t:
- return ismember_uint64(arr, values)
-
- elif htfunc_t is float64_t:
- return ismember_float64(arr, values)
- elif htfunc_t is float32_t:
- return ismember_float32(arr, values)
-
- elif htfunc_t is complex128_t:
- return ismember_complex128(arr, values)
- elif htfunc_t is complex64_t:
- return ismember_complex64(arr, values)
-
- else:
- raise TypeError(values.dtype)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
-    # TODO(cython3): use const htfunc_t[:]
-
- cdef:
- ndarray[htfunc_t] keys
- ndarray[htfunc_t] modes
-
- int64_t[::1] counts
- int64_t count, max_count = -1
- Py_ssize_t nkeys, k, j = 0
-
- keys, counts = value_count(values, dropna, mask=mask)
- nkeys = len(keys)
-
- modes = np.empty(nkeys, dtype=values.dtype)
-
- if htfunc_t is not object:
- with nogil:
- for k in range(nkeys):
- count = counts[k]
- if count == max_count:
- j += 1
- elif count > max_count:
- max_count = count
- j = 0
- else:
- continue
-
- modes[j] = keys[k]
- else:
- for k in range(nkeys):
- count = counts[k]
- if count == max_count:
- j += 1
- elif count > max_count:
- max_count = count
- j = 0
- else:
- continue
-
- modes[j] = keys[k]
-
- return modes[:j + 1]
-
-
-{{py:
-
-# name, dtype, ttype, c_type
-dtypes = [('Int64', 'int64', 'int64', 'int64_t'),
- ('Int32', 'int32', 'int32', 'int32_t'), ]
-
-}}
-
-{{for name, dtype, ttype, c_type in dtypes}}
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
- """
- Indices of the first occurrences of the unique labels
-    *excluding* -1. Equivalent to:
- np.unique(labels, return_index=True)[1]
- """
- cdef:
- int ret = 0
- Py_ssize_t i, n = len(labels)
- kh_{{ttype}}_t *table = kh_init_{{ttype}}()
- {{name}}Vector idx = {{name}}Vector()
- ndarray[{{c_type}}, ndim=1] arr
- {{name}}VectorData *ud = idx.data
-
- kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
-
- with nogil:
- for i in range(n):
- kh_put_{{ttype}}(table, labels[i], &ret)
- if ret != 0:
- if needs_resize(ud):
- with gil:
- idx.resize()
- append_data_{{ttype}}(ud, i)
-
- kh_destroy_{{ttype}}(table)
-
- arr = idx.to_array()
- arr = arr[np.asarray(labels)[arr].argsort()]
-
- return arr[1:] if arr.size != 0 and labels[arr[0]] == -1 else arr
-
-{{endfor}}
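The .pxi.in file above is a Tempita template: at build time the {{py: ...}} block is executed and each {{for ...}} loop is expanded into the per-dtype code that appears in the generated hashtable_func_helper.pxi earlier in this diff. A minimal sketch of that expansion step, assuming the Tempita copy bundled with Cython (the actual pandas build drives this through its own generator scripts, so the entry point here is illustrative only):

# Illustrative expansion of a Tempita-style template into per-dtype code.
from Cython import Tempita  # Cython ships a bundled copy of Tempita

template = """
{{py:
dtypes = [('Int64', 'int64'), ('Int32', 'int32')]
}}
{{for name, dtype in dtypes}}
cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
    ...  # body omitted; see the generated .pxi earlier in this diff
{{endfor}}
"""

print(Tempita.sub(template))  # emits one ismember_<dtype> stub per entry in dtypes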
diff --git a/contrib/python/pandas/py3/pandas/_libs/index.pyi b/contrib/python/pandas/py3/pandas/_libs/index.pyi
deleted file mode 100644
index e08faaaa031..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/index.pyi
+++ /dev/null
@@ -1,105 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-from pandas import MultiIndex
-from pandas.core.arrays import ExtensionArray
-
-class IndexEngine:
- over_size_threshold: bool
- def __init__(self, values: np.ndarray) -> None: ...
- def __contains__(self, val: object) -> bool: ...
-
- # -> int | slice | np.ndarray[bool]
- def get_loc(self, val: object) -> int | slice | np.ndarray: ...
- def sizeof(self, deep: bool = ...) -> int: ...
- def __sizeof__(self) -> int: ...
- @property
- def is_unique(self) -> bool: ...
- @property
- def is_monotonic_increasing(self) -> bool: ...
- @property
- def is_monotonic_decreasing(self) -> bool: ...
- @property
- def is_mapping_populated(self) -> bool: ...
- def clear_mapping(self): ...
- def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
- def get_indexer_non_unique(
- self,
- targets: np.ndarray,
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-
-class MaskedIndexEngine(IndexEngine):
- def __init__(self, values: object) -> None: ...
- def get_indexer_non_unique(
- self, targets: object
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-
-class Float64Engine(IndexEngine): ...
-class Float32Engine(IndexEngine): ...
-class Complex128Engine(IndexEngine): ...
-class Complex64Engine(IndexEngine): ...
-class Int64Engine(IndexEngine): ...
-class Int32Engine(IndexEngine): ...
-class Int16Engine(IndexEngine): ...
-class Int8Engine(IndexEngine): ...
-class UInt64Engine(IndexEngine): ...
-class UInt32Engine(IndexEngine): ...
-class UInt16Engine(IndexEngine): ...
-class UInt8Engine(IndexEngine): ...
-class ObjectEngine(IndexEngine): ...
-class DatetimeEngine(Int64Engine): ...
-class TimedeltaEngine(DatetimeEngine): ...
-class PeriodEngine(Int64Engine): ...
-class BoolEngine(UInt8Engine): ...
-class MaskedFloat64Engine(MaskedIndexEngine): ...
-class MaskedFloat32Engine(MaskedIndexEngine): ...
-class MaskedComplex128Engine(MaskedIndexEngine): ...
-class MaskedComplex64Engine(MaskedIndexEngine): ...
-class MaskedInt64Engine(MaskedIndexEngine): ...
-class MaskedInt32Engine(MaskedIndexEngine): ...
-class MaskedInt16Engine(MaskedIndexEngine): ...
-class MaskedInt8Engine(MaskedIndexEngine): ...
-class MaskedUInt64Engine(MaskedIndexEngine): ...
-class MaskedUInt32Engine(MaskedIndexEngine): ...
-class MaskedUInt16Engine(MaskedIndexEngine): ...
-class MaskedUInt8Engine(MaskedIndexEngine): ...
-class MaskedBoolEngine(MaskedUInt8Engine): ...
-
-class BaseMultiIndexCodesEngine:
- levels: list[np.ndarray]
- offsets: np.ndarray # ndarray[uint64_t, ndim=1]
-
- def __init__(
- self,
- levels: list[np.ndarray], # all entries hashable
- labels: list[np.ndarray], # all entries integer-dtyped
- offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
- ) -> None: ...
- def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
- def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
- def get_indexer_with_fill(
- self,
- target: np.ndarray, # np.ndarray[object] of tuples
- values: np.ndarray, # np.ndarray[object] of tuples
- method: str,
- limit: int | None,
- ) -> npt.NDArray[np.intp]: ...
-
-class ExtensionEngine:
- def __init__(self, values: ExtensionArray) -> None: ...
- def __contains__(self, val: object) -> bool: ...
- def get_loc(self, val: object) -> int | slice | np.ndarray: ...
- def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
- def get_indexer_non_unique(
- self,
- targets: np.ndarray,
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
- @property
- def is_unique(self) -> bool: ...
- @property
- def is_monotonic_increasing(self) -> bool: ...
- @property
- def is_monotonic_decreasing(self) -> bool: ...
- def sizeof(self, deep: bool = ...) -> int: ...
- def clear_mapping(self): ...
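To connect the index.pyi stub above with runtime behaviour, a small sketch using the private Index._engine attribute (internal API, used here purely for illustration; the exact engine class depends on the index dtype and pandas version). The slice return value illustrates the int | slice | np.ndarray contract of get_loc and the _get_loc_duplicates path in index.pyx below:

import numpy as np
import pandas as pd

idx = pd.Index(np.array([10, 20, 20, 30], dtype=np.int64))
eng = idx._engine                    # typically an Int64Engine for this dtype

print(type(eng).__name__)            # e.g. Int64Engine
print(eng.is_unique)                 # False: 20 appears twice
print(eng.is_monotonic_increasing)   # True
print(eng.get_loc(30))               # 3  (unique value -> integer position)
print(eng.get_loc(20))               # slice(1, 3, None)  (contiguous duplicates -> slice)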
diff --git a/contrib/python/pandas/py3/pandas/_libs/index.pyx b/contrib/python/pandas/py3/pandas/_libs/index.pyx
deleted file mode 100644
index 1b42ad1c0fd..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/index.pyx
+++ /dev/null
@@ -1,1280 +0,0 @@
-cimport cython
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- intp_t,
- ndarray,
- uint8_t,
- uint64_t,
-)
-
-cnp.import_array()
-
-
-from pandas._libs cimport util
-from pandas._libs.hashtable cimport HashTable
-from pandas._libs.tslibs.nattype cimport c_NaT as NaT
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- get_unit_from_dtype,
-)
-from pandas._libs.tslibs.period cimport is_period_object
-from pandas._libs.tslibs.timedeltas cimport _Timedelta
-from pandas._libs.tslibs.timestamps cimport _Timestamp
-
-from pandas._libs import (
- algos,
- hashtable as _hash,
-)
-
-from pandas._libs.lib cimport eq_NA_compat
-from pandas._libs.missing cimport (
- C_NA,
- checknull,
- is_matching_na,
-)
-
-# Defines shift of MultiIndex codes to avoid negative codes (missing values)
-multiindex_nulls_shift = 2
-
-
-cdef bint is_definitely_invalid_key(object val):
- try:
- hash(val)
- except TypeError:
- return True
- return False
-
-
-cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
- """
-    Return an ndarray[bool] of locations where val matches self.values.
-
-    If val is not NA, this is equivalent to `self.values == val`.
- """
- # Caller is responsible for ensuring _check_type has already been called
- cdef:
- ndarray[uint8_t, ndim=1, cast=True] indexer
- Py_ssize_t i
- object item
-
- if values.descr.type_num == cnp.NPY_OBJECT:
- assert mask is None # no mask for object dtype
- # i.e. values.dtype == object
- if not checknull(val):
- indexer = eq_NA_compat(values, val)
-
- else:
- # We need to check for _matching_ NA values
- indexer = np.empty(len(values), dtype=np.uint8)
-
- for i in range(len(values)):
- item = values[i]
- indexer[i] = is_matching_na(item, val)
-
- else:
- if mask is not None:
- if val is C_NA:
- indexer = mask == 1
- else:
- indexer = (values == val) & ~mask
- else:
- if util.is_nan(val):
- indexer = np.isnan(values)
- else:
- indexer = values == val
-
- return indexer.view(bool)
-
-
-# Don't populate hash tables in monotonic indexes larger than this
-_SIZE_CUTOFF = 1_000_000
-
-
-cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
- """
- Possibly unpack a boolean mask to a single indexer.
- """
- # Returns ndarray[bool] or int
- cdef:
- ndarray[intp_t, ndim=1] found
- int count
-
- found = np.where(indexer)[0]
- count = len(found)
-
- if count > 1:
- return indexer
- if count == 1:
- return int(found[0])
-
- raise KeyError(val)
-
-
-@cython.freelist(32)
-cdef class IndexEngine:
-
- cdef readonly:
- ndarray values
- ndarray mask
- HashTable mapping
- bint over_size_threshold
-
- cdef:
- bint unique, monotonic_inc, monotonic_dec
- bint need_monotonic_check, need_unique_check
- object _np_type
-
- def __init__(self, ndarray values):
- self.values = values
- self.mask = None
-
- self.over_size_threshold = len(values) >= _SIZE_CUTOFF
- self.clear_mapping()
- self._np_type = values.dtype.type
-
- def __contains__(self, val: object) -> bool:
- hash(val)
- try:
- self.get_loc(val)
- except KeyError:
- return False
- return True
-
- cpdef get_loc(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t loc
-
- if is_definitely_invalid_key(val):
- raise TypeError(f"'{val}' is an invalid key")
-
- val = self._check_type(val)
-
- if self.over_size_threshold and self.is_monotonic_increasing:
- if not self.is_unique:
- return self._get_loc_duplicates(val)
- values = self.values
-
- loc = self._searchsorted_left(val)
- if loc >= len(values):
- raise KeyError(val)
- if values[loc] != val:
- raise KeyError(val)
- return loc
-
- self._ensure_mapping_populated()
- if not self.unique:
- return self._get_loc_duplicates(val)
- if self.mask is not None and val is C_NA:
- return self.mapping.get_na()
-
- try:
- return self.mapping.get_item(val)
- except OverflowError as err:
- # GH#41775 OverflowError e.g. if we are uint64 and val is -1
- # or if we are int64 and value is np.iinfo(np.int64).max+1
- # (the uint64 with -1 case should actually be excluded by _check_type)
- raise KeyError(val) from err
-
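get_loc above chooses between two strategies: for indexes past _SIZE_CUTOFF that are monotonic it binary-searches and never builds a hash table, otherwise it populates the mapping and does a hashed lookup. A pure-Python sketch of that decision (illustrative; the duplicate and NA handling of the real engine is omitted, and the dict is rebuilt on every call rather than cached):

import numpy as np

_SIZE_CUTOFF = 1_000_000  # same threshold as above

def get_loc_sketch(values, val):
    values = np.asarray(values)
    monotonic = bool(np.all(values[:-1] <= values[1:]))
    if len(values) >= _SIZE_CUTOFF and monotonic:
        # fast path: binary search, no hash table is ever built
        loc = int(np.searchsorted(values, val, side="left"))
        if loc == len(values) or values[loc] != val:
            raise KeyError(val)
        return loc
    # small or unsorted: populate a mapping and look the key up directly
    mapping = {v: i for i, v in enumerate(values.tolist())}
    return mapping[val]  # KeyError if absent
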
- cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
- """
- See ObjectEngine._searchsorted_left.__doc__.
- """
- # Caller is responsible for ensuring _check_type has already been called
- loc = self.values.searchsorted(self._np_type(val), side="left")
- return loc
-
- cdef _get_loc_duplicates(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t diff, left, right
-
- if self.is_monotonic_increasing:
- values = self.values
- try:
- left = values.searchsorted(val, side="left")
- right = values.searchsorted(val, side="right")
- except TypeError:
- # e.g. GH#29189 get_loc(None) with a Float64Index
- # 2021-09-29 Now only reached for object-dtype
- raise KeyError(val)
-
- diff = right - left
- if diff == 0:
- raise KeyError(val)
- elif diff == 1:
- return left
- else:
- return slice(left, right)
-
- return self._maybe_get_bool_indexer(val)
-
- cdef _maybe_get_bool_indexer(self, object val):
- # Returns ndarray[bool] or int
- cdef:
- ndarray[uint8_t, ndim=1, cast=True] indexer
-
- indexer = _get_bool_indexer(self.values, val, self.mask)
- return _unpack_bool_indexer(indexer, val)
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the sizeof our mapping """
- if not self.is_mapping_populated:
- return 0
- return self.mapping.sizeof(deep=deep)
-
- def __sizeof__(self) -> int:
- return self.sizeof()
-
- @property
- def is_unique(self) -> bool:
- if self.need_unique_check:
- self._do_unique_check()
-
- return self.unique == 1
-
- cdef _do_unique_check(self):
- self._ensure_mapping_populated()
-
- @property
- def is_monotonic_increasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
-
- return self.monotonic_inc == 1
-
- @property
- def is_monotonic_decreasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
-
- return self.monotonic_dec == 1
-
- cdef _do_monotonic_check(self):
- cdef:
- bint is_unique
- if self.mask is not None and np.any(self.mask):
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- else:
- try:
- values = self.values
- self.monotonic_inc, self.monotonic_dec, is_unique = \
- self._call_monotonic(values)
- except TypeError:
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- is_unique = 0
-
- self.need_monotonic_check = 0
-
- # we can only be sure of uniqueness if is_unique=1
- if is_unique:
- self.unique = 1
- self.need_unique_check = 0
-
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=False)
-
- cdef _make_hash_table(self, Py_ssize_t n):
- raise NotImplementedError # pragma: no cover
-
- cdef _check_type(self, object val):
- hash(val)
- return val
-
- @property
- def is_mapping_populated(self) -> bool:
- return self.mapping is not None
-
- cdef _ensure_mapping_populated(self):
- # this populates the mapping
- # if it's not already populated
- # also satisfies the need_unique_check
-
- if not self.is_mapping_populated:
-
- values = self.values
- self.mapping = self._make_hash_table(len(values))
- self.mapping.map_locations(values, self.mask)
-
- if len(self.mapping) == len(values):
- self.unique = 1
-
- self.need_unique_check = 0
-
- def clear_mapping(self):
- self.mapping = None
- self.need_monotonic_check = 1
- self.need_unique_check = 1
-
- self.unique = 0
- self.monotonic_inc = 0
- self.monotonic_dec = 0
-
- def get_indexer(self, ndarray values) -> np.ndarray:
- self._ensure_mapping_populated()
- return self.mapping.lookup(values)
-
- def get_indexer_non_unique(self, ndarray targets):
- """
- Return an indexer suitable for taking from a non-unique index:
- the labels are returned in the same order as the target, along with
- a missing indexer into the targets (which corresponds to the -1
- indices in the results).
-
- Returns
- -------
- indexer : np.ndarray[np.intp]
- missing : np.ndarray[np.intp]
- """
- cdef:
- ndarray values
- ndarray[intp_t] result, missing
- set stargets, remaining_stargets, found_nas
- dict d = {}
- object val
- Py_ssize_t count = 0, count_missing = 0
- Py_ssize_t i, j, n, n_t, n_alloc, start, end
- bint check_na_values = False
-
- values = self.values
- stargets = set(targets)
-
- n = len(values)
- n_t = len(targets)
- if n > 10_000:
- n_alloc = 10_000
- else:
- n_alloc = n
-
- result = np.empty(n_alloc, dtype=np.intp)
- missing = np.empty(n_t, dtype=np.intp)
-
- # map each starget to its position in the index
- if (
- stargets and
- len(stargets) < 5 and
- not any([checknull(t) for t in stargets]) and
- self.is_monotonic_increasing
- ):
- # if there are few enough stargets and the index is monotonically
- # increasing, then use binary search for each starget
- remaining_stargets = set()
- for starget in stargets:
- try:
- start = values.searchsorted(starget, side="left")
- end = values.searchsorted(starget, side="right")
- except TypeError: # e.g. if we tried to search for string in int array
- remaining_stargets.add(starget)
- else:
- if start != end:
- d[starget] = list(range(start, end))
-
- stargets = remaining_stargets
-
- if stargets:
- # otherwise, map by iterating through all items in the index
-
- # short-circuit na check
- if values.dtype == object:
- check_na_values = True
- # keep track of nas in values
- found_nas = set()
-
- for i in range(n):
- val = values[i]
-
- # GH#43870
- # handle lookup for nas
- # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
- if check_na_values and checknull(val):
- match = [na for na in found_nas if is_matching_na(val, na)]
-
- # matching na not found
- if not len(match):
- found_nas.add(val)
-
- # add na to stargets to utilize `in` for stargets/d lookup
- match_stargets = [
- x for x in stargets if is_matching_na(val, x)
- ]
-
- if len(match_stargets):
- # add our 'standardized' na
- stargets.add(val)
-
- # matching na found
- else:
- assert len(match) == 1
- val = match[0]
-
- if val in stargets:
- if val not in d:
- d[val] = []
- d[val].append(i)
-
- for i in range(n_t):
- val = targets[i]
-
- # ensure there are nas in values before looking for a matching na
- if check_na_values and checknull(val):
- match = [na for na in found_nas if is_matching_na(val, na)]
- if len(match):
- assert len(match) == 1
- val = match[0]
-
- # found
- if val in d:
- key = val
-
- for j in d[key]:
-
- # realloc if needed
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
-
- result[count] = j
- count += 1
-
- # value not found
- else:
-
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
- result[count] = -1
- count += 1
- missing[count_missing] = i
- count_missing += 1
-
- return result[0:count], missing[0:count_missing]
-
-
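A pure-Python sketch of the get_indexer_non_unique contract documented above, ignoring NA matching and the 10_000-element reallocation strategy (the helper name is made up for illustration):

import numpy as np

def get_indexer_non_unique_demo(values, targets):
    indexer, missing = [], []
    for i, t in enumerate(targets):
        locs = [j for j, v in enumerate(values) if v == t]
        if locs:
            indexer.extend(locs)   # every matching position, in index order
        else:
            indexer.append(-1)     # not found
            missing.append(i)
    return np.array(indexer, dtype=np.intp), np.array(missing, dtype=np.intp)

idx, miss = get_indexer_non_unique_demo(["a", "b", "a", "c"], ["a", "x", "c"])
# idx -> [0, 2, -1, 3], miss -> [1]
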
-cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
- # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
- # (treats a tuple `val` as a sequence of keys instead of a single key),
- # so we implement something similar.
- # This is equivalent to the stdlib's bisect.bisect_left
-
- cdef:
- Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
- object pval
-
- if hi == 0 or (hi > 0 and val > values[hi]):
- return len(values)
-
- while lo < hi:
- mid = (lo + hi) // 2
- pval = values[mid]
- if val < pval:
- hi = mid
- elif val > pval:
- lo = mid + 1
- else:
- while mid > 0 and val == values[mid - 1]:
- mid -= 1
- return mid
-
- if val <= values[mid]:
- return mid
- else:
- return mid + 1
-
-
-cdef class ObjectEngine(IndexEngine):
- """
- Index Engine for use with object-dtype Index, namely the base class Index.
- """
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.PyObjectHashTable(n)
-
- cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
- # using values.searchsorted here would treat a tuple `val` as a sequence
- # instead of a single key, so we use a different implementation
- try:
- loc = _bin_search(self.values, val)
- except TypeError as err:
- raise KeyError(val) from err
- return loc
-
-
-cdef class DatetimeEngine(Int64Engine):
-
- cdef:
- NPY_DATETIMEUNIT _creso
-
- def __init__(self, ndarray values):
- super().__init__(values.view("i8"))
- self._creso = get_unit_from_dtype(values.dtype)
-
- cdef int64_t _unbox_scalar(self, scalar) except? -1:
- # NB: caller is responsible for ensuring tzawareness compat
- # before we get here
- if scalar is NaT:
- return NaT._value
- elif isinstance(scalar, _Timestamp):
- if scalar._creso == self._creso:
- return scalar._value
- else:
- # Note: caller is responsible for catching potential ValueError
- # from _as_creso
- return (
- (<_Timestamp>scalar)._as_creso(self._creso, round_ok=False)._value
- )
- raise TypeError(scalar)
-
- def __contains__(self, val: object) -> bool:
- # We assume before we get here:
- # - val is hashable
- try:
- self._unbox_scalar(val)
- except ValueError:
- return False
-
- try:
- self.get_loc(val)
- return True
- except KeyError:
- return False
-
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=True)
-
- cpdef get_loc(self, object val):
- # NB: the caller is responsible for ensuring that we are called
- # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)
-
- cdef:
- Py_ssize_t loc
-
- if is_definitely_invalid_key(val):
- raise TypeError(f"'{val}' is an invalid key")
-
- try:
- conv = self._unbox_scalar(val)
- except (TypeError, ValueError) as err:
- raise KeyError(val) from err
-
- # Welcome to the spaghetti factory
- if self.over_size_threshold and self.is_monotonic_increasing:
- if not self.is_unique:
- return self._get_loc_duplicates(conv)
- values = self.values
-
- loc = values.searchsorted(conv, side="left")
-
- if loc == len(values) or values[loc] != conv:
- raise KeyError(val)
- return loc
-
- self._ensure_mapping_populated()
- if not self.unique:
- return self._get_loc_duplicates(conv)
-
- try:
- return self.mapping.get_item(conv)
- except KeyError:
- raise KeyError(val)
-
-
-cdef class TimedeltaEngine(DatetimeEngine):
-
- cdef int64_t _unbox_scalar(self, scalar) except? -1:
- if scalar is NaT:
- return NaT._value
- elif isinstance(scalar, _Timedelta):
- if scalar._creso == self._creso:
- return scalar._value
- else:
- # Note: caller is responsible for catching potential ValueError
- # from _as_creso
- return (
- (<_Timedelta>scalar)._as_creso(self._creso, round_ok=False)._value
- )
- raise TypeError(scalar)
-
-
-cdef class PeriodEngine(Int64Engine):
-
- cdef int64_t _unbox_scalar(self, scalar) except? -1:
- if scalar is NaT:
- return scalar._value
- if is_period_object(scalar):
- # NB: we assume that we have the correct freq here.
- return scalar.ordinal
- raise TypeError(scalar)
-
- cpdef get_loc(self, object val):
- # NB: the caller is responsible for ensuring that we are called
- # with either a Period or NaT
- cdef:
- int64_t conv
-
- try:
- conv = self._unbox_scalar(val)
- except TypeError:
- raise KeyError(val)
-
- return Int64Engine.get_loc(self, conv)
-
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=True)
-
-
-cdef class BaseMultiIndexCodesEngine:
- """
- Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
- represent each label in a MultiIndex as an integer, by juxtaposing the bits
- encoding each level, with appropriate offsets.
-
- For instance: if 3 levels have respectively 3, 6 and 1 possible values,
- then their labels can be represented using respectively 2, 3 and 1 bits,
- as follows:
- _ _ _ _____ _ __ __ __
- |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
- — — — ————— — —— —— ——
- |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
- — — — ————— — —— —— ——
- |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
- ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
- and the resulting unsigned integer representation will be:
- _ _ _ _____ _ __ __ __ __ __ __
- |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
- ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
-
- Offsets are calculated at initialization, labels are transformed by method
- _codes_to_ints.
-
- Keys are located by first locating each component against the respective
- level, then locating (the integer representation of) codes.
- """
- def __init__(self, object levels, object labels,
- ndarray[uint64_t, ndim=1] offsets):
- """
- Parameters
- ----------
- levels : list-like of numpy arrays
- Levels of the MultiIndex.
- labels : list-like of numpy arrays of integer dtype
- Labels of the MultiIndex.
- offsets : numpy array of uint64 dtype
- Pre-calculated offsets, one for each level of the index.
- """
- self.levels = levels
- self.offsets = offsets
-
- # Transform labels into a single array, and add 2 so that we are working
- # with positive integers (-1 for NaN becomes 1). This enables us to
- # differentiate between values that are missing in other and matching
- # NaNs. We will set values that are not found to 0 later:
- labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
- codes = labels_arr.astype("uint64", copy=False)
- self.level_has_nans = [-1 in lab for lab in labels]
-
- # Map each codes combination in the index to an integer unambiguously
- # (no collisions possible), based on the "offsets", which describe the
- # number of bits to switch labels for each level:
- lab_ints = self._codes_to_ints(codes)
-
- # Initialize underlying index (e.g. libindex.UInt64Engine) with
- # integers representing labels: we will use its get_loc and get_indexer
- self._base.__init__(self, lab_ints)
-
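The bit packing described in the class docstring can be worked by hand. Using the docstring's example (levels with 3, 6 and 1 possible values need 2, 3 and 1 bits, giving offsets 0, 2 and 5), one combination of per-level codes packs into a single integer as below; this is a hand-worked illustration only and ignores the multiindex_nulls_shift applied to the raw codes by the real engine:

def codes_to_int(codes, offsets=(0, 2, 5)):
    # shift each level's code by its bit offset; the bit ranges are
    # disjoint, so summing is the same as OR-ing them together
    return sum(code << off for code, off in zip(codes, offsets))

print(codes_to_int([2, 5, 0]))  # 0b0_101_10 == 22
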
- def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
- raise NotImplementedError("Implemented by subclass") # pragma: no cover
-
- def _extract_level_codes(self, target) -> np.ndarray:
- """
- Map the requested list of (tuple) keys to their integer representations
- for searching in the underlying integer index.
-
- Parameters
- ----------
- target : MultiIndex
-
- Returns
- -------
- int_keys : 1-dimensional array of dtype uint64 or object
- Integers representing one combination each
- """
- zt = [target._get_level_values(i) for i in range(target.nlevels)]
- level_codes = []
- for i, (lev, codes) in enumerate(zip(self.levels, zt)):
- result = lev.get_indexer_for(codes) + 1
- result[result > 0] += 1
- if self.level_has_nans[i] and codes.hasnans:
- result[codes.isna()] += 1
- level_codes.append(result)
- return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
-
- def get_indexer(self, target: np.ndarray) -> np.ndarray:
- """
- Returns an array giving the positions of each value of `target` in
- `self.values`, where -1 represents a value in `target` which does not
- appear in `self.values`
-
- Parameters
- ----------
- target : np.ndarray
-
- Returns
- -------
- np.ndarray[intp_t, ndim=1] of the indexer of `target` into
- `self.values`
- """
- return self._base.get_indexer(self, target)
-
- def get_indexer_with_fill(self, ndarray target, ndarray values,
- str method, object limit) -> np.ndarray:
- """
- Returns an array giving the positions of each value of `target` in
- `values`, where -1 represents a value in `target` which does not
- appear in `values`
-
- If `method` is "backfill" then the position for a value in `target`
- which does not appear in `values` is that of the next greater value
- in `values` (if one exists), and -1 if there is no such value.
-
- Similarly, if the method is "pad" then the position for a value in
- `target` which does not appear in `values` is that of the next smaller
- value in `values` (if one exists), and -1 if there is no such value.
-
- Parameters
- ----------
- target: ndarray[object] of tuples
- need not be sorted, but all must have the same length, which must be
- the same as the length of all tuples in `values`
- values : ndarray[object] of tuples
- must be sorted and all have the same length. Should be the set of
- the MultiIndex's values.
- method: string
- "backfill" or "pad"
- limit: int or None
- if provided, limit the number of fills to this value
-
- Returns
- -------
- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
- filled with the `method` (and optionally `limit`) specified
- """
- assert method in ("backfill", "pad")
- cdef:
- int64_t i, j, next_code
- int64_t num_values, num_target_values
- ndarray[int64_t, ndim=1] target_order
- ndarray[object, ndim=1] target_values
- ndarray[int64_t, ndim=1] new_codes, new_target_codes
- ndarray[intp_t, ndim=1] sorted_indexer
-
- target_order = np.argsort(target).astype("int64")
- target_values = target[target_order]
- num_values, num_target_values = len(values), len(target_values)
- new_codes, new_target_codes = (
- np.empty((num_values,)).astype("int64"),
- np.empty((num_target_values,)).astype("int64"),
- )
-
- # `values` and `target_values` are both sorted, so we walk through them
- # and memoize the (ordered) set of indices in the (implicit) merged-and
- # sorted list of the two which belong to each of them
- # the effect of this is to create a factorization for the (sorted)
- # merger of the index values, where `new_codes` and `new_target_codes`
- # are the subset of the factors which appear in `values` and `target`,
- # respectively
- i, j, next_code = 0, 0, 0
- while i < num_values and j < num_target_values:
- val, target_val = values[i], target_values[j]
- if val <= target_val:
- new_codes[i] = next_code
- i += 1
- if target_val <= val:
- new_target_codes[j] = next_code
- j += 1
- next_code += 1
-
- # at this point, at least one should have reached the end
- # the remaining values of the other should be added to the end
- assert i == num_values or j == num_target_values
- while i < num_values:
- new_codes[i] = next_code
- i += 1
- next_code += 1
- while j < num_target_values:
- new_target_codes[j] = next_code
- j += 1
- next_code += 1
-
- # get the indexer, and undo the sorting of `target.values`
- algo = algos.backfill if method == "backfill" else algos.pad
- sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
- return sorted_indexer[np.argsort(target_order)]
-
- def get_loc(self, object key):
- if is_definitely_invalid_key(key):
- raise TypeError(f"'{key}' is an invalid key")
- if not isinstance(key, tuple):
- raise KeyError(key)
- try:
- indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
- for lev, v in zip(self.levels, key)]
- except KeyError:
- raise KeyError(key)
-
- # Transform indices into single integer:
- lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
-
- return self._base.get_loc(self, lab_int)
-
- def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
- indexer = self._base.get_indexer_non_unique(self, target)
-
- return indexer
-
- def __contains__(self, val: object) -> bool:
- # We assume before we get here:
- # - val is hashable
- # Default __contains__ looks in the underlying mapping, which in this
- # case only contains integer representations.
- try:
- self.get_loc(val)
- return True
- except (KeyError, TypeError, ValueError):
- return False
-
-
-# Generated from template.
-include "index_class_helper.pxi"
-
-
-cdef class BoolEngine(UInt8Engine):
- cdef _check_type(self, object val):
- if not util.is_bool_object(val):
- raise KeyError(val)
- return <uint8_t>val
-
-
-cdef class MaskedBoolEngine(MaskedUInt8Engine):
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_bool_object(val):
- raise KeyError(val)
- return <uint8_t>val
-
-
-@cython.internal
-@cython.freelist(32)
-cdef class SharedEngine:
- cdef readonly:
- object values # ExtensionArray
- bint over_size_threshold
-
- cdef:
- bint unique, monotonic_inc, monotonic_dec
- bint need_monotonic_check, need_unique_check
-
- def __contains__(self, val: object) -> bool:
- # We assume before we get here:
- # - val is hashable
- try:
- self.get_loc(val)
- return True
- except KeyError:
- return False
-
- def clear_mapping(self):
- # for compat with IndexEngine
- pass
-
- @property
- def is_unique(self) -> bool:
- if self.need_unique_check:
- arr = self.values.unique()
- self.unique = len(arr) == len(self.values)
-
- self.need_unique_check = False
- return self.unique
-
- cdef _do_monotonic_check(self):
- raise NotImplementedError
-
- @property
- def is_monotonic_increasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
-
- return self.monotonic_inc == 1
-
- @property
- def is_monotonic_decreasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
-
- return self.monotonic_dec == 1
-
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=False)
-
- def sizeof(self, deep: bool = False) -> int:
- """ return the sizeof our mapping """
- return 0
-
- def __sizeof__(self) -> int:
- return self.sizeof()
-
- cdef _check_type(self, object obj):
- raise NotImplementedError
-
- cpdef get_loc(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t loc
-
- if is_definitely_invalid_key(val):
- raise TypeError(f"'{val}' is an invalid key")
-
- self._check_type(val)
-
- if self.over_size_threshold and self.is_monotonic_increasing:
- if not self.is_unique:
- return self._get_loc_duplicates(val)
-
- values = self.values
-
- loc = self._searchsorted_left(val)
- if loc >= len(values):
- raise KeyError(val)
- if values[loc] != val:
- raise KeyError(val)
- return loc
-
- if not self.unique:
- return self._get_loc_duplicates(val)
-
- return self._get_loc_duplicates(val)
-
- cdef _get_loc_duplicates(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t diff
-
- if self.is_monotonic_increasing:
- values = self.values
- try:
- left = values.searchsorted(val, side="left")
- right = values.searchsorted(val, side="right")
- except TypeError:
- # e.g. GH#29189 get_loc(None) with a Float64Index
- raise KeyError(val)
-
- diff = right - left
- if diff == 0:
- raise KeyError(val)
- elif diff == 1:
- return left
- else:
- return slice(left, right)
-
- return self._maybe_get_bool_indexer(val)
-
- cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
- """
- See ObjectEngine._searchsorted_left.__doc__.
- """
- try:
- loc = self.values.searchsorted(val, side="left")
- except TypeError as err:
- # GH#35788 e.g. val=None with float64 values
- raise KeyError(val) from err
- return loc
-
- cdef ndarray _get_bool_indexer(self, val):
- raise NotImplementedError
-
- cdef _maybe_get_bool_indexer(self, object val):
- # Returns ndarray[bool] or int
- cdef:
- ndarray[uint8_t, ndim=1, cast=True] indexer
-
- indexer = self._get_bool_indexer(val)
- return _unpack_bool_indexer(indexer, val)
-
- def get_indexer(self, values) -> np.ndarray:
- # values : type(self.values)
- # Note: we only get here with self.is_unique
- cdef:
- Py_ssize_t i, N = len(values)
-
- res = np.empty(N, dtype=np.intp)
-
- for i in range(N):
- val = values[i]
- try:
- loc = self.get_loc(val)
- # Because we are unique, loc should always be an integer
- except KeyError:
- loc = -1
- else:
- assert util.is_integer_object(loc), (loc, val)
- res[i] = loc
-
- return res
-
- def get_indexer_non_unique(self, targets):
- """
- Return an indexer suitable for taking from a non-unique index:
- the labels are returned in the same order as the target, along with
- a missing indexer into the targets (which corresponds to the -1
- indices in the results).
-
- Parameters
- ----------
- targets : type(self.values)
-
- Returns
- -------
- indexer : np.ndarray[np.intp]
- missing : np.ndarray[np.intp]
- """
- cdef:
- Py_ssize_t i, N = len(targets)
-
- indexer = []
- missing = []
-
- # See also IntervalIndex.get_indexer_pointwise
- for i in range(N):
- val = targets[i]
-
- try:
- locs = self.get_loc(val)
- except KeyError:
- locs = np.array([-1], dtype=np.intp)
- missing.append(i)
- else:
- if isinstance(locs, slice):
- # Only needed for get_indexer_non_unique
- locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
- elif util.is_integer_object(locs):
- locs = np.array([locs], dtype=np.intp)
- else:
- assert locs.dtype.kind == "b"
- locs = locs.nonzero()[0]
-
- indexer.append(locs)
-
- try:
- indexer = np.concatenate(indexer, dtype=np.intp)
- except TypeError:
- # numpy<1.20 doesn't accept dtype keyword
- indexer = np.concatenate(indexer).astype(np.intp, copy=False)
- missing = np.array(missing, dtype=np.intp)
-
- return indexer, missing
-
-
-cdef class ExtensionEngine(SharedEngine):
- def __init__(self, values: "ExtensionArray"):
- self.values = values
-
- self.over_size_threshold = len(values) >= _SIZE_CUTOFF
- self.need_unique_check = True
- self.need_monotonic_check = True
-
- cdef _do_monotonic_check(self):
- cdef:
- bint is_unique
-
- values = self.values
- if values._hasna:
- self.monotonic_inc = 0
- self.monotonic_dec = 0
-
- nunique = len(values.unique())
- self.unique = nunique == len(values)
- self.need_unique_check = 0
- return
-
- try:
- ranks = values._rank()
-
- except TypeError:
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- is_unique = 0
- else:
- self.monotonic_inc, self.monotonic_dec, is_unique = \
- self._call_monotonic(ranks)
-
- self.need_monotonic_check = 0
-
- # we can only be sure of uniqueness if is_unique=1
- if is_unique:
- self.unique = 1
- self.need_unique_check = 0
-
- cdef ndarray _get_bool_indexer(self, val):
- if checknull(val):
- return self.values.isna()
-
- try:
- return self.values == val
- except TypeError:
- # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
- try:
- return (self.values == val).to_numpy(dtype=bool, na_value=False)
- except (TypeError, AttributeError) as err:
- # e.g. (self.values == val) returned a bool
- # see test_get_loc_generator[string[pyarrow]]
- # e.g. self.values == val raises TypeError because a generator has no len
- # see test_get_loc_generator[string[python]]
- raise KeyError from err
-
- cdef _check_type(self, object val):
- hash(val)
-
-
-cdef class MaskedIndexEngine(IndexEngine):
- def __init__(self, object values):
- super().__init__(self._get_data(values))
- self.mask = self._get_mask(values)
-
- def _get_data(self, object values) -> np.ndarray:
- if hasattr(values, "_mask"):
- return values._data
- # We are an ArrowExtensionArray
- # Set 1 as na_value to avoid ending up with NA and an object array
- # TODO: Remove when arrow engine is implemented
- return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype)
-
- def _get_mask(self, object values) -> np.ndarray:
- if hasattr(values, "_mask"):
- return values._mask
- # We are an ArrowExtensionArray
- return values.isna()
-
- def get_indexer(self, object values) -> np.ndarray:
- self._ensure_mapping_populated()
- return self.mapping.lookup(self._get_data(values), self._get_mask(values))
-
- def get_indexer_non_unique(self, object targets):
- """
- Return an indexer suitable for taking from a non-unique index:
- the labels are returned in the same order as the target, along with
- a missing indexer into the targets (which corresponds to the -1
- indices in the results).
-
- Returns
- -------
- indexer : np.ndarray[np.intp]
- missing : np.ndarray[np.intp]
- """
- # TODO: Unify with parent class
- cdef:
- ndarray values, mask, target_vals, target_mask
- ndarray[intp_t] result, missing
- set stargets
- list na_pos
- dict d = {}
- object val
- Py_ssize_t count = 0, count_missing = 0
- Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
-
- target_vals = self._get_data(targets)
- target_mask = self._get_mask(targets)
-
- values = self.values
- assert not values.dtype == object # go through object path instead
-
- mask = self.mask
- stargets = set(target_vals[~target_mask])
-
- n = len(values)
- n_t = len(target_vals)
- if n > 10_000:
- n_alloc = 10_000
- else:
- n_alloc = n
-
- result = np.empty(n_alloc, dtype=np.intp)
- missing = np.empty(n_t, dtype=np.intp)
-
- # map each starget to its position in the index
- if (
- stargets and
- len(stargets) < 5 and
- not np.any(target_mask) and
- self.is_monotonic_increasing
- ):
- # if there are few enough stargets and the index is monotonically
- # increasing, then use binary search for each starget
- for starget in stargets:
- start = values.searchsorted(starget, side="left")
- end = values.searchsorted(starget, side="right")
- if start != end:
- d[starget] = list(range(start, end))
-
- stargets = set()
-
- if stargets:
- # otherwise, map by iterating through all items in the index
-
- na_pos = []
-
- for i in range(n):
- val = values[i]
-
- if mask[i]:
- na_pos.append(i)
-
- else:
- if val in stargets:
- if val not in d:
- d[val] = []
- d[val].append(i)
-
- for i in range(n_t):
- val = target_vals[i]
-
- if target_mask[i]:
- if na_pos:
- for na_idx in na_pos:
- # realloc if needed
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
-
- result[count] = na_idx
- count += 1
- continue
-
- elif val in d:
- # found
- key = val
-
- for j in d[key]:
-
- # realloc if needed
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
-
- result[count] = j
- count += 1
- continue
-
- # value not found
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
- result[count] = -1
- count += 1
- missing[count_missing] = i
- count_missing += 1
-
- return result[0:count], missing[0:count_missing]
diff --git a/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi b/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi
deleted file mode 100644
index 14983e38340..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi
+++ /dev/null
@@ -1,381 +0,0 @@
-"""
-Template for functions of IndexEngine subclasses.
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# IndexEngine Subclass Methods
-# ----------------------------------------------------------------------
-
-cdef class Float64Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Float64HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val) and not util.is_float_object(val):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
-
-cdef class MaskedFloat64Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Float64HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val) and not util.is_float_object(val):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
-
-cdef class Float32Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Float32HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val) and not util.is_float_object(val):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
-
-cdef class MaskedFloat32Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Float32HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val) and not util.is_float_object(val):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
-
-cdef class Int64Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int64HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class MaskedInt64Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int64HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class Int32Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int32HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class MaskedInt32Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int32HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class Int16Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int16HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class MaskedInt16Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int16HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class Int8Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int8HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class MaskedInt8Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Int8HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- return val
-
-cdef class UInt64Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt64HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class MaskedUInt64Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt64HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class UInt32Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt32HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class MaskedUInt32Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt32HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class UInt16Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt16HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class MaskedUInt16Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt16HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class UInt8Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt8HashTable(n)
-
- cdef _check_type(self, object val):
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class MaskedUInt8Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.UInt8HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- return val
-
-cdef class Complex64Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Complex64HashTable(n)
-
- cdef _check_type(self, object val):
- if (not util.is_integer_object(val)
- and not util.is_float_object(val)
- and not util.is_complex_object(val)
- ):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
-
-cdef class MaskedComplex64Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Complex64HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if (not util.is_integer_object(val)
- and not util.is_float_object(val)
- and not util.is_complex_object(val)
- ):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
-
-cdef class Complex128Engine(IndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Complex128HashTable(n)
-
- cdef _check_type(self, object val):
- if (not util.is_integer_object(val)
- and not util.is_float_object(val)
- and not util.is_complex_object(val)
- ):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
-
-cdef class MaskedComplex128Engine(MaskedIndexEngine):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.Complex128HashTable(n, uses_mask=True)
-
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if (not util.is_integer_object(val)
- and not util.is_float_object(val)
- and not util.is_complex_object(val)
- ):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- return val
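
Every generated _check_type above follows the same pattern; a pure-Python rendering of the unsigned-integer variant (illustrative only):

def check_uint64_key(val):
    # reject bools, accept integer-valued floats (so get_loc(2.0) works),
    # and reject negative integers because the dtype is unsigned
    is_int = isinstance(val, int) and not isinstance(val, bool)
    if not is_int:
        if isinstance(val, float) and val.is_integer():
            return int(val)
        raise KeyError(val)
    if val < 0:
        raise KeyError(val)
    return val

print(check_uint64_key(2.0))  # 2
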
diff --git a/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in
deleted file mode 100644
index bf3d88edd93..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/index_class_helper.pxi.in
+++ /dev/null
@@ -1,78 +0,0 @@
-"""
-Template for functions of IndexEngine subclasses.
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# IndexEngine Subclass Methods
-# ----------------------------------------------------------------------
-
-{{py:
-
-# name, dtype
-dtypes = [('Float64', 'float64'),
- ('Float32', 'float32'),
- ('Int64', 'int64'),
- ('Int32', 'int32'),
- ('Int16', 'int16'),
- ('Int8', 'int8'),
- ('UInt64', 'uint64'),
- ('UInt32', 'uint32'),
- ('UInt16', 'uint16'),
- ('UInt8', 'uint8'),
- ('Complex64', 'complex64'),
- ('Complex128', 'complex128'),
- ]
-
-engines = [('', 'IndexEngine'), ('Masked', 'MaskedIndexEngine')]
-
-}}
-
-{{for name, dtype in dtypes}}
-
-{{for prefix, engine in engines}}
-
-cdef class {{prefix}}{{name}}Engine({{engine}}):
-
- cdef _make_hash_table(self, Py_ssize_t n):
- {{if engine == 'MaskedIndexEngine'}}
- return _hash.{{name}}HashTable(n, uses_mask=True)
- {{else}}
- return _hash.{{name}}HashTable(n)
- {{endif}}
-
- cdef _check_type(self, object val):
- {{if engine == 'MaskedIndexEngine'}}
- if val is C_NA:
- return val
- {{endif}}
- {{if name not in {'Float64', 'Float32', 'Complex64', 'Complex128'} }}
- if not util.is_integer_object(val):
- if util.is_float_object(val):
- # Make sure Int64Index.get_loc(2.0) works
- if val.is_integer():
- return int(val)
- raise KeyError(val)
- {{if name.startswith("U")}}
- if val < 0:
- # cannot have negative values with unsigned int dtype
- raise KeyError(val)
- {{endif}}
- {{elif name not in {'Complex64', 'Complex128'} }}
- if not util.is_integer_object(val) and not util.is_float_object(val):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- {{else}}
- if (not util.is_integer_object(val)
- and not util.is_float_object(val)
- and not util.is_complex_object(val)
- ):
- # in particular catch bool and avoid casting True -> 1.0
- raise KeyError(val)
- {{endif}}
- return val
-
-{{endfor}}
-
-{{endfor}}
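
For context, the Tempita loops above expand each (dtype, engine) pair into one of the classes in the generated .pxi shown earlier. A simplified Python sketch of that expansion, with the _check_type body omitted and the dtype list abbreviated:

dtypes = ["Float64", "Int64", "UInt64"]
engines = [("", "IndexEngine"), ("Masked", "MaskedIndexEngine")]

def render(prefix, engine, name):
    mask_arg = ", uses_mask=True" if engine == "MaskedIndexEngine" else ""
    return (
        f"cdef class {prefix}{name}Engine({engine}):\n"
        f"    cdef _make_hash_table(self, Py_ssize_t n):\n"
        f"        return _hash.{name}HashTable(n{mask_arg})\n"
    )

for name in dtypes:
    for prefix, engine in engines:
        print(render(prefix, engine, name))
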
diff --git a/contrib/python/pandas/py3/pandas/_libs/indexing.pyi b/contrib/python/pandas/py3/pandas/_libs/indexing.pyi
deleted file mode 100644
index 3ae5c5044a2..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/indexing.pyi
+++ /dev/null
@@ -1,17 +0,0 @@
-from typing import (
- Generic,
- TypeVar,
-)
-
-from pandas.core.indexing import IndexingMixin
-
-_IndexingMixinT = TypeVar("_IndexingMixinT", bound=IndexingMixin)
-
-class NDFrameIndexerBase(Generic[_IndexingMixinT]):
- name: str
- # in practice obj is either a DataFrame or a Series
- obj: _IndexingMixinT
-
- def __init__(self, name: str, obj: _IndexingMixinT) -> None: ...
- @property
- def ndim(self) -> int: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/indexing.pyx b/contrib/python/pandas/py3/pandas/_libs/indexing.pyx
deleted file mode 100644
index c274b28b755..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/indexing.pyx
+++ /dev/null
@@ -1,28 +0,0 @@
-cdef class NDFrameIndexerBase:
- """
- A base class for _NDFrameIndexer for fast instantiation and attribute access.
- """
- cdef:
- Py_ssize_t _ndim
-
- cdef public:
- str name
- object obj
-
- def __init__(self, name: str, obj):
- self.obj = obj
- self.name = name
- self._ndim = -1
-
- @property
- def ndim(self) -> int:
- # Delay `ndim` instantiation until required as reading it
- # from `obj` isn't entirely cheap.
- ndim = self._ndim
- if ndim == -1:
- ndim = self._ndim = self.obj.ndim
- if ndim > 2:
- raise ValueError( # pragma: no cover
- "NDFrameIndexer does not support NDFrame objects with ndim > 2"
- )
- return ndim
diff --git a/contrib/python/pandas/py3/pandas/_libs/internals.pyi b/contrib/python/pandas/py3/pandas/_libs/internals.pyi
deleted file mode 100644
index cee96801290..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/internals.pyi
+++ /dev/null
@@ -1,102 +0,0 @@
-from typing import (
- Iterator,
- Sequence,
- final,
- overload,
-)
-import weakref
-
-import numpy as np
-
-from pandas._typing import (
- ArrayLike,
- T,
- npt,
-)
-
-from pandas import Index
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-from pandas.core.internals.blocks import Block as B
-
-def slice_len(slc: slice, objlen: int = ...) -> int: ...
-def get_blkno_indexers(
- blknos: np.ndarray, # int64_t[:]
- group: bool = ...,
-) -> list[tuple[int, slice | np.ndarray]]: ...
-def get_blkno_placements(
- blknos: np.ndarray,
- group: bool = ...,
-) -> Iterator[tuple[int, BlockPlacement]]: ...
-def update_blklocs_and_blknos(
- blklocs: npt.NDArray[np.intp],
- blknos: npt.NDArray[np.intp],
- loc: int,
- nblocks: int,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-@final
-class BlockPlacement:
- def __init__(self, val: int | slice | np.ndarray) -> None: ...
- @property
- def indexer(self) -> np.ndarray | slice: ...
- @property
- def as_array(self) -> np.ndarray: ...
- @property
- def as_slice(self) -> slice: ...
- @property
- def is_slice_like(self) -> bool: ...
- @overload
- def __getitem__(
- self, loc: slice | Sequence[int] | npt.NDArray[np.intp]
- ) -> BlockPlacement: ...
- @overload
- def __getitem__(self, loc: int) -> int: ...
- def __iter__(self) -> Iterator[int]: ...
- def __len__(self) -> int: ...
- def delete(self, loc) -> BlockPlacement: ...
- def append(self, others: list[BlockPlacement]) -> BlockPlacement: ...
- def tile_for_unstack(self, factor: int) -> npt.NDArray[np.intp]: ...
-
-class SharedBlock:
- _mgr_locs: BlockPlacement
- ndim: int
- values: ArrayLike
- refs: BlockValuesRefs
- def __init__(
- self,
- values: ArrayLike,
- placement: BlockPlacement,
- ndim: int,
- refs: BlockValuesRefs | None = ...,
- ) -> None: ...
-
-class NumpyBlock(SharedBlock):
- values: np.ndarray
- @final
- def getitem_block_index(self: T, slicer: slice) -> T: ...
-
-class NDArrayBackedBlock(SharedBlock):
- values: NDArrayBackedExtensionArray
- @final
- def getitem_block_index(self: T, slicer: slice) -> T: ...
-
-class Block(SharedBlock): ...
-
-class BlockManager:
- blocks: tuple[B, ...]
- axes: list[Index]
- _known_consolidated: bool
- _is_consolidated: bool
- _blknos: np.ndarray
- _blklocs: np.ndarray
- def __init__(
- self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=...
- ) -> None: ...
- def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ...
- def _rebuild_blknos_and_blklocs(self) -> None: ...
-
-class BlockValuesRefs:
- referenced_blocks: list[weakref.ref]
- def __init__(self, blk: SharedBlock | None = ...) -> None: ...
- def add_reference(self, blk: SharedBlock) -> None: ...
- def add_index_reference(self, index: object) -> None: ...
- def has_reference(self) -> bool: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/internals.pyx b/contrib/python/pandas/py3/pandas/_libs/internals.pyx
deleted file mode 100644
index 533727f8f2d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/internals.pyx
+++ /dev/null
@@ -1,920 +0,0 @@
-from collections import defaultdict
-import weakref
-
-cimport cython
-from cpython.slice cimport PySlice_GetIndicesEx
-from cython cimport Py_ssize_t
-
-
-cdef extern from "Python.h":
- # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX
- Py_ssize_t PY_SSIZE_T_MAX
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- NPY_INTP,
- int64_t,
- intp_t,
- ndarray,
-)
-
-cnp.import_array()
-
-from pandas._libs.algos import ensure_int64
-
-from pandas._libs.arrays cimport NDArrayBacked
-from pandas._libs.util cimport (
- is_array,
- is_integer_object,
-)
-
-
-@cython.final
-@cython.freelist(32)
-cdef class BlockPlacement:
- cdef:
- slice _as_slice
- ndarray _as_array # Note: this still allows `None`; will be intp_t
- bint _has_slice, _has_array, _is_known_slice_like
-
- def __cinit__(self, val):
- cdef:
- slice slc
-
- self._as_slice = None
- self._as_array = None
- self._has_slice = False
- self._has_array = False
-
- if is_integer_object(val):
- slc = slice(val, val + 1, 1)
- self._as_slice = slc
- self._has_slice = True
- elif isinstance(val, slice):
- slc = slice_canonize(val)
-
- if slc.start != slc.stop:
- self._as_slice = slc
- self._has_slice = True
- else:
- arr = np.empty(0, dtype=np.intp)
- self._as_array = arr
- self._has_array = True
- else:
- # Cython memoryview interface requires ndarray to be writeable.
- if (
- not is_array(val)
- or not cnp.PyArray_ISWRITEABLE(val)
- or (<ndarray>val).descr.type_num != cnp.NPY_INTP
- ):
- arr = np.require(val, dtype=np.intp, requirements="W")
- else:
- arr = val
- # Caller is responsible for ensuring arr.ndim == 1
- self._as_array = arr
- self._has_array = True
-
- def __str__(self) -> str:
- cdef:
- slice s = self._ensure_has_slice()
-
- if s is not None:
- v = self._as_slice
- else:
- v = self._as_array
-
- return f"{type(self).__name__}({v})"
-
- def __repr__(self) -> str:
- return str(self)
-
- def __len__(self) -> int:
- cdef:
- slice s = self._ensure_has_slice()
-
- if s is not None:
- return slice_len(s)
- else:
- return len(self._as_array)
-
- def __iter__(self):
- cdef:
- slice s = self._ensure_has_slice()
- Py_ssize_t start, stop, step, _
-
- if s is not None:
- start, stop, step, _ = slice_get_indices_ex(s)
- return iter(range(start, stop, step))
- else:
- return iter(self._as_array)
-
- @property
- def as_slice(self) -> slice:
- cdef:
- slice s = self._ensure_has_slice()
-
- if s is not None:
- return s
- else:
- raise TypeError("Not slice-like")
-
- @property
- def indexer(self):
- cdef:
- slice s = self._ensure_has_slice()
-
- if s is not None:
- return s
- else:
- return self._as_array
-
- @property
- def as_array(self) -> np.ndarray:
- cdef:
- Py_ssize_t start, stop, _
-
- if not self._has_array:
- start, stop, step, _ = slice_get_indices_ex(self._as_slice)
- # NOTE: this is the C-optimized equivalent of
- # `np.arange(start, stop, step, dtype=np.intp)`
- self._as_array = cnp.PyArray_Arange(start, stop, step, NPY_INTP)
- self._has_array = True
-
- return self._as_array
-
- @property
- def is_slice_like(self) -> bool:
- cdef:
- slice s = self._ensure_has_slice()
-
- return s is not None
-
- def __getitem__(self, loc):
- cdef:
- slice s = self._ensure_has_slice()
-
- if s is not None:
- val = slice_getitem(s, loc)
- else:
- val = self._as_array[loc]
-
- if not isinstance(val, slice) and val.ndim == 0:
- return val
-
- return BlockPlacement(val)
-
- def delete(self, loc) -> BlockPlacement:
- return BlockPlacement(np.delete(self.as_array, loc, axis=0))
-
- def append(self, others) -> BlockPlacement:
- if not len(others):
- return self
-
- return BlockPlacement(
- np.concatenate([self.as_array] + [o.as_array for o in others])
- )
-
- cdef BlockPlacement iadd(self, other):
- cdef:
- slice s = self._ensure_has_slice()
- Py_ssize_t other_int, start, stop, step
-
- if is_integer_object(other) and s is not None:
- other_int = <Py_ssize_t>other
-
- if other_int == 0:
- # BlockPlacement is treated as immutable
- return self
-
- start, stop, step, _ = slice_get_indices_ex(s)
- start += other_int
- stop += other_int
-
- if (step > 0 and start < 0) or (step < 0 and stop < step):
- raise ValueError("iadd causes length change")
-
- if stop < 0:
- val = slice(start, None, step)
- else:
- val = slice(start, stop, step)
-
- return BlockPlacement(val)
- else:
- newarr = self.as_array + other
- if (newarr < 0).any():
- raise ValueError("iadd causes length change")
-
- val = newarr
- return BlockPlacement(val)
-
- def add(self, other) -> BlockPlacement:
- # We can get here with int or ndarray
- return self.iadd(other)
-
- cdef slice _ensure_has_slice(self):
- if not self._has_slice:
- self._as_slice = indexer_as_slice(self._as_array)
- self._has_slice = True
-
- return self._as_slice
-
- cpdef BlockPlacement increment_above(self, Py_ssize_t loc):
- """
- Increment any entries of 'loc' or above by one.
- """
- cdef:
- slice nv, s = self._ensure_has_slice()
- Py_ssize_t start, stop, step
- ndarray[intp_t, ndim=1] newarr
-
- if s is not None:
- # see if we are either all-above or all-below, each of which
- # has a fastpath available.
-
- start, stop, step, _ = slice_get_indices_ex(s)
-
- if start < loc and stop <= loc:
- # We are entirely below, nothing to increment
- return self
-
- if start >= loc and stop >= loc:
- # We are entirely above, we can efficiently increment our slice
- nv = slice(start + 1, stop + 1, step)
- return BlockPlacement(nv)
-
- if loc == 0:
- # fastpath where we know everything is >= 0
- newarr = self.as_array + 1
- return BlockPlacement(newarr)
-
- newarr = self.as_array.copy()
- newarr[newarr >= loc] += 1
- return BlockPlacement(newarr)
-
- def tile_for_unstack(self, factor: int) -> np.ndarray:
- """
- Find the new mgr_locs for the un-stacked version of a Block.
- """
- cdef:
- slice slc = self._ensure_has_slice()
- ndarray[intp_t, ndim=1] new_placement
-
- if slc is not None and slc.step == 1:
- new_slc = slice(slc.start * factor, slc.stop * factor, 1)
- # equiv: np.arange(new_slc.start, new_slc.stop, dtype=np.intp)
- new_placement = cnp.PyArray_Arange(new_slc.start, new_slc.stop, 1, NPY_INTP)
- else:
- # Note: test_pivot_table_empty_aggfunc gets here with `slc is not None`
- mapped = [
- # equiv: np.arange(x * factor, (x + 1) * factor, dtype=np.intp)
- cnp.PyArray_Arange(x * factor, (x + 1) * factor, 1, NPY_INTP)
- for x in self
- ]
- new_placement = np.concatenate(mapped)
- return new_placement
-
-
-cdef slice slice_canonize(slice s):
- """
- Convert slice to canonical bounded form.
- """
- cdef:
- Py_ssize_t start = 0, stop = 0, step = 1
-
- if s.step is None:
- step = 1
- else:
- step = <Py_ssize_t>s.step
- if step == 0:
- raise ValueError("slice step cannot be zero")
-
- if step > 0:
- if s.stop is None:
- raise ValueError("unbounded slice")
-
- stop = <Py_ssize_t>s.stop
- if s.start is None:
- start = 0
- else:
- start = <Py_ssize_t>s.start
- if start > stop:
- start = stop
- elif step < 0:
- if s.start is None:
- raise ValueError("unbounded slice")
-
- start = <Py_ssize_t>s.start
- if s.stop is None:
- stop = -1
- else:
- stop = <Py_ssize_t>s.stop
- if stop > start:
- stop = start
-
- if start < 0 or (stop < 0 and s.stop is not None and step > 0):
- raise ValueError("unbounded slice")
-
- if stop < 0:
- return slice(start, None, step)
- else:
- return slice(start, stop, step)
-
-
-cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1:
- """
- Get length of a bounded slice.
-
- The slice must not have any "open" bounds that would create dependency on
- container size, i.e.:
- - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None``
- - if ``s.step < 0``, ``s.start`` is not ``None``
-
- Otherwise, the result is unreliable.
- """
- cdef:
- Py_ssize_t start, stop, step, length
-
- if slc is None:
- raise TypeError("slc must be slice") # pragma: no cover
-
- PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)
-
- return length
-
-
-cdef (Py_ssize_t, Py_ssize_t, Py_ssize_t, Py_ssize_t) slice_get_indices_ex(
- slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX
-):
- """
- Get (start, stop, step, length) tuple for a slice.
-
- If `objlen` is not specified, slice must be bounded, otherwise the result
- will be wrong.
- """
- cdef:
- Py_ssize_t start, stop, step, length
-
- if slc is None:
- raise TypeError("slc should be a slice") # pragma: no cover
-
- PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length)
-
- return start, stop, step, length
-
-
-cdef slice_getitem(slice slc, ind):
- cdef:
- Py_ssize_t s_start, s_stop, s_step, s_len
- Py_ssize_t ind_start, ind_stop, ind_step, ind_len
-
- s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc)
-
- if isinstance(ind, slice):
- ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind, s_len)
-
- if ind_step > 0 and ind_len == s_len:
- # short-cut for no-op slice
- return slc
-
- if ind_step < 0:
- s_start = s_stop - s_step
- ind_step = -ind_step
-
- s_step *= ind_step
- s_stop = s_start + ind_stop * s_step
- s_start = s_start + ind_start * s_step
-
- if s_step < 0 and s_stop < 0:
- return slice(s_start, None, s_step)
- else:
- return slice(s_start, s_stop, s_step)
-
- else:
- # NOTE:
- # this is the C-optimized equivalent of
- # `np.arange(s_start, s_stop, s_step, dtype=np.intp)[ind]`
- return cnp.PyArray_Arange(s_start, s_stop, s_step, NPY_INTP)[ind]
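
As a quick aside (not pandas code), the slice composition above is equivalent to indexing the materialized positions of the base slice:

import numpy as np

base = slice(10, 20, 1)   # positions 10..19
ind = slice(2, 5)         # take the 3rd through 5th of those positions
print(np.arange(base.start, base.stop, base.step)[ind])   # [12 13 14]
# the cdef slice_getitem above composes these into the equivalent slice(12, 15, 1)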
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef slice indexer_as_slice(intp_t[:] vals):
- cdef:
- Py_ssize_t i, n, start, stop
- int64_t d
-
- if vals is None:
- raise TypeError("vals must be ndarray") # pragma: no cover
-
- n = vals.shape[0]
-
- if n == 0 or vals[0] < 0:
- return None
-
- if n == 1:
- return slice(vals[0], vals[0] + 1, 1)
-
- if vals[1] < 0:
- return None
-
- # n > 2
- d = vals[1] - vals[0]
-
- if d == 0:
- return None
-
- for i in range(2, n):
- if vals[i] < 0 or vals[i] - vals[i - 1] != d:
- return None
-
- start = vals[0]
- stop = start + n * d
- if stop < 0 and d < 0:
- return slice(start, None, d)
- else:
- return slice(start, stop, d)
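
The conversion above recovers a slice only from an evenly spaced, non-negative indexer. A rough NumPy restatement of that rule, using a hypothetical helper name for illustration:

import numpy as np

def as_slice(vals) -> slice | None:
    vals = np.asarray(vals, dtype=np.intp)
    if len(vals) == 0 or (vals < 0).any():
        return None
    if len(vals) == 1:
        return slice(int(vals[0]), int(vals[0]) + 1, 1)
    d = int(vals[1] - vals[0])
    if d == 0 or (np.diff(vals) != d).any():
        return None
    stop = int(vals[0]) + len(vals) * d
    # a negative stop with a negative step means "run to the beginning"
    return slice(int(vals[0]), None if stop < 0 else stop, d)

print(as_slice([2, 4, 6, 8]))   # slice(2, 10, 2)
print(as_slice([0, 1, 3]))      # None: spacing is not constant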
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def get_blkno_indexers(
- int64_t[:] blknos, bint group=True
-) -> list[tuple[int, slice | np.ndarray]]:
- """
- Enumerate contiguous runs of integers in ndarray.
-
-    Iterate over elements of `blknos`, returning ``(blkno, slice(start, stop))``
-    pairs for each contiguous run found.
-
-    If `group` is True and there is more than one run for a certain blkno,
-    return ``(blkno, array)`` instead, with the array containing the positions
-    of all elements equal to that blkno.
-
- Returns
- -------
- list[tuple[int, slice | np.ndarray]]
- """
- # There's blkno in this function's name because it's used in block &
- # blockno handling.
- cdef:
- int64_t cur_blkno
- Py_ssize_t i, start, stop, n, diff
- cnp.npy_intp tot_len
- int64_t blkno
- object group_dict = defaultdict(list)
- ndarray[int64_t, ndim=1] arr
-
- n = blknos.shape[0]
- result = list()
-
- if n == 0:
- return result
-
- start = 0
- cur_blkno = blknos[start]
-
- if group is False:
- for i in range(1, n):
- if blknos[i] != cur_blkno:
- result.append((cur_blkno, slice(start, i)))
-
- start = i
- cur_blkno = blknos[i]
-
- result.append((cur_blkno, slice(start, n)))
- else:
- for i in range(1, n):
- if blknos[i] != cur_blkno:
- group_dict[cur_blkno].append((start, i))
-
- start = i
- cur_blkno = blknos[i]
-
- group_dict[cur_blkno].append((start, n))
-
- for blkno, slices in group_dict.items():
- if len(slices) == 1:
- result.append((blkno, slice(slices[0][0], slices[0][1])))
- else:
- tot_len = sum(stop - start for start, stop in slices)
- # equiv np.empty(tot_len, dtype=np.int64)
- arr = cnp.PyArray_EMPTY(1, &tot_len, cnp.NPY_INT64, 0)
-
- i = 0
- for start, stop in slices:
- for diff in range(start, stop):
- arr[i] = diff
- i += 1
-
- result.append((blkno, arr))
-
- return result
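
A small illustration of the run enumeration above, assuming a pandas build where the private module pandas._libs.internals is importable; expected output shown in comments:

import numpy as np
from pandas._libs.internals import get_blkno_indexers  # private pandas module

blknos = np.array([0, 0, 1, 1, 0], dtype=np.int64)
print(get_blkno_indexers(blknos, group=False))
# [(0, slice(0, 2)), (1, slice(2, 4)), (0, slice(4, 5))]
print(get_blkno_indexers(blknos, group=True))
# [(0, array([0, 1, 4])), (1, slice(2, 4))]  -- block 0 has two runs, so an array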
-
-
-def get_blkno_placements(blknos, group: bool = True):
- """
- Parameters
- ----------
- blknos : np.ndarray[int64]
- group : bool, default True
-
- Returns
- -------
- iterator
-        Yields ``(blkno, BlockPlacement)`` pairs.
- """
- blknos = ensure_int64(blknos)
-
- for blkno, indexer in get_blkno_indexers(blknos, group):
- yield blkno, BlockPlacement(indexer)
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cpdef update_blklocs_and_blknos(
- ndarray[intp_t, ndim=1] blklocs,
- ndarray[intp_t, ndim=1] blknos,
- Py_ssize_t loc,
- intp_t nblocks,
-):
- """
- Update blklocs and blknos when a new column is inserted at 'loc'.
- """
- cdef:
- Py_ssize_t i
- cnp.npy_intp length = blklocs.shape[0] + 1
- ndarray[intp_t, ndim=1] new_blklocs, new_blknos
-
- # equiv: new_blklocs = np.empty(length, dtype=np.intp)
- new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
- new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
-
- for i in range(loc):
- new_blklocs[i] = blklocs[i]
- new_blknos[i] = blknos[i]
-
- new_blklocs[loc] = 0
- new_blknos[loc] = nblocks
-
- for i in range(loc, length - 1):
- new_blklocs[i + 1] = blklocs[i]
- new_blknos[i + 1] = blknos[i]
-
- return new_blklocs, new_blknos
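
Functionally, the update above inserts one element into each array. A rough NumPy equivalent, shown only as a sketch of the semantics:

import numpy as np

blklocs = np.array([0, 1, 2], dtype=np.intp)
blknos = np.array([0, 0, 1], dtype=np.intp)
loc, nblocks = 1, 2   # new column inserted at position 1 as a brand-new block

new_blklocs = np.insert(blklocs, loc, 0)      # the new column is item 0 of its block
new_blknos = np.insert(blknos, loc, nblocks)  # ... and lives in block number `nblocks`
print(new_blklocs)   # [0 0 1 2]
print(new_blknos)    # [0 2 0 1]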
-
-
-def _unpickle_block(values, placement, ndim):
- # We have to do some gymnastics b/c "ndim" is keyword-only
-
- from pandas.core.internals.blocks import new_block
-
- return new_block(values, placement, ndim=ndim)
-
-
-@cython.freelist(64)
-cdef class SharedBlock:
- """
- Defining __init__ in a cython class significantly improves performance.
- """
- cdef:
- public BlockPlacement _mgr_locs
- public BlockValuesRefs refs
- readonly int ndim
-
- def __cinit__(
- self,
- values,
- placement: BlockPlacement,
- ndim: int,
- refs: BlockValuesRefs | None = None,
- ):
- """
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- We assume maybe_coerce_values has already been called.
- placement : BlockPlacement
- ndim : int
- 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame
- refs: BlockValuesRefs, optional
- Ref tracking object or None if block does not have any refs.
- """
- self._mgr_locs = placement
- self.ndim = ndim
- if refs is None:
- # if no refs are passed, that means we are creating a Block from
- # new values that it uniquely owns -> start a new BlockValuesRefs
- # object that only references this block
- self.refs = BlockValuesRefs(self)
- else:
- # if refs are passed, this is the BlockValuesRefs object that is shared
- # with the parent blocks which share the values, and a reference to this
- # new block is added
- refs.add_reference(self)
- self.refs = refs
-
- cpdef __reduce__(self):
- args = (self.values, self.mgr_locs.indexer, self.ndim)
- return _unpickle_block, args
-
- cpdef __setstate__(self, state):
- from pandas.core.construction import extract_array
-
- self.mgr_locs = BlockPlacement(state[0])
- self.values = extract_array(state[1], extract_numpy=True)
- if len(state) > 2:
- # we stored ndim
- self.ndim = state[2]
- else:
- # older pickle
- from pandas.core.internals.api import maybe_infer_ndim
-
- ndim = maybe_infer_ndim(self.values, self.mgr_locs)
- self.ndim = ndim
-
-
-cdef class NumpyBlock(SharedBlock):
- cdef:
- public ndarray values
-
- def __cinit__(
- self,
- ndarray values,
- BlockPlacement placement,
- int ndim,
- refs: BlockValuesRefs | None = None,
- ):
- # set values here; the (implicit) call to SharedBlock.__cinit__ will
- # set placement, ndim and refs
- self.values = values
-
- cpdef NumpyBlock getitem_block_index(self, slice slicer):
- """
- Perform __getitem__-like specialized to slicing along index.
-
- Assumes self.ndim == 2
- """
- new_values = self.values[..., slicer]
- return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
-
-
-cdef class NDArrayBackedBlock(SharedBlock):
- """
- Block backed by NDArrayBackedExtensionArray
- """
- cdef public:
- NDArrayBacked values
-
- def __cinit__(
- self,
- NDArrayBacked values,
- BlockPlacement placement,
- int ndim,
- refs: BlockValuesRefs | None = None,
- ):
- # set values here; the (implicit) call to SharedBlock.__cinit__ will
- # set placement, ndim and refs
- self.values = values
-
- cpdef NDArrayBackedBlock getitem_block_index(self, slice slicer):
- """
- Perform __getitem__-like specialized to slicing along index.
-
- Assumes self.ndim == 2
- """
- new_values = self.values[..., slicer]
- return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
-
-
-cdef class Block(SharedBlock):
- cdef:
- public object values
-
- def __cinit__(
- self,
- object values,
- BlockPlacement placement,
- int ndim,
- refs: BlockValuesRefs | None = None,
- ):
- # set values here; the (implicit) call to SharedBlock.__cinit__ will
- # set placement, ndim and refs
- self.values = values
-
-
-@cython.freelist(64)
-cdef class BlockManager:
- cdef:
- public tuple blocks
- public list axes
- public bint _known_consolidated, _is_consolidated
- public ndarray _blknos, _blklocs
-
- def __cinit__(
- self,
- blocks=None,
- axes=None,
- verify_integrity=True,
- ):
- # None as defaults for unpickling GH#42345
- if blocks is None:
- # This adds 1-2 microseconds to DataFrame(np.array([]))
- return
-
- if isinstance(blocks, list):
- # Backward compat for e.g. pyarrow
- blocks = tuple(blocks)
-
- self.blocks = blocks
- self.axes = axes.copy() # copy to make sure we are not remotely-mutable
-
- # Populate known_consolidate, blknos, and blklocs lazily
- self._known_consolidated = False
- self._is_consolidated = False
- self._blknos = None
- self._blklocs = None
-
- # -------------------------------------------------------------------
- # Block Placement
-
- def _rebuild_blknos_and_blklocs(self) -> None:
- """
- Update mgr._blknos / mgr._blklocs.
- """
- cdef:
- intp_t blkno, i, j
- cnp.npy_intp length = self.shape[0]
- SharedBlock blk
- BlockPlacement bp
- ndarray[intp_t, ndim=1] new_blknos, new_blklocs
-
- # equiv: np.empty(length, dtype=np.intp)
- new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
- new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0)
- # equiv: new_blknos.fill(-1)
- cnp.PyArray_FILLWBYTE(new_blknos, -1)
- cnp.PyArray_FILLWBYTE(new_blklocs, -1)
-
- for blkno, blk in enumerate(self.blocks):
- bp = blk._mgr_locs
- # Iterating over `bp` is a faster equivalent to
- # new_blknos[bp.indexer] = blkno
- # new_blklocs[bp.indexer] = np.arange(len(bp))
- for i, j in enumerate(bp):
- new_blknos[j] = blkno
- new_blklocs[j] = i
-
- for i in range(length):
- # faster than `for blkno in new_blknos`
- # https://github.com/cython/cython/issues/4393
- blkno = new_blknos[i]
-
- # If there are any -1s remaining, this indicates that our mgr_locs
- # are invalid.
- if blkno == -1:
- raise AssertionError("Gaps in blk ref_locs")
-
- self._blknos = new_blknos
- self._blklocs = new_blklocs
-
- # -------------------------------------------------------------------
- # Pickle
-
- cpdef __reduce__(self):
- if len(self.axes) == 1:
- # SingleBlockManager, __init__ expects Block, axis
- args = (self.blocks[0], self.axes[0])
- else:
- args = (self.blocks, self.axes)
- return type(self), args
-
- cpdef __setstate__(self, state):
- from pandas.core.construction import extract_array
- from pandas.core.internals.blocks import (
- ensure_block_shape,
- new_block,
- )
- from pandas.core.internals.managers import ensure_index
-
- if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
- state = state[3]["0.14.1"]
- axes = [ensure_index(ax) for ax in state["axes"]]
- ndim = len(axes)
-
- for blk in state["blocks"]:
- vals = blk["values"]
- # older versions may hold e.g. DatetimeIndex instead of DTA
- vals = extract_array(vals, extract_numpy=True)
- blk["values"] = ensure_block_shape(vals, ndim=ndim)
-
- nbs = [
- new_block(blk["values"], blk["mgr_locs"], ndim=ndim)
- for blk in state["blocks"]
- ]
- blocks = tuple(nbs)
- self.blocks = blocks
- self.axes = axes
-
- else: # pragma: no cover
- raise NotImplementedError("pre-0.14.1 pickles are no longer supported")
-
- self._post_setstate()
-
- def _post_setstate(self) -> None:
- self._is_consolidated = False
- self._known_consolidated = False
- self._rebuild_blknos_and_blklocs()
-
- # -------------------------------------------------------------------
- # Indexing
-
- cdef BlockManager _get_index_slice(self, slobj):
- cdef:
- SharedBlock blk, nb
- BlockManager mgr
- ndarray blknos, blklocs
-
- nbs = []
- for blk in self.blocks:
- nb = blk.getitem_block_index(slobj)
- nbs.append(nb)
-
- new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)]
- mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False)
-
- # We can avoid having to rebuild blklocs/blknos
- blklocs = self._blklocs
- blknos = self._blknos
- if blknos is not None:
- mgr._blknos = blknos.copy()
- mgr._blklocs = blklocs.copy()
- return mgr
-
- def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager:
-
- if axis == 0:
- new_blocks = self._slice_take_blocks_ax0(slobj)
- elif axis == 1:
- return self._get_index_slice(slobj)
- else:
- raise IndexError("Requested axis not found in manager")
-
- new_axes = list(self.axes)
- new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
-
- return type(self)(tuple(new_blocks), new_axes, verify_integrity=False)
-
-
-cdef class BlockValuesRefs:
- """Tracks all references to a given array.
-
- Keeps track of all blocks (through weak references) that reference the same
- data.
- """
- cdef:
- public list referenced_blocks
-
- def __cinit__(self, blk: SharedBlock | None = None) -> None:
- if blk is not None:
- self.referenced_blocks = [weakref.ref(blk)]
- else:
- self.referenced_blocks = []
-
- def add_reference(self, blk: SharedBlock) -> None:
- """Adds a new reference to our reference collection.
-
- Parameters
- ----------
- blk: SharedBlock
- The block that the new references should point to.
- """
- self.referenced_blocks.append(weakref.ref(blk))
-
- def add_index_reference(self, index: object) -> None:
- """Adds a new reference to our reference collection when creating an index.
-
- Parameters
- ----------
- index: object
- The index that the new reference should point to.
- """
- self.referenced_blocks.append(weakref.ref(index))
-
- def has_reference(self) -> bool:
- """Checks if block has foreign references.
-
- A reference is only relevant if it is still alive. The reference to
- ourselves does not count.
-
- Returns
- -------
- bool
- """
- self.referenced_blocks = [
- ref for ref in self.referenced_blocks if ref() is not None
- ]
- # Checking for more references than block pointing to itself
- return len(self.referenced_blocks) > 1
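
The reference tracking above boils down to a lazily pruned list of weak references. A minimal pure-Python sketch of the same idea, with hypothetical class names (not the pandas objects):

import weakref

class RefTracker:
    def __init__(self, obj=None):
        self._refs = [weakref.ref(obj)] if obj is not None else []

    def add(self, obj) -> None:
        self._refs.append(weakref.ref(obj))

    def has_reference(self) -> bool:
        # drop dead references, then ask whether anyone besides the owner is left
        self._refs = [r for r in self._refs if r() is not None]
        return len(self._refs) > 1

class Dummy:   # weakref-able stand-in for a block
    pass

a, b = Dummy(), Dummy()
tracker = RefTracker(a)
tracker.add(b)
print(tracker.has_reference())   # True: two live referents
del b                            # in CPython the second referent is collected immediately
print(tracker.has_reference())   # False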
diff --git a/contrib/python/pandas/py3/pandas/_libs/interval.pyi b/contrib/python/pandas/py3/pandas/_libs/interval.pyi
deleted file mode 100644
index 4c36246e04d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/interval.pyi
+++ /dev/null
@@ -1,174 +0,0 @@
-from typing import (
- Any,
- Generic,
- TypeVar,
- overload,
-)
-
-import numpy as np
-import numpy.typing as npt
-
-from pandas._typing import (
- IntervalClosedType,
- Timedelta,
- Timestamp,
-)
-
-VALID_CLOSED: frozenset[str]
-
-_OrderableScalarT = TypeVar("_OrderableScalarT", int, float)
-_OrderableTimesT = TypeVar("_OrderableTimesT", Timestamp, Timedelta)
-_OrderableT = TypeVar("_OrderableT", int, float, Timestamp, Timedelta)
-
-class _LengthDescriptor:
- @overload
- def __get__(
- self, instance: Interval[_OrderableScalarT], owner: Any
- ) -> _OrderableScalarT: ...
- @overload
- def __get__(
- self, instance: Interval[_OrderableTimesT], owner: Any
- ) -> Timedelta: ...
-
-class _MidDescriptor:
- @overload
- def __get__(self, instance: Interval[_OrderableScalarT], owner: Any) -> float: ...
- @overload
- def __get__(
- self, instance: Interval[_OrderableTimesT], owner: Any
- ) -> _OrderableTimesT: ...
-
-class IntervalMixin:
- @property
- def closed_left(self) -> bool: ...
- @property
- def closed_right(self) -> bool: ...
- @property
- def open_left(self) -> bool: ...
- @property
- def open_right(self) -> bool: ...
- @property
- def is_empty(self) -> bool: ...
- def _check_closed_matches(self, other: IntervalMixin, name: str = ...) -> None: ...
-
-class Interval(IntervalMixin, Generic[_OrderableT]):
- @property
- def left(self: Interval[_OrderableT]) -> _OrderableT: ...
- @property
- def right(self: Interval[_OrderableT]) -> _OrderableT: ...
- @property
- def closed(self) -> IntervalClosedType: ...
- mid: _MidDescriptor
- length: _LengthDescriptor
- def __init__(
- self,
- left: _OrderableT,
- right: _OrderableT,
- closed: IntervalClosedType = ...,
- ) -> None: ...
- def __hash__(self) -> int: ...
- @overload
- def __contains__(
- self: Interval[Timedelta], key: Timedelta | Interval[Timedelta]
- ) -> bool: ...
- @overload
- def __contains__(
- self: Interval[Timestamp], key: Timestamp | Interval[Timestamp]
- ) -> bool: ...
- @overload
- def __contains__(
- self: Interval[_OrderableScalarT],
- key: _OrderableScalarT | Interval[_OrderableScalarT],
- ) -> bool: ...
- @overload
- def __add__(
- self: Interval[_OrderableTimesT], y: Timedelta
- ) -> Interval[_OrderableTimesT]: ...
- @overload
- def __add__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __add__(self: Interval[float], y: float) -> Interval[float]: ...
- @overload
- def __radd__(
- self: Interval[_OrderableTimesT], y: Timedelta
- ) -> Interval[_OrderableTimesT]: ...
- @overload
- def __radd__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __radd__(self: Interval[float], y: float) -> Interval[float]: ...
- @overload
- def __sub__(
- self: Interval[_OrderableTimesT], y: Timedelta
- ) -> Interval[_OrderableTimesT]: ...
- @overload
- def __sub__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __sub__(self: Interval[float], y: float) -> Interval[float]: ...
- @overload
- def __rsub__(
- self: Interval[_OrderableTimesT], y: Timedelta
- ) -> Interval[_OrderableTimesT]: ...
- @overload
- def __rsub__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __rsub__(self: Interval[float], y: float) -> Interval[float]: ...
- @overload
- def __mul__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __mul__(self: Interval[float], y: float) -> Interval[float]: ...
- @overload
- def __rmul__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __rmul__(self: Interval[float], y: float) -> Interval[float]: ...
- @overload
- def __truediv__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __truediv__(self: Interval[float], y: float) -> Interval[float]: ...
- @overload
- def __floordiv__(
- self: Interval[int], y: _OrderableScalarT
- ) -> Interval[_OrderableScalarT]: ...
- @overload
- def __floordiv__(self: Interval[float], y: float) -> Interval[float]: ...
- def overlaps(self: Interval[_OrderableT], other: Interval[_OrderableT]) -> bool: ...
-
-def intervals_to_interval_bounds(
- intervals: np.ndarray, validate_closed: bool = ...
-) -> tuple[np.ndarray, np.ndarray, str]: ...
-
-class IntervalTree(IntervalMixin):
- def __init__(
- self,
- left: np.ndarray,
- right: np.ndarray,
- closed: IntervalClosedType = ...,
- leaf_size: int = ...,
- ) -> None: ...
- @property
- def mid(self) -> np.ndarray: ...
- @property
- def length(self) -> np.ndarray: ...
- def get_indexer(self, target) -> npt.NDArray[np.intp]: ...
- def get_indexer_non_unique(
- self, target
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
- _na_count: int
- @property
- def is_overlapping(self) -> bool: ...
- @property
- def is_monotonic_increasing(self) -> bool: ...
- def clear_mapping(self) -> None: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/interval.pyx b/contrib/python/pandas/py3/pandas/_libs/interval.pyx
deleted file mode 100644
index 14b7baf7f5a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/interval.pyx
+++ /dev/null
@@ -1,650 +0,0 @@
-import numbers
-from operator import (
- le,
- lt,
-)
-
-from cpython.datetime cimport (
- PyDelta_Check,
- import_datetime,
-)
-
-import_datetime()
-
-cimport cython
-from cpython.object cimport PyObject_RichCompare
-from cython cimport Py_ssize_t
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- NPY_QUICKSORT,
- PyArray_ArgSort,
- PyArray_Take,
- float64_t,
- int64_t,
- ndarray,
- uint64_t,
-)
-
-cnp.import_array()
-
-
-from pandas._libs cimport util
-from pandas._libs.hashtable cimport Int64Vector
-from pandas._libs.tslibs.timedeltas cimport _Timedelta
-from pandas._libs.tslibs.timestamps cimport _Timestamp
-from pandas._libs.tslibs.timezones cimport tz_compare
-from pandas._libs.tslibs.util cimport (
- is_float_object,
- is_integer_object,
- is_timedelta64_object,
-)
-
-VALID_CLOSED = frozenset(["left", "right", "both", "neither"])
-
-
-cdef class IntervalMixin:
-
- @property
- def closed_left(self):
- """
- Check if the interval is closed on the left side.
-
- For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
-
- Returns
- -------
- bool
- True if the Interval is closed on the left-side.
-
- See Also
- --------
- Interval.closed_right : Check if the interval is closed on the right side.
- Interval.open_left : Boolean inverse of closed_left.
-
- Examples
- --------
- >>> iv = pd.Interval(0, 5, closed='left')
- >>> iv.closed_left
- True
-
- >>> iv = pd.Interval(0, 5, closed='right')
- >>> iv.closed_left
- False
- """
- return self.closed in ("left", "both")
-
- @property
- def closed_right(self):
- """
- Check if the interval is closed on the right side.
-
- For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
-
- Returns
- -------
- bool
-            True if the Interval is closed on the right-side.
-
- See Also
- --------
- Interval.closed_left : Check if the interval is closed on the left side.
- Interval.open_right : Boolean inverse of closed_right.
-
- Examples
- --------
- >>> iv = pd.Interval(0, 5, closed='both')
- >>> iv.closed_right
- True
-
- >>> iv = pd.Interval(0, 5, closed='left')
- >>> iv.closed_right
- False
- """
- return self.closed in ("right", "both")
-
- @property
- def open_left(self):
- """
- Check if the interval is open on the left side.
-
- For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
-
- Returns
- -------
- bool
- True if the Interval is not closed on the left-side.
-
- See Also
- --------
- Interval.open_right : Check if the interval is open on the right side.
- Interval.closed_left : Boolean inverse of open_left.
-
- Examples
- --------
- >>> iv = pd.Interval(0, 5, closed='neither')
- >>> iv.open_left
- True
-
- >>> iv = pd.Interval(0, 5, closed='both')
- >>> iv.open_left
- False
- """
- return not self.closed_left
-
- @property
- def open_right(self):
- """
- Check if the interval is open on the right side.
-
- For the meaning of `closed` and `open` see :class:`~pandas.Interval`.
-
- Returns
- -------
- bool
-            True if the Interval is not closed on the right-side.
-
- See Also
- --------
- Interval.open_left : Check if the interval is open on the left side.
- Interval.closed_right : Boolean inverse of open_right.
-
- Examples
- --------
- >>> iv = pd.Interval(0, 5, closed='left')
- >>> iv.open_right
- True
-
- >>> iv = pd.Interval(0, 5)
- >>> iv.open_right
- False
- """
- return not self.closed_right
-
- @property
- def mid(self):
- """
- Return the midpoint of the Interval.
-
- Examples
- --------
- >>> iv = pd.Interval(0, 5)
- >>> iv.mid
- 2.5
- """
- try:
- return 0.5 * (self.left + self.right)
- except TypeError:
- # datetime safe version
- return self.left + 0.5 * self.length
-
- @property
- def length(self):
- """
- Return the length of the Interval.
-
- See Also
- --------
- Interval.is_empty : Indicates if an interval contains no points.
- """
- return self.right - self.left
-
- @property
- def is_empty(self):
- """
- Indicates if an interval is empty, meaning it contains no points.
-
- Returns
- -------
- bool or ndarray
- A boolean indicating if a scalar :class:`Interval` is empty, or a
- boolean ``ndarray`` positionally indicating if an ``Interval`` in
- an :class:`~arrays.IntervalArray` or :class:`IntervalIndex` is
- empty.
-
- See Also
- --------
- Interval.length : Return the length of the Interval.
-
- Examples
- --------
- An :class:`Interval` that contains points is not empty:
-
- >>> pd.Interval(0, 1, closed='right').is_empty
- False
-
- An ``Interval`` that does not contain any points is empty:
-
- >>> pd.Interval(0, 0, closed='right').is_empty
- True
- >>> pd.Interval(0, 0, closed='left').is_empty
- True
- >>> pd.Interval(0, 0, closed='neither').is_empty
- True
-
- An ``Interval`` that contains a single point is not empty:
-
- >>> pd.Interval(0, 0, closed='both').is_empty
- False
-
- An :class:`~arrays.IntervalArray` or :class:`IntervalIndex` returns a
- boolean ``ndarray`` positionally indicating if an ``Interval`` is
- empty:
-
- >>> ivs = [pd.Interval(0, 0, closed='neither'),
- ... pd.Interval(1, 2, closed='neither')]
- >>> pd.arrays.IntervalArray(ivs).is_empty
- array([ True, False])
-
- Missing values are not considered empty:
-
- >>> ivs = [pd.Interval(0, 0, closed='neither'), np.nan]
- >>> pd.IntervalIndex(ivs).is_empty
- array([ True, False])
- """
- return (self.right == self.left) & (self.closed != "both")
-
- def _check_closed_matches(self, other, name="other"):
- """
- Check if the closed attribute of `other` matches.
-
- Note that 'left' and 'right' are considered different from 'both'.
-
- Parameters
- ----------
- other : Interval, IntervalIndex, IntervalArray
- name : str
- Name to use for 'other' in the error message.
-
- Raises
- ------
- ValueError
- When `other` is not closed exactly the same as self.
- """
- if self.closed != other.closed:
- raise ValueError(f"'{name}.closed' is {repr(other.closed)}, "
- f"expected {repr(self.closed)}.")
-
-
-cdef bint _interval_like(other):
- return (hasattr(other, "left")
- and hasattr(other, "right")
- and hasattr(other, "closed"))
-
-
-cdef class Interval(IntervalMixin):
- """
- Immutable object implementing an Interval, a bounded slice-like interval.
-
- Parameters
- ----------
- left : orderable scalar
- Left bound for the interval.
- right : orderable scalar
- Right bound for the interval.
- closed : {'right', 'left', 'both', 'neither'}, default 'right'
- Whether the interval is closed on the left-side, right-side, both or
- neither. See the Notes for more detailed explanation.
-
- See Also
- --------
- IntervalIndex : An Index of Interval objects that are all closed on the
- same side.
- cut : Convert continuous data into discrete bins (Categorical
- of Interval objects).
- qcut : Convert continuous data into bins (Categorical of Interval objects)
- based on quantiles.
- Period : Represents a period of time.
-
- Notes
- -----
-    The parameters `left` and `right` must be of the same type, they must be
-    comparable, and they must satisfy ``left <= right``.
-
- A closed interval (in mathematics denoted by square brackets) contains
- its endpoints, i.e. the closed interval ``[0, 5]`` is characterized by the
- conditions ``0 <= x <= 5``. This is what ``closed='both'`` stands for.
- An open interval (in mathematics denoted by parentheses) does not contain
- its endpoints, i.e. the open interval ``(0, 5)`` is characterized by the
- conditions ``0 < x < 5``. This is what ``closed='neither'`` stands for.
- Intervals can also be half-open or half-closed, i.e. ``[0, 5)`` is
- described by ``0 <= x < 5`` (``closed='left'``) and ``(0, 5]`` is
- described by ``0 < x <= 5`` (``closed='right'``).
-
- Examples
- --------
- It is possible to build Intervals of different types, like numeric ones:
-
- >>> iv = pd.Interval(left=0, right=5)
- >>> iv
- Interval(0, 5, closed='right')
-
- You can check if an element belongs to it, or if it contains another interval:
-
- >>> 2.5 in iv
- True
- >>> pd.Interval(left=2, right=5, closed='both') in iv
- True
-
- You can test the bounds (``closed='right'``, so ``0 < x <= 5``):
-
- >>> 0 in iv
- False
- >>> 5 in iv
- True
- >>> 0.0001 in iv
- True
-
- Calculate its length
-
- >>> iv.length
- 5
-
- You can operate with `+` and `*` over an Interval and the operation
- is applied to each of its bounds, so the result depends on the type
- of the bound elements
-
- >>> shifted_iv = iv + 3
- >>> shifted_iv
- Interval(3, 8, closed='right')
- >>> extended_iv = iv * 10.0
- >>> extended_iv
- Interval(0.0, 50.0, closed='right')
-
- To create a time interval you can use Timestamps as the bounds
-
- >>> year_2017 = pd.Interval(pd.Timestamp('2017-01-01 00:00:00'),
- ... pd.Timestamp('2018-01-01 00:00:00'),
- ... closed='left')
- >>> pd.Timestamp('2017-01-01 00:00') in year_2017
- True
- >>> year_2017.length
- Timedelta('365 days 00:00:00')
- """
- _typ = "interval"
- __array_priority__ = 1000
-
- cdef readonly object left
- """
- Left bound for the interval.
- """
-
- cdef readonly object right
- """
- Right bound for the interval.
- """
-
- cdef readonly str closed
- """
-    String describing the inclusive side of the interval.
-
- Either ``left``, ``right``, ``both`` or ``neither``.
- """
-
- def __init__(self, left, right, str closed="right"):
- # note: it is faster to just do these checks than to use a special
- # constructor (__cinit__/__new__) to avoid them
-
- self._validate_endpoint(left)
- self._validate_endpoint(right)
-
- if closed not in VALID_CLOSED:
- raise ValueError(f"invalid option for 'closed': {closed}")
- if not left <= right:
- raise ValueError("left side of interval must be <= right side")
- if (isinstance(left, _Timestamp) and
- not tz_compare(left.tzinfo, right.tzinfo)):
- # GH 18538
- raise ValueError("left and right must have the same time zone, got "
- f"{repr(left.tzinfo)}' and {repr(right.tzinfo)}")
- self.left = left
- self.right = right
- self.closed = closed
-
- def _validate_endpoint(self, endpoint):
- # GH 23013
- if not (is_integer_object(endpoint) or is_float_object(endpoint) or
- isinstance(endpoint, (_Timestamp, _Timedelta))):
- raise ValueError("Only numeric, Timestamp and Timedelta endpoints "
- "are allowed when constructing an Interval.")
-
- def __hash__(self):
- return hash((self.left, self.right, self.closed))
-
- def __contains__(self, key) -> bool:
- if _interval_like(key):
- key_closed_left = key.closed in ("left", "both")
- key_closed_right = key.closed in ("right", "both")
- if self.open_left and key_closed_left:
- left_contained = self.left < key.left
- else:
- left_contained = self.left <= key.left
- if self.open_right and key_closed_right:
- right_contained = key.right < self.right
- else:
- right_contained = key.right <= self.right
- return left_contained and right_contained
- return ((self.left < key if self.open_left else self.left <= key) and
- (key < self.right if self.open_right else key <= self.right))
-
- def __richcmp__(self, other, op: int):
- if isinstance(other, Interval):
- self_tuple = (self.left, self.right, self.closed)
- other_tuple = (other.left, other.right, other.closed)
- return PyObject_RichCompare(self_tuple, other_tuple, op)
- elif util.is_array(other):
- return np.array(
- [PyObject_RichCompare(self, x, op) for x in other],
- dtype=bool,
- )
-
- return NotImplemented
-
- def __reduce__(self):
- args = (self.left, self.right, self.closed)
- return (type(self), args)
-
- def _repr_base(self):
- left = self.left
- right = self.right
-
- # TODO: need more general formatting methodology here
- if isinstance(left, _Timestamp) and isinstance(right, _Timestamp):
- left = left._short_repr
- right = right._short_repr
-
- return left, right
-
- def __repr__(self) -> str:
-
- left, right = self._repr_base()
- name = type(self).__name__
- repr_str = f"{name}({repr(left)}, {repr(right)}, closed={repr(self.closed)})"
- return repr_str
-
- def __str__(self) -> str:
-
- left, right = self._repr_base()
- start_symbol = "[" if self.closed_left else "("
- end_symbol = "]" if self.closed_right else ")"
- return f"{start_symbol}{left}, {right}{end_symbol}"
-
- def __add__(self, y):
- if (
- isinstance(y, numbers.Number)
- or PyDelta_Check(y)
- or is_timedelta64_object(y)
- ):
- return Interval(self.left + y, self.right + y, closed=self.closed)
- elif (
- # __radd__ pattern
- # TODO(cython3): remove this
- isinstance(y, Interval)
- and (
- isinstance(self, numbers.Number)
- or PyDelta_Check(self)
- or is_timedelta64_object(self)
- )
- ):
- return Interval(y.left + self, y.right + self, closed=y.closed)
- return NotImplemented
-
- def __radd__(self, other):
- if (
- isinstance(other, numbers.Number)
- or PyDelta_Check(other)
- or is_timedelta64_object(other)
- ):
- return Interval(self.left + other, self.right + other, closed=self.closed)
- return NotImplemented
-
- def __sub__(self, y):
- if (
- isinstance(y, numbers.Number)
- or PyDelta_Check(y)
- or is_timedelta64_object(y)
- ):
- return Interval(self.left - y, self.right - y, closed=self.closed)
- return NotImplemented
-
- def __mul__(self, y):
- if isinstance(y, numbers.Number):
- return Interval(self.left * y, self.right * y, closed=self.closed)
- elif isinstance(y, Interval) and isinstance(self, numbers.Number):
-            # __rmul__ pattern
- # TODO(cython3): remove this
- return Interval(y.left * self, y.right * self, closed=y.closed)
- return NotImplemented
-
- def __rmul__(self, other):
- if isinstance(other, numbers.Number):
- return Interval(self.left * other, self.right * other, closed=self.closed)
- return NotImplemented
-
- def __truediv__(self, y):
- if isinstance(y, numbers.Number):
- return Interval(self.left / y, self.right / y, closed=self.closed)
- return NotImplemented
-
- def __floordiv__(self, y):
- if isinstance(y, numbers.Number):
- return Interval(
- self.left // y, self.right // y, closed=self.closed)
- return NotImplemented
-
- def overlaps(self, other):
- """
- Check whether two Interval objects overlap.
-
- Two intervals overlap if they share a common point, including closed
- endpoints. Intervals that only have an open endpoint in common do not
- overlap.
-
- Parameters
- ----------
- other : Interval
- Interval to check against for an overlap.
-
- Returns
- -------
- bool
- True if the two intervals overlap.
-
- See Also
- --------
- IntervalArray.overlaps : The corresponding method for IntervalArray.
- IntervalIndex.overlaps : The corresponding method for IntervalIndex.
-
- Examples
- --------
- >>> i1 = pd.Interval(0, 2)
- >>> i2 = pd.Interval(1, 3)
- >>> i1.overlaps(i2)
- True
- >>> i3 = pd.Interval(4, 5)
- >>> i1.overlaps(i3)
- False
-
- Intervals that share closed endpoints overlap:
-
- >>> i4 = pd.Interval(0, 1, closed='both')
- >>> i5 = pd.Interval(1, 2, closed='both')
- >>> i4.overlaps(i5)
- True
-
- Intervals that only have an open endpoint in common do not overlap:
-
- >>> i6 = pd.Interval(1, 2, closed='neither')
- >>> i4.overlaps(i6)
- False
- """
- if not isinstance(other, Interval):
- raise TypeError("`other` must be an Interval, "
- f"got {type(other).__name__}")
-
- # equality is okay if both endpoints are closed (overlap at a point)
- op1 = le if (self.closed_left and other.closed_right) else lt
- op2 = le if (other.closed_left and self.closed_right) else lt
-
-        # overlaps is equivalent to the negation of the two intervals being disjoint:
-        # disjoint = (A.left > B.right) or (B.left > A.right)
-        # (simplifying the negation allows this to be done in fewer operations)
- return op1(self.left, other.right) and op2(other.left, self.right)
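
The endpoint rule above can be restated in plain Python for experimentation. The helper below is a hypothetical sketch of the same le/lt selection, not a pandas API:

from operator import le, lt

def overlaps(a_left, a_right, b_left, b_right, a_closed="right", b_closed="right"):
    a_cl, a_cr = a_closed in ("left", "both"), a_closed in ("right", "both")
    b_cl, b_cr = b_closed in ("left", "both"), b_closed in ("right", "both")
    op1 = le if (a_cl and b_cr) else lt   # touching endpoints count only if both are closed
    op2 = le if (b_cl and a_cr) else lt
    return op1(a_left, b_right) and op2(b_left, a_right)

print(overlaps(0, 2, 1, 3))                                     # True
print(overlaps(0, 1, 1, 2, a_closed="both", b_closed="both"))   # True: shared closed endpoint
print(overlaps(0, 1, 1, 2))                                     # False: (0, 1] and (1, 2] share no point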
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def intervals_to_interval_bounds(ndarray intervals, bint validate_closed=True):
- """
- Parameters
- ----------
- intervals : ndarray
- Object array of Intervals / nulls.
-
- validate_closed: bool, default True
- Boolean indicating if all intervals must be closed on the same side.
- Mismatching closed will raise if True, else return None for closed.
-
- Returns
- -------
- tuple of
- left : ndarray
- right : ndarray
- closed: str
- """
- cdef:
- object closed = None, interval
- Py_ssize_t i, n = len(intervals)
- ndarray left, right
- bint seen_closed = False
-
- left = np.empty(n, dtype=intervals.dtype)
- right = np.empty(n, dtype=intervals.dtype)
-
- for i in range(n):
- interval = intervals[i]
- if interval is None or util.is_nan(interval):
- left[i] = np.nan
- right[i] = np.nan
- continue
-
- if not isinstance(interval, Interval):
- raise TypeError(f"type {type(interval)} with value "
- f"{interval} is not an interval")
-
- left[i] = interval.left
- right[i] = interval.right
- if not seen_closed:
- seen_closed = True
- closed = interval.closed
- elif closed != interval.closed:
- closed = None
- if validate_closed:
- raise ValueError("intervals must all be closed on the same side")
-
- return left, right, closed
-
-
-include "intervaltree.pxi"
diff --git a/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi b/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi
deleted file mode 100644
index 025fb8f8b68..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi
+++ /dev/null
@@ -1,2074 +0,0 @@
-"""
-Template for intervaltree
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-from pandas._libs.algos import is_monotonic
-
-ctypedef fused int_scalar_t:
- int64_t
- float64_t
-
-ctypedef fused uint_scalar_t:
- uint64_t
- float64_t
-
-ctypedef fused scalar_t:
- int_scalar_t
- uint_scalar_t
-
-# ----------------------------------------------------------------------
-# IntervalTree
-# ----------------------------------------------------------------------
-
-cdef class IntervalTree(IntervalMixin):
- """A centered interval tree
-
-    Based on the algorithm described on Wikipedia:
-    https://en.wikipedia.org/wiki/Interval_tree
-
-    We emulate the IndexEngine interface.
- """
- cdef readonly:
- ndarray left, right
- IntervalNode root
- object dtype
- str closed
- object _is_overlapping, _left_sorter, _right_sorter
- Py_ssize_t _na_count
-
- def __init__(self, left, right, closed='right', leaf_size=100):
- """
- Parameters
- ----------
- left, right : np.ndarray[ndim=1]
- Left and right bounds for each interval. Assumed to contain no
- NaNs.
- closed : {'left', 'right', 'both', 'neither'}, optional
- Whether the intervals are closed on the left-side, right-side, both
- or neither. Defaults to 'right'.
- leaf_size : int, optional
- Parameter that controls when the tree switches from creating nodes
- to brute-force search. Tune this parameter to optimize query
- performance.
- """
- if closed not in ['left', 'right', 'both', 'neither']:
- raise ValueError("invalid option for 'closed': %s" % closed)
-
- left = np.asarray(left)
- right = np.asarray(right)
- self.dtype = np.result_type(left, right)
- self.left = np.asarray(left, dtype=self.dtype)
- self.right = np.asarray(right, dtype=self.dtype)
-
- indices = np.arange(len(left), dtype='int64')
-
- self.closed = closed
-
- # GH 23352: ensure no nan in nodes
- mask = ~np.isnan(self.left)
- self._na_count = len(mask) - mask.sum()
- self.left = self.left[mask]
- self.right = self.right[mask]
- indices = indices[mask]
-
- node_cls = NODE_CLASSES[str(self.dtype), closed]
- self.root = node_cls(self.left, self.right, indices, leaf_size)
-
- @property
- def left_sorter(self) -> np.ndarray:
- """How to sort the left labels; this is used for binary search
- """
- if self._left_sorter is None:
- values = [self.right, self.left]
- self._left_sorter = np.lexsort(values)
- return self._left_sorter
-
- @property
- def right_sorter(self) -> np.ndarray:
- """How to sort the right labels
- """
- if self._right_sorter is None:
- self._right_sorter = np.argsort(self.right)
- return self._right_sorter
-
- @property
- def is_overlapping(self) -> bool:
- """
- Determine if the IntervalTree contains overlapping intervals.
- Cached as self._is_overlapping.
- """
- if self._is_overlapping is not None:
- return self._is_overlapping
-
- # <= when both sides closed since endpoints can overlap
- op = le if self.closed == 'both' else lt
-
- # overlap if start of current interval < end of previous interval
- # (current and previous in terms of sorted order by left/start side)
- current = self.left[self.left_sorter[1:]]
- previous = self.right[self.left_sorter[:-1]]
- self._is_overlapping = bool(op(current, previous).any())
-
- return self._is_overlapping
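
Once the intervals are ordered by left endpoint, the check above reduces to a single vectorized comparison. A short NumPy restatement (illustrative only; strict < matches every closed value except 'both', which uses <=):

import numpy as np

left = np.array([0.0, 1.0, 5.0])
right = np.array([2.0, 3.0, 6.0])
order = np.lexsort([right, left])   # sort by left endpoint, ties broken by right
print(bool((left[order][1:] < right[order][:-1]).any()))   # True: the first two intervals overlap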
-
- @property
- def is_monotonic_increasing(self) -> bool:
- """
- Return True if the IntervalTree is monotonic increasing (only equal or
- increasing values), else False
- """
- if self._na_count > 0:
- return False
-
- sort_order = self.left_sorter
- return is_monotonic(sort_order, False)[0]
-
- def get_indexer(self, scalar_t[:] target) -> np.ndarray:
- """Return the positions corresponding to unique intervals that overlap
- with the given array of scalar targets.
- """
-
- # TODO: write get_indexer_intervals
- cdef:
- Py_ssize_t old_len
- Py_ssize_t i
- Int64Vector result
-
- result = Int64Vector()
- old_len = 0
- for i in range(len(target)):
- try:
- self.root.query(result, target[i])
- except OverflowError:
- # overflow -> no match, which is already handled below
- pass
-
- if result.data.n == old_len:
- result.append(-1)
- elif result.data.n > old_len + 1:
- raise KeyError(
- 'indexer does not intersect a unique set of intervals')
- old_len = result.data.n
- return result.to_array().astype('intp')
-
- def get_indexer_non_unique(self, scalar_t[:] target):
- """Return the positions corresponding to intervals that overlap with
- the given array of scalar targets. Non-unique positions are repeated.
- """
- cdef:
- Py_ssize_t old_len
- Py_ssize_t i
- Int64Vector result, missing
-
- result = Int64Vector()
- missing = Int64Vector()
- old_len = 0
- for i in range(len(target)):
- try:
- self.root.query(result, target[i])
- except OverflowError:
- # overflow -> no match, which is already handled below
- pass
-
- if result.data.n == old_len:
- result.append(-1)
- missing.append(i)
- old_len = result.data.n
- return (result.to_array().astype('intp'),
- missing.to_array().astype('intp'))
-
- def __repr__(self) -> str:
- return ('<IntervalTree[{dtype},{closed}]: '
- '{n_elements} elements>'.format(
- dtype=self.dtype, closed=self.closed,
- n_elements=self.root.n_elements))
-
- # compat with IndexEngine interface
- def clear_mapping(self) -> None:
- pass
-
-
-cdef take(ndarray source, ndarray indices):
- """Take the given positions from a 1D ndarray
- """
- return PyArray_Take(source, indices, 0)
-
-
-cdef sort_values_and_indices(all_values, all_indices, subset):
- indices = take(all_indices, subset)
- values = take(all_values, subset)
- sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT)
- sorted_values = take(values, sorter)
- sorted_indices = take(indices, sorter)
- return sorted_values, sorted_indices
-
-
-# ----------------------------------------------------------------------
-# Nodes
-# ----------------------------------------------------------------------
-
-@cython.internal
-cdef class IntervalNode:
- cdef readonly:
- int64_t n_elements, n_center, leaf_size
- bint is_leaf_node
-
- def __repr__(self) -> str:
- if self.is_leaf_node:
- return (
- f"<{type(self).__name__}: {self.n_elements} elements (terminal)>"
- )
- else:
- n_left = self.left_node.n_elements
- n_right = self.right_node.n_elements
- n_center = self.n_elements - n_left - n_right
- return (
- f"<{type(self).__name__}: "
- f"pivot {self.pivot}, {self.n_elements} elements "
- f"({n_left} left, {n_right} right, {n_center} overlapping)>"
- )
-
- def counts(self):
- """
-        Inspect counts on this node;
-        useful for debugging purposes.
- """
- if self.is_leaf_node:
- return self.n_elements
- else:
- m = len(self.center_left_values)
- l = self.left_node.counts()
- r = self.right_node.counts()
- return (m, (l, r))
-
-
-# we need specialized nodes and leaves to optimize for different dtype and
-# closed values
-
-NODE_CLASSES = {}
-
-
-@cython.internal
-cdef class Float64ClosedLeftIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Float64ClosedLeftIntervalNode left_node, right_node
- float64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- float64_t min_left, max_right
- float64_t pivot
-
- def __init__(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up the query by skipping
-        # queries on sub-nodes. If this node has size 0, the query is cheap,
- # so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(float64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] <= self.pivot:
- left_ind.append(i)
- elif self.pivot < left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Float64ClosedLeftIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- float64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] <= point < self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] <= point:
- break
- result.append(indices[i])
- if point < self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point < values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left <= point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['float64',
- 'left'] = Float64ClosedLeftIntervalNode
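
For orientation, every node class below follows the same pivot-classification scheme as the one above. A compact pure-Python sketch of that split for the closed='left' variant, with a hypothetical helper name:

def classify(left, right, pivot):
    # closed='left' intervals: [l, r) lies entirely left of the pivot when r <= pivot,
    # entirely right of it when pivot < l, and straddles it otherwise
    out = {"left": [], "right": [], "center": []}
    for i, (l, r) in enumerate(zip(left, right)):
        if r <= pivot:
            out["left"].append(i)
        elif pivot < l:
            out["right"].append(i)
        else:
            out["center"].append(i)
    return out

print(classify([0, 2, 5], [1, 6, 7], pivot=3))
# {'left': [0], 'right': [2], 'center': [1]}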
-
-
-@cython.internal
-cdef class Float64ClosedRightIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Float64ClosedRightIntervalNode left_node, right_node
- float64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- float64_t min_left, max_right
- float64_t pivot
-
- def __init__(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up the query by skipping
-        # queries on sub-nodes. If this node has size 0, the query is cheap,
- # so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(float64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] < self.pivot:
- left_ind.append(i)
- elif self.pivot <= left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Float64ClosedRightIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- float64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] < point <= self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] < point:
- break
- result.append(indices[i])
- if point <= self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point <= values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left < point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['float64',
- 'right'] = Float64ClosedRightIntervalNode
-
-
-@cython.internal
-cdef class Float64ClosedBothIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Float64ClosedBothIntervalNode left_node, right_node
- float64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- float64_t min_left, max_right
- float64_t pivot
-
- def __init__(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up the query by skipping
-        # queries on sub-nodes. If this node has size 0, the query is cheap,
- # so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(float64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] < self.pivot:
- left_ind.append(i)
- elif self.pivot < left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Float64ClosedBothIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- float64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] <= point <= self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] <= point:
- break
- result.append(indices[i])
- if point <= self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point <= values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left <= point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['float64',
- 'both'] = Float64ClosedBothIntervalNode
-
-
-@cython.internal
-cdef class Float64ClosedNeitherIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Float64ClosedNeitherIntervalNode left_node, right_node
- float64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- float64_t min_left, max_right
- float64_t pivot
-
- def __init__(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up the query by skipping
-        # queries on sub-nodes. If this node has size 0, the query is cheap,
- # so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(float64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, float64_t[:] left, float64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] <= self.pivot:
- left_ind.append(i)
- elif self.pivot <= left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[float64_t, ndim=1] left,
- ndarray[float64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Float64ClosedNeitherIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- float64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] < point < self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] < point:
- break
- result.append(indices[i])
- if point < self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point < values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left < point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['float64',
- 'neither'] = Float64ClosedNeitherIntervalNode
-
-
-@cython.internal
-cdef class Int64ClosedLeftIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Int64ClosedLeftIntervalNode left_node, right_node
- int64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- int64_t min_left, max_right
- int64_t pivot
-
- def __init__(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(int64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] <= self.pivot:
- left_ind.append(i)
- elif self.pivot < left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Int64ClosedLeftIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, int_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- int64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] <= point < self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] <= point:
- break
- result.append(indices[i])
- if point < self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point < values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left <= point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['int64',
- 'left'] = Int64ClosedLeftIntervalNode
-
-
-@cython.internal
-cdef class Int64ClosedRightIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Int64ClosedRightIntervalNode left_node, right_node
- int64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- int64_t min_left, max_right
- int64_t pivot
-
- def __init__(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(int64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] < self.pivot:
- left_ind.append(i)
- elif self.pivot <= left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Int64ClosedRightIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, int_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- int64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] < point <= self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] < point:
- break
- result.append(indices[i])
- if point <= self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point <= values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left < point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['int64',
- 'right'] = Int64ClosedRightIntervalNode
-
-
-@cython.internal
-cdef class Int64ClosedBothIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Int64ClosedBothIntervalNode left_node, right_node
- int64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- int64_t min_left, max_right
- int64_t pivot
-
- def __init__(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(int64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] < self.pivot:
- left_ind.append(i)
- elif self.pivot < left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Int64ClosedBothIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, int_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- int64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] <= point <= self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] <= point:
- break
- result.append(indices[i])
- if point <= self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point <= values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left <= point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['int64',
- 'both'] = Int64ClosedBothIntervalNode
-
-
-@cython.internal
-cdef class Int64ClosedNeitherIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Int64ClosedNeitherIntervalNode left_node, right_node
- int64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- int64_t min_left, max_right
- int64_t pivot
-
- def __init__(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(int64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, int64_t[:] left, int64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] <= self.pivot:
- left_ind.append(i)
- elif self.pivot <= left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[int64_t, ndim=1] left,
- ndarray[int64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Int64ClosedNeitherIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, int_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- int64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] < point < self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] < point:
- break
- result.append(indices[i])
- if point < self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point < values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left < point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['int64',
- 'neither'] = Int64ClosedNeitherIntervalNode
-
-
-@cython.internal
-cdef class Uint64ClosedLeftIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Uint64ClosedLeftIntervalNode left_node, right_node
- uint64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- uint64_t min_left, max_right
- uint64_t pivot
-
- def __init__(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(uint64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] <= self.pivot:
- left_ind.append(i)
- elif self.pivot < left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Uint64ClosedLeftIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, uint_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- uint64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] <= point < self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] <= point:
- break
- result.append(indices[i])
- if point < self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point < values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left <= point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['uint64',
- 'left'] = Uint64ClosedLeftIntervalNode
-
-
-@cython.internal
-cdef class Uint64ClosedRightIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Uint64ClosedRightIntervalNode left_node, right_node
- uint64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- uint64_t min_left, max_right
- uint64_t pivot
-
- def __init__(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(uint64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] < self.pivot:
- left_ind.append(i)
- elif self.pivot <= left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Uint64ClosedRightIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, uint_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- uint64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] < point <= self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] < point:
- break
- result.append(indices[i])
- if point <= self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point <= values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left < point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['uint64',
- 'right'] = Uint64ClosedRightIntervalNode
-
-
-@cython.internal
-cdef class Uint64ClosedBothIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Uint64ClosedBothIntervalNode left_node, right_node
- uint64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- uint64_t min_left, max_right
- uint64_t pivot
-
- def __init__(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(uint64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] < self.pivot:
- left_ind.append(i)
- elif self.pivot < left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Uint64ClosedBothIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, uint_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- uint64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] <= point <= self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] <= point:
- break
- result.append(indices[i])
- if point <= self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point <= values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left <= point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['uint64',
- 'both'] = Uint64ClosedBothIntervalNode
-
-
-@cython.internal
-cdef class Uint64ClosedNeitherIntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- Uint64ClosedNeitherIntervalNode left_node, right_node
- uint64_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- uint64_t min_left, max_right
- uint64_t pivot
-
- def __init__(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast(uint64_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, uint64_t[:] left, uint64_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] <= self.pivot:
- left_ind.append(i)
- elif self.pivot <= left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[uint64_t, ndim=1] left,
- ndarray[uint64_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return Uint64ClosedNeitherIntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, uint_scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- uint64_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] < point < self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] < point:
- break
- result.append(indices[i])
- if point < self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point < values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left < point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['uint64',
- 'neither'] = Uint64ClosedNeitherIntervalNode
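
The twelve generated node classes above differ only in dtype and in which boundary comparisons are inclusive; the query logic is the same in each. As a rough orientation, here is a minimal pure-Python sketch of that centered-interval-tree query (illustrative only: the ToyNode name is made up, closed='both' comparisons are used, and the min_left/max_right pruning of child queries is omitted):

import numpy as np

class ToyNode:
    """Minimal centered interval tree node, closed='both' semantics."""
    def __init__(self, left, right, indices, leaf_size=2):
        self.left, self.right, self.indices = left, right, indices
        self.is_leaf = len(left) <= leaf_size
        if self.is_leaf:
            return
        self.pivot = np.median(left / 2 + right / 2)
        to_left = right < self.pivot            # entirely left of the pivot
        to_right = self.pivot < left            # entirely right of the pivot
        center = ~(to_left | to_right)          # overlaps the pivot
        self.left_node = ToyNode(left[to_left], right[to_left],
                                 indices[to_left], leaf_size)
        self.right_node = ToyNode(left[to_right], right[to_right],
                                  indices[to_right], leaf_size)
        by_l, by_r = np.argsort(left[center]), np.argsort(right[center])
        self.cl_values, self.cl_idx = left[center][by_l], indices[center][by_l]
        self.cr_values, self.cr_idx = right[center][by_r], indices[center][by_r]

    def query(self, point, out):
        if self.is_leaf:                        # below leaf_size: linear scan
            for lo, hi, i in zip(self.left, self.right, self.indices):
                if lo <= point <= hi:
                    out.append(i)
        elif point < self.pivot:
            for v, i in zip(self.cl_values, self.cl_idx):
                if not v <= point:              # center sorted by left endpoint
                    break
                out.append(i)
            self.left_node.query(point, out)
        elif point > self.pivot:
            for v, i in zip(self.cr_values[::-1], self.cr_idx[::-1]):
                if not point <= v:              # center sorted by right endpoint
                    break
                out.append(i)
            self.right_node.query(point, out)
        else:                                   # point == pivot: every center
            out.extend(self.cl_idx)             # interval contains it

out = []
ToyNode(np.array([0., 2., 5.]), np.array([3., 4., 6.]), np.arange(3)).query(2.5, out)
# out == [0, 1]: the point falls inside [0, 3] and [2, 4]

The real classes behave the same way, with leaf_size (default 100 in IntervalTree) deciding where the tree gives up on recursion and switches to the linear scan.
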
diff --git a/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in b/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in
deleted file mode 100644
index 67fee7c5fba..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/intervaltree.pxi.in
+++ /dev/null
@@ -1,434 +0,0 @@
-"""
-Template for intervaltree
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-from pandas._libs.algos import is_monotonic
-
-ctypedef fused int_scalar_t:
- int64_t
- float64_t
-
-ctypedef fused uint_scalar_t:
- uint64_t
- float64_t
-
-ctypedef fused scalar_t:
- int_scalar_t
- uint_scalar_t
-
-# ----------------------------------------------------------------------
-# IntervalTree
-# ----------------------------------------------------------------------
-
-cdef class IntervalTree(IntervalMixin):
- """A centered interval tree
-
-    Based on the algorithm described on Wikipedia:
- https://en.wikipedia.org/wiki/Interval_tree
-
-    We are emulating the IndexEngine interface.
- """
- cdef readonly:
- ndarray left, right
- IntervalNode root
- object dtype
- str closed
- object _is_overlapping, _left_sorter, _right_sorter
- Py_ssize_t _na_count
-
- def __init__(self, left, right, closed='right', leaf_size=100):
- """
- Parameters
- ----------
- left, right : np.ndarray[ndim=1]
- Left and right bounds for each interval. Assumed to contain no
- NaNs.
- closed : {'left', 'right', 'both', 'neither'}, optional
- Whether the intervals are closed on the left-side, right-side, both
- or neither. Defaults to 'right'.
- leaf_size : int, optional
- Parameter that controls when the tree switches from creating nodes
- to brute-force search. Tune this parameter to optimize query
- performance.
- """
- if closed not in ['left', 'right', 'both', 'neither']:
- raise ValueError("invalid option for 'closed': %s" % closed)
-
- left = np.asarray(left)
- right = np.asarray(right)
- self.dtype = np.result_type(left, right)
- self.left = np.asarray(left, dtype=self.dtype)
- self.right = np.asarray(right, dtype=self.dtype)
-
- indices = np.arange(len(left), dtype='int64')
-
- self.closed = closed
-
- # GH 23352: ensure no nan in nodes
- mask = ~np.isnan(self.left)
- self._na_count = len(mask) - mask.sum()
- self.left = self.left[mask]
- self.right = self.right[mask]
- indices = indices[mask]
-
- node_cls = NODE_CLASSES[str(self.dtype), closed]
- self.root = node_cls(self.left, self.right, indices, leaf_size)
-
- @property
- def left_sorter(self) -> np.ndarray:
- """How to sort the left labels; this is used for binary search
- """
- if self._left_sorter is None:
- values = [self.right, self.left]
- self._left_sorter = np.lexsort(values)
- return self._left_sorter
-
- @property
- def right_sorter(self) -> np.ndarray:
- """How to sort the right labels
- """
- if self._right_sorter is None:
- self._right_sorter = np.argsort(self.right)
- return self._right_sorter
-
- @property
- def is_overlapping(self) -> bool:
- """
- Determine if the IntervalTree contains overlapping intervals.
- Cached as self._is_overlapping.
- """
- if self._is_overlapping is not None:
- return self._is_overlapping
-
- # <= when both sides closed since endpoints can overlap
- op = le if self.closed == 'both' else lt
-
- # overlap if start of current interval < end of previous interval
- # (current and previous in terms of sorted order by left/start side)
- current = self.left[self.left_sorter[1:]]
- previous = self.right[self.left_sorter[:-1]]
- self._is_overlapping = bool(op(current, previous).any())
-
- return self._is_overlapping
-
- @property
- def is_monotonic_increasing(self) -> bool:
- """
- Return True if the IntervalTree is monotonic increasing (only equal or
- increasing values), else False
- """
- if self._na_count > 0:
- return False
-
- sort_order = self.left_sorter
- return is_monotonic(sort_order, False)[0]
-
- def get_indexer(self, scalar_t[:] target) -> np.ndarray:
- """Return the positions corresponding to unique intervals that overlap
- with the given array of scalar targets.
- """
-
- # TODO: write get_indexer_intervals
- cdef:
- Py_ssize_t old_len
- Py_ssize_t i
- Int64Vector result
-
- result = Int64Vector()
- old_len = 0
- for i in range(len(target)):
- try:
- self.root.query(result, target[i])
- except OverflowError:
- # overflow -> no match, which is already handled below
- pass
-
- if result.data.n == old_len:
- result.append(-1)
- elif result.data.n > old_len + 1:
- raise KeyError(
- 'indexer does not intersect a unique set of intervals')
- old_len = result.data.n
- return result.to_array().astype('intp')
-
- def get_indexer_non_unique(self, scalar_t[:] target):
- """Return the positions corresponding to intervals that overlap with
- the given array of scalar targets. Non-unique positions are repeated.
- """
- cdef:
- Py_ssize_t old_len
- Py_ssize_t i
- Int64Vector result, missing
-
- result = Int64Vector()
- missing = Int64Vector()
- old_len = 0
- for i in range(len(target)):
- try:
- self.root.query(result, target[i])
- except OverflowError:
- # overflow -> no match, which is already handled below
- pass
-
- if result.data.n == old_len:
- result.append(-1)
- missing.append(i)
- old_len = result.data.n
- return (result.to_array().astype('intp'),
- missing.to_array().astype('intp'))
-
- def __repr__(self) -> str:
- return ('<IntervalTree[{dtype},{closed}]: '
- '{n_elements} elements>'.format(
- dtype=self.dtype, closed=self.closed,
- n_elements=self.root.n_elements))
-
- # compat with IndexEngine interface
- def clear_mapping(self) -> None:
- pass
-
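
The get_indexer / get_indexer_non_unique methods above back scalar lookups on IntervalIndex. A small illustration of their contract via the public API (assuming pandas is importable; the expected outputs are derived from the code above, not from a captured run):

import pandas as pd

idx = pd.IntervalIndex.from_arrays([0, 2, 5], [1, 3, 6], closed="right")

# Each target overlapping exactly one interval maps to its position;
# targets overlapping nothing map to -1.
idx.get_indexer([0.5, 2.5, 4.0])
# expected: array([ 0,  1, -1])

# A target hitting several overlapping intervals is an error for
# get_indexer (the engine raises the KeyError shown above; the public
# method may surface it differently), while get_indexer_non_unique
# returns every match plus the target positions that missed entirely.
overlapping = pd.IntervalIndex.from_arrays([0, 0], [2, 3], closed="right")
overlapping.get_indexer_non_unique([1.0, 10.0])
# expected: (array([ 0,  1, -1]), array([1]))
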
-
-cdef take(ndarray source, ndarray indices):
- """Take the given positions from a 1D ndarray
- """
- return PyArray_Take(source, indices, 0)
-
-
-cdef sort_values_and_indices(all_values, all_indices, subset):
- indices = take(all_indices, subset)
- values = take(all_values, subset)
- sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT)
- sorted_values = take(values, sorter)
- sorted_indices = take(indices, sorter)
- return sorted_values, sorted_indices
-
-
-# ----------------------------------------------------------------------
-# Nodes
-# ----------------------------------------------------------------------
-
-@cython.internal
-cdef class IntervalNode:
- cdef readonly:
- int64_t n_elements, n_center, leaf_size
- bint is_leaf_node
-
- def __repr__(self) -> str:
- if self.is_leaf_node:
- return (
- f"<{type(self).__name__}: {self.n_elements} elements (terminal)>"
- )
- else:
- n_left = self.left_node.n_elements
- n_right = self.right_node.n_elements
- n_center = self.n_elements - n_left - n_right
- return (
- f"<{type(self).__name__}: "
- f"pivot {self.pivot}, {self.n_elements} elements "
- f"({n_left} left, {n_right} right, {n_center} overlapping)>"
- )
-
- def counts(self):
- """
-        Inspect counts on this node;
-        useful for debugging purposes.
- """
- if self.is_leaf_node:
- return self.n_elements
- else:
- m = len(self.center_left_values)
- l = self.left_node.counts()
- r = self.right_node.counts()
- return (m, (l, r))
-
-
-# we need specialized nodes and leaves to optimize for different dtype and
-# closed values
-
-{{py:
-
-nodes = []
-for dtype in ['float64', 'int64', 'uint64']:
- for closed, cmp_left, cmp_right in [
- ('left', '<=', '<'),
- ('right', '<', '<='),
- ('both', '<=', '<='),
- ('neither', '<', '<')]:
- cmp_left_converse = '<' if cmp_left == '<=' else '<='
- cmp_right_converse = '<' if cmp_right == '<=' else '<='
- if dtype.startswith('int'):
- fused_prefix = 'int_'
- elif dtype.startswith('uint'):
- fused_prefix = 'uint_'
- elif dtype.startswith('float'):
- fused_prefix = ''
- nodes.append((dtype, dtype.title(),
- closed, closed.title(),
- cmp_left,
- cmp_right,
- cmp_left_converse,
- cmp_right_converse,
- fused_prefix))
-
-}}
-
-NODE_CLASSES = {}
-
-{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right,
- cmp_left_converse, cmp_right_converse, fused_prefix in nodes}}
-
-
-@cython.internal
-cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode(IntervalNode):
- """Non-terminal node for an IntervalTree
-
- Categorizes intervals by those that fall to the left, those that fall to
- the right, and those that overlap with the pivot.
- """
- cdef readonly:
- {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node
- {{dtype}}_t[:] center_left_values, center_right_values, left, right
- int64_t[:] center_left_indices, center_right_indices, indices
- {{dtype}}_t min_left, max_right
- {{dtype}}_t pivot
-
- def __init__(self,
- ndarray[{{dtype}}_t, ndim=1] left,
- ndarray[{{dtype}}_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- int64_t leaf_size):
-
- self.n_elements = len(left)
- self.leaf_size = leaf_size
-
-        # min_left and max_right are used to speed up queries by skipping
-        # queries on sub-nodes that cannot contain the point. If this node
-        # has size 0, queries are cheap, so these values don't matter.
- if left.size > 0:
- self.min_left = left.min()
- self.max_right = right.max()
- else:
- self.min_left = 0
- self.max_right = 0
-
- if self.n_elements <= leaf_size:
- # make this a terminal (leaf) node
- self.is_leaf_node = True
- self.left = left
- self.right = right
- self.indices = indices
- self.n_center = 0
- else:
- # calculate a pivot so we can create child nodes
- self.is_leaf_node = False
- self.pivot = np.median(left / 2 + right / 2)
- if np.isinf(self.pivot):
- self.pivot = cython.cast({{dtype}}_t, 0)
- if self.pivot > np.max(right):
- self.pivot = np.max(left)
- if self.pivot < np.min(left):
- self.pivot = np.min(right)
-
- left_set, right_set, center_set = self.classify_intervals(
- left, right)
-
- self.left_node = self.new_child_node(left, right,
- indices, left_set)
- self.right_node = self.new_child_node(left, right,
- indices, right_set)
-
- self.center_left_values, self.center_left_indices = \
- sort_values_and_indices(left, indices, center_set)
- self.center_right_values, self.center_right_indices = \
- sort_values_and_indices(right, indices, center_set)
- self.n_center = len(self.center_left_indices)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right):
- """Classify the given intervals based upon whether they fall to the
- left, right, or overlap with this node's pivot.
- """
- cdef:
- Int64Vector left_ind, right_ind, overlapping_ind
- Py_ssize_t i
-
- left_ind = Int64Vector()
- right_ind = Int64Vector()
- overlapping_ind = Int64Vector()
-
- for i in range(self.n_elements):
- if right[i] {{cmp_right_converse}} self.pivot:
- left_ind.append(i)
- elif self.pivot {{cmp_left_converse}} left[i]:
- right_ind.append(i)
- else:
- overlapping_ind.append(i)
-
- return (left_ind.to_array(),
- right_ind.to_array(),
- overlapping_ind.to_array())
-
- cdef new_child_node(self,
- ndarray[{{dtype}}_t, ndim=1] left,
- ndarray[{{dtype}}_t, ndim=1] right,
- ndarray[int64_t, ndim=1] indices,
- ndarray[int64_t, ndim=1] subset):
- """Create a new child node.
- """
- left = take(left, subset)
- right = take(right, subset)
- indices = take(indices, subset)
- return {{dtype_title}}Closed{{closed_title}}IntervalNode(
- left, right, indices, self.leaf_size)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- @cython.initializedcheck(False)
- cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point):
- """Recursively query this node and its sub-nodes for intervals that
- overlap with the query point.
- """
- cdef:
- int64_t[:] indices
- {{dtype}}_t[:] values
- Py_ssize_t i
-
- if self.is_leaf_node:
- # Once we get down to a certain size, it doesn't make sense to
- # continue the binary tree structure. Instead, we use linear
- # search.
- for i in range(self.n_elements):
- if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]:
- result.append(self.indices[i])
- else:
- # There are child nodes. Based on comparing our query to the pivot,
- # look at the center values, then go to the relevant child.
- if point < self.pivot:
- values = self.center_left_values
- indices = self.center_left_indices
- for i in range(self.n_center):
- if not values[i] {{cmp_left}} point:
- break
- result.append(indices[i])
- if point {{cmp_right}} self.left_node.max_right:
- self.left_node.query(result, point)
- elif point > self.pivot:
- values = self.center_right_values
- indices = self.center_right_indices
- for i in range(self.n_center - 1, -1, -1):
- if not point {{cmp_right}} values[i]:
- break
- result.append(indices[i])
- if self.right_node.min_left {{cmp_left}} point:
- self.right_node.query(result, point)
- else:
- result.extend(self.center_left_indices)
-
-
-NODE_CLASSES['{{dtype}}',
- '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode
-
-{{endfor}}
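
The Tempita loop above is what expands into the twelve concrete node classes seen in the generated .pxi earlier: one per (dtype, closed) combination, with the inclusive/exclusive comparisons and their converses substituted in. Mirroring the loop in plain Python makes the matrix easy to see (illustrative only):

combos = []
for dtype in ["float64", "int64", "uint64"]:
    for closed, cmp_left, cmp_right in [("left", "<=", "<"),
                                        ("right", "<", "<="),
                                        ("both", "<=", "<="),
                                        ("neither", "<", "<")]:
        combos.append((f"{dtype.title()}Closed{closed.title()}IntervalNode",
                       f"left {cmp_left} point {cmp_right} right"))

for name, membership in combos:
    print(f"{name:<40} contains a point when {membership}")
# e.g. Float64ClosedBothIntervalNode contains a point when left <= point <= right
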
diff --git a/contrib/python/pandas/py3/pandas/_libs/join.pyi b/contrib/python/pandas/py3/pandas/_libs/join.pyi
deleted file mode 100644
index 11b65b85909..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/join.pyi
+++ /dev/null
@@ -1,78 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-def inner_join(
- left: np.ndarray, # const intp_t[:]
- right: np.ndarray, # const intp_t[:]
- max_groups: int,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-def left_outer_join(
- left: np.ndarray, # const intp_t[:]
- right: np.ndarray, # const intp_t[:]
- max_groups: int,
- sort: bool = ...,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-def full_outer_join(
- left: np.ndarray, # const intp_t[:]
- right: np.ndarray, # const intp_t[:]
- max_groups: int,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-def ffill_indexer(
- indexer: np.ndarray, # const intp_t[:]
-) -> npt.NDArray[np.intp]: ...
-def left_join_indexer_unique(
- left: np.ndarray, # ndarray[join_t]
- right: np.ndarray, # ndarray[join_t]
-) -> npt.NDArray[np.intp]: ...
-def left_join_indexer(
- left: np.ndarray, # ndarray[join_t]
- right: np.ndarray, # ndarray[join_t]
-) -> tuple[
- np.ndarray, # np.ndarray[join_t]
- npt.NDArray[np.intp],
- npt.NDArray[np.intp],
-]: ...
-def inner_join_indexer(
- left: np.ndarray, # ndarray[join_t]
- right: np.ndarray, # ndarray[join_t]
-) -> tuple[
- np.ndarray, # np.ndarray[join_t]
- npt.NDArray[np.intp],
- npt.NDArray[np.intp],
-]: ...
-def outer_join_indexer(
- left: np.ndarray, # ndarray[join_t]
- right: np.ndarray, # ndarray[join_t]
-) -> tuple[
- np.ndarray, # np.ndarray[join_t]
- npt.NDArray[np.intp],
- npt.NDArray[np.intp],
-]: ...
-def asof_join_backward_on_X_by_Y(
- left_values: np.ndarray, # asof_t[:]
- right_values: np.ndarray, # asof_t[:]
- left_by_values: np.ndarray, # by_t[:]
- right_by_values: np.ndarray, # by_t[:]
- allow_exact_matches: bool = ...,
- tolerance: np.number | float | None = ...,
- use_hashtable: bool = ...,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-def asof_join_forward_on_X_by_Y(
- left_values: np.ndarray, # asof_t[:]
- right_values: np.ndarray, # asof_t[:]
- left_by_values: np.ndarray, # by_t[:]
- right_by_values: np.ndarray, # by_t[:]
- allow_exact_matches: bool = ...,
- tolerance: np.number | float | None = ...,
- use_hashtable: bool = ...,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
-def asof_join_nearest_on_X_by_Y(
- left_values: np.ndarray, # asof_t[:]
- right_values: np.ndarray, # asof_t[:]
- left_by_values: np.ndarray, # by_t[:]
- right_by_values: np.ndarray, # by_t[:]
- allow_exact_matches: bool = ...,
- tolerance: np.number | float | None = ...,
- use_hashtable: bool = ...,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
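
All of the indexer routines declared above operate on factorized group labels (the join keys encoded as small integers), not on the raw key values. As a rough reference model of what inner_join computes, here is a naive pure-numpy version (the name naive_inner_join is made up; the real Cython implementation below reaches the same pairs via groupsort_indexer and two nogil passes):

import numpy as np

def naive_inner_join(left, right, max_groups):
    """Pair every left/right position whose labels match; label -1
    (the NA group) never participates, mirroring the Cython version."""
    left_idx, right_idx = [], []
    for group in range(max_groups):
        lpos = np.flatnonzero(left == group)
        rpos = np.flatnonzero(right == group)
        for lp in lpos:                 # cartesian product within the group
            for rp in rpos:
                left_idx.append(lp)
                right_idx.append(rp)
    return (np.array(left_idx, dtype=np.intp),
            np.array(right_idx, dtype=np.intp))

left = np.array([0, 1, 1, -1], dtype=np.intp)    # factorized keys
right = np.array([1, 0], dtype=np.intp)
li, ri = naive_inner_join(left, right, max_groups=2)
# left[li] == right[ri] elementwise; here li -> [0, 1, 2], ri -> [1, 0, 0]
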
diff --git a/contrib/python/pandas/py3/pandas/_libs/join.pyx b/contrib/python/pandas/py3/pandas/_libs/join.pyx
deleted file mode 100644
index 2b3b147470c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/join.pyx
+++ /dev/null
@@ -1,897 +0,0 @@
-cimport cython
-from cython cimport Py_ssize_t
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- intp_t,
- ndarray,
- uint64_t,
-)
-
-cnp.import_array()
-
-from pandas._libs.algos import groupsort_indexer
-
-from pandas._libs.dtypes cimport (
- numeric_object_t,
- numeric_t,
-)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join(const intp_t[:] left, const intp_t[:] right,
- Py_ssize_t max_groups):
- cdef:
- Py_ssize_t i, j, k, count = 0
- intp_t[::1] left_sorter, right_sorter
- intp_t[::1] left_count, right_count
- intp_t[::1] left_indexer, right_indexer
- intp_t lc, rc
- Py_ssize_t left_pos = 0, right_pos = 0, position = 0
- Py_ssize_t offset
-
- left_sorter, left_count = groupsort_indexer(left, max_groups)
- right_sorter, right_count = groupsort_indexer(right, max_groups)
-
- with nogil:
- # First pass, determine size of result set, do not use the NA group
- for i in range(1, max_groups + 1):
- lc = left_count[i]
- rc = right_count[i]
-
- if rc > 0 and lc > 0:
- count += lc * rc
-
- left_indexer = np.empty(count, dtype=np.intp)
- right_indexer = np.empty(count, dtype=np.intp)
-
- with nogil:
- # exclude the NA group
- left_pos = left_count[0]
- right_pos = right_count[0]
- for i in range(1, max_groups + 1):
- lc = left_count[i]
- rc = right_count[i]
-
- if rc > 0 and lc > 0:
- for j in range(lc):
- offset = position + j * rc
- for k in range(rc):
- left_indexer[offset + k] = left_pos + j
- right_indexer[offset + k] = right_pos + k
- position += lc * rc
- left_pos += lc
- right_pos += rc
-
- # Will overwrite left/right indexer with the result
- _get_result_indexer(left_sorter, left_indexer)
- _get_result_indexer(right_sorter, right_indexer)
-
- return np.asarray(left_indexer), np.asarray(right_indexer)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_outer_join(const intp_t[:] left, const intp_t[:] right,
- Py_ssize_t max_groups, bint sort=True):
- cdef:
- Py_ssize_t i, j, k, count = 0
- ndarray[intp_t] rev
- intp_t[::1] left_count, right_count
- intp_t[::1] left_sorter, right_sorter
- intp_t[::1] left_indexer, right_indexer
- intp_t lc, rc
- Py_ssize_t left_pos = 0, right_pos = 0, position = 0
- Py_ssize_t offset
-
- left_sorter, left_count = groupsort_indexer(left, max_groups)
- right_sorter, right_count = groupsort_indexer(right, max_groups)
-
- with nogil:
- # First pass, determine size of result set, do not use the NA group
- for i in range(1, max_groups + 1):
- lc = left_count[i]
- rc = right_count[i]
-
- if rc > 0:
- count += lc * rc
- else:
- count += lc
-
- left_indexer = np.empty(count, dtype=np.intp)
- right_indexer = np.empty(count, dtype=np.intp)
-
- with nogil:
- # exclude the NA group
- left_pos = left_count[0]
- right_pos = right_count[0]
- for i in range(1, max_groups + 1):
- lc = left_count[i]
- rc = right_count[i]
-
- if rc == 0:
- for j in range(lc):
- left_indexer[position + j] = left_pos + j
- right_indexer[position + j] = -1
- position += lc
- else:
- for j in range(lc):
- offset = position + j * rc
- for k in range(rc):
- left_indexer[offset + k] = left_pos + j
- right_indexer[offset + k] = right_pos + k
- position += lc * rc
- left_pos += lc
- right_pos += rc
-
- # Will overwrite left/right indexer with the result
- _get_result_indexer(left_sorter, left_indexer)
- _get_result_indexer(right_sorter, right_indexer)
-
- if not sort: # if not asked to sort, revert to original order
- if len(left) == len(left_indexer):
- # no multiple matches for any row on the left
- # this is a short-cut to avoid groupsort_indexer
- # otherwise, the `else` path also works in this case
- rev = np.empty(len(left), dtype=np.intp)
- rev.put(np.asarray(left_sorter), np.arange(len(left)))
- else:
- rev, _ = groupsort_indexer(left_indexer, len(left))
-
- return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev)
- else:
- return np.asarray(left_indexer), np.asarray(right_indexer)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def full_outer_join(const intp_t[:] left, const intp_t[:] right,
- Py_ssize_t max_groups):
- cdef:
- Py_ssize_t i, j, k, count = 0
- intp_t[::1] left_sorter, right_sorter
- intp_t[::1] left_count, right_count
- intp_t[::1] left_indexer, right_indexer
- intp_t lc, rc
- intp_t left_pos = 0, right_pos = 0
- Py_ssize_t offset, position = 0
-
- left_sorter, left_count = groupsort_indexer(left, max_groups)
- right_sorter, right_count = groupsort_indexer(right, max_groups)
-
- with nogil:
- # First pass, determine size of result set, do not use the NA group
- for i in range(1, max_groups + 1):
- lc = left_count[i]
- rc = right_count[i]
-
- if rc > 0 and lc > 0:
- count += lc * rc
- else:
- count += lc + rc
-
- left_indexer = np.empty(count, dtype=np.intp)
- right_indexer = np.empty(count, dtype=np.intp)
-
- with nogil:
- # exclude the NA group
- left_pos = left_count[0]
- right_pos = right_count[0]
- for i in range(1, max_groups + 1):
- lc = left_count[i]
- rc = right_count[i]
-
- if rc == 0:
- for j in range(lc):
- left_indexer[position + j] = left_pos + j
- right_indexer[position + j] = -1
- position += lc
- elif lc == 0:
- for j in range(rc):
- left_indexer[position + j] = -1
- right_indexer[position + j] = right_pos + j
- position += rc
- else:
- for j in range(lc):
- offset = position + j * rc
- for k in range(rc):
- left_indexer[offset + k] = left_pos + j
- right_indexer[offset + k] = right_pos + k
- position += lc * rc
- left_pos += lc
- right_pos += rc
-
- # Will overwrite left/right indexer with the result
- _get_result_indexer(left_sorter, left_indexer)
- _get_result_indexer(right_sorter, right_indexer)
-
- return np.asarray(left_indexer), np.asarray(right_indexer)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) nogil:
- """NOTE: overwrites indexer with the result to avoid allocating another array"""
- cdef:
- Py_ssize_t i, n, idx
-
- if len(sorter) > 0:
- # cython-only equivalent to
- # `res = algos.take_nd(sorter, indexer, fill_value=-1)`
- n = indexer.shape[0]
- for i in range(n):
- idx = indexer[i]
- if idx == -1:
- indexer[i] = -1
- else:
- indexer[i] = sorter[idx]
- else:
- # length-0 case
- indexer[:] = -1
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def ffill_indexer(const intp_t[:] indexer) -> np.ndarray:
- cdef:
- Py_ssize_t i, n = len(indexer)
- ndarray[intp_t] result
- intp_t val, last_obs
-
- result = np.empty(n, dtype=np.intp)
- last_obs = -1
-
- for i in range(n):
- val = indexer[i]
- if val == -1:
- result[i] = last_obs
- else:
- result[i] = val
- last_obs = val
-
- return result
-
-
-# ----------------------------------------------------------------------
-# left_join_indexer, inner_join_indexer, outer_join_indexer
-# ----------------------------------------------------------------------
-
-# Joins on ordered, unique indices
-
-# right might contain non-unique values
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique(
- ndarray[numeric_object_t] left,
- ndarray[numeric_object_t] right
-):
- """
- Both left and right are strictly monotonic increasing.
- """
- cdef:
- Py_ssize_t i, j, nleft, nright
- ndarray[intp_t] indexer
- numeric_object_t rval
-
- i = 0
- j = 0
- nleft = len(left)
- nright = len(right)
-
- indexer = np.empty(nleft, dtype=np.intp)
- while True:
- if i == nleft:
- break
-
- if j == nright:
- indexer[i] = -1
- i += 1
- continue
-
- rval = right[j]
-
- while i < nleft - 1 and left[i] == rval:
- indexer[i] = j
- i += 1
-
- if left[i] == rval:
- indexer[i] = j
- i += 1
- while i < nleft - 1 and left[i] == rval:
- indexer[i] = j
- i += 1
- j += 1
- elif left[i] > rval:
- indexer[i] = -1
- j += 1
- else:
- indexer[i] = -1
- i += 1
- return indexer
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
- """
- Two-pass algorithm for monotonic indexes. Handles many-to-one merges.
-
- Both left and right are monotonic increasing, but at least one of them
- is non-unique (if both were unique we'd use left_join_indexer_unique).
- """
- cdef:
- Py_ssize_t i, j, nright, nleft, count
- numeric_object_t lval, rval
- ndarray[intp_t] lindexer, rindexer
- ndarray[numeric_object_t] result
-
- nleft = len(left)
- nright = len(right)
-
- # First pass is to find the size 'count' of our output indexers.
- i = 0
- j = 0
- count = 0
- if nleft > 0:
- while i < nleft:
- if j == nright:
- count += nleft - i
- break
-
- lval = left[i]
- rval = right[j]
-
- if lval == rval:
- # This block is identical across
- # left_join_indexer, inner_join_indexer, outer_join_indexer
- count += 1
- if i < nleft - 1:
- if j < nright - 1 and right[j + 1] == rval:
- j += 1
- else:
- i += 1
- if left[i] != rval:
- j += 1
- elif j < nright - 1:
- j += 1
- if lval != right[j]:
- i += 1
- else:
- # end of the road
- break
- elif lval < rval:
- count += 1
- i += 1
- else:
- j += 1
-
- # do it again now that result size is known
-
- lindexer = np.empty(count, dtype=np.intp)
- rindexer = np.empty(count, dtype=np.intp)
- result = np.empty(count, dtype=left.dtype)
-
- i = 0
- j = 0
- count = 0
- if nleft > 0:
- while i < nleft:
- if j == nright:
- while i < nleft:
- lindexer[count] = i
- rindexer[count] = -1
- result[count] = left[i]
- i += 1
- count += 1
- break
-
- lval = left[i]
- rval = right[j]
-
- if lval == rval:
- lindexer[count] = i
- rindexer[count] = j
- result[count] = lval
- count += 1
- if i < nleft - 1:
- if j < nright - 1 and right[j + 1] == rval:
- j += 1
- else:
- i += 1
- if left[i] != rval:
- j += 1
- elif j < nright - 1:
- j += 1
- if lval != right[j]:
- i += 1
- else:
- # end of the road
- break
- elif lval < rval:
- # i.e. lval not in right; we keep for left_join_indexer
- lindexer[count] = i
- rindexer[count] = -1
- result[count] = lval
- count += 1
- i += 1
- else:
- # i.e. rval not in left; we discard for left_join_indexer
- j += 1
-
- return result, lindexer, rindexer
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
- """
- Two-pass algorithm for monotonic indexes. Handles many-to-one merges.
-
- Both left and right are monotonic increasing but not necessarily unique.
- """
- cdef:
- Py_ssize_t i, j, nright, nleft, count
- numeric_object_t lval, rval
- ndarray[intp_t] lindexer, rindexer
- ndarray[numeric_object_t] result
-
- nleft = len(left)
- nright = len(right)
-
- # First pass is to find the size 'count' of our output indexers.
- i = 0
- j = 0
- count = 0
- if nleft > 0 and nright > 0:
- while True:
- if i == nleft:
- break
- if j == nright:
- break
-
- lval = left[i]
- rval = right[j]
- if lval == rval:
- count += 1
- if i < nleft - 1:
- if j < nright - 1 and right[j + 1] == rval:
- j += 1
- else:
- i += 1
- if left[i] != rval:
- j += 1
- elif j < nright - 1:
- j += 1
- if lval != right[j]:
- i += 1
- else:
- # end of the road
- break
- elif lval < rval:
- # i.e. lval not in right; we discard for inner_indexer
- i += 1
- else:
- # i.e. rval not in left; we discard for inner_indexer
- j += 1
-
- # do it again now that result size is known
-
- lindexer = np.empty(count, dtype=np.intp)
- rindexer = np.empty(count, dtype=np.intp)
- result = np.empty(count, dtype=left.dtype)
-
- i = 0
- j = 0
- count = 0
- if nleft > 0 and nright > 0:
- while True:
- if i == nleft:
- break
- if j == nright:
- break
-
- lval = left[i]
- rval = right[j]
- if lval == rval:
- lindexer[count] = i
- rindexer[count] = j
- result[count] = lval
- count += 1
- if i < nleft - 1:
- if j < nright - 1 and right[j + 1] == rval:
- j += 1
- else:
- i += 1
- if left[i] != rval:
- j += 1
- elif j < nright - 1:
- j += 1
- if lval != right[j]:
- i += 1
- else:
- # end of the road
- break
- elif lval < rval:
- # i.e. lval not in right; we discard for inner_indexer
- i += 1
- else:
- # i.e. rval not in left; we discard for inner_indexer
- j += 1
-
- return result, lindexer, rindexer
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
- """
- Both left and right are monotonic increasing but not necessarily unique.
- """
- cdef:
- Py_ssize_t i, j, nright, nleft, count
- numeric_object_t lval, rval
- ndarray[intp_t] lindexer, rindexer
- ndarray[numeric_object_t] result
-
- nleft = len(left)
- nright = len(right)
-
- # First pass is to find the size 'count' of our output indexers.
- # count will be length of left plus the number of elements of right not in
- # left (counting duplicates)
- i = 0
- j = 0
- count = 0
- if nleft == 0:
- count = nright
- elif nright == 0:
- count = nleft
- else:
- while True:
- if i == nleft:
- count += nright - j
- break
- if j == nright:
- count += nleft - i
- break
-
- lval = left[i]
- rval = right[j]
- if lval == rval:
- count += 1
- if i < nleft - 1:
- if j < nright - 1 and right[j + 1] == rval:
- j += 1
- else:
- i += 1
- if left[i] != rval:
- j += 1
- elif j < nright - 1:
- j += 1
- if lval != right[j]:
- i += 1
- else:
- # end of the road
- break
- elif lval < rval:
- count += 1
- i += 1
- else:
- count += 1
- j += 1
-
- lindexer = np.empty(count, dtype=np.intp)
- rindexer = np.empty(count, dtype=np.intp)
- result = np.empty(count, dtype=left.dtype)
-
- # do it again, but populate the indexers / result
-
- i = 0
- j = 0
- count = 0
- if nleft == 0:
- for j in range(nright):
- lindexer[j] = -1
- rindexer[j] = j
- result[j] = right[j]
- elif nright == 0:
- for i in range(nleft):
- lindexer[i] = i
- rindexer[i] = -1
- result[i] = left[i]
- else:
- while True:
- if i == nleft:
- while j < nright:
- lindexer[count] = -1
- rindexer[count] = j
- result[count] = right[j]
- count += 1
- j += 1
- break
- if j == nright:
- while i < nleft:
- lindexer[count] = i
- rindexer[count] = -1
- result[count] = left[i]
- count += 1
- i += 1
- break
-
- lval = left[i]
- rval = right[j]
-
- if lval == rval:
- lindexer[count] = i
- rindexer[count] = j
- result[count] = lval
- count += 1
- if i < nleft - 1:
- if j < nright - 1 and right[j + 1] == rval:
- j += 1
- else:
- i += 1
- if left[i] != rval:
- j += 1
- elif j < nright - 1:
- j += 1
- if lval != right[j]:
- i += 1
- else:
- # end of the road
- break
- elif lval < rval:
- # i.e. lval not in right; we keep for outer_join_indexer
- lindexer[count] = i
- rindexer[count] = -1
- result[count] = lval
- count += 1
- i += 1
- else:
- # i.e. rval not in left; we keep for outer_join_indexer
- lindexer[count] = -1
- rindexer[count] = j
- result[count] = rval
- count += 1
- j += 1
-
- return result, lindexer, rindexer
-
-
-# ----------------------------------------------------------------------
-# asof_join_by
-# ----------------------------------------------------------------------
-
-from pandas._libs.hashtable cimport (
- HashTable,
- Int64HashTable,
- PyObjectHashTable,
- UInt64HashTable,
-)
-
-ctypedef fused by_t:
- object
- int64_t
- uint64_t
-
-
-def asof_join_backward_on_X_by_Y(numeric_t[:] left_values,
- numeric_t[:] right_values,
- by_t[:] left_by_values,
- by_t[:] right_by_values,
- bint allow_exact_matches=True,
- tolerance=None,
- bint use_hashtable=True):
-
- cdef:
- Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
- ndarray[intp_t] left_indexer, right_indexer
- bint has_tolerance = False
- numeric_t tolerance_ = 0
- numeric_t diff = 0
- HashTable hash_table
- by_t by_value
-
- # if we are using tolerance, set our objects
- if tolerance is not None:
- has_tolerance = True
- tolerance_ = tolerance
-
- left_size = len(left_values)
- right_size = len(right_values)
-
- left_indexer = np.empty(left_size, dtype=np.intp)
- right_indexer = np.empty(left_size, dtype=np.intp)
-
- if use_hashtable:
- if by_t is object:
- hash_table = PyObjectHashTable(right_size)
- elif by_t is int64_t:
- hash_table = Int64HashTable(right_size)
- elif by_t is uint64_t:
- hash_table = UInt64HashTable(right_size)
-
- right_pos = 0
- for left_pos in range(left_size):
- # restart right_pos if it went negative in a previous iteration
- if right_pos < 0:
- right_pos = 0
-
- # find last position in right whose value is less than left's
- if allow_exact_matches:
- while (right_pos < right_size and
- right_values[right_pos] <= left_values[left_pos]):
- if use_hashtable:
- hash_table.set_item(right_by_values[right_pos], right_pos)
- right_pos += 1
- else:
- while (right_pos < right_size and
- right_values[right_pos] < left_values[left_pos]):
- if use_hashtable:
- hash_table.set_item(right_by_values[right_pos], right_pos)
- right_pos += 1
- right_pos -= 1
-
- # save positions as the desired index
- if use_hashtable:
- by_value = left_by_values[left_pos]
- found_right_pos = (hash_table.get_item(by_value)
- if by_value in hash_table else -1)
- else:
- found_right_pos = right_pos
-
- left_indexer[left_pos] = left_pos
- right_indexer[left_pos] = found_right_pos
-
- # if needed, verify that tolerance is met
- if has_tolerance and found_right_pos != -1:
- diff = left_values[left_pos] - right_values[found_right_pos]
- if diff > tolerance_:
- right_indexer[left_pos] = -1
-
- return left_indexer, right_indexer
-
-
-def asof_join_forward_on_X_by_Y(numeric_t[:] left_values,
- numeric_t[:] right_values,
- by_t[:] left_by_values,
- by_t[:] right_by_values,
- bint allow_exact_matches=1,
- tolerance=None,
- bint use_hashtable=True):
-
- cdef:
- Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
- ndarray[intp_t] left_indexer, right_indexer
- bint has_tolerance = False
- numeric_t tolerance_ = 0
- numeric_t diff = 0
- HashTable hash_table
- by_t by_value
-
- # if we are using tolerance, set our objects
- if tolerance is not None:
- has_tolerance = True
- tolerance_ = tolerance
-
- left_size = len(left_values)
- right_size = len(right_values)
-
- left_indexer = np.empty(left_size, dtype=np.intp)
- right_indexer = np.empty(left_size, dtype=np.intp)
-
- if use_hashtable:
- if by_t is object:
- hash_table = PyObjectHashTable(right_size)
- elif by_t is int64_t:
- hash_table = Int64HashTable(right_size)
- elif by_t is uint64_t:
- hash_table = UInt64HashTable(right_size)
-
- right_pos = right_size - 1
- for left_pos in range(left_size - 1, -1, -1):
- # restart right_pos if it went over in a previous iteration
- if right_pos == right_size:
- right_pos = right_size - 1
-
- # find first position in right whose value is greater than left's
- if allow_exact_matches:
- while (right_pos >= 0 and
- right_values[right_pos] >= left_values[left_pos]):
- if use_hashtable:
- hash_table.set_item(right_by_values[right_pos], right_pos)
- right_pos -= 1
- else:
- while (right_pos >= 0 and
- right_values[right_pos] > left_values[left_pos]):
- if use_hashtable:
- hash_table.set_item(right_by_values[right_pos], right_pos)
- right_pos -= 1
- right_pos += 1
-
- # save positions as the desired index
- if use_hashtable:
- by_value = left_by_values[left_pos]
- found_right_pos = (hash_table.get_item(by_value)
- if by_value in hash_table else -1)
- else:
- found_right_pos = (right_pos
- if right_pos != right_size else -1)
-
- left_indexer[left_pos] = left_pos
- right_indexer[left_pos] = found_right_pos
-
- # if needed, verify that tolerance is met
- if has_tolerance and found_right_pos != -1:
- diff = right_values[found_right_pos] - left_values[left_pos]
- if diff > tolerance_:
- right_indexer[left_pos] = -1
-
- return left_indexer, right_indexer
-
-
-def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values,
- ndarray[numeric_t] right_values,
- ndarray[by_t] left_by_values,
- ndarray[by_t] right_by_values,
- bint allow_exact_matches=True,
- tolerance=None,
- bint use_hashtable=True):
-
- cdef:
- ndarray[intp_t] bli, bri, fli, fri
-
- ndarray[intp_t] left_indexer, right_indexer
- Py_ssize_t left_size, i
- numeric_t bdiff, fdiff
-
- # search both forward and backward
- # TODO(cython3):
- # Bug in beta1 preventing Cython from choosing
- # right specialization when one fused memview is None
- # Doesn't matter what type we choose
- # (nothing happens anyways since it is None)
- # GH 51640
- if left_by_values is not None and left_by_values.dtype != object:
- by_dtype = f"{left_by_values.dtype}_t"
- else:
- by_dtype = object
- bli, bri = asof_join_backward_on_X_by_Y[f"{left_values.dtype}_t", by_dtype](
- left_values,
- right_values,
- left_by_values,
- right_by_values,
- allow_exact_matches,
- tolerance,
- use_hashtable
- )
- fli, fri = asof_join_forward_on_X_by_Y[f"{left_values.dtype}_t", by_dtype](
- left_values,
- right_values,
- left_by_values,
- right_by_values,
- allow_exact_matches,
- tolerance,
- use_hashtable
- )
-
- # choose the smaller timestamp
- left_size = len(left_values)
- left_indexer = np.empty(left_size, dtype=np.intp)
- right_indexer = np.empty(left_size, dtype=np.intp)
-
- for i in range(len(bri)):
- # choose timestamp from right with smaller difference
- if bri[i] != -1 and fri[i] != -1:
- bdiff = left_values[bli[i]] - right_values[bri[i]]
- fdiff = right_values[fri[i]] - left_values[fli[i]]
- right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
- else:
- right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
- left_indexer[i] = bli[i]
-
- return left_indexer, right_indexer
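The asof routines above advance a single right-hand cursor while scanning the left values, which is what keeps them linear in the combined input size. A pure-Python sketch of the backward search core with use_hashtable=False and no tolerance, as a simplification of asof_join_backward_on_X_by_Y:

# For each left value, take the last right position whose value is <= it,
# or -1 if none qualifies. Both inputs are assumed sorted ascending; the
# cursor never moves backwards, mirroring the Cython loop above.
def asof_backward_sketch(left_values, right_values):
    right_pos = 0
    right_indexer = []
    for lv in left_values:
        while right_pos < len(right_values) and right_values[right_pos] <= lv:
            right_pos += 1
        right_indexer.append(right_pos - 1)  # -1 when nothing qualifies
    return right_indexer

# asof_backward_sketch([1, 5, 10], [2, 3, 7]) -> [-1, 1, 2]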
diff --git a/contrib/python/pandas/py3/pandas/_libs/json.pyi b/contrib/python/pandas/py3/pandas/_libs/json.pyi
deleted file mode 100644
index 8e7ba60ccce..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/json.pyi
+++ /dev/null
@@ -1,23 +0,0 @@
-from typing import (
- Any,
- Callable,
-)
-
-def dumps(
- obj: Any,
- ensure_ascii: bool = ...,
- double_precision: int = ...,
- indent: int = ...,
- orient: str = ...,
- date_unit: str = ...,
- iso_dates: bool = ...,
- default_handler: None
- | Callable[[Any], str | float | bool | list | dict | None] = ...,
-) -> str: ...
-def loads(
- s: str,
- precise_float: bool = ...,
- numpy: bool = ...,
- dtype: None = ...,
- labelled: bool = ...,
-) -> Any: ...
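The stub above describes the ujson-based encoder/decoder bundled with pandas. A minimal usage sketch, assuming the extension module is importable under this private path as the stub implies; the supported public entry points are DataFrame.to_json and pandas.read_json:

# Minimal sketch of the dumps/loads interface declared above (private module;
# the import path follows the stub and is not a public API).
from pandas._libs import json as ujson

payload = ujson.dumps({"a": 1.5, "b": [1, 2, 3]})   # JSON string
roundtrip = ujson.loads(payload)                     # back to Python objects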
diff --git a/contrib/python/pandas/py3/pandas/_libs/khash.pxd b/contrib/python/pandas/py3/pandas/_libs/khash.pxd
deleted file mode 100644
index a9f819e5e16..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/khash.pxd
+++ /dev/null
@@ -1,129 +0,0 @@
-from cpython.object cimport PyObject
-from numpy cimport (
- complex64_t,
- complex128_t,
- float32_t,
- float64_t,
- int8_t,
- int16_t,
- int32_t,
- int64_t,
- uint8_t,
- uint16_t,
- uint32_t,
- uint64_t,
-)
-
-
-cdef extern from "khash_python.h":
- const int KHASH_TRACE_DOMAIN
-
- ctypedef uint32_t khuint_t
- ctypedef khuint_t khiter_t
-
- ctypedef struct khcomplex128_t:
- double real
- double imag
-
- bint are_equivalent_khcomplex128_t \
- "kh_complex_hash_equal" (khcomplex128_t a, khcomplex128_t b) nogil
-
- ctypedef struct khcomplex64_t:
- float real
- float imag
-
- bint are_equivalent_khcomplex64_t \
- "kh_complex_hash_equal" (khcomplex64_t a, khcomplex64_t b) nogil
-
- bint are_equivalent_float64_t \
- "kh_floats_hash_equal" (float64_t a, float64_t b) nogil
-
- bint are_equivalent_float32_t \
- "kh_floats_hash_equal" (float32_t a, float32_t b) nogil
-
- uint32_t kh_python_hash_func(object key)
- bint kh_python_hash_equal(object a, object b)
-
- ctypedef struct kh_pymap_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- PyObject **keys
- size_t *vals
-
- kh_pymap_t* kh_init_pymap()
- void kh_destroy_pymap(kh_pymap_t*)
- void kh_clear_pymap(kh_pymap_t*)
- khuint_t kh_get_pymap(kh_pymap_t*, PyObject*)
- void kh_resize_pymap(kh_pymap_t*, khuint_t)
- khuint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*)
- void kh_del_pymap(kh_pymap_t*, khuint_t)
-
- bint kh_exist_pymap(kh_pymap_t*, khiter_t)
-
- ctypedef struct kh_pyset_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- PyObject **keys
- size_t *vals
-
- kh_pyset_t* kh_init_pyset()
- void kh_destroy_pyset(kh_pyset_t*)
- void kh_clear_pyset(kh_pyset_t*)
- khuint_t kh_get_pyset(kh_pyset_t*, PyObject*)
- void kh_resize_pyset(kh_pyset_t*, khuint_t)
- khuint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*)
- void kh_del_pyset(kh_pyset_t*, khuint_t)
-
- bint kh_exist_pyset(kh_pyset_t*, khiter_t)
-
- ctypedef char* kh_cstr_t
-
- ctypedef struct kh_str_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- kh_cstr_t *keys
- size_t *vals
-
- kh_str_t* kh_init_str() nogil
- void kh_destroy_str(kh_str_t*) nogil
- void kh_clear_str(kh_str_t*) nogil
- khuint_t kh_get_str(kh_str_t*, kh_cstr_t) nogil
- void kh_resize_str(kh_str_t*, khuint_t) nogil
- khuint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) nogil
- void kh_del_str(kh_str_t*, khuint_t) nogil
-
- bint kh_exist_str(kh_str_t*, khiter_t) nogil
-
- ctypedef struct kh_str_starts_t:
- kh_str_t *table
- int starts[256]
-
- kh_str_starts_t* kh_init_str_starts() nogil
- khuint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key,
- int* ret) nogil
- khuint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) nogil
- void kh_destroy_str_starts(kh_str_starts_t*) nogil
- void kh_resize_str_starts(kh_str_starts_t*, khuint_t) nogil
-
- # sweep factorize
-
- ctypedef struct kh_strbox_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- kh_cstr_t *keys
- PyObject **vals
-
- kh_strbox_t* kh_init_strbox() nogil
- void kh_destroy_strbox(kh_strbox_t*) nogil
- void kh_clear_strbox(kh_strbox_t*) nogil
- khuint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) nogil
- void kh_resize_strbox(kh_strbox_t*, khuint_t) nogil
- khuint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) nogil
- void kh_del_strbox(kh_strbox_t*, khuint_t) nogil
-
- bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
-
- khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
-
-
-include "khash_for_primitive_helper.pxi"
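The kh_str_starts_t declaration above pairs a string hash table with a 256-slot array indexed by the first byte, so lookups can reject a key cheaply before doing a full hash probe. An illustrative Python analogue of that layout; the class and method names here are hypothetical and only mirror the put/get pattern of the C declarations:

class StrStartsSketch:
    def __init__(self):
        self._table = set()
        self._starts = [0] * 256

    def add(self, key: str) -> None:
        self._table.add(key)
        self._starts[ord(key[0]) & 0xFF] = 1  # assumes non-empty keys

    def __contains__(self, key: str) -> bool:
        if not self._starts[ord(key[0]) & 0xFF]:
            return False  # cheap reject: no stored key starts with this byte
        return key in self._table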
diff --git a/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi b/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi
deleted file mode 100644
index 60c09335498..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi
+++ /dev/null
@@ -1,209 +0,0 @@
-"""
-Template for wrapping khash-tables for each primitive `dtype`
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_int64_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- int64_t *keys
- size_t *vals
-
- kh_int64_t* kh_init_int64() nogil
- void kh_destroy_int64(kh_int64_t*) nogil
- void kh_clear_int64(kh_int64_t*) nogil
- khuint_t kh_get_int64(kh_int64_t*, int64_t) nogil
- void kh_resize_int64(kh_int64_t*, khuint_t) nogil
- khuint_t kh_put_int64(kh_int64_t*, int64_t, int*) nogil
- void kh_del_int64(kh_int64_t*, khuint_t) nogil
-
- bint kh_exist_int64(kh_int64_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_uint64_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- uint64_t *keys
- size_t *vals
-
- kh_uint64_t* kh_init_uint64() nogil
- void kh_destroy_uint64(kh_uint64_t*) nogil
- void kh_clear_uint64(kh_uint64_t*) nogil
- khuint_t kh_get_uint64(kh_uint64_t*, uint64_t) nogil
- void kh_resize_uint64(kh_uint64_t*, khuint_t) nogil
- khuint_t kh_put_uint64(kh_uint64_t*, uint64_t, int*) nogil
- void kh_del_uint64(kh_uint64_t*, khuint_t) nogil
-
- bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_float64_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- float64_t *keys
- size_t *vals
-
- kh_float64_t* kh_init_float64() nogil
- void kh_destroy_float64(kh_float64_t*) nogil
- void kh_clear_float64(kh_float64_t*) nogil
- khuint_t kh_get_float64(kh_float64_t*, float64_t) nogil
- void kh_resize_float64(kh_float64_t*, khuint_t) nogil
- khuint_t kh_put_float64(kh_float64_t*, float64_t, int*) nogil
- void kh_del_float64(kh_float64_t*, khuint_t) nogil
-
- bint kh_exist_float64(kh_float64_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_int32_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- int32_t *keys
- size_t *vals
-
- kh_int32_t* kh_init_int32() nogil
- void kh_destroy_int32(kh_int32_t*) nogil
- void kh_clear_int32(kh_int32_t*) nogil
- khuint_t kh_get_int32(kh_int32_t*, int32_t) nogil
- void kh_resize_int32(kh_int32_t*, khuint_t) nogil
- khuint_t kh_put_int32(kh_int32_t*, int32_t, int*) nogil
- void kh_del_int32(kh_int32_t*, khuint_t) nogil
-
- bint kh_exist_int32(kh_int32_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_uint32_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- uint32_t *keys
- size_t *vals
-
- kh_uint32_t* kh_init_uint32() nogil
- void kh_destroy_uint32(kh_uint32_t*) nogil
- void kh_clear_uint32(kh_uint32_t*) nogil
- khuint_t kh_get_uint32(kh_uint32_t*, uint32_t) nogil
- void kh_resize_uint32(kh_uint32_t*, khuint_t) nogil
- khuint_t kh_put_uint32(kh_uint32_t*, uint32_t, int*) nogil
- void kh_del_uint32(kh_uint32_t*, khuint_t) nogil
-
- bint kh_exist_uint32(kh_uint32_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_float32_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- float32_t *keys
- size_t *vals
-
- kh_float32_t* kh_init_float32() nogil
- void kh_destroy_float32(kh_float32_t*) nogil
- void kh_clear_float32(kh_float32_t*) nogil
- khuint_t kh_get_float32(kh_float32_t*, float32_t) nogil
- void kh_resize_float32(kh_float32_t*, khuint_t) nogil
- khuint_t kh_put_float32(kh_float32_t*, float32_t, int*) nogil
- void kh_del_float32(kh_float32_t*, khuint_t) nogil
-
- bint kh_exist_float32(kh_float32_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_int16_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- int16_t *keys
- size_t *vals
-
- kh_int16_t* kh_init_int16() nogil
- void kh_destroy_int16(kh_int16_t*) nogil
- void kh_clear_int16(kh_int16_t*) nogil
- khuint_t kh_get_int16(kh_int16_t*, int16_t) nogil
- void kh_resize_int16(kh_int16_t*, khuint_t) nogil
- khuint_t kh_put_int16(kh_int16_t*, int16_t, int*) nogil
- void kh_del_int16(kh_int16_t*, khuint_t) nogil
-
- bint kh_exist_int16(kh_int16_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_uint16_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- uint16_t *keys
- size_t *vals
-
- kh_uint16_t* kh_init_uint16() nogil
- void kh_destroy_uint16(kh_uint16_t*) nogil
- void kh_clear_uint16(kh_uint16_t*) nogil
- khuint_t kh_get_uint16(kh_uint16_t*, uint16_t) nogil
- void kh_resize_uint16(kh_uint16_t*, khuint_t) nogil
- khuint_t kh_put_uint16(kh_uint16_t*, uint16_t, int*) nogil
- void kh_del_uint16(kh_uint16_t*, khuint_t) nogil
-
- bint kh_exist_uint16(kh_uint16_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_int8_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- int8_t *keys
- size_t *vals
-
- kh_int8_t* kh_init_int8() nogil
- void kh_destroy_int8(kh_int8_t*) nogil
- void kh_clear_int8(kh_int8_t*) nogil
- khuint_t kh_get_int8(kh_int8_t*, int8_t) nogil
- void kh_resize_int8(kh_int8_t*, khuint_t) nogil
- khuint_t kh_put_int8(kh_int8_t*, int8_t, int*) nogil
- void kh_del_int8(kh_int8_t*, khuint_t) nogil
-
- bint kh_exist_int8(kh_int8_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_uint8_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- uint8_t *keys
- size_t *vals
-
- kh_uint8_t* kh_init_uint8() nogil
- void kh_destroy_uint8(kh_uint8_t*) nogil
- void kh_clear_uint8(kh_uint8_t*) nogil
- khuint_t kh_get_uint8(kh_uint8_t*, uint8_t) nogil
- void kh_resize_uint8(kh_uint8_t*, khuint_t) nogil
- khuint_t kh_put_uint8(kh_uint8_t*, uint8_t, int*) nogil
- void kh_del_uint8(kh_uint8_t*, khuint_t) nogil
-
- bint kh_exist_uint8(kh_uint8_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_complex64_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- khcomplex64_t *keys
- size_t *vals
-
- kh_complex64_t* kh_init_complex64() nogil
- void kh_destroy_complex64(kh_complex64_t*) nogil
- void kh_clear_complex64(kh_complex64_t*) nogil
- khuint_t kh_get_complex64(kh_complex64_t*, khcomplex64_t) nogil
- void kh_resize_complex64(kh_complex64_t*, khuint_t) nogil
- khuint_t kh_put_complex64(kh_complex64_t*, khcomplex64_t, int*) nogil
- void kh_del_complex64(kh_complex64_t*, khuint_t) nogil
-
- bint kh_exist_complex64(kh_complex64_t*, khiter_t) nogil
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_complex128_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- khcomplex128_t *keys
- size_t *vals
-
- kh_complex128_t* kh_init_complex128() nogil
- void kh_destroy_complex128(kh_complex128_t*) nogil
- void kh_clear_complex128(kh_complex128_t*) nogil
- khuint_t kh_get_complex128(kh_complex128_t*, khcomplex128_t) nogil
- void kh_resize_complex128(kh_complex128_t*, khuint_t) nogil
- khuint_t kh_put_complex128(kh_complex128_t*, khcomplex128_t, int*) nogil
- void kh_del_complex128(kh_complex128_t*, khuint_t) nogil
-
- bint kh_exist_complex128(kh_complex128_t*, khiter_t) nogil
diff --git a/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in
deleted file mode 100644
index d0934b3e0ee..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/khash_for_primitive_helper.pxi.in
+++ /dev/null
@@ -1,44 +0,0 @@
-"""
-Template for wrapping khash-tables for each primitive `dtype`
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-{{py:
-
-# name, c_type
-primitive_types = [('int64', 'int64_t'),
- ('uint64', 'uint64_t'),
- ('float64', 'float64_t'),
- ('int32', 'int32_t'),
- ('uint32', 'uint32_t'),
- ('float32', 'float32_t'),
- ('int16', 'int16_t'),
- ('uint16', 'uint16_t'),
- ('int8', 'int8_t'),
- ('uint8', 'uint8_t'),
- ('complex64', 'khcomplex64_t'),
- ('complex128', 'khcomplex128_t'),
- ]
-}}
-
-{{for name, c_type in primitive_types}}
-
-cdef extern from "khash_python.h":
- ctypedef struct kh_{{name}}_t:
- khuint_t n_buckets, size, n_occupied, upper_bound
- uint32_t *flags
- {{c_type}} *keys
- size_t *vals
-
- kh_{{name}}_t* kh_init_{{name}}() nogil
- void kh_destroy_{{name}}(kh_{{name}}_t*) nogil
- void kh_clear_{{name}}(kh_{{name}}_t*) nogil
- khuint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil
- void kh_resize_{{name}}(kh_{{name}}_t*, khuint_t) nogil
- khuint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil
- void kh_del_{{name}}(kh_{{name}}_t*, khuint_t) nogil
-
- bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil
-
-{{endfor}}
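The .pxi file above is generated from this .pxi.in template: the build step expands the {{for}} block once per (name, c_type) pair. A rough Python stand-in for that expansion, shown only to make the template mechanism concrete (the real generator is Tempita-based and emits the full block, not just the leading lines):

primitive_types = [("int64", "int64_t"), ("uint64", "uint64_t"), ("float64", "float64_t")]

BLOCK = """cdef extern from "khash_python.h":
    ctypedef struct kh_{name}_t:
        khuint_t n_buckets, size, n_occupied, upper_bound
        uint32_t *flags
        {c_type} *keys
        size_t *vals

    kh_{name}_t* kh_init_{name}() nogil
"""

generated = "\n".join(BLOCK.format(name=n, c_type=c) for n, c in primitive_types)
print(generated)  # matches the start of each generated block in the .pxi above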
diff --git a/contrib/python/pandas/py3/pandas/_libs/lib.pxd b/contrib/python/pandas/py3/pandas/_libs/lib.pxd
deleted file mode 100644
index 46a339f2e7c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/lib.pxd
+++ /dev/null
@@ -1,6 +0,0 @@
-from numpy cimport ndarray
-
-
-cdef bint c_is_list_like(object, bint) except -1
-
-cpdef ndarray eq_NA_compat(ndarray[object] arr, object key)
diff --git a/contrib/python/pandas/py3/pandas/_libs/lib.pyi b/contrib/python/pandas/py3/pandas/_libs/lib.pyi
deleted file mode 100644
index fbc577712d2..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/lib.pyi
+++ /dev/null
@@ -1,250 +0,0 @@
-# TODO(npdtypes): Many types specified here can be made more specific/accurate;
-# the more specific versions are specified in comments
-
-from typing import (
- Any,
- Callable,
- Final,
- Generator,
- Hashable,
- Literal,
- overload,
-)
-
-import numpy as np
-
-from pandas._typing import (
- ArrayLike,
- DtypeObj,
- npt,
-)
-
-# placeholder until we can specify np.ndarray[object, ndim=2]
-ndarray_obj_2d = np.ndarray
-
-from enum import Enum
-
-class _NoDefault(Enum):
- no_default = ...
-
-no_default: Final = _NoDefault.no_default
-NoDefault = Literal[_NoDefault.no_default]
-
-i8max: int
-u8max: int
-
-def item_from_zerodim(val: object) -> object: ...
-def infer_dtype(value: object, skipna: bool = ...) -> str: ...
-def is_iterator(obj: object) -> bool: ...
-def is_scalar(val: object) -> bool: ...
-def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ...
-def is_period(val: object) -> bool: ...
-def is_interval(val: object) -> bool: ...
-def is_decimal(val: object) -> bool: ...
-def is_complex(val: object) -> bool: ...
-def is_bool(val: object) -> bool: ...
-def is_integer(val: object) -> bool: ...
-def is_float(val: object) -> bool: ...
-def is_interval_array(values: np.ndarray) -> bool: ...
-def is_datetime64_array(values: np.ndarray) -> bool: ...
-def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ...
-def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
-def is_time_array(values: np.ndarray, skipna: bool = ...): ...
-def is_date_array(values: np.ndarray, skipna: bool = ...): ...
-def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
-def is_string_array(values: np.ndarray, skipna: bool = ...): ...
-def is_float_array(values: np.ndarray, skipna: bool = ...): ...
-def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
-def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
-def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ...
-def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
-def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
-def map_infer(
- arr: np.ndarray,
- f: Callable[[Any], Any],
- convert: bool = ...,
- ignore_na: bool = ...,
-) -> np.ndarray: ...
-@overload # all convert_foo False -> only convert numeric
-def maybe_convert_objects(
- objects: npt.NDArray[np.object_],
- *,
- try_float: bool = ...,
- safe: bool = ...,
- convert_numeric: bool = ...,
- convert_datetime: Literal[False] = ...,
- convert_timedelta: Literal[False] = ...,
- convert_period: Literal[False] = ...,
- convert_interval: Literal[False] = ...,
- convert_to_nullable_dtype: Literal[False] = ...,
- dtype_if_all_nat: DtypeObj | None = ...,
-) -> npt.NDArray[np.object_ | np.number]: ...
-@overload # both convert_datetime and convert_to_nullable_integer False -> np.ndarray
-def maybe_convert_objects(
- objects: npt.NDArray[np.object_],
- *,
- try_float: bool = ...,
- safe: bool = ...,
- convert_numeric: bool = ...,
- convert_datetime: Literal[False] = ...,
- convert_timedelta: bool = ...,
- convert_period: Literal[False] = ...,
- convert_interval: Literal[False] = ...,
- convert_to_nullable_dtype: Literal[False] = ...,
- dtype_if_all_nat: DtypeObj | None = ...,
-) -> np.ndarray: ...
-@overload
-def maybe_convert_objects(
- objects: npt.NDArray[np.object_],
- *,
- try_float: bool = ...,
- safe: bool = ...,
- convert_numeric: bool = ...,
- convert_datetime: bool = ...,
- convert_timedelta: bool = ...,
- convert_period: bool = ...,
- convert_interval: bool = ...,
- convert_to_nullable_dtype: Literal[True] = ...,
- dtype_if_all_nat: DtypeObj | None = ...,
-) -> ArrayLike: ...
-@overload
-def maybe_convert_objects(
- objects: npt.NDArray[np.object_],
- *,
- try_float: bool = ...,
- safe: bool = ...,
- convert_numeric: bool = ...,
- convert_datetime: Literal[True] = ...,
- convert_timedelta: bool = ...,
- convert_period: bool = ...,
- convert_interval: bool = ...,
- convert_to_nullable_dtype: bool = ...,
- dtype_if_all_nat: DtypeObj | None = ...,
-) -> ArrayLike: ...
-@overload
-def maybe_convert_objects(
- objects: npt.NDArray[np.object_],
- *,
- try_float: bool = ...,
- safe: bool = ...,
- convert_numeric: bool = ...,
- convert_datetime: bool = ...,
- convert_timedelta: bool = ...,
- convert_period: Literal[True] = ...,
- convert_interval: bool = ...,
- convert_to_nullable_dtype: bool = ...,
- dtype_if_all_nat: DtypeObj | None = ...,
-) -> ArrayLike: ...
-@overload
-def maybe_convert_objects(
- objects: npt.NDArray[np.object_],
- *,
- try_float: bool = ...,
- safe: bool = ...,
- convert_numeric: bool = ...,
- convert_datetime: bool = ...,
- convert_timedelta: bool = ...,
- convert_period: bool = ...,
- convert_interval: bool = ...,
- convert_to_nullable_dtype: bool = ...,
- dtype_if_all_nat: DtypeObj | None = ...,
-) -> ArrayLike: ...
-@overload
-def maybe_convert_numeric(
- values: npt.NDArray[np.object_],
- na_values: set,
- convert_empty: bool = ...,
- coerce_numeric: bool = ...,
- convert_to_masked_nullable: Literal[False] = ...,
-) -> tuple[np.ndarray, None]: ...
-@overload
-def maybe_convert_numeric(
- values: npt.NDArray[np.object_],
- na_values: set,
- convert_empty: bool = ...,
- coerce_numeric: bool = ...,
- *,
- convert_to_masked_nullable: Literal[True],
-) -> tuple[np.ndarray, np.ndarray]: ...
-
-# TODO: restrict `arr`?
-def ensure_string_array(
- arr,
- na_value: object = ...,
- convert_na_value: bool = ...,
- copy: bool = ...,
- skipna: bool = ...,
-) -> npt.NDArray[np.object_]: ...
-def convert_nans_to_NA(
- arr: npt.NDArray[np.object_],
-) -> npt.NDArray[np.object_]: ...
-def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...
-
-# TODO: can we be more specific about rows?
-def to_object_array_tuples(rows: object) -> ndarray_obj_2d: ...
-def tuples_to_object_array(
- tuples: npt.NDArray[np.object_],
-) -> ndarray_obj_2d: ...
-
-# TODO: can we be more specific about rows?
-def to_object_array(rows: object, min_width: int = ...) -> ndarray_obj_2d: ...
-def dicts_to_array(dicts: list, columns: list) -> ndarray_obj_2d: ...
-def maybe_booleans_to_slice(
- mask: npt.NDArray[np.uint8],
-) -> slice | npt.NDArray[np.uint8]: ...
-def maybe_indices_to_slice(
- indices: npt.NDArray[np.intp],
- max_len: int,
-) -> slice | npt.NDArray[np.intp]: ...
-def is_all_arraylike(obj: list) -> bool: ...
-
-# -----------------------------------------------------------------
-# Functions which in reality take memoryviews
-
-def memory_usage_of_objects(arr: np.ndarray) -> int: ... # object[:] # np.int64
-def map_infer_mask(
- arr: np.ndarray,
- f: Callable[[Any], Any],
- mask: np.ndarray, # const uint8_t[:]
- convert: bool = ...,
- na_value: Any = ...,
- dtype: np.dtype = ...,
-) -> np.ndarray: ...
-def indices_fast(
- index: npt.NDArray[np.intp],
- labels: np.ndarray, # const int64_t[:]
- keys: list,
- sorted_labels: list[npt.NDArray[np.int64]],
-) -> dict[Hashable, npt.NDArray[np.intp]]: ...
-def generate_slices(
- labels: np.ndarray, ngroups: int # const intp_t[:]
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
-def count_level_2d(
- mask: np.ndarray, # ndarray[uint8_t, ndim=2, cast=True],
- labels: np.ndarray, # const intp_t[:]
- max_bin: int,
-) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2]
-def get_level_sorter(
- label: np.ndarray, # const int64_t[:]
- starts: np.ndarray, # const intp_t[:]
-) -> np.ndarray: ... # np.ndarray[np.intp, ndim=1]
-def generate_bins_dt64(
- values: npt.NDArray[np.int64],
- binner: np.ndarray, # const int64_t[:]
- closed: object = ...,
- hasnans: bool = ...,
-) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1]
-def array_equivalent_object(
- left: npt.NDArray[np.object_],
- right: npt.NDArray[np.object_],
-) -> bool: ...
-def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
-def get_reverse_indexer(
- indexer: np.ndarray, # const intp_t[:]
- length: int,
-) -> npt.NDArray[np.intp]: ...
-def is_bool_list(obj: list) -> bool: ...
-def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
-def is_range_indexer(
- left: np.ndarray, n: int # np.ndarray[np.int64, ndim=1]
-) -> bool: ...
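The maybe_convert_objects stubs above use @overload with Literal-typed keyword flags so a type checker can narrow the return type based on which conversions are enabled. A compact sketch of that typing pattern on a hypothetical function (the name and flag below are illustrative, not part of pandas):

from typing import Literal, overload
import numpy as np

@overload
def convert(values: np.ndarray, *, to_datetime: Literal[False] = ...) -> np.ndarray: ...
@overload
def convert(values: np.ndarray, *, to_datetime: Literal[True]) -> "np.ndarray | object": ...
def convert(values, *, to_datetime=False):
    # Runtime behaviour is irrelevant here; only the overloads matter to the checker.
    return values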
diff --git a/contrib/python/pandas/py3/pandas/_libs/lib.pyx b/contrib/python/pandas/py3/pandas/_libs/lib.pyx
deleted file mode 100644
index a918cda157e..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/lib.pyx
+++ /dev/null
@@ -1,3059 +0,0 @@
-from collections import abc
-from decimal import Decimal
-from enum import Enum
-from typing import (
- Literal,
- _GenericAlias,
-)
-
-cimport cython
-from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- PyDelta_Check,
- PyTime_Check,
- import_datetime,
-)
-from cpython.iterator cimport PyIter_Check
-from cpython.number cimport PyNumber_Check
-from cpython.object cimport (
- Py_EQ,
- PyObject,
- PyObject_RichCompareBool,
- PyTypeObject,
-)
-from cpython.ref cimport Py_INCREF
-from cpython.sequence cimport PySequence_Check
-from cpython.tuple cimport (
- PyTuple_New,
- PyTuple_SET_ITEM,
-)
-from cython cimport (
- Py_ssize_t,
- floating,
-)
-
-from pandas._libs.missing import check_na_tuples_nonequal
-
-import_datetime()
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- NPY_OBJECT,
- PyArray_Check,
- PyArray_GETITEM,
- PyArray_ITER_DATA,
- PyArray_ITER_NEXT,
- PyArray_IterNew,
- complex128_t,
- flatiter,
- float64_t,
- int32_t,
- int64_t,
- intp_t,
- ndarray,
- uint8_t,
- uint64_t,
-)
-
-cnp.import_array()
-
-cdef extern from "Python.h":
- # Note: importing extern-style allows us to declare these as nogil
- # functions, whereas `from cpython cimport` does not.
- bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
-
-cdef extern from "numpy/arrayobject.h":
- # cython's numpy.dtype specification is incorrect, which leads to
- # errors in issubclass(self.dtype.type, np.bool_), so we directly
- # include the correct version
- # https://github.com/cython/cython/issues/2022
-
- ctypedef class numpy.dtype [object PyArray_Descr]:
- # Use PyDataType_* macros when possible, however there are no macros
- # for accessing some of the fields, so some are defined. Please
- # ask on cython-dev if you need more.
- cdef:
- int type_num
- int itemsize "elsize"
- char byteorder
- object fields
- tuple names
-
- PyTypeObject PySignedIntegerArrType_Type
- PyTypeObject PyUnsignedIntegerArrType_Type
-
-cdef extern from "numpy/ndarrayobject.h":
- bint PyArray_CheckScalar(obj) nogil
-
-
-cdef extern from "src/parse_helper.h":
- int floatify(object, float64_t *result, int *maybe_int) except -1
-
-from pandas._libs cimport util
-from pandas._libs.util cimport (
- INT64_MAX,
- INT64_MIN,
- UINT64_MAX,
- is_nan,
-)
-
-from pandas._libs.tslibs import (
- OutOfBoundsDatetime,
- OutOfBoundsTimedelta,
-)
-from pandas._libs.tslibs.period import Period
-
-from pandas._libs.missing cimport (
- C_NA,
- checknull,
- is_matching_na,
- is_null_datetime64,
- is_null_timedelta64,
-)
-from pandas._libs.tslibs.conversion cimport (
- _TSObject,
- convert_to_tsobject,
-)
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
- checknull_with_nat,
-)
-from pandas._libs.tslibs.np_datetime cimport NPY_FR_ns
-from pandas._libs.tslibs.offsets cimport is_offset_object
-from pandas._libs.tslibs.period cimport is_period_object
-from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
-from pandas._libs.tslibs.timezones cimport tz_compare
-
-# constants that will be compared to potentially arbitrarily large
-# python int
-cdef:
- object oINT64_MAX = <int64_t>INT64_MAX
- object oINT64_MIN = <int64_t>INT64_MIN
- object oUINT64_MAX = <uint64_t>UINT64_MAX
-
- float64_t NaN = <float64_t>np.NaN
-
-# python-visible
-i8max = <int64_t>INT64_MAX
-u8max = <uint64_t>UINT64_MAX
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def memory_usage_of_objects(arr: object[:]) -> int64_t:
- """
- Return the memory usage of an object array in bytes.
-
- Does not include the actual bytes of the pointers
- """
- cdef:
- Py_ssize_t i
- Py_ssize_t n
- int64_t size = 0
-
- n = len(arr)
- for i in range(n):
- size += arr[i].__sizeof__()
- return size
-
-
-# ----------------------------------------------------------------------
-
-
-def is_scalar(val: object) -> bool:
- """
- Return True if given object is scalar.
-
- Parameters
- ----------
- val : object
- This includes:
-
- - numpy array scalar (e.g. np.int64)
- - Python builtin numerics
- - Python builtin byte arrays and strings
- - None
- - datetime.datetime
- - datetime.timedelta
- - Period
- - decimal.Decimal
- - Interval
- - DateOffset
- - Fraction
- - Number.
-
- Returns
- -------
- bool
- Return True if given object is scalar.
-
- Examples
- --------
- >>> import datetime
- >>> dt = datetime.datetime(2018, 10, 3)
- >>> pd.api.types.is_scalar(dt)
- True
-
- >>> pd.api.types.is_scalar([2, 3])
- False
-
- >>> pd.api.types.is_scalar({0: 1, 2: 3})
- False
-
- >>> pd.api.types.is_scalar((0, 2))
- False
-
- pandas supports PEP 3141 numbers:
-
- >>> from fractions import Fraction
- >>> pd.api.types.is_scalar(Fraction(3, 5))
- True
- """
-
- # Start with C-optimized checks
- if (cnp.PyArray_IsAnyScalar(val)
- # PyArray_IsAnyScalar is always False for bytearrays on Py3
- or PyDate_Check(val)
- or PyDelta_Check(val)
- or PyTime_Check(val)
- # We differ from numpy, which claims that None is not scalar;
- # see np.isscalar
- or val is C_NA
- or val is None):
- return True
-
- # Next use C-optimized checks to exclude common non-scalars before falling
- # back to non-optimized checks.
- if PySequence_Check(val):
- # e.g. list, tuple
- # includes np.ndarray and Series, for which PyNumber_Check can return True
- return False
-
- # Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number
- return (PyNumber_Check(val)
- or is_period_object(val)
- or is_interval(val)
- or is_offset_object(val))
-
-
-cdef int64_t get_itemsize(object val):
- """
- Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- itemsize : int
- """
- if PyArray_CheckScalar(val):
- return cnp.PyArray_DescrFromScalar(val).itemsize
- else:
- return -1
-
-
-def is_iterator(obj: object) -> bool:
- """
- Check if the object is an iterator.
-
- This is intended for generators, not list-like objects.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- is_iter : bool
- Whether `obj` is an iterator.
-
- Examples
- --------
- >>> import datetime
- >>> from pandas.api.types import is_iterator
- >>> is_iterator((x for x in []))
- True
- >>> is_iterator([1, 2, 3])
- False
- >>> is_iterator(datetime.datetime(2017, 1, 1))
- False
- >>> is_iterator("foo")
- False
- >>> is_iterator(1)
- False
- """
- return PyIter_Check(obj)
-
-
-def item_from_zerodim(val: object) -> object:
- """
- If the value is a zerodim array, return the item it contains.
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- object
-
- Examples
- --------
- >>> item_from_zerodim(1)
- 1
- >>> item_from_zerodim('foobar')
- 'foobar'
- >>> item_from_zerodim(np.array(1))
- 1
- >>> item_from_zerodim(np.array([1]))
- array([1])
- """
- if cnp.PyArray_IsZeroDim(val):
- return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val)
- return val
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list:
- cdef:
- list buf
- Py_ssize_t k = len(lists)
- Py_ssize_t i, j, n
- list uniques = []
- dict table = {}
- object val, stub = 0
-
- for i in range(k):
- buf = lists[i]
- n = len(buf)
- for j in range(n):
- val = buf[j]
- if val not in table:
- table[val] = stub
- uniques.append(val)
- if sort:
- try:
- uniques.sort()
- except TypeError:
- pass
-
- return uniques
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
- """
- Generate a list of unique values from a generator of lists.
-
- Parameters
- ----------
- gen : generator object
- Generator of lists from which the unique list is created.
- sort : bool
- Whether or not to sort the resulting unique list.
-
- Returns
- -------
- list of unique values
- """
- cdef:
- list buf
- Py_ssize_t j, n
- list uniques = []
- dict table = {}
- object val, stub = 0
-
- for buf in gen:
- n = len(buf)
- for j in range(n):
- val = buf[j]
- if val not in table:
- table[val] = stub
- uniques.append(val)
- if sort:
- try:
- uniques.sort()
- except TypeError:
- pass
-
- return uniques
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def dicts_to_array(dicts: list, columns: list):
- cdef:
- Py_ssize_t i, j, k, n
- ndarray[object, ndim=2] result
- dict row
- object col, onan = np.nan
-
- k = len(columns)
- n = len(dicts)
-
- result = np.empty((n, k), dtype="O")
-
- for i in range(n):
- row = dicts[i]
- for j in range(k):
- col = columns[j]
- if col in row:
- result[i, j] = row[col]
- else:
- result[i, j] = onan
-
- return result
-
-
-def fast_zip(list ndarrays) -> ndarray[object]:
- """
- For zipping multiple ndarrays into an ndarray of tuples.
- """
- cdef:
- Py_ssize_t i, j, k, n
- ndarray[object, ndim=1] result
- flatiter it
- object val, tup
-
- k = len(ndarrays)
- n = len(ndarrays[0])
-
- result = np.empty(n, dtype=object)
-
- # initialize tuples on first pass
- arr = ndarrays[0]
- it = <flatiter>PyArray_IterNew(arr)
- for i in range(n):
- val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
- tup = PyTuple_New(k)
-
- PyTuple_SET_ITEM(tup, 0, val)
- Py_INCREF(val)
- result[i] = tup
- PyArray_ITER_NEXT(it)
-
- for j in range(1, k):
- arr = ndarrays[j]
- it = <flatiter>PyArray_IterNew(arr)
- if len(arr) != n:
- raise ValueError("all arrays must be same length")
-
- for i in range(n):
- val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
- PyTuple_SET_ITEM(result[i], j, val)
- Py_INCREF(val)
- PyArray_ITER_NEXT(it)
-
- return result
-
-
-def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
- """
- Reverse indexing operation.
-
- Given `indexer`, make `indexer_inv` of it, such that::
-
- indexer_inv[indexer[x]] = x
-
- Parameters
- ----------
- indexer : np.ndarray[np.intp]
- length : int
-
- Returns
- -------
- np.ndarray[np.intp]
-
- Notes
- -----
- If indexer is not unique, only the first occurrence is accounted for.
- """
- cdef:
- Py_ssize_t i, n = len(indexer)
- ndarray[intp_t, ndim=1] rev_indexer
- intp_t idx
-
- rev_indexer = np.empty(length, dtype=np.intp)
- rev_indexer[:] = -1
- for i in range(n):
- idx = indexer[i]
- if idx != -1:
- rev_indexer[idx] = i
-
- return rev_indexer
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-# TODO(cython3): Can add const once cython#1772 is resolved
-def has_infs(floating[:] arr) -> bool:
- cdef:
- Py_ssize_t i, n = len(arr)
- floating inf, neginf, val
- bint ret = False
-
- inf = np.inf
- neginf = -inf
- with nogil:
- for i in range(n):
- val = arr[i]
- if val == inf or val == neginf:
- ret = True
- break
- return ret
-
-
-def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
- cdef:
- Py_ssize_t i, n = len(indices)
- intp_t k, vstart, vlast, v
-
- if n == 0:
- return slice(0, 0)
-
- vstart = indices[0]
- if vstart < 0 or max_len <= vstart:
- return indices
-
- if n == 1:
- return slice(vstart, <intp_t>(vstart + 1))
-
- vlast = indices[n - 1]
- if vlast < 0 or max_len <= vlast:
- return indices
-
- k = indices[1] - indices[0]
- if k == 0:
- return indices
- else:
- for i in range(2, n):
- v = indices[i]
- if v - indices[i - 1] != k:
- return indices
-
- if k > 0:
- return slice(vstart, <intp_t>(vlast + 1), k)
- else:
- if vlast == 0:
- return slice(vstart, None, k)
- else:
- return slice(vstart, <intp_t>(vlast - 1), k)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def maybe_booleans_to_slice(ndarray[uint8_t, ndim=1] mask):
- cdef:
- Py_ssize_t i, n = len(mask)
- Py_ssize_t start = 0, end = 0
- bint started = False, finished = False
-
- for i in range(n):
- if mask[i]:
- if finished:
- return mask.view(np.bool_)
- if not started:
- started = True
- start = i
- else:
- if finished:
- continue
-
- if started:
- end = i
- finished = True
-
- if not started:
- return slice(0, 0)
- if not finished:
- return slice(start, None)
- else:
- return slice(start, end)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def array_equivalent_object(ndarray left, ndarray right) -> bool:
- """
- Perform an element by element comparison on N-d object arrays
- taking into account nan positions.
- """
- # left and right both have object dtype, but we cannot annotate that
- # without limiting ndim.
- cdef:
- Py_ssize_t i, n = left.size
- object x, y
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(left, right)
-
- # Caller is responsible for checking left.shape == right.shape
-
- for i in range(n):
- # Analogous to: x = left[i]
- x = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 0))[0]
- y = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- # we are either not equal or both nan
- # I think None == None will be true here
- try:
- if PyArray_Check(x) and PyArray_Check(y):
- if x.shape != y.shape:
- return False
- if x.dtype == y.dtype == object:
- if not array_equivalent_object(x, y):
- return False
- else:
- # Circular import isn't great, but so it goes.
- # TODO: could use np.array_equal?
- from pandas.core.dtypes.missing import array_equivalent
-
- if not array_equivalent(x, y):
- return False
-
- elif (x is C_NA) ^ (y is C_NA):
- return False
- elif not (
- PyObject_RichCompareBool(x, y, Py_EQ)
- or is_matching_na(x, y, nan_matches_none=True)
- ):
- return False
- except (ValueError, TypeError):
- # Avoid raising ValueError when comparing Numpy arrays to other types
- if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y):
- # Only compare scalars to scalars and non-scalars to non-scalars
- return False
- elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y))
- and not (isinstance(x, type(y)) or isinstance(y, type(x)))):
- # Check if non-scalars have the same type
- return False
- elif check_na_tuples_nonequal(x, y):
- # We have tuples where one side has an NA and the other side does not;
- # this is the only condition where we may end up with a TypeError
- return False
- raise
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return True
-
-
-ctypedef fused int6432_t:
- int64_t
- int32_t
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_range_indexer(ndarray[int6432_t, ndim=1] left, Py_ssize_t n) -> bool:
- """
- Perform an element by element comparison on 1-d integer arrays, meant for indexer
- comparisons
- """
- cdef:
- Py_ssize_t i
-
- if left.size != n:
- return False
-
- for i in range(n):
-
- if left[i] != i:
- return False
-
- return True
-
-
-ctypedef fused ndarr_object:
- ndarray[object, ndim=1]
- ndarray[object, ndim=2]
-
-# TODO: get rid of this in StringArray and modify
-# and go through ensure_string_array instead
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def convert_nans_to_NA(ndarr_object arr) -> ndarray:
- """
- Helper for StringArray that converts null values that
- are not pd.NA (e.g. np.nan, None) to pd.NA. Assumes elements
- have already been validated as null.
- """
- cdef:
-        Py_ssize_t i, j, m, n
-        object val
-        ndarr_object result
-
-    result = np.asarray(arr, dtype="object")
- if arr.ndim == 2:
- m, n = arr.shape[0], arr.shape[1]
- for i in range(m):
- for j in range(n):
- val = arr[i, j]
- if not isinstance(val, str):
- result[i, j] = <object>C_NA
- else:
- n = len(arr)
- for i in range(n):
- val = arr[i]
- if not isinstance(val, str):
- result[i] = <object>C_NA
- return result
-
-
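-# Illustrative example (added sketch, not part of the original source):
-# every non-string entry (already validated as null) becomes pd.NA.
-#
-#   >>> convert_nans_to_NA(np.array(["a", None, np.nan], dtype=object))
-#   array(['a', <NA>, <NA>], dtype=object)
-
-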
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef ndarray[object] ensure_string_array(
- arr,
- object na_value=np.nan,
- bint convert_na_value=True,
- bint copy=True,
- bint skipna=True,
-):
- """
- Returns a new numpy array with object dtype and only strings and na values.
-
- Parameters
- ----------
- arr : array-like
- The values to be converted to str, if needed.
- na_value : Any, default np.nan
- The value to use for na. For example, np.nan or pd.NA.
- convert_na_value : bool, default True
- If False, existing na values will be used unchanged in the new array.
- copy : bool, default True
- Whether to ensure that a new array is returned.
- skipna : bool, default True
-        Whether to skip coercing nulls to their stringified form
-        (e.g. if False, NaN becomes 'nan').
-
- Returns
- -------
- np.ndarray[object]
-        An array with the input array's elements cast to str or nan-like.
- """
- cdef:
- Py_ssize_t i = 0, n = len(arr)
- bint already_copied = True
-
- if hasattr(arr, "to_numpy"):
-
- if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]:
- # dtype check to exclude DataFrame
- # GH#41409 TODO: not a great place for this
- out = arr.astype(str).astype(object)
- out[arr.isna()] = na_value
- return out
- arr = arr.to_numpy()
- elif not util.is_array(arr):
- arr = np.array(arr, dtype="object")
-
- result = np.asarray(arr, dtype="object")
-
- if copy and result is arr:
- result = result.copy()
- elif not copy and result is arr:
- already_copied = False
-
- if issubclass(arr.dtype.type, np.str_):
- # short-circuit, all elements are str
- return result
-
- for i in range(n):
- val = arr[i]
-
- if isinstance(val, str):
- continue
-
- elif not already_copied:
- result = result.copy()
- already_copied = True
-
- if not checknull(val):
- if isinstance(val, bytes):
- # GH#49658 discussion of desired behavior here
- result[i] = val.decode()
- elif not util.is_float_object(val):
- # f"{val}" is faster than str(val)
- result[i] = f"{val}"
- else:
- # f"{val}" is not always equivalent to str(val) for floats
- result[i] = str(val)
- else:
- if convert_na_value:
- val = na_value
- if skipna:
- result[i] = val
- else:
- result[i] = f"{val}"
-
- return result
-
-
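-# Illustrative example (added sketch, not part of the original source):
-# non-null values are stringified, nulls become ``na_value``.
-#
-#   >>> ensure_string_array(np.array([1, np.nan, "x"], dtype=object))
-#   array(['1', nan, 'x'], dtype=object)
-
-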
-def is_all_arraylike(obj: list) -> bool:
- """
- Should we treat these as levels of a MultiIndex, as opposed to Index items?
- """
- cdef:
- Py_ssize_t i, n = len(obj)
- object val
- bint all_arrays = True
-
- for i in range(n):
- val = obj[i]
- if not (isinstance(val, list) or
- util.is_array(val) or hasattr(val, "_data")):
- # TODO: EA?
- # exclude tuples, frozensets as they may be contained in an Index
- all_arrays = False
- break
-
- return all_arrays
-
-
-# ------------------------------------------------------------------------------
-# Groupby-related functions
-
-# TODO: could do even better if we know something about the data. e.g., index has
-# 1-min data, binner has 5-min data, then bins are just strides in index. This
-# is a general, O(max(len(values), len(binner))) method.
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def generate_bins_dt64(ndarray[int64_t, ndim=1] values, const int64_t[:] binner,
- object closed="left", bint hasnans=False):
- """
-    Int64 (datetime64) version of the generic python version in ``groupby.py``.
- """
- cdef:
- Py_ssize_t lenidx, lenbin, i, j, bc
- ndarray[int64_t, ndim=1] bins
- int64_t r_bin, nat_count
- bint right_closed = closed == "right"
-
- nat_count = 0
- if hasnans:
- mask = values == NPY_NAT
- nat_count = np.sum(mask)
- values = values[~mask]
-
- lenidx = len(values)
- lenbin = len(binner)
-
- if lenidx <= 0 or lenbin <= 0:
- raise ValueError("Invalid length for values or for binner")
-
- # check binner fits data
- if values[0] < binner[0]:
- raise ValueError("Values falls before first bin")
-
- if values[lenidx - 1] > binner[lenbin - 1]:
- raise ValueError("Values falls after last bin")
-
- bins = np.empty(lenbin - 1, dtype=np.int64)
-
- j = 0 # index into values
- bc = 0 # bin count
-
- # linear scan
- if right_closed:
- for i in range(0, lenbin - 1):
- r_bin = binner[i + 1]
- # count values in current bin, advance to next bin
- while j < lenidx and values[j] <= r_bin:
- j += 1
- bins[bc] = j
- bc += 1
- else:
- for i in range(0, lenbin - 1):
- r_bin = binner[i + 1]
- # count values in current bin, advance to next bin
- while j < lenidx and values[j] < r_bin:
- j += 1
- bins[bc] = j
- bc += 1
-
- if nat_count > 0:
- # shift bins by the number of NaT
- bins = bins + nat_count
- bins = np.insert(bins, 0, nat_count)
-
- return bins
-
-
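-# Illustrative example (added sketch, not part of the original source): the
-# result holds, for each bin, the running count of values consumed up to that
-# bin's right edge.
-#
-#   >>> values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
-#   >>> binner = np.array([0, 3, 6, 9], dtype=np.int64)
-#   >>> generate_bins_dt64(values, binner, closed="left")
-#   array([2, 5, 6])
-
-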
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def get_level_sorter(
- ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
-) -> ndarray:
- """
- Argsort for a single level of a multi-index, keeping the order of higher
-    levels unchanged. `starts` points to starts of same-key indices w.r.t.
-    leading levels; equivalent to:
- np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
- + starts[i] for i in range(len(starts) - 1)])
-
- Parameters
- ----------
- codes : np.ndarray[int64_t, ndim=1]
- starts : np.ndarray[intp, ndim=1]
-
- Returns
- -------
-    np.ndarray[np.intp, ndim=1]
- """
- cdef:
- Py_ssize_t i, l, r
- ndarray[intp_t, ndim=1] out = cnp.PyArray_EMPTY(1, codes.shape, cnp.NPY_INTP, 0)
-
- for i in range(len(starts) - 1):
- l, r = starts[i], starts[i + 1]
- out[l:r] = l + codes[l:r].argsort(kind="mergesort")
-
- return out
-
-
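-# Illustrative example (added sketch, not part of the original source): each
-# [starts[i], starts[i + 1]) block is argsorted independently.
-#
-#   >>> codes = np.array([2, 1, 1, 0], dtype=np.int64)
-#   >>> starts = np.array([0, 2, 4], dtype=np.intp)
-#   >>> get_level_sorter(codes, starts)
-#   array([1, 0, 3, 2])
-
-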
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
- const intp_t[:] labels,
- Py_ssize_t max_bin,
- ):
- cdef:
- Py_ssize_t i, j, k, n
- ndarray[int64_t, ndim=2] counts
-
- n, k = (<object>mask).shape
-
- counts = np.zeros((n, max_bin), dtype="i8")
- with nogil:
- for i in range(n):
- for j in range(k):
- if mask[i, j]:
- counts[i, labels[j]] += 1
-
- return counts
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
- cdef:
- Py_ssize_t i, group_size, n, start
- intp_t lab
- int64_t[::1] starts, ends
-
- n = len(labels)
-
- starts = np.zeros(ngroups, dtype=np.int64)
- ends = np.zeros(ngroups, dtype=np.int64)
-
- start = 0
- group_size = 0
- with nogil:
- for i in range(n):
- lab = labels[i]
- if lab < 0:
- start += 1
- else:
- group_size += 1
- if i == n - 1 or lab != labels[i + 1]:
- starts[lab] = start
- ends[lab] = start + group_size
- start += group_size
- group_size = 0
-
- return np.asarray(starts), np.asarray(ends)
-
-
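-# Illustrative example (added sketch, not part of the original source): for
-# sorted group labels this returns the [start, end) offsets of each group
-# (negative labels are skipped).
-#
-#   >>> generate_slices(np.array([0, 0, 1, 1, 1], dtype=np.intp), 2)
-#   (array([0, 2]), array([2, 5]))
-
-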
-def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys,
- list sorted_labels) -> dict:
- """
- Parameters
- ----------
- index : ndarray[intp]
- labels : ndarray[int64]
- keys : list
- sorted_labels : list[ndarray[int64]]
- """
- cdef:
- Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
- dict result = {}
- object tup
-
- k = len(keys)
-
- # Start at the first non-null entry
- j = 0
- for j in range(0, n):
- if labels[j] != -1:
- break
- else:
- return result
- cur = labels[j]
- start = j
-
- for i in range(j+1, n):
- lab = labels[i]
-
- if lab != cur:
- if lab != -1:
- if k == 1:
- # When k = 1 we do not want to return a tuple as key
- tup = keys[0][sorted_labels[0][i - 1]]
- else:
- tup = PyTuple_New(k)
- for j in range(k):
- val = keys[j][sorted_labels[j][i - 1]]
- PyTuple_SET_ITEM(tup, j, val)
- Py_INCREF(val)
- result[tup] = index[start:i]
- start = i
- cur = lab
-
- if k == 1:
- # When k = 1 we do not want to return a tuple as key
- tup = keys[0][sorted_labels[0][n - 1]]
- else:
- tup = PyTuple_New(k)
- for j in range(k):
- val = keys[j][sorted_labels[j][n - 1]]
- PyTuple_SET_ITEM(tup, j, val)
- Py_INCREF(val)
- result[tup] = index[start:]
-
- return result
-
-
-# core.common import for fast inference checks
-
-def is_float(obj: object) -> bool:
- """
-    Return True if the given object is a float.
-
- Returns
- -------
- bool
- """
- return util.is_float_object(obj)
-
-
-def is_integer(obj: object) -> bool:
- """
-    Return True if the given object is an integer.
-
- Returns
- -------
- bool
- """
- return util.is_integer_object(obj)
-
-
-def is_bool(obj: object) -> bool:
- """
-    Return True if the given object is a boolean.
-
- Returns
- -------
- bool
- """
- return util.is_bool_object(obj)
-
-
-def is_complex(obj: object) -> bool:
- """
-    Return True if the given object is a complex number.
-
- Returns
- -------
- bool
- """
- return util.is_complex_object(obj)
-
-
-cpdef bint is_decimal(object obj):
- return isinstance(obj, Decimal)
-
-
-cpdef bint is_interval(object obj):
- return getattr(obj, "_typ", "_typ") == "interval"
-
-
-def is_period(val: object) -> bool:
- """
-    Return True if the given object is a Period.
-
- Returns
- -------
- bool
- """
- return is_period_object(val)
-
-
-def is_list_like(obj: object, allow_sets: bool = True) -> bool:
- """
- Check if the object is list-like.
-
- Objects that are considered list-like are for example Python
- lists, tuples, sets, NumPy arrays, and Pandas Series.
-
- Strings and datetime objects, however, are not considered list-like.
-
- Parameters
- ----------
- obj : object
- Object to check.
- allow_sets : bool, default True
- If this parameter is False, sets will not be considered list-like.
-
- Returns
- -------
- bool
- Whether `obj` has list-like properties.
-
- Examples
- --------
- >>> import datetime
- >>> from pandas.api.types import is_list_like
- >>> is_list_like([1, 2, 3])
- True
- >>> is_list_like({1, 2, 3})
- True
- >>> is_list_like(datetime.datetime(2017, 1, 1))
- False
- >>> is_list_like("foo")
- False
- >>> is_list_like(1)
- False
- >>> is_list_like(np.array([2]))
- True
- >>> is_list_like(np.array(2))
- False
- """
- return c_is_list_like(obj, allow_sets)
-
-
-cdef bint c_is_list_like(object obj, bint allow_sets) except -1:
- # first, performance short-cuts for the most common cases
- if util.is_array(obj):
- # exclude zero-dimensional numpy arrays, effectively scalars
- return not cnp.PyArray_IsZeroDim(obj)
- elif isinstance(obj, list):
- return True
- # then the generic implementation
- return (
- # equiv: `isinstance(obj, abc.Iterable)`
- getattr(obj, "__iter__", None) is not None and not isinstance(obj, type)
- # we do not count strings/unicode/bytes as list-like
- # exclude Generic types that have __iter__
- and not isinstance(obj, (str, bytes, _GenericAlias))
- # exclude zero-dimensional duck-arrays, effectively scalars
- and not (hasattr(obj, "ndim") and obj.ndim == 0)
- # exclude sets if allow_sets is False
- and not (allow_sets is False and isinstance(obj, abc.Set))
- )
-
-
-_TYPE_MAP = {
- "categorical": "categorical",
- "category": "categorical",
- "int8": "integer",
- "int16": "integer",
- "int32": "integer",
- "int64": "integer",
- "i": "integer",
- "uint8": "integer",
- "uint16": "integer",
- "uint32": "integer",
- "uint64": "integer",
- "u": "integer",
- "float32": "floating",
- "float64": "floating",
- "f": "floating",
- "complex64": "complex",
- "complex128": "complex",
- "c": "complex",
- "string": "string",
- str: "string",
- "S": "bytes",
- "U": "string",
- "bool": "boolean",
- "b": "boolean",
- "datetime64[ns]": "datetime64",
- "M": "datetime64",
- "timedelta64[ns]": "timedelta64",
- "m": "timedelta64",
- "interval": "interval",
- Period: "period",
-}
-
-# these types only exist on certain platforms
-try:
- np.float128
- _TYPE_MAP["float128"] = "floating"
-except AttributeError:
- pass
-try:
- np.complex256
- _TYPE_MAP["complex256"] = "complex"
-except AttributeError:
- pass
-try:
- np.float16
- _TYPE_MAP["float16"] = "floating"
-except AttributeError:
- pass
-
-
-@cython.internal
-cdef class Seen:
- """
- Class for keeping track of the types of elements
- encountered when trying to perform type conversions.
- """
-
- cdef:
- bint int_ # seen_int
- bint nat_ # seen nat
- bint bool_ # seen_bool
- bint null_ # seen_null
- bint nan_ # seen_np.nan
- bint uint_ # seen_uint (unsigned integer)
- bint sint_ # seen_sint (signed integer)
- bint float_ # seen_float
- bint object_ # seen_object
- bint complex_ # seen_complex
- bint datetime_ # seen_datetime
- bint coerce_numeric # coerce data to numeric
- bint timedelta_ # seen_timedelta
- bint datetimetz_ # seen_datetimetz
- bint period_ # seen_period
- bint interval_ # seen_interval
-
- def __cinit__(self, bint coerce_numeric=False):
- """
- Initialize a Seen instance.
-
- Parameters
- ----------
- coerce_numeric : bool, default False
- Whether or not to force conversion to a numeric data type if
- initial methods to convert to numeric fail.
- """
- self.int_ = False
- self.nat_ = False
- self.bool_ = False
- self.null_ = False
- self.nan_ = False
- self.uint_ = False
- self.sint_ = False
- self.float_ = False
- self.object_ = False
- self.complex_ = False
- self.datetime_ = False
- self.timedelta_ = False
- self.datetimetz_ = False
- self.period_ = False
- self.interval_ = False
- self.coerce_numeric = coerce_numeric
-
- cdef bint check_uint64_conflict(self) except -1:
- """
- Check whether we can safely convert a uint64 array to a numeric dtype.
-
- There are two cases when conversion to numeric dtype with a uint64
- array is not safe (and will therefore not be performed)
-
- 1) A NaN element is encountered.
-
- uint64 cannot be safely cast to float64 due to truncation issues
- at the extreme ends of the range.
-
- 2) A negative number is encountered.
-
- There is no numerical dtype that can hold both negative numbers
- and numbers greater than INT64_MAX. Hence, at least one number
- will be improperly cast if we convert to a numeric dtype.
-
- Returns
- -------
- bool
- Whether or not we should return the original input array to avoid
- data truncation.
-
- Raises
- ------
- ValueError
- uint64 elements were detected, and at least one of the
- two conflict cases was also detected. However, we are
- trying to force conversion to a numeric dtype.
- """
- return (self.uint_ and (self.null_ or self.sint_)
- and not self.coerce_numeric)
-
- cdef saw_null(self):
- """
- Set flags indicating that a null value was encountered.
- """
- self.null_ = True
- self.float_ = True
-
- cdef saw_int(self, object val):
- """
- Set flags indicating that an integer value was encountered.
-
- In addition to setting a flag that an integer was seen, we
- also set two flags depending on the type of integer seen:
-
- 1) sint_ : a signed numpy integer type or a negative (signed) number in the
- range of [-2**63, 0) was encountered
- 2) uint_ : an unsigned numpy integer type or a positive number in the range of
- [2**63, 2**64) was encountered
-
- Parameters
- ----------
- val : Python int
- Value with which to set the flags.
- """
- self.int_ = True
- self.sint_ = (
- self.sint_
- or (oINT64_MIN <= val < 0)
- # Cython equivalent of `isinstance(val, np.signedinteger)`
- or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type)
- )
- self.uint_ = (
- self.uint_
- or (oINT64_MAX < val <= oUINT64_MAX)
- # Cython equivalent of `isinstance(val, np.unsignedinteger)`
- or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type)
- )
-
- @property
- def numeric_(self):
- return self.complex_ or self.float_ or self.int_
-
- @property
- def is_bool(self):
- # i.e. not (anything but bool)
- return self.is_bool_or_na and not (self.nan_ or self.null_)
-
- @property
- def is_bool_or_na(self):
- # i.e. not (anything but bool or missing values)
- return self.bool_ and not (
- self.datetime_ or self.datetimetz_ or self.nat_ or self.timedelta_
- or self.period_ or self.interval_ or self.numeric_ or self.object_
- )
-
-
-cdef object _try_infer_map(object dtype):
- """
-    If it's in our map, return the inferred dtype string.
- """
- cdef:
- object val
- str attr
- for attr in ["kind", "name", "base", "type"]:
- val = getattr(dtype, attr, None)
- if val in _TYPE_MAP:
- return _TYPE_MAP[val]
- return None
-
-
-def infer_dtype(value: object, skipna: bool = True) -> str:
- """
- Return a string label of the type of a scalar or list-like of values.
-
- Parameters
- ----------
- value : scalar, list, ndarray, or pandas type
- skipna : bool, default True
- Ignore NaN values when inferring the type.
-
- Returns
- -------
- str
- Describing the common type of the input data.
- Results can include:
-
- - string
- - bytes
- - floating
- - integer
- - mixed-integer
- - mixed-integer-float
- - decimal
- - complex
- - categorical
- - boolean
- - datetime64
- - datetime
- - date
- - timedelta64
- - timedelta
- - time
- - period
- - mixed
- - unknown-array
-
- Raises
- ------
- TypeError
- If ndarray-like but cannot infer the dtype
-
- Notes
- -----
- - 'mixed' is the catchall for anything that is not otherwise
- specialized
- - 'mixed-integer-float' are floats and integers
- - 'mixed-integer' are integers mixed with non-integers
- - 'unknown-array' is the catchall for something that *is* an array (has
- a dtype attribute), but has a dtype unknown to pandas (e.g. external
- extension array)
-
- Examples
- --------
- >>> import datetime
- >>> infer_dtype(['foo', 'bar'])
- 'string'
-
- >>> infer_dtype(['a', np.nan, 'b'], skipna=True)
- 'string'
-
- >>> infer_dtype(['a', np.nan, 'b'], skipna=False)
- 'mixed'
-
- >>> infer_dtype([b'foo', b'bar'])
- 'bytes'
-
- >>> infer_dtype([1, 2, 3])
- 'integer'
-
- >>> infer_dtype([1, 2, 3.5])
- 'mixed-integer-float'
-
- >>> infer_dtype([1.0, 2.0, 3.5])
- 'floating'
-
- >>> infer_dtype(['a', 1])
- 'mixed-integer'
-
- >>> infer_dtype([Decimal(1), Decimal(2.0)])
- 'decimal'
-
- >>> infer_dtype([True, False])
- 'boolean'
-
- >>> infer_dtype([True, False, np.nan])
- 'boolean'
-
- >>> infer_dtype([pd.Timestamp('20130101')])
- 'datetime'
-
- >>> infer_dtype([datetime.date(2013, 1, 1)])
- 'date'
-
- >>> infer_dtype([np.datetime64('2013-01-01')])
- 'datetime64'
-
- >>> infer_dtype([datetime.timedelta(0, 1, 1)])
- 'timedelta'
-
- >>> infer_dtype(pd.Series(list('aabc')).astype('category'))
- 'categorical'
- """
- cdef:
- Py_ssize_t i, n
- object val
- ndarray values
- bint seen_pdnat = False
- bint seen_val = False
- flatiter it
-
- if util.is_array(value):
- values = value
- elif hasattr(type(value), "inferred_type") and skipna is False:
- # Index, use the cached attribute if possible, populate the cache otherwise
- return value.inferred_type
- elif hasattr(value, "dtype"):
- inferred = _try_infer_map(value.dtype)
- if inferred is not None:
- return inferred
- elif not cnp.PyArray_DescrCheck(value.dtype):
- return "unknown-array"
- # Unwrap Series/Index
- values = np.asarray(value)
- else:
- if not isinstance(value, list):
- value = list(value)
- if not value:
- return "empty"
-
- from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
- values = construct_1d_object_array_from_listlike(value)
-
- inferred = _try_infer_map(values.dtype)
- if inferred is not None:
- # Anything other than object-dtype should return here.
- return inferred
-
- if values.descr.type_num != NPY_OBJECT:
- # i.e. values.dtype != np.object_
- # This should not be reached
- values = values.astype(object)
-
- n = cnp.PyArray_SIZE(values)
- if n == 0:
- return "empty"
-
- # Iterate until we find our first valid value. We will use this
- # value to decide which of the is_foo_array functions to call.
- it = PyArray_IterNew(values)
- for i in range(n):
- # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
- # equivalents to `val = values[i]`
- val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
- PyArray_ITER_NEXT(it)
-
-        # do not use checknull, so that np.datetime64('nat') and
-        # np.timedelta64('nat') are not treated as generic nulls here
- if val is None or util.is_nan(val) or val is C_NA:
- pass
- elif val is NaT:
- seen_pdnat = True
- else:
- seen_val = True
- break
-
- # if all values are nan/NaT
- if seen_val is False and seen_pdnat is True:
- return "datetime"
-    # float/object nan is handled in later logic
- if seen_val is False and skipna:
- return "empty"
-
- if util.is_datetime64_object(val):
- if is_datetime64_array(values, skipna=skipna):
- return "datetime64"
-
- elif is_timedelta(val):
- if is_timedelta_or_timedelta64_array(values, skipna=skipna):
- return "timedelta"
-
- elif util.is_integer_object(val):
- # ordering matters here; this check must come after the is_timedelta
- # check otherwise numpy timedelta64 objects would come through here
-
- if is_integer_array(values, skipna=skipna):
- return "integer"
- elif is_integer_float_array(values, skipna=skipna):
- if is_integer_na_array(values, skipna=skipna):
- return "integer-na"
- else:
- return "mixed-integer-float"
- return "mixed-integer"
-
- elif PyDateTime_Check(val):
- if is_datetime_array(values, skipna=skipna):
- return "datetime"
- elif is_date_array(values, skipna=skipna):
- return "date"
-
- elif PyDate_Check(val):
- if is_date_array(values, skipna=skipna):
- return "date"
-
- elif PyTime_Check(val):
- if is_time_array(values, skipna=skipna):
- return "time"
-
- elif is_decimal(val):
- if is_decimal_array(values, skipna=skipna):
- return "decimal"
-
- elif util.is_complex_object(val):
- if is_complex_array(values):
- return "complex"
-
- elif util.is_float_object(val):
- if is_float_array(values):
- return "floating"
- elif is_integer_float_array(values, skipna=skipna):
- if is_integer_na_array(values, skipna=skipna):
- return "integer-na"
- else:
- return "mixed-integer-float"
-
- elif util.is_bool_object(val):
- if is_bool_array(values, skipna=skipna):
- return "boolean"
-
- elif isinstance(val, str):
- if is_string_array(values, skipna=skipna):
- return "string"
-
- elif isinstance(val, bytes):
- if is_bytes_array(values, skipna=skipna):
- return "bytes"
-
- elif is_period_object(val):
- if is_period_array(values, skipna=skipna):
- return "period"
-
- elif is_interval(val):
- if is_interval_array(values):
- return "interval"
-
- cnp.PyArray_ITER_RESET(it)
- for i in range(n):
- val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
- PyArray_ITER_NEXT(it)
-
- if util.is_integer_object(val):
- return "mixed-integer"
-
- return "mixed"
-
-
-cdef bint is_timedelta(object o):
- return PyDelta_Check(o) or util.is_timedelta64_object(o)
-
-
-@cython.internal
-cdef class Validator:
-
- cdef:
- Py_ssize_t n
- dtype dtype
- bint skipna
-
- def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
- bint skipna=False):
- self.n = n
- self.dtype = dtype
- self.skipna = skipna
-
- cdef bint validate(self, ndarray values) except -1:
- if not self.n:
- return False
-
- if self.is_array_typed():
- # i.e. this ndarray is already of the desired dtype
- return True
- elif self.dtype.type_num == NPY_OBJECT:
- if self.skipna:
- return self._validate_skipna(values)
- else:
- return self._validate(values)
- else:
- return False
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef bint _validate(self, ndarray values) except -1:
- cdef:
- Py_ssize_t i
- Py_ssize_t n = values.size
- flatiter it = PyArray_IterNew(values)
-
- for i in range(n):
- # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
- # equivalents to `val = values[i]`
- val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
- PyArray_ITER_NEXT(it)
- if not self.is_valid(val):
- return False
-
- return True
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef bint _validate_skipna(self, ndarray values) except -1:
- cdef:
- Py_ssize_t i
- Py_ssize_t n = values.size
- flatiter it = PyArray_IterNew(values)
-
- for i in range(n):
- # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
- # equivalents to `val = values[i]`
- val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
- PyArray_ITER_NEXT(it)
- if not self.is_valid_skipna(val):
- return False
-
- return True
-
- cdef bint is_valid(self, object value) except -1:
- return self.is_value_typed(value)
-
- cdef bint is_valid_skipna(self, object value) except -1:
- return self.is_valid(value) or self.is_valid_null(value)
-
- cdef bint is_value_typed(self, object value) except -1:
- raise NotImplementedError(f"{type(self).__name__} child class "
- "must define is_value_typed")
-
- cdef bint is_valid_null(self, object value) except -1:
- return value is None or value is C_NA or util.is_nan(value)
- # TODO: include decimal NA?
-
- cdef bint is_array_typed(self) except -1:
- return False
-
-
-@cython.internal
-cdef class BoolValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return util.is_bool_object(value)
-
- cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.bool_)
-
-
-cpdef bint is_bool_array(ndarray values, bint skipna=False):
- cdef:
- BoolValidator validator = BoolValidator(len(values),
- values.dtype,
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class IntegerValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return util.is_integer_object(value)
-
- cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.integer)
-
-
-# Note: only python-exposed for tests
-cpdef bint is_integer_array(ndarray values, bint skipna=True):
- cdef:
- IntegerValidator validator = IntegerValidator(len(values),
- values.dtype,
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class IntegerNaValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return (util.is_integer_object(value)
- or (util.is_nan(value) and util.is_float_object(value)))
-
-
-cdef bint is_integer_na_array(ndarray values, bint skipna=True):
- cdef:
- IntegerNaValidator validator = IntegerNaValidator(len(values),
- values.dtype, skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class IntegerFloatValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return util.is_integer_object(value) or util.is_float_object(value)
-
- cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.integer)
-
-
-cdef bint is_integer_float_array(ndarray values, bint skipna=True):
- cdef:
- IntegerFloatValidator validator = IntegerFloatValidator(len(values),
- values.dtype,
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class FloatValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return util.is_float_object(value)
-
- cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.floating)
-
-
-# Note: only python-exposed for tests
-cpdef bint is_float_array(ndarray values):
- cdef:
- FloatValidator validator = FloatValidator(len(values), values.dtype)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class ComplexValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return (
- util.is_complex_object(value)
- or (util.is_float_object(value) and is_nan(value))
- )
-
- cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.complexfloating)
-
-
-cdef bint is_complex_array(ndarray values):
- cdef:
- ComplexValidator validator = ComplexValidator(len(values), values.dtype)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class DecimalValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return is_decimal(value)
-
-
-cdef bint is_decimal_array(ndarray values, bint skipna=False):
- cdef:
- DecimalValidator validator = DecimalValidator(
- len(values), values.dtype, skipna=skipna
- )
- return validator.validate(values)
-
-
-@cython.internal
-cdef class StringValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return isinstance(value, str)
-
- cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.str_)
-
-
-cpdef bint is_string_array(ndarray values, bint skipna=False):
- cdef:
- StringValidator validator = StringValidator(len(values),
- values.dtype,
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class BytesValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return isinstance(value, bytes)
-
- cdef bint is_array_typed(self) except -1:
- return issubclass(self.dtype.type, np.bytes_)
-
-
-cdef bint is_bytes_array(ndarray values, bint skipna=False):
- cdef:
- BytesValidator validator = BytesValidator(len(values), values.dtype,
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class TemporalValidator(Validator):
- cdef:
- bint all_generic_na
-
- def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
- bint skipna=False):
- self.n = n
- self.dtype = dtype
- self.skipna = skipna
- self.all_generic_na = True
-
- cdef bint is_valid(self, object value) except -1:
- return self.is_value_typed(value) or self.is_valid_null(value)
-
- cdef bint is_valid_null(self, object value) except -1:
- raise NotImplementedError(f"{type(self).__name__} child class "
- "must define is_valid_null")
-
- cdef bint is_valid_skipna(self, object value) except -1:
- cdef:
- bint is_typed_null = self.is_valid_null(value)
- bint is_generic_null = value is None or util.is_nan(value)
- if not is_generic_null:
- self.all_generic_na = False
- return self.is_value_typed(value) or is_typed_null or is_generic_null
-
- cdef bint _validate_skipna(self, ndarray values) except -1:
- """
- If we _only_ saw non-dtype-specific NA values, even if they are valid
- for this dtype, we do not infer this dtype.
- """
- return Validator._validate_skipna(self, values) and not self.all_generic_na
-
-
-@cython.internal
-cdef class DatetimeValidator(TemporalValidator):
- cdef bint is_value_typed(self, object value) except -1:
- return PyDateTime_Check(value)
-
- cdef bint is_valid_null(self, object value) except -1:
- return is_null_datetime64(value)
-
-
-cpdef bint is_datetime_array(ndarray values, bint skipna=True):
- cdef:
- DatetimeValidator validator = DatetimeValidator(len(values),
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class Datetime64Validator(DatetimeValidator):
- cdef bint is_value_typed(self, object value) except -1:
- return util.is_datetime64_object(value)
-
-
-# Note: only python-exposed for tests
-cpdef bint is_datetime64_array(ndarray values, bint skipna=True):
- cdef:
- Datetime64Validator validator = Datetime64Validator(len(values),
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class AnyDatetimeValidator(DatetimeValidator):
- cdef bint is_value_typed(self, object value) except -1:
- return util.is_datetime64_object(value) or (
- PyDateTime_Check(value) and value.tzinfo is None
- )
-
-
-cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True):
- cdef:
- AnyDatetimeValidator validator = AnyDatetimeValidator(len(values),
- skipna=skipna)
- return validator.validate(values)
-
-
-# Note: only python-exposed for tests
-def is_datetime_with_singletz_array(values: ndarray) -> bool:
- """
-    Check that all values have the same tzinfo attribute.
-    Does not check that the values are datetime-like types.
- """
- cdef:
- Py_ssize_t i = 0, j, n = len(values)
- object base_val, base_tz, val, tz
-
- if n == 0:
- return False
-
- # Get a reference timezone to compare with the rest of the tzs in the array
- for i in range(n):
- base_val = values[i]
- if base_val is not NaT and base_val is not None and not util.is_nan(base_val):
- base_tz = getattr(base_val, "tzinfo", None)
- break
-
- for j in range(i, n):
- # Compare val's timezone with the reference timezone
- # NaT can coexist with tz-aware datetimes, so skip if encountered
- val = values[j]
- if val is not NaT and val is not None and not util.is_nan(val):
- tz = getattr(val, "tzinfo", None)
- if not tz_compare(base_tz, tz):
- return False
-
-    # Note: we should only be called if a tz-aware datetime has been seen,
- # so base_tz should always be set at this point.
- return True
-
-
-@cython.internal
-cdef class TimedeltaValidator(TemporalValidator):
- cdef bint is_value_typed(self, object value) except -1:
- return PyDelta_Check(value)
-
- cdef bint is_valid_null(self, object value) except -1:
- return is_null_timedelta64(value)
-
-
-@cython.internal
-cdef class AnyTimedeltaValidator(TimedeltaValidator):
- cdef bint is_value_typed(self, object value) except -1:
- return is_timedelta(value)
-
-
-# Note: only python-exposed for tests
-cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True):
- """
-    Infer whether we have an array of timedeltas and/or missing values (NaT/None).
- """
- cdef:
- AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values),
- skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class DateValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return PyDate_Check(value)
-
-
-# Note: only python-exposed for tests
-cpdef bint is_date_array(ndarray values, bint skipna=False):
- cdef:
- DateValidator validator = DateValidator(len(values), skipna=skipna)
- return validator.validate(values)
-
-
-@cython.internal
-cdef class TimeValidator(Validator):
- cdef bint is_value_typed(self, object value) except -1:
- return PyTime_Check(value)
-
-
-# Note: only python-exposed for tests
-cpdef bint is_time_array(ndarray values, bint skipna=False):
- cdef:
- TimeValidator validator = TimeValidator(len(values), skipna=skipna)
- return validator.validate(values)
-
-
-# FIXME: actually use skipna
-cdef bint is_period_array(ndarray values, bint skipna=True):
- """
- Is this an ndarray of Period objects (or NaT) with a single `freq`?
- """
- # values should be object-dtype, but ndarray[object] assumes 1D, while
- # this _may_ be 2D.
- cdef:
- Py_ssize_t i, N = values.size
- int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND
- object val
- flatiter it
-
- if N == 0:
- return False
-
- it = PyArray_IterNew(values)
- for i in range(N):
- # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
- # equivalents to `val = values[i]`
- val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
- PyArray_ITER_NEXT(it)
-
- if is_period_object(val):
- if dtype_code == -10000:
- dtype_code = val._dtype._dtype_code
- elif dtype_code != val._dtype._dtype_code:
- # mismatched freqs
- return False
- elif checknull_with_nat(val):
- pass
- else:
- # Not a Period or NaT-like
- return False
-
- if dtype_code == -10000:
- # we saw all-NaTs, no actual Periods
- return False
- return True
-
-
-# Note: only python-exposed for tests
-cpdef bint is_interval_array(ndarray values):
- """
-    Is this an ndarray of Interval objects (or np.nan) with a single dtype?
- """
- cdef:
- Py_ssize_t i, n = len(values)
- str closed = None
- bint numeric = False
- bint dt64 = False
- bint td64 = False
- object val
-
- if len(values) == 0:
- return False
-
- for i in range(n):
- val = values[i]
-
- if is_interval(val):
- if closed is None:
- closed = val.closed
- numeric = (
- util.is_float_object(val.left)
- or util.is_integer_object(val.left)
- )
- td64 = is_timedelta(val.left)
- dt64 = PyDateTime_Check(val.left)
- elif val.closed != closed:
- # mismatched closedness
- return False
- elif numeric:
- if not (
- util.is_float_object(val.left)
- or util.is_integer_object(val.left)
- ):
- # i.e. datetime64 or timedelta64
- return False
- elif td64:
- if not is_timedelta(val.left):
- return False
- elif dt64:
- if not PyDateTime_Check(val.left):
- return False
- else:
- raise ValueError(val)
- elif util.is_nan(val) or val is None:
- pass
- else:
- return False
-
- if closed is None:
- # we saw all-NAs, no actual Intervals
- return False
- return True
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def maybe_convert_numeric(
- ndarray[object, ndim=1] values,
- set na_values,
- bint convert_empty=True,
- bint coerce_numeric=False,
- bint convert_to_masked_nullable=False,
-) -> tuple[np.ndarray, np.ndarray | None]:
- """
- Convert object array to a numeric array if possible.
-
- Parameters
- ----------
- values : ndarray[object]
- Array of object elements to convert.
- na_values : set
- Set of values that should be interpreted as NaN.
- convert_empty : bool, default True
- If an empty array-like object is encountered, whether to interpret
- that element as NaN or not. If set to False, a ValueError will be
- raised if such an element is encountered and 'coerce_numeric' is False.
- coerce_numeric : bool, default False
- If initial attempts to convert to numeric have failed, whether to
- force conversion to numeric via alternative methods or by setting the
- element to NaN. Otherwise, an Exception will be raised when such an
- element is encountered.
-
- This boolean also has an impact on how conversion behaves when a
- numeric array has no suitable numerical dtype to return (i.e. uint64,
- int32, uint8). If set to False, the original object array will be
- returned. Otherwise, a ValueError will be raised.
- convert_to_masked_nullable : bool, default False
- Whether to return a mask for the converted values. This also disables
-        upcasting for ints with nulls to float64.
-
-    Returns
- -------
- np.ndarray
- Array of converted object values to numerical ones.
-
- Optional[np.ndarray]
- If convert_to_masked_nullable is True,
- returns a boolean mask for the converted values, otherwise returns None.
- """
- if len(values) == 0:
- return (np.array([], dtype="i8"), None)
-
- # fastpath for ints - try to convert all based on first value
- cdef:
- object val = values[0]
-
- if util.is_integer_object(val):
- try:
- maybe_ints = values.astype("i8")
- if (maybe_ints == values).all():
- return (maybe_ints, None)
- except (ValueError, OverflowError, TypeError):
- pass
-
- # Otherwise, iterate and do full inference.
- cdef:
- int maybe_int
- Py_ssize_t i, n = values.size
- Seen seen = Seen(coerce_numeric)
- ndarray[float64_t, ndim=1] floats = cnp.PyArray_EMPTY(
- 1, values.shape, cnp.NPY_FLOAT64, 0
- )
- ndarray[complex128_t, ndim=1] complexes = cnp.PyArray_EMPTY(
- 1, values.shape, cnp.NPY_COMPLEX128, 0
- )
- ndarray[int64_t, ndim=1] ints = cnp.PyArray_EMPTY(
- 1, values.shape, cnp.NPY_INT64, 0
- )
- ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(
- 1, values.shape, cnp.NPY_UINT64, 0
- )
- ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(
- 1, values.shape, cnp.NPY_UINT8, 0
- )
- ndarray[uint8_t, ndim=1] mask = np.zeros(n, dtype="u1")
- float64_t fval
- bint allow_null_in_int = convert_to_masked_nullable
-
- for i in range(n):
- val = values[i]
- # We only want to disable NaNs showing as float if
- # a) convert_to_masked_nullable = True
-        # b) no floats have been seen (assuming an int shows up later)
- # However, if no ints present (all null array), we need to return floats
- allow_null_in_int = convert_to_masked_nullable and not seen.float_
-
- if val.__hash__ is not None and val in na_values:
- if allow_null_in_int:
- seen.null_ = True
- mask[i] = 1
- else:
- if convert_to_masked_nullable:
- mask[i] = 1
- seen.saw_null()
- floats[i] = complexes[i] = NaN
- elif util.is_float_object(val):
- fval = val
- if fval != fval:
- seen.null_ = True
- if allow_null_in_int:
- mask[i] = 1
- else:
- if convert_to_masked_nullable:
- mask[i] = 1
- seen.float_ = True
- else:
- seen.float_ = True
- floats[i] = complexes[i] = fval
- elif util.is_integer_object(val):
- floats[i] = complexes[i] = val
-
- val = int(val)
- seen.saw_int(val)
-
- if val >= 0:
- if val <= oUINT64_MAX:
- uints[i] = val
- else:
- seen.float_ = True
-
- if oINT64_MIN <= val <= oINT64_MAX:
- ints[i] = val
-
- if val < oINT64_MIN or (seen.sint_ and seen.uint_):
- seen.float_ = True
-
- elif util.is_bool_object(val):
- floats[i] = uints[i] = ints[i] = bools[i] = val
- seen.bool_ = True
- elif val is None or val is C_NA:
- if allow_null_in_int:
- seen.null_ = True
- mask[i] = 1
- else:
- if convert_to_masked_nullable:
- mask[i] = 1
- seen.saw_null()
- floats[i] = complexes[i] = NaN
- elif hasattr(val, "__len__") and len(val) == 0:
- if convert_empty or seen.coerce_numeric:
- seen.saw_null()
- floats[i] = complexes[i] = NaN
- mask[i] = 1
- else:
- raise ValueError("Empty string encountered")
- elif util.is_complex_object(val):
- complexes[i] = val
- seen.complex_ = True
- elif is_decimal(val):
- floats[i] = complexes[i] = val
- seen.float_ = True
- else:
- try:
- floatify(val, &fval, &maybe_int)
-
- if fval in na_values:
- seen.saw_null()
- floats[i] = complexes[i] = NaN
- mask[i] = 1
- else:
- if fval != fval:
- seen.null_ = True
- mask[i] = 1
-
- floats[i] = fval
-
- if maybe_int:
- as_int = int(val)
-
- if as_int in na_values:
- mask[i] = 1
- seen.null_ = True
- if not allow_null_in_int:
- seen.float_ = True
- else:
- seen.saw_int(as_int)
-
- if as_int not in na_values:
- if as_int < oINT64_MIN or as_int > oUINT64_MAX:
- if seen.coerce_numeric:
- seen.float_ = True
- else:
- raise ValueError("Integer out of range.")
- else:
- if as_int >= 0:
- uints[i] = as_int
-
- if as_int <= oINT64_MAX:
- ints[i] = as_int
-
- seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
- else:
- seen.float_ = True
- except (TypeError, ValueError) as err:
- if not seen.coerce_numeric:
- raise type(err)(f"{err} at position {i}")
-
- mask[i] = 1
-
- if allow_null_in_int:
- seen.null_ = True
- else:
- seen.saw_null()
- floats[i] = NaN
-
- if seen.check_uint64_conflict():
- return (values, None)
-
-    # This occurs because we disabled float nulls showing as null in
-    # anticipation of ints that never appeared, so we return float instead
- if allow_null_in_int and seen.null_ and not seen.int_ and not seen.bool_:
- seen.float_ = True
-
- if seen.complex_:
- return (complexes, None)
- elif seen.float_:
- if seen.null_ and convert_to_masked_nullable:
- return (floats, mask.view(np.bool_))
- return (floats, None)
- elif seen.int_:
- if seen.null_ and convert_to_masked_nullable:
- if seen.uint_:
- return (uints, mask.view(np.bool_))
- else:
- return (ints, mask.view(np.bool_))
- if seen.uint_:
- return (uints, None)
- else:
- return (ints, None)
- elif seen.bool_:
- if allow_null_in_int:
- return (bools.view(np.bool_), mask.view(np.bool_))
- return (bools.view(np.bool_), None)
- elif seen.uint_:
- return (uints, None)
- return (ints, None)
-
-
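-# Illustrative example (added sketch, not part of the original source):
-# numeric-looking strings are parsed; the second element of the returned
-# tuple is the optional null mask.
-#
-#   >>> maybe_convert_numeric(np.array(["1", "2", "3.5"], dtype=object), set())
-#   (array([1. , 2. , 3.5]), None)
-
-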
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def maybe_convert_objects(ndarray[object] objects,
- *,
- bint try_float=False,
- bint safe=False,
- bint convert_numeric=True, # NB: different default!
- bint convert_datetime=False,
- bint convert_timedelta=False,
- bint convert_period=False,
- bint convert_interval=False,
- bint convert_to_nullable_dtype=False,
- object dtype_if_all_nat=None) -> "ArrayLike":
- """
-    Type inference function: convert an object array to a proper dtype.
-
- Parameters
- ----------
- objects : ndarray[object]
- Array of object elements to convert.
- try_float : bool, default False
-        If an array-like object containing only float or NaN values is
-        encountered, whether to convert and return an array of float dtype.
- safe : bool, default False
- Whether to upcast numeric type (e.g. int cast to float). If set to
- True, no upcasting will be performed.
- convert_numeric : bool, default True
- Whether to convert numeric entries.
- convert_datetime : bool, default False
-        If an array-like object containing only datetime values or NaT is
-        encountered, whether to convert and return an array of M8[ns] dtype.
- convert_timedelta : bool, default False
-        If an array-like object containing only timedelta values or NaT is
-        encountered, whether to convert and return an array of m8[ns] dtype.
- convert_period : bool, default False
- If an array-like object contains only (homogeneous-freq) Period values
- or NaT, whether to convert and return a PeriodArray.
- convert_interval : bool, default False
- If an array-like object contains only Interval objects (with matching
- dtypes and closedness) or NaN, whether to convert to IntervalArray.
- convert_to_nullable_dtype : bool, default False
-        If an array-like object containing only integer or boolean values (and NaN)
-        is encountered, whether to convert and return a BooleanArray/IntegerArray.
- dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None
- Dtype to cast to if we have all-NaT.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- Array of converted object values to more specific dtypes if applicable.
- """
- cdef:
- Py_ssize_t i, n, itemsize_max = 0
- ndarray[float64_t] floats
- ndarray[complex128_t] complexes
- ndarray[int64_t] ints
- ndarray[uint64_t] uints
- ndarray[uint8_t] bools
- Seen seen = Seen()
- object val
- _TSObject tsobj
- float64_t fnan = np.nan
-
- if dtype_if_all_nat is not None:
- # in practice we don't expect to ever pass dtype_if_all_nat
- # without both convert_datetime and convert_timedelta, so disallow
- # it to avoid needing to handle it below.
- if not convert_datetime or not convert_timedelta:
- raise ValueError(
- "Cannot specify 'dtype_if_all_nat' without convert_datetime=True "
- "and convert_timedelta=True"
- )
-
- n = len(objects)
-
- floats = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_FLOAT64, 0)
- complexes = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_COMPLEX128, 0)
- ints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_INT64, 0)
- uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0)
- bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0)
- mask = np.full(n, False)
-
- for i in range(n):
- val = objects[i]
- if itemsize_max != -1:
- itemsize = get_itemsize(val)
- if itemsize > itemsize_max or itemsize == -1:
- itemsize_max = itemsize
-
- if val is None:
- seen.null_ = True
- floats[i] = complexes[i] = fnan
- mask[i] = True
- elif val is NaT:
- seen.nat_ = True
- if not (convert_datetime or convert_timedelta or convert_period):
- seen.object_ = True
- break
- elif util.is_nan(val):
- seen.nan_ = True
- mask[i] = True
- floats[i] = complexes[i] = val
- elif util.is_bool_object(val):
- seen.bool_ = True
- bools[i] = val
- if not convert_numeric:
- break
- elif util.is_float_object(val):
- floats[i] = complexes[i] = val
- seen.float_ = True
- if not convert_numeric:
- break
- elif is_timedelta(val):
- if convert_timedelta:
- seen.timedelta_ = True
- try:
- convert_to_timedelta64(val, "ns")
- except OutOfBoundsTimedelta:
- seen.object_ = True
- break
- break
- else:
- seen.object_ = True
- break
- elif util.is_integer_object(val):
- seen.int_ = True
- floats[i] = <float64_t>val
- complexes[i] = <double complex>val
- if not seen.null_ or convert_to_nullable_dtype:
- seen.saw_int(val)
-
- if ((seen.uint_ and seen.sint_) or
- val > oUINT64_MAX or val < oINT64_MIN):
- seen.object_ = True
- break
-
- if seen.uint_:
- uints[i] = val
- elif seen.sint_:
- ints[i] = val
- else:
- uints[i] = val
- ints[i] = val
- if not convert_numeric:
- break
-
- elif util.is_complex_object(val):
- complexes[i] = val
- seen.complex_ = True
- if not convert_numeric:
- break
- elif PyDateTime_Check(val) or util.is_datetime64_object(val):
-
-            # if we have a tz attached then return the objects
- if convert_datetime:
- if getattr(val, "tzinfo", None) is not None:
- seen.datetimetz_ = True
- break
- else:
- seen.datetime_ = True
- try:
- tsobj = convert_to_tsobject(val, None, None, 0, 0)
- tsobj.ensure_reso(NPY_FR_ns)
- except OutOfBoundsDatetime:
- seen.object_ = True
- break
- else:
- seen.object_ = True
- break
- elif is_period_object(val):
- if convert_period:
- seen.period_ = True
- break
- else:
- seen.object_ = True
- break
- elif try_float and not isinstance(val, str):
- # this will convert Decimal objects
- try:
- floats[i] = float(val)
- complexes[i] = complex(val)
- seen.float_ = True
- except (ValueError, TypeError):
- seen.object_ = True
- break
- elif is_interval(val):
- if convert_interval:
- seen.interval_ = True
- break
- else:
- seen.object_ = True
- break
- else:
- seen.object_ = True
- break
-
- # we try to coerce datetime w/tz but must all have the same tz
- if seen.datetimetz_:
- if is_datetime_with_singletz_array(objects):
- from pandas import DatetimeIndex
-
- try:
- dti = DatetimeIndex(objects)
- except OutOfBoundsDatetime:
- # e.g. test_to_datetime_cache_coerce_50_lines_outofbounds
- pass
- else:
- # unbox to DatetimeArray
- return dti._data
- seen.object_ = True
-
- elif seen.datetime_:
- if is_datetime_or_datetime64_array(objects):
- from pandas import DatetimeIndex
-
- try:
- dti = DatetimeIndex(objects)
- except OutOfBoundsDatetime:
- pass
- else:
- # unbox to ndarray[datetime64[ns]]
- return dti._data._ndarray
- seen.object_ = True
-
- elif seen.timedelta_:
- if is_timedelta_or_timedelta64_array(objects):
- from pandas import TimedeltaIndex
-
- try:
- tdi = TimedeltaIndex(objects)
- except OutOfBoundsTimedelta:
- pass
- else:
- # unbox to ndarray[timedelta64[ns]]
- return tdi._data._ndarray
- seen.object_ = True
-
- if seen.period_:
- if is_period_array(objects):
- from pandas import PeriodIndex
- pi = PeriodIndex(objects)
-
- # unbox to PeriodArray
- return pi._data
- seen.object_ = True
-
- if seen.interval_:
- if is_interval_array(objects):
- from pandas import IntervalIndex
- ii = IntervalIndex(objects)
-
- # unbox to IntervalArray
- return ii._data
-
- seen.object_ = True
-
- if seen.nat_:
- if not seen.object_ and not seen.numeric_ and not seen.bool_:
- # all NaT, None, or nan (at least one NaT)
- # see GH#49340 for discussion of desired behavior
- dtype = dtype_if_all_nat
- if cnp.PyArray_DescrCheck(dtype):
- # i.e. isinstance(dtype, np.dtype)
- if dtype.kind not in ["m", "M"]:
- raise ValueError(dtype)
- else:
- res = np.empty((<object>objects).shape, dtype=dtype)
- res[:] = NPY_NAT
- return res
- elif dtype is not None:
- # EA, we don't expect to get here, but _could_ implement
- raise NotImplementedError(dtype)
- elif convert_datetime and convert_timedelta:
- # we don't guess
- seen.object_ = True
- elif convert_datetime:
- res = np.empty((<object>objects).shape, dtype="M8[ns]")
- res[:] = NPY_NAT
- return res
- elif convert_timedelta:
- res = np.empty((<object>objects).shape, dtype="m8[ns]")
- res[:] = NPY_NAT
- return res
- else:
- seen.object_ = True
- else:
- seen.object_ = True
-
- if not convert_numeric:
-        # Note: we count "bool" as numeric here. This is because
- # np.array(list_of_items) will convert bools just like it will numeric
- # entries.
- return objects
-
- if seen.bool_:
- if seen.is_bool:
- # is_bool property rules out everything else
- return bools.view(np.bool_)
- elif convert_to_nullable_dtype and seen.is_bool_or_na:
- from pandas.core.arrays import BooleanArray
- return BooleanArray(bools.view(np.bool_), mask)
- seen.object_ = True
-
- if not seen.object_:
- result = None
- if not safe:
- if seen.null_ or seen.nan_:
- if seen.complex_:
- result = complexes
- elif seen.float_:
- result = floats
- elif seen.int_ or seen.uint_:
- if convert_to_nullable_dtype:
- from pandas.core.arrays import IntegerArray
- if seen.uint_:
- result = IntegerArray(uints, mask)
- else:
- result = IntegerArray(ints, mask)
- else:
- result = floats
- elif seen.nan_:
- result = floats
- else:
- if seen.complex_:
- result = complexes
- elif seen.float_:
- result = floats
- elif seen.int_:
- if seen.uint_:
- result = uints
- else:
- result = ints
-
- else:
- # don't cast int to float, etc.
- if seen.null_:
- if seen.complex_:
- if not seen.int_:
- result = complexes
- elif seen.float_ or seen.nan_:
- if not seen.int_:
- result = floats
- else:
- if seen.complex_:
- if not seen.int_:
- result = complexes
- elif seen.float_ or seen.nan_:
- if not seen.int_:
- result = floats
- elif seen.int_:
- if seen.uint_:
- result = uints
- else:
- result = ints
-
- if result is uints or result is ints or result is floats or result is complexes:
- # cast to the largest itemsize when all values are NumPy scalars
- if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
- result = result.astype(result.dtype.kind + str(itemsize_max))
- return result
- elif result is not None:
- return result
-
- return objects
-
-
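-# Illustrative example (added sketch, not part of the original source): a
-# homogeneous object array of Python ints comes back as int64, while a mixed
-# array is returned unchanged.
-#
-#   >>> maybe_convert_objects(np.array([1, 2, 3], dtype=object))
-#   array([1, 2, 3])
-#   >>> maybe_convert_objects(np.array([1, "a"], dtype=object))
-#   array([1, 'a'], dtype=object)
-
-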
-class _NoDefault(Enum):
- # We make this an Enum
- # 1) because it round-trips through pickle correctly (see GH#40397)
- # 2) because mypy does not understand singletons
- no_default = "NO_DEFAULT"
-
- def __repr__(self) -> str:
- return "<no_default>"
-
-
-# Note: no_default is exported to the public API in pandas.api.extensions
-no_default = _NoDefault.no_default # Sentinel indicating the default value.
-NoDefault = Literal[_NoDefault.no_default]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
- object na_value=no_default, cnp.dtype dtype=np.dtype(object)
- ) -> np.ndarray:
- """
- Substitute for np.vectorize with pandas-friendly dtype inference.
-
- Parameters
- ----------
- arr : ndarray
- f : function
- mask : ndarray
- uint8 dtype ndarray indicating values not to apply `f` to.
- convert : bool, default True
- Whether to call `maybe_convert_objects` on the resulting ndarray
- na_value : Any, optional
- The result value to use for masked values. By default, the
-        input value is used.
- dtype : numpy.dtype
- The numpy dtype to use for the result ndarray.
-
- Returns
- -------
- np.ndarray
- """
- cdef:
- Py_ssize_t i, n
- ndarray result
- object val
-
- n = len(arr)
- result = np.empty(n, dtype=dtype)
- for i in range(n):
- if mask[i]:
- if na_value is no_default:
- val = arr[i]
- else:
- val = na_value
- else:
- val = f(arr[i])
-
- if cnp.PyArray_IsZeroDim(val):
- # unbox 0-dim arrays, GH#690
- val = val.item()
-
- result[i] = val
-
- if convert:
- return maybe_convert_objects(result,
- try_float=False,
- convert_datetime=False,
- convert_timedelta=False)
-
- return result
-
-
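-# Illustrative example (added sketch, not part of the original source): ``f``
-# is applied only where the mask is 0; masked positions keep the input value
-# unless ``na_value`` is given.
-#
-#   >>> arr = np.array(["a", "bbb"], dtype=object)
-#   >>> map_infer_mask(arr, len, np.array([0, 1], dtype=np.uint8), na_value=-1)
-#   array([ 1, -1])
-
-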
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def map_infer(
- ndarray arr, object f, bint convert=True, bint ignore_na=False
-) -> np.ndarray:
- """
- Substitute for np.vectorize with pandas-friendly dtype inference.
-
- Parameters
- ----------
- arr : ndarray
- f : function
-    convert : bool, default True
-        Whether to call `maybe_convert_objects` on the resulting ndarray.
-    ignore_na : bool, default False
-        If True, NA values will not have f applied.
-
- Returns
- -------
- np.ndarray
- """
- cdef:
- Py_ssize_t i, n
- ndarray[object] result
- object val
-
- n = len(arr)
- result = cnp.PyArray_EMPTY(1, arr.shape, cnp.NPY_OBJECT, 0)
- for i in range(n):
- if ignore_na and checknull(arr[i]):
- result[i] = arr[i]
- continue
- val = f(arr[i])
-
- if cnp.PyArray_IsZeroDim(val):
- # unbox 0-dim arrays, GH#690
- val = val.item()
-
- result[i] = val
-
- if convert:
- return maybe_convert_objects(result,
- try_float=False,
- convert_datetime=False,
- convert_timedelta=False)
-
- return result
-
-
-def to_object_array(rows: object, min_width: int = 0) -> ndarray:
- """
- Convert a list of lists into an object array.
-
- Parameters
- ----------
- rows : 2-d array (N, K)
- List of lists to be converted into an array.
- min_width : int
- Minimum width of the object array. If a list
-        in `rows` contains fewer than `min_width` elements,
-        the remaining entries in the corresponding row
-        will all be `None`.
-
- Returns
- -------
- np.ndarray[object, ndim=2]
- """
- cdef:
- Py_ssize_t i, j, n, k, tmp
- ndarray[object, ndim=2] result
- list row
-
- rows = list(rows)
- n = len(rows)
-
- k = min_width
- for i in range(n):
- tmp = len(rows[i])
- if tmp > k:
- k = tmp
-
- result = np.empty((n, k), dtype=object)
-
- for i in range(n):
- row = list(rows[i])
-
- for j in range(len(row)):
- result[i, j] = row[j]
-
- return result
-
-
-def tuples_to_object_array(ndarray[object] tuples):
- cdef:
- Py_ssize_t i, j, n, k
- ndarray[object, ndim=2] result
- tuple tup
-
- n = len(tuples)
- k = len(tuples[0])
- result = np.empty((n, k), dtype=object)
- for i in range(n):
- tup = tuples[i]
- for j in range(k):
- result[i, j] = tup[j]
-
- return result
-
-
-def to_object_array_tuples(rows: object) -> np.ndarray:
- """
- Convert a list of tuples into an object array. Any subclass of
-    tuple in `rows` will be cast to tuple.
-
- Parameters
- ----------
- rows : 2-d array (N, K)
- List of tuples to be converted into an array.
-
- Returns
- -------
- np.ndarray[object, ndim=2]
- """
- cdef:
- Py_ssize_t i, j, n, k, tmp
- ndarray[object, ndim=2] result
- tuple row
-
- rows = list(rows)
- n = len(rows)
-
- k = 0
- for i in range(n):
- tmp = 1 if checknull(rows[i]) else len(rows[i])
- if tmp > k:
- k = tmp
-
- result = np.empty((n, k), dtype=object)
-
- try:
- for i in range(n):
- row = rows[i]
- for j in range(len(row)):
- result[i, j] = row[j]
- except TypeError:
- # e.g. "Expected tuple, got list"
- # upcast any subclasses to tuple
- for i in range(n):
- row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
- for j in range(len(row)):
- result[i, j] = row[j]
-
- return result
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
- cdef:
- Py_ssize_t i, n = len(keys)
- object val
- ndarray[object] output = np.empty(n, dtype="O")
-
- if n == 0:
- # kludge, for Series
- return np.empty(0, dtype="f8")
-
- for i in range(n):
- val = keys[i]
- if val in mapping:
- output[i] = mapping[val]
- else:
- output[i] = default
-
- return maybe_convert_objects(output)
-
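# fast_multiget is essentially a vectorised dict.get with dtype inference on
# the way out; a hedged usage sketch against the private API:
import numpy as np
from pandas._libs import lib

lib.fast_multiget({"a": 1, "b": 2}, np.array(["a", "x"], dtype=object), default=np.nan)
# -> array([ 1., nan])   (maybe_convert_objects infers float64 here)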
-
-def is_bool_list(obj: list) -> bool:
- """
- Check if this list contains only bool or np.bool_ objects.
-
- This is appreciably faster than checking `np.array(obj).dtype == bool`
-
- obj1 = [True, False] * 100
- obj2 = obj1 * 100
- obj3 = obj2 * 100
- obj4 = [True, None] + obj1
-
- for obj in [obj1, obj2, obj3, obj4]:
- %timeit is_bool_list(obj)
- %timeit np.array(obj).dtype.kind == "b"
-
- 340 ns ± 8.22 ns
- 8.78 µs ± 253 ns
-
- 28.8 µs ± 704 ns
- 813 µs ± 17.8 µs
-
- 3.4 ms ± 168 µs
- 78.4 ms ± 1.05 ms
-
- 48.1 ns ± 1.26 ns
- 8.1 µs ± 198 ns
- """
- cdef:
- object item
-
- for item in obj:
- if not util.is_bool_object(item):
- return False
-
- # Note: we return True for empty list
- return True
-
-
-cpdef ndarray eq_NA_compat(ndarray[object] arr, object key):
- """
- Check for `arr == key`, treating all values as not-equal to pd.NA.
-
-    `key` is assumed to satisfy `not isna(key)`.
- """
- cdef:
- ndarray[uint8_t, cast=True] result = cnp.PyArray_EMPTY(
- arr.ndim, arr.shape, cnp.NPY_BOOL, 0
- )
- Py_ssize_t i
- object item
-
- for i in range(len(arr)):
- item = arr[i]
- if item is C_NA:
- result[i] = False
- else:
- result[i] = item == key
-
- return result
-
-
-def dtypes_all_equal(list types not None) -> bool:
- """
- Faster version for:
-
- first = types[0]
- all(is_dtype_equal(first, t) for t in types[1:])
-
- And assuming all elements in the list are np.dtype/ExtensionDtype objects
-
- See timings at https://github.com/pandas-dev/pandas/pull/44594
- """
- first = types[0]
- for t in types[1:]:
- try:
- if not t == first:
- return False
- except (TypeError, AttributeError):
- return False
- else:
- return True
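# The fast path above is equivalent to this plain check, minus the per-element
# is_dtype_equal machinery (NumPy-only illustration):
import numpy as np

types = [np.dtype("int64"), np.dtype("int64"), np.dtype("float64")]
first = types[0]
all(t == first for t in types[1:])   # False, as dtypes_all_equal would return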
diff --git a/contrib/python/pandas/py3/pandas/_libs/missing.pxd b/contrib/python/pandas/py3/pandas/_libs/missing.pxd
deleted file mode 100644
index 59206495194..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/missing.pxd
+++ /dev/null
@@ -1,20 +0,0 @@
-from numpy cimport (
- ndarray,
- uint8_t,
-)
-
-
-cpdef bint is_matching_na(object left, object right, bint nan_matches_none=*)
-cpdef bint check_na_tuples_nonequal(object left, object right)
-
-cpdef bint checknull(object val, bint inf_as_na=*)
-cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=*)
-
-cdef bint is_null_datetime64(v)
-cdef bint is_null_timedelta64(v)
-cdef bint checknull_with_nat_and_na(object obj)
-
-cdef class C_NAType:
- pass
-
-cdef C_NAType C_NA
diff --git a/contrib/python/pandas/py3/pandas/_libs/missing.pyi b/contrib/python/pandas/py3/pandas/_libs/missing.pyi
deleted file mode 100644
index d5c9f1342a0..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/missing.pyi
+++ /dev/null
@@ -1,17 +0,0 @@
-import numpy as np
-from numpy import typing as npt
-
-class NAType:
- def __new__(cls, *args, **kwargs): ...
-
-NA: NAType
-
-def is_matching_na(
- left: object, right: object, nan_matches_none: bool = ...
-) -> bool: ...
-def isposinf_scalar(val: object) -> bool: ...
-def isneginf_scalar(val: object) -> bool: ...
-def checknull(val: object, inf_as_na: bool = ...) -> bool: ...
-def isnaobj(arr: np.ndarray, inf_as_na: bool = ...) -> npt.NDArray[np.bool_]: ...
-def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
-def is_float_nan(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/missing.pyx b/contrib/python/pandas/py3/pandas/_libs/missing.pyx
deleted file mode 100644
index b6794de94c0..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/missing.pyx
+++ /dev/null
@@ -1,513 +0,0 @@
-from decimal import Decimal
-import numbers
-from sys import maxsize
-
-cimport cython
-from cpython.datetime cimport (
- date,
- time,
- timedelta,
-)
-from cython cimport Py_ssize_t
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- flatiter,
- float64_t,
- int64_t,
- ndarray,
- uint8_t,
-)
-
-cnp.import_array()
-
-from pandas._libs cimport util
-from pandas._libs.tslibs.nattype cimport (
- c_NaT as NaT,
- checknull_with_nat,
- is_dt64nat,
- is_td64nat,
-)
-from pandas._libs.tslibs.np_datetime cimport (
- get_datetime64_unit,
- get_datetime64_value,
- get_timedelta64_value,
-)
-
-from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
-
-cdef:
- float64_t INF = <float64_t>np.inf
- float64_t NEGINF = -INF
-
- int64_t NPY_NAT = util.get_nat()
-
- bint is_32bit = maxsize <= 2 ** 32
-
- type cDecimal = Decimal # for faster isinstance checks
-
-
-cpdef bint check_na_tuples_nonequal(object left, object right):
- """
-    When we have NA in one of the tuples but not the other, we have to check here,
-    because our regular checks would fail earlier with an ambiguous boolean value.
-
- Parameters
- ----------
- left: Any
- right: Any
-
- Returns
- -------
- True if we are dealing with tuples that have NA on one side and non NA on
- the other side.
-
- """
- if not isinstance(left, tuple) or not isinstance(right, tuple):
- return False
-
- if len(left) != len(right):
- return False
-
- for left_element, right_element in zip(left, right):
- if left_element is C_NA and right_element is not C_NA:
- return True
- elif right_element is C_NA and left_element is not C_NA:
- return True
-
- return False
-
-
-cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False):
- """
- Check if two scalars are both NA of matching types.
-
- Parameters
- ----------
- left : Any
- right : Any
- nan_matches_none : bool, default False
- For backwards compatibility, consider NaN as matching None.
-
- Returns
- -------
- bool
- """
- if left is None:
- if nan_matches_none and util.is_nan(right):
- return True
- return right is None
- elif left is C_NA:
- return right is C_NA
- elif left is NaT:
- return right is NaT
- elif util.is_float_object(left):
- if nan_matches_none and right is None and util.is_nan(left):
- return True
- return (
- util.is_nan(left)
- and util.is_float_object(right)
- and util.is_nan(right)
- )
- elif util.is_complex_object(left):
- return (
- util.is_nan(left)
- and util.is_complex_object(right)
- and util.is_nan(right)
- )
- elif util.is_datetime64_object(left):
- return (
- get_datetime64_value(left) == NPY_NAT
- and util.is_datetime64_object(right)
- and get_datetime64_value(right) == NPY_NAT
- and get_datetime64_unit(left) == get_datetime64_unit(right)
- )
- elif util.is_timedelta64_object(left):
- return (
- get_timedelta64_value(left) == NPY_NAT
- and util.is_timedelta64_object(right)
- and get_timedelta64_value(right) == NPY_NAT
- and get_datetime64_unit(left) == get_datetime64_unit(right)
- )
- elif is_decimal_na(left):
- return is_decimal_na(right)
- return False
-
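# Hedged examples of the matching rules implemented above (is_matching_na is
# cpdef, so it is importable from the compiled extension):
import numpy as np
from pandas._libs.missing import is_matching_na

is_matching_na(np.nan, np.nan)                               # True  -- both float NaN
is_matching_na(np.nan, None)                                 # False
is_matching_na(np.nan, None, nan_matches_none=True)          # True
is_matching_na(np.datetime64("NaT"), np.timedelta64("NaT"))  # False -- different NA kinds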
-
-cpdef bint checknull(object val, bint inf_as_na=False):
- """
-    Return a boolean describing whether the input is NA-like, defined here as
-    any of:
- - None
- - nan
- - NaT
- - np.datetime64 representation of NaT
- - np.timedelta64 representation of NaT
- - NA
- - Decimal("NaN")
-
- Parameters
- ----------
- val : object
- inf_as_na : bool, default False
- Whether to treat INF and -INF as NA values.
-
- Returns
- -------
- bool
- """
- if val is None or val is NaT or val is C_NA:
- return True
- elif util.is_float_object(val) or util.is_complex_object(val):
- if val != val:
- return True
- elif inf_as_na:
- return val == INF or val == NEGINF
- return False
- elif util.is_timedelta64_object(val):
- return get_timedelta64_value(val) == NPY_NAT
- elif util.is_datetime64_object(val):
- return get_datetime64_value(val) == NPY_NAT
- else:
- return is_decimal_na(val)
-
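# The scalars checknull treats as NA-like, exercised from the Python side
# (assumes the compiled extension is importable):
import numpy as np
from decimal import Decimal
from pandas._libs.missing import checknull

[checknull(v) for v in (None, np.nan, np.datetime64("NaT"), Decimal("NaN"), 0.0)]
# -> [True, True, True, True, False]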
-
-cdef bint is_decimal_na(object val):
- """
-    Check whether val is a decimal.Decimal NaN, i.e. Decimal("NaN").
- """
- return isinstance(val, cDecimal) and val != val
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False):
- """
- Return boolean mask denoting which elements of a 1-D array are na-like,
- according to the criteria defined in `checknull`:
- - None
- - nan
- - NaT
- - np.datetime64 representation of NaT
- - np.timedelta64 representation of NaT
- - NA
- - Decimal("NaN")
-
- Parameters
- ----------
- arr : ndarray
-
- Returns
- -------
- result : ndarray (dtype=np.bool_)
- """
- cdef:
- Py_ssize_t i, n = arr.size
- object val
- bint is_null
- ndarray result = np.empty((<object>arr).shape, dtype=np.uint8)
- flatiter it = cnp.PyArray_IterNew(arr)
- flatiter it2 = cnp.PyArray_IterNew(result)
-
- for i in range(n):
- # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
- # equivalents to `val = values[i]`
- val = cnp.PyArray_GETITEM(arr, cnp.PyArray_ITER_DATA(it))
- cnp.PyArray_ITER_NEXT(it)
- is_null = checknull(val, inf_as_na=inf_as_na)
- # Dereference pointer (set value)
- (<uint8_t *>(cnp.PyArray_ITER_DATA(it2)))[0] = <uint8_t>is_null
- cnp.PyArray_ITER_NEXT(it2)
- return result.view(np.bool_)
-
-
-def isposinf_scalar(val: object) -> bool:
- return util.is_float_object(val) and val == INF
-
-
-def isneginf_scalar(val: object) -> bool:
- return util.is_float_object(val) and val == NEGINF
-
-
-cdef bint is_null_datetime64(v):
- # determine if we have a null for a datetime (or integer versions),
- # excluding np.timedelta64('nat')
- if checknull_with_nat(v) or is_dt64nat(v):
- return True
- return False
-
-
-cdef bint is_null_timedelta64(v):
- # determine if we have a null for a timedelta (or integer versions),
- # excluding np.datetime64('nat')
- if checknull_with_nat(v) or is_td64nat(v):
- return True
- return False
-
-
-cdef bint checknull_with_nat_and_na(object obj):
- # See GH#32214
- return checknull_with_nat(obj) or obj is C_NA
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_float_nan(values: ndarray) -> ndarray:
- """
- True for elements which correspond to a float nan
-
- Returns
- -------
- ndarray[bool]
- """
- cdef:
- ndarray[uint8_t] result
- Py_ssize_t i, N
- object val
-
- N = len(values)
- result = np.zeros(N, dtype=np.uint8)
-
- for i in range(N):
- val = values[i]
- if util.is_nan(val):
- result[i] = True
- return result.view(bool)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_numeric_na(values: ndarray) -> ndarray:
- """
- Check for NA values consistent with IntegerArray/FloatingArray.
-
- Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.
-
- Returns
- -------
- ndarray[bool]
- """
- cdef:
- ndarray[uint8_t] result
- Py_ssize_t i, N
- object val
-
- N = len(values)
- result = np.zeros(N, dtype=np.uint8)
-
- for i in range(N):
- val = values[i]
- if checknull(val):
- if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val):
- result[i] = True
- else:
- raise TypeError(f"'values' contains non-numeric NA {val}")
- return result.view(bool)
-
-
-# -----------------------------------------------------------------------------
-# Implementation of NA singleton
-
-
-def _create_binary_propagating_op(name, is_divmod=False):
- is_cmp = name.strip("_") in ["eq", "ne", "le", "lt", "ge", "gt"]
-
- def method(self, other):
- if (other is C_NA or isinstance(other, (str, bytes))
- or isinstance(other, (numbers.Number, np.bool_))
- or util.is_array(other) and not other.shape):
- # Need the other.shape clause to handle NumPy scalars,
- # since we do a setitem on `out` below, which
- # won't work for NumPy scalars.
- if is_divmod:
- return NA, NA
- else:
- return NA
-
- elif util.is_array(other):
- out = np.empty(other.shape, dtype=object)
- out[:] = NA
-
- if is_divmod:
- return out, out.copy()
- else:
- return out
-
- elif is_cmp and isinstance(other, (date, time, timedelta)):
- return NA
-
- return NotImplemented
-
- method.__name__ = name
- return method
-
-
-def _create_unary_propagating_op(name: str):
- def method(self):
- return NA
-
- method.__name__ = name
- return method
-
-
-cdef class C_NAType:
- pass
-
-
-class NAType(C_NAType):
- """
- NA ("not available") missing value indicator.
-
- .. warning::
-
- Experimental: the behaviour of NA can still change without warning.
-
- The NA singleton is a missing value indicator defined by pandas. It is
- used in certain new extension dtypes (currently the "string" dtype).
- """
-
- _instance = None
-
- def __new__(cls, *args, **kwargs):
- if NAType._instance is None:
- NAType._instance = C_NAType.__new__(cls, *args, **kwargs)
- return NAType._instance
-
- def __repr__(self) -> str:
- return "<NA>"
-
- def __format__(self, format_spec) -> str:
- try:
- return self.__repr__().__format__(format_spec)
- except ValueError:
- return self.__repr__()
-
- def __bool__(self):
- raise TypeError("boolean value of NA is ambiguous")
-
- def __hash__(self):
- # GH 30013: Ensure hash is large enough to avoid hash collisions with integers
- exponent = 31 if is_32bit else 61
- return 2 ** exponent - 1
-
- def __reduce__(self):
- return "NA"
-
- # Binary arithmetic and comparison ops -> propagate
-
- __add__ = _create_binary_propagating_op("__add__")
- __radd__ = _create_binary_propagating_op("__radd__")
- __sub__ = _create_binary_propagating_op("__sub__")
- __rsub__ = _create_binary_propagating_op("__rsub__")
- __mul__ = _create_binary_propagating_op("__mul__")
- __rmul__ = _create_binary_propagating_op("__rmul__")
- __matmul__ = _create_binary_propagating_op("__matmul__")
- __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
- __truediv__ = _create_binary_propagating_op("__truediv__")
- __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
- __floordiv__ = _create_binary_propagating_op("__floordiv__")
- __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
- __mod__ = _create_binary_propagating_op("__mod__")
- __rmod__ = _create_binary_propagating_op("__rmod__")
- __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True)
- __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True)
- # __lshift__ and __rshift__ are not implemented
-
- __eq__ = _create_binary_propagating_op("__eq__")
- __ne__ = _create_binary_propagating_op("__ne__")
- __le__ = _create_binary_propagating_op("__le__")
- __lt__ = _create_binary_propagating_op("__lt__")
- __gt__ = _create_binary_propagating_op("__gt__")
- __ge__ = _create_binary_propagating_op("__ge__")
-
- # Unary ops
-
- __neg__ = _create_unary_propagating_op("__neg__")
- __pos__ = _create_unary_propagating_op("__pos__")
- __abs__ = _create_unary_propagating_op("__abs__")
- __invert__ = _create_unary_propagating_op("__invert__")
-
-    # pow is special-cased: NA ** 0 == 1 and 1 ** NA == 1 (see __rpow__ below)
- def __pow__(self, other):
- if other is C_NA:
- return NA
- elif isinstance(other, (numbers.Number, np.bool_)):
- if other == 0:
- # returning positive is correct for +/- 0.
- return type(other)(1)
- else:
- return NA
- elif util.is_array(other):
- return np.where(other == 0, other.dtype.type(1), NA)
-
- return NotImplemented
-
- def __rpow__(self, other):
- if other is C_NA:
- return NA
- elif isinstance(other, (numbers.Number, np.bool_)):
- if other == 1:
- return other
- else:
- return NA
- elif util.is_array(other):
- return np.where(other == 1, other, NA)
- return NotImplemented
-
- # Logical ops using Kleene logic
-
- def __and__(self, other):
- if other is False:
- return False
- elif other is True or other is C_NA:
- return NA
- return NotImplemented
-
- __rand__ = __and__
-
- def __or__(self, other):
- if other is True:
- return True
- elif other is False or other is C_NA:
- return NA
- return NotImplemented
-
- __ror__ = __or__
-
- def __xor__(self, other):
- if other is False or other is True or other is C_NA:
- return NA
- return NotImplemented
-
- __rxor__ = __xor__
-
- __array_priority__ = 1000
- _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_)
-
- def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
- types = self._HANDLED_TYPES + (NAType,)
- for x in inputs:
- if not isinstance(x, types):
- return NotImplemented
-
- if method != "__call__":
- raise ValueError(f"ufunc method '{method}' not supported for NA")
- result = maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is NotImplemented:
- # For a NumPy ufunc that's not a binop, like np.logaddexp
- index = [i for i, x in enumerate(inputs) if x is NA][0]
- result = np.broadcast_arrays(*inputs)[index]
- if result.ndim == 0:
- result = result.item()
- if ufunc.nout > 1:
- result = (NA,) * ufunc.nout
-
- return result
-
-
-C_NA = NAType() # C-visible
-NA = C_NA # Python-visible
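# Behaviour sketch for the NA singleton defined above, matching pandas'
# documented semantics (pd.NA is the public alias for this object):
import pandas as pd

pd.NA + 1      # <NA>   -- arithmetic propagates
pd.NA ** 0     # 1      -- special-cased in __pow__
1 ** pd.NA     # 1      -- special-cased in __rpow__
pd.NA | True   # True   -- Kleene logic
pd.NA & False  # False
pd.NA & True   # <NA>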
diff --git a/contrib/python/pandas/py3/pandas/_libs/ops.pyi b/contrib/python/pandas/py3/pandas/_libs/ops.pyi
deleted file mode 100644
index 74a6ad87cd2..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/ops.pyi
+++ /dev/null
@@ -1,50 +0,0 @@
-from typing import (
- Any,
- Callable,
- Iterable,
- Literal,
- overload,
-)
-
-import numpy as np
-
-from pandas._typing import npt
-
-_BinOp = Callable[[Any, Any], Any]
-_BoolOp = Callable[[Any, Any], bool]
-
-def scalar_compare(
- values: np.ndarray, # object[:]
- val: object,
- op: _BoolOp, # {operator.eq, operator.ne, ...}
-) -> npt.NDArray[np.bool_]: ...
-def vec_compare(
- left: npt.NDArray[np.object_],
- right: npt.NDArray[np.object_],
- op: _BoolOp, # {operator.eq, operator.ne, ...}
-) -> npt.NDArray[np.bool_]: ...
-def scalar_binop(
- values: np.ndarray, # object[:]
- val: object,
- op: _BinOp, # binary operator
-) -> np.ndarray: ...
-def vec_binop(
- left: np.ndarray, # object[:]
- right: np.ndarray, # object[:]
- op: _BinOp, # binary operator
-) -> np.ndarray: ...
-@overload
-def maybe_convert_bool(
- arr: npt.NDArray[np.object_],
- true_values: Iterable = ...,
- false_values: Iterable = ...,
- convert_to_masked_nullable: Literal[False] = ...,
-) -> tuple[np.ndarray, None]: ...
-@overload
-def maybe_convert_bool(
- arr: npt.NDArray[np.object_],
- true_values: Iterable = ...,
- false_values: Iterable = ...,
- *,
- convert_to_masked_nullable: Literal[True],
-) -> tuple[np.ndarray, np.ndarray]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/ops.pyx b/contrib/python/pandas/py3/pandas/_libs/ops.pyx
deleted file mode 100644
index 9154e836b34..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/ops.pyx
+++ /dev/null
@@ -1,310 +0,0 @@
-import operator
-
-cimport cython
-from cpython.object cimport (
- Py_EQ,
- Py_GE,
- Py_GT,
- Py_LE,
- Py_LT,
- Py_NE,
- PyObject_RichCompareBool,
-)
-from cython cimport Py_ssize_t
-
-import numpy as np
-
-from numpy cimport (
- import_array,
- ndarray,
- uint8_t,
-)
-
-import_array()
-
-
-from pandas._libs.missing cimport checknull
-from pandas._libs.util cimport is_nan
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def scalar_compare(object[:] values, object val, object op) -> ndarray:
- """
- Compare each element of `values` array with the scalar `val`, with
- the comparison operation described by `op`.
-
- Parameters
- ----------
- values : ndarray[object]
- val : object
- op : {operator.eq, operator.ne,
- operator.le, operator.lt,
- operator.ge, operator.gt}
-
- Returns
- -------
- result : ndarray[bool]
- """
- cdef:
- Py_ssize_t i, n = len(values)
- ndarray[uint8_t, cast=True] result
- bint isnull_val
- int flag
- object x
-
- if op is operator.lt:
- flag = Py_LT
- elif op is operator.le:
- flag = Py_LE
- elif op is operator.gt:
- flag = Py_GT
- elif op is operator.ge:
- flag = Py_GE
- elif op is operator.eq:
- flag = Py_EQ
- elif op is operator.ne:
- flag = Py_NE
- else:
- raise ValueError("Unrecognized operator")
-
- result = np.empty(n, dtype=bool).view(np.uint8)
- isnull_val = checknull(val)
-
- if flag == Py_NE:
- for i in range(n):
- x = values[i]
- if checknull(x):
- result[i] = True
- elif isnull_val:
- result[i] = True
- else:
- try:
- result[i] = PyObject_RichCompareBool(x, val, flag)
- except TypeError:
- result[i] = True
- elif flag == Py_EQ:
- for i in range(n):
- x = values[i]
- if checknull(x):
- result[i] = False
- elif isnull_val:
- result[i] = False
- else:
- try:
- result[i] = PyObject_RichCompareBool(x, val, flag)
- except TypeError:
- result[i] = False
-
- else:
- for i in range(n):
- x = values[i]
- if checknull(x):
- result[i] = False
- elif isnull_val:
- result[i] = False
- else:
- result[i] = PyObject_RichCompareBool(x, val, flag)
-
- return result.view(bool)
-
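# Hedged usage sketch for the object-dtype comparison helper above; null
# entries yield False under eq and True under ne:
import operator
import numpy as np
from pandas._libs import ops

values = np.array(["a", None, "b"], dtype=object)
ops.scalar_compare(values, "a", operator.eq)   # array([ True, False, False])
ops.scalar_compare(values, "a", operator.ne)   # array([False,  True,  True])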
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def vec_compare(ndarray[object] left, ndarray[object] right, object op) -> ndarray:
- """
- Compare the elements of `left` with the elements of `right` pointwise,
- with the comparison operation described by `op`.
-
- Parameters
- ----------
- left : ndarray[object]
- right : ndarray[object]
- op : {operator.eq, operator.ne,
- operator.le, operator.lt,
- operator.ge, operator.gt}
-
- Returns
- -------
- result : ndarray[bool]
- """
- cdef:
- Py_ssize_t i, n = len(left)
- ndarray[uint8_t, cast=True] result
- int flag
-
- if n != <Py_ssize_t>len(right):
- raise ValueError(f"Arrays were different lengths: {n} vs {len(right)}")
-
- if op is operator.lt:
- flag = Py_LT
- elif op is operator.le:
- flag = Py_LE
- elif op is operator.gt:
- flag = Py_GT
- elif op is operator.ge:
- flag = Py_GE
- elif op is operator.eq:
- flag = Py_EQ
- elif op is operator.ne:
- flag = Py_NE
- else:
- raise ValueError("Unrecognized operator")
-
- result = np.empty(n, dtype=bool).view(np.uint8)
-
- if flag == Py_NE:
- for i in range(n):
- x = left[i]
- y = right[i]
-
- if checknull(x) or checknull(y):
- result[i] = True
- else:
- result[i] = PyObject_RichCompareBool(x, y, flag)
- else:
- for i in range(n):
- x = left[i]
- y = right[i]
-
- if checknull(x) or checknull(y):
- result[i] = False
- else:
- result[i] = PyObject_RichCompareBool(x, y, flag)
-
- return result.view(bool)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def scalar_binop(object[:] values, object val, object op) -> ndarray:
- """
- Apply the given binary operator `op` between each element of the array
- `values` and the scalar `val`.
-
- Parameters
- ----------
- values : ndarray[object]
- val : object
- op : binary operator
-
- Returns
- -------
- result : ndarray[object]
- """
- cdef:
- Py_ssize_t i, n = len(values)
- object[::1] result
- object x
-
- result = np.empty(n, dtype=object)
- if val is None or is_nan(val):
- result[:] = val
- return result.base # `.base` to access underlying np.ndarray
-
- for i in range(n):
- x = values[i]
- if x is None or is_nan(x):
- result[i] = x
- else:
- result[i] = op(x, val)
-
- return maybe_convert_bool(result.base)[0]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
- """
- Apply the given binary operator `op` pointwise to the elements of
- arrays `left` and `right`.
-
- Parameters
- ----------
- left : ndarray[object]
- right : ndarray[object]
- op : binary operator
-
- Returns
- -------
- result : ndarray[object]
- """
- cdef:
- Py_ssize_t i, n = len(left)
- object[::1] result
-
- if n != <Py_ssize_t>len(right):
- raise ValueError(f"Arrays were different lengths: {n} vs {len(right)}")
-
- result = np.empty(n, dtype=object)
-
- for i in range(n):
- x = left[i]
- y = right[i]
- try:
- result[i] = op(x, y)
- except TypeError:
- if x is None or is_nan(x):
- result[i] = x
- elif y is None or is_nan(y):
- result[i] = y
- else:
- raise
-
- return maybe_convert_bool(result.base)[0] # `.base` to access np.ndarray
-
-
-def maybe_convert_bool(ndarray[object] arr,
- true_values=None,
- false_values=None,
- convert_to_masked_nullable=False
- ) -> tuple[np.ndarray, np.ndarray | None]:
- cdef:
- Py_ssize_t i, n
- ndarray[uint8_t] result
- ndarray[uint8_t] mask
- object val
- set true_vals, false_vals
- bint has_na = False
-
- n = len(arr)
- result = np.empty(n, dtype=np.uint8)
- mask = np.zeros(n, dtype=np.uint8)
- # the defaults
- true_vals = {"True", "TRUE", "true"}
- false_vals = {"False", "FALSE", "false"}
-
- if true_values is not None:
- true_vals = true_vals | set(true_values)
-
- if false_values is not None:
- false_vals = false_vals | set(false_values)
-
- for i in range(n):
- val = arr[i]
-
- if isinstance(val, bool):
- if val is True:
- result[i] = 1
- else:
- result[i] = 0
- elif val in true_vals:
- result[i] = 1
- elif val in false_vals:
- result[i] = 0
- elif is_nan(val) or val is None:
- mask[i] = 1
- result[i] = 0 # Value here doesn't matter, will be replaced w/ nan
- has_na = True
- else:
- return (arr, None)
-
- if has_na:
- if convert_to_masked_nullable:
- return (result.view(np.bool_), mask.view(np.bool_))
- else:
- arr = result.view(np.bool_).astype(object)
- np.putmask(arr, mask, np.nan)
- return (arr, None)
- else:
- return (result.view(np.bool_), None)
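# Illustrative behaviour of maybe_convert_bool (hedged; assumes the compiled
# pandas extension is importable):
import numpy as np
from pandas._libs import ops

arr = np.array(["True", "false", None], dtype=object)
ops.maybe_convert_bool(arr)
# -> (array([True, False, nan], dtype=object), None)   the NA slot becomes np.nan
ops.maybe_convert_bool(arr, convert_to_masked_nullable=True)
# -> (array([ True, False, False]), array([False, False,  True]))   values + mask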
diff --git a/contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyi b/contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyi
deleted file mode 100644
index 91b5a4dbaae..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyi
+++ /dev/null
@@ -1,5 +0,0 @@
-import numpy as np
-
-def maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc: np.ufunc, method: str, *inputs, **kwargs
-): ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyx b/contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyx
deleted file mode 100644
index 2b2a411e663..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/ops_dispatch.pyx
+++ /dev/null
@@ -1,121 +0,0 @@
-DISPATCHED_UFUNCS = {
- "add",
- "sub",
- "mul",
- "pow",
- "mod",
- "floordiv",
- "truediv",
- "divmod",
- "eq",
- "ne",
- "lt",
- "gt",
- "le",
- "ge",
- "remainder",
- "matmul",
- "or",
- "xor",
- "and",
- "neg",
- "pos",
- "abs",
-}
-UNARY_UFUNCS = {
- "neg",
- "pos",
- "abs",
-}
-UFUNC_ALIASES = {
- "subtract": "sub",
- "multiply": "mul",
- "floor_divide": "floordiv",
- "true_divide": "truediv",
- "power": "pow",
- "remainder": "mod",
- "divide": "truediv",
- "equal": "eq",
- "not_equal": "ne",
- "less": "lt",
- "less_equal": "le",
- "greater": "gt",
- "greater_equal": "ge",
- "bitwise_or": "or",
- "bitwise_and": "and",
- "bitwise_xor": "xor",
- "negative": "neg",
- "absolute": "abs",
- "positive": "pos",
-}
-
-# For op(., Array) -> Array.__r{op}__
-REVERSED_NAMES = {
- "lt": "__gt__",
- "le": "__ge__",
- "gt": "__lt__",
- "ge": "__le__",
- "eq": "__eq__",
- "ne": "__ne__",
-}
-
-
-def maybe_dispatch_ufunc_to_dunder_op(
- object self, object ufunc, str method, *inputs, **kwargs
-):
- """
- Dispatch a ufunc to the equivalent dunder method.
-
- Parameters
- ----------
- self : ArrayLike
- The array whose dunder method we dispatch to
- ufunc : Callable
- A NumPy ufunc
- method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'}
- inputs : ArrayLike
- The input arrays.
- kwargs : Any
- The additional keyword arguments, e.g. ``out``.
-
- Returns
- -------
- result : Any
- The result of applying the ufunc
- """
- # special has the ufuncs we dispatch to the dunder op on
-
- op_name = ufunc.__name__
- op_name = UFUNC_ALIASES.get(op_name, op_name)
-
- def not_implemented(*args, **kwargs):
- return NotImplemented
-
- if kwargs or ufunc.nin > 2:
- return NotImplemented
-
- if method == "__call__" and op_name in DISPATCHED_UFUNCS:
-
- if inputs[0] is self:
- name = f"__{op_name}__"
- meth = getattr(self, name, not_implemented)
-
- if op_name in UNARY_UFUNCS:
- assert len(inputs) == 1
- return meth()
-
- return meth(inputs[1])
-
- elif inputs[1] is self:
- name = REVERSED_NAMES.get(op_name, f"__r{op_name}__")
-
- meth = getattr(self, name, not_implemented)
- result = meth(inputs[0])
- return result
-
- else:
- # should not be reached, but covering our bases
- return NotImplemented
-
- else:
- return NotImplemented
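# Minimal sketch of how the dispatcher above is meant to be used from an
# array-like's __array_ufunc__; "Wrapped" is a hypothetical example class.
import numpy as np
from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op

class Wrapped:
    def __init__(self, value):
        self.value = value

    def __add__(self, other):
        return Wrapped(self.value + other)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        return maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )

np.add(Wrapped(1), 2).value   # 3 -- np.add was routed to Wrapped.__add__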
diff --git a/contrib/python/pandas/py3/pandas/_libs/parsers.pyi b/contrib/python/pandas/py3/pandas/_libs/parsers.pyi
deleted file mode 100644
index ec5244469cf..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/parsers.pyi
+++ /dev/null
@@ -1,75 +0,0 @@
-from typing import (
- Hashable,
- Literal,
-)
-
-import numpy as np
-
-from pandas._typing import (
- ArrayLike,
- Dtype,
- npt,
-)
-
-STR_NA_VALUES: set[str]
-
-def sanitize_objects(
- values: npt.NDArray[np.object_],
- na_values: set,
-) -> int: ...
-
-class TextReader:
- unnamed_cols: set[str]
- table_width: int # int64_t
- leading_cols: int # int64_t
- header: list[list[int]] # non-negative integers
- def __init__(
- self,
- source,
- delimiter: bytes | str = ..., # single-character only
- header=...,
- header_start: int = ..., # int64_t
- header_end: int = ..., # uint64_t
- index_col=...,
- names=...,
- tokenize_chunksize: int = ..., # int64_t
- delim_whitespace: bool = ...,
- converters=...,
- skipinitialspace: bool = ...,
- escapechar: bytes | str | None = ..., # single-character only
- doublequote: bool = ...,
- quotechar: str | bytes | None = ..., # at most 1 character
- quoting: int = ...,
- lineterminator: bytes | str | None = ..., # at most 1 character
- comment=...,
- decimal: bytes | str = ..., # single-character only
- thousands: bytes | str | None = ..., # single-character only
- dtype: Dtype | dict[Hashable, Dtype] = ...,
- usecols=...,
- error_bad_lines: bool = ...,
- warn_bad_lines: bool = ...,
- na_filter: bool = ...,
- na_values=...,
- na_fvalues=...,
- keep_default_na: bool = ...,
- true_values=...,
- false_values=...,
- allow_leading_cols: bool = ...,
- skiprows=...,
- skipfooter: int = ..., # int64_t
- verbose: bool = ...,
- float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
- skip_blank_lines: bool = ...,
- encoding_errors: bytes | str = ...,
- ) -> None: ...
- def set_noconvert(self, i: int) -> None: ...
- def remove_noconvert(self, i: int) -> None: ...
- def close(self) -> None: ...
- def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
- def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ...
-
-# _maybe_upcast, na_values are only exposed for testing
-
-def _maybe_upcast(
- arr, use_dtype_backend: bool = ..., dtype_backend: str = ...
-) -> np.ndarray: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/parsers.pyx b/contrib/python/pandas/py3/pandas/_libs/parsers.pyx
deleted file mode 100644
index 0bd0597f32a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/parsers.pyx
+++ /dev/null
@@ -1,2127 +0,0 @@
-# Copyright (c) 2012, Lambda Foundry, Inc.
-# See LICENSE for the license
-from collections import defaultdict
-from csv import (
- QUOTE_MINIMAL,
- QUOTE_NONE,
- QUOTE_NONNUMERIC,
-)
-import sys
-import time
-import warnings
-
-from pandas.errors import ParserError
-from pandas.util._exceptions import find_stack_level
-
-from pandas import StringDtype
-from pandas.core.arrays import (
- ArrowExtensionArray,
- BooleanArray,
- FloatingArray,
- IntegerArray,
-)
-
-cimport cython
-from cpython.bytes cimport PyBytes_AsString
-from cpython.exc cimport (
- PyErr_Fetch,
- PyErr_Occurred,
-)
-from cpython.object cimport PyObject
-from cpython.ref cimport (
- Py_INCREF,
- Py_XDECREF,
-)
-from cpython.unicode cimport (
- PyUnicode_AsUTF8String,
- PyUnicode_Decode,
- PyUnicode_DecodeUTF8,
-)
-from cython cimport Py_ssize_t
-from libc.stdlib cimport free
-from libc.string cimport (
- strcasecmp,
- strlen,
- strncpy,
-)
-
-
-cdef extern from "Python.h":
- # TODO(cython3): get this from cpython.unicode
- object PyUnicode_FromString(char *v)
-
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- float64_t,
- int64_t,
- ndarray,
- uint8_t,
- uint64_t,
-)
-
-cnp.import_array()
-
-from pandas._libs cimport util
-from pandas._libs.util cimport (
- INT64_MAX,
- INT64_MIN,
- UINT64_MAX,
-)
-
-from pandas._libs import lib
-
-from pandas._libs.khash cimport (
- kh_destroy_float64,
- kh_destroy_str,
- kh_destroy_str_starts,
- kh_destroy_strbox,
- kh_exist_str,
- kh_float64_t,
- kh_get_float64,
- kh_get_str,
- kh_get_str_starts_item,
- kh_get_strbox,
- kh_init_float64,
- kh_init_str,
- kh_init_str_starts,
- kh_init_strbox,
- kh_put_float64,
- kh_put_str,
- kh_put_str_starts_item,
- kh_put_strbox,
- kh_resize_float64,
- kh_resize_str_starts,
- kh_str_starts_t,
- kh_str_t,
- kh_strbox_t,
- khiter_t,
-)
-
-from pandas.errors import (
- EmptyDataError,
- ParserError,
- ParserWarning,
-)
-
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_datetime64_dtype,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_object_dtype,
-)
-from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas.core.dtypes.inference import is_dict_like
-
-from pandas.core.arrays.boolean import BooleanDtype
-
-cdef:
- float64_t INF = <float64_t>np.inf
- float64_t NEGINF = -INF
- int64_t DEFAULT_CHUNKSIZE = 256 * 1024
-
-
-cdef extern from "headers/portable.h":
- # I *think* this is here so that strcasecmp is defined on Windows
- # so we don't get
- # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
- # in Appveyor.
- # In a sane world, the `from libc.string cimport` above would fail
- # loudly.
- pass
-
-
-cdef extern from "parser/tokenizer.h":
-
- ctypedef enum ParserState:
- START_RECORD
- START_FIELD
- ESCAPED_CHAR
- IN_FIELD
- IN_QUOTED_FIELD
- ESCAPE_IN_QUOTED_FIELD
- QUOTE_IN_QUOTED_FIELD
- EAT_CRNL
- EAT_CRNL_NOP
- EAT_WHITESPACE
- EAT_COMMENT
- EAT_LINE_COMMENT
- WHITESPACE_LINE
- SKIP_LINE
- FINISHED
-
- enum: ERROR_OVERFLOW
-
- ctypedef enum BadLineHandleMethod:
- ERROR,
- WARN,
- SKIP
-
- ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
- int *status, const char *encoding_errors)
- ctypedef int (*io_cleanup)(void *src)
-
- ctypedef struct parser_t:
- void *source
- io_callback cb_io
- io_cleanup cb_cleanup
-
- int64_t chunksize # Number of bytes to prepare for each chunk
- char *data # pointer to data to be processed
- int64_t datalen # amount of data available
- int64_t datapos
-
- # where to write out tokenized data
- char *stream
- uint64_t stream_len
- uint64_t stream_cap
-
- # Store words in (potentially ragged) matrix for now, hmm
- char **words
- int64_t *word_starts # where we are in the stream
- uint64_t words_len
- uint64_t words_cap
- uint64_t max_words_cap # maximum word cap encountered
-
- char *pword_start # pointer to stream start of current field
- int64_t word_start # position start of current field
-
- int64_t *line_start # position in words for start of line
- int64_t *line_fields # Number of fields in each line
- uint64_t lines # Number of lines observed
- uint64_t file_lines # Number of lines observed (with bad/skipped)
- uint64_t lines_cap # Vector capacity
-
- # Tokenizing stuff
- ParserState state
-        int doublequote           # is " represented by ""?
-        char delimiter            # field separator
-        int delim_whitespace      # consume tabs / spaces instead
-        char quotechar            # quote character
-        char escapechar           # escape character
-        char lineterminator
-        int skipinitialspace      # ignore spaces following delimiter?
-        int quoting               # style of quoting to write
-
- char commentchar
- int allow_embedded_newline
-
- int usecols
-
- Py_ssize_t expected_fields
- BadLineHandleMethod on_bad_lines
-
- # floating point options
- char decimal
- char sci
-
- # thousands separator (comma, period)
- char thousands
-
- int header # Boolean: 1: has header, 0: no header
- int64_t header_start # header row start
- uint64_t header_end # header row end
-
- void *skipset
- PyObject *skipfunc
- int64_t skip_first_N_rows
- int64_t skipfooter
- # pick one, depending on whether the converter requires GIL
- float64_t (*double_converter)(const char *, char **,
- char, char, char,
- int, int *, int *) nogil
-
- # error handling
- char *warn_msg
- char *error_msg
-
- int64_t skip_empty_lines
-
- ctypedef struct coliter_t:
- char **words
- int64_t *line_start
- int64_t col
-
- ctypedef struct uint_state:
- int seen_sint
- int seen_uint
- int seen_null
-
- void uint_state_init(uint_state *self)
- int uint64_conflict(uint_state *self)
-
- void coliter_setup(coliter_t *it, parser_t *parser,
- int64_t i, int64_t start) nogil
- void COLITER_NEXT(coliter_t, const char *) nogil
-
- parser_t* parser_new()
-
- int parser_init(parser_t *self) nogil
- void parser_free(parser_t *self) nogil
- void parser_del(parser_t *self) nogil
- int parser_add_skiprow(parser_t *self, int64_t row)
-
- int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)
-
- void parser_set_default_options(parser_t *self)
-
- int parser_consume_rows(parser_t *self, size_t nrows)
-
- int parser_trim_buffers(parser_t *self)
-
- int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
- int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
-
- int64_t str_to_int64(char *p_item, int64_t int_min,
- int64_t int_max, int *error, char tsep) nogil
- uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
- uint64_t uint_max, int *error, char tsep) nogil
-
- float64_t xstrtod(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) nogil
- float64_t precise_xstrtod(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) nogil
- float64_t round_trip(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) nogil
-
- int to_boolean(const char *item, uint8_t *val) nogil
-
-
-cdef extern from "parser/io.h":
- void *new_rd_source(object obj) except NULL
-
- int del_rd_source(void *src)
-
- void* buffer_rd_bytes(void *source, size_t nbytes,
- size_t *bytes_read, int *status, const char *encoding_errors)
-
-
-cdef class TextReader:
- """
-
- # source: StringIO or file object
-
-    .. versionchanged:: 1.2.0
-         removed the 'compression', 'memory_map', and 'encoding' arguments.
-         These arguments are outsourced to CParserWrapper.
-         'source' has to be a file handle.
- """
-
- cdef:
- parser_t *parser
- object na_fvalues
- object true_values, false_values
- object handle
- object orig_header
- bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
- bint allow_leading_cols
- uint64_t parser_start # this is modified after __init__
- list clocks
- const char *encoding_errors
- kh_str_starts_t *false_set
- kh_str_starts_t *true_set
- int64_t buffer_lines, skipfooter
- list dtype_cast_order # list[np.dtype]
- list names # can be None
- set noconvert # set[int]
-
- cdef public:
- int64_t leading_cols, table_width
- object delimiter # bytes or str
- object converters
- object na_values
- list header # list[list[non-negative integers]]
- object index_col
- object skiprows
- object dtype
- object usecols
- set unnamed_cols # set[str]
- str dtype_backend
-
- def __cinit__(self, source,
- delimiter=b",", # bytes | str
- header=0,
- int64_t header_start=0,
- uint64_t header_end=0,
- index_col=None,
- names=None,
- tokenize_chunksize=DEFAULT_CHUNKSIZE,
- bint delim_whitespace=False,
- converters=None,
- bint skipinitialspace=False,
- escapechar=None, # bytes | str
- bint doublequote=True,
- quotechar=b'"',
- quoting=0, # int
- lineterminator=None, # bytes | str
- comment=None,
- decimal=b".", # bytes | str
- thousands=None, # bytes | str
- dtype=None,
- usecols=None,
- on_bad_lines=ERROR,
- bint na_filter=True,
- na_values=None,
- na_fvalues=None,
- bint keep_default_na=True,
- true_values=None,
- false_values=None,
- bint allow_leading_cols=True,
- skiprows=None,
- skipfooter=0, # int64_t
- bint verbose=False,
- float_precision=None,
- bint skip_blank_lines=True,
- encoding_errors=b"strict",
- dtype_backend="numpy"):
-
- # set encoding for native Python and C library
- if isinstance(encoding_errors, str):
- encoding_errors = encoding_errors.encode("utf-8")
- elif encoding_errors is None:
- encoding_errors = b"strict"
- Py_INCREF(encoding_errors)
- self.encoding_errors = PyBytes_AsString(encoding_errors)
-
- self.parser = parser_new()
- self.parser.chunksize = tokenize_chunksize
-
- # For timekeeping
- self.clocks = []
-
- self.parser.usecols = (usecols is not None)
-
- self._setup_parser_source(source)
- parser_set_default_options(self.parser)
-
- parser_init(self.parser)
-
- if delim_whitespace:
- self.parser.delim_whitespace = delim_whitespace
- else:
- if len(delimiter) > 1:
-                raise ValueError("only length-1 separators supported right now")
- self.parser.delimiter = <char>ord(delimiter)
-
- # ----------------------------------------
- # parser options
-
- self.parser.doublequote = doublequote
- self.parser.skipinitialspace = skipinitialspace
- self.parser.skip_empty_lines = skip_blank_lines
-
- if lineterminator is not None:
- if len(lineterminator) != 1:
- raise ValueError("Only length-1 line terminators supported")
- self.parser.lineterminator = <char>ord(lineterminator)
-
- if len(decimal) != 1:
- raise ValueError("Only length-1 decimal markers supported")
- self.parser.decimal = <char>ord(decimal)
-
- if thousands is not None:
- if len(thousands) != 1:
- raise ValueError("Only length-1 thousands markers supported")
- self.parser.thousands = <char>ord(thousands)
-
- if escapechar is not None:
- if len(escapechar) != 1:
- raise ValueError("Only length-1 escapes supported")
- self.parser.escapechar = <char>ord(escapechar)
-
- self._set_quoting(quotechar, quoting)
-
- dtype_order = ["int64", "float64", "bool", "object"]
- if quoting == QUOTE_NONNUMERIC:
- # consistent with csv module semantics, cast all to float
- dtype_order = dtype_order[1:]
- self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
-
- if comment is not None:
- if len(comment) > 1:
- raise ValueError("Only length-1 comment characters supported")
- self.parser.commentchar = <char>ord(comment)
-
- self.parser.on_bad_lines = on_bad_lines
-
- self.skiprows = skiprows
- if skiprows is not None:
- self._make_skiprow_set()
-
- self.skipfooter = skipfooter
-
- if usecols is not None:
- self.has_usecols = 1
- # GH-20558, validate usecols at higher level and only pass clean
- # usecols into TextReader.
- self.usecols = usecols
-
- if skipfooter > 0:
- self.parser.on_bad_lines = SKIP
-
- self.delimiter = delimiter
-
- self.na_values = na_values
- if na_fvalues is None:
- na_fvalues = set()
- self.na_fvalues = na_fvalues
-
- self.true_values = _maybe_encode(true_values) + _true_values
- self.false_values = _maybe_encode(false_values) + _false_values
-
- self.true_set = kset_from_list(self.true_values)
- self.false_set = kset_from_list(self.false_values)
-
- self.keep_default_na = keep_default_na
- self.converters = converters
- self.na_filter = na_filter
-
- self.verbose = verbose
-
- if float_precision == "round_trip":
- # see gh-15140
- self.parser.double_converter = round_trip
- elif float_precision == "legacy":
- self.parser.double_converter = xstrtod
- elif float_precision == "high" or float_precision is None:
- self.parser.double_converter = precise_xstrtod
- else:
- raise ValueError(f"Unrecognized float_precision option: "
- f"{float_precision}")
-
- # Caller is responsible for ensuring we have one of
- # - None
- # - DtypeObj
- # - dict[Any, DtypeObj]
- self.dtype = dtype
- self.dtype_backend = dtype_backend
-
- self.noconvert = set()
-
- self.index_col = index_col
-
- # ----------------------------------------
- # header stuff
-
- self.allow_leading_cols = allow_leading_cols
- self.leading_cols = 0 # updated in _get_header
-
- # TODO: no header vs. header is not the first row
- self.has_mi_columns = 0
- self.orig_header = header
- if header is None:
- # sentinel value
- self.parser.header_start = -1
- self.parser.header_end = -1
- self.parser.header = -1
- self.parser_start = 0
- prelim_header = []
- else:
- if isinstance(header, list):
- if len(header) > 1:
- # need to artificially skip the final line
- # which is still a header line
- header = list(header)
- header.append(header[-1] + 1)
- self.parser.header_end = header[-1]
- self.has_mi_columns = 1
- else:
- self.parser.header_end = header[0]
-
- self.parser_start = header[-1] + 1
- self.parser.header_start = header[0]
- self.parser.header = header[0]
- prelim_header = header
- else:
- self.parser.header_start = header
- self.parser.header_end = header
- self.parser_start = header + 1
- self.parser.header = header
- prelim_header = [header]
-
- self.names = names
- header, table_width, unnamed_cols = self._get_header(prelim_header)
- # header, table_width, and unnamed_cols are set here, never changed
- self.header = header
- self.table_width = table_width
- self.unnamed_cols = unnamed_cols
-
- if not self.table_width:
- raise EmptyDataError("No columns to parse from file")
-
- # Compute buffer_lines as function of table width.
- heuristic = 2**20 // self.table_width
- self.buffer_lines = 1
- while self.buffer_lines * 2 < heuristic:
- self.buffer_lines *= 2
-
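# The loop above sizes the read-ahead buffer: it settles on the largest power
# of two below 2**20 // table_width. A plain-Python equivalent for intuition:
def buffer_lines_for(table_width: int) -> int:
    heuristic = 2 ** 20 // table_width
    lines = 1
    while lines * 2 < heuristic:
        lines *= 2
    return lines

buffer_lines_for(10)   # 65536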
- def __init__(self, *args, **kwargs):
- pass
-
- def __dealloc__(self):
- _close(self)
- parser_del(self.parser)
-
- def close(self):
- _close(self)
-
- def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
- if not isinstance(quoting, int):
- raise TypeError('"quoting" must be an integer')
-
- if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
- raise TypeError('bad "quoting" value')
-
- if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
- dtype = type(quote_char).__name__
- raise TypeError(f'"quotechar" must be string, not {dtype}')
-
- if quote_char is None or quote_char == "":
- if quoting != QUOTE_NONE:
- raise TypeError("quotechar must be set if quoting enabled")
- self.parser.quoting = quoting
- self.parser.quotechar = -1
- elif len(quote_char) > 1: # 0-len case handled earlier
- raise TypeError('"quotechar" must be a 1-character string')
- else:
- self.parser.quoting = quoting
- self.parser.quotechar = <char>ord(quote_char)
-
- cdef _make_skiprow_set(self):
- if util.is_integer_object(self.skiprows):
- parser_set_skipfirstnrows(self.parser, self.skiprows)
- elif not callable(self.skiprows):
- for i in self.skiprows:
- parser_add_skiprow(self.parser, i)
- else:
- self.parser.skipfunc = <PyObject *>self.skiprows
-
- cdef _setup_parser_source(self, source):
- cdef:
- void *ptr
-
- ptr = new_rd_source(source)
- self.parser.source = ptr
- self.parser.cb_io = &buffer_rd_bytes
- self.parser.cb_cleanup = &del_rd_source
-
- cdef _get_header(self, list prelim_header):
- # header is now a list of lists, so field_count should use header[0]
- #
- # modifies:
- # self.parser attributes
- # self.parser_start
- # self.leading_cols
-
- cdef:
- Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
- char *word
- str name
- uint64_t hr, data_line = 0
- list header = []
- set unnamed_cols = set()
-
- if self.parser.header_start >= 0:
-
- # Header is in the file
- for level, hr in enumerate(prelim_header):
-
- this_header = []
-
- if self.parser.lines < hr + 1:
- self._tokenize_rows(hr + 2)
-
- if self.parser.lines == 0:
- field_count = 0
- start = self.parser.line_start[0]
-
- # e.g., if header=3 and file only has 2 lines
- elif (self.parser.lines < hr + 1
- and not isinstance(self.orig_header, list)) or (
- self.parser.lines < hr):
- msg = self.orig_header
- if isinstance(msg, list):
- joined = ",".join(str(m) for m in msg)
- msg = f"[{joined}], len of {len(msg)},"
- raise ParserError(
- f"Passed header={msg} but only "
- f"{self.parser.lines} lines in file")
-
- else:
- field_count = self.parser.line_fields[hr]
- start = self.parser.line_start[hr]
-
- unnamed_count = 0
- unnamed_col_indices = []
-
- for i in range(field_count):
- word = self.parser.words[start + i]
-
- name = PyUnicode_DecodeUTF8(word, strlen(word),
- self.encoding_errors)
-
- if name == "":
- if self.has_mi_columns:
- name = f"Unnamed: {i}_level_{level}"
- else:
- name = f"Unnamed: {i}"
-
- unnamed_count += 1
- unnamed_col_indices.append(i)
-
- this_header.append(name)
-
- if not self.has_mi_columns:
- # Ensure that regular columns are used before unnamed ones
- # to keep given names and mangle unnamed columns
- col_loop_order = [i for i in range(len(this_header))
- if i not in unnamed_col_indices
- ] + unnamed_col_indices
- counts = {}
-
- for i in col_loop_order:
- col = this_header[i]
- old_col = col
- cur_count = counts.get(col, 0)
-
- if cur_count > 0:
- while cur_count > 0:
- counts[old_col] = cur_count + 1
- col = f"{old_col}.{cur_count}"
- if col in this_header:
- cur_count += 1
- else:
- cur_count = counts.get(col, 0)
-
- if (
- self.dtype is not None
- and is_dict_like(self.dtype)
- and self.dtype.get(old_col) is not None
- and self.dtype.get(col) is None
- ):
- self.dtype.update({col: self.dtype.get(old_col)})
-
- this_header[i] = col
- counts[col] = cur_count + 1
-
- if self.has_mi_columns:
-
- # If we have grabbed an extra line, but it's not in our
-                # format, save in the buffer, and create a blank extra
- # line for the rest of the parsing code.
- if hr == prelim_header[-1]:
- lc = len(this_header)
- ic = (len(self.index_col) if self.index_col
- is not None else 0)
-
- # if wrong number of blanks or no index, not our format
- if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
- hr -= 1
- self.parser_start -= 1
- this_header = [None] * lc
-
- data_line = hr + 1
- header.append(this_header)
- unnamed_cols.update({this_header[i] for i in unnamed_col_indices})
-
- if self.names is not None:
- header = [self.names]
-
- elif self.names is not None:
- # Names passed
- if self.parser.lines < 1:
- if not self.has_usecols:
- self.parser.expected_fields = len(self.names)
- self._tokenize_rows(1)
-
- header = [self.names]
-
- if self.parser.lines < 1:
- field_count = len(header[0])
- else:
- field_count = self.parser.line_fields[data_line]
-
- # Enforce this unless usecols
- if not self.has_usecols:
- self.parser.expected_fields = max(field_count, len(self.names))
-
- else:
- # No header passed nor to be found in the file
- if self.parser.lines < 1:
- self._tokenize_rows(1)
-
- return None, self.parser.line_fields[0], unnamed_cols
-
- # Corner case, not enough lines in the file
- if self.parser.lines < data_line + 1:
- field_count = len(header[0])
- else:
-
- field_count = self.parser.line_fields[data_line]
-
- # #2981
- if self.names is not None:
- field_count = max(field_count, len(self.names))
-
- passed_count = len(header[0])
-
- if (self.has_usecols and self.allow_leading_cols and
- not callable(self.usecols)):
- nuse = len(self.usecols)
- if nuse == passed_count:
- self.leading_cols = 0
- elif self.names is None and nuse < passed_count:
- self.leading_cols = field_count - passed_count
- elif passed_count != field_count:
- raise ValueError("Number of passed names did not match number of "
- "header fields in the file")
- # oh boy, #2442, #2981
- elif self.allow_leading_cols and passed_count < field_count:
- self.leading_cols = field_count - passed_count
-
- return header, field_count, unnamed_cols
-
- def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]:
- """
- rows=None --> read all rows
- """
- # Don't care about memory usage
- columns = self._read_rows(rows, 1)
-
- return columns
-
- def read_low_memory(self, rows: int | None)-> list[dict[int, "ArrayLike"]]:
- """
- rows=None --> read all rows
- """
- # Conserve intermediate space
- # Caller is responsible for concatenating chunks,
- # see c_parser_wrapper._concatenate_chunks
- cdef:
- size_t rows_read = 0
- list chunks = []
-
- if rows is None:
- while True:
- try:
- chunk = self._read_rows(self.buffer_lines, 0)
- if len(chunk) == 0:
- break
- except StopIteration:
- break
- else:
- chunks.append(chunk)
- else:
- while rows_read < rows:
- try:
- crows = min(self.buffer_lines, rows - rows_read)
-
- chunk = self._read_rows(crows, 0)
- if len(chunk) == 0:
- break
-
- rows_read += len(list(chunk.values())[0])
- except StopIteration:
- break
- else:
- chunks.append(chunk)
-
- parser_trim_buffers(self.parser)
-
- if len(chunks) == 0:
- raise StopIteration
-
- return chunks
-
- cdef _tokenize_rows(self, size_t nrows):
- cdef:
- int status
-
- with nogil:
- status = tokenize_nrows(self.parser, nrows, self.encoding_errors)
-
- self._check_tokenize_status(status)
-
- cdef _check_tokenize_status(self, int status):
- if self.parser.warn_msg != NULL:
- print(PyUnicode_DecodeUTF8(
- self.parser.warn_msg, strlen(self.parser.warn_msg),
- self.encoding_errors), file=sys.stderr)
- free(self.parser.warn_msg)
- self.parser.warn_msg = NULL
-
- if status < 0:
- raise_parser_error("Error tokenizing data", self.parser)
-
- # -> dict[int, "ArrayLike"]
- cdef _read_rows(self, rows, bint trim):
- cdef:
- int64_t buffered_lines
- int64_t irows
-
- self._start_clock()
-
- if rows is not None:
- irows = rows
- buffered_lines = self.parser.lines - self.parser_start
- if buffered_lines < irows:
- self._tokenize_rows(irows - buffered_lines)
-
- if self.skipfooter > 0:
- raise ValueError("skipfooter can only be used to read "
- "the whole file")
- else:
- with nogil:
- status = tokenize_all_rows(self.parser, self.encoding_errors)
-
- self._check_tokenize_status(status)
-
- if self.parser_start >= self.parser.lines:
- raise StopIteration
- self._end_clock("Tokenization")
-
- self._start_clock()
- columns = self._convert_column_data(rows)
- self._end_clock("Type conversion")
- self._start_clock()
- if len(columns) > 0:
- rows_read = len(list(columns.values())[0])
- # trim
- parser_consume_rows(self.parser, rows_read)
- if trim:
- parser_trim_buffers(self.parser)
- self.parser_start -= rows_read
-
- self._end_clock("Parser memory cleanup")
-
- return columns
-
- cdef _start_clock(self):
- self.clocks.append(time.time())
-
- cdef _end_clock(self, str what):
- if self.verbose:
- elapsed = time.time() - self.clocks.pop(-1)
- print(f"{what} took: {elapsed * 1000:.2f} ms")
-
- def set_noconvert(self, i: int) -> None:
- self.noconvert.add(i)
-
- def remove_noconvert(self, i: int) -> None:
- self.noconvert.remove(i)
-
- def _convert_column_data(self, rows: int | None) -> dict[int, "ArrayLike"]:
- cdef:
- int64_t i
- int nused
- kh_str_starts_t *na_hashset = NULL
- int64_t start, end
- object name, na_flist, col_dtype = None
- bint na_filter = 0
- int64_t num_cols
- dict results
-
- start = self.parser_start
-
- if rows is None:
- end = self.parser.lines
- else:
- end = min(start + rows, self.parser.lines)
-
- num_cols = -1
- # Py_ssize_t cast prevents build warning
- for i in range(<Py_ssize_t>self.parser.lines):
- num_cols = (num_cols < self.parser.line_fields[i]) * \
- self.parser.line_fields[i] + \
- (num_cols >= self.parser.line_fields[i]) * num_cols
-
- usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols
- names_larger_num_cols = (self.names and
- len(self.names) - self.leading_cols > num_cols)
-
- if self.table_width - self.leading_cols > num_cols:
- if (usecols_not_callable_and_exists
- and self.table_width - self.leading_cols < len(self.usecols)
- or names_larger_num_cols):
- raise ParserError(f"Too many columns specified: expected "
- f"{self.table_width - self.leading_cols} "
- f"and found {num_cols}")
-
- if (usecols_not_callable_and_exists and
- all(isinstance(u, int) for u in self.usecols)):
- missing_usecols = [col for col in self.usecols if col >= num_cols]
- if missing_usecols:
- raise ParserError(
- "Defining usecols without of bounds indices is not allowed. "
- f"{missing_usecols} are out of bounds.",
- )
-
- results = {}
- nused = 0
- is_default_dict_dtype = isinstance(self.dtype, defaultdict)
-
- for i in range(self.table_width):
- if i < self.leading_cols:
- # Pass through leading columns always
- name = i
- elif (self.usecols and not callable(self.usecols) and
- nused == len(self.usecols)):
- # Once we've gathered all requested columns, stop. GH5766
- break
- else:
- name = self._get_column_name(i, nused)
- usecols = set()
- if callable(self.usecols):
- if self.usecols(name):
- usecols = {i}
- else:
- usecols = self.usecols
- if self.has_usecols and not (i in usecols or
- name in usecols):
- continue
- nused += 1
-
- conv = self._get_converter(i, name)
-
- col_dtype = None
- if self.dtype is not None:
- if isinstance(self.dtype, dict):
- if name in self.dtype:
- col_dtype = self.dtype[name]
- elif i in self.dtype:
- col_dtype = self.dtype[i]
- elif is_default_dict_dtype:
- col_dtype = self.dtype[name]
- else:
- if self.dtype.names:
- # structured array
- col_dtype = np.dtype(self.dtype.descr[i][1])
- else:
- col_dtype = self.dtype
-
- if conv:
- if col_dtype is not None:
- warnings.warn((f"Both a converter and dtype were specified "
- f"for column {name} - only the converter will "
- f"be used."), ParserWarning,
- stacklevel=find_stack_level())
- results[i] = _apply_converter(conv, self.parser, i, start, end)
- continue
-
- # Collect the list of NaN values associated with the column.
- # If we aren't supposed to do that, or none are collected,
- # we set `na_filter` to `0` (`1` otherwise).
- na_flist = set()
-
- if self.na_filter:
- na_list, na_flist = self._get_na_list(i, name)
- if na_list is None:
- na_filter = 0
- else:
- na_filter = 1
- na_hashset = kset_from_list(na_list)
- else:
- na_filter = 0
-
- # Attempt to parse tokens and infer dtype of the column.
- # Should return as the desired dtype (inferred or specified).
- try:
- col_res, na_count = self._convert_tokens(
- i, start, end, name, na_filter, na_hashset,
- na_flist, col_dtype)
- finally:
- # gh-21353
- #
- # Cleanup the NaN hash that we generated
- # to avoid memory leaks.
- if na_filter:
- self._free_na_set(na_hashset)
-
- # don't try to upcast EAs
- if (
- na_count > 0 and not is_extension_array_dtype(col_dtype)
- or self.dtype_backend != "numpy"
- ):
- use_dtype_backend = self.dtype_backend != "numpy" and col_dtype is None
- col_res = _maybe_upcast(
- col_res,
- use_dtype_backend=use_dtype_backend,
- dtype_backend=self.dtype_backend,
- )
-
- if col_res is None:
- raise ParserError(f"Unable to parse column {i}")
-
- results[i] = col_res
-
- self.parser_start += end - start
-
- return results
-
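The column-count and usecols bounds checks above surface directly through the public read_csv API: asking for integer usecols that the file does not have is rejected before any conversion runs. A minimal illustration; the exception type assumes the C-engine path shown here, and some versions raise ValueError from the Python layer instead.

import io
import pandas as pd
from pandas.errors import ParserError

csv = "a,b\n1,2\n3,4\n"
try:
    pd.read_csv(io.StringIO(csv), usecols=[0, 5])   # column 5 does not exist
except (ParserError, ValueError) as err:
    print(type(err).__name__, err)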
- # -> tuple["ArrayLike", int]:
- cdef _convert_tokens(self, Py_ssize_t i, int64_t start,
- int64_t end, object name, bint na_filter,
- kh_str_starts_t *na_hashset,
- object na_flist, object col_dtype):
-
- if col_dtype is not None:
- col_res, na_count = self._convert_with_dtype(
- col_dtype, i, start, end, na_filter,
- 1, na_hashset, na_flist)
-
- # Fallback on the parse (e.g. we requested int dtype,
-        # but it's actually a float).
- if col_res is not None:
- return col_res, na_count
-
- if i in self.noconvert:
- return self._string_convert(i, start, end, na_filter, na_hashset)
- else:
- col_res = None
- for dt in self.dtype_cast_order:
- try:
- col_res, na_count = self._convert_with_dtype(
- dt, i, start, end, na_filter, 0, na_hashset, na_flist)
- except ValueError:
- # This error is raised from trying to convert to uint64,
- # and we discover that we cannot convert to any numerical
- # dtype successfully. As a result, we leave the data
- # column AS IS with object dtype.
- col_res, na_count = self._convert_with_dtype(
- np.dtype("object"), i, start, end, 0,
- 0, na_hashset, na_flist)
- except OverflowError:
- col_res, na_count = self._convert_with_dtype(
- np.dtype("object"), i, start, end, na_filter,
- 0, na_hashset, na_flist)
-
- if col_res is not None:
- break
-
- # we had a fallback parse on the dtype, so now try to cast
- if col_res is not None and col_dtype is not None:
- # If col_res is bool, it might actually be a bool array mixed with NaNs
- # (see _try_bool_flex()). Usually this would be taken care of using
- # _maybe_upcast(), but if col_dtype is a floating type we should just
- # take care of that cast here.
- if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
- mask = col_res.view(np.uint8) == na_values[np.uint8]
- col_res = col_res.astype(col_dtype)
- np.putmask(col_res, mask, np.nan)
- return col_res, na_count
-
- # NaNs are already cast to True here, so can not use astype
- if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
- if na_count > 0:
- raise ValueError(
- f"cannot safely convert passed user dtype of "
- f"{col_dtype} for {np.bool_} dtyped data in "
- f"column {i} due to NA values"
- )
-
- # only allow safe casts, eg. with a nan you cannot safely cast to int
- try:
- col_res = col_res.astype(col_dtype, casting="safe")
- except TypeError:
-
- # float -> int conversions can fail the above
- # even with no nans
- col_res_orig = col_res
- col_res = col_res.astype(col_dtype)
- if (col_res != col_res_orig).any():
- raise ValueError(
- f"cannot safely convert passed user dtype of "
- f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in "
- f"column {i}")
-
- return col_res, na_count
-
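The fallback-and-cast logic above is what makes an explicit integer dtype fail loudly when the column turns out to contain missing values, while plain inference silently upcasts to float64. A rough sketch of that user-visible behaviour through read_csv; exact messages vary by version.

import io
import pandas as pd

data = "a\n1\nNA\n3\n"                        # "NA" is a default missing marker
print(pd.read_csv(io.StringIO(data)).dtypes)  # inferred as float64 because of the NaN

try:
    pd.read_csv(io.StringIO(data), dtype={"a": "int64"})
except ValueError as err:
    print(err)                                # integer column has NA values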
- cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
- int64_t start, int64_t end,
- bint na_filter,
- bint user_dtype,
- kh_str_starts_t *na_hashset,
- object na_flist):
- if isinstance(dtype, CategoricalDtype):
- # TODO: I suspect that _categorical_convert could be
- # optimized when dtype is an instance of CategoricalDtype
- codes, cats, na_count = _categorical_convert(
- self.parser, i, start, end, na_filter, na_hashset)
-
- # Method accepts list of strings, not encoded ones.
- true_values = [x.decode() for x in self.true_values]
- array_type = dtype.construct_array_type()
- cat = array_type._from_inferred_categories(
- cats, codes, dtype, true_values=true_values)
- return cat, na_count
-
- elif is_extension_array_dtype(dtype):
- result, na_count = self._string_convert(i, start, end, na_filter,
- na_hashset)
-
- array_type = dtype.construct_array_type()
- try:
- # use _from_sequence_of_strings if the class defines it
- if isinstance(dtype, BooleanDtype):
- # xref GH 47534: BooleanArray._from_sequence_of_strings has extra
- # kwargs
- true_values = [x.decode() for x in self.true_values]
- false_values = [x.decode() for x in self.false_values]
- result = array_type._from_sequence_of_strings(
- result, dtype=dtype, true_values=true_values,
- false_values=false_values)
- else:
- result = array_type._from_sequence_of_strings(result, dtype=dtype)
- except NotImplementedError:
- raise NotImplementedError(
- f"Extension Array: {array_type} must implement "
- f"_from_sequence_of_strings in order "
- f"to be used in parser methods")
-
- return result, na_count
-
- elif is_integer_dtype(dtype):
- try:
- result, na_count = _try_int64(self.parser, i, start,
- end, na_filter, na_hashset)
- if user_dtype and na_count is not None:
- if na_count > 0:
- raise ValueError(f"Integer column has NA values in column {i}")
- except OverflowError:
- result = _try_uint64(self.parser, i, start, end,
- na_filter, na_hashset)
- na_count = 0
-
- if result is not None and dtype != "int64":
- result = result.astype(dtype)
-
- return result, na_count
-
- elif is_float_dtype(dtype):
- result, na_count = _try_double(self.parser, i, start, end,
- na_filter, na_hashset, na_flist)
-
- if result is not None and dtype != "float64":
- result = result.astype(dtype)
- return result, na_count
- elif is_bool_dtype(dtype):
- result, na_count = _try_bool_flex(self.parser, i, start, end,
- na_filter, na_hashset,
- self.true_set, self.false_set)
- if user_dtype and na_count is not None:
- if na_count > 0:
- raise ValueError(f"Bool column has NA values in column {i}")
- return result, na_count
-
- elif dtype.kind == "S":
- # TODO: na handling
- width = dtype.itemsize
- if width > 0:
- result = _to_fw_string(self.parser, i, start, end, width)
- return result, 0
-
- # treat as a regular string parsing
- return self._string_convert(i, start, end, na_filter,
- na_hashset)
- elif dtype.kind == "U":
- width = dtype.itemsize
- if width > 0:
- raise TypeError(f"the dtype {dtype} is not supported for parsing")
-
- # unicode variable width
- return self._string_convert(i, start, end, na_filter,
- na_hashset)
- elif is_object_dtype(dtype):
- return self._string_convert(i, start, end, na_filter,
- na_hashset)
- elif is_datetime64_dtype(dtype):
- raise TypeError(f"the dtype {dtype} is not supported "
- f"for parsing, pass this column "
- f"using parse_dates instead")
- else:
- raise TypeError(f"the dtype {dtype} is not supported for parsing")
-
- # -> tuple[ndarray[object], int]
- cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
- bint na_filter, kh_str_starts_t *na_hashset):
-
- return _string_box_utf8(self.parser, i, start, end, na_filter,
- na_hashset, self.encoding_errors)
-
- def _get_converter(self, i: int, name):
- if self.converters is None:
- return None
-
- if name is not None and name in self.converters:
- return self.converters[name]
-
- # Converter for position, if any
- return self.converters.get(i)
-
- cdef _get_na_list(self, Py_ssize_t i, name):
- # Note: updates self.na_values, self.na_fvalues
- if self.na_values is None:
- return None, set()
-
- if isinstance(self.na_values, dict):
- key = None
- values = None
-
- if name is not None and name in self.na_values:
- key = name
- elif i in self.na_values:
- key = i
- else: # No na_values provided for this column.
- if self.keep_default_na:
- return _NA_VALUES, set()
-
- return list(), set()
-
- values = self.na_values[key]
- if values is not None and not isinstance(values, list):
- values = list(values)
-
- fvalues = self.na_fvalues[key]
- if fvalues is not None and not isinstance(fvalues, set):
- fvalues = set(fvalues)
-
- return _ensure_encoded(values), fvalues
- else:
- if not isinstance(self.na_values, list):
- self.na_values = list(self.na_values)
- if not isinstance(self.na_fvalues, set):
- self.na_fvalues = set(self.na_fvalues)
-
- return _ensure_encoded(self.na_values), self.na_fvalues
-
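_get_na_list above is the per-column half of the na_values/keep_default_na machinery; roughly, from the user side it looks like the following (the column names and markers are made up for the example).

import io
import pandas as pd

data = "city,score\nn/a,10\nunknown,NA\n"

# Extra marker for "city" only; the default markers still apply everywhere.
print(pd.read_csv(io.StringIO(data), na_values={"city": ["unknown"]}))

# With keep_default_na=False only the explicitly listed markers count,
# so "n/a" and "NA" should come through as plain strings.
print(pd.read_csv(io.StringIO(data), na_values={"city": ["unknown"]},
                  keep_default_na=False))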
- cdef _free_na_set(self, kh_str_starts_t *table):
- kh_destroy_str_starts(table)
-
- cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
- cdef int64_t j
- if self.has_usecols and self.names is not None:
- if (not callable(self.usecols) and
- len(self.names) == len(self.usecols)):
- return self.names[nused]
- else:
- return self.names[i - self.leading_cols]
- else:
- if self.header is not None:
- j = i - self.leading_cols
- # generate extra (bogus) headers if there are more columns than headers
- # These should be strings, not integers, because otherwise we might get
- # issues with callables as usecols GH#46997
- if j >= len(self.header[0]):
- return str(j)
- elif self.has_mi_columns:
- return tuple(header_row[j] for header_row in self.header)
- else:
- return self.header[0][j]
- else:
- return None
-
-
-# Factor out code common to TextReader.__dealloc__ and TextReader.close
-# It cannot be a class method, since calling self.close() in __dealloc__
-# would cause a class attribute lookup, which violates best practices
-# https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc
-cdef _close(TextReader reader):
- # also preemptively free all allocated memory
- parser_free(reader.parser)
- if reader.true_set:
- kh_destroy_str_starts(reader.true_set)
- reader.true_set = NULL
- if reader.false_set:
- kh_destroy_str_starts(reader.false_set)
- reader.false_set = NULL
-
-
-cdef:
- object _true_values = [b"True", b"TRUE", b"true"]
- object _false_values = [b"False", b"FALSE", b"false"]
-
-
-def _ensure_encoded(list lst):
- cdef:
- list result = []
- for x in lst:
- if isinstance(x, str):
- x = PyUnicode_AsUTF8String(x)
- elif not isinstance(x, bytes):
- x = str(x).encode("utf-8")
-
- result.append(x)
- return result
-
-
-# common NA values
-# no longer excluding inf representations
-# '1.#INF','-1.#INF', '1.#INF000000',
-STR_NA_VALUES = {
- "-1.#IND",
- "1.#QNAN",
- "1.#IND",
- "-1.#QNAN",
- "#N/A N/A",
- "#N/A",
- "N/A",
- "n/a",
- "NA",
- "<NA>",
- "#NA",
- "NULL",
- "null",
- "NaN",
- "-NaN",
- "nan",
- "-nan",
- "",
- "None",
-}
-_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
-
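STR_NA_VALUES above is the default set of strings the parser treats as missing; a quick check of a few of them through read_csv (behaviour as of the pandas version vendored here).

import io
import pandas as pd

df = pd.read_csv(io.StringIO("x\nNA\nnull\nn/a\n<NA>\n1\n"))
print(df["x"].isna().sum())   # 4 -- the four markers all parse as NaN
print(df["x"].dtype)          # float64, since NaN forces an upcast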
-
-def _maybe_upcast(
- arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
-):
-    """Set nullable dtypes or upcast if NaNs are present.
-
-    If use_dtype_backend is false and NaNs are present, upcast so that the
-    resulting dtype can hold the NA value. If the flag is true, use the
-    associated nullable dtype for every array instead.
-
- Parameters
- ----------
- arr: ndarray
- Numpy array that is potentially being upcast.
-
- use_dtype_backend: bool, default False
- If true, we cast to the associated nullable dtypes.
-
- Returns
- -------
- The casted array.
- """
- if is_extension_array_dtype(arr.dtype):
- # TODO: the docstring says arr is an ndarray, in which case this cannot
- # be reached. Is that incorrect?
- return arr
-
- na_value = na_values[arr.dtype]
-
- if issubclass(arr.dtype.type, np.integer):
- mask = arr == na_value
-
- if use_dtype_backend:
- arr = IntegerArray(arr, mask)
- else:
- arr = arr.astype(float)
- np.putmask(arr, mask, np.nan)
-
- elif arr.dtype == np.bool_:
- mask = arr.view(np.uint8) == na_value
-
- if use_dtype_backend:
- arr = BooleanArray(arr, mask)
- else:
- arr = arr.astype(object)
- np.putmask(arr, mask, np.nan)
-
- elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32:
- if use_dtype_backend:
- mask = np.isnan(arr)
- arr = FloatingArray(arr, mask)
-
- elif arr.dtype == np.object_:
- if use_dtype_backend:
- arr = StringDtype().construct_array_type()._from_sequence(arr)
-
- if use_dtype_backend and dtype_backend == "pyarrow":
- import pyarrow as pa
- if isinstance(arr, IntegerArray) and arr.isna().all():
- # use null instead of int64 in pyarrow
- arr = arr.to_numpy()
- arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
-
- return arr
-
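For the default (non-nullable) backend, the integer branch of _maybe_upcast amounts to the numpy-only sketch below: NAs were tokenized as the dtype's sentinel (see _compute_na_values further down), and the upcast to float64 is what makes them representable as NaN. This is a simplified mirror of the code above, not the function itself.

import numpy as np

def upcast_int64_with_sentinel(arr: np.ndarray) -> np.ndarray:
    sentinel = np.iinfo(np.int64).min       # NA marker used while parsing
    mask = arr == sentinel
    out = arr.astype(np.float64)            # upcast so NaN fits
    np.putmask(out, mask, np.nan)
    return out

parsed = np.array([1, np.iinfo(np.int64).min, 3], dtype=np.int64)
print(upcast_int64_with_sentinel(parsed))   # [ 1. nan  3.]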
-
-# ----------------------------------------------------------------------
-# Type conversions / inference support code
-
-
-# -> tuple[ndarray[object], int]
-cdef _string_box_utf8(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset,
- const char *encoding_errors):
- cdef:
- int na_count = 0
- Py_ssize_t i, lines
- coliter_t it
- const char *word = NULL
- ndarray[object] result
-
- int ret = 0
- kh_strbox_t *table
-
- object pyval
-
- object NA = na_values[np.object_]
- khiter_t k
-
- table = kh_init_strbox()
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.object_)
- coliter_setup(&it, parser, col, line_start)
-
- for i in range(lines):
- COLITER_NEXT(it, word)
-
- if na_filter:
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count += 1
- result[i] = NA
- continue
-
- k = kh_get_strbox(table, word)
-
- # in the hash table
- if k != table.n_buckets:
- # this increments the refcount, but need to test
- pyval = <object>table.vals[k]
- else:
- # box it. new ref?
- pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors)
-
- k = kh_put_strbox(table, word, &ret)
- table.vals[k] = <PyObject *>pyval
-
- result[i] = pyval
-
- kh_destroy_strbox(table)
-
- return result, na_count
-
-
-@cython.boundscheck(False)
-cdef _categorical_convert(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset):
- "Convert column data into codes, categories"
- cdef:
- int na_count = 0
- Py_ssize_t i, lines
- coliter_t it
- const char *word = NULL
-
- int64_t NA = -1
- int64_t[::1] codes
- int64_t current_category = 0
-
- int ret = 0
- kh_str_t *table
- khiter_t k
-
- lines = line_end - line_start
- codes = np.empty(lines, dtype=np.int64)
-
- # factorize parsed values, creating a hash table
- # bytes -> category code
- with nogil:
- table = kh_init_str()
- coliter_setup(&it, parser, col, line_start)
-
- for i in range(lines):
- COLITER_NEXT(it, word)
-
- if na_filter:
- if kh_get_str_starts_item(na_hashset, word):
- # is in NA values
- na_count += 1
- codes[i] = NA
- continue
-
- k = kh_get_str(table, word)
- # not in the hash table
- if k == table.n_buckets:
- k = kh_put_str(table, word, &ret)
- table.vals[k] = current_category
- current_category += 1
-
- codes[i] = table.vals[k]
-
- # parse and box categories to python strings
- result = np.empty(table.n_occupied, dtype=np.object_)
- for k in range(table.n_buckets):
- if kh_exist_str(table, k):
- result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
-
- kh_destroy_str(table)
- return np.asarray(codes), result, na_count
-
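In plain Python, the factorization performed by _categorical_convert boils down roughly to the sketch below: each distinct token gets the next integer code, NA tokens get -1, and the categories come out in first-seen order.

def categorical_convert(words, na_markers):
    codes, categories, seen = [], [], {}
    for w in words:
        if w in na_markers:
            codes.append(-1)                # NA sentinel code
            continue
        if w not in seen:
            seen[w] = len(categories)       # next unused code
            categories.append(w)
        codes.append(seen[w])
    return codes, categories

print(categorical_convert(["a", "b", "a", "NA"], {"NA"}))
# ([0, 1, 0, -1], ['a', 'b'])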
-
-# -> ndarray[f'|S{width}']
-cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
- int64_t line_end, int64_t width):
- cdef:
- char *data
- ndarray result
-
- result = np.empty(line_end - line_start, dtype=f"|S{width}")
- data = <char*>result.data
-
- with nogil:
- _to_fw_string_nogil(parser, col, line_start, line_end, width, data)
-
- return result
-
-
-cdef void _to_fw_string_nogil(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- size_t width, char *data) nogil:
- cdef:
- int64_t i
- coliter_t it
- const char *word = NULL
-
- coliter_setup(&it, parser, col, line_start)
-
- for i in range(line_end - line_start):
- COLITER_NEXT(it, word)
- strncpy(data, word, width)
- data += width
-
-
-cdef:
- char* cinf = b"inf"
- char* cposinf = b"+inf"
- char* cneginf = b"-inf"
-
- char* cinfty = b"Infinity"
- char* cposinfty = b"+Infinity"
- char* cneginfty = b"-Infinity"
-
-
-# -> tuple[ndarray[float64_t], int] | tuple[None, None]
-cdef _try_double(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
- cdef:
- int error, na_count = 0
- Py_ssize_t lines
- float64_t *data
- float64_t NA = na_values[np.float64]
- kh_float64_t *na_fset
- ndarray[float64_t] result
- bint use_na_flist = len(na_flist) > 0
-
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.float64)
- data = <float64_t *>result.data
- na_fset = kset_float64_from_list(na_flist)
- with nogil:
- error = _try_double_nogil(parser, parser.double_converter,
- col, line_start, line_end,
- na_filter, na_hashset, use_na_flist,
- na_fset, NA, data, &na_count)
-
- kh_destroy_float64(na_fset)
- if error != 0:
- return None, None
- return result, na_count
-
-
-cdef int _try_double_nogil(parser_t *parser,
- float64_t (*double_converter)(
- const char *, char **, char,
- char, char, int, int *, int *) nogil,
- int64_t col, int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset,
- bint use_na_flist,
- const kh_float64_t *na_flist,
- float64_t NA, float64_t *data,
- int *na_count) nogil:
- cdef:
- int error = 0,
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
- char *p_end
- khiter_t k64
-
- na_count[0] = 0
- coliter_setup(&it, parser, col, line_start)
-
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
-
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count[0] += 1
- data[0] = NA
- else:
- data[0] = double_converter(word, &p_end, parser.decimal,
- parser.sci, parser.thousands,
- 1, &error, NULL)
- if error != 0 or p_end == word or p_end[0]:
- error = 0
- if (strcasecmp(word, cinf) == 0 or
- strcasecmp(word, cposinf) == 0 or
- strcasecmp(word, cinfty) == 0 or
- strcasecmp(word, cposinfty) == 0):
- data[0] = INF
- elif (strcasecmp(word, cneginf) == 0 or
- strcasecmp(word, cneginfty) == 0):
- data[0] = NEGINF
- else:
- return 1
- if use_na_flist:
- k64 = kh_get_float64(na_flist, data[0])
- if k64 != na_flist.n_buckets:
- na_count[0] += 1
- data[0] = NA
- data += 1
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
- data[0] = double_converter(word, &p_end, parser.decimal,
- parser.sci, parser.thousands,
- 1, &error, NULL)
- if error != 0 or p_end == word or p_end[0]:
- error = 0
- if (strcasecmp(word, cinf) == 0 or
- strcasecmp(word, cposinf) == 0 or
- strcasecmp(word, cinfty) == 0 or
- strcasecmp(word, cposinfty) == 0):
- data[0] = INF
- elif (strcasecmp(word, cneginf) == 0 or
- strcasecmp(word, cneginfty) == 0):
- data[0] = NEGINF
- else:
- return 1
- data += 1
-
- return 0
-
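The cinf/cinfty constants used above mean the float path accepts several spellings of infinity, case-insensitively. Through read_csv that should look like this:

import io
import pandas as pd

df = pd.read_csv(io.StringIO("x\ninf\n+Infinity\n-INF\n"))
print(df["x"].tolist())   # [inf, inf, -inf]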
-
-cdef _try_uint64(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset):
- cdef:
- int error
- Py_ssize_t lines
- coliter_t it
- uint64_t *data
- ndarray result
- uint_state state
-
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.uint64)
- data = <uint64_t *>result.data
-
- uint_state_init(&state)
- coliter_setup(&it, parser, col, line_start)
- with nogil:
- error = _try_uint64_nogil(parser, col, line_start, line_end,
- na_filter, na_hashset, data, &state)
- if error != 0:
- if error == ERROR_OVERFLOW:
- # Can't get the word variable
- raise OverflowError("Overflow")
- return None
-
- if uint64_conflict(&state):
- raise ValueError("Cannot convert to numerical dtype")
-
- if state.seen_sint:
- raise OverflowError("Overflow")
-
- return result
-
-
-cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
- int64_t line_start,
- int64_t line_end, bint na_filter,
- const kh_str_starts_t *na_hashset,
- uint64_t *data, uint_state *state) nogil:
- cdef:
- int error
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
-
- coliter_setup(&it, parser, col, line_start)
-
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- state.seen_null = 1
- data[i] = 0
- continue
-
- data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
- data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
-
- return 0
-
-
-cdef _try_int64(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset):
- cdef:
- int error, na_count = 0
- Py_ssize_t lines
- coliter_t it
- int64_t *data
- ndarray result
- int64_t NA = na_values[np.int64]
-
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.int64)
- data = <int64_t *>result.data
- coliter_setup(&it, parser, col, line_start)
- with nogil:
- error = _try_int64_nogil(parser, col, line_start, line_end,
- na_filter, na_hashset, NA, data, &na_count)
- if error != 0:
- if error == ERROR_OVERFLOW:
- # Can't get the word variable
- raise OverflowError("Overflow")
- return None, None
-
- return result, na_count
-
-
-cdef int _try_int64_nogil(parser_t *parser, int64_t col,
- int64_t line_start,
- int64_t line_end, bint na_filter,
- const kh_str_starts_t *na_hashset, int64_t NA,
- int64_t *data, int *na_count) nogil:
- cdef:
- int error
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
-
- na_count[0] = 0
- coliter_setup(&it, parser, col, line_start)
-
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count[0] += 1
- data[i] = NA
- continue
-
- data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
- data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
-
- return 0
-
-
-# -> tuple[ndarray[bool], int]
-cdef _try_bool_flex(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, const kh_str_starts_t *na_hashset,
- const kh_str_starts_t *true_hashset,
- const kh_str_starts_t *false_hashset):
- cdef:
- int error, na_count = 0
- Py_ssize_t lines
- uint8_t *data
- ndarray result
- uint8_t NA = na_values[np.bool_]
-
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.uint8)
- data = <uint8_t *>result.data
- with nogil:
- error = _try_bool_flex_nogil(parser, col, line_start, line_end,
- na_filter, na_hashset, true_hashset,
- false_hashset, NA, data, &na_count)
- if error != 0:
- return None, None
- return result.view(np.bool_), na_count
-
-
-cdef int _try_bool_flex_nogil(parser_t *parser, int64_t col,
- int64_t line_start,
- int64_t line_end, bint na_filter,
- const kh_str_starts_t *na_hashset,
- const kh_str_starts_t *true_hashset,
- const kh_str_starts_t *false_hashset,
- uint8_t NA, uint8_t *data,
- int *na_count) nogil:
- cdef:
- int error = 0
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
-
- na_count[0] = 0
- coliter_setup(&it, parser, col, line_start)
-
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
-
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count[0] += 1
- data[0] = NA
- data += 1
- continue
-
- if kh_get_str_starts_item(true_hashset, word):
- data[0] = 1
- data += 1
- continue
- if kh_get_str_starts_item(false_hashset, word):
- data[0] = 0
- data += 1
- continue
-
- error = to_boolean(word, data)
- if error != 0:
- return error
- data += 1
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
-
- if kh_get_str_starts_item(true_hashset, word):
- data[0] = 1
- data += 1
- continue
-
- if kh_get_str_starts_item(false_hashset, word):
- data[0] = 0
- data += 1
- continue
-
- error = to_boolean(word, data)
- if error != 0:
- return error
- data += 1
-
- return 0
-
-
-cdef kh_str_starts_t* kset_from_list(list values) except NULL:
- # caller takes responsibility for freeing the hash table
- cdef:
- Py_ssize_t i
- kh_str_starts_t *table
- int ret = 0
- object val
-
- table = kh_init_str_starts()
-
- for i in range(len(values)):
- val = values[i]
-
-        # None creeps in sometimes even though it should not be possible here
- if not isinstance(val, bytes):
- kh_destroy_str_starts(table)
- raise ValueError("Must be all encoded bytes")
-
- kh_put_str_starts_item(table, PyBytes_AsString(val), &ret)
-
- if table.table.n_buckets <= 128:
- # Resize the hash table to make it almost empty, this
- # reduces amount of hash collisions on lookup thus
- # "key not in table" case is faster.
- # Note that this trades table memory footprint for lookup speed.
- kh_resize_str_starts(table, table.table.n_buckets * 8)
-
- return table
-
-
-cdef kh_float64_t* kset_float64_from_list(values) except NULL:
- # caller takes responsibility for freeing the hash table
- cdef:
- kh_float64_t *table
- int ret = 0
- float64_t val
- object value
-
- table = kh_init_float64()
-
- for value in values:
- val = float(value)
-
- kh_put_float64(table, val, &ret)
-
- if table.n_buckets <= 128:
- # See reasoning in kset_from_list
- kh_resize_float64(table, table.n_buckets * 8)
- return table
-
-
-cdef raise_parser_error(object base, parser_t *parser):
- cdef:
- object old_exc
- object exc_type
- PyObject *type
- PyObject *value
- PyObject *traceback
-
- if PyErr_Occurred():
- PyErr_Fetch(&type, &value, &traceback)
- Py_XDECREF(traceback)
-
- if value != NULL:
- old_exc = <object>value
- Py_XDECREF(value)
-
- # PyErr_Fetch only returned the error message in *value,
- # so the Exception class must be extracted from *type.
- if isinstance(old_exc, str):
- if type != NULL:
- exc_type = <object>type
- else:
- exc_type = ParserError
-
- Py_XDECREF(type)
- raise exc_type(old_exc)
- else:
- Py_XDECREF(type)
- raise old_exc
-
- message = f"{base}. C error: "
- if parser.error_msg != NULL:
- message += parser.error_msg.decode("utf-8")
- else:
- message += "no error message set"
-
- raise ParserError(message)
-
-
-# ----------------------------------------------------------------------
-# NA values
-def _compute_na_values():
- int64info = np.iinfo(np.int64)
- int32info = np.iinfo(np.int32)
- int16info = np.iinfo(np.int16)
- int8info = np.iinfo(np.int8)
- uint64info = np.iinfo(np.uint64)
- uint32info = np.iinfo(np.uint32)
- uint16info = np.iinfo(np.uint16)
- uint8info = np.iinfo(np.uint8)
- na_values = {
- np.float32: np.nan,
- np.float64: np.nan,
- np.int64: int64info.min,
- np.int32: int32info.min,
- np.int16: int16info.min,
- np.int8: int8info.min,
- np.uint64: uint64info.max,
- np.uint32: uint32info.max,
- np.uint16: uint16info.max,
- np.uint8: uint8info.max,
- np.bool_: uint8info.max,
- np.object_: np.nan,
- }
- return na_values
-
-
-na_values = _compute_na_values()
-
-for k in list(na_values):
- na_values[np.dtype(k)] = na_values[k]
-
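The sentinel table built by _compute_na_values uses extreme values as per-dtype NA placeholders: the minimum for signed integers, the maximum for unsigned integers and bools, NaN for floats and objects. Spelled out with numpy:

import numpy as np

print(np.iinfo(np.int64).min)   # -9223372036854775808 -> NA marker for int64
print(np.iinfo(np.uint8).max)   # 255                  -> NA marker for uint8 and bool_
print(np.nan)                   # nan                  -> NA marker for floats and object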
-
-# -> ArrayLike
-cdef _apply_converter(object f, parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end):
- cdef:
- Py_ssize_t i, lines
- coliter_t it
- const char *word = NULL
- ndarray[object] result
- object val
-
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.object_)
-
- coliter_setup(&it, parser, col, line_start)
-
- for i in range(lines):
- COLITER_NEXT(it, word)
- val = PyUnicode_FromString(word)
- result[i] = f(val)
-
- return lib.maybe_convert_objects(result)
-
-
-cdef list _maybe_encode(list values):
- if values is None:
- return []
- return [x.encode("utf-8") if isinstance(x, str) else x for x in values]
-
-
-def sanitize_objects(ndarray[object] values, set na_values) -> int:
- """
-    Convert values contained in the given set na_values to np.nan, in place.
-
- Parameters
- ----------
- values : ndarray[object]
- na_values : set
-
- Returns
- -------
- na_count : int
- """
- cdef:
- Py_ssize_t i, n
- object val, onan
- Py_ssize_t na_count = 0
- dict memo = {}
-
- n = len(values)
- onan = np.nan
-
- for i in range(n):
- val = values[i]
- if val in na_values:
- values[i] = onan
- na_count += 1
- elif val in memo:
- values[i] = memo[val]
- else:
- memo[val] = val
-
- return na_count
diff --git a/contrib/python/pandas/py3/pandas/_libs/properties.pyi b/contrib/python/pandas/py3/pandas/_libs/properties.pyi
deleted file mode 100644
index aaa44a0cf47..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/properties.pyi
+++ /dev/null
@@ -1,27 +0,0 @@
-from typing import (
- Sequence,
- overload,
-)
-
-from pandas._typing import (
- AnyArrayLike,
- DataFrame,
- Index,
- Series,
-)
-
-# note: this is a lie to make type checkers happy (they special
-# case property). cache_readonly uses attribute names similar to
-# property (fget) but it does not provide fset and fdel.
-cache_readonly = property
-
-class AxisProperty:
- axis: int
- def __init__(self, axis: int = ..., doc: str = ...) -> None: ...
- @overload
- def __get__(self, obj: DataFrame | Series, type) -> Index: ...
- @overload
- def __get__(self, obj: None, type) -> AxisProperty: ...
- def __set__(
- self, obj: DataFrame | Series, value: AnyArrayLike | Sequence
- ) -> None: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/properties.pyx b/contrib/python/pandas/py3/pandas/_libs/properties.pyx
deleted file mode 100644
index 33cd2ef27a9..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/properties.pyx
+++ /dev/null
@@ -1,69 +0,0 @@
-from cpython.dict cimport (
- PyDict_Contains,
- PyDict_GetItem,
- PyDict_SetItem,
-)
-from cython cimport Py_ssize_t
-
-
-cdef class CachedProperty:
-
- cdef readonly:
- object fget, name, __doc__
-
- def __init__(self, fget):
- self.fget = fget
- self.name = fget.__name__
- self.__doc__ = getattr(fget, "__doc__", None)
-
- def __get__(self, obj, typ):
- if obj is None:
- # accessed on the class, not the instance
- return self
-
- # Get the cache or set a default one if needed
- cache = getattr(obj, "_cache", None)
- if cache is None:
- try:
- cache = obj._cache = {}
-            except AttributeError:
- return self
-
- if PyDict_Contains(cache, self.name):
- # not necessary to Py_INCREF
- val = <object>PyDict_GetItem(cache, self.name)
- else:
- val = self.fget(obj)
- PyDict_SetItem(cache, self.name, val)
- return val
-
- def __set__(self, obj, value):
- raise AttributeError("Can't set attribute")
-
-
-cache_readonly = CachedProperty
-
-
-cdef class AxisProperty:
-
- cdef readonly:
- Py_ssize_t axis
- object __doc__
-
- def __init__(self, axis=0, doc=""):
- self.axis = axis
- self.__doc__ = doc
-
- def __get__(self, obj, type):
- cdef:
- list axes
-
- if obj is None:
- # Only instances have _mgr, not classes
- return self
- else:
- axes = obj._mgr.axes
- return axes[self.axis]
-
- def __set__(self, obj, value):
- obj._set_axis(self.axis, value)
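A pure-Python equivalent of the CachedProperty descriptor above (the real one is a cdef class for speed): compute the value once, then serve it from the instance's _cache dict. The Thing class is only an illustration, not part of pandas.

class cached_readonly:
    def __init__(self, fget):
        self.fget = fget
        self.name = fget.__name__
        self.__doc__ = getattr(fget, "__doc__", None)

    def __get__(self, obj, typ=None):
        if obj is None:
            return self                      # accessed on the class itself
        cache = getattr(obj, "_cache", None)
        if cache is None:
            try:
                cache = obj._cache = {}
            except AttributeError:           # e.g. __slots__ without _cache
                return self
        if self.name not in cache:
            cache[self.name] = self.fget(obj)
        return cache[self.name]

    def __set__(self, obj, value):
        raise AttributeError("Can't set attribute")

class Thing:
    @cached_readonly
    def answer(self):
        print("computing")
        return 42

t = Thing()
t.answer   # prints "computing" once
t.answer   # second access comes from t._cache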
diff --git a/contrib/python/pandas/py3/pandas/_libs/reduction.pyi b/contrib/python/pandas/py3/pandas/_libs/reduction.pyi
deleted file mode 100644
index 525546f26c8..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/reduction.pyi
+++ /dev/null
@@ -1,6 +0,0 @@
-from typing import Any
-
-from pandas._typing import DtypeObj
-
-def check_result_array(obj: object, dtype: DtypeObj) -> None: ...
-def extract_result(res: object) -> Any: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/reduction.pyx b/contrib/python/pandas/py3/pandas/_libs/reduction.pyx
deleted file mode 100644
index 7ff0842678d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/reduction.pyx
+++ /dev/null
@@ -1,33 +0,0 @@
-import numpy as np
-
-cimport numpy as cnp
-
-cnp.import_array()
-
-from pandas._libs.util cimport is_array
-
-
-cdef cnp.dtype _dtype_obj = np.dtype("object")
-
-
-cpdef check_result_array(object obj, object dtype):
- # Our operation is supposed to be an aggregation/reduction. If
- # it returns an ndarray, this likely means an invalid operation has
- # been passed. See test_apply_without_aggregation, test_agg_must_agg
- if is_array(obj):
- if dtype != _dtype_obj:
- # If it is object dtype, the function can be a reduction/aggregation
- # and still return an ndarray e.g. test_agg_over_numpy_arrays
- raise ValueError("Must produce aggregated value")
-
-
-cpdef inline extract_result(object res):
-    """ Extract the result object; it might be a 0-dim ndarray,
-        a length-1 1-dim ndarray, or a scalar """
- if hasattr(res, "_values"):
- # Preserve EA
- res = res._values
- if res.ndim == 1 and len(res) == 1:
- # see test_agg_lambda_with_timezone, test_resampler_grouper.py::test_apply
- res = res[0]
- return res
diff --git a/contrib/python/pandas/py3/pandas/_libs/reshape.pyi b/contrib/python/pandas/py3/pandas/_libs/reshape.pyi
deleted file mode 100644
index 110687fcd0c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/reshape.pyi
+++ /dev/null
@@ -1,16 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-def unstack(
- values: np.ndarray, # reshape_t[:, :]
- mask: np.ndarray, # const uint8_t[:]
- stride: int,
- length: int,
- width: int,
- new_values: np.ndarray, # reshape_t[:, :]
- new_mask: np.ndarray, # uint8_t[:, :]
-) -> None: ...
-def explode(
- values: npt.NDArray[np.object_],
-) -> tuple[npt.NDArray[np.object_], npt.NDArray[np.int64]]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/reshape.pyx b/contrib/python/pandas/py3/pandas/_libs/reshape.pyx
deleted file mode 100644
index 946ba5ddaa2..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/reshape.pyx
+++ /dev/null
@@ -1,138 +0,0 @@
-cimport cython
-from cython cimport Py_ssize_t
-from numpy cimport (
- int64_t,
- ndarray,
- uint8_t,
-)
-
-import numpy as np
-
-cimport numpy as cnp
-
-cnp.import_array()
-
-from pandas._libs.dtypes cimport numeric_object_t
-from pandas._libs.lib cimport c_is_list_like
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def unstack(numeric_object_t[:, :] values, const uint8_t[:] mask,
- Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width,
- numeric_object_t[:, :] new_values, uint8_t[:, :] new_mask) -> None:
- """
- Transform long values to wide new_values.
-
- Parameters
- ----------
- values : typed ndarray
- mask : np.ndarray[bool]
- stride : int
- length : int
- width : int
-    new_values : typed ndarray (same dtype as values)
-        result array
- new_mask : np.ndarray[bool]
- result mask
- """
- cdef:
- Py_ssize_t i, j, w, nulls, s, offset
-
- if numeric_object_t is not object:
- # evaluated at compile-time
- with nogil:
- for i in range(stride):
-
- nulls = 0
- for j in range(length):
-
- for w in range(width):
-
- offset = j * width + w
-
- if mask[offset]:
- s = i * width + w
- new_values[j, s] = values[offset - nulls, i]
- new_mask[j, s] = 1
- else:
- nulls += 1
-
- else:
- # object-dtype, identical to above but we cannot use nogil
- for i in range(stride):
-
- nulls = 0
- for j in range(length):
-
- for w in range(width):
-
- offset = j * width + w
-
- if mask[offset]:
- s = i * width + w
- new_values[j, s] = values[offset - nulls, i]
- new_mask[j, s] = 1
- else:
- nulls += 1
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def explode(ndarray[object] values):
- """
-    Transform an array of list-likes to long form,
-    preserving non-list entries as-is.
-
- Parameters
- ----------
- values : ndarray[object]
-
- Returns
- -------
- ndarray[object]
- result
- ndarray[int64_t]
- counts
- """
- cdef:
- Py_ssize_t i, j, count, n
- object v
- ndarray[object] result
- ndarray[int64_t] counts
-
- # find the resulting len
- n = len(values)
- counts = np.zeros(n, dtype="int64")
- for i in range(n):
- v = values[i]
-
- if c_is_list_like(v, True):
- if len(v):
- counts[i] += len(v)
- else:
- # empty list-like, use a nan marker
- counts[i] += 1
- else:
- counts[i] += 1
-
- result = np.empty(counts.sum(), dtype="object")
- count = 0
- for i in range(n):
- v = values[i]
-
- if c_is_list_like(v, True):
- if len(v):
- v = list(v)
- for j in range(len(v)):
- result[count] = v[j]
- count += 1
- else:
- # empty list-like, use a nan marker
- result[count] = np.nan
- count += 1
- else:
- # replace with the existing scalar
- result[count] = v
- count += 1
- return result, counts
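explode() above backs the public Series.explode: empty list-likes become the NaN marker, scalars pass through unchanged, and the index is repeated according to the returned counts. A quick example:

import pandas as pd

s = pd.Series([[1, 2], [], 3])
print(s.explode())
# 0      1
# 0      2
# 1    NaN
# 2      3
# dtype: object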
diff --git a/contrib/python/pandas/py3/pandas/_libs/sparse.pyi b/contrib/python/pandas/py3/pandas/_libs/sparse.pyi
deleted file mode 100644
index 8c3989b818a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/sparse.pyi
+++ /dev/null
@@ -1,49 +0,0 @@
-from typing import (
- Sequence,
- TypeVar,
-)
-
-import numpy as np
-
-from pandas._typing import npt
-
-_SparseIndexT = TypeVar("_SparseIndexT", bound=SparseIndex)
-
-class SparseIndex:
- length: int
- npoints: int
- def __init__(self) -> None: ...
- @property
- def ngaps(self) -> int: ...
- @property
- def nbytes(self) -> int: ...
- @property
- def indices(self) -> npt.NDArray[np.int32]: ...
- def equals(self, other) -> bool: ...
- def lookup(self, index: int) -> np.int32: ...
- def lookup_array(self, indexer: npt.NDArray[np.int32]) -> npt.NDArray[np.int32]: ...
- def to_int_index(self) -> IntIndex: ...
- def to_block_index(self) -> BlockIndex: ...
- def intersect(self: _SparseIndexT, y_: SparseIndex) -> _SparseIndexT: ...
- def make_union(self: _SparseIndexT, y_: SparseIndex) -> _SparseIndexT: ...
-
-class IntIndex(SparseIndex):
- indices: npt.NDArray[np.int32]
- def __init__(
- self, length: int, indices: Sequence[int], check_integrity: bool = ...
- ) -> None: ...
-
-class BlockIndex(SparseIndex):
- nblocks: int
- blocs: np.ndarray
- blengths: np.ndarray
- def __init__(
- self, length: int, blocs: np.ndarray, blengths: np.ndarray
- ) -> None: ...
-
-def make_mask_object_ndarray(
- arr: npt.NDArray[np.object_], fill_value
-) -> npt.NDArray[np.bool_]: ...
-def get_blocks(
- indices: npt.NDArray[np.int32],
-) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.int32]]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/sparse.pyx b/contrib/python/pandas/py3/pandas/_libs/sparse.pyx
deleted file mode 100644
index 74f7653ebbe..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/sparse.pyx
+++ /dev/null
@@ -1,733 +0,0 @@
-cimport cython
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- float64_t,
- int8_t,
- int32_t,
- int64_t,
- ndarray,
- uint8_t,
-)
-
-cnp.import_array()
-
-
-# -----------------------------------------------------------------------------
-# Preamble stuff
-
-cdef float64_t NaN = <float64_t>np.NaN
-cdef float64_t INF = <float64_t>np.inf
-
-# -----------------------------------------------------------------------------
-
-
-cdef class SparseIndex:
- """
- Abstract superclass for sparse index types.
- """
-
- def __init__(self):
- raise NotImplementedError
-
-
-cdef class IntIndex(SparseIndex):
- """
- Object for holding exact integer sparse indexing information
-
- Parameters
- ----------
- length : integer
- indices : array-like
- Contains integers corresponding to the indices.
- check_integrity : bool, default=True
- Check integrity of the input.
- """
-
- cdef readonly:
- Py_ssize_t length, npoints
- ndarray indices
-
- def __init__(self, Py_ssize_t length, indices, bint check_integrity=True):
- self.length = length
- self.indices = np.ascontiguousarray(indices, dtype=np.int32)
- self.npoints = len(self.indices)
-
- if check_integrity:
- self.check_integrity()
-
- def __reduce__(self):
- args = (self.length, self.indices)
- return IntIndex, args
-
- def __repr__(self) -> str:
- output = "IntIndex\n"
- output += f"Indices: {repr(self.indices)}\n"
- return output
-
- @property
- def nbytes(self) -> int:
- return self.indices.nbytes
-
- cdef check_integrity(self):
- """
- Checks the following:
-
- - Indices are strictly ascending
- - Number of indices is at most self.length
- - Indices are at least 0 and at most the total length less one
-
- A ValueError is raised if any of these conditions is violated.
- """
-
- if self.npoints > self.length:
- raise ValueError(
- f"Too many indices. Expected {self.length} but found {self.npoints}"
- )
-
- # Indices are vacuously ordered and non-negative
- # if the sequence of indices is empty.
- if self.npoints == 0:
- return
-
- if self.indices.min() < 0:
- raise ValueError("No index can be less than zero")
-
- if self.indices.max() >= self.length:
- raise ValueError("All indices must be less than the length")
-
- monotonic = np.all(self.indices[:-1] < self.indices[1:])
- if not monotonic:
- raise ValueError("Indices must be strictly increasing")
-
- def equals(self, other: object) -> bool:
- if not isinstance(other, IntIndex):
- return False
-
- if self is other:
- return True
-
- same_length = self.length == other.length
- same_indices = np.array_equal(self.indices, other.indices)
- return same_length and same_indices
-
- @property
- def ngaps(self) -> int:
- return self.length - self.npoints
-
- cpdef to_int_index(self):
- return self
-
- def to_block_index(self):
- locs, lens = get_blocks(self.indices)
- return BlockIndex(self.length, locs, lens)
-
- cpdef IntIndex intersect(self, SparseIndex y_):
- cdef:
- Py_ssize_t xi, yi = 0, result_indexer = 0
- int32_t xind
- ndarray[int32_t, ndim=1] xindices, yindices, new_indices
- IntIndex y
-
- # if is one already, returns self
- y = y_.to_int_index()
-
- if self.length != y.length:
- raise Exception("Indices must reference same underlying length")
-
- xindices = self.indices
- yindices = y.indices
- new_indices = np.empty(min(
- len(xindices), len(yindices)), dtype=np.int32)
-
- for xi in range(self.npoints):
- xind = xindices[xi]
-
- while yi < y.npoints and yindices[yi] < xind:
- yi += 1
-
- if yi >= y.npoints:
- break
-
- # TODO: would a two-pass algorithm be faster?
- if yindices[yi] == xind:
- new_indices[result_indexer] = xind
- result_indexer += 1
-
- new_indices = new_indices[:result_indexer]
- return IntIndex(self.length, new_indices)
-
- cpdef IntIndex make_union(self, SparseIndex y_):
-
- cdef:
- ndarray[int32_t, ndim=1] new_indices
- IntIndex y
-
- # if is one already, returns self
- y = y_.to_int_index()
-
- if self.length != y.length:
- raise ValueError("Indices must reference same underlying length")
-
- new_indices = np.union1d(self.indices, y.indices)
- return IntIndex(self.length, new_indices)
-
- @cython.wraparound(False)
- cpdef int32_t lookup(self, Py_ssize_t index):
- """
- Return the internal location if value exists on given index.
- Return -1 otherwise.
- """
- cdef:
- int32_t res
- ndarray[int32_t, ndim=1] inds
-
- inds = self.indices
- if self.npoints == 0:
- return -1
- elif index < 0 or self.length <= index:
- return -1
-
- res = inds.searchsorted(index)
- if res == self.npoints:
- return -1
- elif inds[res] == index:
- return res
- else:
- return -1
-
- @cython.wraparound(False)
- cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer):
- """
- Vectorized lookup, returns ndarray[int32_t]
- """
- cdef:
- Py_ssize_t n
- ndarray[int32_t, ndim=1] inds
- ndarray[uint8_t, ndim=1, cast=True] mask
- ndarray[int32_t, ndim=1] masked
- ndarray[int32_t, ndim=1] res
- ndarray[int32_t, ndim=1] results
-
- n = len(indexer)
- results = np.empty(n, dtype=np.int32)
- results[:] = -1
-
- if self.npoints == 0:
- return results
-
- inds = self.indices
- mask = (inds[0] <= indexer) & (indexer <= inds[len(inds) - 1])
-
- masked = indexer[mask]
- res = inds.searchsorted(masked).astype(np.int32)
-
- res[inds[res] != masked] = -1
- results[mask] = res
- return results
-
-
-cpdef get_blocks(ndarray[int32_t, ndim=1] indices):
- cdef:
- Py_ssize_t i, npoints, result_indexer = 0
- int32_t block, length = 1, cur, prev
- ndarray[int32_t, ndim=1] locs, lens
-
- npoints = len(indices)
-
- # just handle the special empty case separately
- if npoints == 0:
- return np.array([], dtype=np.int32), np.array([], dtype=np.int32)
-
- # block size can't be longer than npoints
- locs = np.empty(npoints, dtype=np.int32)
- lens = np.empty(npoints, dtype=np.int32)
-
- # TODO: two-pass algorithm faster?
- prev = block = indices[0]
- for i in range(1, npoints):
- cur = indices[i]
- if cur - prev > 1:
- # new block
- locs[result_indexer] = block
- lens[result_indexer] = length
- block = cur
- length = 1
- result_indexer += 1
- else:
- # same block, increment length
- length += 1
-
- prev = cur
-
- locs[result_indexer] = block
- lens[result_indexer] = length
- result_indexer += 1
- locs = locs[:result_indexer]
- lens = lens[:result_indexer]
- return locs, lens
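get_blocks() compresses a strictly increasing index array into (block start, block length) pairs; a pure-Python sketch of the same loop, for orientation only:

def get_blocks(indices):
    if not indices:
        return [], []
    locs, lens = [indices[0]], [1]
    for prev, cur in zip(indices, indices[1:]):
        if cur - prev > 1:
            locs.append(cur)     # gap -> start a new block
            lens.append(1)
        else:
            lens[-1] += 1        # contiguous -> extend the current block
    return locs, lens

print(get_blocks([0, 1, 2, 7, 8, 12]))   # ([0, 7, 12], [3, 2, 1])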
-
-
-# -----------------------------------------------------------------------------
-# BlockIndex
-
-cdef class BlockIndex(SparseIndex):
- """
- Object for holding block-based sparse indexing information
-
-    Parameters
-    ----------
-    length : int
-        Total length of the underlying (dense) array.
-    blocs : array-like
-        Starting positions of the blocks.
-    blengths : array-like
-        Lengths of the blocks.
-    """
- cdef readonly:
- int32_t nblocks, npoints, length
- ndarray blocs, blengths
-
- cdef:
- object __weakref__ # need to be picklable
- int32_t *locbuf
- int32_t *lenbuf
-
- def __init__(self, length, blocs, blengths):
-
- self.blocs = np.ascontiguousarray(blocs, dtype=np.int32)
- self.blengths = np.ascontiguousarray(blengths, dtype=np.int32)
-
-        # cache raw int32 pointers for C-level access below
- self.locbuf = <int32_t*>self.blocs.data
- self.lenbuf = <int32_t*>self.blengths.data
-
- self.length = length
- self.nblocks = np.int32(len(self.blocs))
- self.npoints = self.blengths.sum()
-
- self.check_integrity()
-
- def __reduce__(self):
- args = (self.length, self.blocs, self.blengths)
- return BlockIndex, args
-
- def __repr__(self) -> str:
- output = "BlockIndex\n"
- output += f"Block locations: {repr(self.blocs)}\n"
- output += f"Block lengths: {repr(self.blengths)}"
-
- return output
-
- @property
- def nbytes(self) -> int:
- return self.blocs.nbytes + self.blengths.nbytes
-
- @property
- def ngaps(self) -> int:
- return self.length - self.npoints
-
- cdef check_integrity(self):
- """
- Check:
- - Locations are in ascending order
- - No overlapping blocks
-        - Blocks do not start after the end of the index, nor extend beyond it
- """
- cdef:
- Py_ssize_t i
- ndarray[int32_t, ndim=1] blocs, blengths
-
- blocs = self.blocs
- blengths = self.blengths
-
- if len(blocs) != len(blengths):
- raise ValueError("block bound arrays must be same length")
-
- for i in range(self.nblocks):
- if i > 0:
- if blocs[i] <= blocs[i - 1]:
- raise ValueError("Locations not in ascending order")
-
- if i < self.nblocks - 1:
- if blocs[i] + blengths[i] > blocs[i + 1]:
- raise ValueError(f"Block {i} overlaps")
- else:
- if blocs[i] + blengths[i] > self.length:
- raise ValueError(f"Block {i} extends beyond end")
-
- # no zero-length blocks
- if blengths[i] == 0:
- raise ValueError(f"Zero-length block {i}")
-
- def equals(self, other: object) -> bool:
- if not isinstance(other, BlockIndex):
- return False
-
- if self is other:
- return True
-
- same_length = self.length == other.length
- same_blocks = (np.array_equal(self.blocs, other.blocs) and
- np.array_equal(self.blengths, other.blengths))
- return same_length and same_blocks
-
- def to_block_index(self):
- return self
-
- cpdef to_int_index(self):
- cdef:
- int32_t i = 0, j, b
- int32_t offset
- ndarray[int32_t, ndim=1] indices
-
- indices = np.empty(self.npoints, dtype=np.int32)
-
- for b in range(self.nblocks):
- offset = self.locbuf[b]
-
- for j in range(self.lenbuf[b]):
- indices[i] = offset + j
- i += 1
-
- return IntIndex(self.length, indices)
-
- @property
- def indices(self):
- return self.to_int_index().indices
-
- cpdef BlockIndex intersect(self, SparseIndex other):
- """
- Intersect two BlockIndex objects
-
- Returns
- -------
- BlockIndex
- """
- cdef:
- BlockIndex y
- ndarray[int32_t, ndim=1] xloc, xlen, yloc, ylen, out_bloc, out_blen
- Py_ssize_t xi = 0, yi = 0, max_len, result_indexer = 0
- int32_t cur_loc, cur_length, diff
-
- y = other.to_block_index()
-
- if self.length != y.length:
- raise Exception("Indices must reference same underlying length")
-
- xloc = self.blocs
- xlen = self.blengths
- yloc = y.blocs
- ylen = y.blengths
-
- # block may be split, but can't exceed original len / 2 + 1
- max_len = min(self.length, y.length) // 2 + 1
- out_bloc = np.empty(max_len, dtype=np.int32)
- out_blen = np.empty(max_len, dtype=np.int32)
-
- while True:
- # we are done (or possibly never began)
- if xi >= self.nblocks or yi >= y.nblocks:
- break
-
- # completely symmetric...would like to avoid code dup but oh well
- if xloc[xi] >= yloc[yi]:
- cur_loc = xloc[xi]
- diff = xloc[xi] - yloc[yi]
-
- if ylen[yi] <= diff:
- # have to skip this block
- yi += 1
- continue
-
- if ylen[yi] - diff < xlen[xi]:
- # take end of y block, move onward
- cur_length = ylen[yi] - diff
- yi += 1
- else:
- # take end of x block
- cur_length = xlen[xi]
- xi += 1
-
- else: # xloc[xi] < yloc[yi]
- cur_loc = yloc[yi]
- diff = yloc[yi] - xloc[xi]
-
- if xlen[xi] <= diff:
- # have to skip this block
- xi += 1
- continue
-
- if xlen[xi] - diff < ylen[yi]:
- # take end of x block, move onward
- cur_length = xlen[xi] - diff
- xi += 1
- else:
- # take end of y block
- cur_length = ylen[yi]
- yi += 1
-
- out_bloc[result_indexer] = cur_loc
- out_blen[result_indexer] = cur_length
- result_indexer += 1
-
- out_bloc = out_bloc[:result_indexer]
- out_blen = out_blen[:result_indexer]
-
- return BlockIndex(self.length, out_bloc, out_blen)
-
- cpdef BlockIndex make_union(self, SparseIndex y):
- """
-        Combine two BlockIndex objects, accepting indices contained
-        in either one.
-
- Parameters
- ----------
-        y : SparseIndex
-
- Notes
- -----
- union is a protected keyword in Cython, hence make_union
-
- Returns
- -------
- BlockIndex
- """
- return BlockUnion(self, y.to_block_index()).result
-
- cpdef Py_ssize_t lookup(self, Py_ssize_t index):
- """
- Return the internal location if value exists on given index.
- Return -1 otherwise.
- """
- cdef:
- Py_ssize_t i, cum_len
- ndarray[int32_t, ndim=1] locs, lens
-
- locs = self.blocs
- lens = self.blengths
-
- if self.nblocks == 0:
- return -1
- elif index < locs[0]:
- return -1
-
- cum_len = 0
- for i in range(self.nblocks):
- if index >= locs[i] and index < locs[i] + lens[i]:
- return cum_len + index - locs[i]
- cum_len += lens[i]
-
- return -1
-
- @cython.wraparound(False)
- cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer):
- """
- Vectorized lookup, returns ndarray[int32_t]
- """
- cdef:
- Py_ssize_t n, i, j, ind_val
- ndarray[int32_t, ndim=1] locs, lens
- ndarray[int32_t, ndim=1] results
-
- locs = self.blocs
- lens = self.blengths
-
- n = len(indexer)
- results = np.empty(n, dtype=np.int32)
- results[:] = -1
-
- if self.npoints == 0:
- return results
-
- for i in range(n):
- ind_val = indexer[i]
- if not (ind_val < 0 or self.length <= ind_val):
- cum_len = 0
- for j in range(self.nblocks):
- if ind_val >= locs[j] and ind_val < locs[j] + lens[j]:
- results[i] = cum_len + ind_val - locs[j]
- cum_len += lens[j]
- return results
-
-
-@cython.internal
-cdef class BlockMerge:
- """
- Object-oriented approach makes sharing state between recursive functions a
- lot easier and reduces code duplication
- """
- cdef:
- BlockIndex x, y, result
- ndarray xstart, xlen, xend, ystart, ylen, yend
- int32_t xi, yi # block indices
-
- def __init__(self, BlockIndex x, BlockIndex y):
- self.x = x
- self.y = y
-
- if x.length != y.length:
- raise Exception("Indices must reference same underlying length")
-
- self.xstart = self.x.blocs
- self.ystart = self.y.blocs
-
- self.xend = self.x.blocs + self.x.blengths
- self.yend = self.y.blocs + self.y.blengths
-
- # self.xlen = self.x.blengths
- # self.ylen = self.y.blengths
-
- self.xi = 0
- self.yi = 0
-
- self.result = self._make_merged_blocks()
-
- cdef _make_merged_blocks(self):
- raise NotImplementedError
-
- cdef _set_current_indices(self, int32_t xi, int32_t yi, bint mode):
- if mode == 0:
- self.xi = xi
- self.yi = yi
- else:
- self.xi = yi
- self.yi = xi
-
-
-@cython.internal
-cdef class BlockUnion(BlockMerge):
- """
- Object-oriented approach makes sharing state between recursive functions a
- lot easier and reduces code duplication
- """
-
- cdef _make_merged_blocks(self):
- cdef:
- ndarray[int32_t, ndim=1] xstart, xend, ystart
- ndarray[int32_t, ndim=1] yend, out_bloc, out_blen
- int32_t nstart, nend
- Py_ssize_t max_len, result_indexer = 0
-
- xstart = self.xstart
- xend = self.xend
- ystart = self.ystart
- yend = self.yend
-
- max_len = min(self.x.length, self.y.length) // 2 + 1
- out_bloc = np.empty(max_len, dtype=np.int32)
- out_blen = np.empty(max_len, dtype=np.int32)
-
- while True:
- # we are done (or possibly never began)
- if self.xi >= self.x.nblocks and self.yi >= self.y.nblocks:
- break
- elif self.yi >= self.y.nblocks:
- # through with y, just pass through x blocks
- nstart = xstart[self.xi]
- nend = xend[self.xi]
- self.xi += 1
- elif self.xi >= self.x.nblocks:
- # through with x, just pass through y blocks
- nstart = ystart[self.yi]
- nend = yend[self.yi]
- self.yi += 1
- else:
- # find end of new block
- if xstart[self.xi] < ystart[self.yi]:
- nstart = xstart[self.xi]
- nend = self._find_next_block_end(0)
- else:
- nstart = ystart[self.yi]
- nend = self._find_next_block_end(1)
-
- out_bloc[result_indexer] = nstart
- out_blen[result_indexer] = nend - nstart
- result_indexer += 1
-
- out_bloc = out_bloc[:result_indexer]
- out_blen = out_blen[:result_indexer]
-
- return BlockIndex(self.x.length, out_bloc, out_blen)
-
- cdef int32_t _find_next_block_end(self, bint mode) except -1:
- """
- Wow, this got complicated in a hurry
-
- mode 0: block started in index x
- mode 1: block started in index y
- """
- cdef:
- ndarray[int32_t, ndim=1] xstart, xend, ystart, yend
- int32_t xi, yi, ynblocks, nend
-
- if mode != 0 and mode != 1:
- raise Exception("Mode must be 0 or 1")
-
- # so symmetric code will work
- if mode == 0:
- xstart = self.xstart
- xend = self.xend
- xi = self.xi
-
- ystart = self.ystart
- yend = self.yend
- yi = self.yi
- ynblocks = self.y.nblocks
- else:
- xstart = self.ystart
- xend = self.yend
- xi = self.yi
-
- ystart = self.xstart
- yend = self.xend
- yi = self.xi
- ynblocks = self.x.nblocks
-
- nend = xend[xi]
-
- # done with y?
- if yi == ynblocks:
- self._set_current_indices(xi + 1, yi, mode)
- return nend
- elif nend < ystart[yi]:
- # block ends before y block
- self._set_current_indices(xi + 1, yi, mode)
- return nend
- else:
- while yi < ynblocks and nend > yend[yi]:
- yi += 1
-
- self._set_current_indices(xi + 1, yi, mode)
-
- if yi == ynblocks:
- return nend
-
- if nend < ystart[yi]:
- # we're done, return the block end
- return nend
- else:
-                # merge blocks, continue searching; this also catches the
-                # case where the merged block keeps running into further
-                # blocks of the other index
- return self._find_next_block_end(1 - mode)
-
-
-# -----------------------------------------------------------------------------
-# Sparse arithmetic
-
-include "sparse_op_helper.pxi"
-
-
-# -----------------------------------------------------------------------------
-# SparseArray mask create operations
-
-def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value):
- cdef:
- object value
- Py_ssize_t i
- Py_ssize_t new_length = len(arr)
- ndarray[int8_t, ndim=1] mask
-
- mask = np.ones(new_length, dtype=np.int8)
-
- for i in range(new_length):
- value = arr[i]
- if value == fill_value and type(value) == type(fill_value):
- mask[i] = 0
-
- return mask.view(dtype=bool)
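The IntIndex/BlockIndex machinery in this file is what backs the public SparseArray: only positions whose value differs from fill_value are stored, addressed through one of these index objects. A small check, assuming a pandas build with this module in place:

import pandas as pd

arr = pd.arrays.SparseArray([0, 0, 1, 2, 0, 3], fill_value=0)
print(arr.sp_values)   # [1 2 3]            -- only the non-fill values
print(arr.sp_index)    # IntIndex over positions [2, 3, 5]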
diff --git a/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi b/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi
deleted file mode 100644
index 76da4eec502..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi
+++ /dev/null
@@ -1,5979 +0,0 @@
-"""
-Template for each `dtype` helper function for sparse ops
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# Sparse op
-# ----------------------------------------------------------------------
-
-ctypedef fused sparse_t:
- float64_t
- int64_t
-
-
-cdef float64_t __div__(sparse_t a, sparse_t b):
- if b == 0:
- if a > 0:
- return INF
- elif a < 0:
- return -INF
- else:
- return NaN
- else:
- return float(a) / b
-
-
-cdef float64_t __truediv__(sparse_t a, sparse_t b):
- return __div__(a, b)
-
-
-cdef sparse_t __mod__(sparse_t a, sparse_t b):
- if b == 0:
- if sparse_t is float64_t:
- return NaN
- else:
- return 0
- else:
- return a % b
-
-
-cdef sparse_t __floordiv__(sparse_t a, sparse_t b):
- if b == 0:
- if sparse_t is float64_t:
- # Match non-sparse Series behavior implemented in mask_zero_div_zero
- if a > 0:
- return INF
- elif a < 0:
- return -INF
- return NaN
- else:
- return 0
- else:
- return a // b
-
-
-# ----------------------------------------------------------------------
-# sparse array op
-# ----------------------------------------------------------------------
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_add_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] + y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill + yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_add_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] + y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
-
- return out, out_index, xfill + yfill
-
-
-cpdef sparse_add_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_add_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_add_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
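The IntIndex kernel above is, at heart, a two-pointer merge over the two sorted index arrays: a position stored in only one operand combines that operand's value with the other operand's fill value, and the result's own fill value is the operation applied to the two fills. A simplified plain-Python port of that walk (hypothetical helper, for illustration; it builds the union index eagerly instead of calling make_union):

    def int_op_add_sketch(x_vals, x_idx, xfill, y_vals, y_idx, yfill):
        out_idx = sorted(set(x_idx) | set(y_idx))   # union of stored positions
        out, xi, yi = [], 0, 0
        for loc in out_idx:
            in_x = xi < len(x_idx) and x_idx[xi] == loc
            in_y = yi < len(y_idx) and y_idx[yi] == loc
            xv = x_vals[xi] if in_x else xfill      # missing on one side -> use its fill
            yv = y_vals[yi] if in_y else yfill
            out.append(xv + yv)
            xi += in_x
            yi += in_y
        return out, out_idx, xfill + yfill          # result fill = op applied to the fills

    # x = [1.0, 2.0] stored at positions [0, 3] with fill 0.0,
    # y = [10.0]     stored at position  [3]    with fill 5.0
    print(int_op_add_sketch([1.0, 2.0], [0, 3], 0.0, [10.0], [3], 5.0))
    # -> ([6.0, 12.0], [0, 3], 5.0)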
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_add_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] + y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill + yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_add_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] + y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] + yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill + y[yi]
- yi += 1
-
- return out, out_index, xfill + yfill
-
-
-cpdef sparse_add_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_add_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_add_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_sub_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] - y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill - yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_sub_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] - y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
-
- return out, out_index, xfill - yfill
-
-
-cpdef sparse_sub_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_sub_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_sub_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_sub_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] - y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill - yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_sub_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] - y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] - yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill - y[yi]
- yi += 1
-
- return out, out_index, xfill - yfill
-
-
-cpdef sparse_sub_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_sub_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_sub_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_mul_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] * y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill * yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_mul_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] * y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
-
- return out, out_index, xfill * yfill
-
-
-cpdef sparse_mul_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_mul_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_mul_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_mul_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] * y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill * yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_mul_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] * y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] * yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill * y[yi]
- yi += 1
-
- return out, out_index, xfill * yfill
-
-
-cpdef sparse_mul_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_mul_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_mul_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_div_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __div__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __div__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_div_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __div__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __div__(xfill, yfill)
-
-
-cpdef sparse_div_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_div_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_div_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_div_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __div__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __div__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_div_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __div__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __div__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __div__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __div__(xfill, yfill)
-
-
-cpdef sparse_div_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_div_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_div_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_mod_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __mod__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __mod__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_mod_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __mod__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __mod__(xfill, yfill)
-
-
-cpdef sparse_mod_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_mod_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_mod_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_mod_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __mod__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __mod__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_mod_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __mod__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __mod__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __mod__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __mod__(xfill, yfill)
-
-
-cpdef sparse_mod_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_mod_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_mod_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_truediv_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __truediv__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __truediv__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_truediv_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __truediv__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __truediv__(xfill, yfill)
-
-
-cpdef sparse_truediv_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_truediv_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_truediv_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_truediv_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __truediv__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __truediv__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_truediv_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __truediv__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __truediv__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __truediv__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __truediv__(xfill, yfill)
-
-
-cpdef sparse_truediv_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_truediv_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_truediv_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_floordiv_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __floordiv__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __floordiv__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_floordiv_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __floordiv__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __floordiv__(xfill, yfill)
-
-
-cpdef sparse_floordiv_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_floordiv_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_floordiv_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_floordiv_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __floordiv__(x[xi], y[yi])
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, __floordiv__(xfill, yfill)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_floordiv_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = __floordiv__(x[xi], y[yi])
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = __floordiv__(x[xi], yfill)
- xi += 1
- else:
- # use x fill value
- out[out_i] = __floordiv__(xfill, y[yi])
- yi += 1
-
- return out, out_index, __floordiv__(xfill, yfill)
-
-
-cpdef sparse_floordiv_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_floordiv_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_floordiv_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
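The pow kernels that follow are decorated with @cython.cpow(True); the inline comment notes only that Cython 3's default ** matches Python pow, "which isn't what we want here". A plausible reading (an assumption, not stated in the source) is that Python-style pow promotes int ** negative-int to float, which would not fit the int64 output buffer, so C-style pow semantics are restored. A tiny plain-Python illustration of the promotion in question:

    # In Python, ** can change the result type depending on the exponent's sign:
    print(type(2 ** 3))    # <class 'int'>   -> 8
    print(type(2 ** -3))   # <class 'float'> -> 0.125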
-@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_pow_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ** y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill ** yfill
-@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_pow_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[float64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.float64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ** y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
-
- return out, out_index, xfill ** yfill
-
-
-cpdef sparse_pow_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_pow_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_pow_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
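The pow kernels are the only ones decorated with @cython.cpow(True). As the inline comment says, Cython 3 switched ** to Python semantics, under which a negative float base raised to a non-integer exponent yields a complex number; these kernels want C pow behaviour (NaN), which is also what NumPy produces for float64. A quick illustration in plain Python/NumPy, run outside this module:

    import numpy as np

    print((-8.0) ** 0.5)                 # Python semantics: a complex result
    with np.errstate(invalid="ignore"):
        print(np.array([-8.0]) ** 0.5)   # C/NumPy semantics: [nan]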
-@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_pow_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ** y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill ** yfill
-@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_pow_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[int64_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.int64)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ** y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ** yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill ** y[yi]
- yi += 1
-
- return out, out_index, xfill ** yfill
-
-
-cpdef sparse_pow_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_pow_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_pow_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_eq_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] == y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill == yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_eq_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] == y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
-
- return out, out_index, xfill == yfill
-
-
-cpdef sparse_eq_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_eq_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_eq_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
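From sparse_eq_float64 onward the comparison kernels (eq, ne, lt, gt, le, ge) reuse the identical index walk; only the scalar operator changes, the output buffer becomes uint8 (a boolean mask), and the trailing fill value becomes the boolean op(xfill, yfill). A caller that needs a dense boolean result would scatter the stored mask over that fill value, roughly as below (the function and parameter names are illustrative, not this module's API):

    import numpy as np

    def densify_comparison(out, out_positions, fill_result, length):
        # out: uint8 buffer returned by a comparison kernel
        # out_positions: the stored positions recorded in out_index
        # fill_result: the scalar xfill-vs-yfill comparison returned last
        dense = np.full(length, bool(fill_result), dtype=bool)
        dense[np.asarray(out_positions, dtype=np.intp)] = np.asarray(out, dtype=bool)
        return dense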
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_eq_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] == y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill == yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_eq_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] == y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] == yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill == y[yi]
- yi += 1
-
- return out, out_index, xfill == yfill
-
-
-cpdef sparse_eq_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_eq_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_eq_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_ne_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] != y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill != yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_ne_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] != y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
-
- return out, out_index, xfill != yfill
-
-
-cpdef sparse_ne_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_ne_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_ne_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_ne_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] != y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill != yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_ne_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] != y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] != yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill != y[yi]
- yi += 1
-
- return out, out_index, xfill != yfill
-
-
-cpdef sparse_ne_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_ne_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_ne_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
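The loop body shared by all int_op_* kernels is a two-pointer merge over the operands' stored positions: whichever side owns the current union position contributes its stored value, and the other side contributes its fill value. A pure-Python rendering of that walk, for reading purposes only, with the union index built the simple way instead of via make_union:

    import operator

    def int_op_reference(x_vals, x_locs, xfill, y_vals, y_locs, yfill, op=operator.ne):
        # Merge the two sorted position lists the way the Cython loop does.
        union = sorted(set(x_locs) | set(y_locs))
        out, xi, yi = [], 0, 0
        for loc in union:
            x_has = xi < len(x_locs) and x_locs[xi] == loc
            y_has = yi < len(y_locs) and y_locs[yi] == loc
            xv = x_vals[xi] if x_has else xfill
            yv = y_vals[yi] if y_has else yfill
            out.append(op(xv, yv))
            xi += x_has
            yi += y_has
        return out, union, op(xfill, yfill)

For example, int_op_reference([1, 2], [0, 3], 0, [2], [3], 0) walks positions [0, 3], compares 1 != 0 and 2 != 2, and returns ([True, False], [0, 3], False), with the last element being the fill-vs-fill result.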
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_lt_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] < y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill < yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_lt_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] < y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
-
- return out, out_index, xfill < yfill
-
-
-cpdef sparse_lt_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_lt_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_lt_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_lt_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] < y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill < yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_lt_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] < y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] < yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill < y[yi]
- yi += 1
-
- return out, out_index, xfill < yfill
-
-
-cpdef sparse_lt_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_lt_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_lt_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
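The block_op_* variants visit the same positions, but a BlockIndex stores them run-length encoded: locbuf holds each block's starting position, lenbuf its length, and the (xblock, xbp) / (yblock, ybp) counters step through those runs. Flattening a block description into explicit positions shows what the walk iterates over (a small sketch, not part of this module):

    import numpy as np

    def block_index_positions(block_starts, block_lengths):
        # Expand (start, length) runs into the explicit positions that the
        # locbuf/lenbuf walk above visits one by one.
        positions = []
        for start, length in zip(block_starts, block_lengths):
            positions.extend(range(start, start + length))
        return np.asarray(positions, dtype=np.int32)

    # Blocks starting at [0, 4] with lengths [2, 3] cover positions [0, 1, 4, 5, 6].
    print(block_index_positions([0, 4], [2, 3]))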
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_gt_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] > y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill > yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_gt_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] > y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
-
- return out, out_index, xfill > yfill
-
-
-cpdef sparse_gt_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_gt_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_gt_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_gt_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] > y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill > yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_gt_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] > y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] > yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill > y[yi]
- yi += 1
-
- return out, out_index, xfill > yfill
-
-
-cpdef sparse_gt_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_gt_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_gt_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
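Reading further, the only differences between the eq/ne/lt/gt/le/ge families, and between the float64 and int64 instantiations of each, are the scalar operator and the buffer dtype; the surrounding walk is byte-for-byte identical, which is the usual signature of code emitted from a template once per (operator, dtype) pair. A toy emitter conveys the idea; this is not the actual template, just a sketch:

    OPS = {"eq": "==", "ne": "!=", "lt": "<", "gt": ">", "le": "<=", "ge": ">="}
    DTYPES = ("float64", "int64")

    def emit_dispatcher_stubs():
        chunks = []
        for name in OPS:
            for dtype in DTYPES:
                chunks.append(
                    f"cpdef sparse_{name}_{dtype}(...):\n"
                    f"    # dispatch to block_op_{name}_{dtype} / int_op_{name}_{dtype}\n"
                )
        return "".join(chunks)

    print(emit_dispatcher_stubs())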
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_le_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] <= y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill <= yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_le_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] <= y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
-
- return out, out_index, xfill <= yfill
-
-
-cpdef sparse_le_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_le_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_le_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_le_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] <= y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill <= yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_le_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] <= y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] <= yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill <= y[yi]
- yi += 1
-
- return out, out_index, xfill <= yfill
-
-
-cpdef sparse_le_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_le_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_le_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_ge_float64(float64_t[:] x_,
- BlockIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- BlockIndex yindex,
- float64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] >= y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill >= yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_ge_float64(float64_t[:] x_,
- IntIndex xindex,
- float64_t xfill,
- float64_t[:] y_,
- IntIndex yindex,
- float64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- float64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] >= y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
-
- return out, out_index, xfill >= yfill
-
-
-cpdef sparse_ge_float64(float64_t[:] x,
- SparseIndex xindex, float64_t xfill,
- float64_t[:] y,
- SparseIndex yindex, float64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_ge_float64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_ge_float64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_ge_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] >= y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill >= yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_ge_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] >= y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] >= yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill >= y[yi]
- yi += 1
-
- return out, out_index, xfill >= yfill
-
-
-cpdef sparse_ge_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_ge_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_ge_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_and_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] & y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill & yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_and_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] & y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
-
- return out, out_index, xfill & yfill
-
-
-cpdef sparse_and_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_and_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_and_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_and_uint8(uint8_t[:] x_,
- BlockIndex xindex,
- uint8_t xfill,
- uint8_t[:] y_,
- BlockIndex yindex,
- uint8_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- uint8_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] & y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill & yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_and_uint8(uint8_t[:] x_,
- IntIndex xindex,
- uint8_t xfill,
- uint8_t[:] y_,
- IntIndex yindex,
- uint8_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- uint8_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] & y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] & yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill & y[yi]
- yi += 1
-
- return out, out_index, xfill & yfill
-
-
-cpdef sparse_and_uint8(uint8_t[:] x,
- SparseIndex xindex, uint8_t xfill,
- uint8_t[:] y,
- SparseIndex yindex, uint8_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_and_uint8(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_and_uint8(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_or_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] | y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill | yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_or_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] | y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
-
- return out, out_index, xfill | yfill
-
-
-cpdef sparse_or_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_or_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_or_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_or_uint8(uint8_t[:] x_,
- BlockIndex xindex,
- uint8_t xfill,
- uint8_t[:] y_,
- BlockIndex yindex,
- uint8_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- uint8_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] | y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill | yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_or_uint8(uint8_t[:] x_,
- IntIndex xindex,
- uint8_t xfill,
- uint8_t[:] y_,
- IntIndex yindex,
- uint8_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- uint8_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] | y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] | yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill | y[yi]
- yi += 1
-
- return out, out_index, xfill | yfill
-
-
-cpdef sparse_or_uint8(uint8_t[:] x,
- SparseIndex xindex, uint8_t xfill,
- uint8_t[:] y,
- SparseIndex yindex, uint8_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_or_uint8(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_or_uint8(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_xor_int64(int64_t[:] x_,
- BlockIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- BlockIndex yindex,
- int64_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ^ y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill ^ yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_xor_int64(int64_t[:] x_,
- IntIndex xindex,
- int64_t xfill,
- int64_t[:] y_,
- IntIndex yindex,
- int64_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- int64_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ^ y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
-
- return out, out_index, xfill ^ yfill
-
-
-cpdef sparse_xor_int64(int64_t[:] x,
- SparseIndex xindex, int64_t xfill,
- int64_t[:] y,
- SparseIndex yindex, int64_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_xor_int64(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_xor_int64(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_xor_uint8(uint8_t[:] x_,
- BlockIndex xindex,
- uint8_t xfill,
- uint8_t[:] y_,
- BlockIndex yindex,
- uint8_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- uint8_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ^ y[yi]
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, xfill ^ yfill
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_xor_uint8(uint8_t[:] x_,
- IntIndex xindex,
- uint8_t xfill,
- uint8_t[:] y_,
- IntIndex yindex,
- uint8_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- uint8_t[:] x, y
- ndarray[uint8_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.uint8)
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = x[xi] ^ y[yi]
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = x[xi] ^ yfill
- xi += 1
- else:
- # use x fill value
- out[out_i] = xfill ^ y[yi]
- yi += 1
-
- return out, out_index, xfill ^ yfill
-
-
-cpdef sparse_xor_uint8(uint8_t[:] x,
- SparseIndex xindex, uint8_t xfill,
- uint8_t[:] y,
- SparseIndex yindex, uint8_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_xor_uint8(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_xor_uint8(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
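All of the generated kernels above follow one pattern: walk the union of the two sparse index sets, take a stored value where an index is present and the fill value where it is not, and report the op applied to the two fill values for every location outside the union. The BlockIndex variants track (block, offset) positions while the IntIndex variants walk flat index arrays, but the merge logic is identical. A rough pure-Python sketch of the IntIndex walk (the function name and the simplified set-union step are illustrative, not the actual Cython code):

import operator

# Sketch of the int_op_* merge walk: two sparse vectors stored as
# (values, sorted indices, fill value), combined with a binary op.
def sparse_binop(x, xindices, xfill, y, yindices, yfill, op=operator.and_):
    # the union of the two index sets defines the output's sparse layout
    out_indices = sorted(set(xindices) | set(yindices))
    out = []
    xi = yi = 0
    for loc in out_indices:
        if xi < len(xindices) and xindices[xi] == loc:
            xval = x[xi]
            xi += 1
        else:
            xval = xfill          # x has no stored value here -> use its fill
        if yi < len(yindices) and yindices[yi] == loc:
            yval = y[yi]
            yi += 1
        else:
            yval = yfill          # y has no stored value here -> use its fill
        out.append(op(xval, yval))
    # op(xfill, yfill) describes every location outside the union
    return out, out_indices, op(xfill, yfill)

# e.g. AND of two sparse boolean-like vectors with fill value 0
vals, idx, fill = sparse_binop([1, 1], [0, 3], 0, [1, 1], [3, 5], 0)
assert (vals, idx, fill) == ([0, 1, 0], [0, 3, 5], 0)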
diff --git a/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in b/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in
deleted file mode 100644
index 774a8c579f6..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/sparse_op_helper.pxi.in
+++ /dev/null
@@ -1,313 +0,0 @@
-"""
-Template for each `dtype` helper function for sparse ops
-
-WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
-"""
-
-# ----------------------------------------------------------------------
-# Sparse op
-# ----------------------------------------------------------------------
-
-ctypedef fused sparse_t:
- float64_t
- int64_t
-
-
-cdef float64_t __div__(sparse_t a, sparse_t b):
- if b == 0:
- if a > 0:
- return INF
- elif a < 0:
- return -INF
- else:
- return NaN
- else:
- return float(a) / b
-
-
-cdef float64_t __truediv__(sparse_t a, sparse_t b):
- return __div__(a, b)
-
-
-cdef sparse_t __mod__(sparse_t a, sparse_t b):
- if b == 0:
- if sparse_t is float64_t:
- return NaN
- else:
- return 0
- else:
- return a % b
-
-
-cdef sparse_t __floordiv__(sparse_t a, sparse_t b):
- if b == 0:
- if sparse_t is float64_t:
- # Match non-sparse Series behavior implemented in mask_zero_div_zero
- if a > 0:
- return INF
- elif a < 0:
- return -INF
- return NaN
- else:
- return 0
- else:
- return a // b
-
-
-# ----------------------------------------------------------------------
-# sparse array op
-# ----------------------------------------------------------------------
-
-{{py:
-
-# dtype, arith_comp_group, logical_group
-dtypes = [('float64', True, False),
- ('int64', True, True),
- ('uint8', False, True)]
-# do not generate arithmetic / comparison template for uint8,
-# it should be done in fused types
-
-def get_op(tup):
- assert isinstance(tup, tuple)
- assert len(tup) == 4
-
- opname, lval, rval, dtype = tup
-
- ops_dict = {'add': '{0} + {1}',
- 'sub': '{0} - {1}',
- 'mul': '{0} * {1}',
- 'div': '__div__({0}, {1})',
- 'mod': '__mod__({0}, {1})',
- 'truediv': '__truediv__({0}, {1})',
- 'floordiv': '__floordiv__({0}, {1})',
- 'pow': '{0} ** {1}',
- 'eq': '{0} == {1}',
- 'ne': '{0} != {1}',
- 'lt': '{0} < {1}',
- 'gt': '{0} > {1}',
- 'le': '{0} <= {1}',
- 'ge': '{0} >= {1}',
-
- 'and': '{0} & {1}', # logical op
- 'or': '{0} | {1}',
- 'xor': '{0} ^ {1}'}
-
- return ops_dict[opname].format(lval, rval)
-
-
-def get_dispatch(dtypes):
-
- ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv',
- 'floordiv', 'pow',
- 'eq', 'ne', 'lt', 'gt', 'le', 'ge',
- 'and', 'or', 'xor']
-
- for opname in ops_list:
- for dtype, arith_comp_group, logical_group in dtypes:
-
- if opname in ('div', 'truediv'):
- rdtype = 'float64'
- elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
- # comparison op
- rdtype = 'uint8'
- elif opname in ('and', 'or', 'xor'):
- # logical op
- rdtype = 'uint8'
- else:
- rdtype = dtype
-
- if opname in ('and', 'or', 'xor'):
- if logical_group:
- yield opname, dtype, rdtype
- else:
- if arith_comp_group:
- yield opname, dtype, rdtype
-
-}}
-
-
-{{for opname, dtype, rdtype in get_dispatch(dtypes)}}
-
-{{if opname == "pow"}}
-@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
-{{endif}}
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple block_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
- BlockIndex xindex,
- {{dtype}}_t xfill,
- {{dtype}}_t[:] y_,
- BlockIndex yindex,
- {{dtype}}_t yfill):
- """
- Binary operator on BlockIndex objects with fill values
- """
-
- cdef:
- BlockIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xbp = 0, ybp = 0 # block positions
- int32_t xloc, yloc
- Py_ssize_t xblock = 0, yblock = 0 # block numbers
-
- {{dtype}}_t[:] x, y
- ndarray[{{rdtype}}_t, ndim=1] out
-
- # to suppress Cython warning
- x = x_
- y = y_
-
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
-
- # Wow, what a hack job. Need to do something about this
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if yblock == yindex.nblocks:
- # use y fill value
- out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- continue
-
- if xblock == xindex.nblocks:
- # use x fill value
- out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
- continue
-
- yloc = yindex.locbuf[yblock] + ybp
- xloc = xindex.locbuf[xblock] + xbp
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
- xi += 1
- yi += 1
-
- # advance both locations
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
-
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- elif xloc < yloc:
- # use y fill value
- out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
- xi += 1
-
- # advance x location
- xbp += 1
- if xbp == xindex.lenbuf[xblock]:
- xblock += 1
- xbp = 0
- else:
- # use x fill value
- out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
- yi += 1
-
- # advance y location
- ybp += 1
- if ybp == yindex.lenbuf[yblock]:
- yblock += 1
- ybp = 0
-
- return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
-
-{{if opname == "pow"}}
-@cython.cpow(True) # Cython 3 matches Python pow, which isn't what we want here
-{{endif}}
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef tuple int_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
- IntIndex xindex,
- {{dtype}}_t xfill,
- {{dtype}}_t[:] y_,
- IntIndex yindex,
- {{dtype}}_t yfill):
- cdef:
- IntIndex out_index
- Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
- int32_t xloc, yloc
- int32_t[:] xindices, yindices, out_indices
- {{dtype}}_t[:] x, y
- ndarray[{{rdtype}}_t, ndim=1] out
-
- # suppress Cython compiler warnings due to inlining
- x = x_
- y = y_
-
- # need to do this first to know size of result array
- out_index = xindex.make_union(yindex)
- out = np.empty(out_index.npoints, dtype=np.{{rdtype}})
-
- xindices = xindex.indices
- yindices = yindex.indices
- out_indices = out_index.indices
-
- # walk the two SparseVectors, adding matched locations...
- for out_i in range(out_index.npoints):
- if xi == xindex.npoints:
- # use x fill value
- out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
- yi += 1
- continue
-
- if yi == yindex.npoints:
- # use y fill value
- out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
- xi += 1
- continue
-
- xloc = xindices[xi]
- yloc = yindices[yi]
-
- # each index in the out_index had to come from either x, y, or both
- if xloc == yloc:
- out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}}
- xi += 1
- yi += 1
- elif xloc < yloc:
- # use y fill value
- out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}}
- xi += 1
- else:
- # use x fill value
- out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}}
- yi += 1
-
- return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}}
-
-
-cpdef sparse_{{opname}}_{{dtype}}({{dtype}}_t[:] x,
- SparseIndex xindex, {{dtype}}_t xfill,
- {{dtype}}_t[:] y,
- SparseIndex yindex, {{dtype}}_t yfill):
-
- if isinstance(xindex, BlockIndex):
- return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill,
- y, yindex.to_block_index(), yfill)
- elif isinstance(xindex, IntIndex):
- return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill,
- y, yindex.to_int_index(), yfill)
- else:
- raise NotImplementedError
-
-{{endfor}}
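The {{for ...}} loop above is driven by get_dispatch: every permitted (op, dtype) pair yields a block_op_/int_op_/sparse_ triple, with the result dtype widened to float64 for div/truediv, narrowed to uint8 for comparisons and logical ops, and the logical ops emitted only for dtypes flagged as logical_group. Restated as standalone Python outside the Tempita context, the dispatch reduces to:

# Which (opname, dtype, result dtype) triples get a generated kernel.
dtypes = [("float64", True, False),   # (dtype, arith_comp_group, logical_group)
          ("int64", True, True),
          ("uint8", False, True)]

ops = ["add", "sub", "mul", "div", "mod", "truediv", "floordiv", "pow",
       "eq", "ne", "lt", "gt", "le", "ge", "and", "or", "xor"]

def get_dispatch(dtypes):
    for opname in ops:
        for dtype, arith_comp_group, logical_group in dtypes:
            if opname in ("div", "truediv"):
                rdtype = "float64"                   # division always yields floats
            elif opname in ("eq", "ne", "lt", "gt", "le", "ge", "and", "or", "xor"):
                rdtype = "uint8"                     # comparisons / logical ops yield bools
            else:
                rdtype = dtype
            if opname in ("and", "or", "xor"):
                if logical_group:
                    yield opname, dtype, rdtype      # e.g. ("and", "uint8", "uint8")
            elif arith_comp_group:
                yield opname, dtype, rdtype          # e.g. ("add", "float64", "float64")

# logical kernels exist for int64 and uint8, but never for float64
assert ("and", "uint8", "uint8") in set(get_dispatch(dtypes))
assert ("and", "float64", "uint8") not in set(get_dispatch(dtypes))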
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/headers/portable.h b/contrib/python/pandas/py3/pandas/_libs/src/headers/portable.h
deleted file mode 100644
index 91b4702d324..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/headers/portable.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _PANDAS_PORTABLE_H_
-#define _PANDAS_PORTABLE_H_
-
-#include <string.h>
-
-#if defined(_MSC_VER)
-#define strcasecmp( s1, s2 ) _stricmp( s1, s2 )
-#endif
-
-// GH-23516 - works around locale perf issues
-// from MUSL libc, MIT Licensed - see LICENSES
-#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
-#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : default)
-#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5))
-#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c))
-#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))
-
-#endif
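The ASCII helpers above exist to avoid locale-dependent <ctype.h> calls (GH-23516); they rely on unsigned wrap-around, e.g. (unsigned)(c) - '0' < 10u holds only for '0'..'9'. A small Python check of the same predicates over byte values (assumes ASCII input, exactly as the C macros do):

# Locale-independent ASCII checks mirroring the deleted macros,
# with the unsigned wrap-around modelled as arithmetic modulo 2**32.
def isdigit_ascii(c: int) -> bool:
    return (c - ord("0")) % 2**32 < 10

def isspace_ascii(c: int) -> bool:
    return c == ord(" ") or (c - ord("\t")) % 2**32 < 5   # '\t' '\n' '\v' '\f' '\r'

def toupper_ascii(c: int) -> int:
    return c & 0x5F if (c - ord("a")) % 2**32 < 26 else c

assert [chr(c) for c in range(128) if isdigit_ascii(c)] == list("0123456789")
assert all(isspace_ascii(ord(ch)) for ch in " \t\n\v\f\r")
assert toupper_ascii(ord("q")) == ord("Q") and toupper_ascii(ord("?")) == ord("?")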
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/inline_helper.h b/contrib/python/pandas/py3/pandas/_libs/src/inline_helper.h
deleted file mode 100644
index 40fd45762ff..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/inline_helper.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-*/
-
-#ifndef PANDAS__LIBS_SRC_INLINE_HELPER_H_
-#define PANDAS__LIBS_SRC_INLINE_HELPER_H_
-
-#ifndef PANDAS_INLINE
- #if defined(__clang__)
- #define PANDAS_INLINE static __inline__ __attribute__ ((__unused__))
- #elif defined(__GNUC__)
- #define PANDAS_INLINE static __inline__
- #elif defined(_MSC_VER)
- #define PANDAS_INLINE static __inline
- #elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
- #define PANDAS_INLINE static inline
- #else
- #define PANDAS_INLINE
- #endif // __GNUC__
-#endif // PANDAS_INLINE
-
-#endif // PANDAS__LIBS_SRC_INLINE_HELPER_H_
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/klib/khash.h b/contrib/python/pandas/py3/pandas/_libs/src/klib/khash.h
deleted file mode 100644
index e17d82d51f0..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/klib/khash.h
+++ /dev/null
@@ -1,719 +0,0 @@
-/* The MIT License
-
- Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-/*
- An example:
-
-#include "khash.h"
-KHASH_MAP_INIT_INT(32, char)
-int main() {
- int ret, is_missing;
- khiter_t k;
- khash_t(32) *h = kh_init(32);
- k = kh_put(32, h, 5, &ret);
- if (!ret) kh_del(32, h, k);
- kh_value(h, k) = 10;
- k = kh_get(32, h, 10);
- is_missing = (k == kh_end(h));
- k = kh_get(32, h, 5);
- kh_del(32, h, k);
- for (k = kh_begin(h); k != kh_end(h); ++k)
- if (kh_exist(h, k)) kh_value(h, k) = 1;
- kh_destroy(32, h);
- return 0;
-}
-*/
-
-/*
- 2011-09-16 (0.2.6):
-
- * The capacity is a power of 2. This seems to dramatically improve the
- speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
-
- - https://github.com/stefanocasazza/ULib
- - https://nothings.org/computer/judy/
-
- * Allow to optionally use linear probing which usually has better
- performance for random input. Double hashing is still the default as it
- is more robust to certain non-random input.
-
- * Added Wang's integer hash function (not used by default). This hash
- function is more robust to certain non-random input.
-
- 2011-02-14 (0.2.5):
-
- * Allow to declare global functions.
-
- 2009-09-26 (0.2.4):
-
- * Improve portability
-
- 2008-09-19 (0.2.3):
-
- * Corrected the example
- * Improved interfaces
-
- 2008-09-11 (0.2.2):
-
- * Improved speed a little in kh_put()
-
- 2008-09-10 (0.2.1):
-
- * Added kh_clear()
- * Fixed a compiling error
-
- 2008-09-02 (0.2.0):
-
- * Changed to token concatenation which increases flexibility.
-
- 2008-08-31 (0.1.2):
-
- * Fixed a bug in kh_get(), which has not been tested previously.
-
- 2008-08-31 (0.1.1):
-
- * Added destructor
-*/
-
-
-#ifndef __AC_KHASH_H
-#define __AC_KHASH_H
-
-/*!
- @header
-
- Generic hash table library.
- */
-
-#define AC_VERSION_KHASH_H "0.2.6"
-
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include "../inline_helper.h"
-
-
-// hooks for memory allocator, C-runtime allocator used per default
-#ifndef KHASH_MALLOC
-#define KHASH_MALLOC malloc
-#endif
-
-#ifndef KHASH_REALLOC
-#define KHASH_REALLOC realloc
-#endif
-
-#ifndef KHASH_CALLOC
-#define KHASH_CALLOC calloc
-#endif
-
-#ifndef KHASH_FREE
-#define KHASH_FREE free
-#endif
-
-
-#if UINT_MAX == 0xffffffffu
-typedef unsigned int khuint32_t;
-typedef signed int khint32_t;
-#elif ULONG_MAX == 0xffffffffu
-typedef unsigned long khuint32_t;
-typedef signed long khint32_t;
-#endif
-
-#if ULONG_MAX == ULLONG_MAX
-typedef unsigned long khuint64_t;
-typedef signed long khint64_t;
-#else
-typedef unsigned long long khuint64_t;
-typedef signed long long khint64_t;
-#endif
-
-#if UINT_MAX == 0xffffu
-typedef unsigned int khuint16_t;
-typedef signed int khint16_t;
-#elif USHRT_MAX == 0xffffu
-typedef unsigned short khuint16_t;
-typedef signed short khint16_t;
-#endif
-
-#if UCHAR_MAX == 0xffu
-typedef unsigned char khuint8_t;
-typedef signed char khint8_t;
-#endif
-
-typedef double khfloat64_t;
-typedef float khfloat32_t;
-
-typedef khuint32_t khuint_t;
-typedef khuint_t khiter_t;
-
-#define __ac_isempty(flag, i) ((flag[i>>5]>>(i&0x1fU))&1)
-#define __ac_isdel(flag, i) (0)
-#define __ac_iseither(flag, i) __ac_isempty(flag, i)
-#define __ac_set_isdel_false(flag, i) (0)
-#define __ac_set_isempty_false(flag, i) (flag[i>>5]&=~(1ul<<(i&0x1fU)))
-#define __ac_set_isempty_true(flag, i) (flag[i>>5]|=(1ul<<(i&0x1fU)))
-#define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i)
-#define __ac_set_isdel_true(flag, i) ((void)0)
-
-
-// specializations of https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
-khuint32_t PANDAS_INLINE murmur2_32to32(khuint32_t k){
- const khuint32_t SEED = 0xc70f6907UL;
- // 'm' and 'r' are mixing constants generated offline.
- // They're not really 'magic', they just happen to work well.
- const khuint32_t M_32 = 0x5bd1e995;
- const int R_32 = 24;
-
- // Initialize the hash to a 'random' value
- khuint32_t h = SEED ^ 4;
-
- //handle 4 bytes:
- k *= M_32;
- k ^= k >> R_32;
- k *= M_32;
-
- h *= M_32;
- h ^= k;
-
- // Do a few final mixes of the hash to ensure the "last few
- // bytes" are well-incorporated. (Really needed here?)
- h ^= h >> 13;
- h *= M_32;
- h ^= h >> 15;
- return h;
-}
-
-// it is possible to have a special x64-version, which would need fewer operations, but
-// using the 32bit version also has some benefits:
-// - one code for 32bit and 64bit builds
-// - the same case for 32bit and 64bit builds
-// - no performance difference could be measured compared to a possible x64-version
-
-khuint32_t PANDAS_INLINE murmur2_32_32to32(khuint32_t k1, khuint32_t k2){
- const khuint32_t SEED = 0xc70f6907UL;
- // 'm' and 'r' are mixing constants generated offline.
- // They're not really 'magic', they just happen to work well.
- const khuint32_t M_32 = 0x5bd1e995;
- const int R_32 = 24;
-
- // Initialize the hash to a 'random' value
- khuint32_t h = SEED ^ 4;
-
- //handle first 4 bytes:
- k1 *= M_32;
- k1 ^= k1 >> R_32;
- k1 *= M_32;
-
- h *= M_32;
- h ^= k1;
-
- //handle second 4 bytes:
- k2 *= M_32;
- k2 ^= k2 >> R_32;
- k2 *= M_32;
-
- h *= M_32;
- h ^= k2;
-
- // Do a few final mixes of the hash to ensure the "last few
- // bytes" are well-incorporated.
- h ^= h >> 13;
- h *= M_32;
- h ^= h >> 15;
- return h;
-}
-
-khuint32_t PANDAS_INLINE murmur2_64to32(khuint64_t k){
- khuint32_t k1 = (khuint32_t)k;
- khuint32_t k2 = (khuint32_t)(k >> 32);
-
- return murmur2_32_32to32(k1, k2);
-}
-
-
-#ifdef KHASH_LINEAR
-#define __ac_inc(k, m) 1
-#else
-#define __ac_inc(k, m) (murmur2_32to32(k) | 1) & (m)
-#endif
-
-#define __ac_fsize(m) ((m) < 32? 1 : (m)>>5)
-
-#ifndef kroundup32
-#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
-#endif
-
-static const double __ac_HASH_UPPER = 0.77;
-
-#define KHASH_DECLARE(name, khkey_t, khval_t) \
- typedef struct { \
- khuint_t n_buckets, size, n_occupied, upper_bound; \
- khuint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- extern kh_##name##_t *kh_init_##name(); \
- extern void kh_destroy_##name(kh_##name##_t *h); \
- extern void kh_clear_##name(kh_##name##_t *h); \
- extern khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
- extern void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets); \
- extern khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
- extern void kh_del_##name(kh_##name##_t *h, khuint_t x);
-
-#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- typedef struct { \
- khuint_t n_buckets, size, n_occupied, upper_bound; \
- khuint32_t *flags; \
- khkey_t *keys; \
- khval_t *vals; \
- } kh_##name##_t; \
- SCOPE kh_##name##_t *kh_init_##name(void) { \
- return (kh_##name##_t*)KHASH_CALLOC(1, sizeof(kh_##name##_t)); \
- } \
- SCOPE void kh_destroy_##name(kh_##name##_t *h) \
- { \
- if (h) { \
- KHASH_FREE(h->keys); KHASH_FREE(h->flags); \
- KHASH_FREE(h->vals); \
- KHASH_FREE(h); \
- } \
- } \
- SCOPE void kh_clear_##name(kh_##name##_t *h) \
- { \
- if (h && h->flags) { \
- memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khuint32_t)); \
- h->size = h->n_occupied = 0; \
- } \
- } \
- SCOPE khuint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
- { \
- if (h->n_buckets) { \
- khuint_t inc, k, i, last, mask; \
- mask = h->n_buckets - 1; \
- k = __hash_func(key); i = k & mask; \
- inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- i = (i + inc) & mask; \
- if (i == last) return h->n_buckets; \
- } \
- return __ac_iseither(h->flags, i)? h->n_buckets : i; \
- } else return 0; \
- } \
- SCOPE void kh_resize_##name(kh_##name##_t *h, khuint_t new_n_buckets) \
- { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
- khuint32_t *new_flags = 0; \
- khuint_t j = 1; \
- { \
- kroundup32(new_n_buckets); \
- if (new_n_buckets < 4) new_n_buckets = 4; \
- if (h->size >= (khuint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
- else { /* hash table size to be changed (shrink or expand); rehash */ \
- new_flags = (khuint32_t*)KHASH_MALLOC(__ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
- memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khuint32_t)); \
- if (h->n_buckets < new_n_buckets) { /* expand */ \
- h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
- } /* otherwise shrink */ \
- } \
- } \
- if (j) { /* rehashing is needed */ \
- for (j = 0; j != h->n_buckets; ++j) { \
- if (__ac_iseither(h->flags, j) == 0) { \
- khkey_t key = h->keys[j]; \
- khval_t val; \
- khuint_t new_mask; \
- new_mask = new_n_buckets - 1; \
- if (kh_is_map) val = h->vals[j]; \
- __ac_set_isempty_true(h->flags, j); \
- while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
- khuint_t inc, k, i; \
- k = __hash_func(key); \
- i = k & new_mask; \
- inc = __ac_inc(k, new_mask); \
- while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
- __ac_set_isempty_false(new_flags, i); \
- if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
- { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
- if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
- __ac_set_isempty_true(h->flags, i); /* mark it as deleted in the old hash table */ \
- } else { /* write the element and jump out of the loop */ \
- h->keys[i] = key; \
- if (kh_is_map) h->vals[i] = val; \
- break; \
- } \
- } \
- } \
- } \
- if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
- h->keys = (khkey_t*)KHASH_REALLOC(h->keys, new_n_buckets * sizeof(khkey_t)); \
- if (kh_is_map) h->vals = (khval_t*)KHASH_REALLOC(h->vals, new_n_buckets * sizeof(khval_t)); \
- } \
- KHASH_FREE(h->flags); /* free the working space */ \
- h->flags = new_flags; \
- h->n_buckets = new_n_buckets; \
- h->n_occupied = h->size; \
- h->upper_bound = (khuint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
- } \
- } \
- SCOPE khuint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
- { \
- khuint_t x; \
- if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
- if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \
- else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \
- } /* TODO: implement automatic shrinking; resize() already supports shrinking */ \
- { \
- khuint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
- x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
- if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
- else { \
- inc = __ac_inc(k, mask); last = i; \
- while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
- if (__ac_isdel(h->flags, i)) site = i; \
- i = (i + inc) & mask; \
- if (i == last) { x = site; break; } \
- } \
- if (x == h->n_buckets) { \
- if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
- else x = i; \
- } \
- } \
- } \
- if (__ac_isempty(h->flags, x)) { /* not present at all */ \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; ++h->n_occupied; \
- *ret = 1; \
- } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
- h->keys[x] = key; \
- __ac_set_isboth_false(h->flags, x); \
- ++h->size; \
- *ret = 2; \
- } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
- return x; \
- } \
- SCOPE void kh_del_##name(kh_##name##_t *h, khuint_t x) \
- { \
- if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
- __ac_set_isdel_true(h->flags, x); \
- --h->size; \
- } \
- }
-
-#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
- KHASH_INIT2(name, PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
-
-/* --- BEGIN OF HASH FUNCTIONS --- */
-
-/*! @function
- @abstract Integer hash function
- @param key The integer [khuint32_t]
- @return The hash value [khuint_t]
- */
-#define kh_int_hash_func(key) (khuint32_t)(key)
-/*! @function
- @abstract Integer comparison function
- */
-#define kh_int_hash_equal(a, b) ((a) == (b))
-/*! @function
- @abstract 64-bit integer hash function
- @param key The integer [khuint64_t]
- @return The hash value [khuint_t]
- */
-PANDAS_INLINE khuint_t kh_int64_hash_func(khuint64_t key)
-{
- return (khuint_t)((key)>>33^(key)^(key)<<11);
-}
-/*! @function
- @abstract 64-bit integer comparison function
- */
-#define kh_int64_hash_equal(a, b) ((a) == (b))
-
-/*! @function
- @abstract const char* hash function
- @param s Pointer to a null terminated string
- @return The hash value
- */
-PANDAS_INLINE khuint_t __ac_X31_hash_string(const char *s)
-{
- khuint_t h = *s;
- if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
- return h;
-}
-/*! @function
- @abstract Another interface to const char* hash function
- @param key Pointer to a null terminated string [const char*]
- @return The hash value [khuint_t]
- */
-#define kh_str_hash_func(key) __ac_X31_hash_string(key)
-/*! @function
- @abstract Const char* comparison function
- */
-#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
-
-PANDAS_INLINE khuint_t __ac_Wang_hash(khuint_t key)
-{
- key += ~(key << 15);
- key ^= (key >> 10);
- key += (key << 3);
- key ^= (key >> 6);
- key += ~(key << 11);
- key ^= (key >> 16);
- return key;
-}
-#define kh_int_hash_func2(k) __ac_Wang_hash((khuint_t)key)
-
-/* --- END OF HASH FUNCTIONS --- */
-
-/* Other convenient macros... */
-
-/*!
- @abstract Type of the hash table.
- @param name Name of the hash table [symbol]
- */
-#define khash_t(name) kh_##name##_t
-
-/*! @function
- @abstract Initiate a hash table.
- @param name Name of the hash table [symbol]
- @return Pointer to the hash table [khash_t(name)*]
- */
-#define kh_init(name) kh_init_##name(void)
-
-/*! @function
- @abstract Destroy a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_destroy(name, h) kh_destroy_##name(h)
-
-/*! @function
- @abstract Reset a hash table without deallocating memory.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- */
-#define kh_clear(name, h) kh_clear_##name(h)
-
-/*! @function
- @abstract Resize a hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param s New size [khuint_t]
- */
-#define kh_resize(name, h, s) kh_resize_##name(h, s)
-
-/*! @function
- @abstract Insert a key to the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @param r Extra return code: 0 if the key is present in the hash table;
- 1 if the bucket is empty (never used); 2 if the element in
- the bucket has been deleted [int*]
- @return Iterator to the inserted element [khuint_t]
- */
-#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
-
-/*! @function
- @abstract Retrieve a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Key [type of keys]
- @return Iterator to the found element, or kh_end(h) if the element is absent [khuint_t]
- */
-#define kh_get(name, h, k) kh_get_##name(h, k)
-
-/*! @function
- @abstract Remove a key from the hash table.
- @param name Name of the hash table [symbol]
- @param h Pointer to the hash table [khash_t(name)*]
- @param k Iterator to the element to be deleted [khuint_t]
- */
-#define kh_del(name, h, k) kh_del_##name(h, k)
-
-/*! @function
- @abstract Test whether a bucket contains data.
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khuint_t]
- @return 1 if containing data; 0 otherwise [int]
- */
-#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
-
-/*! @function
- @abstract Get key given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khuint_t]
- @return Key [type of keys]
- */
-#define kh_key(h, x) ((h)->keys[x])
-
-/*! @function
- @abstract Get value given an iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @param x Iterator to the bucket [khuint_t]
- @return Value [type of values]
- @discussion For hash sets, calling this results in segfault.
- */
-#define kh_val(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Alias of kh_val()
- */
-#define kh_value(h, x) ((h)->vals[x])
-
-/*! @function
- @abstract Get the start iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The start iterator [khuint_t]
- */
-#define kh_begin(h) (khuint_t)(0)
-
-/*! @function
- @abstract Get the end iterator
- @param h Pointer to the hash table [khash_t(name)*]
- @return The end iterator [khuint_t]
- */
-#define kh_end(h) ((h)->n_buckets)
-
-/*! @function
- @abstract Get the number of elements in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of elements in the hash table [khuint_t]
- */
-#define kh_size(h) ((h)->size)
-
-/*! @function
- @abstract Get the number of buckets in the hash table
- @param h Pointer to the hash table [khash_t(name)*]
- @return Number of buckets in the hash table [khuint_t]
- */
-#define kh_n_buckets(h) ((h)->n_buckets)
-
-/* More convenient interfaces */
-
-/*! @function
- @abstract Instantiate a hash set containing integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_INT(name) \
- KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT(name, khval_t) \
- KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-#define KHASH_MAP_INIT_UINT(name, khval_t) \
- KHASH_INIT(name, khuint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_UINT64(name) \
- KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
-
-#define KHASH_SET_INIT_INT64(name) \
- KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 64-bit integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_UINT64(name, khval_t) \
- KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
-
-#define KHASH_MAP_INIT_INT64(name, khval_t) \
- KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 16bit-integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT16(name, khval_t) \
- KHASH_INIT(name, khint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-#define KHASH_MAP_INIT_UINT16(name, khval_t) \
- KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing 8bit-integer keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_INT8(name, khval_t) \
- KHASH_INIT(name, khint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-#define KHASH_MAP_INIT_UINT8(name, khval_t) \
- KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
-
-
-
-typedef const char *kh_cstr_t;
-/*! @function
- @abstract Instantiate a hash map containing const char* keys
- @param name Name of the hash table [symbol]
- */
-#define KHASH_SET_INIT_STR(name) \
- KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
-
-/*! @function
- @abstract Instantiate a hash map containing const char* keys
- @param name Name of the hash table [symbol]
- @param khval_t Type of values [type]
- */
-#define KHASH_MAP_INIT_STR(name, khval_t) \
- KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
-
-
-#define kh_exist_str(h, k) (kh_exist(h, k))
-#define kh_exist_float64(h, k) (kh_exist(h, k))
-#define kh_exist_uint64(h, k) (kh_exist(h, k))
-#define kh_exist_int64(h, k) (kh_exist(h, k))
-#define kh_exist_float32(h, k) (kh_exist(h, k))
-#define kh_exist_int32(h, k) (kh_exist(h, k))
-#define kh_exist_uint32(h, k) (kh_exist(h, k))
-#define kh_exist_int16(h, k) (kh_exist(h, k))
-#define kh_exist_uint16(h, k) (kh_exist(h, k))
-#define kh_exist_int8(h, k) (kh_exist(h, k))
-#define kh_exist_uint8(h, k) (kh_exist(h, k))
-
-KHASH_MAP_INIT_STR(str, size_t)
-KHASH_MAP_INIT_INT(int32, size_t)
-KHASH_MAP_INIT_UINT(uint32, size_t)
-KHASH_MAP_INIT_INT64(int64, size_t)
-KHASH_MAP_INIT_UINT64(uint64, size_t)
-KHASH_MAP_INIT_INT16(int16, size_t)
-KHASH_MAP_INIT_UINT16(uint16, size_t)
-KHASH_MAP_INIT_INT8(int8, size_t)
-KHASH_MAP_INIT_UINT8(uint8, size_t)
-
-
-#endif /* __AC_KHASH_H */
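khash keeps keys, values, and a per-bucket flag word in parallel arrays, holds the capacity at a power of two, resizes once occupancy crosses __ac_HASH_UPPER (0.77), and probes with a stride derived from a second hash (__ac_inc, or a stride of 1 under KHASH_LINEAR). Because the stride is forced odd and the capacity is a power of two, every probe sequence visits all buckets, so a lookup for a missing key always reaches an empty slot. A compact Python sketch of that open-addressing scheme, ignoring deletion flags and the tracemalloc hooks added in khash_python.h below (the class and the hash constant are illustrative only):

# Sketch of khash-style open addressing: power-of-two capacity,
# 0.77 load factor, and an odd double-hashing stride (mirrors __ac_inc's `| 1`).
class KHashSketch:
    UPPER = 0.77

    def __init__(self, n_buckets=4):
        self.n_buckets = n_buckets
        self.keys = [None] * n_buckets
        self.vals = [None] * n_buckets
        self.size = 0

    def _probe(self, key):
        mask = self.n_buckets - 1
        k = hash(key) & 0xFFFFFFFF
        i = k & mask
        inc = ((k * 0x5BD1E995) & 0xFFFFFFFF) | 1   # stand-in for murmur2_32to32(k) | 1
        while self.keys[i] is not None and self.keys[i] != key:
            i = (i + inc) & mask
        return i

    def put(self, key, val):
        if self.size + 1 > self.n_buckets * self.UPPER:
            self._resize(self.n_buckets * 2)
        i = self._probe(key)
        if self.keys[i] is None:
            self.size += 1
        self.keys[i], self.vals[i] = key, val

    def get(self, key, default=None):
        i = self._probe(key)
        return self.vals[i] if self.keys[i] == key else default

    def _resize(self, new_n):
        old = [(k, v) for k, v in zip(self.keys, self.vals) if k is not None]
        self.__init__(new_n)
        for k, v in old:
            self.put(k, v)

h = KHashSketch()
for i in range(100):
    h.put(i, i * i)
assert h.get(7) == 49 and h.get(1000) is None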
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h b/contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h
deleted file mode 100644
index c031a6c7eee..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h
+++ /dev/null
@@ -1,446 +0,0 @@
-#include <string.h>
-#include <Python.h>
-
-
-// use numpy's definitions for complex
-#include <numpy/arrayobject.h>
-typedef npy_complex64 khcomplex64_t;
-typedef npy_complex128 khcomplex128_t;
-
-
-
-// khash should report usage to tracemalloc
-#if PY_VERSION_HEX >= 0x03060000
-#include <pymem.h>
-#if PY_VERSION_HEX < 0x03070000
-#define PyTraceMalloc_Track _PyTraceMalloc_Track
-#define PyTraceMalloc_Untrack _PyTraceMalloc_Untrack
-#endif
-#else
-#define PyTraceMalloc_Track(...)
-#define PyTraceMalloc_Untrack(...)
-#endif
-
-
-static const int KHASH_TRACE_DOMAIN = 424242;
-static void *traced_malloc(size_t size){
- void * ptr = malloc(size);
- if(ptr!=NULL){
- PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
- }
- return ptr;
-}
-
-static void *traced_calloc(size_t num, size_t size){
- void * ptr = calloc(num, size);
- if(ptr!=NULL){
- PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, num*size);
- }
- return ptr;
-}
-
-static void *traced_realloc(void* old_ptr, size_t size){
- void * ptr = realloc(old_ptr, size);
- if(ptr!=NULL){
- if(old_ptr != ptr){
- PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
- }
- PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
- }
- return ptr;
-}
-
-static void traced_free(void* ptr){
- if(ptr!=NULL){
- PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)ptr);
- }
- free(ptr);
-}
-
-
-#define KHASH_MALLOC traced_malloc
-#define KHASH_REALLOC traced_realloc
-#define KHASH_CALLOC traced_calloc
-#define KHASH_FREE traced_free
-#include "khash.h"
-
-// Previously we were using the built in cpython hash function for doubles
-// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021
-// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85
-
-// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x))
-// and the size of the hash may differ by platform / version (long in py2, Py_ssize_t in py3).
-// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t
-// is 64 bits the truncation causes collision issues. Given all that, we use our own
-// simple hash, viewing the double bytes as an int64 and using khash's default
-// hash for 64 bit integers.
-// GH 13436 showed that _Py_HashDouble doesn't work well with khash
-// GH 28303 showed that the simple xoring-version isn't good enough
-// See GH 36729 for evaluation of the currently used murmur2-hash version
-// An interesting alternative to expensive murmur2-hash would be to change
-// the probing strategy and use e.g. the probing strategy from CPython's
-// implementation of dicts, which shines for smaller sizes but is more
-// predisposed to superlinear running times (see GH 36729 for comparison)
-
-
-khuint64_t PANDAS_INLINE asuint64(double key) {
- khuint64_t val;
- memcpy(&val, &key, sizeof(double));
- return val;
-}
-
-khuint32_t PANDAS_INLINE asuint32(float key) {
- khuint32_t val;
- memcpy(&val, &key, sizeof(float));
- return val;
-}
-
-#define ZERO_HASH 0
-#define NAN_HASH 0
-
-khuint32_t PANDAS_INLINE kh_float64_hash_func(double val){
- // 0.0 and -0.0 should have the same hash:
- if (val == 0.0){
- return ZERO_HASH;
- }
- // all nans should have the same hash:
- if ( val!=val ){
- return NAN_HASH;
- }
- khuint64_t as_int = asuint64(val);
- return murmur2_64to32(as_int);
-}
-
-khuint32_t PANDAS_INLINE kh_float32_hash_func(float val){
- // 0.0 and -0.0 should have the same hash:
- if (val == 0.0f){
- return ZERO_HASH;
- }
- // all nans should have the same hash:
- if ( val!=val ){
- return NAN_HASH;
- }
- khuint32_t as_int = asuint32(val);
- return murmur2_32to32(as_int);
-}
-
-#define kh_floats_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a)))
-
-#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \
- KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_floats_hash_equal)
-
-KHASH_MAP_INIT_FLOAT64(float64, size_t)
-
-#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
- KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, kh_floats_hash_equal)
-
-KHASH_MAP_INIT_FLOAT32(float32, size_t)
-
-khint32_t PANDAS_INLINE kh_complex128_hash_func(khcomplex128_t val){
- return kh_float64_hash_func(val.real)^kh_float64_hash_func(val.imag);
-}
-khint32_t PANDAS_INLINE kh_complex64_hash_func(khcomplex64_t val){
- return kh_float32_hash_func(val.real)^kh_float32_hash_func(val.imag);
-}
-
-#define kh_complex_hash_equal(a, b) \
- (kh_floats_hash_equal(a.real, b.real) && kh_floats_hash_equal(a.imag, b.imag))
-
-
-#define KHASH_MAP_INIT_COMPLEX64(name, khval_t) \
- KHASH_INIT(name, khcomplex64_t, khval_t, 1, kh_complex64_hash_func, kh_complex_hash_equal)
-
-KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
-
-
-#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
- KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, kh_complex_hash_equal)
-
-KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
-
-
-#define kh_exist_complex64(h, k) (kh_exist(h, k))
-#define kh_exist_complex128(h, k) (kh_exist(h, k))
-
-
-// NaN-floats should be in the same equivalency class, see GH 22119
-int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){
- return (
- Py_IS_NAN(PyFloat_AS_DOUBLE(a)) &&
- Py_IS_NAN(PyFloat_AS_DOUBLE(b))
- )
- ||
- ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) );
-}
-
-
-// NaNs should be in the same equivalency class, see GH 41836
-// PyObject_RichCompareBool for complexobjects has a different behavior
-// needs to be replaced
-int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){
- return (
- Py_IS_NAN(a->cval.real) &&
- Py_IS_NAN(b->cval.real) &&
- Py_IS_NAN(a->cval.imag) &&
- Py_IS_NAN(b->cval.imag)
- )
- ||
- (
- Py_IS_NAN(a->cval.real) &&
- Py_IS_NAN(b->cval.real) &&
- a->cval.imag == b->cval.imag
- )
- ||
- (
- a->cval.real == b->cval.real &&
- Py_IS_NAN(a->cval.imag) &&
- Py_IS_NAN(b->cval.imag)
- )
- ||
- (
- a->cval.real == b->cval.real &&
- a->cval.imag == b->cval.imag
- );
-}
-
-int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b);
-
-
-// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN),
-// which treats NaNs as equivalent
-// see GH 41836
-int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){
- Py_ssize_t i;
-
- if (Py_SIZE(a) != Py_SIZE(b)) {
- return 0;
- }
-
- for (i = 0; i < Py_SIZE(a); ++i) {
- if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) {
- return 0;
- }
- }
- return 1;
-}
-
-
-int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) {
- if (a == b) {
- return 1;
- }
- if (Py_TYPE(a) == Py_TYPE(b)) {
- // special handling for some built-in types which could have NaNs
- // as we would like to have them equivalent, but the usual
- // PyObject_RichCompareBool would return False
- if (PyFloat_CheckExact(a)) {
- return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b);
- }
- if (PyComplex_CheckExact(a)) {
- return complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b);
- }
- if (PyTuple_CheckExact(a)) {
- return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b);
- }
- // frozenset isn't yet supported
- }
-
- int result = PyObject_RichCompareBool(a, b, Py_EQ);
- if (result < 0) {
- PyErr_Clear();
- return 0;
- }
- return result;
-}
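The comparison helpers above all exist to put NaN values into a single equivalence class, which plain `==` never does (NaN != NaN). A minimal standalone sketch of the same idea in plain C, with no CPython dependency; floats_equivalent is an illustrative name, not a pandas function:

#include <math.h>
#include <stdio.h>

/* Sketch of the NaN-aware equality used by the helpers above: two doubles
 * are considered equivalent if they compare equal OR both are NaN. */
static int floats_equivalent(double a, double b) {
    return (a == b) || (isnan(a) && isnan(b));
}

int main(void) {
    double x = nan("");
    double y = nan("");
    printf("%d\n", floats_equivalent(x, y));     /* 1: all NaNs grouped together */
    printf("%d\n", floats_equivalent(1.5, 1.5)); /* 1 */
    printf("%d\n", floats_equivalent(1.5, x));   /* 0 */
    return 0;
}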
-
-
-Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) {
- // Since Python 3.10, NaN no longer has hash 0, so return 0 explicitly here
- if (Py_IS_NAN(val)) {
- return 0;
- }
-#if PY_VERSION_HEX < 0x030A0000
- return _Py_HashDouble(val);
-#else
- return _Py_HashDouble(NULL, val);
-#endif
-}
-
-
-Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) {
- return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key));
-}
-
-
-#define _PandasHASH_IMAG 1000003UL
-
-// replaces _Py_HashDouble with _Pandas_HashDouble
-Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) {
- Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real);
- Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag);
- if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) {
- return -1;
- }
- Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash;
- if (combined == (Py_uhash_t)-1) {
- return -2;
- }
- return (Py_hash_t)combined;
-}
-
-
-khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key);
-
-// We could use any hashing algorithm; this one is CPython's original tuple hash
-
-#if SIZEOF_PY_UHASH_T > 4
-#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
-#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
-#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
-#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
-#else
-#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
-#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
-#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
-#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
-#endif
-
-Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) {
- Py_ssize_t i, len = Py_SIZE(key);
- PyObject **item = key->ob_item;
-
- Py_uhash_t acc = _PandasHASH_XXPRIME_5;
- for (i = 0; i < len; i++) {
- Py_uhash_t lane = kh_python_hash_func(item[i]);
- if (lane == (Py_uhash_t)-1) {
- return -1;
- }
- acc += lane * _PandasHASH_XXPRIME_2;
- acc = _PandasHASH_XXROTATE(acc);
- acc *= _PandasHASH_XXPRIME_1;
- }
-
- /* Add input length, mangled to keep the historical value of hash(()). */
- acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL);
-
- if (acc == (Py_uhash_t)-1) {
- return 1546275796;
- }
- return acc;
-}
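tupleobject_hash above mirrors CPython's xxHash-based tuple hash, with pandas' own element hash plugged in. Below is a standalone sketch of just the combine loop, specialized to 64-bit hashes (the SIZEOF_PY_UHASH_T > 4 branch); the constants are the XXPRIME values defined above, and combine_hashes is an illustrative name:

#include <stdint.h>
#include <stdio.h>

#define XXPRIME_1 11400714785074694791ULL
#define XXPRIME_2 14029467366897019727ULL
#define XXPRIME_5 2870177450012600261ULL
#define XXROTATE(x) (((x) << 31) | ((x) >> 33))   /* rotate left 31 bits */

/* Combine a flat array of 64-bit lane hashes the same way the tuple hash
 * above does: multiply, rotate, multiply, then mix in the length. */
static uint64_t combine_hashes(const uint64_t *lanes, size_t len) {
    uint64_t acc = XXPRIME_5;
    for (size_t i = 0; i < len; i++) {
        acc += lanes[i] * XXPRIME_2;
        acc = XXROTATE(acc);
        acc *= XXPRIME_1;
    }
    /* length is mangled in, as above, to keep hash(()) stable */
    acc += len ^ (XXPRIME_5 ^ 3527539UL);
    return acc;
}

int main(void) {
    uint64_t a[] = {1, 2, 3};
    uint64_t b[] = {3, 2, 1};
    /* order matters: permuted tuples hash differently */
    printf("%llu\n%llu\n",
           (unsigned long long)combine_hashes(a, 3),
           (unsigned long long)combine_hashes(b, 3));
    return 0;
}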
-
-
-khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) {
- Py_hash_t hash;
- // For PyObject_Hash the following holds:
- //     hash(0.0) == 0 == hash(-0.0)
- // yet different NaN objects may have different hash values
- if (PyFloat_CheckExact(key)) {
- // we cannot use kh_float64_hash_func
- // because float(k) == k holds for any int-object k
- // and kh_float64_hash_func doesn't respect it
- hash = floatobject_hash((PyFloatObject*)key);
- }
- else if (PyComplex_CheckExact(key)) {
- // we cannot use kh_complex128_hash_func
- // because complex(k,0) == k holds for any int-object k
- // and kh_complex128_hash_func doesn't respect it
- hash = complexobject_hash((PyComplexObject*)key);
- }
- else if (PyTuple_CheckExact(key)) {
- hash = tupleobject_hash((PyTupleObject*)key);
- }
- else {
- hash = PyObject_Hash(key);
- }
-
- if (hash == -1) {
- PyErr_Clear();
- return 0;
- }
- #if SIZEOF_PY_HASH_T == 4
- // it is already a 32-bit value
- return hash;
- #else
- // for 64-bit builds we need information
- // from the upper 32 bits as well
- // see GH 37615
- khuint64_t as_uint = (khuint64_t) hash;
- // uints avoid undefined behavior of signed ints
- return (as_uint>>32)^as_uint;
- #endif
-}
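The last step of kh_python_hash_func folds a 64-bit Python hash into the 32 bits khash expects by XORing the two halves, so the high bits still influence the bucket (GH 37615). A small standalone sketch of that fold; fold_hash64 is an illustrative name:

#include <stdint.h>
#include <stdio.h>

/* Fold a 64-bit hash into 32 bits by XORing the upper and lower halves.
 * Plain truncation would discard the high bits; unsigned arithmetic avoids
 * the undefined behavior of shifting signed values. */
static uint32_t fold_hash64(int64_t hash) {
    uint64_t as_uint = (uint64_t)hash;
    return (uint32_t)((as_uint >> 32) ^ as_uint);
}

int main(void) {
    /* Two hashes that agree in their low 32 bits but differ in the high bits
     * still map to different 32-bit values. */
    int64_t h1 = 0x0000000012345678LL;
    int64_t h2 = 0x0000000112345678LL;
    printf("%u %u\n", fold_hash64(h1), fold_hash64(h2));
    return 0;
}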
-
-
-#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b))
-
-
-// Python object
-
-typedef PyObject* kh_pyobject_t;
-
-#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \
- KHASH_INIT(name, kh_pyobject_t, khval_t, 1, \
- kh_python_hash_func, kh_python_hash_equal)
-
-KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t)
-
-#define KHASH_SET_INIT_PYOBJECT(name) \
- KHASH_INIT(name, kh_pyobject_t, char, 0, \
- kh_python_hash_func, kh_python_hash_equal)
-
-KHASH_SET_INIT_PYOBJECT(pyset)
-
-#define kh_exist_pymap(h, k) (kh_exist(h, k))
-#define kh_exist_pyset(h, k) (kh_exist(h, k))
-
-KHASH_MAP_INIT_STR(strbox, kh_pyobject_t)
-
-typedef struct {
- kh_str_t *table;
- int starts[256];
-} kh_str_starts_t;
-
-typedef kh_str_starts_t* p_kh_str_starts_t;
-
-p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) {
- kh_str_starts_t *result = (kh_str_starts_t*)KHASH_CALLOC(1, sizeof(kh_str_starts_t));
- result->table = kh_init_str();
- return result;
-}
-
-khuint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) {
- khuint_t result = kh_put_str(table->table, key, ret);
- if (*ret != 0) {
- table->starts[(unsigned char)key[0]] = 1;
- }
- return result;
-}
-
-khuint_t PANDAS_INLINE kh_get_str_starts_item(const kh_str_starts_t* table, const char* key) {
- unsigned char ch = *key;
- if (table->starts[ch]) {
- if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1;
- }
- return 0;
-}
-
-void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
- kh_destroy_str(table->table);
- KHASH_FREE(table);
-}
-
-void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
- kh_resize_str(table->table, val);
-}
-
-// utility function: given the number of elements,
-// returns the number of buckets needed
-khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
- khuint_t candidate = n_elements;
- kroundup32(candidate);
- khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
- return (upper_bound < n_elements) ? 2*candidate : candidate;
-
-}
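kh_needed_n_buckets rounds the element count up to the next power of two (kroundup32) and doubles it if that many buckets would exceed khash's load factor. A self-contained sketch of the same arithmetic, assuming khash's usual __ac_HASH_UPPER value of 0.77; the names here are illustrative:

#include <stdio.h>

/* Round up to the next power of two, as khash's kroundup32 does. */
static unsigned int next_pow2_u32(unsigned int x) {
    --x;
    x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
    return ++x;
}

/* If the rounded-up capacity would be filled past the load factor,
 * double it; otherwise it is already enough. */
static unsigned int needed_n_buckets(unsigned int n_elements) {
    unsigned int candidate = next_pow2_u32(n_elements);
    unsigned int upper_bound = (unsigned int)(candidate * 0.77 + 0.5);
    return (upper_bound < n_elements) ? 2 * candidate : candidate;
}

int main(void) {
    /* 100 elements: next power of two is 128, but 128 * 0.77 ~= 99 < 100,
     * so the table is doubled to 256 buckets. */
    printf("%u\n", needed_n_buckets(100));   /* 256 */
    printf("%u\n", needed_n_buckets(90));    /* 128 */
    return 0;
}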
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/parse_helper.h b/contrib/python/pandas/py3/pandas/_libs/src/parse_helper.h
deleted file mode 100644
index d161c4e29fe..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/parse_helper.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-*/
-
-#ifndef PANDAS__LIBS_SRC_PARSE_HELPER_H_
-#define PANDAS__LIBS_SRC_PARSE_HELPER_H_
-
-#include <float.h>
-#include "parser/tokenizer.h"
-
-int to_double(char *item, double *p_value, char sci, char decimal,
- int *maybe_int) {
- char *p_end = NULL;
- int error = 0;
-
- /* Switch to precise xstrtod GH 31364 */
- *p_value = precise_xstrtod(item, &p_end, decimal, sci, '\0', 1,
- &error, maybe_int);
-
- return (error == 0) && (!*p_end);
-}
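to_double above reports success only when the parser consumed the entire token without error. A minimal sketch of the same "whole token or nothing" contract, built on the standard library's strtod rather than pandas' precise_xstrtod; parse_whole_double is an illustrative name:

#include <stdio.h>
#include <stdlib.h>

/* Accept a string as a double only if at least one character was consumed
 * and nothing is left over afterwards. */
static int parse_whole_double(const char *item, double *out) {
    char *end = NULL;
    *out = strtod(item, &end);
    return end != item && *end == '\0';
}

int main(void) {
    double v;
    printf("%d\n", parse_whole_double("3.14", &v));    /* 1 */
    printf("%d\n", parse_whole_double("3.14abc", &v)); /* 0: trailing junk */
    printf("%d\n", parse_whole_double("abc", &v));     /* 0: nothing parsed */
    return 0;
}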
-
-int floatify(PyObject *str, double *result, int *maybe_int) {
- int status;
- char *data;
- PyObject *tmp = NULL;
- const char sci = 'E';
- const char dec = '.';
-
- if (PyBytes_Check(str)) {
- data = PyBytes_AS_STRING(str);
- } else if (PyUnicode_Check(str)) {
- tmp = PyUnicode_AsUTF8String(str);
- if (tmp == NULL) {
- return -1;
- }
- data = PyBytes_AS_STRING(tmp);
- } else {
- PyErr_SetString(PyExc_TypeError, "Invalid object type");
- return -1;
- }
-
- status = to_double(data, result, sci, dec, maybe_int);
-
- if (!status) {
- /* handle inf/-inf infinity/-infinity */
- if (strlen(data) == 3) {
- if (0 == strcasecmp(data, "inf")) {
- *result = HUGE_VAL;
- *maybe_int = 0;
- } else {
- goto parsingerror;
- }
- } else if (strlen(data) == 4) {
- if (0 == strcasecmp(data, "-inf")) {
- *result = -HUGE_VAL;
- *maybe_int = 0;
- } else if (0 == strcasecmp(data, "+inf")) {
- *result = HUGE_VAL;
- *maybe_int = 0;
- } else {
- goto parsingerror;
- }
- } else if (strlen(data) == 8) {
- if (0 == strcasecmp(data, "infinity")) {
- *result = HUGE_VAL;
- *maybe_int = 0;
- } else {
- goto parsingerror;
- }
- } else if (strlen(data) == 9) {
- if (0 == strcasecmp(data, "-infinity")) {
- *result = -HUGE_VAL;
- *maybe_int = 0;
- } else if (0 == strcasecmp(data, "+infinity")) {
- *result = HUGE_VAL;
- *maybe_int = 0;
- } else {
- goto parsingerror;
- }
- } else {
- goto parsingerror;
- }
- }
-
- Py_XDECREF(tmp);
- return 0;
-
-parsingerror:
- PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data);
- Py_XDECREF(tmp);
- return -1;
-}
-
-#endif // PANDAS__LIBS_SRC_PARSE_HELPER_H_
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/parser/io.c b/contrib/python/pandas/py3/pandas/_libs/src/parser/io.c
deleted file mode 100644
index 38304cca94a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/parser/io.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-*/
-
-#include "io.h"
-
-/*
- On-disk FILE, uncompressed
-*/
-
-void *new_rd_source(PyObject *obj) {
- rd_source *rds = (rd_source *)malloc(sizeof(rd_source));
-
- if (rds == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- /* hold on to this object */
- Py_INCREF(obj);
- rds->obj = obj;
- rds->buffer = NULL;
- rds->position = 0;
-
- return (void *)rds;
-}
-
-/*
-
- Cleanup callbacks
-
- */
-
-int del_rd_source(void *rds) {
- Py_XDECREF(RDS(rds)->obj);
- Py_XDECREF(RDS(rds)->buffer);
- free(rds);
-
- return 0;
-}
-
-/*
-
- IO callbacks
-
- */
-
-void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
- int *status, const char *encoding_errors) {
- PyGILState_STATE state;
- PyObject *result, *func, *args, *tmp;
-
- void *retval;
-
- size_t length;
- rd_source *src = RDS(source);
- state = PyGILState_Ensure();
-
- /* delete old object */
- Py_XDECREF(src->buffer);
- src->buffer = NULL;
- args = Py_BuildValue("(i)", nbytes);
-
- func = PyObject_GetAttrString(src->obj, "read");
-
- /* Note: PyObject_CallObject requires the GIL */
- result = PyObject_CallObject(func, args);
- Py_XDECREF(args);
- Py_XDECREF(func);
-
- if (result == NULL) {
- PyGILState_Release(state);
- *bytes_read = 0;
- *status = CALLING_READ_FAILED;
- return NULL;
- } else if (!PyBytes_Check(result)) {
- tmp = PyUnicode_AsEncodedString(result, "utf-8", encoding_errors);
- Py_DECREF(result);
- if (tmp == NULL) {
- PyGILState_Release(state);
- return NULL;
- }
- result = tmp;
- }
-
- length = PySequence_Length(result);
-
- if (length == 0)
- *status = REACHED_EOF;
- else
- *status = 0;
-
- /* hang on to the Python object */
- src->buffer = result;
- retval = (void *)PyBytes_AsString(result);
-
- PyGILState_Release(state);
-
- /* TODO: more error handling */
- *bytes_read = length;
-
- return retval;
-}
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/parser/io.h b/contrib/python/pandas/py3/pandas/_libs/src/parser/io.h
deleted file mode 100644
index f0e8b018553..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/parser/io.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-*/
-
-#ifndef PANDAS__LIBS_SRC_PARSER_IO_H_
-#define PANDAS__LIBS_SRC_PARSER_IO_H_
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-#include "tokenizer.h"
-
-#define FS(source) ((file_source *)source)
-
-typedef struct _rd_source {
- PyObject *obj;
- PyObject *buffer;
- size_t position;
-} rd_source;
-
-#define RDS(source) ((rd_source *)source)
-
-void *new_rd_source(PyObject *obj);
-
-int del_rd_source(void *src);
-
-void *buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read,
- int *status, const char *encoding_errors);
-
-#endif // PANDAS__LIBS_SRC_PARSER_IO_H_
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.c b/contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.c
deleted file mode 100644
index c337c3eaf13..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.c
+++ /dev/null
@@ -1,2085 +0,0 @@
-/*
-
-Copyright (c) 2012, Lambda Foundry, Inc., except where noted
-
-Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
-BSD
-
-See LICENSE for the license
-
-*/
-
-/*
-
-Low-level ascii-file processing for pandas. Combines some elements from
-Python's built-in csv module and Warren Weckesser's textreader project on
-GitHub. See Python Software Foundation License and BSD licenses for these.
-
-*/
-
-#include "tokenizer.h"
-
-#include <ctype.h>
-#include <float.h>
-#include <math.h>
-
-#include "../headers/portable.h"
-
-void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
- int64_t start) {
- // column i, starting at 0
- self->words = parser->words;
- self->col = i;
- self->line_start = parser->line_start + start;
-}
-
-static void free_if_not_null(void **ptr) {
- TRACE(("free_if_not_null %p\n", *ptr))
- if (*ptr != NULL) {
- free(*ptr);
- *ptr = NULL;
- }
-}
-
-/*
-
- Parser / tokenizer
-
-*/
-
-static void *grow_buffer(void *buffer, uint64_t length, uint64_t *capacity,
- int64_t space, int64_t elsize, int *error) {
- uint64_t cap = *capacity;
- void *newbuffer = buffer;
-
- // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
- while ((length + space >= cap) && (newbuffer != NULL)) {
- cap = cap ? cap << 1 : 2;
- buffer = newbuffer;
- newbuffer = realloc(newbuffer, elsize * cap);
- }
-
- if (newbuffer == NULL) {
- // realloc failed so don't change *capacity, set *error to errno
- // and return the last good realloc'd buffer so it can be freed
- *error = errno;
- newbuffer = buffer;
- } else {
- // realloc worked, update *capacity and set *error to 0
- // sigh, multiple return values
- *capacity = cap;
- *error = 0;
- }
- return newbuffer;
-}
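grow_buffer grows a buffer geometrically, doubling the capacity until length + space fits, and on realloc failure it hands back the last good buffer so the caller can still free it. A standalone sketch of that strategy with illustrative names:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Double `cap` until `length + space` fits; on realloc failure keep the
 * last successfully allocated buffer and report the error. */
static void *grow(void *buffer, size_t length, size_t *capacity,
                  size_t space, size_t elsize, int *error) {
    size_t cap = *capacity;
    void *newbuffer = buffer;

    while (length + space >= cap && newbuffer != NULL) {
        cap = cap ? cap << 1 : 2;
        buffer = newbuffer;
        newbuffer = realloc(newbuffer, elsize * cap);
    }

    if (newbuffer == NULL) {
        *error = errno;    /* caller still owns the old `buffer` */
        return buffer;
    }
    *capacity = cap;
    *error = 0;
    return newbuffer;
}

int main(void) {
    size_t cap = 4;
    int error = 0;
    char *buf = malloc(cap);
    buf = grow(buf, 0, &cap, 100, sizeof(char), &error);
    printf("capacity grown to %zu (error=%d)\n", cap, error);  /* 128 */
    free(buf);
    return 0;
}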
-
-void parser_set_default_options(parser_t *self) {
- self->decimal = '.';
- self->sci = 'E';
-
- // For tokenization
- self->state = START_RECORD;
-
- self->delimiter = ','; // XXX
- self->delim_whitespace = 0;
-
- self->doublequote = 0;
- self->quotechar = '"';
- self->escapechar = 0;
-
- self->lineterminator = '\0'; /* NUL->standard logic */
-
- self->skipinitialspace = 0;
- self->quoting = QUOTE_MINIMAL;
- self->allow_embedded_newline = 1;
-
- self->expected_fields = -1;
- self->on_bad_lines = ERROR;
-
- self->commentchar = '#';
- self->thousands = '\0';
-
- self->skipset = NULL;
- self->skipfunc = NULL;
- self->skip_first_N_rows = -1;
- self->skip_footer = 0;
-}
-
-parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); }
-
-int parser_clear_data_buffers(parser_t *self) {
- free_if_not_null((void *)&self->stream);
- free_if_not_null((void *)&self->words);
- free_if_not_null((void *)&self->word_starts);
- free_if_not_null((void *)&self->line_start);
- free_if_not_null((void *)&self->line_fields);
- return 0;
-}
-
-int parser_cleanup(parser_t *self) {
- int status = 0;
-
- // XXX where to put this
- free_if_not_null((void *)&self->error_msg);
- free_if_not_null((void *)&self->warn_msg);
-
- if (self->skipset != NULL) {
- kh_destroy_int64((kh_int64_t *)self->skipset);
- self->skipset = NULL;
- }
-
- if (parser_clear_data_buffers(self) < 0) {
- status = -1;
- }
-
- if (self->cb_cleanup != NULL) {
- if (self->cb_cleanup(self->source) < 0) {
- status = -1;
- }
- self->cb_cleanup = NULL;
- }
-
- return status;
-}
-
-int parser_init(parser_t *self) {
- int64_t sz;
-
- /*
- Initialize data buffers
- */
-
- self->stream = NULL;
- self->words = NULL;
- self->word_starts = NULL;
- self->line_start = NULL;
- self->line_fields = NULL;
- self->error_msg = NULL;
- self->warn_msg = NULL;
-
- // token stream
- self->stream = malloc(STREAM_INIT_SIZE * sizeof(char));
- if (self->stream == NULL) {
- parser_cleanup(self);
- return PARSER_OUT_OF_MEMORY;
- }
- self->stream_cap = STREAM_INIT_SIZE;
- self->stream_len = 0;
-
- // word pointers and metadata
- sz = STREAM_INIT_SIZE / 10;
- sz = sz ? sz : 1;
- self->words = malloc(sz * sizeof(char *));
- self->word_starts = malloc(sz * sizeof(int64_t));
- self->max_words_cap = sz;
- self->words_cap = sz;
- self->words_len = 0;
-
- // line pointers and metadata
- self->line_start = malloc(sz * sizeof(int64_t));
-
- self->line_fields = malloc(sz * sizeof(int64_t));
-
- self->lines_cap = sz;
- self->lines = 0;
- self->file_lines = 0;
-
- if (self->stream == NULL || self->words == NULL ||
- self->word_starts == NULL || self->line_start == NULL ||
- self->line_fields == NULL) {
- parser_cleanup(self);
-
- return PARSER_OUT_OF_MEMORY;
- }
-
- /* amount of bytes buffered */
- self->datalen = 0;
- self->datapos = 0;
-
- self->line_start[0] = 0;
- self->line_fields[0] = 0;
-
- self->pword_start = self->stream;
- self->word_start = 0;
-
- self->state = START_RECORD;
-
- self->error_msg = NULL;
- self->warn_msg = NULL;
-
- self->commentchar = '\0';
-
- return 0;
-}
-
-void parser_free(parser_t *self) {
- // opposite of parser_init
- parser_cleanup(self);
-}
-
-void parser_del(parser_t *self) {
- free(self);
-}
-
-static int make_stream_space(parser_t *self, size_t nbytes) {
- uint64_t i, cap, length;
- int status;
- void *orig_ptr, *newptr;
-
- // Can we fit potentially nbytes tokens (+ null terminators) in the stream?
-
- /*
- TOKEN STREAM
- */
-
- orig_ptr = (void *)self->stream;
- TRACE(
- ("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n",
- nbytes))
- self->stream = (char *)grow_buffer((void *)self->stream, self->stream_len,
- &self->stream_cap, nbytes * 2,
- sizeof(char), &status);
- TRACE(
- ("make_stream_space: self->stream=%p, self->stream_len = %zu, "
- "self->stream_cap=%zu, status=%zu\n",
- self->stream, self->stream_len, self->stream_cap, status))
-
- if (status != 0) {
- return PARSER_OUT_OF_MEMORY;
- }
-
- // realloc sets errno when moving buffer?
- if (self->stream != orig_ptr) {
- self->pword_start = self->stream + self->word_start;
-
- for (i = 0; i < self->words_len; ++i) {
- self->words[i] = self->stream + self->word_starts[i];
- }
- }
-
- /*
- WORD VECTORS
- */
-
- cap = self->words_cap;
-
- /**
- * If we are reading in chunks, we need to be aware of the maximum number
- * of words we have seen in previous chunks (self->max_words_cap), so
- * that way, we can properly allocate when reading subsequent ones.
- *
- * Otherwise, we risk a buffer overflow if we mistakenly under-allocate
- * just because a recent chunk did not have as many words.
- */
- if (self->words_len + nbytes < self->max_words_cap) {
- length = self->max_words_cap - nbytes - 1;
- } else {
- length = self->words_len;
- }
-
- self->words =
- (char **)grow_buffer((void *)self->words, length,
- &self->words_cap, nbytes,
- sizeof(char *), &status);
- TRACE(
- ("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, "
- "%d)\n",
- self->words_len, self->words_cap, nbytes, status))
- if (status != 0) {
- return PARSER_OUT_OF_MEMORY;
- }
-
- // realloc took place
- if (cap != self->words_cap) {
- TRACE(
- ("make_stream_space: cap != self->words_cap, nbytes = %d, "
- "self->words_cap=%d\n",
- nbytes, self->words_cap))
- newptr = realloc((void *)self->word_starts,
- sizeof(int64_t) * self->words_cap);
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->word_starts = (int64_t *)newptr;
- }
- }
-
- /*
- LINE VECTORS
- */
- cap = self->lines_cap;
- self->line_start =
- (int64_t *)grow_buffer((void *)self->line_start, self->lines + 1,
- &self->lines_cap, nbytes,
- sizeof(int64_t), &status);
- TRACE((
- "make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n",
- self->lines + 1, self->lines_cap, nbytes, status))
- if (status != 0) {
- return PARSER_OUT_OF_MEMORY;
- }
-
- // realloc took place
- if (cap != self->lines_cap) {
- TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n",
- nbytes))
- newptr = realloc((void *)self->line_fields,
- sizeof(int64_t) * self->lines_cap);
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->line_fields = (int64_t *)newptr;
- }
- }
-
- return 0;
-}
-
-static int push_char(parser_t *self, char c) {
- TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n",
- self->stream_len + 1, c, self->stream_cap))
- if (self->stream_len >= self->stream_cap) {
- TRACE(
- ("push_char: ERROR!!! self->stream_len(%d) >= "
- "self->stream_cap(%d)\n",
- self->stream_len, self->stream_cap))
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Buffer overflow caught - possible malformed input file.\n");
- return PARSER_OUT_OF_MEMORY;
- }
- self->stream[self->stream_len++] = c;
- return 0;
-}
-
-int PANDAS_INLINE end_field(parser_t *self) {
- // XXX cruft
- if (self->words_len >= self->words_cap) {
- TRACE(
- ("end_field: ERROR!!! self->words_len(%zu) >= "
- "self->words_cap(%zu)\n",
- self->words_len, self->words_cap))
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Buffer overflow caught - possible malformed input file.\n");
- return PARSER_OUT_OF_MEMORY;
- }
-
- // null terminate token
- push_char(self, '\0');
-
- // set pointer and metadata
- self->words[self->words_len] = self->pword_start;
-
- TRACE(("end_field: Char diff: %d\n", self->pword_start - self->words[0]));
-
- TRACE(("end_field: Saw word %s at: %d. Total: %d\n", self->pword_start,
- self->word_start, self->words_len + 1))
-
- self->word_starts[self->words_len] = self->word_start;
- self->words_len++;
-
- // increment line field count
- self->line_fields[self->lines]++;
-
- // New field begins in the stream
- self->pword_start = self->stream + self->stream_len;
- self->word_start = self->stream_len;
-
- return 0;
-}
-
-static void append_warning(parser_t *self, const char *msg) {
- int64_t ex_length;
- int64_t length = strlen(msg);
- void *newptr;
-
- if (self->warn_msg == NULL) {
- self->warn_msg = malloc(length + 1);
- snprintf(self->warn_msg, length + 1, "%s", msg);
- } else {
- ex_length = strlen(self->warn_msg);
- newptr = realloc(self->warn_msg, ex_length + length + 1);
- if (newptr != NULL) {
- self->warn_msg = (char *)newptr;
- snprintf(self->warn_msg + ex_length, length + 1, "%s", msg);
- }
- }
-}
-
-static int end_line(parser_t *self) {
- char *msg;
- int64_t fields;
- int64_t ex_fields = self->expected_fields;
- int64_t bufsize = 100; // for error or warning messages
-
- fields = self->line_fields[self->lines];
-
- TRACE(("end_line: Line end, nfields: %d\n", fields));
-
- TRACE(("end_line: lines: %d\n", self->lines));
- if (self->lines > 0) {
- if (self->expected_fields >= 0) {
- ex_fields = self->expected_fields;
- } else {
- ex_fields = self->line_fields[self->lines - 1];
- }
- }
- TRACE(("end_line: ex_fields: %d\n", ex_fields));
-
- if (self->state == START_FIELD_IN_SKIP_LINE ||
- self->state == IN_FIELD_IN_SKIP_LINE ||
- self->state == IN_QUOTED_FIELD_IN_SKIP_LINE ||
- self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE) {
- TRACE(("end_line: Skipping row %d\n", self->file_lines));
- // increment file line count
- self->file_lines++;
-
- // skip the tokens from this bad line
- self->line_start[self->lines] += fields;
-
- // reset field count
- self->line_fields[self->lines] = 0;
- return 0;
- }
-
- if (!(self->lines <= self->header_end + 1) &&
- (fields > ex_fields) && !(self->usecols)) {
- // increment file line count
- self->file_lines++;
-
- // skip the tokens from this bad line
- self->line_start[self->lines] += fields;
-
- // reset field count
- self->line_fields[self->lines] = 0;
-
- // file_lines is now the actual file line number (starting at 1)
- if (self->on_bad_lines == ERROR) {
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Expected %" PRId64 " fields in line %" PRIu64 ", saw %"
- PRId64 "\n", ex_fields, self->file_lines, fields);
-
- TRACE(("Error at line %d, %d fields\n", self->file_lines, fields));
-
- return -1;
- } else {
- // simply skip bad lines
- if (self->on_bad_lines == WARN) {
- // pass up error message
- msg = malloc(bufsize);
- snprintf(msg, bufsize,
- "Skipping line %" PRIu64 ": expected %" PRId64
- " fields, saw %" PRId64 "\n",
- self->file_lines, ex_fields, fields);
- append_warning(self, msg);
- free(msg);
- }
- }
- } else {
- // missing trailing delimiters
- if ((self->lines >= self->header_end + 1) &&
- fields < ex_fields) {
- // might overrun the buffer when closing fields
- if (make_stream_space(self, ex_fields - fields) < 0) {
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize, "out of memory");
- return -1;
- }
-
- while (fields < ex_fields) {
- end_field(self);
- fields++;
- }
- }
-
- // increment both line counts
- self->file_lines++;
- self->lines++;
-
- // good line, set new start point
- if (self->lines >= self->lines_cap) {
- TRACE((
- "end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n",
- self->lines, self->lines_cap))
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "Buffer overflow caught - "
- "possible malformed input file.\n");
- return PARSER_OUT_OF_MEMORY;
- }
- self->line_start[self->lines] =
- (self->line_start[self->lines - 1] + fields);
-
- TRACE(
- ("end_line: new line start: %d\n", self->line_start[self->lines]));
-
- // new line start with 0 fields
- self->line_fields[self->lines] = 0;
- }
-
- TRACE(("end_line: Finished line, at %d\n", self->lines));
-
- return 0;
-}
-
-int parser_add_skiprow(parser_t *self, int64_t row) {
- khiter_t k;
- kh_int64_t *set;
- int ret = 0;
-
- if (self->skipset == NULL) {
- self->skipset = (void *)kh_init_int64();
- }
-
- set = (kh_int64_t *)self->skipset;
-
- k = kh_put_int64(set, row, &ret);
- set->keys[k] = row;
-
- return 0;
-}
-
-int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
- // self->file_lines is zero based so subtract 1 from nrows
- if (nrows > 0) {
- self->skip_first_N_rows = nrows - 1;
- }
-
- return 0;
-}
-
-static int parser_buffer_bytes(parser_t *self, size_t nbytes,
- const char *encoding_errors) {
- int status;
- size_t bytes_read;
-
- status = 0;
- self->datapos = 0;
- self->data = self->cb_io(self->source, nbytes, &bytes_read, &status,
- encoding_errors);
- TRACE((
- "parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n",
- nbytes, bytes_read, status));
- self->datalen = bytes_read;
-
- if (status != REACHED_EOF && self->data == NULL) {
- int64_t bufsize = 200;
- self->error_msg = malloc(bufsize);
-
- if (status == CALLING_READ_FAILED) {
- snprintf(self->error_msg, bufsize,
- "Calling read(nbytes) on source failed. "
- "Try engine='python'.");
- } else {
- snprintf(self->error_msg, bufsize, "Unknown error in IO callback");
- }
- return -1;
- }
-
- TRACE(("datalen: %d\n", self->datalen));
-
- return status;
-}
-
-/*
-
- Tokenization macros and state machine code
-
-*/
-
-#define PUSH_CHAR(c) \
- TRACE( \
- ("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", \
- c, slen, self->stream_cap, self->stream_len)) \
- if (slen >= self->stream_cap) { \
- TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= stream_cap(%d)\n", slen, \
- self->stream_cap)) \
- int64_t bufsize = 100; \
- self->error_msg = malloc(bufsize); \
- snprintf(self->error_msg, bufsize, \
- "Buffer overflow caught - possible malformed input file.\n");\
- return PARSER_OUT_OF_MEMORY; \
- } \
- *stream++ = c; \
- slen++;
-
-// This is a little bit of a hack but works for now
-
-#define END_FIELD() \
- self->stream_len = slen; \
- if (end_field(self) < 0) { \
- goto parsingerror; \
- } \
- stream = self->stream + self->stream_len; \
- slen = self->stream_len;
-
-#define END_LINE_STATE(STATE) \
- self->stream_len = slen; \
- if (end_line(self) < 0) { \
- goto parsingerror; \
- } \
- stream = self->stream + self->stream_len; \
- slen = self->stream_len; \
- self->state = STATE; \
- if (line_limit > 0 && self->lines == start_lines + line_limit) { \
- goto linelimit; \
- }
-
-#define END_LINE_AND_FIELD_STATE(STATE) \
- self->stream_len = slen; \
- if (end_line(self) < 0) { \
- goto parsingerror; \
- } \
- if (end_field(self) < 0) { \
- goto parsingerror; \
- } \
- stream = self->stream + self->stream_len; \
- slen = self->stream_len; \
- self->state = STATE; \
- if (line_limit > 0 && self->lines == start_lines + line_limit) { \
- goto linelimit; \
- }
-
-#define END_LINE() END_LINE_STATE(START_RECORD)
-
-#define IS_TERMINATOR(c) \
- (c == lineterminator)
-
-#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
-
-// don't parse '\r' with a custom line terminator
-#define IS_CARRIAGE(c) (c == carriage_symbol)
-
-#define IS_COMMENT_CHAR(c) (c == comment_symbol)
-
-#define IS_ESCAPE_CHAR(c) (c == escape_symbol)
-
-#define IS_SKIPPABLE_SPACE(c) \
- ((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
-
-// applied when in a field
-#define IS_DELIMITER(c) \
- ((!self->delim_whitespace && c == self->delimiter) || \
- (self->delim_whitespace && isblank(c)))
-
-#define _TOKEN_CLEANUP() \
- self->stream_len = slen; \
- self->datapos = i; \
- TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, \
- self->datalen));
-
-#define CHECK_FOR_BOM() \
- if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
- buf += 3; \
- self->datapos += 3; \
- }
-
-int skip_this_line(parser_t *self, int64_t rownum) {
- int should_skip;
- PyObject *result;
- PyGILState_STATE state;
-
- if (self->skipfunc != NULL) {
- state = PyGILState_Ensure();
- result = PyObject_CallFunction(self->skipfunc, "i", rownum);
-
- // Error occurred. It will be processed
- // and caught at the Cython level.
- if (result == NULL) {
- should_skip = -1;
- } else {
- should_skip = PyObject_IsTrue(result);
- }
-
- Py_XDECREF(result);
- PyGILState_Release(state);
-
- return should_skip;
- } else if (self->skipset != NULL) {
- return (kh_get_int64((kh_int64_t *)self->skipset, self->file_lines) !=
- ((kh_int64_t *)self->skipset)->n_buckets);
- } else {
- return (rownum <= self->skip_first_N_rows);
- }
-}
-
-int tokenize_bytes(parser_t *self,
- size_t line_limit, uint64_t start_lines) {
- int64_t i;
- uint64_t slen;
- int should_skip;
- char c;
- char *stream;
- char *buf = self->data + self->datapos;
-
- const char lineterminator = (self->lineterminator == '\0') ?
- '\n' : self->lineterminator;
-
- // 1000 is a value that cannot fit in a "char",
- // so comparing a char against it is always "false"
- const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
- const int comment_symbol = (self->commentchar != '\0') ?
- self->commentchar : 1000;
- const int escape_symbol = (self->escapechar != '\0') ?
- self->escapechar : 1000;
-
- if (make_stream_space(self, self->datalen - self->datapos) < 0) {
- int64_t bufsize = 100;
- self->error_msg = malloc(bufsize);
- snprintf(self->error_msg, bufsize, "out of memory");
- return -1;
- }
-
- stream = self->stream + self->stream_len;
- slen = self->stream_len;
-
- TRACE(("%s\n", buf));
-
- if (self->file_lines == 0) {
- CHECK_FOR_BOM();
- }
-
- for (i = self->datapos; i < self->datalen; ++i) {
- // next character in file
- c = *buf++;
-
- TRACE(
- ("tokenize_bytes - Iter: %d Char: 0x%x Line %d field_count %d, "
- "state %d\n",
- i, c, self->file_lines + 1, self->line_fields[self->lines],
- self->state));
-
- switch (self->state) {
- case START_FIELD_IN_SKIP_LINE:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else if (IS_QUOTE(c)) {
- self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else if (IS_DELIMITER(c)) {
- // Do nothing, we're starting a new field again.
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
- }
- break;
-
- case IN_FIELD_IN_SKIP_LINE:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else if (IS_DELIMITER(c)) {
- self->state = START_FIELD_IN_SKIP_LINE;
- }
- break;
-
- case IN_QUOTED_FIELD_IN_SKIP_LINE:
- if (IS_QUOTE(c)) {
- if (self->doublequote) {
- self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
- }
- }
- break;
-
- case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE:
- if (IS_QUOTE(c)) {
- self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else if (IS_DELIMITER(c)) {
- self->state = START_FIELD_IN_SKIP_LINE;
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
- }
- break;
-
- case WHITESPACE_LINE:
- if (IS_TERMINATOR(c)) {
- self->file_lines++;
- self->state = START_RECORD;
- break;
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- break;
- } else if (!self->delim_whitespace) {
- if (isblank(c) && c != self->delimiter) {
- } else { // backtrack
- // use i + 1 because buf has been incremented but not i
- do {
- --buf;
- --i;
- } while (i + 1 > self->datapos && !IS_TERMINATOR(*buf));
-
- // reached a newline rather than the beginning
- if (IS_TERMINATOR(*buf)) {
- ++buf; // move pointer to first char after newline
- ++i;
- }
- self->state = START_FIELD;
- }
- break;
- }
- // fall through
-
- case EAT_WHITESPACE:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- self->state = START_RECORD;
- break;
- } else if (IS_CARRIAGE(c)) {
- self->state = EAT_CRNL;
- break;
- } else if (IS_COMMENT_CHAR(c)) {
- self->state = EAT_COMMENT;
- break;
- } else if (!isblank(c)) {
- self->state = START_FIELD;
- // fall through to subsequent state
- } else {
- // if whitespace char, keep slurping
- break;
- }
-
- case START_RECORD:
- // start of record
- should_skip = skip_this_line(self, self->file_lines);
-
- if (should_skip == -1) {
- goto parsingerror;
- } else if (should_skip) {
- if (IS_QUOTE(c)) {
- self->state = IN_QUOTED_FIELD_IN_SKIP_LINE;
- } else {
- self->state = IN_FIELD_IN_SKIP_LINE;
-
- if (IS_TERMINATOR(c)) {
- END_LINE();
- }
- }
- break;
- } else if (IS_TERMINATOR(c)) {
- // \n\r possible?
- if (self->skip_empty_lines) {
- self->file_lines++;
- } else {
- END_LINE();
- }
- break;
- } else if (IS_CARRIAGE(c)) {
- if (self->skip_empty_lines) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- } else {
- self->state = EAT_CRNL;
- }
- break;
- } else if (IS_COMMENT_CHAR(c)) {
- self->state = EAT_LINE_COMMENT;
- break;
- } else if (isblank(c)) {
- if (self->delim_whitespace) {
- if (self->skip_empty_lines) {
- self->state = WHITESPACE_LINE;
- } else {
- self->state = EAT_WHITESPACE;
- }
- break;
- } else if (c != self->delimiter && self->skip_empty_lines) {
- self->state = WHITESPACE_LINE;
- break;
- }
- // fall through
- }
-
- // normal character - fall through
- // to handle as START_FIELD
- self->state = START_FIELD;
-
- case START_FIELD:
- // expecting field
- if (IS_TERMINATOR(c)) {
- END_FIELD();
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- END_FIELD();
- self->state = EAT_CRNL;
- } else if (IS_QUOTE(c)) {
- // start quoted field
- self->state = IN_QUOTED_FIELD;
- } else if (IS_ESCAPE_CHAR(c)) {
- // possible escaped character
- self->state = ESCAPED_CHAR;
- } else if (IS_SKIPPABLE_SPACE(c)) {
- // ignore space at start of field
- } else if (IS_DELIMITER(c)) {
- if (self->delim_whitespace) {
- self->state = EAT_WHITESPACE;
- } else {
- // save empty field
- END_FIELD();
- }
- } else if (IS_COMMENT_CHAR(c)) {
- END_FIELD();
- self->state = EAT_COMMENT;
- } else {
- // begin new unquoted field
- PUSH_CHAR(c);
- self->state = IN_FIELD;
- }
- break;
-
- case ESCAPED_CHAR:
- PUSH_CHAR(c);
- self->state = IN_FIELD;
- break;
-
- case EAT_LINE_COMMENT:
- if (IS_TERMINATOR(c)) {
- self->file_lines++;
- self->state = START_RECORD;
- } else if (IS_CARRIAGE(c)) {
- self->file_lines++;
- self->state = EAT_CRNL_NOP;
- }
- break;
-
- case IN_FIELD:
- // in unquoted field
- if (IS_TERMINATOR(c)) {
- END_FIELD();
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- END_FIELD();
- self->state = EAT_CRNL;
- } else if (IS_ESCAPE_CHAR(c)) {
- // possible escaped character
- self->state = ESCAPED_CHAR;
- } else if (IS_DELIMITER(c)) {
- // end of field - end of line not reached yet
- END_FIELD();
-
- if (self->delim_whitespace) {
- self->state = EAT_WHITESPACE;
- } else {
- self->state = START_FIELD;
- }
- } else if (IS_COMMENT_CHAR(c)) {
- END_FIELD();
- self->state = EAT_COMMENT;
- } else {
- // normal character - save in field
- PUSH_CHAR(c);
- }
- break;
-
- case IN_QUOTED_FIELD:
- // in quoted field
- if (IS_ESCAPE_CHAR(c)) {
- // possible escape character
- self->state = ESCAPE_IN_QUOTED_FIELD;
- } else if (IS_QUOTE(c)) {
- if (self->doublequote) {
- // double quote - " represented by ""
- self->state = QUOTE_IN_QUOTED_FIELD;
- } else {
- // end of quote part of field
- self->state = IN_FIELD;
- }
- } else {
- // normal character - save in field
- PUSH_CHAR(c);
- }
- break;
-
- case ESCAPE_IN_QUOTED_FIELD:
- PUSH_CHAR(c);
- self->state = IN_QUOTED_FIELD;
- break;
-
- case QUOTE_IN_QUOTED_FIELD:
- // double quote - seen a quote in a quoted field
- if (IS_QUOTE(c)) {
- // save "" as "
-
- PUSH_CHAR(c);
- self->state = IN_QUOTED_FIELD;
- } else if (IS_DELIMITER(c)) {
- // end of field - end of line not reached yet
- END_FIELD();
-
- if (self->delim_whitespace) {
- self->state = EAT_WHITESPACE;
- } else {
- self->state = START_FIELD;
- }
- } else if (IS_TERMINATOR(c)) {
- END_FIELD();
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- END_FIELD();
- self->state = EAT_CRNL;
- } else {
- PUSH_CHAR(c);
- self->state = IN_FIELD;
- }
- break;
-
- case EAT_COMMENT:
- if (IS_TERMINATOR(c)) {
- END_LINE();
- } else if (IS_CARRIAGE(c)) {
- self->state = EAT_CRNL;
- }
- break;
-
- // only occurs with non-custom line terminator,
- // which is why we directly check for '\n'
- case EAT_CRNL:
- if (c == '\n') {
- END_LINE();
- } else if (IS_DELIMITER(c)) {
- if (self->delim_whitespace) {
- END_LINE_STATE(EAT_WHITESPACE);
- } else {
- // Handle \r-delimited files
- END_LINE_AND_FIELD_STATE(START_FIELD);
- }
- } else {
- if (self->delim_whitespace) {
- /* XXX
- * first character of a new record--need to back up and
- * reread to handle properly...
- */
- i--;
- buf--; // back up one character (HACK!)
- END_LINE_STATE(START_RECORD);
- } else {
- // \r line terminator
- // UGH. we don't actually want
- // to consume the token. fix this later
- self->stream_len = slen;
- if (end_line(self) < 0) {
- goto parsingerror;
- }
-
- stream = self->stream + self->stream_len;
- slen = self->stream_len;
- self->state = START_RECORD;
-
- --i;
- buf--; // let's try this character again (HACK!)
- if (line_limit > 0 &&
- self->lines == start_lines + line_limit) {
- goto linelimit;
- }
- }
- }
- break;
-
- // only occurs with non-custom line terminator,
- // which is why we directly check for '\n'
- case EAT_CRNL_NOP: // inside an ignored comment line
- self->state = START_RECORD;
- // \r line terminator -- parse this character again
- if (c != '\n' && !IS_DELIMITER(c)) {
- --i;
- --buf;
- }
- break;
- default:
- break;
- }
- }
-
- _TOKEN_CLEANUP();
-
- TRACE(("Finished tokenizing input\n"))
-
- return 0;
-
-parsingerror:
- i++;
- _TOKEN_CLEANUP();
-
- return -1;
-
-linelimit:
- i++;
- _TOKEN_CLEANUP();
-
- return 0;
-}
-
-static int parser_handle_eof(parser_t *self) {
- int64_t bufsize = 100;
-
- TRACE(
- ("handling eof, datalen: %d, pstate: %d\n", self->datalen, self->state))
-
- if (self->datalen != 0) return -1;
-
- switch (self->state) {
- case START_RECORD:
- case WHITESPACE_LINE:
- case EAT_CRNL_NOP:
- case EAT_LINE_COMMENT:
- return 0;
-
- case ESCAPE_IN_QUOTED_FIELD:
- case IN_QUOTED_FIELD:
- self->error_msg = (char *)malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "EOF inside string starting at row %" PRIu64,
- self->file_lines);
- return -1;
-
- case ESCAPED_CHAR:
- self->error_msg = (char *)malloc(bufsize);
- snprintf(self->error_msg, bufsize,
- "EOF following escape character");
- return -1;
-
- case IN_FIELD:
- case START_FIELD:
- case QUOTE_IN_QUOTED_FIELD:
- if (end_field(self) < 0) return -1;
- break;
-
- default:
- break;
- }
-
- if (end_line(self) < 0)
- return -1;
- else
- return 0;
-}
-
-int parser_consume_rows(parser_t *self, size_t nrows) {
- int64_t offset, word_deletions;
- uint64_t char_count, i;
-
- if (nrows > self->lines) {
- nrows = self->lines;
- }
-
- /* do nothing */
- if (nrows == 0) return 0;
-
- /* cannot guarantee that nrows + 1 has been observed */
- word_deletions = self->line_start[nrows - 1] + self->line_fields[nrows - 1];
- if (word_deletions >= 1) {
- char_count = (self->word_starts[word_deletions - 1] +
- strlen(self->words[word_deletions - 1]) + 1);
- } else {
- /* if word_deletions == 0 (i.e. this case) then char_count must
- * be 0 too, as no data needs to be skipped */
- char_count = 0;
- }
-
- TRACE(("parser_consume_rows: Deleting %d words, %d chars\n", word_deletions,
- char_count));
-
- /* move stream, only if something to move */
- if (char_count < self->stream_len) {
- memmove(self->stream, (self->stream + char_count),
- self->stream_len - char_count);
- }
- /* buffer counts */
- self->stream_len -= char_count;
-
- /* move token metadata */
- // Note: We should always have words_len < word_deletions, so this
- // subtraction will remain appropriately-typed.
- for (i = 0; i < self->words_len - word_deletions; ++i) {
- offset = i + word_deletions;
-
- self->words[i] = self->words[offset] - char_count;
- self->word_starts[i] = self->word_starts[offset] - char_count;
- }
- self->words_len -= word_deletions;
-
- /* move current word pointer to stream */
- self->pword_start -= char_count;
- self->word_start -= char_count;
-
- /* move line metadata */
- // Note: We should always have self->lines - nrows + 1 >= 0, so this
- // subtraction will remain appropriately-typed.
- for (i = 0; i < self->lines - nrows + 1; ++i) {
- offset = i + nrows;
- self->line_start[i] = self->line_start[offset] - word_deletions;
- self->line_fields[i] = self->line_fields[offset];
- }
- self->lines -= nrows;
-
- return 0;
-}
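parser_consume_rows compacts the token stream in place: memmove the surviving characters to the front, then shift every stored word offset by the number of characters dropped. A toy sketch of the same pattern on a flat buffer of NUL-terminated tokens (illustrative, not pandas code):

#include <stdio.h>
#include <string.h>

int main(void) {
    char stream[] = "aa\0bbb\0cc\0";       /* three NUL-terminated tokens */
    size_t starts[] = {0, 3, 7};
    size_t n_tokens = 3, stream_len = 10;

    size_t consume = 1;                    /* drop the first token */
    size_t char_count = starts[consume];   /* bytes occupied by dropped tokens */

    /* slide the surviving bytes to the front of the stream */
    memmove(stream, stream + char_count, stream_len - char_count);
    stream_len -= char_count;

    /* shift the surviving offsets so they still point at the same tokens */
    for (size_t i = 0; i < n_tokens - consume; ++i) {
        starts[i] = starts[i + consume] - char_count;
    }
    n_tokens -= consume;

    printf("%zu bytes remain in the stream\n", stream_len);
    for (size_t i = 0; i < n_tokens; ++i) {
        printf("token %zu: %s\n", i, stream + starts[i]);  /* bbb, cc */
    }
    return 0;
}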
-
-static size_t _next_pow2(size_t sz) {
- size_t result = 1;
- while (result < sz) result *= 2;
- return result;
-}
-
-int parser_trim_buffers(parser_t *self) {
- /*
- Free memory
- */
- size_t new_cap;
- void *newptr;
-
- uint64_t i;
-
- /**
- * Before we free up space and trim, we should
- * save how many words we saw when parsing, if
- * it exceeds the maximum number we saw before.
- *
- * This is important for when we read in chunks,
- * so that we can inform subsequent chunk parsing
- * as to how many words we could possibly see.
- */
- if (self->words_cap > self->max_words_cap) {
- self->max_words_cap = self->words_cap;
- }
-
- /* trim words, word_starts */
- new_cap = _next_pow2(self->words_len) + 1;
- if (new_cap < self->words_cap) {
- TRACE(("parser_trim_buffers: new_cap < self->words_cap\n"));
- self->words = realloc(self->words, new_cap * sizeof(char *));
- if (self->words == NULL) {
- return PARSER_OUT_OF_MEMORY;
- }
- self->word_starts = realloc(self->word_starts,
- new_cap * sizeof(int64_t));
- if (self->word_starts == NULL) {
- return PARSER_OUT_OF_MEMORY;
- }
- self->words_cap = new_cap;
- }
-
- /* trim stream */
- new_cap = _next_pow2(self->stream_len) + 1;
- TRACE(
- ("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = "
- "%zu\n",
- new_cap, self->stream_cap, self->lines_cap));
- if (new_cap < self->stream_cap) {
- TRACE(
- ("parser_trim_buffers: new_cap < self->stream_cap, calling "
- "realloc\n"));
- newptr = realloc(self->stream, new_cap);
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- // Update the pointers in the self->words array (char **) if
- // `realloc` moved the `self->stream` buffer. This block mirrors
- // a similar block in `make_stream_space`.
- if (self->stream != newptr) {
- self->pword_start = (char *)newptr + self->word_start;
-
- for (i = 0; i < self->words_len; ++i) {
- self->words[i] = (char *)newptr + self->word_starts[i];
- }
- }
-
- self->stream = newptr;
- self->stream_cap = new_cap;
- }
- }
-
- /* trim line_start, line_fields */
- new_cap = _next_pow2(self->lines) + 1;
- if (new_cap < self->lines_cap) {
- TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n"));
- newptr = realloc(self->line_start,
- new_cap * sizeof(int64_t));
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->line_start = newptr;
- }
- newptr = realloc(self->line_fields,
- new_cap * sizeof(int64_t));
- if (newptr == NULL) {
- return PARSER_OUT_OF_MEMORY;
- } else {
- self->line_fields = newptr;
- self->lines_cap = new_cap;
- }
- }
-
- return 0;
-}
-
-/*
- nrows : number of rows to tokenize (or until EOF is reached)
- all : tokenize all the data vs. a certain number of rows
- */
-
-int _tokenize_helper(parser_t *self, size_t nrows, int all,
- const char *encoding_errors) {
- int status = 0;
- uint64_t start_lines = self->lines;
-
- if (self->state == FINISHED) {
- return 0;
- }
-
- TRACE((
- "_tokenize_helper: Asked to tokenize %d rows, datapos=%d, datalen=%d\n",
- nrows, self->datapos, self->datalen));
-
- while (1) {
- if (!all && self->lines - start_lines >= nrows) break;
-
- if (self->datapos == self->datalen) {
- status = parser_buffer_bytes(self, self->chunksize,
- encoding_errors);
-
- if (status == REACHED_EOF) {
- // close out last line
- status = parser_handle_eof(self);
- self->state = FINISHED;
- break;
- } else if (status != 0) {
- return status;
- }
- }
-
- TRACE(
- ("_tokenize_helper: Trying to process %d bytes, datalen=%d, "
- "datapos= %d\n",
- self->datalen - self->datapos, self->datalen, self->datapos));
-
- status = tokenize_bytes(self, nrows, start_lines);
-
- if (status < 0) {
- // XXX
- TRACE(
- ("_tokenize_helper: Status %d returned from tokenize_bytes, "
- "breaking\n",
- status));
- status = -1;
- break;
- }
- }
- TRACE(("leaving tokenize_helper\n"));
- return status;
-}
-
-int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) {
- int status = _tokenize_helper(self, nrows, 0, encoding_errors);
- return status;
-}
-
-int tokenize_all_rows(parser_t *self, const char *encoding_errors) {
- int status = _tokenize_helper(self, -1, 1, encoding_errors);
- return status;
-}
-
-/*
- * Function: to_boolean
- * --------------------
- *
- * Validate if item should be recognized as a boolean field.
- *
- * item: const char* representing parsed text
- * val : pointer to a uint8_t of boolean representation
- *
- * If item is determined to be boolean, this method will set
- * the appropriate value of val and return 0. A non-zero exit
- * status means that item was not inferred to be boolean, and
- * leaves the value of *val unmodified.
- */
-int to_boolean(const char *item, uint8_t *val) {
- if (strcasecmp(item, "TRUE") == 0) {
- *val = 1;
- return 0;
- } else if (strcasecmp(item, "FALSE") == 0) {
- *val = 0;
- return 0;
- }
-
- return -1;
-}
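A short standalone usage sketch of the to_boolean contract above: return 0 and set *val on success, return -1 and leave *val untouched otherwise. strcasecmp is assumed to come from the POSIX <strings.h> header here, and to_boolean_sketch is an illustrative name:

#include <stdint.h>
#include <stdio.h>
#include <strings.h>   /* strcasecmp (POSIX) */

/* Same contract as to_boolean: 0 on success with *val set, -1 otherwise. */
static int to_boolean_sketch(const char *item, uint8_t *val) {
    if (strcasecmp(item, "TRUE") == 0)  { *val = 1; return 0; }
    if (strcasecmp(item, "FALSE") == 0) { *val = 0; return 0; }
    return -1;
}

int main(void) {
    uint8_t b = 42;
    int status;

    status = to_boolean_sketch("true", &b);
    printf("%d %d\n", status, (int)b);    /* 0 1 */

    status = to_boolean_sketch("False", &b);
    printf("%d %d\n", status, (int)b);    /* 0 0 */

    status = to_boolean_sketch("yes", &b);
    printf("%d %d\n", status, (int)b);    /* -1 0: *val left untouched */

    return 0;
}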
-
-// ---------------------------------------------------------------------------
-// Implementation of xstrtod
-
-//
-// strtod.c
-//
-// Convert string to double
-//
-// Copyright (C) 2002 Michael Ringgaard. All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions
-// are met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-// 3. Neither the name of the project nor the names of its contributors
-// may be used to endorse or promote products derived from this software
-// without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND
-// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-// LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-// SUCH DAMAGE.
-//
-// -----------------------------------------------------------------------
-// Modifications by Warren Weckesser, March 2011:
-// * Rename strtod() to xstrtod().
-// * Added decimal and sci arguments.
-// * Skip trailing spaces.
-// * Commented out the other functions.
-// Modifications by Richard T Guy, August 2013:
-// * Add tsep argument for thousands separator
-//
-
-// pessimistic but quick assessment,
-// assuming that each decimal digit requires 4 bits to store
-const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4;
-
-double xstrtod(const char *str, char **endptr, char decimal, char sci,
- char tsep, int skip_trailing, int *error, int *maybe_int) {
- double number;
- unsigned int i_number = 0;
- int exponent;
- int negative;
- char *p = (char *)str;
- double p10;
- int n;
- int num_digits;
- int num_decimals;
-
- if (maybe_int != NULL) *maybe_int = 1;
- // Skip leading whitespace.
- while (isspace_ascii(*p)) p++;
-
- // Handle optional sign.
- negative = 0;
- switch (*p) {
- case '-':
- negative = 1; // Fall through to increment position.
- case '+':
- p++;
- }
-
- exponent = 0;
- num_digits = 0;
- num_decimals = 0;
-
- // Process string of digits.
- while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) {
- i_number = i_number * 10 + (*p - '0');
- p++;
- num_digits++;
-
- p += (tsep != '\0' && *p == tsep);
- }
- number = i_number;
-
- if (num_digits > max_int_decimal_digits) {
- // process what's left as double
- while (isdigit_ascii(*p)) {
- number = number * 10. + (*p - '0');
- p++;
- num_digits++;
-
- p += (tsep != '\0' && *p == tsep);
- }
- }
-
- // Process decimal part.
- if (*p == decimal) {
- if (maybe_int != NULL) *maybe_int = 0;
- p++;
-
- while (isdigit_ascii(*p)) {
- number = number * 10. + (*p - '0');
- p++;
- num_digits++;
- num_decimals++;
- }
-
- exponent -= num_decimals;
- }
-
- if (num_digits == 0) {
- *error = ERANGE;
- return 0.0;
- }
-
- // Correct for sign.
- if (negative) number = -number;
-
- // Process an exponent string.
- if (toupper_ascii(*p) == toupper_ascii(sci)) {
- if (maybe_int != NULL) *maybe_int = 0;
-
- // Handle optional sign.
- negative = 0;
- switch (*++p) {
- case '-':
- negative = 1; // Fall through to increment pos.
- case '+':
- p++;
- }
-
- // Process string of digits.
- num_digits = 0;
- n = 0;
- while (isdigit_ascii(*p)) {
- n = n * 10 + (*p - '0');
- num_digits++;
- p++;
- }
-
- if (negative)
- exponent -= n;
- else
- exponent += n;
-
- // If no digits after the 'e'/'E', un-consume it
- if (num_digits == 0) p--;
- }
-
- if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) {
- *error = ERANGE;
- return HUGE_VAL;
- }
-
- // Scale the result.
- p10 = 10.;
- n = exponent;
- if (n < 0) n = -n;
- while (n) {
- if (n & 1) {
- if (exponent < 0)
- number /= p10;
- else
- number *= p10;
- }
- n >>= 1;
- p10 *= p10;
- }
-
- if (number == HUGE_VAL) {
- *error = ERANGE;
- }
-
- if (skip_trailing) {
- // Skip trailing whitespace.
- while (isspace_ascii(*p)) p++;
- }
-
- if (endptr) *endptr = p;
- return number;
-}
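The tail of xstrtod applies the accumulated decimal exponent by binary exponentiation, needing only O(log |exponent|) multiplications. The same loop isolated as a standalone sketch; scale_by_pow10 is an illustrative name:

#include <stdio.h>

/* Multiply or divide `number` by 10^exponent using binary exponentiation:
 * square p10 each round and apply it only on the set bits of |exponent|. */
static double scale_by_pow10(double number, int exponent) {
    double p10 = 10.0;
    int n = exponent < 0 ? -exponent : exponent;
    while (n) {
        if (n & 1) {
            if (exponent < 0) number /= p10;
            else              number *= p10;
        }
        n >>= 1;
        p10 *= p10;
    }
    return number;
}

int main(void) {
    printf("%g\n", scale_by_pow10(1.2345, 3));   /* 1234.5 */
    printf("%g\n", scale_by_pow10(5.0, -2));     /* 0.05 */
    return 0;
}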
-
-double precise_xstrtod(const char *str, char **endptr, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) {
- double number;
- int exponent;
- int negative;
- char *p = (char *)str;
- int num_digits;
- int num_decimals;
- int max_digits = 17;
- int n;
-
- if (maybe_int != NULL) *maybe_int = 1;
- // Cache powers of 10 in memory.
- static double e[] = {
- 1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9,
- 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
- 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29,
- 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39,
- 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49,
- 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59,
- 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69,
- 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79,
- 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89,
- 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99,
- 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109,
- 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119,
- 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129,
- 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139,
- 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149,
- 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
- 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169,
- 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179,
- 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189,
- 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199,
- 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209,
- 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219,
- 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229,
- 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239,
- 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
- 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259,
- 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269,
- 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279,
- 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289,
- 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299,
- 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308};
-
- // Skip leading whitespace.
- while (isspace_ascii(*p)) p++;
-
- // Handle optional sign.
- negative = 0;
- switch (*p) {
- case '-':
- negative = 1; // Fall through to increment position.
- case '+':
- p++;
- }
-
- number = 0.;
- exponent = 0;
- num_digits = 0;
- num_decimals = 0;
-
- // Process string of digits.
- while (isdigit_ascii(*p)) {
- if (num_digits < max_digits) {
- number = number * 10. + (*p - '0');
- num_digits++;
- } else {
- ++exponent;
- }
-
- p++;
- p += (tsep != '\0' && *p == tsep);
- }
-
- // Process decimal part
- if (*p == decimal) {
- if (maybe_int != NULL) *maybe_int = 0;
- p++;
-
- while (num_digits < max_digits && isdigit_ascii(*p)) {
- number = number * 10. + (*p - '0');
- p++;
- num_digits++;
- num_decimals++;
- }
-
- if (num_digits >= max_digits) // Consume extra decimal digits.
- while (isdigit_ascii(*p)) ++p;
-
- exponent -= num_decimals;
- }
-
- if (num_digits == 0) {
- *error = ERANGE;
- return 0.0;
- }
-
- // Correct for sign.
- if (negative) number = -number;
-
- // Process an exponent string.
- if (toupper_ascii(*p) == toupper_ascii(sci)) {
- if (maybe_int != NULL) *maybe_int = 0;
-
- // Handle optional sign
- negative = 0;
- switch (*++p) {
- case '-':
- negative = 1; // Fall through to increment pos.
- case '+':
- p++;
- }
-
- // Process string of digits.
- num_digits = 0;
- n = 0;
- while (num_digits < max_digits && isdigit_ascii(*p)) {
- n = n * 10 + (*p - '0');
- num_digits++;
- p++;
- }
-
- if (negative)
- exponent -= n;
- else
- exponent += n;
-
- // If no digits after the 'e'/'E', un-consume it.
- if (num_digits == 0) p--;
- }
-
- if (exponent > 308) {
- *error = ERANGE;
- return HUGE_VAL;
- } else if (exponent > 0) {
- number *= e[exponent];
- } else if (exponent < -308) { // Subnormal
- if (exponent < -616) { // Prevent invalid array access.
- number = 0.;
- } else {
- number /= e[-308 - exponent];
- number /= e[308];
- }
-
- } else {
- number /= e[-exponent];
- }
-
- if (number == HUGE_VAL || number == -HUGE_VAL) *error = ERANGE;
-
- if (skip_trailing) {
- // Skip trailing whitespace.
- while (isspace_ascii(*p)) p++;
- }
-
- if (endptr) *endptr = p;
- return number;
-}
-
-/* Copy a decimal number string that uses `decimal` and `tsep` as decimal point
-   and thousands separator into an equivalent C-locale decimal string (stripping
-   `tsep` and replacing `decimal` with '.'). The returned memory should be freed
-   with a call to `free`.
-*/
-
-char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
- char tsep) {
- const char *p = s;
- size_t length = strlen(s);
- char *s_copy = malloc(length + 1);
- char *dst = s_copy;
- // Skip leading whitespace.
- while (isspace_ascii(*p)) p++;
- // Copy Leading sign
- if (*p == '+' || *p == '-') {
- *dst++ = *p++;
- }
- // Copy integer part dropping `tsep`
- while (isdigit_ascii(*p)) {
- *dst++ = *p++;
- p += (tsep != '\0' && *p == tsep);
- }
- // Replace `decimal` with '.'
- if (*p == decimal) {
- *dst++ = '.';
- p++;
- }
- // Copy fractional part after decimal (if any)
- while (isdigit_ascii(*p)) {
- *dst++ = *p++;
- }
- // Copy exponent if any
- if (toupper_ascii(*p) == toupper_ascii('E')) {
- *dst++ = *p++;
- // Copy leading exponent sign (if any)
- if (*p == '+' || *p == '-') {
- *dst++ = *p++;
- }
- // Copy exponent digits
- while (isdigit_ascii(*p)) {
- *dst++ = *p++;
- }
- }
- *dst++ = '\0'; // terminate
- if (endpos != NULL)
- *endpos = (char *)p;
- return s_copy;
-}
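
For orientation, a hedged usage sketch of the helper above (the input value and call site are hypothetical, not taken from the deleted file):

    char *end;
    char *c_str = _str_copy_decimal_str_c("1.234.567,89", &end, /*decimal=*/',', /*tsep=*/'.');
    // c_str now holds "1234567.89"; `end` points past the consumed input,
    // and the caller owns (and must free) the returned buffer.
    free(c_str);
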
-
-
-double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
- int skip_trailing, int *error, int *maybe_int) {
- // 'normalize' representation to C-locale; replace decimal with '.' and
- // remove t(housand)sep.
- char *endptr;
- char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
- // This is called from a nogil block in parsers.pyx,
- // so we need to explicitly acquire the GIL before making any Python calls.
- PyGILState_STATE gstate;
- gstate = PyGILState_Ensure();
- char *endpc;
- double r = PyOS_string_to_double(pc, &endpc, 0);
- // PyOS_string_to_double needs to consume the whole string
- if (endpc == pc + strlen(pc)) {
- if (q != NULL) {
- // report endptr from source string (p)
- *q = endptr;
- }
- } else {
- *error = -1;
- if (q != NULL) {
- // p and pc have different lengths due to tsep removal, so we cannot
- // report how much of p was consumed. Just rewind to the beginning.
- *q = (char *)p; // TODO(willayd): this could be undefined behavior
- }
- }
- if (maybe_int != NULL) *maybe_int = 0;
- if (PyErr_Occurred() != NULL) *error = -1;
- else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL;
- PyErr_Clear();
-
- PyGILState_Release(gstate);
- free(pc);
- if (skip_trailing && q != NULL && *q != p) {
- while (isspace_ascii(**q)) {
- (*q)++;
- }
- }
- return r;
-}
-
-// End of xstrtod code
-// ---------------------------------------------------------------------------
-
-void uint_state_init(uint_state *self) {
- self->seen_sint = 0;
- self->seen_uint = 0;
- self->seen_null = 0;
-}
-
-int uint64_conflict(uint_state *self) {
- return self->seen_uint && (self->seen_sint || self->seen_null);
-}
-
-int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
- int *error, char tsep) {
- const char *p = p_item;
- int isneg = 0;
- int64_t number = 0;
- int d;
-
- // Skip leading spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
-
- // Handle sign.
- if (*p == '-') {
- isneg = 1;
- ++p;
- } else if (*p == '+') {
- p++;
- }
-
- // Check that there is a first digit.
- if (!isdigit_ascii(*p)) {
- // Error...
- *error = ERROR_NO_DIGITS;
- return 0;
- }
-
- if (isneg) {
- // If number is greater than pre_min, at least one more digit
- // can be processed without overflowing.
- int dig_pre_min = -(int_min % 10);
- int64_t pre_min = int_min / 10;
-
- // Process the digits.
- d = *p;
- if (tsep != '\0') {
- while (1) {
- if (d == tsep) {
- d = *++p;
- continue;
- } else if (!isdigit_ascii(d)) {
- break;
- }
- if ((number > pre_min) ||
- ((number == pre_min) && (d - '0' <= dig_pre_min))) {
- number = number * 10 - (d - '0');
- d = *++p;
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- } else {
- while (isdigit_ascii(d)) {
- if ((number > pre_min) ||
- ((number == pre_min) && (d - '0' <= dig_pre_min))) {
- number = number * 10 - (d - '0');
- d = *++p;
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- }
- } else {
- // If number is less than pre_max, at least one more digit
- // can be processed without overflowing.
- int64_t pre_max = int_max / 10;
- int dig_pre_max = int_max % 10;
-
- // Process the digits.
- d = *p;
- if (tsep != '\0') {
- while (1) {
- if (d == tsep) {
- d = *++p;
- continue;
- } else if (!isdigit_ascii(d)) {
- break;
- }
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- } else {
- while (isdigit_ascii(d)) {
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- }
- }
-
- // Skip trailing spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
-
- // Did we use up all the characters?
- if (*p) {
- *error = ERROR_INVALID_CHARS;
- return 0;
- }
-
- *error = 0;
- return number;
-}
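
The pre_max/dig_pre_max (and pre_min/dig_pre_min) guards above stop one step short of overflowing, so the extreme representable values still parse. A minimal sketch with hypothetical calls (assuming the <stdint.h> limit macros):

    int err = 0;
    int64_t ok  = str_to_int64("9223372036854775807", INT64_MIN, INT64_MAX, &err, '\0');
    // err == 0, ok == INT64_MAX
    int64_t bad = str_to_int64("9223372036854775808", INT64_MIN, INT64_MAX, &err, '\0');
    // err == ERROR_OVERFLOW, bad == 0
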
-
-uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
- uint64_t uint_max, int *error, char tsep) {
- const char *p = p_item;
- uint64_t pre_max = uint_max / 10;
- int dig_pre_max = uint_max % 10;
- uint64_t number = 0;
- int d;
-
- // Skip leading spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
-
- // Handle sign.
- if (*p == '-') {
- state->seen_sint = 1;
- *error = 0;
- return 0;
- } else if (*p == '+') {
- p++;
- }
-
- // Check that there is a first digit.
- if (!isdigit_ascii(*p)) {
- // Error...
- *error = ERROR_NO_DIGITS;
- return 0;
- }
-
- // If number is less than pre_max, at least one more digit
- // can be processed without overflowing.
- //
- // Process the digits.
- d = *p;
- if (tsep != '\0') {
- while (1) {
- if (d == tsep) {
- d = *++p;
- continue;
- } else if (!isdigit_ascii(d)) {
- break;
- }
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- } else {
- while (isdigit_ascii(d)) {
- if ((number < pre_max) ||
- ((number == pre_max) && (d - '0' <= dig_pre_max))) {
- number = number * 10 + (d - '0');
- d = *++p;
-
- } else {
- *error = ERROR_OVERFLOW;
- return 0;
- }
- }
- }
-
- // Skip trailing spaces.
- while (isspace_ascii(*p)) {
- ++p;
- }
-
- // Did we use up all the characters?
- if (*p) {
- *error = ERROR_INVALID_CHARS;
- return 0;
- }
-
- if (number > (uint64_t)int_max) {
- state->seen_uint = 1;
- }
-
- *error = 0;
- return number;
-}
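
The uint_state bookkeeping above lets the caller decide, after scanning a whole column, whether uint64 is still a usable representation: a value above INT64_MAX combined with a negative (or null) entry is a conflict. A hedged sketch with hypothetical inputs:

    uint_state st;
    int err = 0;
    uint_state_init(&st);
    str_to_uint64(&st, "18446744073709551615", INT64_MAX, UINT64_MAX, &err, '\0');  // > INT64_MAX, sets seen_uint
    str_to_uint64(&st, "-1", INT64_MAX, UINT64_MAX, &err, '\0');                    // negative, sets seen_sint
    // uint64_conflict(&st) is now nonzero: the column fits neither int64 nor uint64.
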
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.h b/contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.h
deleted file mode 100644
index eea9bfd4828..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/parser/tokenizer.h
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
-
-Copyright (c) 2012, Lambda Foundry, Inc., except where noted
-
-Incorporates components of WarrenWeckesser/textreader, licensed under 3-clause
-BSD
-
-See LICENSE for the license
-
-*/
-
-#ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
-#define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-
-#define ERROR_NO_DIGITS 1
-#define ERROR_OVERFLOW 2
-#define ERROR_INVALID_CHARS 3
-
-#include <stdint.h>
-#include "../inline_helper.h"
-#include "../headers/portable.h"
-
-#include "khash.h"
-
-#define STREAM_INIT_SIZE 32
-
-#define REACHED_EOF 1
-#define CALLING_READ_FAILED 2
-
-
-/*
-
- Low-level C flat-file parsing code for pandas / NumPy
-
- */
-
-/*
- * Common set of error types for the read_rows() and tokenize()
- * functions.
- */
-
-// #define VERBOSE
-#if defined(VERBOSE)
-#define TRACE(X) printf X;
-#else
-#define TRACE(X)
-#endif // VERBOSE
-
-#define PARSER_OUT_OF_MEMORY -1
-
-/*
- * TODO: Might want to couple count_rows() with read_rows() to avoid
- * duplication of some file I/O.
- */
-
-typedef enum {
- START_RECORD,
- START_FIELD,
- ESCAPED_CHAR,
- IN_FIELD,
- IN_QUOTED_FIELD,
- ESCAPE_IN_QUOTED_FIELD,
- QUOTE_IN_QUOTED_FIELD,
- EAT_CRNL,
- EAT_CRNL_NOP,
- EAT_WHITESPACE,
- EAT_COMMENT,
- EAT_LINE_COMMENT,
- WHITESPACE_LINE,
- START_FIELD_IN_SKIP_LINE,
- IN_FIELD_IN_SKIP_LINE,
- IN_QUOTED_FIELD_IN_SKIP_LINE,
- QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE,
- FINISHED
-} ParserState;
-
-typedef enum {
- QUOTE_MINIMAL,
- QUOTE_ALL,
- QUOTE_NONNUMERIC,
- QUOTE_NONE
-} QuoteStyle;
-
-typedef enum {
- ERROR,
- WARN,
- SKIP
-} BadLineHandleMethod;
-
-typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
- int *status, const char *encoding_errors);
-typedef int (*io_cleanup)(void *src);
-
-typedef struct parser_t {
- void *source;
- io_callback cb_io;
- io_cleanup cb_cleanup;
-
- int64_t chunksize; // Number of bytes to prepare for each chunk
- char *data; // pointer to data to be processed
- int64_t datalen; // amount of data available
- int64_t datapos;
-
- // where to write out tokenized data
- char *stream;
- uint64_t stream_len;
- uint64_t stream_cap;
-
- // Store words in (potentially ragged) matrix for now, hmm
- char **words;
- int64_t *word_starts; // where we are in the stream
- uint64_t words_len;
- uint64_t words_cap;
- uint64_t max_words_cap; // maximum word cap encountered
-
- char *pword_start; // pointer to stream start of current field
- int64_t word_start; // position start of current field
-
- int64_t *line_start; // position in words for start of line
- int64_t *line_fields; // Number of fields in each line
- uint64_t lines; // Number of (good) lines observed
- uint64_t file_lines; // Number of lines (including bad or skipped)
- uint64_t lines_cap; // Vector capacity
-
- // Tokenizing stuff
- ParserState state;
- int doublequote; /* is " represented by ""? */
- char delimiter; /* field separator */
- int delim_whitespace; /* delimit by consuming space/tabs instead */
- char quotechar; /* quote character */
- char escapechar; /* escape character */
- char lineterminator;
- int skipinitialspace; /* ignore spaces following delimiter? */
- int quoting; /* style of quoting to write */
-
- char commentchar;
- int allow_embedded_newline;
-
- int usecols; // Boolean: 1: usecols provided, 0: none provided
-
- Py_ssize_t expected_fields;
- BadLineHandleMethod on_bad_lines;
-
- // floating point options
- char decimal;
- char sci;
-
- // thousands separator (comma, period)
- char thousands;
-
- int header; // Boolean: 1: has header, 0: no header
- int64_t header_start; // header row start
- uint64_t header_end; // header row end
-
- void *skipset;
- PyObject *skipfunc;
- int64_t skip_first_N_rows;
- int64_t skip_footer;
- double (*double_converter)(const char *, char **,
- char, char, char, int, int *, int *);
-
- // error handling
- char *warn_msg;
- char *error_msg;
-
- int skip_empty_lines;
-} parser_t;
-
-typedef struct coliter_t {
- char **words;
- int64_t *line_start;
- int64_t col;
-} coliter_t;
-
-void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start);
-
-#define COLITER_NEXT(iter, word) \
- do { \
- const int64_t i = *iter.line_start++ + iter.col; \
- word = i >= *iter.line_start ? "" : iter.words[i]; \
- } while (0)
-
-parser_t *parser_new(void);
-
-int parser_init(parser_t *self);
-
-int parser_consume_rows(parser_t *self, size_t nrows);
-
-int parser_trim_buffers(parser_t *self);
-
-int parser_add_skiprow(parser_t *self, int64_t row);
-
-int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
-
-void parser_free(parser_t *self);
-
-void parser_del(parser_t *self);
-
-void parser_set_default_options(parser_t *self);
-
-int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors);
-
-int tokenize_all_rows(parser_t *self, const char *encoding_errors);
-
-// Have parsed / type-converted a chunk of data
-// and want to free memory from the token stream
-
-typedef struct uint_state {
- int seen_sint;
- int seen_uint;
- int seen_null;
-} uint_state;
-
-void uint_state_init(uint_state *self);
-
-int uint64_conflict(uint_state *self);
-
-uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
- uint64_t uint_max, int *error, char tsep);
-int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
- int *error, char tsep);
-double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
- int skip_trailing, int *error, int *maybe_int);
-double precise_xstrtod(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int);
-
-// GH-15140 - round_trip requires and acquires the GIL on its own
-double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
- int skip_trailing, int *error, int *maybe_int);
-int to_boolean(const char *item, uint8_t *val);
-
-#endif // PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
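
For orientation, a minimal sketch of how the column iterator declared above can be walked once a chunk has been tokenized; `parser` is an assumed, already-populated parser_t*, the column index and start line are hypothetical, and the per-field conversion is elided:

    coliter_t it;
    const char *word;
    coliter_setup(&it, parser, /*col=*/2, /*start=*/0);
    for (uint64_t row = 0; row < parser->lines; ++row) {
        COLITER_NEXT(it, word);  // yields "" when the row has fewer than 3 fields
        /* convert `word` for this row ... */
    }
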
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/skiplist.h b/contrib/python/pandas/py3/pandas/_libs/src/skiplist.h
deleted file mode 100644
index 5d0b144a1fe..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/skiplist.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-
-Flexibly-sized, index-able skiplist data structure for maintaining a sorted
-list of values
-
-Port of Wes McKinney's Cython version of Raymond Hettinger's original pure
-Python recipe (https://rhettinger.wordpress.com/2010/02/06/lost-knowledge/)
-*/
-
-#ifndef PANDAS__LIBS_SRC_SKIPLIST_H_
-#define PANDAS__LIBS_SRC_SKIPLIST_H_
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "inline_helper.h"
-
-PANDAS_INLINE float __skiplist_nanf(void) {
- const union {
- int __i;
- float __f;
- } __bint = {0x7fc00000UL};
- return __bint.__f;
-}
-#define PANDAS_NAN ((double)__skiplist_nanf())
-
-PANDAS_INLINE double Log2(double val) { return log(val) / log(2.); }
-
-typedef struct node_t node_t;
-
-struct node_t {
- node_t **next;
- int *width;
- double value;
- int is_nil;
- int levels;
- int ref_count;
-};
-
-typedef struct {
- node_t *head;
- node_t **tmp_chain;
- int *tmp_steps;
- int size;
- int maxlevels;
-} skiplist_t;
-
-PANDAS_INLINE double urand(void) {
- return ((double)rand() + 1) / ((double)RAND_MAX + 2);
-}
-
-PANDAS_INLINE int int_min(int a, int b) { return a < b ? a : b; }
-
-PANDAS_INLINE node_t *node_init(double value, int levels) {
- node_t *result;
- result = (node_t *)malloc(sizeof(node_t));
- if (result) {
- result->value = value;
- result->levels = levels;
- result->is_nil = 0;
- result->ref_count = 0;
- result->next = (node_t **)malloc(levels * sizeof(node_t *));
- result->width = (int *)malloc(levels * sizeof(int));
- if (!(result->next && result->width) && (levels != 0)) {
- free(result->next);
- free(result->width);
- free(result);
- return NULL;
- }
- }
- return result;
-}
-
-// Manual reference counting (no Python objects are involved here).
-PANDAS_INLINE void node_incref(node_t *node) { ++(node->ref_count); }
-
-PANDAS_INLINE void node_decref(node_t *node) { --(node->ref_count); }
-
-static void node_destroy(node_t *node) {
- int i;
- if (node) {
- if (node->ref_count <= 1) {
- for (i = 0; i < node->levels; ++i) {
- node_destroy(node->next[i]);
- }
- free(node->next);
- free(node->width);
- // printf("Reference count was 1, freeing\n");
- free(node);
- } else {
- node_decref(node);
- }
- // pretty sure that freeing the struct above will be enough
- }
-}
-
-PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) {
- if (skp) {
- node_destroy(skp->head);
- free(skp->tmp_steps);
- free(skp->tmp_chain);
- free(skp);
- }
-}
-
-PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) {
- skiplist_t *result;
- node_t *NIL, *head;
- int maxlevels, i;
-
- maxlevels = 1 + Log2((double)expected_size);
- result = (skiplist_t *)malloc(sizeof(skiplist_t));
- if (!result) {
- return NULL;
- }
- result->tmp_chain = (node_t **)malloc(maxlevels * sizeof(node_t *));
- result->tmp_steps = (int *)malloc(maxlevels * sizeof(int));
- result->maxlevels = maxlevels;
- result->size = 0;
-
- head = result->head = node_init(PANDAS_NAN, maxlevels);
- NIL = node_init(0.0, 0);
-
- if (!(result->tmp_chain && result->tmp_steps && result->head && NIL)) {
- skiplist_destroy(result);
- node_destroy(NIL);
- return NULL;
- }
-
- node_incref(head);
-
- NIL->is_nil = 1;
-
- for (i = 0; i < maxlevels; ++i) {
- head->next[i] = NIL;
- head->width[i] = 1;
- node_incref(NIL);
- }
-
- return result;
-}
-
-// 1 if left < right, 0 if left == right, -1 if left > right
-PANDAS_INLINE int _node_cmp(node_t *node, double value) {
- if (node->is_nil || node->value > value) {
- return -1;
- } else if (node->value < value) {
- return 1;
- } else {
- return 0;
- }
-}
-
-PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) {
- node_t *node;
- int level;
-
- if (i < 0 || i >= skp->size) {
- *ret = 0;
- return 0;
- }
-
- node = skp->head;
- ++i;
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- while (node->width[level] <= i) {
- i -= node->width[level];
- node = node->next[level];
- }
- }
-
- *ret = 1;
- return node->value;
-}
-
-// Returns the lowest rank of all elements with value `value`, as opposed to the
-// highest rank returned by `skiplist_insert`.
-PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) {
- node_t *node;
- int level, rank = 0;
-
- node = skp->head;
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- while (_node_cmp(node->next[level], value) > 0) {
- rank += node->width[level];
- node = node->next[level];
- }
- }
-
- return rank + 1;
-}
-
-// Returns the rank of the inserted element. When there are duplicates,
-// `rank` is the highest of the group, i.e. the 'max' method of
-// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html
-PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) {
- node_t *node, *prevnode, *newnode, *next_at_level;
- int *steps_at_level;
- int size, steps, level, rank = 0;
- node_t **chain;
-
- chain = skp->tmp_chain;
-
- steps_at_level = skp->tmp_steps;
- memset(steps_at_level, 0, skp->maxlevels * sizeof(int));
-
- node = skp->head;
-
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- next_at_level = node->next[level];
- while (_node_cmp(next_at_level, value) >= 0) {
- steps_at_level[level] += node->width[level];
- rank += node->width[level];
- node = next_at_level;
- next_at_level = node->next[level];
- }
- chain[level] = node;
- }
-
- size = int_min(skp->maxlevels, 1 - ((int)Log2(urand())));
-
- newnode = node_init(value, size);
- if (!newnode) {
- return -1;
- }
- steps = 0;
-
- for (level = 0; level < size; ++level) {
- prevnode = chain[level];
- newnode->next[level] = prevnode->next[level];
-
- prevnode->next[level] = newnode;
- node_incref(newnode); // increment the reference count
-
- newnode->width[level] = prevnode->width[level] - steps;
- prevnode->width[level] = steps + 1;
-
- steps += steps_at_level[level];
- }
-
- for (level = size; level < skp->maxlevels; ++level) {
- chain[level]->width[level] += 1;
- }
-
- ++(skp->size);
-
- return rank + 1;
-}
-
-PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) {
- int level, size;
- node_t *node, *prevnode, *tmpnode, *next_at_level;
- node_t **chain;
-
- chain = skp->tmp_chain;
- node = skp->head;
-
- for (level = skp->maxlevels - 1; level >= 0; --level) {
- next_at_level = node->next[level];
- while (_node_cmp(next_at_level, value) > 0) {
- node = next_at_level;
- next_at_level = node->next[level];
- }
- chain[level] = node;
- }
-
- if (value != chain[0]->next[0]->value) {
- return 0;
- }
-
- size = chain[0]->next[0]->levels;
-
- for (level = 0; level < size; ++level) {
- prevnode = chain[level];
-
- tmpnode = prevnode->next[level];
-
- prevnode->width[level] += tmpnode->width[level] - 1;
- prevnode->next[level] = tmpnode->next[level];
-
- tmpnode->next[level] = NULL;
- node_destroy(tmpnode); // decrement refcount or free
- }
-
- for (level = size; level < skp->maxlevels; ++level) {
- --(chain[level]->width[level]);
- }
-
- --(skp->size);
- return 1;
-}
-
-#endif // PANDAS__LIBS_SRC_SKIPLIST_H_
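
A minimal usage sketch of this skiplist API (hypothetical values, error handling elided). The indexed skiplist_get is what makes order statistics such as a rolling median cheap to maintain:

    int ok;
    skiplist_t *skp = skiplist_init(/*expected_size=*/8);
    skiplist_insert(skp, 3.0);
    skiplist_insert(skp, 1.0);
    skiplist_insert(skp, 2.0);
    double mid = skiplist_get(skp, skp->size / 2, &ok);  // 2.0, ok == 1
    skiplist_remove(skp, 1.0);
    skiplist_destroy(skp);
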
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajson.h b/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajson.h
deleted file mode 100644
index 5c851254815..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajson.h
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the ESN Social Software AB nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
- * Copyright (c) 1988-1993 The Regents of the University of California.
- * Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-/*
-Ultra fast JSON encoder and decoder
-Developed by Jonas Tarnstrom (jonas@esn.me).
-
-Encoder notes:
-------------------
-
-:: Cyclic references ::
-Cyclic referenced objects are not detected.
-Set JSONObjectEncoder.recursionMax to suitable value or make sure input object
-tree doesn't have cyclic references.
-
-*/
-
-#ifndef PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
-#define PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
-
-#include <stdio.h>
-#include <wchar.h>
-#include "../../headers/portable.h"
-
-// Don't output any extra whitespaces when encoding
-#define JSON_NO_EXTRA_WHITESPACE
-
-// Max decimals to encode double floating point numbers with
-#ifndef JSON_DOUBLE_MAX_DECIMALS
-#define JSON_DOUBLE_MAX_DECIMALS 15
-#endif
-
-// Max recursion depth, default for encoder
-#ifndef JSON_MAX_RECURSION_DEPTH
-#define JSON_MAX_RECURSION_DEPTH 1024
-#endif
-
-// Max recursion depth, default for decoder
-#ifndef JSON_MAX_OBJECT_DEPTH
-#define JSON_MAX_OBJECT_DEPTH 1024
-#endif
-
-/*
-Dictates and limits how much stack space UltraJSON will use for buffers before resorting to the provided heap functions */
-#ifndef JSON_MAX_STACK_BUFFER_SIZE
-#define JSON_MAX_STACK_BUFFER_SIZE 131072
-#endif
-
-#ifdef _WIN32
-
-typedef __int64 JSINT64;
-typedef unsigned __int64 JSUINT64;
-
-typedef __int32 JSINT32;
-typedef unsigned __int32 JSUINT32;
-typedef unsigned __int8 JSUINT8;
-typedef unsigned __int16 JSUTF16;
-typedef unsigned __int32 JSUTF32;
-typedef __int64 JSLONG;
-
-#define EXPORTFUNCTION __declspec(dllexport)
-
-#define FASTCALL_MSVC __fastcall
-
-#define INLINE_PREFIX static __inline
-
-#else
-
-#include <stdint.h>
-typedef int64_t JSINT64;
-typedef uint64_t JSUINT64;
-
-typedef int32_t JSINT32;
-typedef uint32_t JSUINT32;
-
-#define FASTCALL_MSVC
-
-#define INLINE_PREFIX static inline
-
-typedef uint8_t JSUINT8;
-typedef uint16_t JSUTF16;
-typedef uint32_t JSUTF32;
-
-typedef int64_t JSLONG;
-
-#define EXPORTFUNCTION
-#endif
-
-#if !(defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__))
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define __LITTLE_ENDIAN__
-#else
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define __BIG_ENDIAN__
-#endif
-
-#endif
-
-#endif
-
-#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
-#error "Endianness not supported"
-#endif
-
-enum JSTYPES {
- JT_NULL, // NULL
- JT_TRUE, // boolean true
- JT_FALSE, // boolean false
- JT_INT, // (JSINT32 (signed 32-bit))
- JT_LONG, // (JSINT64 (signed 64-bit))
- JT_DOUBLE, // (double)
- JT_BIGNUM, // integer larger than sys.maxsize
- JT_UTF8, // (char 8-bit)
- JT_ARRAY, // Array structure
- JT_OBJECT, // Key/Value structure
- JT_INVALID, // Internal, do not return nor expect
- JT_POS_INF, // Positive infinity
- JT_NEG_INF, // Negative infinity
-};
-
-typedef void * JSOBJ;
-typedef void * JSITER;
-
-typedef struct __JSONTypeContext {
- int type;
- void *encoder;
- void *prv;
-} JSONTypeContext;
-
-/*
-Function pointer declarations, suitable for implementing UltraJSON */
-typedef void (*JSPFN_ITERBEGIN)(JSOBJ obj, JSONTypeContext *tc);
-typedef int (*JSPFN_ITERNEXT)(JSOBJ obj, JSONTypeContext *tc);
-typedef void (*JSPFN_ITEREND)(JSOBJ obj, JSONTypeContext *tc);
-typedef JSOBJ (*JSPFN_ITERGETVALUE)(JSOBJ obj, JSONTypeContext *tc);
-typedef char *(*JSPFN_ITERGETNAME)(JSOBJ obj, JSONTypeContext *tc,
- size_t *outLen);
-typedef void *(*JSPFN_MALLOC)(size_t size);
-typedef void (*JSPFN_FREE)(void *pptr);
-typedef void *(*JSPFN_REALLOC)(void *base, size_t size);
-
-typedef struct __JSONObjectEncoder {
- void (*beginTypeContext)(JSOBJ obj, JSONTypeContext *tc);
- void (*endTypeContext)(JSOBJ obj, JSONTypeContext *tc);
- const char *(*getStringValue)(JSOBJ obj, JSONTypeContext *tc,
- size_t *_outLen);
- JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
- JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
- double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
- const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc,
- size_t *_outLen);
-
- /*
- Begin iteration of an iterable object (JT_ARRAY or JT_OBJECT).
- The implementor should set up iteration state in ti->prv
- */
- JSPFN_ITERBEGIN iterBegin;
-
- /*
- Retrieve the next object in an iteration. Should return 0 to indicate the iteration has reached its end, or 1 if there are more items.
- The implementor is responsible for keeping the state of the iteration. Use the ti->prv fields for this
- */
- JSPFN_ITERNEXT iterNext;
-
- /*
- Ends the iteration of an iterable object.
- Any iteration state stored in ti->prv can be freed here
- */
- JSPFN_ITEREND iterEnd;
-
- /*
- Returns a reference to the value object of an iterator.
- The implementor is responsible for the life-cycle of the returned value. Use iterNext/iterEnd and ti->prv to keep track of the current object
- */
- JSPFN_ITERGETVALUE iterGetValue;
-
- /*
- Returns the name of the current iterator item.
- The implementor is responsible for the life-cycle of the returned string. Use iterNext/iterEnd and ti->prv to keep track of the current object
- */
- JSPFN_ITERGETNAME iterGetName;
-
- /*
- Release a value as indicated by setting ti->release = 1 in the previous getValue call.
- The ti->prv array should contain the necessary context to release the value
- */
- void (*releaseObject)(JSOBJ obj);
-
- /* Library functions
- Set to NULL to use STDLIB malloc,realloc,free */
- JSPFN_MALLOC malloc;
- JSPFN_REALLOC realloc;
- JSPFN_FREE free;
-
- /*
- Configuration for max recursion, set to 0 to use default (see JSON_MAX_RECURSION_DEPTH)*/
- int recursionMax;
-
- /*
- Configuration for max decimals of double floating point numbers to encode (0-9) */
- int doublePrecision;
-
- /*
- If true, output will be ASCII with all characters above 127 encoded as \uXXXX. If false, output will be UTF-8 or whatever charset the strings are brought in as */
- int forceASCII;
-
- /*
- If true, '<', '>', and '&' characters will be encoded as \u003c, \u003e, and \u0026, respectively. If false, no special encoding will be used. */
- int encodeHTMLChars;
-
- /*
- Configuration for spaces of indent */
- int indent;
-
- /*
- Set to an error message if an error occurred */
- const char *errorMsg;
- JSOBJ errorObj;
-
- /* Buffer stuff */
- char *start;
- char *offset;
- char *end;
- int heap;
- int level;
-} JSONObjectEncoder;
-
-/*
-Encode an object structure into JSON.
-
-Arguments:
-obj - An anonymous type representing the object
-enc - Function definitions for querying JSOBJ type
-buffer - Preallocated buffer to store the result in. If NULL, the function allocates its own buffer
-cbBuffer - Length of buffer (ignored if buffer is NULL)
-
-Returns:
-Encoded JSON object as a null terminated char string.
-
-NOTE:
-If the supplied buffer wasn't large enough to hold the result, the function will allocate a new buffer.
-The life cycle of the provided buffer must still be handled by the caller.
-
-If the return value doesn't equal the supplied buffer, the caller must release the memory using
-JSONObjectEncoder.free or free(), as specified when calling this function.
-*/
-EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc,
- char *buffer, size_t cbBuffer);
-
-typedef struct __JSONObjectDecoder {
- JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
- int (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
- int (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
- JSOBJ (*newTrue)(void *prv);
- JSOBJ (*newFalse)(void *prv);
- JSOBJ (*newNull)(void *prv);
- JSOBJ (*newPosInf)(void *prv);
- JSOBJ (*newNegInf)(void *prv);
- JSOBJ (*newObject)(void *prv, void *decoder);
- JSOBJ (*endObject)(void *prv, JSOBJ obj);
- JSOBJ (*newArray)(void *prv, void *decoder);
- JSOBJ (*endArray)(void *prv, JSOBJ obj);
- JSOBJ (*newInt)(void *prv, JSINT32 value);
- JSOBJ (*newLong)(void *prv, JSINT64 value);
- JSOBJ (*newUnsignedLong)(void *prv, JSUINT64 value);
- JSOBJ (*newDouble)(void *prv, double value);
- void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
- JSPFN_MALLOC malloc;
- JSPFN_FREE free;
- JSPFN_REALLOC realloc;
- char *errorStr;
- char *errorOffset;
- int preciseFloat;
- void *prv;
-} JSONObjectDecoder;
-
-EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec,
- const char *buffer, size_t cbBuffer);
-EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t);
-
-#endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_
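
A hedged sketch of the buffer contract documented above. `obj` and `enc` are assumed to be prepared elsewhere (type callbacks filled in, enc.free pointing at a real deallocator, everything else zero-initialized):

    char stack_buf[4096];
    char *out = JSON_EncodeObject(obj, &enc, stack_buf, sizeof(stack_buf));
    if (enc.errorMsg == NULL) {
        /* `out` is a null-terminated JSON string ... */
    }
    if (out != NULL && out != stack_buf) {
        enc.free(out);  // per the note above: the result outgrew the caller's buffer
    }
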
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsondec.c b/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsondec.c
deleted file mode 100644
index 5347db16556..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsondec.c
+++ /dev/null
@@ -1,1208 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-* Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-* Neither the name of the ESN Social Software AB nor the
-names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
-LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights
-reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
-* Copyright (c) 1988-1993 The Regents of the University of California.
-* Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-#include <assert.h>
-#include <errno.h>
-#include <limits.h>
-#include <locale.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-#include <wchar.h>
-#include "ultrajson.h"
-
-#ifndef TRUE
-#define TRUE 1
-#define FALSE 0
-#endif
-#ifndef NULL
-#define NULL 0
-#endif
-
-struct DecoderState {
- char *start;
- char *end;
- wchar_t *escStart;
- wchar_t *escEnd;
- int escHeap;
- int lastType;
- JSUINT32 objDepth;
- void *prv;
- JSONObjectDecoder *dec;
-};
-
-JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds);
-typedef JSOBJ (*PFN_DECODER)(struct DecoderState *ds);
-
-static JSOBJ SetError(struct DecoderState *ds, int offset,
- const char *message) {
- ds->dec->errorOffset = ds->start + offset;
- ds->dec->errorStr = (char *)message;
- return NULL;
-}
-
-double createDouble(double intNeg, double intValue, double frcValue,
- int frcDecimalCount) {
- static const double g_pow10[] = {1.0,
- 0.1,
- 0.01,
- 0.001,
- 0.0001,
- 0.00001,
- 0.000001,
- 0.0000001,
- 0.00000001,
- 0.000000001,
- 0.0000000001,
- 0.00000000001,
- 0.000000000001,
- 0.0000000000001,
- 0.00000000000001,
- 0.000000000000001};
- return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg;
-}
-
-JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) {
- char *end;
- double value;
- errno = 0;
-
- value = strtod(ds->start, &end);
-
- if (errno == ERANGE) {
- return SetError(ds, -1, "Range error when decoding numeric as double");
- }
-
- ds->start = end;
- return ds->dec->newDouble(ds->prv, value);
-}
-
-JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
- int intNeg = 1;
- JSUINT64 intValue;
- JSUINT64 prevIntValue;
- int chr;
- int decimalCount = 0;
- double frcValue = 0.0;
- double expNeg;
- double expValue;
- char *offset = ds->start;
-
- JSUINT64 overflowLimit = LLONG_MAX;
-
- if (*(offset) == 'I') {
- goto DECODE_INF;
- } else if (*(offset) == 'N') {
- goto DECODE_NAN;
- } else if (*(offset) == '-') {
- offset++;
- intNeg = -1;
- overflowLimit = LLONG_MIN;
- if (*(offset) == 'I') {
- goto DECODE_INF;
- }
- }
-
- // Scan integer part
- intValue = 0;
-
- while (1) {
- chr = (int)(unsigned char)*(offset);
-
- switch (chr) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9': {
- // PERF: Don't do 64-bit arithmetic here unless we have to
- prevIntValue = intValue;
- intValue = intValue * 10ULL + (JSLONG) (chr - 48);
-
- if (intNeg == 1 && prevIntValue > intValue) {
- return SetError(ds, -1, "Value is too big!");
- } else if (intNeg == -1 && intValue > overflowLimit) {
- return SetError(ds, -1, overflowLimit == LLONG_MAX ?
- "Value is too big!" : "Value is too small");
- }
-
- offset++;
- break;
- }
- case '.': {
- offset++;
- goto DECODE_FRACTION;
- break;
- }
- case 'e':
- case 'E': {
- offset++;
- goto DECODE_EXPONENT;
- break;
- }
-
- default: {
- goto BREAK_INT_LOOP;
- break;
- }
- }
- }
-
-BREAK_INT_LOOP:
-
- ds->lastType = JT_INT;
- ds->start = offset;
-
- if (intNeg == 1 && (intValue & 0x8000000000000000ULL) != 0)
- return ds->dec->newUnsignedLong(ds->prv, intValue);
- else if ((intValue >> 31))
- return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
- else
- return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
-
-DECODE_FRACTION:
-
- if (ds->dec->preciseFloat) {
- return decodePreciseFloat(ds);
- }
-
- // Scan fraction part
- frcValue = 0.0;
- for (;;) {
- chr = (int)(unsigned char)*(offset);
-
- switch (chr) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9': {
- if (decimalCount < JSON_DOUBLE_MAX_DECIMALS) {
- frcValue = frcValue * 10.0 + (double)(chr - 48);
- decimalCount++;
- }
- offset++;
- break;
- }
- case 'e':
- case 'E': {
- offset++;
- goto DECODE_EXPONENT;
- break;
- }
- default: { goto BREAK_FRC_LOOP; }
- }
- }
-
-BREAK_FRC_LOOP:
- // FIXME: Check for arithmetic overflow here
- ds->lastType = JT_DOUBLE;
- ds->start = offset;
- return ds->dec->newDouble(
- ds->prv,
- createDouble((double)intNeg, (double)intValue, frcValue, decimalCount));
-
-DECODE_EXPONENT:
- if (ds->dec->preciseFloat) {
- return decodePreciseFloat(ds);
- }
-
- expNeg = 1.0;
-
- if (*(offset) == '-') {
- expNeg = -1.0;
- offset++;
- } else if (*(offset) == '+') {
- expNeg = +1.0;
- offset++;
- }
-
- expValue = 0.0;
-
- for (;;) {
- chr = (int)(unsigned char)*(offset);
-
- switch (chr) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9': {
- expValue = expValue * 10.0 + (double)(chr - 48);
- offset++;
- break;
- }
- default: { goto BREAK_EXP_LOOP; }
- }
- }
-
-DECODE_NAN:
- offset++;
- if (*(offset++) != 'a') goto SET_NAN_ERROR;
- if (*(offset++) != 'N') goto SET_NAN_ERROR;
-
- ds->lastType = JT_NULL;
- ds->start = offset;
- return ds->dec->newNull(ds->prv);
-
-SET_NAN_ERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'NaN'");
-
-DECODE_INF:
- offset++;
- if (*(offset++) != 'n') goto SET_INF_ERROR;
- if (*(offset++) != 'f') goto SET_INF_ERROR;
- if (*(offset++) != 'i') goto SET_INF_ERROR;
- if (*(offset++) != 'n') goto SET_INF_ERROR;
- if (*(offset++) != 'i') goto SET_INF_ERROR;
- if (*(offset++) != 't') goto SET_INF_ERROR;
- if (*(offset++) != 'y') goto SET_INF_ERROR;
-
- ds->start = offset;
-
- if (intNeg == 1) {
- ds->lastType = JT_POS_INF;
- return ds->dec->newPosInf(ds->prv);
- } else {
- ds->lastType = JT_NEG_INF;
- return ds->dec->newNegInf(ds->prv);
- }
-
-SET_INF_ERROR:
- if (intNeg == 1) {
- const char *msg = "Unexpected character found when decoding 'Infinity'";
- return SetError(ds, -1, msg);
- } else {
- const char *msg = "Unexpected character found when decoding '-Infinity'";
- return SetError(ds, -1, msg);
- }
-
-
-BREAK_EXP_LOOP:
- // FIXME: Check for arithmetic overflow here
- ds->lastType = JT_DOUBLE;
- ds->start = offset;
- return ds->dec->newDouble(
- ds->prv,
- createDouble((double)intNeg, (double)intValue, frcValue, decimalCount) *
- pow(10.0, expValue * expNeg));
-}
-
-JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) {
- char *offset = ds->start;
- offset++;
-
- if (*(offset++) != 'r') goto SETERROR;
- if (*(offset++) != 'u') goto SETERROR;
- if (*(offset++) != 'e') goto SETERROR;
-
- ds->lastType = JT_TRUE;
- ds->start = offset;
- return ds->dec->newTrue(ds->prv);
-
-SETERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'true'");
-}
-
-JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) {
- char *offset = ds->start;
- offset++;
-
- if (*(offset++) != 'a') goto SETERROR;
- if (*(offset++) != 'l') goto SETERROR;
- if (*(offset++) != 's') goto SETERROR;
- if (*(offset++) != 'e') goto SETERROR;
-
- ds->lastType = JT_FALSE;
- ds->start = offset;
- return ds->dec->newFalse(ds->prv);
-
-SETERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'false'");
-}
-
-JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) {
- char *offset = ds->start;
- offset++;
-
- if (*(offset++) != 'u') goto SETERROR;
- if (*(offset++) != 'l') goto SETERROR;
- if (*(offset++) != 'l') goto SETERROR;
-
- ds->lastType = JT_NULL;
- ds->start = offset;
- return ds->dec->newNull(ds->prv);
-
-SETERROR:
- return SetError(ds, -1, "Unexpected character found when decoding 'null'");
-}
-
-void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) {
- char *offset;
-
- for (offset = ds->start; (ds->end - offset) > 0; offset++) {
- switch (*offset) {
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- break;
-
- default:
- ds->start = offset;
- return;
- }
- }
-
- if (offset == ds->end) {
- ds->start = ds->end;
- }
-}
-
-enum DECODESTRINGSTATE {
- DS_ISNULL = 0x32,
- DS_ISQUOTE,
- DS_ISESCAPE,
- DS_UTFLENERROR,
-};
-
-static const JSUINT8 g_decoderLookup[256] = {
- /* 0x00 */ DS_ISNULL, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x20 */ 1, 1, DS_ISQUOTE, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, DS_ISESCAPE, 1, 1, 1,
- /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, DS_UTFLENERROR, DS_UTFLENERROR,
- DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR, DS_UTFLENERROR,
- DS_UTFLENERROR, DS_UTFLENERROR,
-};
-
-JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) {
- JSUTF16 sur[2] = {0};
- int iSur = 0;
- int index;
- wchar_t *escOffset;
- wchar_t *escStart;
- size_t escLen = (ds->escEnd - ds->escStart);
- JSUINT8 *inputOffset;
- JSUINT8 oct;
- JSUTF32 ucs;
- ds->lastType = JT_INVALID;
- ds->start++;
-
- if ((size_t)(ds->end - ds->start) > escLen) {
- size_t newSize = (ds->end - ds->start);
-
- if (ds->escHeap) {
- if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
- return SetError(ds, -1, "Could not reserve memory block");
- }
- escStart = (wchar_t *)ds->dec->realloc(ds->escStart,
- newSize * sizeof(wchar_t));
- if (!escStart) {
- ds->dec->free(ds->escStart);
- return SetError(ds, -1, "Could not reserve memory block");
- }
- ds->escStart = escStart;
- } else {
- wchar_t *oldStart = ds->escStart;
- if (newSize > (SIZE_MAX / sizeof(wchar_t))) {
- return SetError(ds, -1, "Could not reserve memory block");
- }
- ds->escStart =
- (wchar_t *)ds->dec->malloc(newSize * sizeof(wchar_t));
- if (!ds->escStart) {
- return SetError(ds, -1, "Could not reserve memory block");
- }
- ds->escHeap = 1;
- memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
- }
-
- ds->escEnd = ds->escStart + newSize;
- }
-
- escOffset = ds->escStart;
- inputOffset = (JSUINT8 *)ds->start;
-
- for (;;) {
- switch (g_decoderLookup[(JSUINT8)(*inputOffset)]) {
- case DS_ISNULL: {
- return SetError(ds, -1,
- "Unmatched ''\"' when when decoding 'string'");
- }
- case DS_ISQUOTE: {
- ds->lastType = JT_UTF8;
- inputOffset++;
- ds->start += ((char *)inputOffset - (ds->start));
- return ds->dec->newString(ds->prv, ds->escStart, escOffset);
- }
- case DS_UTFLENERROR: {
- return SetError(
- ds, -1,
- "Invalid UTF-8 sequence length when decoding 'string'");
- }
- case DS_ISESCAPE:
- inputOffset++;
- switch (*inputOffset) {
- case '\\':
- *(escOffset++) = L'\\';
- inputOffset++;
- continue;
- case '\"':
- *(escOffset++) = L'\"';
- inputOffset++;
- continue;
- case '/':
- *(escOffset++) = L'/';
- inputOffset++;
- continue;
- case 'b':
- *(escOffset++) = L'\b';
- inputOffset++;
- continue;
- case 'f':
- *(escOffset++) = L'\f';
- inputOffset++;
- continue;
- case 'n':
- *(escOffset++) = L'\n';
- inputOffset++;
- continue;
- case 'r':
- *(escOffset++) = L'\r';
- inputOffset++;
- continue;
- case 't':
- *(escOffset++) = L'\t';
- inputOffset++;
- continue;
-
- case 'u': {
- int index;
- inputOffset++;
-
- for (index = 0; index < 4; index++) {
- switch (*inputOffset) {
- case '\0':
- return SetError(ds, -1,
- "Unterminated unicode "
- "escape sequence when "
- "decoding 'string'");
- default:
- return SetError(ds, -1,
- "Unexpected character in "
- "unicode escape sequence "
- "when decoding 'string'");
-
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- sur[iSur] = (sur[iSur] << 4) +
- (JSUTF16)(*inputOffset - '0');
- break;
-
- case 'a':
- case 'b':
- case 'c':
- case 'd':
- case 'e':
- case 'f':
- sur[iSur] = (sur[iSur] << 4) + 10 +
- (JSUTF16)(*inputOffset - 'a');
- break;
-
- case 'A':
- case 'B':
- case 'C':
- case 'D':
- case 'E':
- case 'F':
- sur[iSur] = (sur[iSur] << 4) + 10 +
- (JSUTF16)(*inputOffset - 'A');
- break;
- }
-
- inputOffset++;
- }
-
- if (iSur == 0) {
- if ((sur[iSur] & 0xfc00) == 0xd800) {
- // First of a surrogate pair, continue parsing
- iSur++;
- break;
- }
- (*escOffset++) = (wchar_t)sur[iSur];
- iSur = 0;
- } else {
- // Decode pair
- if ((sur[1] & 0xfc00) != 0xdc00) {
- return SetError(ds, -1,
- "Unpaired high surrogate when "
- "decoding 'string'");
- }
-#if WCHAR_MAX == 0xffff
- (*escOffset++) = (wchar_t)sur[0];
- (*escOffset++) = (wchar_t)sur[1];
-#else
- (*escOffset++) =
- (wchar_t)0x10000 +
- (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
-#endif
- iSur = 0;
- }
- break;
- }
-
- case '\0':
- return SetError(ds, -1,
- "Unterminated escape sequence when "
- "decoding 'string'");
- default:
- return SetError(ds, -1,
- "Unrecognized escape sequence when "
- "decoding 'string'");
- }
- break;
-
- case 1: {
- *(escOffset++) = (wchar_t)(*inputOffset++);
- break;
- }
-
- case 2: {
- ucs = (*inputOffset++) & 0x1f;
- ucs <<= 6;
- if (((*inputOffset) & 0x80) != 0x80) {
- return SetError(ds, -1,
- "Invalid octet in UTF-8 sequence when "
- "decoding 'string'");
- }
- ucs |= (*inputOffset++) & 0x3f;
- if (ucs < 0x80)
- return SetError(ds, -1,
- "Overlong 2 byte UTF-8 sequence detected "
- "when decoding 'string'");
- *(escOffset++) = (wchar_t)ucs;
- break;
- }
-
- case 3: {
- JSUTF32 ucs = 0;
- ucs |= (*inputOffset++) & 0x0f;
-
- for (index = 0; index < 2; index++) {
- ucs <<= 6;
- oct = (*inputOffset++);
-
- if ((oct & 0x80) != 0x80) {
- return SetError(ds, -1,
- "Invalid octet in UTF-8 sequence when "
- "decoding 'string'");
- }
-
- ucs |= oct & 0x3f;
- }
-
- if (ucs < 0x800)
- return SetError(ds, -1,
- "Overlong 3 byte UTF-8 sequence detected "
- "when encoding string");
- *(escOffset++) = (wchar_t)ucs;
- break;
- }
-
- case 4: {
- JSUTF32 ucs = 0;
- ucs |= (*inputOffset++) & 0x07;
-
- for (index = 0; index < 3; index++) {
- ucs <<= 6;
- oct = (*inputOffset++);
-
- if ((oct & 0x80) != 0x80) {
- return SetError(ds, -1,
- "Invalid octet in UTF-8 sequence when "
- "decoding 'string'");
- }
-
- ucs |= oct & 0x3f;
- }
-
- if (ucs < 0x10000)
- return SetError(ds, -1,
- "Overlong 4 byte UTF-8 sequence detected "
- "when decoding 'string'");
-
-#if WCHAR_MAX == 0xffff
- if (ucs >= 0x10000) {
- ucs -= 0x10000;
- *(escOffset++) = (wchar_t)(ucs >> 10) + 0xd800;
- *(escOffset++) = (wchar_t)(ucs & 0x3ff) + 0xdc00;
- } else {
- *(escOffset++) = (wchar_t)ucs;
- }
-#else
- *(escOffset++) = (wchar_t)ucs;
-#endif
- break;
- }
- }
- }
-}
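
A worked example of the surrogate-pair arithmetic in decode_string above (the escape is hypothetical, not taken from the deleted file):

    // input escape:  \ud83d\ude00
    // sur[0] = 0xd83d (high surrogate), sur[1] = 0xde00 (low surrogate)
    // code point = 0x10000 + (((0xd83d - 0xd800) << 10) | (0xde00 - 0xdc00))
    //            = 0x10000 + ((0x3d << 10) | 0x200)
    //            = 0x10000 + 0xf600
    //            = 0x1f600  (U+1F600; stored as one wchar_t when WCHAR_MAX > 0xffff,
    //                        or re-emitted as the two surrogates otherwise)
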
-
-JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) {
- JSOBJ itemValue;
- JSOBJ newObj;
- int len;
- ds->objDepth++;
- if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
- return SetError(ds, -1, "Reached object decoding depth limit");
- }
-
- newObj = ds->dec->newArray(ds->prv, ds->dec);
- len = 0;
-
- ds->lastType = JT_INVALID;
- ds->start++;
-
- for (;;) {
- SkipWhitespace(ds);
-
- if ((*ds->start) == ']') {
- ds->objDepth--;
- if (len == 0) {
- ds->start++;
- return ds->dec->endArray(ds->prv, newObj);
- }
-
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return SetError(
- ds, -1,
- "Unexpected character found when decoding array value (1)");
- }
-
- itemValue = decode_any(ds);
-
- if (itemValue == NULL) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return NULL;
- }
-
- if (!ds->dec->arrayAddItem(ds->prv, newObj, itemValue)) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return NULL;
- }
-
- SkipWhitespace(ds);
-
- switch (*(ds->start++)) {
- case ']': {
- ds->objDepth--;
- return ds->dec->endArray(ds->prv, newObj);
- }
- case ',':
- break;
-
- default:
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return SetError(
- ds, -1,
- "Unexpected character found when decoding array value (2)");
- }
-
- len++;
- }
-}
-
-JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) {
- JSOBJ itemName;
- JSOBJ itemValue;
- JSOBJ newObj;
-
- ds->objDepth++;
- if (ds->objDepth > JSON_MAX_OBJECT_DEPTH) {
- return SetError(ds, -1, "Reached object decoding depth limit");
- }
-
- newObj = ds->dec->newObject(ds->prv, ds->dec);
-
- ds->start++;
-
- for (;;) {
- SkipWhitespace(ds);
-
- if ((*ds->start) == '}') {
- ds->objDepth--;
- ds->start++;
- return ds->dec->endObject(ds->prv, newObj);
- }
-
- ds->lastType = JT_INVALID;
- itemName = decode_any(ds);
-
- if (itemName == NULL) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return NULL;
- }
-
- if (ds->lastType != JT_UTF8) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- return SetError(
- ds, -1,
- "Key name of object must be 'string' when decoding 'object'");
- }
-
- SkipWhitespace(ds);
-
- if (*(ds->start++) != ':') {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- return SetError(ds, -1, "No ':' found when decoding object value");
- }
-
- SkipWhitespace(ds);
-
- itemValue = decode_any(ds);
-
- if (itemValue == NULL) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- return NULL;
- }
-
- if (!ds->dec->objectAddKey(ds->prv, newObj, itemName, itemValue)) {
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- ds->dec->releaseObject(ds->prv, itemName, ds->dec);
- ds->dec->releaseObject(ds->prv, itemValue, ds->dec);
- return NULL;
- }
-
- SkipWhitespace(ds);
-
- switch (*(ds->start++)) {
- case '}': {
- ds->objDepth--;
- return ds->dec->endObject(ds->prv, newObj);
- }
- case ',':
- break;
-
- default:
- ds->dec->releaseObject(ds->prv, newObj, ds->dec);
- return SetError(
- ds, -1,
- "Unexpected character found when decoding object value");
- }
- }
-}
-
-JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) {
- for (;;) {
- switch (*ds->start) {
- case '\"':
- return decode_string(ds);
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- case 'I':
- case 'N':
- case '-':
- return decode_numeric(ds);
-
- case '[':
- return decode_array(ds);
- case '{':
- return decode_object(ds);
- case 't':
- return decode_true(ds);
- case 'f':
- return decode_false(ds);
- case 'n':
- return decode_null(ds);
-
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- // White space
- ds->start++;
- break;
-
- default:
- return SetError(ds, -1, "Expected object or value");
- }
- }
-}
-
-JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
- size_t cbBuffer) {
- /*
- FIXME: Base the size of escBuffer on that of cbBuffer so that the unicode
- escaping doesn't run into the wall each time */
- char *locale;
- struct DecoderState ds;
- wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
- JSOBJ ret;
-
- ds.start = (char *)buffer;
- ds.end = ds.start + cbBuffer;
-
- ds.escStart = escBuffer;
- ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
- ds.escHeap = 0;
- ds.prv = dec->prv;
- ds.dec = dec;
- ds.dec->errorStr = NULL;
- ds.dec->errorOffset = NULL;
- ds.objDepth = 0;
-
- ds.dec = dec;
-
- locale = setlocale(LC_NUMERIC, NULL);
- if (!locale) {
- return SetError(&ds, -1, "setlocale call failed");
- }
-
- if (strcmp(locale, "C")) {
- size_t len = strlen(locale) + 1;
- char *saved_locale = malloc(len);
- if (saved_locale == NULL) {
- return SetError(&ds, -1, "Could not reserve memory block");
- }
- memcpy(saved_locale, locale, len);
- setlocale(LC_NUMERIC, "C");
- ret = decode_any(&ds);
- setlocale(LC_NUMERIC, saved_locale);
- free(saved_locale);
- } else {
- ret = decode_any(&ds);
- }
-
- if (ds.escHeap) {
- dec->free(ds.escStart);
- }
-
- SkipWhitespace(&ds);
-
- if (ds.start != ds.end && ret) {
- dec->releaseObject(ds.prv, ret, ds.dec);
- return SetError(&ds, -1, "Trailing data");
- }
-
- return ret;
-}
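
A behavior sketch of the trailing-data check above, with hypothetical inputs and an assumed, already-initialized decoder `dec`:

    // JSON_DecodeObject(dec, "42   ", 5) -> object built via dec->newInt(..., 42);
    //                                       trailing whitespace is skipped
    // JSON_DecodeObject(dec, "42 x ", 5) -> NULL, with dec->errorStr == "Trailing data"
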
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsonenc.c
deleted file mode 100644
index 169c5b68890..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/lib/ultrajsonenc.c
+++ /dev/null
@@ -1,1207 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the ESN Social Software AB nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
-LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights
-reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
- * Copyright (c) 1988-1993 The Regents of the University of California.
- * Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-#include <assert.h>
-#include <float.h>
-#include <locale.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "ultrajson.h"
-
-#ifndef TRUE
-#define TRUE 1
-#endif
-#ifndef FALSE
-#define FALSE 0
-#endif
-
-/*
-Worst cases being:
-
-Control characters (ASCII < 32)
-0x00 (1 byte) input => \u0000 output (6 bytes)
-1 * 6 => 6 (6 bytes required)
-
-or UTF-16 surrogate pairs
-4 bytes input in UTF-8 => \uXXXX\uYYYY (12 bytes).
-
-4 * 6 => 24 bytes (12 bytes required)
-
-The extra 2 bytes are for the quotes around the string
-
-*/
-#define RESERVE_STRING(_len) (2 + ((_len)*6))
-
-static const double g_pow10[] = {1,
- 10,
- 100,
- 1000,
- 10000,
- 100000,
- 1000000,
- 10000000,
- 100000000,
- 1000000000,
- 10000000000,
- 100000000000,
- 1000000000000,
- 10000000000000,
- 100000000000000,
- 1000000000000000};
-static const char g_hexChars[] = "0123456789abcdef";
-static const char g_escapeChars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/";
-
-/*
-FIXME: While this is fine and dandy and working, it's a magic-value mess which
-probably only the author understands.
-Needs a cleanup and more documentation */
-
-/*
-Table for pure ascii output escaping all characters above 127 to \uXXXX */
-static const JSUINT8 g_asciiOutputTable[256] = {
- /* 0x00 */ 0, 30, 30, 30, 30, 30, 30, 30, 10, 12, 14, 30, 16, 18, 30, 30,
- /* 0x10 */ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
- /* 0x20 */ 1, 1, 20, 1, 1, 1, 29, 1, 1, 1, 1, 1, 1, 1, 1, 24,
- /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 29, 1, 29, 1,
- /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22, 1, 1, 1,
- /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- /* 0xe0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- /* 0xf0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1};
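To make the magic values a little less magic, here is a standalone sketch (not part of the deleted file) of how the even values 10-24 in the table are used further down in Buffer_EscapeStringValidated: they index into g_escapeChars, where each two-character escape such as \t is stored back to back. The name escape_chars below is local to the sketch; the layout is copied from the table above.

#include <stdio.h>

/* same layout as the g_escapeChars table above */
static const char escape_chars[] = "0123456789\\b\\t\\n\\f\\r\\\"\\\\\\/";

int main(void) {
    /* the output table maps '\t' (0x09) to 12, so the escaper emits
       escape_chars[12] and escape_chars[13]: a backslash followed by 't' */
    int idx = 12;
    printf("%c%c\n", escape_chars[idx], escape_chars[idx + 1]);   /* prints \t */
    return 0;
}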
-
-static void SetError(JSOBJ obj, JSONObjectEncoder *enc, const char *message) {
- enc->errorMsg = message;
- enc->errorObj = obj;
-}
-
-/*
-FIXME: Keep track of how big these buffers get across several encoder calls and
-use that to make an estimate up front, so that we don't run head-first into the
-wall on every call */
-void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) {
- size_t curSize = enc->end - enc->start;
- size_t newSize = curSize * 2;
- size_t offset = enc->offset - enc->start;
-
- while (newSize < curSize + cbNeeded) {
- newSize *= 2;
- }
-
- if (enc->heap) {
- enc->start = (char *)enc->realloc(enc->start, newSize);
- if (!enc->start) {
- SetError(NULL, enc, "Could not reserve memory block");
- return;
- }
- } else {
- char *oldStart = enc->start;
- enc->heap = 1;
- enc->start = (char *)enc->malloc(newSize);
- if (!enc->start) {
- SetError(NULL, enc, "Could not reserve memory block");
- return;
- }
- memcpy(enc->start, oldStart, offset);
- }
- enc->offset = enc->start + offset;
- enc->end = enc->start + newSize;
-}
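A standalone sketch (not part of the deleted file) of the growth policy above: the buffer at least doubles and keeps doubling until the requested number of bytes fits. The helper name grow_size is made up for the sketch.

#include <stdio.h>

/* mirrors the size computation in Buffer_Realloc */
static size_t grow_size(size_t cur, size_t needed) {
    size_t new_size = cur * 2;
    while (new_size < cur + needed) {
        new_size *= 2;
    }
    return new_size;
}

int main(void) {
    /* starting from the 32768-byte default buffer and needing 100000 more bytes */
    printf("%zu\n", grow_size(32768, 100000));   /* 262144 */
    return 0;
}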
-
-INLINE_PREFIX void FASTCALL_MSVC
-Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) {
- *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12];
- *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8];
- *(outputOffset++) = g_hexChars[(value & 0x00f0) >> 4];
- *(outputOffset++) = g_hexChars[(value & 0x000f) >> 0];
-}
-
-int Buffer_EscapeStringUnvalidated(JSONObjectEncoder *enc, const char *io,
- const char *end) {
- char *of = (char *)enc->offset;
-
- for (;;) {
- switch (*io) {
- case 0x00: {
- if (io < end) {
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- break;
- } else {
- enc->offset += (of - enc->offset);
- return TRUE;
- }
- }
- case '\"':
- (*of++) = '\\';
- (*of++) = '\"';
- break;
- case '\\':
- (*of++) = '\\';
- (*of++) = '\\';
- break;
- case '/':
- (*of++) = '\\';
- (*of++) = '/';
- break;
- case '\b':
- (*of++) = '\\';
- (*of++) = 'b';
- break;
- case '\f':
- (*of++) = '\\';
- (*of++) = 'f';
- break;
- case '\n':
- (*of++) = '\\';
- (*of++) = 'n';
- break;
- case '\r':
- (*of++) = '\\';
- (*of++) = 'r';
- break;
- case '\t':
- (*of++) = '\\';
- (*of++) = 't';
- break;
-
- case 0x26: // '&'
- case 0x3c: // '<'
- case 0x3e: // '>'
- {
- if (enc->encodeHTMLChars) {
- // Fall through to \u00XX case below.
- } else {
- // Same as default case below.
- (*of++) = (*io);
- break;
- }
- }
- case 0x01:
- case 0x02:
- case 0x03:
- case 0x04:
- case 0x05:
- case 0x06:
- case 0x07:
- case 0x0b:
- case 0x0e:
- case 0x0f:
- case 0x10:
- case 0x11:
- case 0x12:
- case 0x13:
- case 0x14:
- case 0x15:
- case 0x16:
- case 0x17:
- case 0x18:
- case 0x19:
- case 0x1a:
- case 0x1b:
- case 0x1c:
- case 0x1d:
- case 0x1e:
- case 0x1f: {
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
- *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
- break;
- }
- default:
- (*of++) = (*io);
- break;
- }
- io++;
- }
-}
-
-int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc,
- const char *io, const char *end) {
- JSUTF32 ucs;
- char *of = (char *)enc->offset;
-
- for (;;) {
- JSUINT8 utflen = g_asciiOutputTable[(unsigned char)*io];
-
- switch (utflen) {
- case 0: {
- if (io < end) {
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = '0';
- io++;
- continue;
- } else {
- enc->offset += (of - enc->offset);
- return TRUE;
- }
- }
-
- case 1: {
- *(of++) = (*io++);
- continue;
- }
-
- case 2: {
- JSUTF32 in;
- JSUTF16 in16;
-
- if (end - io < 1) {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unterminated UTF-8 sequence when encoding string");
- return FALSE;
- }
-
- memcpy(&in16, io, sizeof(JSUTF16));
- in = (JSUTF32)in16;
-
-#ifdef __LITTLE_ENDIAN__
- ucs = ((in & 0x1f) << 6) | ((in >> 8) & 0x3f);
-#else
- ucs = ((in & 0x1f00) >> 2) | (in & 0x3f);
-#endif
-
- if (ucs < 0x80) {
- enc->offset += (of - enc->offset);
- SetError(obj, enc,
- "Overlong 2 byte UTF-8 sequence detected when "
- "encoding string");
- return FALSE;
- }
-
- io += 2;
- break;
- }
-
- case 3: {
- JSUTF32 in;
- JSUTF16 in16;
- JSUINT8 in8;
-
- if (end - io < 2) {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unterminated UTF-8 sequence when encoding string");
- return FALSE;
- }
-
- memcpy(&in16, io, sizeof(JSUTF16));
- memcpy(&in8, io + 2, sizeof(JSUINT8));
-#ifdef __LITTLE_ENDIAN__
- in = (JSUTF32)in16;
- in |= in8 << 16;
- ucs = ((in & 0x0f) << 12) | ((in & 0x3f00) >> 2) |
- ((in & 0x3f0000) >> 16);
-#else
- in = in16 << 8;
- in |= in8;
- ucs =
- ((in & 0x0f0000) >> 4) | ((in & 0x3f00) >> 2) | (in & 0x3f);
-#endif
-
- if (ucs < 0x800) {
- enc->offset += (of - enc->offset);
- SetError(obj, enc,
- "Overlong 3 byte UTF-8 sequence detected when "
- "encoding string");
- return FALSE;
- }
-
- io += 3;
- break;
- }
- case 4: {
- JSUTF32 in;
-
- if (end - io < 3) {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unterminated UTF-8 sequence when encoding string");
- return FALSE;
- }
-
- memcpy(&in, io, sizeof(JSUTF32));
-#ifdef __LITTLE_ENDIAN__
- ucs = ((in & 0x07) << 18) | ((in & 0x3f00) << 4) |
- ((in & 0x3f0000) >> 10) | ((in & 0x3f000000) >> 24);
-#else
- ucs = ((in & 0x07000000) >> 6) | ((in & 0x3f0000) >> 4) |
- ((in & 0x3f00) >> 2) | (in & 0x3f);
-#endif
- if (ucs < 0x10000) {
- enc->offset += (of - enc->offset);
- SetError(obj, enc,
- "Overlong 4 byte UTF-8 sequence detected when "
- "encoding string");
- return FALSE;
- }
-
- io += 4;
- break;
- }
-
- case 5:
- case 6: {
- enc->offset += (of - enc->offset);
- SetError(
- obj, enc,
- "Unsupported UTF-8 sequence length when encoding string");
- return FALSE;
- }
-
- case 29: {
- if (enc->encodeHTMLChars) {
- // Fall through to \u00XX case 30 below.
- } else {
- // Same as case 1 above.
- *(of++) = (*io++);
- continue;
- }
- }
-
- case 30: {
- // \uXXXX encode
- *(of++) = '\\';
- *(of++) = 'u';
- *(of++) = '0';
- *(of++) = '0';
- *(of++) = g_hexChars[(unsigned char)(((*io) & 0xf0) >> 4)];
- *(of++) = g_hexChars[(unsigned char)((*io) & 0x0f)];
- io++;
- continue;
- }
- case 10:
- case 12:
- case 14:
- case 16:
- case 18:
- case 20:
- case 22:
- case 24: {
- *(of++) = *((char *)(g_escapeChars + utflen + 0));
- *(of++) = *((char *)(g_escapeChars + utflen + 1));
- io++;
- continue;
- }
- // This can never happen; it's here to keep VC++ happy at warning level 4 (L4)
- default: {
- ucs = 0;
- break;
- }
- }
-
- /*
- If the character is a UTF8 sequence of length > 1 we end up here */
- if (ucs >= 0x10000) {
- ucs -= 0x10000;
- *(of++) = '\\';
- *(of++) = 'u';
- Buffer_AppendShortHexUnchecked(
- of, (unsigned short)(ucs >> 10) + 0xd800);
- of += 4;
-
- *(of++) = '\\';
- *(of++) = 'u';
- Buffer_AppendShortHexUnchecked(
- of, (unsigned short)(ucs & 0x3ff) + 0xdc00);
- of += 4;
- } else {
- *(of++) = '\\';
- *(of++) = 'u';
- Buffer_AppendShortHexUnchecked(of, (unsigned short)ucs);
- of += 4;
- }
- }
-}
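The surrogate-pair arithmetic at the end of the function can be checked in isolation. This is a standalone sketch (not part of the deleted file); U+1F600 is just an arbitrary code point above the Basic Multilingual Plane.

#include <stdio.h>

int main(void) {
    unsigned int ucs = 0x1F600;               /* example non-BMP code point */
    ucs -= 0x10000;                           /* same offset as above */
    unsigned int hi = (ucs >> 10) + 0xd800;   /* high (leading) surrogate */
    unsigned int lo = (ucs & 0x3ff) + 0xdc00; /* low (trailing) surrogate */
    printf("\\u%04x\\u%04x\n", hi, lo);       /* \ud83d\ude00 */
    return 0;
}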
-
-#define Buffer_Reserve(__enc, __len) \
- if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \
- { \
- Buffer_Realloc((__enc), (__len));\
- } \
-
-#define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr;
-
-INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin,
- char *end) {
- char aux;
- while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux;
-}
-
-void Buffer_AppendIndentNewlineUnchecked(JSONObjectEncoder *enc) {
- if (enc->indent > 0) Buffer_AppendCharUnchecked(enc, '\n');
-}
-
-// This function could be refactored to only accept enc as an argument,
-// but this is a straight vendor from ujson source
-void Buffer_AppendIndentUnchecked(JSONObjectEncoder *enc, JSINT32 value) {
- int i;
- if (enc->indent > 0) {
- while (value-- > 0)
- for (i = 0; i < enc->indent; i++)
- Buffer_AppendCharUnchecked(enc, ' ');
- }
-}
-
-void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) {
- char *wstr;
- JSUINT32 uvalue = (value < 0) ? -value : value;
- wstr = enc->offset;
-
- // Conversion. Number is reversed.
- do {
- *wstr++ = (char)(48 + (uvalue % 10));
- } while (uvalue /= 10);
- if (value < 0) *wstr++ = '-';
-
- // Reverse string
- strreverse(enc->offset, wstr - 1);
- enc->offset += (wstr - (enc->offset));
-}
-
-void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) {
- char *wstr;
- JSUINT64 uvalue = (value < 0) ? -value : value;
-
- wstr = enc->offset;
- // Conversion. Number is reversed.
-
- do {
- *wstr++ = (char)(48 + (uvalue % 10ULL));
- } while (uvalue /= 10ULL);
- if (value < 0) *wstr++ = '-';
-
- // Reverse string
- strreverse(enc->offset, wstr - 1);
- enc->offset += (wstr - (enc->offset));
-}
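Both integer appenders use the same write-the-digits-backwards-then-reverse trick. A minimal standalone version (not part of the deleted file; it writes into a plain char array instead of the encoder buffer, and the names reverse/append_long are made up for the sketch) looks roughly like this:

#include <stdio.h>

/* same reversal helper as strreverse above */
static void reverse(char *begin, char *end) {
    char aux;
    while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux;
}

static void append_long(char *out, long long value) {
    char *w = out;
    unsigned long long u =
        (value < 0) ? -(unsigned long long)value : (unsigned long long)value;
    do {
        *w++ = (char)('0' + (u % 10));   /* least-significant digit first */
    } while (u /= 10);
    if (value < 0) *w++ = '-';
    reverse(out, w - 1);                 /* flip into the usual order */
    *w = '\0';
}

int main(void) {
    char buf[32];
    append_long(buf, -90210);
    printf("%s\n", buf);                 /* -90210 */
    return 0;
}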
-
-int Buffer_AppendDoubleUnchecked(JSOBJ obj, JSONObjectEncoder *enc,
- double value) {
- /* if input is beyond the thresholds, revert to exponential */
- const double thres_max = (double)1e16 - 1;
- const double thres_min = (double)1e-15;
- char precision_str[20];
- int count;
- double diff = 0.0;
- char *str = enc->offset;
- char *wstr = str;
- unsigned long long whole;
- double tmp;
- unsigned long long frac;
- int neg;
- double pow10;
-
- if (value == HUGE_VAL || value == -HUGE_VAL) {
- SetError(obj, enc, "Invalid Inf value when encoding double");
- return FALSE;
- }
-
- if (!(value == value)) {
- SetError(obj, enc, "Invalid NaN value when encoding double");
- return FALSE;
- }
-
- /* we'll work in positive values and deal with the
- negative sign issue later */
- neg = 0;
- if (value < 0) {
- neg = 1;
- value = -value;
- }
-
- /*
- For very large or small numbers, switch back to native sprintf for
- exponentials. Anyone want to write code to replace this? */
- if (value > thres_max || (value != 0.0 && fabs(value) < thres_min)) {
- precision_str[0] = '%';
- precision_str[1] = '.';
-#if defined(_WIN32) && defined(_MSC_VER)
- sprintf_s(precision_str + 2, sizeof(precision_str) - 2, "%ug",
- enc->doublePrecision);
- enc->offset += sprintf_s(str, enc->end - enc->offset, precision_str,
- neg ? -value : value);
-#else
- snprintf(precision_str + 2, sizeof(precision_str) - 2, "%ug",
- enc->doublePrecision);
- enc->offset += snprintf(str, enc->end - enc->offset, precision_str,
- neg ? -value : value);
-#endif
- return TRUE;
- }
-
- pow10 = g_pow10[enc->doublePrecision];
-
- whole = (unsigned long long)value;
- tmp = (value - whole) * pow10;
- frac = (unsigned long long)(tmp);
- diff = tmp - frac;
-
- if (diff > 0.5) {
- ++frac;
- } else if (diff == 0.5 && ((frac == 0) || (frac & 1))) {
- /* if halfway, round up if odd, OR
- if last digit is 0. That last part is strange */
- ++frac;
- }
-
- // handle rollover, e.g.
- // 0.99 with precision 1 becomes 1.0, and 0.95 with precision 1 becomes 1.0 as well
- if (frac >= pow10) {
- frac = 0;
- ++whole;
- }
-
- if (enc->doublePrecision == 0) {
- diff = value - whole;
-
- if (diff > 0.5) {
- /* greater than 0.5, round up, e.g. 1.6 -> 2 */
- ++whole;
- } else if (diff == 0.5 && (whole & 1)) {
- /* exactly 0.5 and ODD, then round up */
- /* 1.5 -> 2, but 2.5 -> 2 */
- ++whole;
- }
-
- // vvvvvvvvvvvvvvvvvvv Diff from modp_dto2
- } else if (frac) {
- count = enc->doublePrecision;
- // now do fractional part, as an unsigned number
- // we know it is not 0 but we can have leading zeros, these
- // should be removed
- while (!(frac % 10)) {
- --count;
- frac /= 10;
- }
- //^^^^^^^^^^^^^^^^^^^ Diff from modp_dto2
-
- // now do fractional part, as an unsigned number
- do {
- --count;
- *wstr++ = (char)(48 + (frac % 10));
- } while (frac /= 10);
- // add extra 0s
- while (count-- > 0) {
- *wstr++ = '0';
- }
- // add decimal
- *wstr++ = '.';
- } else {
- *wstr++ = '0';
- *wstr++ = '.';
- }
-
- // Do whole part. Take care of sign
- // conversion. Number is reversed.
- do {
- *wstr++ = (char)(48 + (whole % 10));
- } while (whole /= 10);
-
- if (neg) {
- *wstr++ = '-';
- }
- strreverse(str, wstr - 1);
- enc->offset += (wstr - (enc->offset));
-
- return TRUE;
-}
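The whole/frac split above can be followed with a small worked example (a standalone sketch, not part of the deleted file): with a precision of 4, the value 3.14159 splits into whole = 3 and frac = 1416 after rounding, which the encoder then prints as 3.1416.

#include <stdio.h>

int main(void) {
    double value = 3.14159;
    double pow10 = 10000.0;   /* g_pow10[4], i.e. doublePrecision == 4 */

    unsigned long long whole = (unsigned long long)value;
    double tmp = (value - whole) * pow10;
    unsigned long long frac = (unsigned long long)tmp;
    if (tmp - frac > 0.5) {
        ++frac;               /* same "greater than half" rounding step as above */
    }

    printf("whole=%llu frac=%llu\n", whole, frac);   /* whole=3 frac=1416 */
    return 0;
}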
-
-/*
-FIXME:
-Handle integration functions returning NULL here */
-
-/*
-FIXME:
-Perhaps implement recursion detection */
-
-void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
- size_t cbName) {
- const char *value;
- char *objName;
- int count;
- JSOBJ iterObj;
- size_t szlen;
- JSONTypeContext tc;
- tc.encoder = enc;
-
- if (enc->level > enc->recursionMax) {
- SetError(obj, enc, "Maximum recursion level reached");
- return;
- }
-
- /*
- This reservation must hold:
-
- the worst-case encoded length of _name +
- the max length of a double as a string OR the max length of a JSLONG as a string
- */
-
- Buffer_Reserve(enc, 256 + RESERVE_STRING(cbName));
- if (enc->errorMsg) {
- return;
- }
-
- if (name) {
- Buffer_AppendCharUnchecked(enc, '\"');
-
- if (enc->forceASCII) {
- if (!Buffer_EscapeStringValidated(obj, enc, name, name + cbName)) {
- return;
- }
- } else {
- if (!Buffer_EscapeStringUnvalidated(enc, name, name + cbName)) {
- return;
- }
- }
-
- Buffer_AppendCharUnchecked(enc, '\"');
-
- Buffer_AppendCharUnchecked(enc, ':');
-#ifndef JSON_NO_EXTRA_WHITESPACE
- Buffer_AppendCharUnchecked(enc, ' ');
-#endif
- }
-
- enc->beginTypeContext(obj, &tc);
-
- switch (tc.type) {
- case JT_INVALID: {
- return;
- }
-
- case JT_ARRAY: {
- count = 0;
- enc->iterBegin(obj, &tc);
-
- Buffer_AppendCharUnchecked(enc, '[');
- Buffer_AppendIndentNewlineUnchecked(enc);
-
- while (enc->iterNext(obj, &tc)) {
- if (count > 0) {
- Buffer_AppendCharUnchecked(enc, ',');
-#ifndef JSON_NO_EXTRA_WHITESPACE
- Buffer_AppendCharUnchecked(enc, ' ');
-#endif
- Buffer_AppendIndentNewlineUnchecked(enc);
- }
-
- iterObj = enc->iterGetValue(obj, &tc);
-
- enc->level++;
- Buffer_AppendIndentUnchecked(enc, enc->level);
- encode(iterObj, enc, NULL, 0);
- count++;
- }
-
- enc->iterEnd(obj, &tc);
- Buffer_AppendIndentNewlineUnchecked(enc);
- Buffer_AppendIndentUnchecked(enc, enc->level);
- Buffer_AppendCharUnchecked(enc, ']');
- break;
- }
-
- case JT_OBJECT: {
- count = 0;
- enc->iterBegin(obj, &tc);
-
- Buffer_AppendCharUnchecked(enc, '{');
- Buffer_AppendIndentNewlineUnchecked(enc);
-
- while (enc->iterNext(obj, &tc)) {
- if (count > 0) {
- Buffer_AppendCharUnchecked(enc, ',');
-#ifndef JSON_NO_EXTRA_WHITESPACE
- Buffer_AppendCharUnchecked(enc, ' ');
-#endif
- Buffer_AppendIndentNewlineUnchecked(enc);
- }
-
- iterObj = enc->iterGetValue(obj, &tc);
- objName = enc->iterGetName(obj, &tc, &szlen);
-
- enc->level++;
- Buffer_AppendIndentUnchecked(enc, enc->level);
- encode(iterObj, enc, objName, szlen);
- count++;
- }
-
- enc->iterEnd(obj, &tc);
- Buffer_AppendIndentNewlineUnchecked(enc);
- Buffer_AppendIndentUnchecked(enc, enc->level);
- Buffer_AppendCharUnchecked(enc, '}');
- break;
- }
-
- case JT_LONG: {
- Buffer_AppendLongUnchecked(enc, enc->getLongValue(obj, &tc));
- break;
- }
-
- case JT_INT: {
- Buffer_AppendIntUnchecked(enc, enc->getIntValue(obj, &tc));
- break;
- }
-
- case JT_TRUE: {
- Buffer_AppendCharUnchecked(enc, 't');
- Buffer_AppendCharUnchecked(enc, 'r');
- Buffer_AppendCharUnchecked(enc, 'u');
- Buffer_AppendCharUnchecked(enc, 'e');
- break;
- }
-
- case JT_FALSE: {
- Buffer_AppendCharUnchecked(enc, 'f');
- Buffer_AppendCharUnchecked(enc, 'a');
- Buffer_AppendCharUnchecked(enc, 'l');
- Buffer_AppendCharUnchecked(enc, 's');
- Buffer_AppendCharUnchecked(enc, 'e');
- break;
- }
-
- case JT_NULL: {
- Buffer_AppendCharUnchecked(enc, 'n');
- Buffer_AppendCharUnchecked(enc, 'u');
- Buffer_AppendCharUnchecked(enc, 'l');
- Buffer_AppendCharUnchecked(enc, 'l');
- break;
- }
-
- case JT_DOUBLE: {
- if (!Buffer_AppendDoubleUnchecked(obj, enc,
- enc->getDoubleValue(obj, &tc))) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- break;
- }
-
- case JT_UTF8: {
- value = enc->getStringValue(obj, &tc, &szlen);
- if (enc->errorMsg) {
- enc->endTypeContext(obj, &tc);
- return;
- }
- Buffer_Reserve(enc, RESERVE_STRING(szlen));
- Buffer_AppendCharUnchecked(enc, '\"');
-
- if (enc->forceASCII) {
- if (!Buffer_EscapeStringValidated(obj, enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- } else {
- if (!Buffer_EscapeStringUnvalidated(enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- }
-
- Buffer_AppendCharUnchecked(enc, '\"');
- break;
- }
-
- case JT_BIGNUM: {
- value = enc->getBigNumStringValue(obj, &tc, &szlen);
-
- Buffer_Reserve(enc, RESERVE_STRING(szlen));
- if (enc->errorMsg) {
- enc->endTypeContext(obj, &tc);
- return;
- }
-
- if (enc->forceASCII) {
- if (!Buffer_EscapeStringValidated(obj, enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- } else {
- if (!Buffer_EscapeStringUnvalidated(enc, value,
- value + szlen)) {
- enc->endTypeContext(obj, &tc);
- enc->level--;
- return;
- }
- }
-
- break;
- }
- }
-
- enc->endTypeContext(obj, &tc);
- enc->level--;
-}
-
-char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer,
- size_t _cbBuffer) {
- char *locale;
- enc->malloc = enc->malloc ? enc->malloc : malloc;
- enc->free = enc->free ? enc->free : free;
- enc->realloc = enc->realloc ? enc->realloc : realloc;
- enc->errorMsg = NULL;
- enc->errorObj = NULL;
- enc->level = 0;
-
- if (enc->recursionMax < 1) {
- enc->recursionMax = JSON_MAX_RECURSION_DEPTH;
- }
-
- if (enc->doublePrecision < 0 ||
- enc->doublePrecision > JSON_DOUBLE_MAX_DECIMALS) {
- enc->doublePrecision = JSON_DOUBLE_MAX_DECIMALS;
- }
-
- if (_buffer == NULL) {
- _cbBuffer = 32768;
- enc->start = (char *)enc->malloc(_cbBuffer);
- if (!enc->start) {
- SetError(obj, enc, "Could not reserve memory block");
- return NULL;
- }
- enc->heap = 1;
- } else {
- enc->start = _buffer;
- enc->heap = 0;
- }
-
- enc->end = enc->start + _cbBuffer;
- enc->offset = enc->start;
-
- locale = setlocale(LC_NUMERIC, NULL);
- if (!locale) {
- SetError(NULL, enc, "setlocale call failed");
- return NULL;
- }
-
- if (strcmp(locale, "C")) {
- size_t len = strlen(locale) + 1;
- char *saved_locale = malloc(len);
- if (saved_locale == NULL) {
- SetError(NULL, enc, "Could not reserve memory block");
- return NULL;
- }
- memcpy(saved_locale, locale, len);
- setlocale(LC_NUMERIC, "C");
- encode(obj, enc, NULL, 0);
- setlocale(LC_NUMERIC, saved_locale);
- free(saved_locale);
- } else {
- encode(obj, enc, NULL, 0);
- }
-
- Buffer_Reserve(enc, 1);
- if (enc->errorMsg) {
- return NULL;
- }
- Buffer_AppendCharUnchecked(enc, '\0');
-
- return enc->start;
-}
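The LC_NUMERIC handling at the end is a standard save/switch/restore pattern. Below is a standalone sketch (not part of the deleted file) of just that pattern, with a printf standing in for the encode call:

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    char *locale = setlocale(LC_NUMERIC, NULL);   /* query the current locale */
    if (!locale) return 1;

    if (strcmp(locale, "C") != 0) {
        size_t len = strlen(locale) + 1;
        char *saved = malloc(len);                /* copy before it is invalidated */
        if (!saved) return 1;
        memcpy(saved, locale, len);

        setlocale(LC_NUMERIC, "C");               /* force '.' as decimal separator */
        printf("%g\n", 3.5);                      /* locale-sensitive work goes here */
        setlocale(LC_NUMERIC, saved);             /* restore the caller's locale */
        free(saved);
    } else {
        printf("%g\n", 3.5);
    }
    return 0;
}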
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/JSONtoObj.c b/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/JSONtoObj.c
deleted file mode 100644
index d7086ffba62..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/JSONtoObj.c
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the ESN Social Software AB nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
- * Copyright (c) 1988-1993 The Regents of the University of California.
- * Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
-#define NO_IMPORT_ARRAY
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-#include <numpy/arrayobject.h>
-#include <ultrajson.h>
-
-#define PRINTMARK()
-
-typedef struct __PyObjectDecoder {
- JSONObjectDecoder dec;
-
- void *npyarr; // Numpy context buffer
- void *npyarr_addr; // Ref to npyarr ptr to track DECREF calls
- npy_intp curdim; // Current array dimension
-
- PyArray_Descr *dtype;
-} PyObjectDecoder;
-
-typedef struct __NpyArrContext {
- PyObject *ret;
- PyObject *labels[2];
- PyArray_Dims shape;
-
- PyObjectDecoder *dec;
-
- npy_intp i;
- npy_intp elsize;
- npy_intp elcount;
-} NpyArrContext;
-
-// Numpy handling based on numpy internal code, specifically the function
-// PyArray_FromIter.
-
-// numpy related functions are inter-dependent so declare them all here,
-// to ensure the compiler catches any errors
-
-// standard numpy array handling
-JSOBJ Object_npyNewArray(void *prv, void *decoder);
-JSOBJ Object_npyEndArray(void *prv, JSOBJ obj);
-int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value);
-
-// for more complex dtypes (object and string) fill a standard Python list
-// and convert to a numpy array when done.
-JSOBJ Object_npyNewArrayList(void *prv, void *decoder);
-JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj);
-int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value);
-
-// free the numpy context buffer
-void Npy_releaseContext(NpyArrContext *npyarr) {
- PRINTMARK();
- if (npyarr) {
- if (npyarr->shape.ptr) {
- PyObject_Free(npyarr->shape.ptr);
- }
- if (npyarr->dec) {
- npyarr->dec->npyarr = NULL;
- npyarr->dec->curdim = 0;
- }
- Py_XDECREF(npyarr->labels[0]);
- Py_XDECREF(npyarr->labels[1]);
- Py_XDECREF(npyarr->ret);
- PyObject_Free(npyarr);
- }
-}
-
-JSOBJ Object_npyNewArray(void *prv, void *_decoder) {
- NpyArrContext *npyarr;
- PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
- PRINTMARK();
- if (decoder->curdim <= 0) {
- // start of array - initialise the context buffer
- npyarr = decoder->npyarr = PyObject_Malloc(sizeof(NpyArrContext));
- decoder->npyarr_addr = npyarr;
-
- if (!npyarr) {
- PyErr_NoMemory();
- return NULL;
- }
-
- npyarr->dec = decoder;
- npyarr->labels[0] = npyarr->labels[1] = NULL;
-
- npyarr->shape.ptr = PyObject_Malloc(sizeof(npy_intp) * NPY_MAXDIMS);
- npyarr->shape.len = 1;
- npyarr->ret = NULL;
-
- npyarr->elsize = 0;
- npyarr->elcount = 4;
- npyarr->i = 0;
- } else {
- // starting a new dimension, continue the current array (and reshape
- // after)
- npyarr = (NpyArrContext *)decoder->npyarr;
- if (decoder->curdim >= npyarr->shape.len) {
- npyarr->shape.len++;
- }
- }
-
- npyarr->shape.ptr[decoder->curdim] = 0;
- decoder->curdim++;
- return npyarr;
-}
-
-PyObject *Npy_returnLabelled(NpyArrContext *npyarr) {
- PyObject *ret = npyarr->ret;
- npy_intp i;
-
- if (npyarr->labels[0] || npyarr->labels[1]) {
- // finished decoding, build tuple with values and labels
- ret = PyTuple_New(npyarr->shape.len + 1);
- for (i = 0; i < npyarr->shape.len; i++) {
- if (npyarr->labels[i]) {
- PyTuple_SET_ITEM(ret, i + 1, npyarr->labels[i]);
- npyarr->labels[i] = NULL;
- } else {
- Py_INCREF(Py_None);
- PyTuple_SET_ITEM(ret, i + 1, Py_None);
- }
- }
- PyTuple_SET_ITEM(ret, 0, npyarr->ret);
- }
-
- return ret;
-}
-
-JSOBJ Object_npyEndArray(void *prv, JSOBJ obj) {
- PyObject *ret;
- char *new_data;
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- int emptyType = NPY_DEFAULT_TYPE;
- npy_intp i;
- PRINTMARK();
- if (!npyarr) {
- return NULL;
- }
-
- ret = npyarr->ret;
- i = npyarr->i;
-
- npyarr->dec->curdim--;
-
- if (i == 0 || !npyarr->ret) {
- // empty array would not have been initialised so do it now.
- if (npyarr->dec->dtype) {
- emptyType = npyarr->dec->dtype->type_num;
- }
- npyarr->ret = ret =
- PyArray_EMPTY(npyarr->shape.len, npyarr->shape.ptr, emptyType, 0);
- } else if (npyarr->dec->curdim <= 0) {
- // realloc to final size
- new_data = PyDataMem_RENEW(PyArray_DATA(ret), i * npyarr->elsize);
- if (new_data == NULL) {
- PyErr_NoMemory();
- Npy_releaseContext(npyarr);
- return NULL;
- }
- ((PyArrayObject *)ret)->data = (void *)new_data;
- // PyArray_BYTES(ret) = new_data;
- }
-
- if (npyarr->dec->curdim <= 0) {
- // finished decoding array, reshape if necessary
- if (npyarr->shape.len > 1) {
- npyarr->ret = PyArray_Newshape((PyArrayObject *)ret, &npyarr->shape,
- NPY_ANYORDER);
- Py_DECREF(ret);
- }
-
- ret = Npy_returnLabelled(npyarr);
-
- npyarr->ret = NULL;
- Npy_releaseContext(npyarr);
- }
-
- return ret;
-}
-
-int Object_npyArrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
- PyObject *type;
- PyArray_Descr *dtype;
- npy_intp i;
- char *new_data, *item;
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- PRINTMARK();
- if (!npyarr) {
- return 0;
- }
-
- i = npyarr->i;
-
- npyarr->shape.ptr[npyarr->dec->curdim - 1]++;
-
- if (PyArray_Check((PyObject *)value)) {
- // multidimensional array, keep decoding values.
- return 1;
- }
-
- if (!npyarr->ret) {
- // Array not initialised yet.
- // We do it here so we can 'sniff' the data type if none was provided
- if (!npyarr->dec->dtype) {
- type = PyObject_Type(value);
- if (!PyArray_DescrConverter(type, &dtype)) {
- Py_DECREF(type);
- goto fail;
- }
- Py_INCREF(dtype);
- Py_DECREF(type);
- } else {
- dtype = PyArray_DescrNew(npyarr->dec->dtype);
- }
-
- // If it's an object or string then fill a Python list and subsequently
- // convert. Otherwise we would need to somehow mess about with
- // reference counts when renewing memory.
- npyarr->elsize = dtype->elsize;
- if (PyDataType_REFCHK(dtype) || npyarr->elsize == 0) {
- Py_XDECREF(dtype);
-
- if (npyarr->dec->curdim > 1) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot decode multidimensional arrays with "
- "variable length elements to numpy");
- goto fail;
- }
- npyarr->elcount = 0;
- npyarr->ret = PyList_New(0);
- if (!npyarr->ret) {
- goto fail;
- }
- ((JSONObjectDecoder *)npyarr->dec)->newArray =
- Object_npyNewArrayList;
- ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem =
- Object_npyArrayListAddItem;
- ((JSONObjectDecoder *)npyarr->dec)->endArray =
- Object_npyEndArrayList;
- return Object_npyArrayListAddItem(prv, obj, value);
- }
-
- npyarr->ret = PyArray_NewFromDescr(
- &PyArray_Type, dtype, 1, &npyarr->elcount, NULL, NULL, 0, NULL);
-
- if (!npyarr->ret) {
- goto fail;
- }
- }
-
- if (i >= npyarr->elcount) {
- // Grow PyArray_DATA(ret):
- // this is similar to the strategy for PyListObject, but we use
- // 50% overallocation => 0, 4, 8, 14, 23, 36, 56, 86 ...
- if (npyarr->elsize == 0) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot decode multidimensional arrays with "
- "variable length elements to numpy");
- goto fail;
- }
-
- npyarr->elcount = (i >> 1) + (i < 4 ? 4 : 2) + i;
- if (npyarr->elcount <= NPY_MAX_INTP / npyarr->elsize) {
- new_data = PyDataMem_RENEW(PyArray_DATA(npyarr->ret),
- npyarr->elcount * npyarr->elsize);
- } else {
- PyErr_NoMemory();
- goto fail;
- }
- ((PyArrayObject *)npyarr->ret)->data = (void *)new_data;
-
- // PyArray_BYTES(npyarr->ret) = new_data;
- }
-
- PyArray_DIMS(npyarr->ret)[0] = i + 1;
-
- if ((item = PyArray_GETPTR1(npyarr->ret, i)) == NULL ||
- PyArray_SETITEM(npyarr->ret, item, value) == -1) {
- goto fail;
- }
-
- Py_DECREF((PyObject *)value);
- npyarr->i++;
- return 1;
-
-fail:
-
- Npy_releaseContext(npyarr);
- return 0;
-}
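The "50% overallocation" comment above can be reproduced by iterating the same formula in a standalone sketch (not part of the deleted file):

#include <stdio.h>

int main(void) {
    long long i = 0;
    int step;
    for (step = 0; step < 8; step++) {
        printf("%lld ", i);
        i = (i >> 1) + (i < 4 ? 4 : 2) + i;   /* same growth formula as above */
    }
    printf("\n");                             /* 0 4 8 14 23 36 56 86 */
    return 0;
}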
-
-JSOBJ Object_npyNewArrayList(void *prv, void *_decoder) {
- PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
- PRINTMARK();
- PyErr_SetString(
- PyExc_ValueError,
- "nesting not supported for object or variable length dtypes");
- Npy_releaseContext(decoder->npyarr);
- return NULL;
-}
-
-JSOBJ Object_npyEndArrayList(void *prv, JSOBJ obj) {
- PyObject *list, *ret;
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- PRINTMARK();
- if (!npyarr) {
- return NULL;
- }
-
- // convert decoded list to numpy array
- list = (PyObject *)npyarr->ret;
- npyarr->ret = PyArray_FROM_O(list);
-
- ret = Npy_returnLabelled(npyarr);
- npyarr->ret = list;
-
- ((JSONObjectDecoder *)npyarr->dec)->newArray = Object_npyNewArray;
- ((JSONObjectDecoder *)npyarr->dec)->arrayAddItem = Object_npyArrayAddItem;
- ((JSONObjectDecoder *)npyarr->dec)->endArray = Object_npyEndArray;
- Npy_releaseContext(npyarr);
- return ret;
-}
-
-int Object_npyArrayListAddItem(void *prv, JSOBJ obj, JSOBJ value) {
- NpyArrContext *npyarr = (NpyArrContext *)obj;
- PRINTMARK();
- if (!npyarr) {
- return 0;
- }
- PyList_Append((PyObject *)npyarr->ret, value);
- Py_DECREF((PyObject *)value);
- npyarr->elcount++;
- return 1;
-}
-
-int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
- int ret = PyDict_SetItem(obj, name, value);
- Py_DECREF((PyObject *)name);
- Py_DECREF((PyObject *)value);
- return ret == 0 ? 1 : 0;
-}
-
-int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
- int ret = PyList_Append(obj, value);
- Py_DECREF((PyObject *)value);
- return ret == 0 ? 1 : 0;
-}
-
-JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {
- return PyUnicode_FromWideChar(start, (end - start));
-}
-
-JSOBJ Object_newTrue(void *prv) { Py_RETURN_TRUE; }
-
-JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; }
-
-JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; }
-
-JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); }
-
-JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); }
-
-JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); }
-
-JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; }
-
-JSOBJ Object_newArray(void *prv, void *decoder) { return PyList_New(0); }
-
-JSOBJ Object_endArray(void *prv, JSOBJ obj) { return obj; }
-
-JSOBJ Object_newInteger(void *prv, JSINT32 value) {
- return PyLong_FromLong((long)value);
-}
-
-JSOBJ Object_newLong(void *prv, JSINT64 value) {
- return PyLong_FromLongLong(value);
-}
-
-JSOBJ Object_newUnsignedLong(void *prv, JSUINT64 value) {
- return PyLong_FromUnsignedLongLong(value);
-}
-
-JSOBJ Object_newDouble(void *prv, double value) {
- return PyFloat_FromDouble(value);
-}
-
-static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
- PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
- if (obj != decoder->npyarr_addr) {
- Py_XDECREF(((PyObject *)obj));
- }
-}
-
-static char *g_kwlist[] = {"obj", "precise_float",
- "labelled", "dtype", NULL};
-
-PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
- PyObject *ret;
- PyObject *sarg;
- PyObject *arg;
- PyObject *opreciseFloat = NULL;
- JSONObjectDecoder *decoder;
- PyObjectDecoder pyDecoder;
- PyArray_Descr *dtype = NULL;
- int labelled = 0;
-
- JSONObjectDecoder dec = {
- Object_newString, Object_objectAddKey, Object_arrayAddItem,
- Object_newTrue, Object_newFalse, Object_newNull,
- Object_newPosInf, Object_newNegInf, Object_newObject,
- Object_endObject, Object_newArray, Object_endArray,
- Object_newInteger, Object_newLong, Object_newUnsignedLong,
- Object_newDouble,
- Object_releaseObject, PyObject_Malloc, PyObject_Free,
- PyObject_Realloc};
-
- dec.preciseFloat = 0;
- dec.prv = NULL;
-
- pyDecoder.dec = dec;
- pyDecoder.curdim = 0;
- pyDecoder.npyarr = NULL;
- pyDecoder.npyarr_addr = NULL;
-
- decoder = (JSONObjectDecoder *)&pyDecoder;
-
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiiO&", g_kwlist, &arg,
- &opreciseFloat, &labelled,
- PyArray_DescrConverter2, &dtype)) {
- Npy_releaseContext(pyDecoder.npyarr);
- return NULL;
- }
-
- if (opreciseFloat && PyObject_IsTrue(opreciseFloat)) {
- decoder->preciseFloat = 1;
- }
-
- if (PyBytes_Check(arg)) {
- sarg = arg;
- } else if (PyUnicode_Check(arg)) {
- sarg = PyUnicode_AsUTF8String(arg);
- if (sarg == NULL) {
- // Exception raised above us by codec according to docs
- return NULL;
- }
- } else {
- PyErr_Format(PyExc_TypeError, "Expected 'str' or 'bytes'");
- return NULL;
- }
-
- decoder->errorStr = NULL;
- decoder->errorOffset = NULL;
-
- ret = JSON_DecodeObject(decoder, PyBytes_AS_STRING(sarg),
- PyBytes_GET_SIZE(sarg));
-
- if (sarg != arg) {
- Py_DECREF(sarg);
- }
-
- if (PyErr_Occurred()) {
- if (ret) {
- Py_DECREF((PyObject *)ret);
- }
- Npy_releaseContext(pyDecoder.npyarr);
- return NULL;
- }
-
- if (decoder->errorStr) {
- /*
- FIXME: It's possible to give a much nicer error message here, including the
- actual failing element from the input, etc. */
-
- PyErr_Format(PyExc_ValueError, "%s", decoder->errorStr);
-
- if (ret) {
- Py_DECREF((PyObject *)ret);
- }
- Npy_releaseContext(pyDecoder.npyarr);
-
- return NULL;
- }
-
- return ret;
-}
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c b/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c
deleted file mode 100644
index 6edb1995284..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
-Copyright (c) 2020, PyData Development Team
-All rights reserved.
-Distributed under the terms of the BSD Simplified License.
-The full license is in the LICENSE file, distributed with this software.
-*/
-
-// Conversion routines that are useful for serialization,
-// but which don't interact with JSON objects directly
-
-#include "date_conversions.h"
-#include "../../../tslibs/src/datetime/np_datetime.h"
-#include "../../../tslibs/src/datetime/np_datetime_strings.h"
-
-/*
- * Function: scaleNanosecToUnit
- * -----------------------------
- *
- * Scales an integer value representing time in nanoseconds to provided unit.
- *
- * Mutates the provided value directly. Returns 0 on success, non-zero on error.
- */
-int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) {
- switch (unit) {
- case NPY_FR_ns:
- break;
- case NPY_FR_us:
- *value /= 1000LL;
- break;
- case NPY_FR_ms:
- *value /= 1000000LL;
- break;
- case NPY_FR_s:
- *value /= 1000000000LL;
- break;
- default:
- return -1;
- }
-
- return 0;
-}
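A standalone illustration (not part of the deleted file) of what the milliseconds branch does to a nanosecond timestamp; the divisor simply mirrors the NPY_FR_ms case above.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    int64_t t = 1577836800123456789LL;   /* some instant expressed in nanoseconds */
    t /= 1000000LL;                      /* ns -> ms, as in the NPY_FR_ms case */
    printf("%lld\n", (long long)t);      /* 1577836800123 */
    return 0;
}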
-
-/* Converts the int64_t representation of a datetime to ISO; mutates len */
-char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
- npy_datetimestruct dts;
- int ret_code;
-
- pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
-
- *len = (size_t)get_datetime_iso_8601_strlen(0, base);
- char *result = PyObject_Malloc(*len);
-
- if (result == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
- // datetime64 is always naive
- ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
- if (ret_code != 0) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert datetime value to string");
- PyObject_Free(result);
- }
-
- // Note that get_datetime_iso_8601_strlen just gives a generic size
- // for ISO string conversion, not the actual size used
- *len = strlen(result);
- return result;
-}
-
-npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
- scaleNanosecToUnit(&dt, base);
- return dt;
-}
-
-/* Converts a PyDateTime to an ISO C-string; mutates len */
-char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base,
- size_t *len) {
- npy_datetimestruct dts;
- int ret;
-
- ret = convert_pydatetime_to_datetimestruct(obj, &dts);
- if (ret != 0) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert PyDateTime to numpy datetime");
- }
- return NULL;
- }
-
- *len = (size_t)get_datetime_iso_8601_strlen(0, base);
- char *result = PyObject_Malloc(*len);
- // Check to see if PyDateTime has a timezone.
- // Don't convert to UTC if it doesn't.
- int is_tz_aware = 0;
- if (PyObject_HasAttrString(obj, "tzinfo")) {
- PyObject *offset = extract_utc_offset(obj);
- if (offset == NULL) {
- PyObject_Free(result);
- return NULL;
- }
- is_tz_aware = offset != Py_None;
- Py_DECREF(offset);
- }
- ret = make_iso_8601_datetime(&dts, result, *len, is_tz_aware, base);
-
- if (ret != 0) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert datetime value to string");
- PyObject_Free(result);
- return NULL;
- }
-
- // Note that get_datetime_iso_8601_strlen just gives a generic size
- // for ISO string conversion, not the actual size used
- *len = strlen(result);
- return result;
-}
-
-npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) {
- npy_datetimestruct dts;
- int ret;
-
- ret = convert_pydatetime_to_datetimestruct(dt, &dts);
- if (ret != 0) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert PyDateTime to numpy datetime");
- }
- // TODO(username): is setting errMsg required?
- // ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- // return NULL;
- }
-
- npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
- return NpyDateTimeToEpoch(npy_dt, base);
-}
-
-/* Converts the int64_t representation of a duration to ISO; mutates len */
-char *int64ToIsoDuration(int64_t value, size_t *len) {
- pandas_timedeltastruct tds;
- int ret_code;
-
- pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds);
-
- // Max theoretical length of ISO Duration with 64 bit day
- // as the largest unit is 70 characters + 1 for a null terminator
- char *result = PyObject_Malloc(71);
- if (result == NULL) {
- PyErr_NoMemory();
- return NULL;
- }
-
- ret_code = make_iso_8601_timedelta(&tds, result, len);
- if (ret_code == -1) {
- PyErr_SetString(PyExc_ValueError,
- "Could not convert timedelta value to string");
- PyObject_Free(result);
- return NULL;
- }
-
- return result;
-}
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.h b/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.h
deleted file mode 100644
index efd707f0419..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
-Copyright (c) 2020, PyData Development Team
-All rights reserved.
-Distributed under the terms of the BSD Simplified License.
-The full license is in the LICENSE file, distributed with this software.
-*/
-
-#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
-#define PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-#include <numpy/ndarraytypes.h>
-
-// Scales value inplace from nanosecond resolution to unit resolution
-int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit);
-
-// Converts an int64 object representing a date to ISO format
-// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
-// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
-// len is mutated to save the length of the returned string
-char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len);
-
-// TODO(username): this function doesn't do a lot; should augment or
-// replace with scaleNanosecToUnit
-npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base);
-
-// Converts a Python object representing a Date / Datetime to ISO format
-// up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z
-// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
-// len is mutated to save the length of the returned string
-char *PyDateTimeToIso(PyObject *obj, NPY_DATETIMEUNIT base, size_t *len);
-
-// Convert a Python Date/Datetime to Unix epoch with resolution base
-npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base);
-
-char *int64ToIsoDuration(int64_t value, size_t *len);
-
-#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_DATE_CONVERSIONS_H_
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/objToJSON.c b/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/objToJSON.c
deleted file mode 100644
index e892b505153..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/objToJSON.c
+++ /dev/null
@@ -1,2130 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-* Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-* Neither the name of the ESN Social Software AB nor the
-names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
-GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
-THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
-* Copyright (c) 1988-1993 The Regents of the University of California.
-* Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-#include <math.h>
-
-#define NO_IMPORT_ARRAY
-#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
-#include <numpy/arrayobject.h>
-#include <numpy/arrayscalars.h>
-#include <numpy/ndarraytypes.h>
-#include <numpy/npy_math.h>
-#include <ultrajson.h>
-#include "date_conversions.h"
-#include "datetime.h"
-
-npy_int64 get_nat(void) { return NPY_MIN_INT64; }
-
-typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti,
- size_t *_outLen);
-
-int object_is_decimal_type(PyObject *obj);
-int object_is_dataframe_type(PyObject *obj);
-int object_is_series_type(PyObject *obj);
-int object_is_index_type(PyObject *obj);
-int object_is_nat_type(PyObject *obj);
-int object_is_na_type(PyObject *obj);
-
-typedef struct __NpyArrContext {
- PyObject *array;
- char *dataptr;
- int curdim; // current dimension in array's order
- int stridedim; // dimension we are striding over
- int inc; // stride dimension increment (+/- 1)
- npy_intp dim;
- npy_intp stride;
- npy_intp ndim;
- npy_intp index[NPY_MAXDIMS];
- int type_num;
- PyArray_GetItemFunc *getitem;
-
- char **rowLabels;
- char **columnLabels;
-} NpyArrContext;
-
-typedef struct __PdBlockContext {
- int colIdx;
- int ncols;
- int transpose;
-
- NpyArrContext **npyCtxts; // NpyArrContext for each column
-} PdBlockContext;
-
-typedef struct __TypeContext {
- JSPFN_ITERBEGIN iterBegin;
- JSPFN_ITEREND iterEnd;
- JSPFN_ITERNEXT iterNext;
- JSPFN_ITERGETNAME iterGetName;
- JSPFN_ITERGETVALUE iterGetValue;
- PFN_PyTypeToUTF8 PyTypeToUTF8;
- PyObject *newObj;
- PyObject *dictObj;
- Py_ssize_t index;
- Py_ssize_t size;
- PyObject *itemValue;
- PyObject *itemName;
- PyObject *attrList;
- PyObject *iterator;
-
- double doubleValue;
- JSINT64 longValue;
-
- char *cStr;
- NpyArrContext *npyarr;
- PdBlockContext *pdblock;
- int transpose;
- char **rowLabels;
- char **columnLabels;
- npy_intp rowLabelsLen;
- npy_intp columnLabelsLen;
-} TypeContext;
-
-typedef struct __PyObjectEncoder {
- JSONObjectEncoder enc;
-
- // pass through the NpyArrContext when encoding multi-dimensional arrays
- NpyArrContext *npyCtxtPassthru;
-
- // pass through the PdBlockContext when encoding blocks
- PdBlockContext *blkCtxtPassthru;
-
- // pass-through to encode numpy data directly
- int npyType;
- void *npyValue;
-
- int datetimeIso;
- NPY_DATETIMEUNIT datetimeUnit;
-
- // output format style for pandas data types
- int outputFormat;
- int originalOutputFormat;
-
- PyObject *defaultHandler;
-} PyObjectEncoder;
-
-#define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv))
-
-enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES };
-
-int PdBlock_iterNext(JSOBJ, JSONTypeContext *);
-
-static TypeContext *createTypeContext(void) {
- TypeContext *pc;
-
- pc = PyObject_Malloc(sizeof(TypeContext));
- if (!pc) {
- PyErr_NoMemory();
- return NULL;
- }
- pc->newObj = NULL;
- pc->dictObj = NULL;
- pc->itemValue = NULL;
- pc->itemName = NULL;
- pc->attrList = NULL;
- pc->index = 0;
- pc->size = 0;
- pc->longValue = 0;
- pc->doubleValue = 0.0;
- pc->cStr = NULL;
- pc->npyarr = NULL;
- pc->pdblock = NULL;
- pc->rowLabels = NULL;
- pc->columnLabels = NULL;
- pc->transpose = 0;
- pc->rowLabelsLen = 0;
- pc->columnLabelsLen = 0;
-
- return pc;
-}
-
-static PyObject *get_values(PyObject *obj) {
- PyObject *values = NULL;
-
- if (object_is_index_type(obj) || object_is_series_type(obj)) {
- // The special cases to worry about are dt64tz and category[dt64tz].
- // In both cases we want the UTC-localized datetime64 ndarray,
- // without going through an object array of Timestamps.
- if (PyObject_HasAttrString(obj, "tz")) {
- PyObject *tz = PyObject_GetAttrString(obj, "tz");
- if (tz != Py_None) {
- // Go through object array if we have dt64tz, since tz info will
- // be lost if values is used directly.
- Py_DECREF(tz);
- values = PyObject_CallMethod(obj, "__array__", NULL);
- return values;
- }
- Py_DECREF(tz);
- }
- values = PyObject_GetAttrString(obj, "values");
- if (values == NULL) {
- // Clear so we can subsequently try another method
- PyErr_Clear();
- } else if (PyObject_HasAttrString(values, "__array__")) {
- // We may have gotten a Categorical or Sparse array so call np.array
- PyObject *array_values = PyObject_CallMethod(values, "__array__",
- NULL);
- Py_DECREF(values);
- values = array_values;
- } else if (!PyArray_CheckExact(values)) {
- // Didn't get a numpy array, so keep trying
- Py_DECREF(values);
- values = NULL;
- }
- }
-
- if (values == NULL) {
- PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj));
- PyObject *repr;
- if (PyObject_HasAttrString(obj, "dtype")) {
- PyObject *dtype = PyObject_GetAttrString(obj, "dtype");
- repr = PyObject_Repr(dtype);
- Py_DECREF(dtype);
- } else {
- repr = PyUnicode_FromString("<unknown dtype>");
- }
-
- PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet",
- repr, typeRepr);
- Py_DECREF(repr);
- Py_DECREF(typeRepr);
-
- return NULL;
- }
-
- return values;
-}
-
-static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) {
- PyObject *tmp = PyObject_GetAttrString(obj, attr);
- PyObject *ret;
-
- if (tmp == 0) {
- return 0;
- }
- ret = PyObject_GetAttrString(tmp, subAttr);
- Py_DECREF(tmp);
-
- return ret;
-}
-
-static Py_ssize_t get_attr_length(PyObject *obj, char *attr) {
- PyObject *tmp = PyObject_GetAttrString(obj, attr);
- Py_ssize_t ret;
-
- if (tmp == 0) {
- return 0;
- }
- ret = PyObject_Length(tmp);
- Py_DECREF(tmp);
-
- if (ret == -1) {
- return 0;
- }
-
- return ret;
-}
-
-static int is_simple_frame(PyObject *obj) {
- PyObject *mgr = PyObject_GetAttrString(obj, "_mgr");
- if (!mgr) {
- return 0;
- }
- int ret;
- if (PyObject_HasAttrString(mgr, "blocks")) {
- ret = (get_attr_length(mgr, "blocks") <= 1);
- } else {
- ret = 0;
- }
-
- Py_DECREF(mgr);
- return ret;
-}
-
-static npy_int64 get_long_attr(PyObject *o, const char *attr) {
- // NB we are implicitly assuming that o is a Timedelta or Timestamp, or NaT
-
- npy_int64 long_val;
- PyObject *value = PyObject_GetAttrString(o, attr);
- long_val =
- (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value));
-
- Py_DECREF(value);
-
- if (object_is_nat_type(o)) {
- // i.e. o is NaT, long_val will be NPY_MIN_INT64
- return long_val;
- }
-
- // ensure we are in nanoseconds, similar to Timestamp._as_creso or _as_unit
- PyObject* reso = PyObject_GetAttrString(o, "_creso");
- if (!PyLong_Check(reso)) {
- // https://github.com/pandas-dev/pandas/pull/49034#discussion_r1023165139
- Py_DECREF(reso);
- return -1;
- }
-
- long cReso = PyLong_AsLong(reso);
- Py_DECREF(reso);
- if (cReso == -1 && PyErr_Occurred()) {
- return -1;
- }
-
- if (cReso == NPY_FR_us) {
- long_val = long_val * 1000L;
- } else if (cReso == NPY_FR_ms) {
- long_val = long_val * 1000000L;
- } else if (cReso == NPY_FR_s) {
- long_val = long_val * 1000000000L;
- }
-
- return long_val;
-}
-
-static npy_float64 total_seconds(PyObject *td) {
- npy_float64 double_val;
- PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL);
- double_val = PyFloat_AS_DOUBLE(value);
- Py_DECREF(value);
- return double_val;
-}
-
-static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
- size_t *_outLen) {
- PyObject *obj = (PyObject *)_obj;
- *_outLen = PyBytes_GET_SIZE(obj);
- return PyBytes_AS_STRING(obj);
-}
-
-static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc,
- size_t *_outLen) {
- char *encoded = (char *)PyUnicode_AsUTF8AndSize(_obj,
- (Py_ssize_t *)_outLen);
- if (encoded == NULL) {
- /* Something went wrong.
- Set errorMsg (to tell the encoder to stop)
- and let the Python exception propagate. */
- JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
- enc->errorMsg = "Encoding failed.";
- }
- return encoded;
-}
-
-/* JSON callback. returns a char* and mutates the pointer to *len */
-static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
- JSONTypeContext *tc, size_t *len) {
- NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, base, len);
- return GET_TC(tc)->cStr;
-}
-
-/* JSON callback. returns a char* and mutates the pointer to *len */
-static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
- JSONTypeContext *tc, size_t *len) {
- GET_TC(tc)->cStr = int64ToIsoDuration(GET_TC(tc)->longValue, len);
- return GET_TC(tc)->cStr;
-}
-
-/* JSON callback */
-static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
- size_t *len) {
- if (!PyDate_Check(obj)) {
- PyErr_SetString(PyExc_TypeError, "Expected date object");
- return NULL;
- }
-
- NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- return PyDateTimeToIso(obj, base, len);
-}
-
-static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) {
- PyObject *obj = (PyObject *)_obj;
- PyObject *str;
- PyObject *tmp;
-
- str = PyObject_CallMethod(obj, "isoformat", NULL);
- if (str == NULL) {
- *outLen = 0;
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError, "Failed to convert time");
- }
- ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- return NULL;
- }
- if (PyUnicode_Check(str)) {
- tmp = str;
- str = PyUnicode_AsUTF8String(str);
- Py_DECREF(tmp);
- }
-
- GET_TC(tc)->newObj = str;
-
- *outLen = PyBytes_GET_SIZE(str);
- char *outValue = PyBytes_AS_STRING(str);
- return outValue;
-}
-
-//=============================================================================
-// Numpy array iteration functions
-//=============================================================================
-
-static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->npyarr &&
- GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) {
- Py_XDECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
-}
-
-int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), JSONTypeContext *Py_UNUSED(tc)) {
- return 0;
-}
-
-void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
- PyArrayObject *obj;
- NpyArrContext *npyarr;
-
- if (GET_TC(tc)->newObj) {
- obj = (PyArrayObject *)GET_TC(tc)->newObj;
- } else {
- obj = (PyArrayObject *)_obj;
- }
-
- npyarr = PyObject_Malloc(sizeof(NpyArrContext));
- GET_TC(tc)->npyarr = npyarr;
-
- if (!npyarr) {
- PyErr_NoMemory();
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
-
- npyarr->array = (PyObject *)obj;
- npyarr->getitem = (PyArray_GetItemFunc *)PyArray_DESCR(obj)->f->getitem;
- npyarr->dataptr = PyArray_DATA(obj);
- npyarr->ndim = PyArray_NDIM(obj) - 1;
- npyarr->curdim = 0;
- npyarr->type_num = PyArray_DESCR(obj)->type_num;
-
- if (GET_TC(tc)->transpose) {
- npyarr->dim = PyArray_DIM(obj, npyarr->ndim);
- npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim);
- npyarr->stridedim = npyarr->ndim;
- npyarr->index[npyarr->ndim] = 0;
- npyarr->inc = -1;
- } else {
- npyarr->dim = PyArray_DIM(obj, 0);
- npyarr->stride = PyArray_STRIDE(obj, 0);
- npyarr->stridedim = 0;
- npyarr->index[0] = 0;
- npyarr->inc = 1;
- }
-
- npyarr->columnLabels = GET_TC(tc)->columnLabels;
- npyarr->rowLabels = GET_TC(tc)->rowLabels;
-}
-
-void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
-
- if (npyarr) {
- NpyArr_freeItemValue(obj, tc);
- PyObject_Free(npyarr);
- }
-}
-
-void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj),
- JSONTypeContext *Py_UNUSED(tc)) {}
-
-void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
- // finished this dimension, reset the data pointer
- npyarr->curdim--;
- npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim];
- npyarr->stridedim -= npyarr->inc;
- npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
- npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
- npyarr->dataptr += npyarr->stride;
-
- NpyArr_freeItemValue(obj, tc);
-}
-
-int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
-
- if (PyErr_Occurred()) {
- return 0;
- }
-
- if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
- return 0;
- }
-
- NpyArr_freeItemValue(obj, tc);
-
- if (PyArray_ISDATETIME(npyarr->array)) {
- GET_TC(tc)->itemValue = obj;
- Py_INCREF(obj);
- ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array);
- ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
- ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
- } else {
- GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array);
- }
-
- npyarr->dataptr += npyarr->stride;
- npyarr->index[npyarr->stridedim]++;
- return 1;
-}
-
-int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
-
- if (PyErr_Occurred()) {
- return 0;
- }
-
- if (npyarr->curdim >= npyarr->ndim ||
- npyarr->index[npyarr->stridedim] >= npyarr->dim) {
- // innermost dimension, start retrieving item values
- GET_TC(tc)->iterNext = NpyArr_iterNextItem;
- return NpyArr_iterNextItem(_obj, tc);
- }
-
- // dig a dimension deeper
- npyarr->index[npyarr->stridedim]++;
-
- npyarr->curdim++;
- npyarr->stridedim += npyarr->inc;
- npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim);
- npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim);
- npyarr->index[npyarr->stridedim] = 0;
-
- ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
- GET_TC(tc)->itemValue = npyarr->array;
- return 1;
-}
-
-JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- NpyArrContext *npyarr = GET_TC(tc)->npyarr;
- npy_intp idx;
- char *cStr;
-
- if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
- idx = npyarr->index[npyarr->stridedim] - 1;
- cStr = npyarr->columnLabels[idx];
- } else {
- idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1;
- cStr = npyarr->rowLabels[idx];
- }
-
- *outLen = strlen(cStr);
-
- return cStr;
-}
-
-//=============================================================================
-// Pandas block iteration functions
-//
-// Serialises a DataFrame column by column to avoid unnecessary data copies and
-// to give a more representative serialisation when dealing with mixed dtypes.
-//
-// Uses a dedicated NpyArrContext for each column.
-//=============================================================================
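
The comment above is the key design note for the PdBlock iterators: pulling a single 2-D `values` array out of a mixed-dtype frame upcasts every column to a common dtype, whereas walking column by column keeps each column's dtype (and therefore its JSON representation) intact. A minimal Python sketch of the difference, assuming pandas and NumPy are available:

    import pandas as pd

    # Illustrative mixed-dtype frame (names are arbitrary).
    df = pd.DataFrame({"ints": [1, 2], "text": ["a", "b"]})

    # One 2-D extraction upcasts everything to object dtype...
    print(df.values.dtype)                          # object

    # ...while per-column access preserves the original dtypes.
    print([str(df[c].dtype) for c in df.columns])   # ['int64', 'object']
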
-
-void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
-
- if (blkCtxt->transpose) {
- blkCtxt->colIdx++;
- } else {
- blkCtxt->colIdx = 0;
- }
-
- NpyArr_freeItemValue(obj, tc);
-}
-
-int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
-
- if (blkCtxt->colIdx >= blkCtxt->ncols) {
- return 0;
- }
-
- GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
- blkCtxt->colIdx++;
- return NpyArr_iterNextItem(obj, tc);
-}
-
-char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- NpyArrContext *npyarr = blkCtxt->npyCtxts[0];
- npy_intp idx;
- char *cStr;
-
- if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) {
- idx = blkCtxt->colIdx - 1;
- cStr = npyarr->columnLabels[idx];
- } else {
- idx = GET_TC(tc)->iterNext != PdBlock_iterNext
- ? npyarr->index[npyarr->stridedim - npyarr->inc] - 1
- : npyarr->index[npyarr->stridedim];
-
- cStr = npyarr->rowLabels[idx];
- }
-
- *outLen = strlen(cStr);
- return cStr;
-}
-
-char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
- npy_intp idx;
- char *cStr;
-
- if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
- idx = npyarr->index[npyarr->stridedim] - 1;
- cStr = npyarr->columnLabels[idx];
- } else {
- idx = blkCtxt->colIdx;
- cStr = npyarr->rowLabels[idx];
- }
-
- *outLen = strlen(cStr);
- return cStr;
-}
-
-int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
- NpyArrContext *npyarr;
-
- if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
- return 0;
- }
-
- if (blkCtxt->transpose) {
- if (blkCtxt->colIdx >= blkCtxt->ncols) {
- return 0;
- }
- } else {
- npyarr = blkCtxt->npyCtxts[0];
- if (npyarr->index[npyarr->stridedim] >= npyarr->dim) {
- return 0;
- }
- }
-
- ((PyObjectEncoder *)tc->encoder)->blkCtxtPassthru = blkCtxt;
- GET_TC(tc)->itemValue = obj;
-
- return 1;
-}
-
-void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
-
- if (blkCtxt->transpose) {
- // if transposed we exhaust each column before moving to the next
- GET_TC(tc)->iterNext = NpyArr_iterNextItem;
- GET_TC(tc)->iterGetName = PdBlock_iterGetName_Transpose;
- GET_TC(tc)->npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
- }
-}
-
-void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
- PyObject *obj, *values, *arrays, *array;
- PdBlockContext *blkCtxt;
- NpyArrContext *npyarr;
- Py_ssize_t i;
-
- obj = (PyObject *)_obj;
-
- GET_TC(tc)->iterGetName = GET_TC(tc)->transpose
- ? PdBlock_iterGetName_Transpose
- : PdBlock_iterGetName;
-
- blkCtxt = PyObject_Malloc(sizeof(PdBlockContext));
- if (!blkCtxt) {
- PyErr_NoMemory();
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
- GET_TC(tc)->pdblock = blkCtxt;
-
- blkCtxt->colIdx = 0;
- blkCtxt->transpose = GET_TC(tc)->transpose;
- blkCtxt->ncols = get_attr_length(obj, "columns");
-
- if (blkCtxt->ncols == 0) {
- blkCtxt->npyCtxts = NULL;
-
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
-
- blkCtxt->npyCtxts =
- PyObject_Malloc(sizeof(NpyArrContext *) * blkCtxt->ncols);
- if (!blkCtxt->npyCtxts) {
- PyErr_NoMemory();
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
-
- arrays = get_sub_attr(obj, "_mgr", "column_arrays");
- if (!arrays) {
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- return;
- }
-
- for (i = 0; i < PyObject_Length(arrays); i++) {
- array = PyList_GET_ITEM(arrays, i);
- if (!array) {
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- goto ARR_RET;
- }
-
- // ensure we have a numpy array (i.e. np.asarray)
- values = PyObject_CallMethod(array, "__array__", NULL);
- if ((!values) || (!PyArray_CheckExact(values))) {
- // Didn't get a numpy array
- ((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
- GET_TC(tc)->iterNext = NpyArr_iterNextNone;
- goto ARR_RET;
- }
-
- GET_TC(tc)->newObj = values;
-
- // init a dedicated context for this column
- NpyArr_iterBegin(obj, tc);
- npyarr = GET_TC(tc)->npyarr;
-
- GET_TC(tc)->itemValue = NULL;
- ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = NULL;
-
- blkCtxt->npyCtxts[i] = npyarr;
- GET_TC(tc)->newObj = NULL;
- }
- GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0];
- goto ARR_RET;
-
-ARR_RET:
- Py_DECREF(arrays);
-}
-
-void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- PdBlockContext *blkCtxt;
- NpyArrContext *npyarr;
- int i;
-
- GET_TC(tc)->itemValue = NULL;
- npyarr = GET_TC(tc)->npyarr;
-
- blkCtxt = GET_TC(tc)->pdblock;
-
- if (blkCtxt) {
- for (i = 0; i < blkCtxt->ncols; i++) {
- npyarr = blkCtxt->npyCtxts[i];
- if (npyarr) {
- if (npyarr->array) {
- Py_DECREF(npyarr->array);
- npyarr->array = NULL;
- }
-
- GET_TC(tc)->npyarr = npyarr;
- NpyArr_iterEnd(obj, tc);
-
- blkCtxt->npyCtxts[i] = NULL;
- }
- }
-
- if (blkCtxt->npyCtxts) {
- PyObject_Free(blkCtxt->npyCtxts);
- }
- PyObject_Free(blkCtxt);
- }
-}
-
-//=============================================================================
-// Tuple iteration functions
-// itemValue is borrowed reference, no ref counting
-//=============================================================================
-void Tuple_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
- GET_TC(tc)->size = PyTuple_GET_SIZE((PyObject *)obj);
- GET_TC(tc)->itemValue = NULL;
-}
-
-int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- PyObject *item;
-
- if (GET_TC(tc)->index >= GET_TC(tc)->size) {
- return 0;
- }
-
- item = PyTuple_GET_ITEM(obj, GET_TC(tc)->index);
-
- GET_TC(tc)->itemValue = item;
- GET_TC(tc)->index++;
- return 1;
-}
-
-void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {}
-
-JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
- size_t *Py_UNUSED(outLen)) {
- return NULL;
-}
-
-//=============================================================================
-// Set iteration functions
-// itemValue is borrowed reference, no ref counting
-//=============================================================================
-void Set_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->itemValue = NULL;
- GET_TC(tc)->iterator = PyObject_GetIter(obj);
-}
-
-int Set_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObject *item;
-
- if (GET_TC(tc)->itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
-
- item = PyIter_Next(GET_TC(tc)->iterator);
-
- if (item == NULL) {
- return 0;
- }
-
- GET_TC(tc)->itemValue = item;
- return 1;
-}
-
-void Set_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
-
- if (GET_TC(tc)->iterator) {
- Py_DECREF(GET_TC(tc)->iterator);
- GET_TC(tc)->iterator = NULL;
- }
-}
-
-JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
- size_t *Py_UNUSED(outLen)) {
- return NULL;
-}
-
-//=============================================================================
-// Dir iteration functions
-// itemName ref is borrowed from PyObject_Dir (attrList). No refcount
-// itemValue ref is from PyObject_GetAttr. Ref counted
-//=============================================================================
-void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->attrList = PyObject_Dir(obj);
- GET_TC(tc)->index = 0;
- GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList);
-}
-
-void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = NULL;
- }
-
- if (GET_TC(tc)->itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = NULL;
- }
-
- Py_DECREF((PyObject *)GET_TC(tc)->attrList);
-}
-
-int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) {
- PyObject *obj = (PyObject *)_obj;
- PyObject *itemValue = GET_TC(tc)->itemValue;
- PyObject *itemName = GET_TC(tc)->itemName;
- PyObject *attr;
- PyObject *attrName;
- char *attrStr;
-
- if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) {
- return 0;
- }
-
- if (itemValue) {
- Py_DECREF(GET_TC(tc)->itemValue);
- GET_TC(tc)->itemValue = itemValue = NULL;
- }
-
- if (itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = itemName = NULL;
- }
-
- for (; GET_TC(tc)->index < GET_TC(tc)->size; GET_TC(tc)->index++) {
- attrName = PyList_GET_ITEM(GET_TC(tc)->attrList, GET_TC(tc)->index);
- attr = PyUnicode_AsUTF8String(attrName);
- attrStr = PyBytes_AS_STRING(attr);
-
- if (attrStr[0] == '_') {
- Py_DECREF(attr);
- continue;
- }
-
- itemValue = PyObject_GetAttr(obj, attrName);
- if (itemValue == NULL) {
- PyErr_Clear();
- Py_DECREF(attr);
- continue;
- }
-
- if (PyCallable_Check(itemValue)) {
- Py_DECREF(itemValue);
- Py_DECREF(attr);
- continue;
- }
-
- GET_TC(tc)->itemName = itemName;
- GET_TC(tc)->itemValue = itemValue;
-
- itemName = attr;
- break;
- }
-
- if (itemName == NULL) {
- GET_TC(tc)->index = GET_TC(tc)->size;
- GET_TC(tc)->itemValue = NULL;
- return 0;
- }
-
- GET_TC(tc)->itemName = itemName;
- GET_TC(tc)->itemValue = itemValue;
- GET_TC(tc)->index++;
-
- return 1;
-}
-
-JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName);
- return PyBytes_AS_STRING(GET_TC(tc)->itemName);
-}
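
The Dir_* functions above implement the last-resort path for arbitrary Python objects: every name returned by dir() that does not start with an underscore and is not callable becomes a key, with the attribute value as the payload. A rough sketch of the observable behaviour, assuming the built extension is importable as pandas._libs.json (the import path is an assumption):

    from pandas._libs import json as ujson  # assumed import path

    class Point:
        def __init__(self):
            self.x = 1
            self.y = 2

        def shift(self):   # callable attribute: skipped by Dir_iterNext
            self.x += 1

    # Only public, non-callable attributes survive; order follows dir().
    print(ujson.dumps(Point()))   # e.g. {"x":1,"y":2}
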
-
-//=============================================================================
-// List iteration functions
-// itemValue is borrowed from object (which is list). No refcounting
-//=============================================================================
-void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
- GET_TC(tc)->size = PyList_GET_SIZE((PyObject *)obj);
-}
-
-int List_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- if (GET_TC(tc)->index >= GET_TC(tc)->size) {
- return 0;
- }
-
- GET_TC(tc)->itemValue = PyList_GET_ITEM(obj, GET_TC(tc)->index);
- GET_TC(tc)->index++;
- return 1;
-}
-
-void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {}
-
-JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *List_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc),
- size_t *Py_UNUSED(outLen)) {
- return NULL;
-}
-
-//=============================================================================
-// pandas Index iteration functions
-//=============================================================================
-void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
- GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
- if (!GET_TC(tc)->cStr) {
- PyErr_NoMemory();
- }
-}
-
-int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- Py_ssize_t index;
- if (!GET_TC(tc)->cStr) {
- return 0;
- }
-
- index = GET_TC(tc)->index;
- Py_XDECREF(GET_TC(tc)->itemValue);
- if (index == 0) {
- memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
- } else if (index == 1) {
- memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
- GET_TC(tc)->itemValue = get_values(obj);
- if (!GET_TC(tc)->itemValue) {
- return 0;
- }
- } else {
- return 0;
- }
-
- GET_TC(tc)->index++;
- return 1;
-}
-
-void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {}
-
-JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = strlen(GET_TC(tc)->cStr);
- return GET_TC(tc)->cStr;
-}
-
-//=============================================================================
-// pandas Series iteration functions
-//=============================================================================
-void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- GET_TC(tc)->index = 0;
- GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
- enc->outputFormat = VALUES; // for contained series
- if (!GET_TC(tc)->cStr) {
- PyErr_NoMemory();
- }
-}
-
-int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- Py_ssize_t index;
- if (!GET_TC(tc)->cStr) {
- return 0;
- }
-
- index = GET_TC(tc)->index;
- Py_XDECREF(GET_TC(tc)->itemValue);
- if (index == 0) {
- memcpy(GET_TC(tc)->cStr, "name", sizeof(char) * 5);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name");
- } else if (index == 1) {
- memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
- } else if (index == 2) {
- memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
- GET_TC(tc)->itemValue = get_values(obj);
- if (!GET_TC(tc)->itemValue) {
- return 0;
- }
- } else {
- return 0;
- }
-
- GET_TC(tc)->index++;
- return 1;
-}
-
-void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- enc->outputFormat = enc->originalOutputFormat;
-}
-
-JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = strlen(GET_TC(tc)->cStr);
- return GET_TC(tc)->cStr;
-}
-
-//=============================================================================
-// pandas DataFrame iteration functions
-//=============================================================================
-void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- GET_TC(tc)->index = 0;
- GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char));
- enc->outputFormat = VALUES; // for contained series & index
- if (!GET_TC(tc)->cStr) {
- PyErr_NoMemory();
- }
-}
-
-int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- Py_ssize_t index;
- if (!GET_TC(tc)->cStr) {
- return 0;
- }
-
- index = GET_TC(tc)->index;
- Py_XDECREF(GET_TC(tc)->itemValue);
- if (index == 0) {
- memcpy(GET_TC(tc)->cStr, "columns", sizeof(char) * 8);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns");
- } else if (index == 1) {
- memcpy(GET_TC(tc)->cStr, "index", sizeof(char) * 6);
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index");
- } else if (index == 2) {
- memcpy(GET_TC(tc)->cStr, "data", sizeof(char) * 5);
- if (is_simple_frame(obj)) {
- GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values");
- if (!GET_TC(tc)->itemValue) {
- return 0;
- }
- } else {
- Py_INCREF(obj);
- GET_TC(tc)->itemValue = obj;
- }
- } else {
- return 0;
- }
-
- GET_TC(tc)->index++;
- return 1;
-}
-
-void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder;
- enc->outputFormat = enc->originalOutputFormat;
-}
-
-JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = strlen(GET_TC(tc)->cStr);
- return GET_TC(tc)->cStr;
-}
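
Taken together, the Index_*, Series_* and DataFrame_* iterators above define the "split" orient: an Index is emitted as {name, data}, a Series as {name, index, data} and a DataFrame as {columns, index, data}, in exactly the member order hard-coded in the iterNext functions. A short illustration, assuming pandas is available:

    import pandas as pd

    s = pd.Series([10, 20], name="s")
    print(s.to_json(orient="split"))
    # {"name":"s","index":[0,1],"data":[10,20]}

    df = pd.DataFrame({"a": [1, 2]})
    print(df.to_json(orient="split"))
    # {"columns":["a"],"index":[0,1],"data":[[1],[2]]}
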
-
-//=============================================================================
-// Dict iteration functions
-// itemName might be converted to a string (Python_Str). Do refCounting
-// itemValue is borrowed from object (which is dict). No refCounting
-//=============================================================================
-void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- GET_TC(tc)->index = 0;
-}
-
-int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- PyObject *itemNameTmp;
-
- if (GET_TC(tc)->itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = NULL;
- }
-
- if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index,
- &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) {
- return 0;
- }
-
- if (PyUnicode_Check(GET_TC(tc)->itemName)) {
- GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
- } else if (!PyBytes_Check(GET_TC(tc)->itemName)) {
- GET_TC(tc)->itemName = PyObject_Str(GET_TC(tc)->itemName);
- itemNameTmp = GET_TC(tc)->itemName;
- GET_TC(tc)->itemName = PyUnicode_AsUTF8String(GET_TC(tc)->itemName);
- Py_DECREF(itemNameTmp);
- } else {
- Py_INCREF(GET_TC(tc)->itemName);
- }
- return 1;
-}
-
-void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (GET_TC(tc)->itemName) {
- Py_DECREF(GET_TC(tc)->itemName);
- GET_TC(tc)->itemName = NULL;
- }
- Py_DECREF(GET_TC(tc)->dictObj);
-}
-
-JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->itemValue;
-}
-
-char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
- size_t *outLen) {
- *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName);
- return PyBytes_AS_STRING(GET_TC(tc)->itemName);
-}
-
-void NpyArr_freeLabels(char **labels, npy_intp len) {
- npy_intp i;
-
- if (labels) {
- for (i = 0; i < len; i++) {
- PyObject_Free(labels[i]);
- }
- PyObject_Free(labels);
- }
-}
-
-/*
- * Function: NpyArr_encodeLabels
- * -----------------------------
- *
- * Builds an array of "encoded" labels.
- *
- * labels: PyArrayObject pointer for labels to be "encoded"
- * num : number of labels
- *
- * "encode" is quoted above because we aren't really doing encoding
- * For historical reasons this function would actually encode the entire
- * array into a separate buffer with a separate call to JSON_Encode
- * and would leave it to complex pointer manipulation from there to
- * unpack values as needed. To make things simpler and more idiomatic
- * this has instead just stringified any input save for datetime values,
- * which may need to be represented in various formats.
- */
-char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
- npy_intp num) {
- // NOTE this function steals a reference to labels.
- PyObject *item = NULL;
- size_t len;
- npy_intp i, stride;
- char **ret;
- char *dataptr, *cLabel;
- int type_num;
- NPY_DATETIMEUNIT base = enc->datetimeUnit;
-
- if (!labels) {
- return 0;
- }
-
- if (PyArray_SIZE(labels) < num) {
- PyErr_SetString(
- PyExc_ValueError,
- "Label array sizes do not match corresponding data shape");
- Py_DECREF(labels);
- return 0;
- }
-
- ret = PyObject_Malloc(sizeof(char *) * num);
- if (!ret) {
- PyErr_NoMemory();
- Py_DECREF(labels);
- return 0;
- }
-
- for (i = 0; i < num; i++) {
- ret[i] = NULL;
- }
-
- stride = PyArray_STRIDE(labels, 0);
- dataptr = PyArray_DATA(labels);
- type_num = PyArray_TYPE(labels);
-
- for (i = 0; i < num; i++) {
- item = PyArray_GETITEM(labels, dataptr);
- if (!item) {
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
-
- int is_datetimelike = 0;
- npy_int64 nanosecVal;
- if (PyTypeNum_ISDATETIME(type_num)) {
- is_datetimelike = 1;
- PyArray_VectorUnaryFunc *castfunc =
- PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64);
- if (!castfunc) {
- PyErr_Format(PyExc_ValueError,
- "Cannot cast numpy dtype %d to long",
- enc->npyType);
- }
- castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
- } else if (PyDate_Check(item) || PyDelta_Check(item)) {
- is_datetimelike = 1;
- if (PyObject_HasAttrString(item, "_value")) {
- // see test_date_index_and_values for case with non-nano
- nanosecVal = get_long_attr(item, "_value");
- } else {
- if (PyDelta_Check(item)) {
- nanosecVal = total_seconds(item) *
- 1000000000LL; // nanoseconds per second
- } else {
- // datetime.* objects don't follow above rules
- nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
- }
- }
- }
-
- if (is_datetimelike) {
- if (nanosecVal == get_nat()) {
- len = 4;
- cLabel = PyObject_Malloc(len + 1);
- strncpy(cLabel, "null", len + 1);
- } else {
- if (enc->datetimeIso) {
- if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
- cLabel = int64ToIsoDuration(nanosecVal, &len);
- } else {
- if (type_num == NPY_DATETIME) {
- cLabel = int64ToIso(nanosecVal, base, &len);
- } else {
- cLabel = PyDateTimeToIso(item, base, &len);
- }
- }
- if (cLabel == NULL) {
- Py_DECREF(item);
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
- } else {
- int size_of_cLabel = 21; // 21 chars for int 64
- cLabel = PyObject_Malloc(size_of_cLabel);
- snprintf(cLabel, size_of_cLabel, "%" NPY_DATETIME_FMT,
- NpyDateTimeToEpoch(nanosecVal, base));
- len = strlen(cLabel);
- }
- }
- } else { // Fallback to string representation
- // Replace item with the string to keep it alive.
- Py_SETREF(item, PyObject_Str(item));
- if (item == NULL) {
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
-
- cLabel = (char *)PyUnicode_AsUTF8(item);
- len = strlen(cLabel);
- }
-
- // Add 1 to include NULL terminator
- ret[i] = PyObject_Malloc(len + 1);
- memcpy(ret[i], cLabel, len + 1);
- Py_DECREF(item);
-
- if (is_datetimelike) {
- PyObject_Free(cLabel);
- }
-
- if (PyErr_Occurred()) {
- NpyArr_freeLabels(ret, num);
- ret = 0;
- break;
- }
-
- if (!ret[i]) {
- PyErr_NoMemory();
- ret = 0;
- break;
- }
-
- dataptr += stride;
- }
-
- Py_DECREF(labels);
- return ret;
-}
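
As the docstring above notes, label "encoding" is mostly plain stringification; the one special case is datetime-like labels, which become either epoch integers in the requested unit or ISO-8601 strings when ISO dates are requested. A small sketch of the visible effect, assuming pandas is available:

    import pandas as pd

    s = pd.Series([1], index=pd.to_datetime(["2020-01-01"]))

    # Datetime labels as epoch values in the requested unit...
    print(s.to_json(orient="index", date_unit="s"))   # {"1577836800":1}

    # ...or as ISO-8601 strings when date_format="iso" is passed.
    print(s.to_json(orient="index", date_format="iso"))
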
-
-void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) {
- PyObject *tmpObj = NULL;
- tmpObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL);
- if (!PyErr_Occurred()) {
- if (tmpObj == NULL) {
- PyErr_SetString(PyExc_TypeError,
- "Failed to execute default handler");
- } else {
- encode(tmpObj, (JSONObjectEncoder *)enc, NULL, 0);
- }
- }
- Py_XDECREF(tmpObj);
- return;
-}
-
-void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
- PyObject *obj, *exc, *toDictFunc, *tmpObj, *values;
- TypeContext *pc;
- PyObjectEncoder *enc;
- double val;
- npy_int64 value;
- int unit;
-
- tc->prv = NULL;
-
- if (!_obj) {
- tc->type = JT_INVALID;
- return;
- }
-
- obj = (PyObject *)_obj;
- enc = (PyObjectEncoder *)tc->encoder;
-
- if (PyBool_Check(obj)) {
- tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE;
- return;
- } else if (obj == Py_None) {
- tc->type = JT_NULL;
- return;
- }
-
- pc = createTypeContext();
- if (!pc) {
- tc->type = JT_INVALID;
- return;
- }
- tc->prv = pc;
-
- if (PyTypeNum_ISDATETIME(enc->npyType)) {
- int64_t longVal;
- PyArray_VectorUnaryFunc *castfunc =
- PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64);
- if (!castfunc) {
- PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long",
- enc->npyType);
- }
- castfunc(enc->npyValue, &longVal, 1, NULL, NULL);
- if (longVal == get_nat()) {
- tc->type = JT_NULL;
- } else {
- if (enc->datetimeIso) {
- if (enc->npyType == NPY_TIMEDELTA) {
- pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
- } else {
- pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
- }
- // Currently no way to pass longVal to iso function, so use
- // state management
- GET_TC(tc)->longValue = longVal;
- tc->type = JT_UTF8;
- } else {
- NPY_DATETIMEUNIT base =
- ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base);
- tc->type = JT_LONG;
- }
- }
-
- // TODO(username): this prevents infinite loop with
- // mixed-type DataFrames;
- // refactor
- enc->npyCtxtPassthru = NULL;
- enc->npyType = -1;
- return;
- }
-
- if (PyIter_Check(obj) ||
- (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) {
- goto ISITERABLE;
- }
-
- if (PyLong_Check(obj)) {
- tc->type = JT_LONG;
- int overflow = 0;
- GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
- int err;
- err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred();
-
- if (overflow) {
- tc->type = JT_BIGNUM;
- } else if (err) {
- goto INVALID;
- }
-
- return;
- } else if (PyFloat_Check(obj)) {
- val = PyFloat_AS_DOUBLE(obj);
- if (npy_isnan(val) || npy_isinf(val)) {
- tc->type = JT_NULL;
- } else {
- GET_TC(tc)->doubleValue = val;
- tc->type = JT_DOUBLE;
- }
- return;
- } else if (PyBytes_Check(obj)) {
- pc->PyTypeToUTF8 = PyBytesToUTF8;
- tc->type = JT_UTF8;
- return;
- } else if (PyUnicode_Check(obj)) {
- pc->PyTypeToUTF8 = PyUnicodeToUTF8;
- tc->type = JT_UTF8;
- return;
- } else if (object_is_decimal_type(obj)) {
- GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj);
- tc->type = JT_DOUBLE;
- return;
- } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) {
- if (object_is_nat_type(obj)) {
- tc->type = JT_NULL;
- return;
- }
-
- if (enc->datetimeIso) {
- pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
- tc->type = JT_UTF8;
- } else {
- NPY_DATETIMEUNIT base =
- ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
- tc->type = JT_LONG;
- }
- return;
- } else if (PyTime_Check(obj)) {
- pc->PyTypeToUTF8 = PyTimeToJSON;
- tc->type = JT_UTF8;
- return;
- } else if (PyArray_IsScalar(obj, Datetime)) {
- if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) {
- tc->type = JT_NULL;
- return;
- }
-
- if (enc->datetimeIso) {
- pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
- tc->type = JT_UTF8;
- } else {
- NPY_DATETIMEUNIT base =
- ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base);
- tc->type = JT_LONG;
- }
- return;
- } else if (PyDelta_Check(obj)) {
- if (PyObject_HasAttrString(obj, "_value")) {
- value = get_long_attr(obj, "_value");
- } else {
- value = total_seconds(obj) * 1000000000LL; // nanoseconds per sec
- }
-
- if (value == get_nat()) {
- tc->type = JT_NULL;
- return;
- } else if (enc->datetimeIso) {
- pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
- tc->type = JT_UTF8;
- } else {
- unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
- if (scaleNanosecToUnit(&value, unit) != 0) {
- // TODO(username): Add some kind of error handling here
- }
-
- exc = PyErr_Occurred();
-
- if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
- goto INVALID;
- }
-
- tc->type = JT_LONG;
- }
- GET_TC(tc)->longValue = value;
- return;
- } else if (PyArray_IsScalar(obj, Integer)) {
- tc->type = JT_LONG;
- PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
- PyArray_DescrFromType(NPY_INT64));
-
- exc = PyErr_Occurred();
-
- if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
- goto INVALID;
- }
-
- return;
- } else if (PyArray_IsScalar(obj, Bool)) {
- PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue),
- PyArray_DescrFromType(NPY_BOOL));
- tc->type = (GET_TC(tc)->longValue) ? JT_TRUE : JT_FALSE;
- return;
- } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) {
- PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->doubleValue),
- PyArray_DescrFromType(NPY_DOUBLE));
- tc->type = JT_DOUBLE;
- return;
- } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) {
- PyErr_Format(PyExc_TypeError,
- "%R (0d array) is not JSON serializable at the moment",
- obj);
- goto INVALID;
- } else if (object_is_na_type(obj)) {
- tc->type = JT_NULL;
- return;
- }
-
-ISITERABLE:
-
- if (object_is_index_type(obj)) {
- if (enc->outputFormat == SPLIT) {
- tc->type = JT_OBJECT;
- pc->iterBegin = Index_iterBegin;
- pc->iterEnd = Index_iterEnd;
- pc->iterNext = Index_iterNext;
- pc->iterGetValue = Index_iterGetValue;
- pc->iterGetName = Index_iterGetName;
- return;
- }
-
- pc->newObj = get_values(obj);
- if (pc->newObj) {
- tc->type = JT_ARRAY;
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
- } else {
- goto INVALID;
- }
-
- return;
- } else if (object_is_series_type(obj)) {
- if (enc->outputFormat == SPLIT) {
- tc->type = JT_OBJECT;
- pc->iterBegin = Series_iterBegin;
- pc->iterEnd = Series_iterEnd;
- pc->iterNext = Series_iterNext;
- pc->iterGetValue = Series_iterGetValue;
- pc->iterGetName = Series_iterGetName;
- return;
- }
-
- pc->newObj = get_values(obj);
- if (!pc->newObj) {
- goto INVALID;
- }
-
- if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
- tc->type = JT_OBJECT;
- tmpObj = PyObject_GetAttrString(obj, "index");
- if (!tmpObj) {
- goto INVALID;
- }
- values = get_values(tmpObj);
- Py_DECREF(tmpObj);
- if (!values) {
- goto INVALID;
- }
- pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0);
- pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->columnLabelsLen);
- if (!pc->columnLabels) {
- goto INVALID;
- }
- } else {
- tc->type = JT_ARRAY;
- }
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
- return;
- } else if (PyArray_Check(obj)) {
- if (enc->npyCtxtPassthru) {
- pc->npyarr = enc->npyCtxtPassthru;
- tc->type = (pc->npyarr->columnLabels ? JT_OBJECT : JT_ARRAY);
-
- pc->iterBegin = NpyArrPassThru_iterBegin;
- pc->iterNext = NpyArr_iterNext;
- pc->iterEnd = NpyArrPassThru_iterEnd;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
-
- enc->npyCtxtPassthru = NULL;
- return;
- }
-
- tc->type = JT_ARRAY;
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetValue = NpyArr_iterGetValue;
- pc->iterGetName = NpyArr_iterGetName;
- return;
- } else if (object_is_dataframe_type(obj)) {
- if (enc->blkCtxtPassthru) {
- pc->pdblock = enc->blkCtxtPassthru;
- tc->type =
- (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY);
-
- pc->iterBegin = PdBlockPassThru_iterBegin;
- pc->iterEnd = PdBlockPassThru_iterEnd;
- pc->iterNext = PdBlock_iterNextItem;
- pc->iterGetName = PdBlock_iterGetName;
- pc->iterGetValue = NpyArr_iterGetValue;
-
- enc->blkCtxtPassthru = NULL;
- return;
- }
-
- if (enc->outputFormat == SPLIT) {
- tc->type = JT_OBJECT;
- pc->iterBegin = DataFrame_iterBegin;
- pc->iterEnd = DataFrame_iterEnd;
- pc->iterNext = DataFrame_iterNext;
- pc->iterGetValue = DataFrame_iterGetValue;
- pc->iterGetName = DataFrame_iterGetName;
- return;
- }
-
- if (is_simple_frame(obj)) {
- pc->iterBegin = NpyArr_iterBegin;
- pc->iterEnd = NpyArr_iterEnd;
- pc->iterNext = NpyArr_iterNext;
- pc->iterGetName = NpyArr_iterGetName;
-
- pc->newObj = PyObject_GetAttrString(obj, "values");
- if (!pc->newObj) {
- goto INVALID;
- }
- } else {
- pc->iterBegin = PdBlock_iterBegin;
- pc->iterEnd = PdBlock_iterEnd;
- pc->iterNext = PdBlock_iterNext;
- pc->iterGetName = PdBlock_iterGetName;
- }
- pc->iterGetValue = NpyArr_iterGetValue;
-
- if (enc->outputFormat == VALUES) {
- tc->type = JT_ARRAY;
- } else if (enc->outputFormat == RECORDS) {
- tc->type = JT_ARRAY;
- tmpObj = PyObject_GetAttrString(obj, "columns");
- if (!tmpObj) {
- goto INVALID;
- }
- values = get_values(tmpObj);
- if (!values) {
- Py_DECREF(tmpObj);
- goto INVALID;
- }
- pc->columnLabelsLen = PyObject_Size(tmpObj);
- pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->columnLabelsLen);
- Py_DECREF(tmpObj);
- if (!pc->columnLabels) {
- goto INVALID;
- }
- } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
- tc->type = JT_OBJECT;
- tmpObj = (enc->outputFormat == INDEX
- ? PyObject_GetAttrString(obj, "index")
- : PyObject_GetAttrString(obj, "columns"));
- if (!tmpObj) {
- goto INVALID;
- }
- values = get_values(tmpObj);
- if (!values) {
- Py_DECREF(tmpObj);
- goto INVALID;
- }
- pc->rowLabelsLen = PyObject_Size(tmpObj);
- pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->rowLabelsLen);
- Py_DECREF(tmpObj);
- tmpObj = (enc->outputFormat == INDEX
- ? PyObject_GetAttrString(obj, "columns")
- : PyObject_GetAttrString(obj, "index"));
- if (!tmpObj) {
- NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
- pc->rowLabels = NULL;
- goto INVALID;
- }
- values = get_values(tmpObj);
- if (!values) {
- Py_DECREF(tmpObj);
- NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
- pc->rowLabels = NULL;
- goto INVALID;
- }
- pc->columnLabelsLen = PyObject_Size(tmpObj);
- pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
- pc->columnLabelsLen);
- Py_DECREF(tmpObj);
- if (!pc->columnLabels) {
- NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
- pc->rowLabels = NULL;
- goto INVALID;
- }
-
- if (enc->outputFormat == COLUMNS) {
- pc->transpose = 1;
- }
- } else {
- goto INVALID;
- }
- return;
- } else if (PyDict_Check(obj)) {
- tc->type = JT_OBJECT;
- pc->iterBegin = Dict_iterBegin;
- pc->iterEnd = Dict_iterEnd;
- pc->iterNext = Dict_iterNext;
- pc->iterGetValue = Dict_iterGetValue;
- pc->iterGetName = Dict_iterGetName;
- pc->dictObj = obj;
- Py_INCREF(obj);
-
- return;
- } else if (PyList_Check(obj)) {
- tc->type = JT_ARRAY;
- pc->iterBegin = List_iterBegin;
- pc->iterEnd = List_iterEnd;
- pc->iterNext = List_iterNext;
- pc->iterGetValue = List_iterGetValue;
- pc->iterGetName = List_iterGetName;
- return;
- } else if (PyTuple_Check(obj)) {
- tc->type = JT_ARRAY;
- pc->iterBegin = Tuple_iterBegin;
- pc->iterEnd = Tuple_iterEnd;
- pc->iterNext = Tuple_iterNext;
- pc->iterGetValue = Tuple_iterGetValue;
- pc->iterGetName = Tuple_iterGetName;
- return;
- } else if (PyAnySet_Check(obj)) {
- tc->type = JT_ARRAY;
- pc->iterBegin = Set_iterBegin;
- pc->iterEnd = Set_iterEnd;
- pc->iterNext = Set_iterNext;
- pc->iterGetValue = Set_iterGetValue;
- pc->iterGetName = Set_iterGetName;
- return;
- }
-
- toDictFunc = PyObject_GetAttrString(obj, "toDict");
-
- if (toDictFunc) {
- PyObject *tuple = PyTuple_New(0);
- PyObject *toDictResult = PyObject_Call(toDictFunc, tuple, NULL);
- Py_DECREF(tuple);
- Py_DECREF(toDictFunc);
-
- if (toDictResult == NULL) {
- PyErr_Clear();
- tc->type = JT_NULL;
- return;
- }
-
- if (!PyDict_Check(toDictResult)) {
- Py_DECREF(toDictResult);
- tc->type = JT_NULL;
- return;
- }
-
- tc->type = JT_OBJECT;
- pc->iterBegin = Dict_iterBegin;
- pc->iterEnd = Dict_iterEnd;
- pc->iterNext = Dict_iterNext;
- pc->iterGetValue = Dict_iterGetValue;
- pc->iterGetName = Dict_iterGetName;
- pc->dictObj = toDictResult;
- return;
- }
-
- PyErr_Clear();
-
- if (enc->defaultHandler) {
- Object_invokeDefaultHandler(obj, enc);
- goto INVALID;
- }
-
- tc->type = JT_OBJECT;
- pc->iterBegin = Dir_iterBegin;
- pc->iterEnd = Dir_iterEnd;
- pc->iterNext = Dir_iterNext;
- pc->iterGetValue = Dir_iterGetValue;
- pc->iterGetName = Dir_iterGetName;
- return;
-
-INVALID:
- tc->type = JT_INVALID;
- PyObject_Free(tc->prv);
- tc->prv = NULL;
- return;
-}
-
-void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- if (tc->prv) {
- Py_XDECREF(GET_TC(tc)->newObj);
- GET_TC(tc)->newObj = NULL;
- NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen);
- GET_TC(tc)->rowLabels = NULL;
- NpyArr_freeLabels(GET_TC(tc)->columnLabels,
- GET_TC(tc)->columnLabelsLen);
- GET_TC(tc)->columnLabels = NULL;
- PyObject_Free(GET_TC(tc)->cStr);
- GET_TC(tc)->cStr = NULL;
- PyObject_Free(tc->prv);
- tc->prv = NULL;
- }
-}
-
-const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc,
- size_t *_outLen) {
- return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen);
-}
-
-JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->longValue;
-}
-
-double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
- return GET_TC(tc)->doubleValue;
-}
-
-const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc,
- size_t *_outLen) {
- PyObject *repr = PyObject_Str(obj);
- const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen);
- char *bytes = PyObject_Malloc(*_outLen + 1);
- memcpy(bytes, str, *_outLen + 1);
- GET_TC(tc)->cStr = bytes;
-
- Py_DECREF(repr);
-
- return GET_TC(tc)->cStr;
-}
-
-static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); }
-
-void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->iterBegin(obj, tc);
-}
-
-int Object_iterNext(JSOBJ obj, JSONTypeContext *tc) {
- return GET_TC(tc)->iterNext(obj, tc);
-}
-
-void Object_iterEnd(JSOBJ obj, JSONTypeContext *tc) {
- GET_TC(tc)->iterEnd(obj, tc);
-}
-
-JSOBJ Object_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
- return GET_TC(tc)->iterGetValue(obj, tc);
-}
-
-char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
- return GET_TC(tc)->iterGetName(obj, tc, outLen);
-}
-
-PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
- PyObject *kwargs) {
- PyDateTime_IMPORT;
- if (PyDateTimeAPI == NULL) {
- return NULL;
- }
-
- static char *kwlist[] = {"obj",
- "ensure_ascii",
- "double_precision",
- "encode_html_chars",
- "orient",
- "date_unit",
- "iso_dates",
- "default_handler",
- "indent",
- NULL};
-
- char buffer[65536];
- char *ret;
- PyObject *newobj;
- PyObject *oinput = NULL;
- PyObject *oensureAscii = NULL;
- int idoublePrecision = 10; // default double precision setting
- PyObject *oencodeHTMLChars = NULL;
- char *sOrient = NULL;
- char *sdateFormat = NULL;
- PyObject *oisoDates = 0;
- PyObject *odefHandler = 0;
- int indent = 0;
-
- PyObjectEncoder pyEncoder = {{
- Object_beginTypeContext,
- Object_endTypeContext,
- Object_getStringValue,
- Object_getLongValue,
- NULL, // getIntValue is unused
- Object_getDoubleValue,
- Object_getBigNumStringValue,
- Object_iterBegin,
- Object_iterNext,
- Object_iterEnd,
- Object_iterGetValue,
- Object_iterGetName,
- Object_releaseObject,
- PyObject_Malloc,
- PyObject_Realloc,
- PyObject_Free,
- -1, // recursionMax
- idoublePrecision,
- 1, // forceAscii
- 0, // encodeHTMLChars
- 0, // indent
- }};
- JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder;
-
- pyEncoder.npyCtxtPassthru = NULL;
- pyEncoder.blkCtxtPassthru = NULL;
- pyEncoder.npyType = -1;
- pyEncoder.npyValue = NULL;
- pyEncoder.datetimeIso = 0;
- pyEncoder.datetimeUnit = NPY_FR_ms;
- pyEncoder.outputFormat = COLUMNS;
- pyEncoder.defaultHandler = 0;
-
- if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist,
- &oinput, &oensureAscii, &idoublePrecision,
- &oencodeHTMLChars, &sOrient, &sdateFormat,
- &oisoDates, &odefHandler, &indent)) {
- return NULL;
- }
-
- if (oensureAscii != NULL && !PyObject_IsTrue(oensureAscii)) {
- encoder->forceASCII = 0;
- }
-
- if (oencodeHTMLChars != NULL && PyObject_IsTrue(oencodeHTMLChars)) {
- encoder->encodeHTMLChars = 1;
- }
-
- if (idoublePrecision > JSON_DOUBLE_MAX_DECIMALS || idoublePrecision < 0) {
- PyErr_Format(
- PyExc_ValueError,
- "Invalid value '%d' for option 'double_precision', max is '%u'",
- idoublePrecision, JSON_DOUBLE_MAX_DECIMALS);
- return NULL;
- }
- encoder->doublePrecision = idoublePrecision;
-
- if (sOrient != NULL) {
- if (strcmp(sOrient, "records") == 0) {
- pyEncoder.outputFormat = RECORDS;
- } else if (strcmp(sOrient, "index") == 0) {
- pyEncoder.outputFormat = INDEX;
- } else if (strcmp(sOrient, "split") == 0) {
- pyEncoder.outputFormat = SPLIT;
- } else if (strcmp(sOrient, "values") == 0) {
- pyEncoder.outputFormat = VALUES;
- } else if (strcmp(sOrient, "columns") != 0) {
- PyErr_Format(PyExc_ValueError,
- "Invalid value '%s' for option 'orient'", sOrient);
- return NULL;
- }
- }
-
- if (sdateFormat != NULL) {
- if (strcmp(sdateFormat, "s") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_s;
- } else if (strcmp(sdateFormat, "ms") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_ms;
- } else if (strcmp(sdateFormat, "us") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_us;
- } else if (strcmp(sdateFormat, "ns") == 0) {
- pyEncoder.datetimeUnit = NPY_FR_ns;
- } else {
- PyErr_Format(PyExc_ValueError,
- "Invalid value '%s' for option 'date_unit'",
- sdateFormat);
- return NULL;
- }
- }
-
- if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) {
- pyEncoder.datetimeIso = 1;
- }
-
- if (odefHandler != NULL && odefHandler != Py_None) {
- if (!PyCallable_Check(odefHandler)) {
- PyErr_SetString(PyExc_TypeError, "Default handler is not callable");
- return NULL;
- }
- pyEncoder.defaultHandler = odefHandler;
- }
-
- encoder->indent = indent;
-
- pyEncoder.originalOutputFormat = pyEncoder.outputFormat;
- ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer));
- if (PyErr_Occurred()) {
- return NULL;
- }
-
- if (encoder->errorMsg) {
- if (ret != buffer) {
- encoder->free(ret);
- }
- PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg);
- return NULL;
- }
-
- newobj = PyUnicode_FromString(ret);
-
- if (ret != buffer) {
- encoder->free(ret);
- }
-
- return newobj;
-}
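
objToJSON above is the C entry point behind DataFrame.to_json and Series.to_json; the keywords parsed here (orient, date_unit, iso_dates, default_handler, indent, ...) correspond to the user-facing to_json arguments, with orient selecting the outputFormat and date_format="iso" driving iso_dates. For example, assuming pandas is available:

    import pandas as pd

    df = pd.DataFrame({"when": pd.to_datetime(["2021-01-01"]), "x": [1]})

    # orient -> outputFormat, date_format="iso" -> iso_dates,
    # date_unit -> datetimeUnit, indent -> encoder->indent.
    print(df.to_json(orient="records", date_format="iso", date_unit="s", indent=2))

    # Objects the encoder cannot handle are routed through default_handler.
    print(pd.DataFrame({"obj": [object()]}).to_json(default_handler=str))
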
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/ujson.c b/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/ujson.c
deleted file mode 100644
index c12f88d2f93..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/ujson.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-* Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-* Neither the name of the ESN Social Software AB nor the
-names of its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
-* Copyright (c) 1988-1993 The Regents of the University of California.
-* Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-#include "version.h"
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-#define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
-#include "numpy/arrayobject.h"
-
-/* objToJSON */
-PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs);
-void *initObjToJSON(void);
-
-/* JSONToObj */
-PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs);
-
-#define ENCODER_HELP_TEXT \
- "Use ensure_ascii=false to output UTF-8. Pass in double_precision to " \
- "alter the maximum digit precision of doubles. Set " \
- "encode_html_chars=True to encode < > & as unicode escape sequences."
-
-static PyMethodDef ujsonMethods[] = {
- {"encode", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
- "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
- {"decode", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
- "Converts JSON as string to dict object structure. Use precise_float=True "
- "to use high precision float decoder."},
- {"dumps", (PyCFunction)objToJSON, METH_VARARGS | METH_KEYWORDS,
- "Converts arbitrary object recursively into JSON. " ENCODER_HELP_TEXT},
- {"loads", (PyCFunction)JSONToObj, METH_VARARGS | METH_KEYWORDS,
- "Converts JSON as string to dict object structure. Use precise_float=True "
- "to use high precision float decoder."},
- {NULL, NULL, 0, NULL} /* Sentinel */
-};
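
The method table above exposes the same encoder/decoder under ujson's traditional names (encode/decode) and the stdlib-style aliases (dumps/loads). A hedged sketch of direct use, again assuming the extension is importable as pandas._libs.json:

    from pandas._libs import json as ujson  # assumed import path

    payload = {"pi": 3.14159265358979, "text": "café"}

    # double_precision caps the digits after the decimal point;
    # ensure_ascii=False keeps UTF-8 text unescaped (see ENCODER_HELP_TEXT).
    s = ujson.dumps(payload, double_precision=4, ensure_ascii=False)
    print(s)

    # loads/decode round-trips the string back to Python objects.
    print(ujson.loads(s)["text"] == "café")   # True
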
-
-typedef struct {
- PyObject *type_decimal;
- PyObject *type_dataframe;
- PyObject *type_series;
- PyObject *type_index;
- PyObject *type_nat;
- PyObject *type_na;
-} modulestate;
-
-#define modulestate(o) ((modulestate *)PyModule_GetState(o))
-
-static int module_traverse(PyObject *m, visitproc visit, void *arg);
-static int module_clear(PyObject *m);
-static void module_free(void *module);
-
-static struct PyModuleDef moduledef = {.m_base = PyModuleDef_HEAD_INIT,
- .m_name = "_libjson",
- .m_methods = ujsonMethods,
- .m_size = sizeof(modulestate),
- .m_traverse = module_traverse,
- .m_clear = module_clear,
- .m_free = module_free};
-
-#ifndef PYPY_VERSION
-/* Used in objToJSON.c */
-int object_is_decimal_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_decimal = state->type_decimal;
- if (type_decimal == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_decimal);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_dataframe_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_dataframe = state->type_dataframe;
- if (type_dataframe == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_dataframe);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_series_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_series = state->type_series;
- if (type_series == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_series);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_index_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_index = state->type_index;
- if (type_index == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_index);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_nat_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_nat = state->type_nat;
- if (type_nat == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_nat);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_na_type(PyObject *obj) {
- PyObject *module = PyState_FindModule(&moduledef);
- if (module == NULL)
- return 0;
- modulestate *state = modulestate(module);
- if (state == NULL)
- return 0;
- PyObject *type_na = state->type_na;
- if (type_na == NULL) {
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_na);
- if (result == -1) {
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-#else
- /* Used in objToJSON.c */
-int object_is_decimal_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("decimal");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_decimal = PyObject_GetAttrString(module, "Decimal");
- if (type_decimal == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_decimal);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_decimal);
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_dataframe_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_dataframe = PyObject_GetAttrString(module, "DataFrame");
- if (type_dataframe == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_dataframe);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_dataframe);
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_series_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_series = PyObject_GetAttrString(module, "Series");
- if (type_series == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_series);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_series);
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_index_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_index = PyObject_GetAttrString(module, "Index");
- if (type_index == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_index);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_index);
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_nat_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas._libs.tslibs.nattype");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_nat = PyObject_GetAttrString(module, "NaTType");
- if (type_nat == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_nat);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_nat);
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-int object_is_na_type(PyObject *obj) {
- PyObject *module = PyImport_ImportModule("pandas._libs.missing");
- if (module == NULL) {
- PyErr_Clear();
- return 0;
- }
- PyObject *type_na = PyObject_GetAttrString(module, "NAType");
- if (type_na == NULL) {
- Py_DECREF(module);
- PyErr_Clear();
- return 0;
- }
- int result = PyObject_IsInstance(obj, type_na);
- if (result == -1) {
- Py_DECREF(module);
- Py_DECREF(type_na);
- PyErr_Clear();
- return 0;
- }
- return result;
-}
-
-#endif
-
-static int module_traverse(PyObject *m, visitproc visit, void *arg) {
- Py_VISIT(modulestate(m)->type_decimal);
- Py_VISIT(modulestate(m)->type_dataframe);
- Py_VISIT(modulestate(m)->type_series);
- Py_VISIT(modulestate(m)->type_index);
- Py_VISIT(modulestate(m)->type_nat);
- Py_VISIT(modulestate(m)->type_na);
- return 0;
-}
-
-static int module_clear(PyObject *m) {
- Py_CLEAR(modulestate(m)->type_decimal);
- Py_CLEAR(modulestate(m)->type_dataframe);
- Py_CLEAR(modulestate(m)->type_series);
- Py_CLEAR(modulestate(m)->type_index);
- Py_CLEAR(modulestate(m)->type_nat);
- Py_CLEAR(modulestate(m)->type_na);
- return 0;
-}
-
-static void module_free(void *module) { module_clear((PyObject *)module); }
-
-PyMODINIT_FUNC PyInit_json(void) {
- import_array()
- PyObject *module;
-
-#ifndef PYPY_VERSION
- // This function is not supported in PyPy.
- if ((module = PyState_FindModule(&moduledef)) != NULL) {
- Py_INCREF(module);
- return module;
- }
-#endif
-
- module = PyModule_Create(&moduledef);
- if (module == NULL) {
- return NULL;
- }
-
-#ifndef PYPY_VERSION
- PyObject *mod_decimal = PyImport_ImportModule("decimal");
- if (mod_decimal) {
- PyObject *type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal");
- assert(type_decimal != NULL);
- modulestate(module)->type_decimal = type_decimal;
- Py_DECREF(mod_decimal);
- }
-
- PyObject *mod_pandas = PyImport_ImportModule("pandas");
- if (mod_pandas) {
- PyObject *type_dataframe =
- PyObject_GetAttrString(mod_pandas, "DataFrame");
- assert(type_dataframe != NULL);
- modulestate(module)->type_dataframe = type_dataframe;
-
- PyObject *type_series = PyObject_GetAttrString(mod_pandas, "Series");
- assert(type_series != NULL);
- modulestate(module)->type_series = type_series;
-
- PyObject *type_index = PyObject_GetAttrString(mod_pandas, "Index");
- assert(type_index != NULL);
- modulestate(module)->type_index = type_index;
-
- Py_DECREF(mod_pandas);
- }
-
- PyObject *mod_nattype =
- PyImport_ImportModule("pandas._libs.tslibs.nattype");
- if (mod_nattype) {
- PyObject *type_nat = PyObject_GetAttrString(mod_nattype, "NaTType");
- assert(type_nat != NULL);
- modulestate(module)->type_nat = type_nat;
-
- Py_DECREF(mod_nattype);
- }
-
- PyObject *mod_natype = PyImport_ImportModule("pandas._libs.missing");
- if (mod_natype) {
- PyObject *type_na = PyObject_GetAttrString(mod_natype, "NAType");
- assert(type_na != NULL);
- modulestate(module)->type_na = type_na;
-
- Py_DECREF(mod_natype);
- } else {
- PyErr_Clear();
- }
-#endif
-
- /* Not vendored for now
- JSONDecodeError = PyErr_NewException("ujson.JSONDecodeError",
- PyExc_ValueError, NULL); Py_XINCREF(JSONDecodeError); if
- (PyModule_AddObject(module, "JSONDecodeError", JSONDecodeError) < 0)
- {
- Py_XDECREF(JSONDecodeError);
- Py_CLEAR(JSONDecodeError);
- Py_DECREF(module);
- return NULL;
- }
- */
-
- return module;
-}
diff --git a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/version.h b/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/version.h
deleted file mode 100644
index 15c55309d62..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/src/ujson/python/version.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the ESN Social Software AB nor the
- names of its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
-Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc)
-https://github.com/client9/stringencoders
-Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved.
-
-Numeric decoder derived from TCL library
-https://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms
- * Copyright (c) 1988-1993 The Regents of the University of California.
- * Copyright (c) 1994 Sun Microsystems, Inc.
-*/
-
-#ifndef PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
-#define PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
-
-#define UJSON_VERSION "1.33"
-
-#endif // PANDAS__LIBS_SRC_UJSON_PYTHON_VERSION_H_
diff --git a/contrib/python/pandas/py3/pandas/_libs/testing.pyi b/contrib/python/pandas/py3/pandas/_libs/testing.pyi
deleted file mode 100644
index 01da496975f..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/testing.pyi
+++ /dev/null
@@ -1,12 +0,0 @@
-def assert_dict_equal(a, b, compare_keys: bool = ...): ...
-def assert_almost_equal(
- a,
- b,
- rtol: float = ...,
- atol: float = ...,
- check_dtype: bool = ...,
- obj=...,
- lobj=...,
- robj=...,
- index_values=...,
-): ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/testing.pyx b/contrib/python/pandas/py3/pandas/_libs/testing.pyx
deleted file mode 100644
index 733879154b9..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/testing.pyx
+++ /dev/null
@@ -1,219 +0,0 @@
-import cmath
-import math
-
-import numpy as np
-
-from numpy cimport import_array
-
-import_array()
-
-from pandas._libs.util cimport (
- is_array,
- is_complex_object,
- is_real_number_object,
-)
-
-from pandas.core.dtypes.common import is_dtype_equal
-from pandas.core.dtypes.missing import (
- array_equivalent,
- isna,
-)
-
-
-cdef bint isiterable(obj):
- return hasattr(obj, "__iter__")
-
-
-cdef bint has_length(obj):
- return hasattr(obj, "__len__")
-
-
-cdef bint is_dictlike(obj):
- return hasattr(obj, "keys") and hasattr(obj, "__getitem__")
-
-
-cpdef assert_dict_equal(a, b, bint compare_keys=True):
- assert is_dictlike(a) and is_dictlike(b), (
- "Cannot compare dict objects, one or both is not dict-like"
- )
-
- a_keys = frozenset(a.keys())
- b_keys = frozenset(b.keys())
-
- if compare_keys:
- assert a_keys == b_keys
-
- for k in a_keys:
- assert_almost_equal(a[k], b[k])
-
- return True
-
-
-cpdef assert_almost_equal(a, b,
- rtol=1.e-5, atol=1.e-8,
- bint check_dtype=True,
- obj=None, lobj=None, robj=None, index_values=None):
- """
- Check that left and right objects are almost equal.
-
- Parameters
- ----------
- a : object
- b : object
- rtol : float, default 1e-5
- Relative tolerance.
-
- .. versionadded:: 1.1.0
- atol : float, default 1e-8
- Absolute tolerance.
-
- .. versionadded:: 1.1.0
- check_dtype : bool, default True
- Check dtype if both a and b are np.ndarray.
- obj : str, default None
- Specify object name being compared, internally used to show
- appropriate assertion message.
- lobj : str, default None
- Specify left object name being compared, internally used to show
- appropriate assertion message.
- robj : str, default None
- Specify right object name being compared, internally used to show
- appropriate assertion message.
- index_values : ndarray, default None
- Specify shared index values of objects being compared, internally used
- to show appropriate assertion message.
-
- .. versionadded:: 1.1.0
-
- """
- cdef:
- double diff = 0.0
- Py_ssize_t i, na, nb
- double fa, fb
- bint is_unequal = False, a_is_ndarray, b_is_ndarray
- str first_diff = ""
-
- if lobj is None:
- lobj = a
- if robj is None:
- robj = b
-
- if isinstance(a, dict) or isinstance(b, dict):
- return assert_dict_equal(a, b)
-
- if isinstance(a, str) or isinstance(b, str):
- assert a == b, f"{a} != {b}"
- return True
-
- a_is_ndarray = is_array(a)
- b_is_ndarray = is_array(b)
-
- if obj is None:
- if a_is_ndarray or b_is_ndarray:
- obj = "numpy array"
- else:
- obj = "Iterable"
-
- if isiterable(a):
-
- if not isiterable(b):
- from pandas._testing import assert_class_equal
-
- # classes cannot be equal here, so this raises an informative error
- assert_class_equal(a, b, obj=obj)
-
- assert has_length(a) and has_length(b), (
- f"Can't compare objects without length, one or both is invalid: ({a}, {b})"
- )
-
- if a_is_ndarray and b_is_ndarray:
- na, nb = a.size, b.size
- if a.shape != b.shape:
- from pandas._testing import raise_assert_detail
- raise_assert_detail(
- obj, f"{obj} shapes are different", a.shape, b.shape)
-
- if check_dtype and not is_dtype_equal(a.dtype, b.dtype):
- from pandas._testing import assert_attr_equal
- assert_attr_equal("dtype", a, b, obj=obj)
-
- if array_equivalent(a, b, strict_nan=True):
- return True
-
- else:
- na, nb = len(a), len(b)
-
- if na != nb:
- from pandas._testing import raise_assert_detail
-
- # if we have a small diff set, print it
- if abs(na - nb) < 10:
- r = list(set(a) ^ set(b))
- else:
- r = None
-
- raise_assert_detail(obj, f"{obj} length are different", na, nb, r)
-
- for i in range(len(a)):
- try:
- assert_almost_equal(a[i], b[i], rtol=rtol, atol=atol)
- except AssertionError:
- is_unequal = True
- diff += 1
- if not first_diff:
- first_diff = (
- f"At positional index {i}, first diff: {a[i]} != {b[i]}"
- )
-
- if is_unequal:
- from pandas._testing import raise_assert_detail
- msg = (f"{obj} values are different "
- f"({np.round(diff * 100.0 / na, 5)} %)")
- raise_assert_detail(
- obj, msg, lobj, robj, first_diff=first_diff, index_values=index_values
- )
-
- return True
-
- elif isiterable(b):
- from pandas._testing import assert_class_equal
-
- # classes cannot be equal here, so this raises an informative error
- assert_class_equal(a, b, obj=obj)
-
- if isna(a) and isna(b):
- # TODO: Should require same-dtype NA?
- # nan / None comparison
- return True
-
- if isna(a) and not isna(b) or not isna(a) and isna(b):
- # boolean value of pd.NA is ambiguous
- raise AssertionError(f"{a} != {b}")
-
- if a == b:
- # object comparison
- return True
-
- if is_real_number_object(a) and is_real_number_object(b):
- if array_equivalent(a, b, strict_nan=True):
- # inf comparison
- return True
-
- fa, fb = a, b
-
- if not math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol):
- assert False, (f"expected {fb:.5f} but got {fa:.5f}, "
- f"with rtol={rtol}, atol={atol}")
- return True
-
- if is_complex_object(a) and is_complex_object(b):
- if array_equivalent(a, b, strict_nan=True):
- # inf comparison
- return True
-
- if not cmath.isclose(a, b, rel_tol=rtol, abs_tol=atol):
- assert False, (f"expected {b:.5f} but got {a:.5f}, "
- f"with rtol={rtol}, atol={atol}")
- return True
-
- raise AssertionError(f"{a} != {b}")
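For context, the real-number branch above ultimately defers to math.isclose; a minimal pure-Python sketch of that tolerance rule, using the default rtol/atol from the signature and hypothetical inputs:

    import math

    rtol, atol = 1.e-5, 1.e-8      # defaults from assert_almost_equal above
    fa, fb = 1.000001, 1.0         # hypothetical scalar inputs
    # mirrors math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol) in the scalar branch
    assert math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol), (
        f"expected {fb:.5f} but got {fa:.5f}, with rtol={rtol}, atol={atol}"
    )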
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslib.pyi b/contrib/python/pandas/py3/pandas/_libs/tslib.pyi
deleted file mode 100644
index 9819b5173db..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslib.pyi
+++ /dev/null
@@ -1,32 +0,0 @@
-from datetime import tzinfo
-
-import numpy as np
-
-from pandas._typing import npt
-
-def format_array_from_datetime(
- values: npt.NDArray[np.int64],
- tz: tzinfo | None = ...,
- format: str | None = ...,
- na_rep: str | float = ...,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.object_]: ...
-def array_with_unit_to_datetime(
- values: npt.NDArray[np.object_],
- unit: str,
- errors: str = ...,
-) -> tuple[np.ndarray, tzinfo | None]: ...
-def first_non_null(values: np.ndarray) -> int: ...
-def array_to_datetime(
- values: npt.NDArray[np.object_],
- errors: str = ...,
- dayfirst: bool = ...,
- yearfirst: bool = ...,
- utc: bool = ...,
-) -> tuple[np.ndarray, tzinfo | None]: ...
-
-# returned ndarray may be object dtype or datetime64[ns]
-
-def array_to_datetime_with_tz(
- values: npt.NDArray[np.object_], tz: tzinfo
-) -> npt.NDArray[np.int64]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslib.pyx b/contrib/python/pandas/py3/pandas/_libs/tslib.pyx
deleted file mode 100644
index 19dd7aabe6b..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslib.pyx
+++ /dev/null
@@ -1,715 +0,0 @@
-import warnings
-
-from pandas.util._exceptions import find_stack_level
-
-cimport cython
-
-from datetime import timezone
-
-from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- datetime,
- import_datetime,
- timedelta,
- tzinfo,
-)
-from cpython.object cimport PyObject
-
-# import datetime C API
-import_datetime()
-
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- ndarray,
-)
-
-import numpy as np
-
-cnp.import_array()
-
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- check_dts_bounds,
- npy_datetimestruct,
- npy_datetimestruct_to_datetime,
- pandas_datetime_to_datetimestruct,
- pydate_to_dt64,
- string_to_dts,
-)
-from pandas._libs.tslibs.strptime cimport parse_today_now
-from pandas._libs.util cimport (
- is_datetime64_object,
- is_float_object,
- is_integer_object,
-)
-
-from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
-
-from pandas._libs.tslibs.conversion cimport (
- _TSObject,
- cast_from_unit,
- convert_str_to_tsobject,
- convert_timezone,
- get_datetime64_nanos,
- parse_pydatetime,
- precision_from_unit,
-)
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
- c_nat_strings as nat_strings,
-)
-from pandas._libs.tslibs.timestamps cimport _Timestamp
-
-from pandas._libs.tslibs import (
- Resolution,
- get_resolution,
-)
-from pandas._libs.tslibs.timestamps import Timestamp
-
-# Note: this is the only non-tslibs intra-pandas dependency here
-
-from pandas._libs.missing cimport checknull_with_nat_and_na
-from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
-
-
-def _test_parse_iso8601(ts: str):
- """
- TESTING ONLY: Parse string into Timestamp using the iso8601 parser. Used
- only for testing; actual construction uses `convert_str_to_tsobject`.
- """
- cdef:
- _TSObject obj
- int out_local = 0, out_tzoffset = 0
- NPY_DATETIMEUNIT out_bestunit
-
- obj = _TSObject()
-
- string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
- obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
- check_dts_bounds(&obj.dts)
- if out_local == 1:
- obj.tzinfo = timezone(timedelta(minutes=out_tzoffset))
- obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo)
- return Timestamp(obj.value, tz=obj.tzinfo)
- else:
- return Timestamp(obj.value)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def format_array_from_datetime(
- ndarray values,
- tzinfo tz=None,
- str format=None,
- na_rep: str | float = "NaT",
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-) -> np.ndarray:
- """
- Return an object ndarray of the string-formatted values.
-
- Parameters
- ----------
- values : ndarray[int64_t], arbitrary ndim
- tz : tzinfo or None, default None
- format : str or None, default None
- a strftime capable string
- na_rep : str or float, default "NaT"
- the representation to use for NaT values
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- np.ndarray[object]
- """
- cdef:
- int64_t val, ns, N = values.size
- bint show_ms = False, show_us = False, show_ns = False
- bint basic_format = False, basic_format_day = False
- _Timestamp ts
- object res
- npy_datetimestruct dts
-
- # Note that `result` (and thus `result_flat`) is C-order and
- # `it` iterates C-order as well, so the iteration matches
- # See discussion at
- # github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
- ndarray result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
- object[::1] res_flat = result.ravel() # should NOT be a copy
- cnp.flatiter it = cnp.PyArray_IterNew(values)
-
- if tz is None:
- # if we don't have a format nor tz, then choose
- # a format based on precision
- basic_format = format is None
- if basic_format:
- reso_obj = get_resolution(values, tz=tz, reso=reso)
- show_ns = reso_obj == Resolution.RESO_NS
- show_us = reso_obj == Resolution.RESO_US
- show_ms = reso_obj == Resolution.RESO_MS
-
- elif format == "%Y-%m-%d %H:%M:%S":
- # Same format as default, but with hardcoded precision (s)
- basic_format = True
- show_ns = show_us = show_ms = False
-
- elif format == "%Y-%m-%d %H:%M:%S.%f":
- # Same format as default, but with hardcoded precision (us)
- basic_format = show_us = True
- show_ns = show_ms = False
-
- elif format == "%Y-%m-%d":
- # Default format for dates
- basic_format_day = True
-
- assert not (basic_format_day and basic_format)
-
- for i in range(N):
- # Analogous to: utc_val = values[i]
- val = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0]
-
- if val == NPY_NAT:
- res = na_rep
- elif basic_format_day:
-
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- res = f"{dts.year}-{dts.month:02d}-{dts.day:02d}"
-
- elif basic_format:
-
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- res = (f"{dts.year}-{dts.month:02d}-{dts.day:02d} "
- f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}")
-
- if show_ns:
- ns = dts.ps // 1000
- res += f".{ns + dts.us * 1000:09d}"
- elif show_us:
- res += f".{dts.us:06d}"
- elif show_ms:
- res += f".{dts.us // 1000:03d}"
-
- else:
-
- ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz)
- if format is None:
- # Use datetime.__str__, which returns ts.isoformat(sep=' ')
- res = str(ts)
- else:
-
- # invalid format string
- # requires dates > 1900
- try:
- # Note: dispatches to pydatetime
- res = ts.strftime(format)
- except ValueError:
- # Use datetime.__str__, which returns ts.isoformat(sep=' ')
- res = str(ts)
-
- # Note: we can index result directly instead of using PyArray_MultiIter_DATA
- # like we do for the other functions because result is known C-contiguous
- # and is the first argument to PyArray_MultiIterNew2. The usual pattern
- # does not seem to work with object dtype.
- # See discussion at
- # github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
- res_flat[i] = res
-
- cnp.PyArray_ITER_NEXT(it)
-
- return result
-
-
-def array_with_unit_to_datetime(
- ndarray[object] values,
- str unit,
- str errors="coerce"
-):
- """
- Convert the ndarray to datetime according to the time unit.
-
- This function converts an array of objects into a numpy array of
- datetime64[ns]. It returns both the converted array
- and the parsed timezone offset (or None).
-
- if errors:
- - raise: return converted values or raise OutOfBoundsDatetime
- if out of range on the conversion or
- ValueError for other conversions (e.g. a string)
- - ignore: return non-convertible values as the same unit
- - coerce: NaT for non-convertibles
-
- Parameters
- ----------
- values : ndarray
- Date-like objects to convert.
- unit : str
- Time unit to use during conversion.
- errors : str, default "coerce"
- Error behavior when parsing.
-
- Returns
- -------
- result : ndarray of m8 values
- tz : parsed timezone offset or None
- """
- cdef:
- Py_ssize_t i, n=len(values)
- int64_t mult
- bint is_ignore = errors == "ignore"
- bint is_coerce = errors == "coerce"
- bint is_raise = errors == "raise"
- ndarray[int64_t] iresult
- tzinfo tz = None
- float fval
-
- assert is_ignore or is_coerce or is_raise
-
- if unit == "ns":
- result, tz = array_to_datetime(
- values.astype(object, copy=False),
- errors=errors,
- )
- return result, tz
-
- mult, _ = precision_from_unit(unit)
-
- result = np.empty(n, dtype="M8[ns]")
- iresult = result.view("i8")
-
- for i in range(n):
- val = values[i]
-
- try:
- if checknull_with_nat_and_na(val):
- iresult[i] = NPY_NAT
-
- elif is_integer_object(val) or is_float_object(val):
-
- if val != val or val == NPY_NAT:
- iresult[i] = NPY_NAT
- else:
- iresult[i] = cast_from_unit(val, unit)
-
- elif isinstance(val, str):
- if len(val) == 0 or val in nat_strings:
- iresult[i] = NPY_NAT
-
- else:
-
- try:
- fval = float(val)
- except ValueError:
- raise ValueError(
- f"non convertible value {val} with the unit '{unit}'"
- )
- warnings.warn(
- "The behavior of 'to_datetime' with 'unit' when parsing "
- "strings is deprecated. In a future version, strings will "
- "be parsed as datetime strings, matching the behavior "
- "without a 'unit'. To retain the old behavior, explicitly "
- "cast ints or floats to numeric type before calling "
- "to_datetime.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
- iresult[i] = cast_from_unit(fval, unit)
-
- else:
- # TODO: makes more sense as TypeError, but that would be an
- # API change.
- raise ValueError(
- f"unit='{unit}' not valid with non-numerical val='{val}'"
- )
-
- except (ValueError, OutOfBoundsDatetime, TypeError) as err:
- if is_raise:
- err.args = (f"{err}, at position {i}",)
- raise
- elif is_ignore:
- # we have hit an exception
- # and are in ignore mode
- # redo as object
- return _array_with_unit_to_datetime_object_fallback(values, unit)
- else:
- # is_coerce
- iresult[i] = NPY_NAT
-
- return result, tz
-
-
-cdef _array_with_unit_to_datetime_object_fallback(ndarray[object] values, str unit):
- cdef:
- Py_ssize_t i, n = len(values)
- ndarray[object] oresult
- tzinfo tz = None
-
- # TODO: fix subtle differences between this and no-unit code
- oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
- for i in range(n):
- val = values[i]
-
- if checknull_with_nat_and_na(val):
- oresult[i] = <object>NaT
- elif is_integer_object(val) or is_float_object(val):
-
- if val != val or val == NPY_NAT:
- oresult[i] = <object>NaT
- else:
- try:
- oresult[i] = Timestamp(val, unit=unit)
- except OutOfBoundsDatetime:
- oresult[i] = val
-
- elif isinstance(val, str):
- if len(val) == 0 or val in nat_strings:
- oresult[i] = <object>NaT
-
- else:
- oresult[i] = val
-
- return oresult, tz
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def first_non_null(values: ndarray) -> int:
- """Find position of first non-null value, return -1 if there isn't one."""
- cdef:
- Py_ssize_t n = len(values)
- Py_ssize_t i
- for i in range(n):
- val = values[i]
- if checknull_with_nat_and_na(val):
- continue
- if (
- isinstance(val, str)
- and
- (len(val) == 0 or val in nat_strings or val in ("now", "today"))
- ):
- continue
- return i
- else:
- return -1
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef array_to_datetime(
- ndarray values, # object dtype, arbitrary ndim
- str errors="raise",
- bint dayfirst=False,
- bint yearfirst=False,
- bint utc=False,
-):
- """
- Converts a 1D array of date-like values to a numpy array of either:
- 1) datetime64[ns] data
- 2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError
- is encountered
-
- Also returns a fixed-offset tzinfo object if an array of strings with the same
- timezone offset is passed and utc=True is not passed. Otherwise, None
- is returned
-
- Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric,
- strings
-
- Parameters
- ----------
- values : ndarray of object
- date-like objects to convert
- errors : str, default 'raise'
- error behavior when parsing
- dayfirst : bool, default False
- dayfirst parsing behavior when encountering datetime strings
- yearfirst : bool, default False
- yearfirst parsing behavior when encountering datetime strings
- utc : bool, default False
- indicator whether the dates should be UTC
-
- Returns
- -------
- np.ndarray
- May be datetime64[ns] or object dtype
- tzinfo or None
- """
- cdef:
- Py_ssize_t i, n = values.size
- object val, tz
- ndarray[int64_t] iresult
- npy_datetimestruct dts
- bint utc_convert = bool(utc)
- bint seen_datetime_offset = False
- bint is_raise = errors == "raise"
- bint is_ignore = errors == "ignore"
- bint is_coerce = errors == "coerce"
- bint is_same_offsets
- _TSObject _ts
- float tz_offset
- set out_tzoffset_vals = set()
- tzinfo tz_out = None
- bint found_tz = False, found_naive = False
- cnp.broadcast mi
-
- # specify error conditions
- assert is_raise or is_ignore or is_coerce
-
- result = np.empty((<object>values).shape, dtype="M8[ns]")
- mi = cnp.PyArray_MultiIterNew2(result, values)
- iresult = result.view("i8").ravel()
-
- for i in range(n):
- # Analogous to `val = values[i]`
- val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- try:
- if checknull_with_nat_and_na(val):
- iresult[i] = NPY_NAT
-
- elif PyDateTime_Check(val):
- if val.tzinfo is not None:
- found_tz = True
- else:
- found_naive = True
- tz_out = convert_timezone(
- val.tzinfo,
- tz_out,
- found_naive,
- found_tz,
- utc_convert,
- )
- iresult[i] = parse_pydatetime(val, &dts, utc_convert)
-
- elif PyDate_Check(val):
- iresult[i] = pydate_to_dt64(val, &dts)
- check_dts_bounds(&dts)
-
- elif is_datetime64_object(val):
- iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
-
- elif is_integer_object(val) or is_float_object(val):
- # these must be ns unit by definition
-
- if val != val or val == NPY_NAT:
- iresult[i] = NPY_NAT
- else:
- # we now need to parse this as if unit='ns'
- iresult[i] = cast_from_unit(val, "ns")
-
- elif isinstance(val, str):
- # string
- if type(val) is not str:
- # GH#32264 np.str_ object
- val = str(val)
-
- if parse_today_now(val, &iresult[i], utc):
- # We can't _quite_ dispatch this to convert_str_to_tsobject
- # bc there isn't a nice way to pass "utc"
- cnp.PyArray_MultiIter_NEXT(mi)
- continue
-
- _ts = convert_str_to_tsobject(
- val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
- )
- _ts.ensure_reso(NPY_FR_ns, val)
-
- iresult[i] = _ts.value
-
- tz = _ts.tzinfo
- if tz is not None:
- # dateutil timezone objects cannot be hashed, so
- # store the UTC offsets in seconds instead
- nsecs = tz.utcoffset(None).total_seconds()
- out_tzoffset_vals.add(nsecs)
- # need to set seen_datetime_offset *after* the
- # potentially-raising timezone(timedelta(...)) call,
- # otherwise we can go down the is_same_offsets path
- # bc len(out_tzoffset_vals) == 0
- seen_datetime_offset = True
- else:
- # Add a marker for naive string, to track if we are
- # parsing mixed naive and aware strings
- out_tzoffset_vals.add("naive")
-
- else:
- raise TypeError(f"{type(val)} is not convertible to datetime")
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- except (TypeError, OverflowError, ValueError) as ex:
- ex.args = (f"{ex}, at position {i}",)
- if is_coerce:
- iresult[i] = NPY_NAT
- cnp.PyArray_MultiIter_NEXT(mi)
- continue
- elif is_raise:
- raise
- return values, None
-
- if seen_datetime_offset and not utc_convert:
- # GH#17697
- # 1) If all the offsets are equal, return one offset for
- # the parsed dates to (maybe) pass to DatetimeIndex
- # 2) If the offsets are different, then force the parsing down the
- # object path where an array of datetimes
- # (with individual dateutil.tzoffsets) are returned
- is_same_offsets = len(out_tzoffset_vals) == 1
- if not is_same_offsets:
- return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
- else:
- tz_offset = out_tzoffset_vals.pop()
- tz_out = timezone(timedelta(seconds=tz_offset))
- return result, tz_out
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef _array_to_datetime_object(
- ndarray[object] values,
- str errors,
- bint dayfirst=False,
- bint yearfirst=False,
-):
- """
- Fallback function for array_to_datetime
-
- Attempts to parse datetime strings with dateutil to return an array
- of datetime objects
-
- Parameters
- ----------
- values : ndarray[object]
- date-like objects to convert
- errors : str
- error behavior when parsing
- dayfirst : bool, default False
- dayfirst parsing behavior when encountering datetime strings
- yearfirst : bool, default False
- yearfirst parsing behavior when encountering datetime strings
-
- Returns
- -------
- np.ndarray[object]
- Literal[None]
- """
- cdef:
- Py_ssize_t i, n = values.size
- object val
- bint is_ignore = errors == "ignore"
- bint is_coerce = errors == "coerce"
- bint is_raise = errors == "raise"
- ndarray oresult_nd
- ndarray[object] oresult
- npy_datetimestruct dts
- cnp.broadcast mi
- _TSObject tsobj
-
- assert is_raise or is_ignore or is_coerce
-
- oresult_nd = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
- mi = cnp.PyArray_MultiIterNew2(oresult_nd, values)
- oresult = oresult_nd.ravel()
-
- # We return an object array and only attempt to parse:
- # 1) NaT or NaT-like values
- # 2) datetime strings, which we return as datetime.datetime
- # 3) special strings - "now" & "today"
- for i in range(n):
- # Analogous to: val = values[i]
- val = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if checknull_with_nat_and_na(val) or PyDateTime_Check(val):
- # GH 25978. No need to parse NaT-like or datetime-like vals
- oresult[i] = val
- elif isinstance(val, str):
- if type(val) is not str:
- # GH#32264 np.str_ objects
- val = str(val)
-
- if len(val) == 0 or val in nat_strings:
- oresult[i] = "NaT"
- cnp.PyArray_MultiIter_NEXT(mi)
- continue
-
- try:
- tsobj = convert_str_to_tsobject(
- val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
- )
- tsobj.ensure_reso(NPY_FR_ns, val)
-
- dts = tsobj.dts
- oresult[i] = datetime(
- dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us,
- tzinfo=tsobj.tzinfo,
- fold=tsobj.fold,
- )
-
- except (ValueError, OverflowError) as ex:
- ex.args = (f"{ex}, at position {i}", )
- if is_coerce:
- oresult[i] = <object>NaT
- cnp.PyArray_MultiIter_NEXT(mi)
- continue
- if is_raise:
- raise
- return values, None
- else:
- if is_raise:
- raise
- return values, None
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return oresult_nd, None
-
-
-def array_to_datetime_with_tz(ndarray values, tzinfo tz):
- """
- Vectorized analogue to pd.Timestamp(value, tz=tz)
-
- values has object-dtype, unrestricted ndim.
-
- Major differences between this and array_to_datetime with utc=True
- - np.datetime64 objects are treated as _wall_ times.
- - tznaive datetimes are treated as _wall_ times.
- """
- cdef:
- ndarray result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_INT64, 0)
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, values)
- Py_ssize_t i, n = values.size
- object item
- int64_t ival
- datetime ts
-
- for i in range(n):
- # Analogous to `item = values[i]`
- item = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if checknull_with_nat_and_na(item):
- # this catches pd.NA which would raise in the Timestamp constructor
- ival = NPY_NAT
-
- else:
- ts = Timestamp(item)
- if ts is NaT:
- ival = NPY_NAT
- else:
- if ts.tz is not None:
- ts = ts.tz_convert(tz)
- else:
- # datetime64, tznaive pydatetime, int, float
- ts = ts.tz_localize(tz)
- ts = ts.as_unit("ns")
- ival = ts._value
-
- # Analogous to: result[i] = ival
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return result
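A brief sketch of the GH#17697 offset rule implemented at the end of array_to_datetime above (a single shared offset yields one fixed-offset tzinfo, differing offsets fall back to the object path); the offset value below is hypothetical:

    from datetime import timedelta, timezone

    out_tzoffset_vals = {19800.0}      # hypothetical: every string carried +05:30 (in seconds)

    if len(out_tzoffset_vals) == 1:    # is_same_offsets
        tz_out = timezone(timedelta(seconds=out_tzoffset_vals.pop()))
    else:
        tz_out = None                  # would re-parse via _array_to_datetime_object
    print(tz_out)                      # UTC+05:30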
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/__init__.py b/contrib/python/pandas/py3/pandas/_libs/tslibs/__init__.py
deleted file mode 100644
index 42f84619ddb..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/__init__.py
+++ /dev/null
@@ -1,85 +0,0 @@
-__all__ = [
- "dtypes",
- "localize_pydatetime",
- "NaT",
- "NaTType",
- "iNaT",
- "nat_strings",
- "OutOfBoundsDatetime",
- "OutOfBoundsTimedelta",
- "IncompatibleFrequency",
- "Period",
- "Resolution",
- "Timedelta",
- "normalize_i8_timestamps",
- "is_date_array_normalized",
- "dt64arr_to_periodarr",
- "delta_to_nanoseconds",
- "ints_to_pydatetime",
- "ints_to_pytimedelta",
- "get_resolution",
- "Timestamp",
- "tz_convert_from_utc_single",
- "tz_convert_from_utc",
- "to_offset",
- "Tick",
- "BaseOffset",
- "tz_compare",
- "is_unitless",
- "astype_overflowsafe",
- "get_unit_from_dtype",
- "periods_per_day",
- "periods_per_second",
- "is_supported_unit",
- "npy_unit_to_abbrev",
- "get_supported_reso",
-]
-
-from pandas._libs.tslibs import dtypes
-from pandas._libs.tslibs.conversion import localize_pydatetime
-from pandas._libs.tslibs.dtypes import (
- Resolution,
- get_supported_reso,
- is_supported_unit,
- npy_unit_to_abbrev,
- periods_per_day,
- periods_per_second,
-)
-from pandas._libs.tslibs.nattype import (
- NaT,
- NaTType,
- iNaT,
- nat_strings,
-)
-from pandas._libs.tslibs.np_datetime import (
- OutOfBoundsDatetime,
- OutOfBoundsTimedelta,
- astype_overflowsafe,
- is_unitless,
- py_get_unit_from_dtype as get_unit_from_dtype,
-)
-from pandas._libs.tslibs.offsets import (
- BaseOffset,
- Tick,
- to_offset,
-)
-from pandas._libs.tslibs.period import (
- IncompatibleFrequency,
- Period,
-)
-from pandas._libs.tslibs.timedeltas import (
- Timedelta,
- delta_to_nanoseconds,
- ints_to_pytimedelta,
-)
-from pandas._libs.tslibs.timestamps import Timestamp
-from pandas._libs.tslibs.timezones import tz_compare
-from pandas._libs.tslibs.tzconversion import tz_convert_from_utc_single
-from pandas._libs.tslibs.vectorized import (
- dt64arr_to_periodarr,
- get_resolution,
- ints_to_pydatetime,
- is_date_array_normalized,
- normalize_i8_timestamps,
- tz_convert_from_utc,
-)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/base.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/base.pxd
deleted file mode 100644
index 3bffff7aca4..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/base.pxd
+++ /dev/null
@@ -1,5 +0,0 @@
-from cpython.datetime cimport datetime
-
-
-cdef class ABCTimestamp(datetime):
- pass
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/base.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/base.pyx
deleted file mode 100644
index 1677a8b0be1..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/base.pyx
+++ /dev/null
@@ -1,12 +0,0 @@
-"""
-We define base classes that will be inherited by Timestamp, Timedelta, etc
-in order to allow for fast isinstance checks without circular dependency issues.
-
-This is analogous to core.dtypes.generic.
-"""
-
-from cpython.datetime cimport datetime
-
-
-cdef class ABCTimestamp(datetime):
- pass
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pxd
deleted file mode 100644
index 341f2176f5e..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pxd
+++ /dev/null
@@ -1,20 +0,0 @@
-from cython cimport Py_ssize_t
-from numpy cimport (
- int32_t,
- int64_t,
-)
-
-ctypedef (int32_t, int32_t, int32_t) iso_calendar_t
-
-cdef int dayofweek(int y, int m, int d) nogil
-cdef bint is_leapyear(int64_t year) nogil
-cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil
-cpdef int32_t get_week_of_year(int year, int month, int day) nogil
-cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil
-cpdef int32_t get_day_of_year(int year, int month, int day) nogil
-cpdef int get_lastbday(int year, int month) nogil
-cpdef int get_firstbday(int year, int month) nogil
-
-cdef dict c_MONTH_NUMBERS
-
-cdef int32_t* month_offset
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyi
deleted file mode 100644
index 993f18a61d7..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyi
+++ /dev/null
@@ -1,12 +0,0 @@
-DAYS: list[str]
-MONTH_ALIASES: dict[int, str]
-MONTH_NUMBERS: dict[str, int]
-MONTHS: list[str]
-int_to_weekday: dict[int, str]
-
-def get_firstbday(year: int, month: int) -> int: ...
-def get_lastbday(year: int, month: int) -> int: ...
-def get_day_of_year(year: int, month: int, day: int) -> int: ...
-def get_iso_calendar(year: int, month: int, day: int) -> tuple[int, int, int]: ...
-def get_week_of_year(year: int, month: int, day: int) -> int: ...
-def get_days_in_month(year: int, month: int) -> int: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyx
deleted file mode 100644
index 8b5b079649c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/ccalendar.pyx
+++ /dev/null
@@ -1,310 +0,0 @@
-# cython: boundscheck=False
-"""
-Cython implementations of functions resembling the stdlib calendar module
-"""
-cimport cython
-from numpy cimport (
- int32_t,
- int64_t,
-)
-
-# ----------------------------------------------------------------------
-# Constants
-
-# Slightly more performant cython lookups than a 2D table
-# The first 12 entries correspond to month lengths for non-leap years.
-# The remaining 12 entries give month lengths for leap years
-cdef int32_t* days_per_month_array = [
- 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31,
- 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
-
-cdef int* em = [0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334]
-
-# The first 13 entries give the month days elapsed as of the first of month N
-# (or the total number of days in the year for N=13) in non-leap years.
-# The remaining 13 entries give the days elapsed in leap years.
-cdef int32_t* month_offset = [
- 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365,
- 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366]
-
-# Canonical location for other modules to find name constants
-MONTHS = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL",
- "AUG", "SEP", "OCT", "NOV", "DEC"]
-# The first blank line is consistent with calendar.month_name in the calendar
-# standard library
-MONTHS_FULL = ["", "January", "February", "March", "April", "May", "June",
- "July", "August", "September", "October", "November",
- "December"]
-MONTH_NUMBERS = {name: num for num, name in enumerate(MONTHS)}
-cdef dict c_MONTH_NUMBERS = MONTH_NUMBERS
-MONTH_ALIASES = {(num + 1): name for num, name in enumerate(MONTHS)}
-MONTH_TO_CAL_NUM = {name: num + 1 for num, name in enumerate(MONTHS)}
-
-DAYS = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]
-DAYS_FULL = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
- "Saturday", "Sunday"]
-int_to_weekday = {num: name for num, name in enumerate(DAYS)}
-weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday}
-
-
-# ----------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil:
- """
- Return the number of days in the given month of the given year.
-
- Parameters
- ----------
- year : int
- month : int
-
- Returns
- -------
- days_in_month : int
-
- Notes
- -----
- Assumes that the arguments are valid. Passing a month not between 1 and 12
- risks a segfault.
- """
- return days_per_month_array[12 * is_leapyear(year) + month - 1]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-@cython.cdivision(True)
-cdef long quot(long a, long b) noexcept nogil:
- cdef long x
- x = a/b
- if (a < 0):
- x -= (a % b != 0)
- return x
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-@cython.cdivision(True)
-cdef int dayofweek(int y, int m, int d) noexcept nogil:
- """
- Find the day of week for the date described by the Y/M/D triple y, m, d
- using Gauss' method, as described on Wikipedia.
-
- 0 represents Monday. See [1]_.
-
- Parameters
- ----------
- y : int
- m : int
- d : int
-
- Returns
- -------
- weekday : int
-
- Notes
- -----
- Assumes that y, m, d, represents a valid date.
-
- See Also
- --------
- [1] https://docs.python.org/3/library/calendar.html#calendar.weekday
-
- [2] https://en.wikipedia.org/wiki/\
- Determination_of_the_day_of_the_week#Gauss's_algorithm
- """
- # Note: this particular implementation comes from
- # http://berndt-schwerdtfeger.de/wp-content/uploads/pdf/cal.pdf
- cdef:
- long c
- int g
- int f
- int e
-
- if (m < 3):
- y -= 1
-
- c = quot(y, 100)
- g = y - c * 100
- f = 5 * (c - quot(c, 4) * 4)
- e = em[m]
-
- if (m > 2):
- e -= 1
- return (-1 + d + e + f + g + g/4) % 7
-
-cdef bint is_leapyear(int64_t year) nogil:
- """
- Returns 1 if the given year is a leap year, 0 otherwise.
-
- Parameters
- ----------
- year : int
-
- Returns
- -------
- is_leap : bool
- """
- return ((year & 0x3) == 0 and # year % 4 == 0
- ((year % 100) != 0 or (year % 400) == 0))
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef int32_t get_week_of_year(int year, int month, int day) nogil:
- """
- Return the ordinal week-of-year for the given day.
-
- Parameters
- ----------
- year : int
- month : int
- day : int
-
- Returns
- -------
- week_of_year : int32_t
-
- Notes
- -----
- Assumes the inputs describe a valid date.
- """
- return get_iso_calendar(year, month, day)[1]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil:
- """
- Return the year, week, and day of year corresponding to ISO 8601
-
- Parameters
- ----------
- year : int
- month : int
- day : int
-
- Returns
- -------
- year : int32_t
- week : int32_t
- day : int32_t
-
- Notes
- -----
- Assumes the inputs describe a valid date.
- """
- cdef:
- int32_t doy, dow
- int32_t iso_year, iso_week
-
- doy = get_day_of_year(year, month, day)
- dow = dayofweek(year, month, day)
-
- # estimate
- iso_week = (doy - 1) - dow + 3
- if iso_week >= 0:
- iso_week = iso_week // 7 + 1
-
- # verify
- if iso_week < 0:
- if (iso_week > -2) or (iso_week == -2 and is_leapyear(year - 1)):
- iso_week = 53
- else:
- iso_week = 52
- elif iso_week == 53:
- if 31 - day + dow < 3:
- iso_week = 1
-
- iso_year = year
- if iso_week == 1 and month == 12:
- iso_year += 1
-
- elif iso_week >= 52 and month == 1:
- iso_year -= 1
-
- return iso_year, iso_week, dow + 1
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef int32_t get_day_of_year(int year, int month, int day) nogil:
- """
- Return the ordinal day-of-year for the given day.
-
- Parameters
- ----------
- year : int
- month : int
- day : int
-
- Returns
- -------
- day_of_year : int32_t
-
- Notes
- -----
- Assumes the inputs describe a valid date.
- """
- cdef:
- bint isleap
- int32_t mo_off
- int day_of_year
-
- isleap = is_leapyear(year)
-
- mo_off = month_offset[isleap * 13 + month - 1]
-
- day_of_year = mo_off + day
- return day_of_year
-
-
-# ---------------------------------------------------------------------
-# Business Helpers
-
-cpdef int get_lastbday(int year, int month) nogil:
- """
- Find the last day of the month that is a business day.
-
- Parameters
- ----------
- year : int
- month : int
-
- Returns
- -------
- last_bday : int
- """
- cdef:
- int wkday, days_in_month
-
- wkday = dayofweek(year, month, 1)
- days_in_month = get_days_in_month(year, month)
- return days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0)
-
-
-cpdef int get_firstbday(int year, int month) nogil:
- """
- Find the first day of the month that is a business day.
-
- Parameters
- ----------
- year : int
- month : int
-
- Returns
- -------
- first_bday : int
- """
- cdef:
- int first, wkday
-
- wkday = dayofweek(year, month, 1)
- first = 1
- if wkday == 5: # on Saturday
- first = 3
- elif wkday == 6: # on Sunday
- first = 2
- return first
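A pure-Python sketch of the month-length lookup used above, mirroring the 24-entry table indexed by 12 * is_leapyear(year) + month - 1 (example values for illustration only):

    days_per_month_array = [
        31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31,   # non-leap years
        31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31,   # leap years
    ]

    def is_leapyear(year: int) -> bool:
        return (year & 0x3) == 0 and (year % 100 != 0 or year % 400 == 0)

    def get_days_in_month(year: int, month: int) -> int:
        return days_per_month_array[12 * is_leapyear(year) + month - 1]

    print(get_days_in_month(2024, 2))   # 29
    print(get_days_in_month(1900, 2))   # 28: divisible by 100 but not by 400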
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pxd
deleted file mode 100644
index 1b321460558..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pxd
+++ /dev/null
@@ -1,62 +0,0 @@
-from cpython.datetime cimport (
- datetime,
- tzinfo,
-)
-from numpy cimport (
- int32_t,
- int64_t,
- ndarray,
-)
-
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- npy_datetimestruct,
-)
-from pandas._libs.tslibs.timestamps cimport _Timestamp
-from pandas._libs.tslibs.timezones cimport tz_compare
-
-
-cdef class _TSObject:
- cdef readonly:
- npy_datetimestruct dts # npy_datetimestruct
- int64_t value # numpy dt64
- tzinfo tzinfo
- bint fold
- NPY_DATETIMEUNIT creso
-
- cdef int64_t ensure_reso(self, NPY_DATETIMEUNIT creso, str val=*) except? -1
-
-
-cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
- bint dayfirst, bint yearfirst,
- int32_t nanos=*)
-
-cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz,
- int32_t nanos=*,
- NPY_DATETIMEUNIT reso=*)
-
-cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
- bint dayfirst=*,
- bint yearfirst=*)
-
-cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1
-
-cpdef datetime localize_pydatetime(datetime dt, tzinfo tz)
-cdef int64_t cast_from_unit(object ts, str unit, NPY_DATETIMEUNIT out_reso=*) except? -1
-cpdef (int64_t, int) precision_from_unit(str unit, NPY_DATETIMEUNIT out_reso=*)
-
-cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso)
-
-cdef tzinfo convert_timezone(
- tzinfo tz_in,
- tzinfo tz_out,
- bint found_naive,
- bint found_tz,
- bint utc_convert,
-)
-
-cdef int64_t parse_pydatetime(
- datetime val,
- npy_datetimestruct *dts,
- bint utc_convert,
-) except? -1
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyi
deleted file mode 100644
index d564d767f7f..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyi
+++ /dev/null
@@ -1,14 +0,0 @@
-from datetime import (
- datetime,
- tzinfo,
-)
-
-import numpy as np
-
-DT64NS_DTYPE: np.dtype
-TD64NS_DTYPE: np.dtype
-
-def precision_from_unit(
- unit: str,
-) -> tuple[int, int]: ... # (int64_t, _)
-def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyx
deleted file mode 100644
index 03a53b1b451..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/conversion.pyx
+++ /dev/null
@@ -1,779 +0,0 @@
-import numpy as np
-
-cimport numpy as cnp
-from libc.math cimport log10
-from numpy cimport (
- int32_t,
- int64_t,
-)
-
-cnp.import_array()
-
-# stdlib datetime imports
-
-from datetime import timezone
-
-from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- datetime,
- import_datetime,
- time,
- timedelta,
- tzinfo,
-)
-
-import_datetime()
-
-from pandas._libs.tslibs.base cimport ABCTimestamp
-from pandas._libs.tslibs.dtypes cimport (
- abbrev_to_npy_unit,
- get_supported_reso,
- periods_per_second,
-)
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- NPY_FR_us,
- check_dts_bounds,
- convert_reso,
- get_datetime64_unit,
- get_datetime64_value,
- get_implementation_bounds,
- npy_datetime,
- npy_datetimestruct,
- npy_datetimestruct_to_datetime,
- pandas_datetime_to_datetimestruct,
- pydatetime_to_dt64,
- pydatetime_to_dtstruct,
- string_to_dts,
-)
-
-from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
-
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
- c_nat_strings as nat_strings,
-)
-from pandas._libs.tslibs.parsing cimport parse_datetime_string
-from pandas._libs.tslibs.timestamps cimport _Timestamp
-from pandas._libs.tslibs.timezones cimport (
- get_utcoffset,
- is_utc,
-)
-from pandas._libs.tslibs.tzconversion cimport (
- Localizer,
- tz_localize_to_utc_single,
-)
-from pandas._libs.tslibs.util cimport (
- is_datetime64_object,
- is_float_object,
- is_integer_object,
-)
-
-# ----------------------------------------------------------------------
-# Constants
-
-DT64NS_DTYPE = np.dtype("M8[ns]")
-TD64NS_DTYPE = np.dtype("m8[ns]")
-
-
-# ----------------------------------------------------------------------
-# Unit Conversion Helpers
-
-cdef int64_t cast_from_unit(
- object ts,
- str unit,
- NPY_DATETIMEUNIT out_reso=NPY_FR_ns
-) except? -1:
- """
- Convert ts in the given unit to the output resolution (nanoseconds by default),
- rounding the fractional part of a float to our precision, p.
-
- Parameters
- ----------
- ts : int, float, or None
- unit : str
-
- Returns
- -------
- int64_t
- """
- cdef:
- int64_t m
- int p
-
- m, p = precision_from_unit(unit, out_reso)
-
- # just give me the unit back
- if ts is None:
- return m
-
- if unit in ["Y", "M"]:
- if is_float_object(ts) and not ts.is_integer():
- # GH#47267 it is clear that 2 "M" corresponds to 1970-02-01,
- # but not clear what 2.5 "M" corresponds to, so we will
- # disallow that case.
- raise ValueError(
- f"Conversion of non-round float with unit={unit} "
- "is ambiguous"
- )
- # GH#47266 go through np.datetime64 to avoid weird results e.g. with "Y"
- # and 150 we'd get 2120-01-01 09:00:00
- if is_float_object(ts):
- ts = int(ts)
- dt64obj = np.datetime64(ts, unit)
- return get_datetime64_nanos(dt64obj, out_reso)
-
- # cast the unit, multiply base/frac separately
- # to avoid precision issues from float -> int
- try:
- base = <int64_t>ts
- except OverflowError as err:
- raise OutOfBoundsDatetime(
- f"cannot convert input {ts} with the unit '{unit}'"
- ) from err
-
- frac = ts - base
- if p:
- frac = round(frac, p)
-
- try:
- return <int64_t>(base * m) + <int64_t>(frac * m)
- except OverflowError as err:
- raise OutOfBoundsDatetime(
- f"cannot convert input {ts} with the unit '{unit}'"
- ) from err
-
-
-cpdef inline (int64_t, int) precision_from_unit(
- str unit,
- NPY_DATETIMEUNIT out_reso=NPY_DATETIMEUNIT.NPY_FR_ns,
-):
- """
- Return the conversion multiplier from the given unit to the output resolution,
- plus the precision to which the fractional part should be rounded.
-
- Notes
- -----
- The caller is responsible for ensuring that the default value of "ns"
- takes the place of None.
- """
- cdef:
- int64_t m
- int64_t multiplier
- int p
- NPY_DATETIMEUNIT reso = abbrev_to_npy_unit(unit)
-
- multiplier = periods_per_second(out_reso)
-
- if reso == NPY_DATETIMEUNIT.NPY_FR_Y:
- # each 400 years we have 97 leap years, for an average of 97/400=.2425
- # extra days each year. We get 31556952 by writing
- # 3600*24*365.2425=31556952
- m = multiplier * 31556952
- elif reso == NPY_DATETIMEUNIT.NPY_FR_M:
- # 2629746 comes from dividing the "Y" case by 12.
- m = multiplier * 2629746
- elif reso == NPY_DATETIMEUNIT.NPY_FR_W:
- m = multiplier * 3600 * 24 * 7
- elif reso == NPY_DATETIMEUNIT.NPY_FR_D:
- m = multiplier * 3600 * 24
- elif reso == NPY_DATETIMEUNIT.NPY_FR_h:
- m = multiplier * 3600
- elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
- m = multiplier * 60
- elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
- m = multiplier
- elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
- m = multiplier // 1_000
- elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
- m = multiplier // 1_000_000
- elif reso == NPY_DATETIMEUNIT.NPY_FR_ns or reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
- m = multiplier // 1_000_000_000
- else:
- raise ValueError(f"cannot cast unit {unit}")
- p = <int>log10(m) # number of digits in 'm' minus 1
- return m, p
-
-
-cdef int64_t get_datetime64_nanos(object val, NPY_DATETIMEUNIT reso) except? -1:
- """
- Extract the value and unit from a np.datetime64 object, then convert the
- value to the given resolution if necessary.
- """
- cdef:
- npy_datetimestruct dts
- NPY_DATETIMEUNIT unit
- npy_datetime ival
-
- ival = get_datetime64_value(val)
- if ival == NPY_NAT:
- return NPY_NAT
-
- unit = get_datetime64_unit(val)
-
- if unit != reso:
- pandas_datetime_to_datetimestruct(ival, unit, &dts)
- check_dts_bounds(&dts, reso)
- ival = npy_datetimestruct_to_datetime(reso, &dts)
-
- return ival
-
-
-# ----------------------------------------------------------------------
-# _TSObject Conversion
-
-# lightweight C object to hold datetime & int64 pair
-cdef class _TSObject:
- # cdef:
- # npy_datetimestruct dts # npy_datetimestruct
- # int64_t value # numpy dt64
- # tzinfo tzinfo
- # bint fold
- # NPY_DATETIMEUNIT creso
-
- def __cinit__(self):
- # GH 25057. As per PEP 495, set fold to 0 by default
- self.fold = 0
- self.creso = NPY_FR_ns # default value
-
- cdef int64_t ensure_reso(self, NPY_DATETIMEUNIT creso, str val=None) except? -1:
- if self.creso != creso:
- try:
- self.value = convert_reso(self.value, self.creso, creso, False)
- except OverflowError as err:
- if val is not None:
- raise OutOfBoundsDatetime(
- f"Out of bounds nanosecond timestamp: {val}"
- ) from err
- raise OutOfBoundsDatetime from err
-
- self.creso = creso
- return self.value
-
-
-cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit,
- bint dayfirst, bint yearfirst, int32_t nanos=0):
- """
- Extract datetime and int64 from any of:
- - np.int64 (with unit providing a possible modifier)
- - np.datetime64
- - a float (with unit providing a possible modifier)
- - python int or long object (with unit providing a possible modifier)
- - iso8601 string object
- - python datetime object
- - another timestamp object
-
- Raises
- ------
- OutOfBoundsDatetime : ts cannot be converted within implementation bounds
- """
- cdef:
- _TSObject obj
- NPY_DATETIMEUNIT reso
-
- obj = _TSObject()
-
- if isinstance(ts, str):
- return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst)
-
- if ts is None or ts is NaT:
- obj.value = NPY_NAT
- elif is_datetime64_object(ts):
- reso = get_supported_reso(get_datetime64_unit(ts))
- obj.creso = reso
- obj.value = get_datetime64_nanos(ts, reso)
- if obj.value != NPY_NAT:
- pandas_datetime_to_datetimestruct(obj.value, reso, &obj.dts)
- elif is_integer_object(ts):
- try:
- ts = <int64_t>ts
- except OverflowError:
- # GH#26651 re-raise as OutOfBoundsDatetime
- raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp {ts}")
- if ts == NPY_NAT:
- obj.value = NPY_NAT
- else:
- if unit is None:
- unit = "ns"
- in_reso = abbrev_to_npy_unit(unit)
- reso = get_supported_reso(in_reso)
- ts = cast_from_unit(ts, unit, reso)
- obj.value = ts
- obj.creso = reso
- pandas_datetime_to_datetimestruct(ts, reso, &obj.dts)
- elif is_float_object(ts):
- if ts != ts or ts == NPY_NAT:
- obj.value = NPY_NAT
- else:
- ts = cast_from_unit(ts, unit)
- obj.value = ts
- pandas_datetime_to_datetimestruct(ts, NPY_FR_ns, &obj.dts)
- elif PyDateTime_Check(ts):
- if nanos == 0:
- if isinstance(ts, ABCTimestamp):
- reso = abbrev_to_npy_unit(ts.unit) # TODO: faster way to do this?
- else:
- # TODO: what if user explicitly passes nanos=0?
- reso = NPY_FR_us
- else:
- reso = NPY_FR_ns
- return convert_datetime_to_tsobject(ts, tz, nanos, reso=reso)
- elif PyDate_Check(ts):
- # Keep the converter the same as PyDateTime's
- # For date object we give the lowest supported resolution, i.e. "s"
- ts = datetime.combine(ts, time())
- return convert_datetime_to_tsobject(
- ts, tz, nanos=0, reso=NPY_DATETIMEUNIT.NPY_FR_s
- )
- else:
- from .period import Period
- if isinstance(ts, Period):
- raise ValueError("Cannot convert Period to Timestamp "
- "unambiguously. Use to_timestamp")
- raise TypeError(f"Cannot convert input [{ts}] of type {type(ts)} to "
- f"Timestamp")
-
- maybe_localize_tso(obj, tz, obj.creso)
- return obj
-
-
-cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso):
- if tz is not None:
- _localize_tso(obj, tz, reso)
-
- if obj.value != NPY_NAT:
- # check_overflows needs to run after _localize_tso
- check_dts_bounds(&obj.dts, reso)
- check_overflows(obj, reso)
-
-
-cdef _TSObject convert_datetime_to_tsobject(
- datetime ts,
- tzinfo tz,
- int32_t nanos=0,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-):
- """
- Convert a datetime (or Timestamp) input `ts`, along with optional timezone
- object `tz` to a _TSObject.
-
- The optional argument `nanos` allows for cases where datetime input
- needs to be supplemented with higher-precision information.
-
- Parameters
- ----------
- ts : datetime or Timestamp
- Value to be converted to _TSObject
- tz : tzinfo or None
- timezone for the timezone-aware output
- nanos : int32_t, default is 0
- nanoseconds to supplement the precision of the datetime input ts
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- obj : _TSObject
- """
- cdef:
- _TSObject obj = _TSObject()
- int64_t pps
-
- obj.creso = reso
- obj.fold = ts.fold
- if tz is not None:
-
- if ts.tzinfo is not None:
- # Convert the current timezone to the passed timezone
- ts = ts.astimezone(tz)
- pydatetime_to_dtstruct(ts, &obj.dts)
- obj.tzinfo = ts.tzinfo
- elif not is_utc(tz):
- ts = _localize_pydatetime(ts, tz)
- pydatetime_to_dtstruct(ts, &obj.dts)
- obj.tzinfo = ts.tzinfo
- else:
- # UTC
- pydatetime_to_dtstruct(ts, &obj.dts)
- obj.tzinfo = tz
- else:
- pydatetime_to_dtstruct(ts, &obj.dts)
- obj.tzinfo = ts.tzinfo
-
- if isinstance(ts, ABCTimestamp):
- obj.dts.ps = ts.nanosecond * 1000
-
- if nanos:
- obj.dts.ps = nanos * 1000
-
- obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts)
-
- if obj.tzinfo is not None and not is_utc(obj.tzinfo):
- offset = get_utcoffset(obj.tzinfo, ts)
- pps = periods_per_second(reso)
- obj.value -= int(offset.total_seconds() * pps)
-
- check_dts_bounds(&obj.dts, reso)
- check_overflows(obj, reso)
- return obj
-
-
-cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts,
- int tzoffset, tzinfo tz=None,
- NPY_DATETIMEUNIT reso=NPY_FR_ns):
- """
- Convert a datetimestruct `dts`, along with initial timezone offset
- `tzoffset` to a _TSObject (with timezone object `tz` - optional).
-
- Parameters
- ----------
- dts : npy_datetimestruct
- tzoffset : int
- tz : tzinfo or None
- timezone for the timezone-aware output.
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- obj : _TSObject
- """
- cdef:
- _TSObject obj = _TSObject()
- int64_t value # numpy dt64
- datetime dt
- Py_ssize_t pos
-
- value = npy_datetimestruct_to_datetime(reso, &dts)
- obj.dts = dts
- obj.tzinfo = timezone(timedelta(minutes=tzoffset))
- obj.value = tz_localize_to_utc_single(
- value, obj.tzinfo, ambiguous=None, nonexistent=None, creso=reso
- )
- obj.creso = reso
- if tz is None:
- check_overflows(obj, reso)
- return obj
-
- cdef:
- Localizer info = Localizer(tz, reso)
-
- # Infer fold from offset-adjusted obj.value
- # see PEP 495 https://www.python.org/dev/peps/pep-0495/#the-fold-attribute
- if info.use_utc:
- pass
- elif info.use_tzlocal:
- info.utc_val_to_local_val(obj.value, &pos, &obj.fold)
- elif info.use_dst and not info.use_pytz:
- # i.e. dateutil
- info.utc_val_to_local_val(obj.value, &pos, &obj.fold)
-
- # Keep the converter the same as PyDateTime's
- dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day,
- obj.dts.hour, obj.dts.min, obj.dts.sec,
- obj.dts.us, obj.tzinfo, fold=obj.fold)
- obj = convert_datetime_to_tsobject(
- dt, tz, nanos=obj.dts.ps // 1000)
- obj.ensure_reso(reso) # TODO: more performant to get reso right up front?
- return obj
-
-
-cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, str unit,
- bint dayfirst=False,
- bint yearfirst=False):
- """
- Convert a string input `ts`, along with an optional timezone object `tz`,
- to a _TSObject.
-
- The optional arguments `dayfirst` and `yearfirst` are passed to the
- dateutil parser.
-
- Parameters
- ----------
- ts : str
- Value to be converted to _TSObject
- tz : tzinfo or None
- timezone for the timezone-aware output
- unit : str or None
- dayfirst : bool, default False
- When parsing an ambiguous date string, interpret e.g. "3/4/1975" as
- April 3, as opposed to the standard US interpretation March 4.
- yearfirst : bool, default False
- When parsing an ambiguous date string, interpret e.g. "01/05/09"
- as "May 9, 2001", as opposed to the default "Jan 5, 2009"
-
- Returns
- -------
- obj : _TSObject
- """
- cdef:
- npy_datetimestruct dts
- int out_local = 0, out_tzoffset = 0, string_to_dts_failed
- datetime dt
- int64_t ival
- NPY_DATETIMEUNIT out_bestunit, reso
-
- if len(ts) == 0 or ts in nat_strings:
- obj = _TSObject()
- obj.value = NPY_NAT
- obj.tzinfo = tz
- return obj
- elif ts == "now":
- # Issue 9000, we short-circuit rather than going
- # into np_datetime_strings which returns utc
- dt = datetime.now(tz)
- elif ts == "today":
- # Issue 9000, we short-circuit rather than going
- # into np_datetime_strings which returns a normalized datetime
- dt = datetime.now(tz)
- # equiv: datetime.today().replace(tzinfo=tz)
- else:
- string_to_dts_failed = string_to_dts(
- ts, &dts, &out_bestunit, &out_local,
- &out_tzoffset, False
- )
- if not string_to_dts_failed:
- reso = get_supported_reso(out_bestunit)
- check_dts_bounds(&dts, reso)
- if out_local == 1:
- return _create_tsobject_tz_using_offset(
- dts, out_tzoffset, tz, reso
- )
- else:
- ival = npy_datetimestruct_to_datetime(reso, &dts)
- if tz is not None:
- # shift for _localize_tso
- ival = tz_localize_to_utc_single(
- ival, tz, ambiguous="raise", nonexistent=None, creso=reso
- )
- obj = _TSObject()
- obj.dts = dts
- obj.value = ival
- obj.creso = reso
- maybe_localize_tso(obj, tz, obj.creso)
- return obj
-
- dt = parse_datetime_string(
- ts, dayfirst=dayfirst, yearfirst=yearfirst, out_bestunit=&out_bestunit
- )
- reso = get_supported_reso(out_bestunit)
- return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=reso)
-
- return convert_datetime_to_tsobject(dt, tz)
-
-
-cdef check_overflows(_TSObject obj, NPY_DATETIMEUNIT reso=NPY_FR_ns):
- """
- Check that we haven't silently overflowed in timezone conversion
-
- Parameters
- ----------
- obj : _TSObject
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- None
-
- Raises
- ------
- OutOfBoundsDatetime
- """
- # GH#12677
- cdef:
- npy_datetimestruct lb, ub
-
- get_implementation_bounds(reso, &lb, &ub)
-
- if obj.dts.year == lb.year:
- if not (obj.value < 0):
- from pandas._libs.tslibs.timestamps import Timestamp
- fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} "
- f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}")
- raise OutOfBoundsDatetime(
- f"Converting {fmt} underflows past {Timestamp.min}"
- )
- elif obj.dts.year == ub.year:
- if not (obj.value > 0):
- from pandas._libs.tslibs.timestamps import Timestamp
- fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} "
- f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}")
- raise OutOfBoundsDatetime(
- f"Converting {fmt} overflows past {Timestamp.max}"
- )
-
-# ----------------------------------------------------------------------
-# Localization
-
-cdef void _localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso):
- """
- Given the UTC nanosecond timestamp in obj.value, find the wall-clock
- representation of that timestamp in the given timezone.
-
- Parameters
- ----------
- obj : _TSObject
- tz : tzinfo
- reso : NPY_DATETIMEUNIT
-
- Returns
- -------
- None
-
- Notes
- -----
- Sets obj.tzinfo inplace, alters obj.dts inplace.
- """
- cdef:
- int64_t local_val
- Py_ssize_t outpos = -1
- Localizer info = Localizer(tz, reso)
-
- assert obj.tzinfo is None
-
- if info.use_utc:
- pass
- elif obj.value == NPY_NAT:
- pass
- else:
- local_val = info.utc_val_to_local_val(obj.value, &outpos, &obj.fold)
-
- if info.use_pytz:
- # infer we went through a pytz path, will have outpos!=-1
- tz = tz._tzinfos[tz._transition_info[outpos]]
-
- pandas_datetime_to_datetimestruct(local_val, reso, &obj.dts)
-
- obj.tzinfo = tz
-
-
-cdef datetime _localize_pydatetime(datetime dt, tzinfo tz):
- """
- Take a datetime/Timestamp in UTC and localize it to timezone tz.
-
- NB: Unlike the public version, this treats datetime and Timestamp objects
- identically, i.e. discards nanos from Timestamps.
- It also assumes that the `tz` input is not None.
- """
- try:
- # datetime.replace with pytz may be incorrect result
- return tz.localize(dt)
- except AttributeError:
- return dt.replace(tzinfo=tz)
-
-
-cpdef inline datetime localize_pydatetime(datetime dt, tzinfo tz):
- """
- Take a datetime/Timestamp in UTC and localize it to timezone tz.
-
- Parameters
- ----------
- dt : datetime or Timestamp
- tz : tzinfo or None
-
- Returns
- -------
- localized : datetime or Timestamp
- """
- if tz is None:
- return dt
- elif isinstance(dt, ABCTimestamp):
- return dt.tz_localize(tz)
- return _localize_pydatetime(dt, tz)
-
-
-cdef tzinfo convert_timezone(
- tzinfo tz_in,
- tzinfo tz_out,
- bint found_naive,
- bint found_tz,
- bint utc_convert,
-):
- """
- Validate that ``tz_in`` can be converted/localized to ``tz_out``.
-
- Parameters
- ----------
- tz_in : tzinfo or None
- Timezone info of element being processed.
- tz_out : tzinfo or None
- Timezone info of output.
- found_naive : bool
- Whether a timezone-naive element has been found so far.
- found_tz : bool
- Whether a timezone-aware element has been found so far.
- utc_convert : bool
- Whether to convert/localize to UTC.
-
- Returns
- -------
- tz_info
- Timezone info of output.
-
- Raises
- ------
- ValueError
- If ``tz_in`` can't be converted/localized to ``tz_out``.
- """
- if tz_in is not None:
- if utc_convert:
- pass
- elif found_naive:
- raise ValueError("Tz-aware datetime.datetime "
- "cannot be converted to "
- "datetime64 unless utc=True")
- elif tz_out is not None and not tz_compare(tz_out, tz_in):
- raise ValueError("Tz-aware datetime.datetime "
- "cannot be converted to "
- "datetime64 unless utc=True")
- else:
- tz_out = tz_in
- else:
- if found_tz and not utc_convert:
- raise ValueError("Cannot mix tz-aware with "
- "tz-naive values")
- return tz_out
-
-
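
The branching in convert_timezone above reduces to a small set of rules; the following pure-Python restatement is illustrative only (resolve_tz and tz_equal are stand-in names, not pandas API):

    # Illustrative restatement of convert_timezone's rules (not pandas code);
    # tz_equal stands in for the real tz_compare helper.
    def resolve_tz(tz_in, tz_out, found_naive, found_tz, utc_convert,
                   tz_equal=lambda a, b: a == b):
        if tz_in is not None:
            if utc_convert:
                return tz_out                      # everything ends up in UTC anyway
            if found_naive:
                raise ValueError("Tz-aware datetime.datetime cannot be converted "
                                 "to datetime64 unless utc=True")
            if tz_out is not None and not tz_equal(tz_out, tz_in):
                raise ValueError("Tz-aware datetime.datetime cannot be converted "
                                 "to datetime64 unless utc=True")
            return tz_in                           # adopt the element's timezone
        if found_tz and not utc_convert:
            raise ValueError("Cannot mix tz-aware with tz-naive values")
        return tz_out
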
-cdef int64_t parse_pydatetime(
- datetime val,
- npy_datetimestruct *dts,
- bint utc_convert,
-) except? -1:
- """
- Convert pydatetime to datetime64.
-
- Parameters
- ----------
- val : datetime
- Element being processed.
- dts : *npy_datetimestruct
- Needed to use in pydatetime_to_dt64, which writes to it.
- utc_convert : bool
- Whether to convert/localize to UTC.
-
- Raises
- ------
- OutOfBoundsDatetime
- """
- cdef:
- _TSObject _ts
- int64_t result
-
- if val.tzinfo is not None:
- if utc_convert:
- _ts = convert_datetime_to_tsobject(val, None)
- _ts.ensure_reso(NPY_FR_ns)
- result = _ts.value
- else:
- _ts = convert_datetime_to_tsobject(val, None)
- _ts.ensure_reso(NPY_FR_ns)
- result = _ts.value
- else:
- if isinstance(val, _Timestamp):
- result = val.as_unit("ns")._value
- else:
- result = pydatetime_to_dt64(val, dts)
- check_dts_bounds(dts)
- return result
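
For reference, the string fast paths deleted earlier in this file's hunk (the NaT strings plus "now"/"today") are reachable through the public Timestamp constructor; a usage sketch, assuming a pandas 2.0.x install:

    import pandas as pd

    pd.Timestamp("nat")                       # any of the nat_strings -> pd.NaT
    pd.Timestamp("now", tz="Europe/Moscow")   # short-circuits to datetime.now(tz)
    pd.Timestamp("2005-01-01T12:00-05:00")    # embedded offset -> tz-aware Timestamp
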
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pxd
deleted file mode 100644
index 6c2871cd746..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pxd
+++ /dev/null
@@ -1,106 +0,0 @@
-from numpy cimport int64_t
-
-from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
-
-
-cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
-cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
-cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
-cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
-cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1
-cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso)
-
-cdef dict attrname_to_abbrevs
-cdef dict npy_unit_to_attrname
-cdef dict attrname_to_npy_unit
-
-cdef enum c_FreqGroup:
- # Mirrors FreqGroup in the .pyx file
- FR_ANN = 1000
- FR_QTR = 2000
- FR_MTH = 3000
- FR_WK = 4000
- FR_BUS = 5000
- FR_DAY = 6000
- FR_HR = 7000
- FR_MIN = 8000
- FR_SEC = 9000
- FR_MS = 10000
- FR_US = 11000
- FR_NS = 12000
- FR_UND = -10000 # undefined
-
-
-cdef enum c_Resolution:
- # Mirrors Resolution in the .pyx file
- RESO_NS = 0
- RESO_US = 1
- RESO_MS = 2
- RESO_SEC = 3
- RESO_MIN = 4
- RESO_HR = 5
- RESO_DAY = 6
- RESO_MTH = 7
- RESO_QTR = 8
- RESO_YR = 9
-
-
-cdef enum PeriodDtypeCode:
- # Annual freqs with various fiscal year ends.
- # eg, 2005 for A_FEB runs Mar 1, 2004 to Feb 28, 2005
- A = 1000 # Default alias
- A_DEC = 1000 # Annual - December year end
- A_JAN = 1001 # Annual - January year end
- A_FEB = 1002 # Annual - February year end
- A_MAR = 1003 # Annual - March year end
- A_APR = 1004 # Annual - April year end
- A_MAY = 1005 # Annual - May year end
- A_JUN = 1006 # Annual - June year end
- A_JUL = 1007 # Annual - July year end
- A_AUG = 1008 # Annual - August year end
- A_SEP = 1009 # Annual - September year end
- A_OCT = 1010 # Annual - October year end
- A_NOV = 1011 # Annual - November year end
-
- # Quarterly frequencies with various fiscal year ends.
- # eg, Q42005 for Q_OCT runs Aug 1, 2005 to Oct 31, 2005
- Q_DEC = 2000 # Quarterly - December year end
- Q_JAN = 2001 # Quarterly - January year end
- Q_FEB = 2002 # Quarterly - February year end
- Q_MAR = 2003 # Quarterly - March year end
- Q_APR = 2004 # Quarterly - April year end
- Q_MAY = 2005 # Quarterly - May year end
- Q_JUN = 2006 # Quarterly - June year end
- Q_JUL = 2007 # Quarterly - July year end
- Q_AUG = 2008 # Quarterly - August year end
- Q_SEP = 2009 # Quarterly - September year end
- Q_OCT = 2010 # Quarterly - October year end
- Q_NOV = 2011 # Quarterly - November year end
-
- M = 3000 # Monthly
-
- W_SUN = 4000 # Weekly - Sunday end of week
- W_MON = 4001 # Weekly - Monday end of week
- W_TUE = 4002 # Weekly - Tuesday end of week
- W_WED = 4003 # Weekly - Wednesday end of week
- W_THU = 4004 # Weekly - Thursday end of week
- W_FRI = 4005 # Weekly - Friday end of week
- W_SAT = 4006 # Weekly - Saturday end of week
-
- B = 5000 # Business days
- D = 6000 # Daily
- H = 7000 # Hourly
- T = 8000 # Minutely
- S = 9000 # Secondly
- L = 10000 # Millisecondly
- U = 11000 # Microsecondly
- N = 12000 # Nanosecondly
-
- UNDEFINED = -10_000
-
-
-cdef class PeriodDtypeBase:
- cdef readonly:
- PeriodDtypeCode _dtype_code
-
- cpdef int _get_to_timestamp_base(self)
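
The thousands-based layout of these codes is what lets the Cython layer recover the frequency group with integer arithmetic; a minimal sketch of that relationship (freq_group_code is a hypothetical helper name, not pandas API):

    def freq_group_code(period_dtype_code: int) -> int:
        # Mirrors PeriodDtypeBase._freq_group_code / FreqGroup.from_period_dtype_code
        return (period_dtype_code // 1000) * 1000

    assert freq_group_code(1002) == 1000   # A_FEB -> FR_ANN
    assert freq_group_code(2010) == 2000   # Q_OCT -> FR_QTR
    assert freq_group_code(4003) == 4000   # W_WED -> FR_WK
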
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyi
deleted file mode 100644
index b872241d79a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyi
+++ /dev/null
@@ -1,81 +0,0 @@
-from enum import Enum
-
-# These are not public API, but are exposed in the .pyi file because they
-# are imported in tests.
-_attrname_to_abbrevs: dict[str, str]
-_period_code_map: dict[str, int]
-
-def periods_per_day(reso: int) -> int: ...
-def periods_per_second(reso: int) -> int: ...
-def is_supported_unit(reso: int) -> bool: ...
-def npy_unit_to_abbrev(reso: int) -> str: ...
-def get_supported_reso(reso: int) -> int: ...
-def abbrev_to_npy_unit(abbrev: str) -> int: ...
-
-class PeriodDtypeBase:
- _dtype_code: int # PeriodDtypeCode
-
- # actually __cinit__
- def __new__(cls, code: int): ...
- @property
- def _freq_group_code(self) -> int: ...
- @property
- def _resolution_obj(self) -> Resolution: ...
- def _get_to_timestamp_base(self) -> int: ...
- @property
- def _freqstr(self) -> str: ...
-
-class FreqGroup(Enum):
- FR_ANN: int
- FR_QTR: int
- FR_MTH: int
- FR_WK: int
- FR_BUS: int
- FR_DAY: int
- FR_HR: int
- FR_MIN: int
- FR_SEC: int
- FR_MS: int
- FR_US: int
- FR_NS: int
- FR_UND: int
- @staticmethod
- def from_period_dtype_code(code: int) -> FreqGroup: ...
-
-class Resolution(Enum):
- RESO_NS: int
- RESO_US: int
- RESO_MS: int
- RESO_SEC: int
- RESO_MIN: int
- RESO_HR: int
- RESO_DAY: int
- RESO_MTH: int
- RESO_QTR: int
- RESO_YR: int
- def __lt__(self, other: Resolution) -> bool: ...
- def __ge__(self, other: Resolution) -> bool: ...
- @property
- def attrname(self) -> str: ...
- @classmethod
- def from_attrname(cls, attrname: str) -> Resolution: ...
- @classmethod
- def get_reso_from_freqstr(cls, freq: str) -> Resolution: ...
- @property
- def attr_abbrev(self) -> str: ...
-
-class NpyDatetimeUnit(Enum):
- NPY_FR_Y: int
- NPY_FR_M: int
- NPY_FR_W: int
- NPY_FR_D: int
- NPY_FR_h: int
- NPY_FR_m: int
- NPY_FR_s: int
- NPY_FR_ms: int
- NPY_FR_us: int
- NPY_FR_ns: int
- NPY_FR_ps: int
- NPY_FR_fs: int
- NPY_FR_as: int
- NPY_FR_GENERIC: int
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyx
deleted file mode 100644
index 699e8aba76d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/dtypes.pyx
+++ /dev/null
@@ -1,438 +0,0 @@
-# period frequency constants corresponding to scikits timeseries
-# originals
-from enum import Enum
-
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- get_conversion_factor,
-)
-
-
-cdef class PeriodDtypeBase:
- """
- Similar to an actual dtype, this contains all of the information
- describing a PeriodDtype in an integer code.
- """
- # cdef readonly:
- # PeriodDtypeCode _dtype_code
-
- def __cinit__(self, PeriodDtypeCode code):
- self._dtype_code = code
-
- def __eq__(self, other):
- if not isinstance(other, PeriodDtypeBase):
- return False
- if not isinstance(self, PeriodDtypeBase):
- # cython semantics, this is a reversed op
- return False
- return self._dtype_code == other._dtype_code
-
- @property
- def _freq_group_code(self) -> int:
- # See also: libperiod.get_freq_group
- return (self._dtype_code // 1000) * 1000
-
- @property
- def _resolution_obj(self) -> "Resolution":
- fgc = self._freq_group_code
- freq_group = FreqGroup(fgc)
- abbrev = _reverse_period_code_map[freq_group.value].split("-")[0]
- if abbrev == "B":
- return Resolution.RESO_DAY
- attrname = _abbrev_to_attrnames[abbrev]
- return Resolution.from_attrname(attrname)
-
- @property
- def _freqstr(self) -> str:
- # Will be passed to to_offset in Period._maybe_convert_freq
- return _reverse_period_code_map.get(self._dtype_code)
-
- cpdef int _get_to_timestamp_base(self):
- """
- Return the frequency code group to use as the base for to_timestamp,
- given this frequency code.
-
- Frequencies longer than daily map to the day freq code; hourly through
- secondly map to the second freq code.
-
- Returns
- -------
- int
- """
- base = <c_FreqGroup>self._dtype_code
- if base < FR_BUS:
- return FR_DAY
- elif FR_HR <= base <= FR_SEC:
- return FR_SEC
- return base
-
-
-_period_code_map = {
- # Annual freqs with various fiscal year ends.
- # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005
- "A-DEC": PeriodDtypeCode.A_DEC, # Annual - December year end
- "A-JAN": PeriodDtypeCode.A_JAN, # Annual - January year end
- "A-FEB": PeriodDtypeCode.A_FEB, # Annual - February year end
- "A-MAR": PeriodDtypeCode.A_MAR, # Annual - March year end
- "A-APR": PeriodDtypeCode.A_APR, # Annual - April year end
- "A-MAY": PeriodDtypeCode.A_MAY, # Annual - May year end
- "A-JUN": PeriodDtypeCode.A_JUN, # Annual - June year end
- "A-JUL": PeriodDtypeCode.A_JUL, # Annual - July year end
- "A-AUG": PeriodDtypeCode.A_AUG, # Annual - August year end
- "A-SEP": PeriodDtypeCode.A_SEP, # Annual - September year end
- "A-OCT": PeriodDtypeCode.A_OCT, # Annual - October year end
- "A-NOV": PeriodDtypeCode.A_NOV, # Annual - November year end
-
- # Quarterly frequencies with various fiscal year ends.
- # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005
- "Q-DEC": PeriodDtypeCode.Q_DEC, # Quarterly - December year end
- "Q-JAN": PeriodDtypeCode.Q_JAN, # Quarterly - January year end
- "Q-FEB": PeriodDtypeCode.Q_FEB, # Quarterly - February year end
- "Q-MAR": PeriodDtypeCode.Q_MAR, # Quarterly - March year end
- "Q-APR": PeriodDtypeCode.Q_APR, # Quarterly - April year end
- "Q-MAY": PeriodDtypeCode.Q_MAY, # Quarterly - May year end
- "Q-JUN": PeriodDtypeCode.Q_JUN, # Quarterly - June year end
- "Q-JUL": PeriodDtypeCode.Q_JUL, # Quarterly - July year end
- "Q-AUG": PeriodDtypeCode.Q_AUG, # Quarterly - August year end
- "Q-SEP": PeriodDtypeCode.Q_SEP, # Quarterly - September year end
- "Q-OCT": PeriodDtypeCode.Q_OCT, # Quarterly - October year end
- "Q-NOV": PeriodDtypeCode.Q_NOV, # Quarterly - November year end
-
- "M": PeriodDtypeCode.M, # Monthly
-
- "W-SUN": PeriodDtypeCode.W_SUN, # Weekly - Sunday end of week
- "W-MON": PeriodDtypeCode.W_MON, # Weekly - Monday end of week
- "W-TUE": PeriodDtypeCode.W_TUE, # Weekly - Tuesday end of week
- "W-WED": PeriodDtypeCode.W_WED, # Weekly - Wednesday end of week
- "W-THU": PeriodDtypeCode.W_THU, # Weekly - Thursday end of week
- "W-FRI": PeriodDtypeCode.W_FRI, # Weekly - Friday end of week
- "W-SAT": PeriodDtypeCode.W_SAT, # Weekly - Saturday end of week
-
- "B": PeriodDtypeCode.B, # Business days
- "D": PeriodDtypeCode.D, # Daily
- "H": PeriodDtypeCode.H, # Hourly
- "T": PeriodDtypeCode.T, # Minutely
- "S": PeriodDtypeCode.S, # Secondly
- "L": PeriodDtypeCode.L, # Millisecondly
- "U": PeriodDtypeCode.U, # Microsecondly
- "N": PeriodDtypeCode.N, # Nanosecondly
-}
-
-_reverse_period_code_map = {
- _period_code_map[key]: key for key in _period_code_map}
-
-# Yearly aliases; careful not to put these in _reverse_period_code_map
-_period_code_map.update({"Y" + key[1:]: _period_code_map[key]
- for key in _period_code_map
- if key.startswith("A-")})
-
-_period_code_map.update({
- "Q": 2000, # Quarterly - December year end (default quarterly)
- "A": PeriodDtypeCode.A, # Annual
- "W": 4000, # Weekly
- "C": 5000, # Custom Business Day
-})
-
-cdef set _month_names = {
- x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("A-")
-}
-
-# Map attribute-name resolutions to resolution abbreviations
-_attrname_to_abbrevs = {
- "year": "A",
- "quarter": "Q",
- "month": "M",
- "day": "D",
- "hour": "H",
- "minute": "T",
- "second": "S",
- "millisecond": "L",
- "microsecond": "U",
- "nanosecond": "N",
-}
-cdef dict attrname_to_abbrevs = _attrname_to_abbrevs
-cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()}
-
-
-class FreqGroup(Enum):
- # Mirrors c_FreqGroup in the .pxd file
- FR_ANN = c_FreqGroup.FR_ANN
- FR_QTR = c_FreqGroup.FR_QTR
- FR_MTH = c_FreqGroup.FR_MTH
- FR_WK = c_FreqGroup.FR_WK
- FR_BUS = c_FreqGroup.FR_BUS
- FR_DAY = c_FreqGroup.FR_DAY
- FR_HR = c_FreqGroup.FR_HR
- FR_MIN = c_FreqGroup.FR_MIN
- FR_SEC = c_FreqGroup.FR_SEC
- FR_MS = c_FreqGroup.FR_MS
- FR_US = c_FreqGroup.FR_US
- FR_NS = c_FreqGroup.FR_NS
- FR_UND = c_FreqGroup.FR_UND # undefined
-
- @staticmethod
- def from_period_dtype_code(code: int) -> "FreqGroup":
- # See also: PeriodDtypeBase._freq_group_code
- code = (code // 1000) * 1000
- return FreqGroup(code)
-
-
-class Resolution(Enum):
- RESO_NS = c_Resolution.RESO_NS
- RESO_US = c_Resolution.RESO_US
- RESO_MS = c_Resolution.RESO_MS
- RESO_SEC = c_Resolution.RESO_SEC
- RESO_MIN = c_Resolution.RESO_MIN
- RESO_HR = c_Resolution.RESO_HR
- RESO_DAY = c_Resolution.RESO_DAY
- RESO_MTH = c_Resolution.RESO_MTH
- RESO_QTR = c_Resolution.RESO_QTR
- RESO_YR = c_Resolution.RESO_YR
-
- def __lt__(self, other):
- return self.value < other.value
-
- def __ge__(self, other):
- return self.value >= other.value
-
- @property
- def attr_abbrev(self) -> str:
- # string that we can pass to to_offset
- return _attrname_to_abbrevs[self.attrname]
-
- @property
- def attrname(self) -> str:
- """
- Return datetime attribute name corresponding to this Resolution.
-
- Examples
- --------
- >>> Resolution.RESO_SEC.attrname
- 'second'
- """
- return _reso_str_map[self.value]
-
- @classmethod
- def from_attrname(cls, attrname: str) -> "Resolution":
- """
- Return the Resolution corresponding to the given attribute name.
-
- Examples
- --------
- >>> Resolution.from_attrname('second')
- <Resolution.RESO_SEC: 3>
-
- >>> Resolution.from_attrname('second') == Resolution.RESO_SEC
- True
- """
- return cls(_str_reso_map[attrname])
-
- @classmethod
- def get_reso_from_freqstr(cls, freq: str) -> "Resolution":
- """
- Return the Resolution corresponding to the given frequency string.
-
- `freq` is given by the `offset.freqstr` for some DateOffset object.
-
- Examples
- --------
- >>> Resolution.get_reso_from_freqstr('H')
- <Resolution.RESO_HR: 5>
-
- >>> Resolution.get_reso_from_freqstr('H') == Resolution.RESO_HR
- True
- """
- try:
- attr_name = _abbrev_to_attrnames[freq]
- except KeyError:
- # For quarterly and yearly resolutions, we need to chop off
- # a month string.
- split_freq = freq.split("-")
- if len(split_freq) != 2:
- raise
- if split_freq[1] not in _month_names:
- # i.e. we want e.g. "Q-DEC", not "Q-INVALID"
- raise
- attr_name = _abbrev_to_attrnames[split_freq[0]]
-
- return cls.from_attrname(attr_name)
-
-
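
A short usage sketch of the Resolution API defined above (internal API; assumes pandas 2.0.x, where this module still ships):

    from pandas._libs.tslibs.dtypes import Resolution

    assert Resolution.get_reso_from_freqstr("Q-DEC") is Resolution.RESO_QTR  # month suffix stripped
    assert Resolution.RESO_SEC < Resolution.RESO_DAY    # __lt__ compares enum values
    assert Resolution.RESO_HR.attrname == "hour"
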
-class NpyDatetimeUnit(Enum):
- """
- Python-space analogue to NPY_DATETIMEUNIT.
- """
- NPY_FR_Y = NPY_DATETIMEUNIT.NPY_FR_Y
- NPY_FR_M = NPY_DATETIMEUNIT.NPY_FR_M
- NPY_FR_W = NPY_DATETIMEUNIT.NPY_FR_W
- NPY_FR_D = NPY_DATETIMEUNIT.NPY_FR_D
- NPY_FR_h = NPY_DATETIMEUNIT.NPY_FR_h
- NPY_FR_m = NPY_DATETIMEUNIT.NPY_FR_m
- NPY_FR_s = NPY_DATETIMEUNIT.NPY_FR_s
- NPY_FR_ms = NPY_DATETIMEUNIT.NPY_FR_ms
- NPY_FR_us = NPY_DATETIMEUNIT.NPY_FR_us
- NPY_FR_ns = NPY_DATETIMEUNIT.NPY_FR_ns
- NPY_FR_ps = NPY_DATETIMEUNIT.NPY_FR_ps
- NPY_FR_fs = NPY_DATETIMEUNIT.NPY_FR_fs
- NPY_FR_as = NPY_DATETIMEUNIT.NPY_FR_as
- NPY_FR_GENERIC = NPY_DATETIMEUNIT.NPY_FR_GENERIC
-
-
-cpdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso):
- # If we have an unsupported reso, return the nearest supported reso.
- if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
- # TODO: or raise ValueError? trying this gives unraisable errors, but
- # "except? -1" breaks at compile-time for unknown reasons
- return NPY_DATETIMEUNIT.NPY_FR_ns
- if reso < NPY_DATETIMEUNIT.NPY_FR_s:
- return NPY_DATETIMEUNIT.NPY_FR_s
- elif reso > NPY_DATETIMEUNIT.NPY_FR_ns:
- return NPY_DATETIMEUNIT.NPY_FR_ns
- return reso
-
-
-def is_supported_unit(NPY_DATETIMEUNIT reso):
- return (
- reso == NPY_DATETIMEUNIT.NPY_FR_ns
- or reso == NPY_DATETIMEUNIT.NPY_FR_us
- or reso == NPY_DATETIMEUNIT.NPY_FR_ms
- or reso == NPY_DATETIMEUNIT.NPY_FR_s
- )
-
-
-cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
- if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
- # generic -> default to nanoseconds
- return "ns"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_us:
- return "us"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_ms:
- return "ms"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_s:
- return "s"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_m:
- return "m"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_h:
- return "h"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_D:
- return "D"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_W:
- return "W"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_M:
- return "M"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_Y:
- return "Y"
-
- # Checks for not-really-supported units go at the end, as we don't expect
- # to see these often
- elif unit == NPY_DATETIMEUNIT.NPY_FR_ps:
- return "ps"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_fs:
- return "fs"
- elif unit == NPY_DATETIMEUNIT.NPY_FR_as:
- return "as"
-
- else:
- raise NotImplementedError(unit)
-
-
-cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
- if abbrev == "Y":
- return NPY_DATETIMEUNIT.NPY_FR_Y
- elif abbrev == "M":
- return NPY_DATETIMEUNIT.NPY_FR_M
- elif abbrev == "W":
- return NPY_DATETIMEUNIT.NPY_FR_W
- elif abbrev == "D" or abbrev == "d":
- return NPY_DATETIMEUNIT.NPY_FR_D
- elif abbrev == "h":
- return NPY_DATETIMEUNIT.NPY_FR_h
- elif abbrev == "m":
- return NPY_DATETIMEUNIT.NPY_FR_m
- elif abbrev == "s":
- return NPY_DATETIMEUNIT.NPY_FR_s
- elif abbrev == "ms":
- return NPY_DATETIMEUNIT.NPY_FR_ms
- elif abbrev == "us":
- return NPY_DATETIMEUNIT.NPY_FR_us
- elif abbrev == "ns":
- return NPY_DATETIMEUNIT.NPY_FR_ns
- elif abbrev == "ps":
- return NPY_DATETIMEUNIT.NPY_FR_ps
- elif abbrev == "fs":
- return NPY_DATETIMEUNIT.NPY_FR_fs
- elif abbrev == "as":
- return NPY_DATETIMEUNIT.NPY_FR_as
- elif abbrev is None:
- return NPY_DATETIMEUNIT.NPY_FR_GENERIC
- else:
- raise ValueError(f"Unrecognized unit {abbrev}")
-
-
-cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil:
- """
- Convert the freq to the corresponding NPY_DATETIMEUNIT to pass
- to npy_datetimestruct_to_datetime.
- """
- if freq == FR_MTH:
- return NPY_DATETIMEUNIT.NPY_FR_M
- elif freq == FR_DAY:
- return NPY_DATETIMEUNIT.NPY_FR_D
- elif freq == FR_HR:
- return NPY_DATETIMEUNIT.NPY_FR_h
- elif freq == FR_MIN:
- return NPY_DATETIMEUNIT.NPY_FR_m
- elif freq == FR_SEC:
- return NPY_DATETIMEUNIT.NPY_FR_s
- elif freq == FR_MS:
- return NPY_DATETIMEUNIT.NPY_FR_ms
- elif freq == FR_US:
- return NPY_DATETIMEUNIT.NPY_FR_us
- elif freq == FR_NS:
- return NPY_DATETIMEUNIT.NPY_FR_ns
- elif freq == FR_UND:
- # Default to Day
- return NPY_DATETIMEUNIT.NPY_FR_D
-
-
-# TODO: use in _matplotlib.converter?
-cpdef int64_t periods_per_day(
- NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns
-) except? -1:
- """
- How many of the given time units fit into a single day?
- """
- return get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, reso)
-
-
-cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1:
- return get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, reso)
-
-
-cdef dict _reso_str_map = {
- Resolution.RESO_NS.value: "nanosecond",
- Resolution.RESO_US.value: "microsecond",
- Resolution.RESO_MS.value: "millisecond",
- Resolution.RESO_SEC.value: "second",
- Resolution.RESO_MIN.value: "minute",
- Resolution.RESO_HR.value: "hour",
- Resolution.RESO_DAY.value: "day",
- Resolution.RESO_MTH.value: "month",
- Resolution.RESO_QTR.value: "quarter",
- Resolution.RESO_YR.value: "year",
-}
-
-cdef dict _str_reso_map = {v: k for k, v in _reso_str_map.items()}
-
-cdef dict npy_unit_to_attrname = {
- NPY_DATETIMEUNIT.NPY_FR_Y: "year",
- NPY_DATETIMEUNIT.NPY_FR_M: "month",
- NPY_DATETIMEUNIT.NPY_FR_D: "day",
- NPY_DATETIMEUNIT.NPY_FR_h: "hour",
- NPY_DATETIMEUNIT.NPY_FR_m: "minute",
- NPY_DATETIMEUNIT.NPY_FR_s: "second",
- NPY_DATETIMEUNIT.NPY_FR_ms: "millisecond",
- NPY_DATETIMEUNIT.NPY_FR_us: "microsecond",
- NPY_DATETIMEUNIT.NPY_FR_ns: "nanosecond",
-}
-cdef dict attrname_to_npy_unit = {v: k for k, v in npy_unit_to_attrname.items()}
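
The clamping done by get_supported_reso above can be summarised in unit abbreviations; a standalone sketch under the assumption that only s/ms/us/ns are supported resolutions (clamp_unit is an illustrative name, not pandas API):

    _ORDER = ("Y", "M", "W", "D", "h", "m", "s", "ms", "us", "ns", "ps", "fs", "as")

    def clamp_unit(abbrev: str) -> str:
        # Coarser than seconds -> "s"; finer than nanoseconds -> "ns".
        # (NPY_FR_GENERIC additionally maps to "ns" in the real function.)
        idx = _ORDER.index(abbrev)
        if idx < _ORDER.index("s"):
            return "s"
        if idx > _ORDER.index("ns"):
            return "ns"
        return abbrev

    assert clamp_unit("D") == "s"
    assert clamp_unit("ps") == "ns"
    assert clamp_unit("ms") == "ms"
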
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyi
deleted file mode 100644
index c6cfd44e9f6..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyi
+++ /dev/null
@@ -1,62 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-def build_field_sarray(
- dtindex: npt.NDArray[np.int64], # const int64_t[:]
- reso: int, # NPY_DATETIMEUNIT
-) -> np.ndarray: ...
-def month_position_check(fields, weekdays) -> str | None: ...
-def get_date_name_field(
- dtindex: npt.NDArray[np.int64], # const int64_t[:]
- field: str,
- locale: str | None = ...,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.object_]: ...
-def get_start_end_field(
- dtindex: npt.NDArray[np.int64],
- field: str,
- freqstr: str | None = ...,
- month_kw: int = ...,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.bool_]: ...
-def get_date_field(
- dtindex: npt.NDArray[np.int64], # const int64_t[:]
- field: str,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.int32]: ...
-def get_timedelta_field(
- tdindex: npt.NDArray[np.int64], # const int64_t[:]
- field: str,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.int32]: ...
-def get_timedelta_days(
- tdindex: npt.NDArray[np.int64], # const int64_t[:]
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.int64]: ...
-def isleapyear_arr(
- years: np.ndarray,
-) -> npt.NDArray[np.bool_]: ...
-def build_isocalendar_sarray(
- dtindex: npt.NDArray[np.int64], # const int64_t[:]
- reso: int, # NPY_DATETIMEUNIT
-) -> np.ndarray: ...
-def _get_locale_names(name_type: str, locale: str | None = ...): ...
-
-class RoundTo:
- @property
- def MINUS_INFTY(self) -> int: ...
- @property
- def PLUS_INFTY(self) -> int: ...
- @property
- def NEAREST_HALF_EVEN(self) -> int: ...
- @property
- def NEAREST_HALF_PLUS_INFTY(self) -> int: ...
- @property
- def NEAREST_HALF_MINUS_INFTY(self) -> int: ...
-
-def round_nsint64(
- values: npt.NDArray[np.int64],
- mode: RoundTo,
- nanos: int,
-) -> npt.NDArray[np.int64]: ...
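
These field helpers back the public DatetimeIndex accessors; a usage sketch at the public-API level (assuming pandas is installed):

    import pandas as pd

    idx = pd.date_range("2022-12-29", periods=5, freq="D")
    idx.is_month_end   # array([False, False,  True, False, False])
    idx.day_name()     # Index(['Thursday', 'Friday', 'Saturday', 'Sunday', 'Monday'], dtype='object')
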
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyx
deleted file mode 100644
index 9a145311eaf..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/fields.pyx
+++ /dev/null
@@ -1,792 +0,0 @@
-"""
-Functions for accessing attributes of Timestamp/datetime64/datetime-like
-objects and arrays
-"""
-from locale import LC_TIME
-
-from _strptime import LocaleTime
-
-cimport cython
-from cython cimport Py_ssize_t
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- int8_t,
- int32_t,
- int64_t,
- ndarray,
- uint32_t,
-)
-
-cnp.import_array()
-
-from pandas._config.localization import set_locale
-
-from pandas._libs.tslibs.ccalendar import (
- DAYS_FULL,
- MONTHS_FULL,
-)
-
-from pandas._libs.tslibs.ccalendar cimport (
- dayofweek,
- get_day_of_year,
- get_days_in_month,
- get_firstbday,
- get_iso_calendar,
- get_lastbday,
- get_week_of_year,
- iso_calendar_t,
-)
-from pandas._libs.tslibs.nattype cimport NPY_NAT
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- npy_datetimestruct,
- pandas_datetime_to_datetimestruct,
- pandas_timedelta_to_timedeltastruct,
- pandas_timedeltastruct,
-)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def build_field_sarray(const int64_t[:] dtindex, NPY_DATETIMEUNIT reso):
- """
- Convert an int64-based datetime representation to a structured array of fields.
- """
- cdef:
- Py_ssize_t i, count = len(dtindex)
- npy_datetimestruct dts
- ndarray[int32_t] years, months, days, hours, minutes, seconds, mus
-
- sa_dtype = [
- ("Y", "i4"), # year
- ("M", "i4"), # month
- ("D", "i4"), # day
- ("h", "i4"), # hour
- ("m", "i4"), # min
- ("s", "i4"), # second
- ("u", "i4"), # microsecond
- ]
-
- out = np.empty(count, dtype=sa_dtype)
-
- years = out["Y"]
- months = out["M"]
- days = out["D"]
- hours = out["h"]
- minutes = out["m"]
- seconds = out["s"]
- mus = out["u"]
-
- for i in range(count):
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- years[i] = dts.year
- months[i] = dts.month
- days[i] = dts.day
- hours[i] = dts.hour
- minutes[i] = dts.min
- seconds[i] = dts.sec
- mus[i] = dts.us
-
- return out
-
-
-def month_position_check(fields, weekdays) -> str | None:
- cdef:
- int32_t daysinmonth, y, m, d
- bint calendar_end = True
- bint business_end = True
- bint calendar_start = True
- bint business_start = True
- bint cal
- int32_t[:] years = fields["Y"]
- int32_t[:] months = fields["M"]
- int32_t[:] days = fields["D"]
-
- for y, m, d, wd in zip(years, months, days, weekdays):
- if calendar_start:
- calendar_start &= d == 1
- if business_start:
- business_start &= d == 1 or (d <= 3 and wd == 0)
-
- if calendar_end or business_end:
- daysinmonth = get_days_in_month(y, m)
- cal = d == daysinmonth
- if calendar_end:
- calendar_end &= cal
- if business_end:
- business_end &= cal or (daysinmonth - d < 3 and wd == 4)
- elif not calendar_start and not business_start:
- break
-
- if calendar_end:
- return "ce"
- elif business_end:
- return "be"
- elif calendar_start:
- return "cs"
- elif business_start:
- return "bs"
- else:
- return None
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def get_date_name_field(
- const int64_t[:] dtindex,
- str field,
- object locale=None,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-):
- """
- Given an int64-based datetime index, return an array of date-name
- strings for the requested field (e.g. day_name).
- """
- cdef:
- Py_ssize_t i, count = dtindex.shape[0]
- ndarray[object] out, names
- npy_datetimestruct dts
- int dow
-
- out = np.empty(count, dtype=object)
-
- if field == "day_name":
- if locale is None:
- names = np.array(DAYS_FULL, dtype=np.object_)
- else:
- names = np.array(_get_locale_names("f_weekday", locale),
- dtype=np.object_)
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = np.nan
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- dow = dayofweek(dts.year, dts.month, dts.day)
- out[i] = names[dow].capitalize()
-
- elif field == "month_name":
- if locale is None:
- names = np.array(MONTHS_FULL, dtype=np.object_)
- else:
- names = np.array(_get_locale_names("f_month", locale),
- dtype=np.object_)
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = np.nan
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = names[dts.month].capitalize()
-
- else:
- raise ValueError(f"Field {field} not supported")
-
- return out
-
-
-cdef bint _is_on_month(int month, int compare_month, int modby) nogil:
- """
- Analogous to DateOffset.is_on_offset, checking only the month part of a date.
- """
- if modby == 1:
- return True
- elif modby == 3:
- return (month - compare_month) % 3 == 0
- else:
- return month == compare_month
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def get_start_end_field(
- const int64_t[:] dtindex,
- str field,
- str freqstr=None,
- int month_kw=12,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-):
- """
- Given an int64-based datetime index, return an array of indicators
- of whether the timestamps fall at the start/end of the month/quarter/year
- (as defined by the frequency).
-
- Parameters
- ----------
- dtindex : ndarray[int64]
- field : str
- freqstr : str or None, default None
- month_kw : int, default 12
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- ndarray[bool]
- """
- cdef:
- Py_ssize_t i
- int count = dtindex.shape[0]
- bint is_business = 0
- int end_month = 12
- int start_month = 1
- ndarray[int8_t] out
- npy_datetimestruct dts
- int compare_month, modby
-
- out = np.zeros(count, dtype="int8")
-
- if freqstr:
- if freqstr == "C":
- raise ValueError(f"Custom business days is not supported by {field}")
- is_business = freqstr[0] == "B"
-
- # YearBegin(), BYearBegin() use month = starting month of year.
- # QuarterBegin(), BQuarterBegin() use startingMonth = starting
- # month of year. Other offsets use month, startingMonth as ending
- # month of year.
-
- if (freqstr[0:2] in ["MS", "QS", "AS"]) or (
- freqstr[1:3] in ["MS", "QS", "AS"]):
- end_month = 12 if month_kw == 1 else month_kw - 1
- start_month = month_kw
- else:
- end_month = month_kw
- start_month = (end_month % 12) + 1
- else:
- end_month = 12
- start_month = 1
-
- compare_month = start_month if "start" in field else end_month
- if "month" in field:
- modby = 1
- elif "quarter" in field:
- modby = 3
- else:
- modby = 12
-
- if field in ["is_month_start", "is_quarter_start", "is_year_start"]:
- if is_business:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = 0
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
-
- if _is_on_month(dts.month, compare_month, modby) and (
- dts.day == get_firstbday(dts.year, dts.month)):
- out[i] = 1
-
- else:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = 0
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
-
- if _is_on_month(dts.month, compare_month, modby) and dts.day == 1:
- out[i] = 1
-
- elif field in ["is_month_end", "is_quarter_end", "is_year_end"]:
- if is_business:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = 0
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
-
- if _is_on_month(dts.month, compare_month, modby) and (
- dts.day == get_lastbday(dts.year, dts.month)):
- out[i] = 1
-
- else:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = 0
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
-
- if _is_on_month(dts.month, compare_month, modby) and (
- dts.day == get_days_in_month(dts.year, dts.month)):
- out[i] = 1
-
- else:
- raise ValueError(f"Field {field} not supported")
-
- return out.view(bool)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def get_date_field(
- const int64_t[:] dtindex,
- str field,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-):
- """
- Given an int64-based datetime index, extract the requested field
- (year, month, etc.) and return an array of these values.
- """
- cdef:
- Py_ssize_t i, count = len(dtindex)
- ndarray[int32_t] out
- npy_datetimestruct dts
-
- out = np.empty(count, dtype="i4")
-
- if field == "Y":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.year
- return out
-
- elif field == "M":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.month
- return out
-
- elif field == "D":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.day
- return out
-
- elif field == "h":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.hour
- # TODO: can we de-dup with period.pyx <accessor>s?
- return out
-
- elif field == "m":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.min
- return out
-
- elif field == "s":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.sec
- return out
-
- elif field == "us":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.us
- return out
-
- elif field == "ns":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.ps // 1000
- return out
- elif field == "doy":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = get_day_of_year(dts.year, dts.month, dts.day)
- return out
-
- elif field == "dow":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dayofweek(dts.year, dts.month, dts.day)
- return out
-
- elif field == "woy":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = get_week_of_year(dts.year, dts.month, dts.day)
- return out
-
- elif field == "q":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = dts.month
- out[i] = ((out[i] - 1) // 3) + 1
- return out
-
- elif field == "dim":
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- out[i] = get_days_in_month(dts.year, dts.month)
- return out
- elif field == "is_leap_year":
- return isleapyear_arr(get_date_field(dtindex, "Y", reso=reso))
-
- raise ValueError(f"Field {field} not supported")
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def get_timedelta_field(
- const int64_t[:] tdindex,
- str field,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-):
- """
- Given an int64-based timedelta index, extract the requested field
- (seconds, microseconds, or nanoseconds) and return an array of these values.
- """
- cdef:
- Py_ssize_t i, count = len(tdindex)
- ndarray[int32_t] out
- pandas_timedeltastruct tds
-
- out = np.empty(count, dtype="i4")
-
- if field == "seconds":
- with nogil:
- for i in range(count):
- if tdindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds)
- out[i] = tds.seconds
- return out
-
- elif field == "microseconds":
- with nogil:
- for i in range(count):
- if tdindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds)
- out[i] = tds.microseconds
- return out
-
- elif field == "nanoseconds":
- with nogil:
- for i in range(count):
- if tdindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds)
- out[i] = tds.nanoseconds
- return out
-
- raise ValueError(f"Field {field} not supported")
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def get_timedelta_days(
- const int64_t[:] tdindex,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-):
- """
- Given an int64-based timedelta index, extract the days field
- and return an array of these values.
- """
- cdef:
- Py_ssize_t i, count = len(tdindex)
- ndarray[int64_t] out
- pandas_timedeltastruct tds
-
- out = np.empty(count, dtype="i8")
-
- with nogil:
- for i in range(count):
- if tdindex[i] == NPY_NAT:
- out[i] = -1
- continue
-
- pandas_timedelta_to_timedeltastruct(tdindex[i], reso, &tds)
- out[i] = tds.days
- return out
-
-
-cpdef isleapyear_arr(ndarray years):
- """vectorized version of isleapyear; NaT evaluates as False"""
- cdef:
- ndarray[int8_t] out
-
- out = np.zeros(len(years), dtype="int8")
- out[np.logical_or(years % 400 == 0,
- np.logical_and(years % 4 == 0,
- years % 100 > 0))] = 1
- return out.view(bool)
-
-
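
The leap-year mask above is plain modular arithmetic; an equivalent NumPy one-liner for comparison (a sketch, not the pandas function):

    import numpy as np

    years = np.array([1900, 2000, 2020, 2023])
    is_leap = (years % 400 == 0) | ((years % 4 == 0) & (years % 100 != 0))
    # array([False,  True,  True, False])
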
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def build_isocalendar_sarray(const int64_t[:] dtindex, NPY_DATETIMEUNIT reso):
- """
- Given an int64-based datetime array, return the ISO 8601 year, week, and day
- as a structured array.
- """
- cdef:
- Py_ssize_t i, count = len(dtindex)
- npy_datetimestruct dts
- ndarray[uint32_t] iso_years, iso_weeks, days
- iso_calendar_t ret_val
-
- sa_dtype = [
- ("year", "u4"),
- ("week", "u4"),
- ("day", "u4"),
- ]
-
- out = np.empty(count, dtype=sa_dtype)
-
- iso_years = out["year"]
- iso_weeks = out["week"]
- days = out["day"]
-
- with nogil:
- for i in range(count):
- if dtindex[i] == NPY_NAT:
- ret_val = 0, 0, 0
- else:
- pandas_datetime_to_datetimestruct(dtindex[i], reso, &dts)
- ret_val = get_iso_calendar(dts.year, dts.month, dts.day)
-
- iso_years[i] = ret_val[0]
- iso_weeks[i] = ret_val[1]
- days[i] = ret_val[2]
- return out
-
-
-def _get_locale_names(name_type: str, locale: object = None):
- """
- Return a list of localized day or month names.
-
- Parameters
- ----------
- name_type : str
- Attribute of LocaleTime() in which to return localized names.
- locale : str
-
- Returns
- -------
- list of locale names
- """
- with set_locale(locale, LC_TIME):
- return getattr(LocaleTime(), name_type)
-
-
-# ---------------------------------------------------------------------
-# Rounding
-
-
-class RoundTo:
- """
- Enumeration defining the available rounding modes.
-
- Attributes
- ----------
- MINUS_INFTY
- round towards -∞, or floor [2]_
- PLUS_INFTY
- round towards +∞, or ceil [3]_
- NEAREST_HALF_EVEN
- round to nearest, tie-break half to even [6]_
- NEAREST_HALF_MINUS_INFTY
- round to nearest, tie-break half to -∞ [5]_
- NEAREST_HALF_PLUS_INFTY
- round to nearest, tie-break half to +∞ [4]_
-
-
- References
- ----------
- .. [1] "Rounding - Wikipedia"
- https://en.wikipedia.org/wiki/Rounding
- .. [2] "Rounding down"
- https://en.wikipedia.org/wiki/Rounding#Rounding_down
- .. [3] "Rounding up"
- https://en.wikipedia.org/wiki/Rounding#Rounding_up
- .. [4] "Round half up"
- https://en.wikipedia.org/wiki/Rounding#Round_half_up
- .. [5] "Round half down"
- https://en.wikipedia.org/wiki/Rounding#Round_half_down
- .. [6] "Round half to even"
- https://en.wikipedia.org/wiki/Rounding#Round_half_to_even
- """
- @property
- def MINUS_INFTY(self) -> int:
- return 0
-
- @property
- def PLUS_INFTY(self) -> int:
- return 1
-
- @property
- def NEAREST_HALF_EVEN(self) -> int:
- return 2
-
- @property
- def NEAREST_HALF_PLUS_INFTY(self) -> int:
- return 3
-
- @property
- def NEAREST_HALF_MINUS_INFTY(self) -> int:
- return 4
-
-
-cdef ndarray[int64_t] _floor_int64(const int64_t[:] values, int64_t unit):
- cdef:
- Py_ssize_t i, n = len(values)
- ndarray[int64_t] result = np.empty(n, dtype="i8")
- int64_t res, value
-
- with cython.overflowcheck(True):
- for i in range(n):
- value = values[i]
- if value == NPY_NAT:
- res = NPY_NAT
- else:
- res = value - value % unit
- result[i] = res
-
- return result
-
-
-cdef ndarray[int64_t] _ceil_int64(const int64_t[:] values, int64_t unit):
- cdef:
- Py_ssize_t i, n = len(values)
- ndarray[int64_t] result = np.empty(n, dtype="i8")
- int64_t res, value
-
- with cython.overflowcheck(True):
- for i in range(n):
- value = values[i]
-
- if value == NPY_NAT:
- res = NPY_NAT
- else:
- remainder = value % unit
- if remainder == 0:
- res = value
- else:
- res = value + (unit - remainder)
-
- result[i] = res
-
- return result
-
-
-cdef ndarray[int64_t] _rounddown_int64(values, int64_t unit):
- return _ceil_int64(values - unit // 2, unit)
-
-
-cdef ndarray[int64_t] _roundup_int64(values, int64_t unit):
- return _floor_int64(values + unit // 2, unit)
-
-
-def round_nsint64(values: np.ndarray, mode: RoundTo, nanos: int) -> np.ndarray:
- """
- Apply the given rounding mode at the given frequency.
-
- Parameters
- ----------
- values : np.ndarray[int64_t]
- mode : instance of `RoundTo` enumeration
- nanos : np.int64
- Freq to round to, expressed in nanoseconds
-
- Returns
- -------
- np.ndarray[int64_t]
- """
- cdef:
- int64_t unit = nanos
-
- if mode == RoundTo.MINUS_INFTY:
- return _floor_int64(values, unit)
- elif mode == RoundTo.PLUS_INFTY:
- return _ceil_int64(values, unit)
- elif mode == RoundTo.NEAREST_HALF_MINUS_INFTY:
- return _rounddown_int64(values, unit)
- elif mode == RoundTo.NEAREST_HALF_PLUS_INFTY:
- return _roundup_int64(values, unit)
- elif mode == RoundTo.NEAREST_HALF_EVEN:
- # for odd unit there is no need of a tie break
- if unit % 2:
- return _rounddown_int64(values, unit)
- quotient, remainder = np.divmod(values, unit)
- mask = np.logical_or(
- remainder > (unit // 2),
- np.logical_and(remainder == (unit // 2), quotient % 2)
- )
- quotient[mask] += 1
- return quotient * unit
-
- # if/elif above should catch all rounding modes defined in enum 'RoundTo':
- # if flow of control arrives here, it is a bug
- raise ValueError("round_nsint64 called with an unrecognized rounding mode")
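
The NEAREST_HALF_EVEN branch of round_nsint64 is the subtle one; this standalone NumPy sketch reproduces its divmod-based tie-breaking (round_half_even is an illustrative name, not pandas API):

    import numpy as np

    def round_half_even(values: np.ndarray, unit: int) -> np.ndarray:
        # Round to the nearest multiple of `unit`; exact halves go to the even quotient.
        quotient, remainder = np.divmod(values, unit)
        roll_up = (remainder > unit // 2) | (
            (unit % 2 == 0) & (remainder == unit // 2) & (quotient % 2 == 1)
        )
        return (quotient + roll_up) * unit

    ns = np.array([1499, 1500, 2500, 2501], dtype="i8")
    round_half_even(ns, 1000)   # -> array([1000, 2000, 2000, 3000])
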
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pxd
deleted file mode 100644
index 32705aa6331..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pxd
+++ /dev/null
@@ -1,18 +0,0 @@
-from cpython.datetime cimport datetime
-from numpy cimport int64_t
-
-
-cdef int64_t NPY_NAT
-
-cdef set c_nat_strings
-
-cdef class _NaT(datetime):
- cdef readonly:
- int64_t _value
-
-cdef _NaT c_NaT
-
-
-cdef bint checknull_with_nat(object val)
-cdef bint is_dt64nat(object val)
-cdef bint is_td64nat(object val)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyi
deleted file mode 100644
index 04f89437104..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyi
+++ /dev/null
@@ -1,132 +0,0 @@
-from datetime import (
- datetime,
- timedelta,
- tzinfo as _tzinfo,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs.period import Period
-
-NaT: NaTType
-iNaT: int
-nat_strings: set[str]
-
-_NaTComparisonTypes = datetime | timedelta | Period | np.datetime64 | np.timedelta64
-
-class _NatComparison:
- def __call__(self, other: _NaTComparisonTypes) -> bool: ...
-
-class NaTType:
- _value: np.int64
- @property
- def value(self) -> int: ...
- @property
- def asm8(self) -> np.datetime64: ...
- def to_datetime64(self) -> np.datetime64: ...
- def to_numpy(
- self, dtype: np.dtype | str | None = ..., copy: bool = ...
- ) -> np.datetime64 | np.timedelta64: ...
- @property
- def is_leap_year(self) -> bool: ...
- @property
- def is_month_start(self) -> bool: ...
- @property
- def is_quarter_start(self) -> bool: ...
- @property
- def is_year_start(self) -> bool: ...
- @property
- def is_month_end(self) -> bool: ...
- @property
- def is_quarter_end(self) -> bool: ...
- @property
- def is_year_end(self) -> bool: ...
- @property
- def day_of_year(self) -> float: ...
- @property
- def dayofyear(self) -> float: ...
- @property
- def days_in_month(self) -> float: ...
- @property
- def daysinmonth(self) -> float: ...
- @property
- def day_of_week(self) -> float: ...
- @property
- def dayofweek(self) -> float: ...
- @property
- def week(self) -> float: ...
- @property
- def weekofyear(self) -> float: ...
- def day_name(self) -> float: ...
- def month_name(self) -> float: ...
- def weekday(self) -> float: ...
- def isoweekday(self) -> float: ...
- def total_seconds(self) -> float: ...
- def today(self, *args, **kwargs) -> NaTType: ...
- def now(self, *args, **kwargs) -> NaTType: ...
- def to_pydatetime(self) -> NaTType: ...
- def date(self) -> NaTType: ...
- def round(self) -> NaTType: ...
- def floor(self) -> NaTType: ...
- def ceil(self) -> NaTType: ...
- @property
- def tzinfo(self) -> None: ...
- @property
- def tz(self) -> None: ...
- def tz_convert(self, tz: _tzinfo | str | None) -> NaTType: ...
- def tz_localize(
- self,
- tz: _tzinfo | str | None,
- ambiguous: str = ...,
- nonexistent: str = ...,
- ) -> NaTType: ...
- def replace(
- self,
- year: int | None = ...,
- month: int | None = ...,
- day: int | None = ...,
- hour: int | None = ...,
- minute: int | None = ...,
- second: int | None = ...,
- microsecond: int | None = ...,
- nanosecond: int | None = ...,
- tzinfo: _tzinfo | None = ...,
- fold: int | None = ...,
- ) -> NaTType: ...
- @property
- def year(self) -> float: ...
- @property
- def quarter(self) -> float: ...
- @property
- def month(self) -> float: ...
- @property
- def day(self) -> float: ...
- @property
- def hour(self) -> float: ...
- @property
- def minute(self) -> float: ...
- @property
- def second(self) -> float: ...
- @property
- def millisecond(self) -> float: ...
- @property
- def microsecond(self) -> float: ...
- @property
- def nanosecond(self) -> float: ...
- # inject Timedelta properties
- @property
- def days(self) -> float: ...
- @property
- def microseconds(self) -> float: ...
- @property
- def nanoseconds(self) -> float: ...
- # inject Period properties
- @property
- def qyear(self) -> float: ...
- def __eq__(self, other: object) -> bool: ...
- def __ne__(self, other: object) -> bool: ...
- __lt__: _NatComparison
- __le__: _NatComparison
- __gt__: _NatComparison
- __ge__: _NatComparison
- def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ...
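
The comparison stubs above encode NaT's NaN-like semantics; a quick sketch of the resulting behaviour at the public level (assuming pandas is installed):

    import pandas as pd

    assert not (pd.NaT == pd.NaT)                     # equality with anything is False
    assert pd.NaT != pd.NaT                           # inequality is True
    assert not (pd.NaT < pd.Timestamp("2020-01-01"))  # ordering comparisons are False
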
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyx
deleted file mode 100644
index ff07f5d7993..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/nattype.pyx
+++ /dev/null
@@ -1,1245 +0,0 @@
-from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- PyDelta_Check,
- datetime,
- import_datetime,
- timedelta,
-)
-
-import_datetime()
-from cpython.object cimport (
- Py_EQ,
- Py_NE,
- PyObject_RichCompare,
-)
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport int64_t
-
-cnp.import_array()
-
-cimport pandas._libs.tslibs.util as util
-from pandas._libs.tslibs.np_datetime cimport (
- get_datetime64_value,
- get_timedelta64_value,
-)
-
-# ----------------------------------------------------------------------
-# Constants
-nat_strings = {"NaT", "nat", "NAT", "nan", "NaN", "NAN"}
-cdef set c_nat_strings = nat_strings
-
-cdef int64_t NPY_NAT = util.get_nat()
-iNaT = NPY_NAT # python-visible constant
-
-# ----------------------------------------------------------------------
-
-
-def _make_nan_func(func_name: str, doc: str):
- def f(*args, **kwargs):
- return np.nan
- f.__name__ = func_name
- f.__doc__ = doc
- return f
-
-
-def _make_nat_func(func_name: str, doc: str):
- def f(*args, **kwargs):
- return c_NaT
- f.__name__ = func_name
- f.__doc__ = doc
- return f
-
-
-def _make_error_func(func_name: str, cls):
- def f(*args, **kwargs):
- raise ValueError(f"NaTType does not support {func_name}")
-
- f.__name__ = func_name
- if isinstance(cls, str):
- # passed the literal docstring directly
- f.__doc__ = cls
- elif cls is not None:
- f.__doc__ = getattr(cls, func_name).__doc__
- return f
-
-
-cdef _nat_divide_op(self, other):
- if PyDelta_Check(other) or util.is_timedelta64_object(other) or other is c_NaT:
- return np.nan
- if util.is_integer_object(other) or util.is_float_object(other):
- return c_NaT
- return NotImplemented
-
-
-cdef _nat_rdivide_op(self, other):
- if PyDelta_Check(other):
- return np.nan
- return NotImplemented
-
-
-def __nat_unpickle(*args):
- # return constant defined in the module
- return c_NaT
-
-# ----------------------------------------------------------------------
-
-
-cdef class _NaT(datetime):
- # cdef readonly:
- # int64_t value
-
- # higher than np.ndarray and np.matrix
- __array_priority__ = 100
-
- def __richcmp__(_NaT self, object other, int op):
- if util.is_datetime64_object(other) or PyDateTime_Check(other):
- # We treat NaT as datetime-like for this comparison
- return op == Py_NE
-
- elif util.is_timedelta64_object(other) or PyDelta_Check(other):
- # We treat NaT as timedelta-like for this comparison
- return op == Py_NE
-
- elif util.is_array(other):
- if other.dtype.kind in "mM":
- result = np.empty(other.shape, dtype=np.bool_)
- result.fill(op == Py_NE)
- elif other.dtype.kind == "O":
- result = np.array([PyObject_RichCompare(self, x, op) for x in other])
- elif op == Py_EQ:
- result = np.zeros(other.shape, dtype=bool)
- elif op == Py_NE:
- result = np.ones(other.shape, dtype=bool)
- else:
- return NotImplemented
- return result
-
- elif PyDate_Check(other):
- # GH#39151 don't defer to datetime.date object
- if op == Py_EQ:
- return False
- if op == Py_NE:
- return True
- raise TypeError("Cannot compare NaT with datetime.date object")
-
- return NotImplemented
-
- def __add__(self, other):
- if self is not c_NaT:
- # TODO(cython3): remove this it moved to __radd__
- # cython __radd__ semantics
- self, other = other, self
-
- if PyDateTime_Check(other):
- return c_NaT
- elif PyDelta_Check(other):
- return c_NaT
- elif util.is_datetime64_object(other) or util.is_timedelta64_object(other):
- return c_NaT
-
- elif util.is_integer_object(other):
- # For Period compat
- return c_NaT
-
- elif util.is_array(other):
- if other.dtype.kind in "mM":
- # If we are adding to datetime64, we treat NaT as timedelta
- # Either way, result dtype is datetime64
- result = np.empty(other.shape, dtype="datetime64[ns]")
- result.fill("NaT")
- return result
- raise TypeError(f"Cannot add NaT to ndarray with dtype {other.dtype}")
-
- # Includes Period, DateOffset going through here
- return NotImplemented
-
- def __radd__(self, other):
- return self.__add__(other)
-
- def __sub__(self, other):
- # Duplicate some logic from _Timestamp.__sub__ to avoid needing
- # to subclass; allows us to @final(_Timestamp.__sub__)
- cdef:
- bint is_rsub = False
-
- if self is not c_NaT:
- # cython __rsub__ semantics
- # TODO(cython3): remove __rsub__ logic from here
- self, other = other, self
- is_rsub = True
-
- if PyDateTime_Check(other):
- return c_NaT
- elif PyDelta_Check(other):
- return c_NaT
- elif util.is_datetime64_object(other) or util.is_timedelta64_object(other):
- return c_NaT
-
- elif util.is_integer_object(other):
- # For Period compat
- return c_NaT
-
- elif util.is_array(other):
- if other.dtype.kind == "m":
- if not is_rsub:
- # NaT - timedelta64 we treat NaT as datetime64, so result
- # is datetime64
- result = np.empty(other.shape, dtype="datetime64[ns]")
- result.fill("NaT")
- return result
-
- # __rsub__ logic here
- # TODO(cython3): remove this, move above code out of
- # ``if not is_rsub`` block
- # timedelta64 - NaT we have to treat NaT as timedelta64
- # for this to be meaningful, and the result is timedelta64
- result = np.empty(other.shape, dtype="timedelta64[ns]")
- result.fill("NaT")
- return result
-
- elif other.dtype.kind == "M":
- # We treat NaT as a datetime, so regardless of whether this is
- # NaT - other or other - NaT, the result is timedelta64
- result = np.empty(other.shape, dtype="timedelta64[ns]")
- result.fill("NaT")
- return result
-
- raise TypeError(
- f"Cannot subtract NaT from ndarray with dtype {other.dtype}"
- )
-
- # Includes Period, DateOffset going through here
- return NotImplemented
-
- def __rsub__(self, other):
- if util.is_array(other):
- if other.dtype.kind == "m":
- # timedelta64 - NaT we have to treat NaT as timedelta64
- # for this to be meaningful, and the result is timedelta64
- result = np.empty(other.shape, dtype="timedelta64[ns]")
- result.fill("NaT")
- return result
-
- elif other.dtype.kind == "M":
- # We treat NaT as a datetime, so regardless of whether this is
- # NaT - other or other - NaT, the result is timedelta64
- result = np.empty(other.shape, dtype="timedelta64[ns]")
- result.fill("NaT")
- return result
- # the remaining cases are symmetric: swapping operands is fine even for
- # subtraction, because the result is NaT either way
- return self.__sub__(other)
-
- def __pos__(self):
- return NaT
-
- def __neg__(self):
- return NaT
-
- def __truediv__(self, other):
- return _nat_divide_op(self, other)
-
- def __floordiv__(self, other):
- return _nat_divide_op(self, other)
-
- def __mul__(self, other):
- if util.is_integer_object(other) or util.is_float_object(other):
- return NaT
- return NotImplemented
-
- @property
- def asm8(self) -> np.datetime64:
- return np.datetime64(NPY_NAT, "ns")
-
- def to_datetime64(self) -> np.datetime64:
- """
- Return a numpy.datetime64 object with 'ns' precision.
- """
- return np.datetime64("NaT", "ns")
-
- def to_numpy(self, dtype=None, copy=False) -> np.datetime64 | np.timedelta64:
- """
- Convert the Timestamp to a NumPy datetime64 or timedelta64.
-
- With the default 'dtype', this is an alias method for `NaT.to_datetime64()`.
-
- The copy parameter is available here only for compatibility. Its value
- will not affect the return value.
-
- Returns
- -------
- numpy.datetime64 or numpy.timedelta64
-
- See Also
- --------
- DatetimeIndex.to_numpy : Similar method for DatetimeIndex.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.to_numpy()
- numpy.datetime64('2020-03-14T15:32:52.192548651')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.to_numpy()
- numpy.datetime64('NaT')
-
- >>> pd.NaT.to_numpy("m8[ns]")
- numpy.timedelta64('NaT','ns')
- """
- if dtype is not None:
- # GH#44460
- dtype = np.dtype(dtype)
- if dtype.kind == "M":
- return np.datetime64("NaT").astype(dtype)
- elif dtype.kind == "m":
- return np.timedelta64("NaT").astype(dtype)
- else:
- raise ValueError(
- "NaT.to_numpy dtype must be a datetime64 dtype, timedelta64 "
- "dtype, or None."
- )
- return self.to_datetime64()
-
- def __repr__(self) -> str:
- return "NaT"
-
- def __str__(self) -> str:
- return "NaT"
-
- def isoformat(self, sep: str = "T", timespec: str = "auto") -> str:
- # This allows Timestamp(ts.isoformat()) to always correctly roundtrip.
- return "NaT"
-
- def __hash__(self) -> int:
- return NPY_NAT
-
- @property
- def is_leap_year(self) -> bool:
- return False
-
- @property
- def is_month_start(self) -> bool:
- return False
-
- @property
- def is_quarter_start(self) -> bool:
- return False
-
- @property
- def is_year_start(self) -> bool:
- return False
-
- @property
- def is_month_end(self) -> bool:
- return False
-
- @property
- def is_quarter_end(self) -> bool:
- return False
-
- @property
- def is_year_end(self) -> bool:
- return False
-
-
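
A behavioural sketch of the __add__/__sub__/__rsub__ rules implemented above (public-level usage, assuming pandas and numpy are installed):

    import numpy as np
    import pandas as pd

    pd.NaT + pd.Timedelta("1D")            # NaT
    pd.Timestamp("2020-01-01") - pd.NaT    # NaT
    stamps = np.array(["2020-01-01"], dtype="datetime64[ns]")
    stamps - pd.NaT                        # array(['NaT'], dtype='timedelta64[ns]')
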
-class NaTType(_NaT):
- """
- (N)ot-(A)-(T)ime, the time equivalent of NaN.
- """
-
- def __new__(cls):
- cdef _NaT base
-
- base = _NaT.__new__(cls, 1, 1, 1)
- base._value = NPY_NAT
-
- return base
-
- @property
- def value(self) -> int:
- return self._value
-
- def __reduce_ex__(self, protocol):
- # python 3.6 compat
- # https://bugs.python.org/issue28730
- # now __reduce_ex__ is defined and higher priority than __reduce__
- return self.__reduce__()
-
- def __reduce__(self):
- return (__nat_unpickle, (None, ))
-
- def __rtruediv__(self, other):
- return _nat_rdivide_op(self, other)
-
- def __rfloordiv__(self, other):
- return _nat_rdivide_op(self, other)
-
- def __rmul__(self, other):
- if util.is_integer_object(other) or util.is_float_object(other):
- return c_NaT
- return NotImplemented
-
- # ----------------------------------------------------------------------
- # inject the Timestamp field properties
- # these by definition return np.nan
-
- year = property(fget=lambda self: np.nan)
- quarter = property(fget=lambda self: np.nan)
- month = property(fget=lambda self: np.nan)
- day = property(fget=lambda self: np.nan)
- hour = property(fget=lambda self: np.nan)
- minute = property(fget=lambda self: np.nan)
- second = property(fget=lambda self: np.nan)
- millisecond = property(fget=lambda self: np.nan)
- microsecond = property(fget=lambda self: np.nan)
- nanosecond = property(fget=lambda self: np.nan)
-
- week = property(fget=lambda self: np.nan)
- dayofyear = property(fget=lambda self: np.nan)
- day_of_year = property(fget=lambda self: np.nan)
- weekofyear = property(fget=lambda self: np.nan)
- days_in_month = property(fget=lambda self: np.nan)
- daysinmonth = property(fget=lambda self: np.nan)
- dayofweek = property(fget=lambda self: np.nan)
- day_of_week = property(fget=lambda self: np.nan)
-
- # inject Timedelta properties
- days = property(fget=lambda self: np.nan)
- seconds = property(fget=lambda self: np.nan)
- microseconds = property(fget=lambda self: np.nan)
- nanoseconds = property(fget=lambda self: np.nan)
-
- # inject pd.Period properties
- qyear = property(fget=lambda self: np.nan)
-
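For context, a short demonstration of the injected accessors (any recent pandas): every field answers with NaN instead of raising, so code that inspects Timestamp, Timedelta, or Period fields keeps working on missing values.

import pandas as pd

print(pd.NaT.year, pd.NaT.quarter, pd.NaT.nanosecond)   # nan nan nan
print(pd.NaT.days, pd.NaT.qyear)                        # nan nan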
- # ----------------------------------------------------------------------
- # GH9513: NaT methods (except to_datetime64) should raise, return np.nan,
- # or return NaT. Below we create such functions for binding to NaTType.
- # These are the ones that can get their docstrings from datetime.
-
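The _make_nan_func, _make_nat_func, and _make_error_func factories used below are defined earlier in this module, outside the excerpt. A minimal, simplified sketch of the pattern they implement follows; the real versions also handle docstring formatting and accept a class (e.g. datetime) whose docstring is reused, and the exact error message here is illustrative only.

import numpy as np
from pandas import NaT

def _make_nan_func(name, doc):
    # e.g. NaT.weekday(), NaT.month_name(): answer with NaN
    def f(*args, **kwargs):
        return np.nan
    f.__name__ = name
    f.__doc__ = doc
    return f

def _make_nat_func(name, doc):
    # e.g. NaT.round(), NaT.tz_localize(): propagate NaT
    def f(*args, **kwargs):
        return NaT
    f.__name__ = name
    f.__doc__ = doc
    return f

def _make_error_func(name, doc):
    # e.g. NaT.ctime(), NaT.timetuple(): refuse outright
    def f(*args, **kwargs):
        raise ValueError(f"NaTType does not support {name}")
    f.__name__ = name
    f.__doc__ = doc   # the real factory can also reuse a class docstring
    return f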
- # nan methods
- weekday = _make_nan_func(
- "weekday",
- """
- Return the day of the week represented by the date.
-
- Monday == 0 ... Sunday == 6.
- """,
- )
- isoweekday = _make_nan_func(
- "isoweekday",
- """
- Return the day of the week represented by the date.
-
- Monday == 1 ... Sunday == 7.
- """,
- )
- total_seconds = _make_nan_func("total_seconds", timedelta.total_seconds.__doc__)
- month_name = _make_nan_func(
- "month_name",
- """
- Return the month name of the Timestamp with the specified locale.
-
- Parameters
- ----------
- locale : str, default None (English locale)
- Locale determining the language in which to return the month name.
-
- Returns
- -------
- str
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.month_name()
- 'March'
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.month_name()
- nan
- """,
- )
- day_name = _make_nan_func(
- "day_name",
- """
- Return the day name of the Timestamp with the specified locale.
-
- Parameters
- ----------
- locale : str, default None (English locale)
- Locale determining the language in which to return the day name.
-
- Returns
- -------
- str
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.day_name()
- 'Saturday'
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.day_name()
- nan
- """,
- )
- # _nat_methods
- date = _make_nat_func("date", datetime.date.__doc__)
-
- utctimetuple = _make_error_func("utctimetuple", datetime)
- timetz = _make_error_func("timetz", datetime)
- timetuple = _make_error_func("timetuple", datetime)
- isocalendar = _make_error_func("isocalendar", datetime)
- dst = _make_error_func("dst", datetime)
- ctime = _make_error_func("ctime", datetime)
- time = _make_error_func("time", datetime)
- toordinal = _make_error_func("toordinal", datetime)
- tzname = _make_error_func("tzname", datetime)
- utcoffset = _make_error_func("utcoffset", datetime)
-
- # "fromisocalendar" was introduced in 3.8
- fromisocalendar = _make_error_func("fromisocalendar", datetime)
-
- # ----------------------------------------------------------------------
- # The remaining methods have docstrings copy/pasted from the analogous
- # Timestamp methods.
-
- strftime = _make_error_func(
- "strftime",
- """
- Return a formatted string of the Timestamp.
-
- Parameters
- ----------
- format : str
- Format string to convert Timestamp to string.
- See strftime documentation for more information on the format string:
- https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.strftime('%Y-%m-%d %X')
- '2020-03-14 15:32:52'
- """,
- )
-
- strptime = _make_error_func(
- "strptime",
- """
- Timestamp.strptime(string, format)
-
- Function is not implemented. Use pd.to_datetime().
- """,
- )
-
- utcfromtimestamp = _make_error_func(
- "utcfromtimestamp",
- """
- Timestamp.utcfromtimestamp(ts)
-
- Construct a timezone-aware UTC datetime from a POSIX timestamp.
-
- Notes
- -----
- Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
- in returning a timezone-aware object.
-
- Examples
- --------
- >>> pd.Timestamp.utcfromtimestamp(1584199972)
- Timestamp('2020-03-14 15:32:52+0000', tz='UTC')
- """,
- )
- fromtimestamp = _make_error_func(
- "fromtimestamp",
- """
- Timestamp.fromtimestamp(ts)
-
- Transform timestamp[, tz] to tz's local time from POSIX timestamp.
-
- Examples
- --------
- >>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP
- Timestamp('2020-03-14 15:32:52')
-
- Note that the output may change depending on your local time.
- """,
- )
- combine = _make_error_func(
- "combine",
- """
- Timestamp.combine(date, time)
-
- Combine a date and a time into a single datetime object.
-
- Examples
- --------
- >>> from datetime import date, time
- >>> pd.Timestamp.combine(date(2020, 3, 14), time(15, 30, 15))
- Timestamp('2020-03-14 15:30:15')
- """,
- )
- utcnow = _make_error_func(
- "utcnow",
- """
- Timestamp.utcnow()
-
- Return a new Timestamp representing the current UTC date and time.
-
- Examples
- --------
- >>> pd.Timestamp.utcnow() # doctest: +SKIP
- Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC')
- """,
- )
-
- timestamp = _make_error_func(
- "timestamp",
- """
- Return POSIX timestamp as float.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548')
- >>> ts.timestamp()
- 1584199972.192548
- """
- )
-
- # GH9513: NaT methods (except to_datetime64) should raise, return np.nan,
- # or return NaT. Below we create such functions for binding to NaTType.
- astimezone = _make_error_func(
- "astimezone",
- """
- Convert timezone-aware Timestamp to another time zone.
-
- Parameters
- ----------
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone which the Timestamp will be converted to.
- None will remove the timezone, leaving a naive Timestamp in UTC.
-
- Returns
- -------
- converted : Timestamp
-
- Raises
- ------
- TypeError
- If Timestamp is tz-naive.
-
- Examples
- --------
- Create a timestamp object with UTC timezone:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC')
- >>> ts
- Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC')
-
- Change to Tokyo timezone:
-
- >>> ts.tz_convert(tz='Asia/Tokyo')
- Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo')
-
- Can also use ``astimezone``:
-
- >>> ts.astimezone(tz='Asia/Tokyo')
- Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.tz_convert(tz='Asia/Tokyo')
- NaT
- """,
- )
- fromordinal = _make_error_func(
- "fromordinal",
- """
- Construct a timestamp from a proleptic Gregorian ordinal.
-
- Parameters
- ----------
- ordinal : int
- Date corresponding to a proleptic Gregorian ordinal.
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone for the Timestamp.
-
- Notes
- -----
- By definition there cannot be any tz info on the ordinal itself.
-
- Examples
- --------
- >>> pd.Timestamp.fromordinal(737425)
- Timestamp('2020-01-01 00:00:00')
- """,
- )
-
- # _nat_methods
- to_pydatetime = _make_nat_func(
- "to_pydatetime",
- """
- Convert a Timestamp object to a native Python datetime object.
-
- If warn=True, issue a warning if the nanosecond component is nonzero.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548')
- >>> ts.to_pydatetime()
- datetime.datetime(2020, 3, 14, 15, 32, 52, 192548)
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.to_pydatetime()
- NaT
- """,
- )
-
- now = _make_nat_func(
- "now",
- """
- Return a new Timestamp representing the current time, localized to tz.
-
- Parameters
- ----------
- tz : str or timezone object, default None
- Timezone to localize to.
-
- Examples
- --------
- >>> pd.Timestamp.now() # doctest: +SKIP
- Timestamp('2020-11-16 22:06:16.378782')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.now()
- NaT
- """,
- )
- today = _make_nat_func(
- "today",
- """
- Return the current time in the local timezone.
-
- This differs from datetime.today() in that it can be localized to a
- passed timezone.
-
- Parameters
- ----------
- tz : str or timezone object, default None
- Timezone to localize to.
-
- Examples
- --------
- >>> pd.Timestamp.today() # doctest: +SKIP
- Timestamp('2020-11-16 22:37:39.969883')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.today()
- NaT
- """,
- )
- round = _make_nat_func(
- "round",
- """
- Round the Timestamp to the specified resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the rounding resolution.
- ambiguous : bool or {'raise', 'NaT'}, default 'raise'
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
-timedelta}, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Returns
- -------
- a new Timestamp rounded to the given resolution of `freq`
-
- Raises
- ------
- ValueError if the freq cannot be converted.
-
- Notes
- -----
- If the Timestamp has a timezone, rounding will take place relative to the
- local ("wall") time and re-localized to the same timezone. When rounding
- near daylight saving time, use ``nonexistent`` and ``ambiguous`` to
- control the re-localization behavior.
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
-
- A timestamp can be rounded using multiple frequency units:
-
- >>> ts.round(freq='H') # hour
- Timestamp('2020-03-14 16:00:00')
-
- >>> ts.round(freq='T') # minute
- Timestamp('2020-03-14 15:33:00')
-
- >>> ts.round(freq='S') # seconds
- Timestamp('2020-03-14 15:32:52')
-
- >>> ts.round(freq='L') # milliseconds
- Timestamp('2020-03-14 15:32:52.193000')
-
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
-
- >>> ts.round(freq='5T')
- Timestamp('2020-03-14 15:35:00')
-
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
-
- >>> ts.round(freq='1H30T')
- Timestamp('2020-03-14 15:00:00')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.round()
- NaT
-
- When rounding near a daylight saving time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam")
-
- >>> ts_tz.round("H", ambiguous=False)
- Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
-
- >>> ts_tz.round("H", ambiguous=True)
- Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
- """,
- )
- floor = _make_nat_func(
- "floor",
- """
- Return a new Timestamp floored to this resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the flooring resolution.
- ambiguous : bool or {'raise', 'NaT'}, default 'raise'
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
-timedelta}, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Raises
- ------
- ValueError if the freq cannot be converted.
-
- Notes
- -----
- If the Timestamp has a timezone, flooring will take place relative to the
- local ("wall") time and re-localized to the same timezone. When flooring
- near daylight saving time, use ``nonexistent`` and ``ambiguous`` to
- control the re-localization behavior.
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
-
- A timestamp can be floored using multiple frequency units:
-
- >>> ts.floor(freq='H') # hour
- Timestamp('2020-03-14 15:00:00')
-
- >>> ts.floor(freq='T') # minute
- Timestamp('2020-03-14 15:32:00')
-
- >>> ts.floor(freq='S') # seconds
- Timestamp('2020-03-14 15:32:52')
-
- >>> ts.floor(freq='N') # nanoseconds
- Timestamp('2020-03-14 15:32:52.192548651')
-
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
-
- >>> ts.floor(freq='5T')
- Timestamp('2020-03-14 15:30:00')
-
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
-
- >>> ts.floor(freq='1H30T')
- Timestamp('2020-03-14 15:00:00')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.floor()
- NaT
-
- When flooring near a daylight saving time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam")
-
- >>> ts_tz.floor("2H", ambiguous=False)
- Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
-
- >>> ts_tz.floor("2H", ambiguous=True)
- Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
- """,
- )
- ceil = _make_nat_func(
- "ceil",
- """
- Return a new Timestamp ceiled to this resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the ceiling resolution.
- ambiguous : bool or {'raise', 'NaT'}, default 'raise'
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
-timedelta}, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Raises
- ------
- ValueError if the freq cannot be converted.
-
- Notes
- -----
- If the Timestamp has a timezone, ceiling will take place relative to the
- local ("wall") time and re-localized to the same timezone. When ceiling
- near daylight saving time, use ``nonexistent`` and ``ambiguous`` to
- control the re-localization behavior.
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
-
- A timestamp can be ceiled using multiple frequency units:
-
- >>> ts.ceil(freq='H') # hour
- Timestamp('2020-03-14 16:00:00')
-
- >>> ts.ceil(freq='T') # minute
- Timestamp('2020-03-14 15:33:00')
-
- >>> ts.ceil(freq='S') # seconds
- Timestamp('2020-03-14 15:32:53')
-
- >>> ts.ceil(freq='U') # microseconds
- Timestamp('2020-03-14 15:32:52.192549')
-
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
-
- >>> ts.ceil(freq='5T')
- Timestamp('2020-03-14 15:35:00')
-
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
-
- >>> ts.ceil(freq='1H30T')
- Timestamp('2020-03-14 16:30:00')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.ceil()
- NaT
-
- When ceiling near a daylight saving time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam")
-
- >>> ts_tz.ceil("H", ambiguous=False)
- Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
-
- >>> ts_tz.ceil("H", ambiguous=True)
- Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
- """,
- )
-
- tz_convert = _make_nat_func(
- "tz_convert",
- """
- Convert timezone-aware Timestamp to another time zone.
-
- Parameters
- ----------
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone which the Timestamp will be converted to.
- None will remove the timezone, leaving a naive Timestamp in UTC.
-
- Returns
- -------
- converted : Timestamp
-
- Raises
- ------
- TypeError
- If Timestamp is tz-naive.
-
- Examples
- --------
- Create a timestamp object with UTC timezone:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC')
- >>> ts
- Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC')
-
- Change to Tokyo timezone:
-
- >>> ts.tz_convert(tz='Asia/Tokyo')
- Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo')
-
- Can also use ``astimezone``:
-
- >>> ts.astimezone(tz='Asia/Tokyo')
- Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.tz_convert(tz='Asia/Tokyo')
- NaT
- """,
- )
- tz_localize = _make_nat_func(
- "tz_localize",
- """
- Localize the Timestamp to a timezone.
-
- Convert naive Timestamp to local time zone or remove
- timezone from timezone-aware Timestamp.
-
- Parameters
- ----------
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone which the Timestamp will be localized to.
- None will remove the timezone, keeping the local wall time.
-
- ambiguous : bool or {'raise', 'NaT'}, default 'raise'
- When clocks moved backward due to DST, ambiguous times may arise.
- For example in Central European Time (UTC+01), when going from
- 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
- 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
- `ambiguous` parameter dictates how ambiguous times should be
- handled.
-
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \
-default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- The behavior is as follows:
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Returns
- -------
- localized : Timestamp
-
- Raises
- ------
- TypeError
- If the Timestamp is tz-aware and tz is not None.
-
- Examples
- --------
- Create a naive timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts
- Timestamp('2020-03-14 15:32:52.192548651')
-
- Add 'Europe/Stockholm' as timezone:
-
- >>> ts.tz_localize(tz='Europe/Stockholm')
- Timestamp('2020-03-14 15:32:52.192548651+0100', tz='Europe/Stockholm')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.tz_localize()
- NaT
- """,
- )
- replace = _make_nat_func(
- "replace",
- """
- Implement datetime.replace, with support for nanoseconds.
-
- Parameters
- ----------
- year : int, optional
- month : int, optional
- day : int, optional
- hour : int, optional
- minute : int, optional
- second : int, optional
- microsecond : int, optional
- nanosecond : int, optional
- tzinfo : tz-convertible, optional
- fold : int, optional
-
- Returns
- -------
- Timestamp with fields replaced
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC')
- >>> ts
- Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC')
-
- Replace year and the hour:
-
- >>> ts.replace(year=1999, hour=10)
- Timestamp('1999-03-14 10:32:52.192548651+0000', tz='UTC')
-
- Replace timezone (not a conversion):
-
- >>> import pytz
- >>> ts.replace(tzinfo=pytz.timezone('US/Pacific'))
- Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific'))
- NaT
- """,
- )
-
- @property
- def tz(self) -> None:
- return None
-
- @property
- def tzinfo(self) -> None:
- return None
-
- def as_unit(self, str unit, bint round_ok=True) -> "NaTType":
- """
- Convert the underlying int64 representation to the given unit.
-
- Parameters
- ----------
- unit : {"ns", "us", "ms", "s"}
- round_ok : bool, default True
- If False and the conversion requires rounding, raise.
-
- Returns
- -------
- Timestamp
- """
- return c_NaT
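A small usage sketch of ``as_unit`` on pandas 2.x: ``pd.NaT.as_unit`` simply returns NaT as implemented above, while on a real Timestamp the conversion truncates toward a coarser unit or refuses when ``round_ok=False``. The exact exception message is version-dependent.

import pandas as pd

ts = pd.Timestamp("2020-03-14 15:32:52.192548651")
print(ts.unit)                  # 'ns'
print(ts.as_unit("ms"))         # sub-millisecond digits are dropped
print(pd.NaT.as_unit("ms"))     # NaT
try:
    ts.as_unit("ms", round_ok=False)
except ValueError as exc:       # lossy conversions are refused when round_ok=False
    print(exc)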
-
-
-c_NaT = NaTType() # C-visible
-NaT = c_NaT # Python-visible
-
-
-# ----------------------------------------------------------------------
-
-cdef bint checknull_with_nat(object val):
- """
- Utility to check if a value is NaT, None, or NaN.
- """
- return val is None or util.is_nan(val) or val is c_NaT
-
-
-cdef bint is_dt64nat(object val):
- """
- Check whether val is a np.datetime64 object equal to np.datetime64("NaT").
- """
- if util.is_datetime64_object(val):
- return get_datetime64_value(val) == NPY_NAT
- return False
-
-
-cdef bint is_td64nat(object val):
- """
- Check whether val is a np.timedelta64 object equal to np.timedelta64("NaT").
- """
- if util.is_timedelta64_object(val):
- return get_timedelta64_value(val) == NPY_NAT
- return False
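At the user level, pd.isna is the public counterpart of checknull_with_nat, is_dt64nat, and is_td64nat: it treats None, float NaN, NaT, and the numpy NaT sentinels uniformly as missing.

import numpy as np
import pandas as pd

for val in (None, float("nan"), pd.NaT, np.datetime64("NaT"), np.timedelta64("NaT")):
    print(repr(val), pd.isna(val))   # all report True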
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pxd
deleted file mode 100644
index 3faef6ed5d4..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pxd
+++ /dev/null
@@ -1,132 +0,0 @@
-cimport numpy as cnp
-from cpython.datetime cimport (
- date,
- datetime,
-)
-from numpy cimport (
- int32_t,
- int64_t,
-)
-
-
-# TODO(cython3): most of these can be cimported directly from numpy
-cdef extern from "numpy/ndarrayobject.h":
- ctypedef int64_t npy_timedelta
- ctypedef int64_t npy_datetime
-
-cdef extern from "numpy/ndarraytypes.h":
- ctypedef struct PyArray_DatetimeMetaData:
- NPY_DATETIMEUNIT base
- int64_t num
-
-cdef extern from "numpy/arrayscalars.h":
- ctypedef struct PyDatetimeScalarObject:
- # PyObject_HEAD
- npy_datetime obval
- PyArray_DatetimeMetaData obmeta
-
- ctypedef struct PyTimedeltaScalarObject:
- # PyObject_HEAD
- npy_timedelta obval
- PyArray_DatetimeMetaData obmeta
-
-cdef extern from "numpy/ndarraytypes.h":
- ctypedef struct npy_datetimestruct:
- int64_t year
- int32_t month, day, hour, min, sec, us, ps, as
-
- ctypedef enum NPY_DATETIMEUNIT:
- NPY_FR_Y
- NPY_FR_M
- NPY_FR_W
- NPY_FR_D
- NPY_FR_B
- NPY_FR_h
- NPY_FR_m
- NPY_FR_s
- NPY_FR_ms
- NPY_FR_us
- NPY_FR_ns
- NPY_FR_ps
- NPY_FR_fs
- NPY_FR_as
- NPY_FR_GENERIC
-
- int64_t NPY_DATETIME_NAT # elsewhere we call this NPY_NAT
-
-cdef extern from "src/datetime/np_datetime.h":
- ctypedef struct pandas_timedeltastruct:
- int64_t days
- int32_t hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds
-
- void pandas_datetime_to_datetimestruct(npy_datetime val,
- NPY_DATETIMEUNIT fr,
- npy_datetimestruct *result) nogil
-
- npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr,
- npy_datetimestruct *d) nogil
-
- void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
- NPY_DATETIMEUNIT fr,
- pandas_timedeltastruct *result
- ) nogil
-
-cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1
-
-cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=?)
-
-cdef int64_t pydatetime_to_dt64(
- datetime val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=?
-)
-cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts)
-cdef int64_t pydate_to_dt64(
- date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=?
-)
-cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts)
-
-cdef npy_datetime get_datetime64_value(object obj) nogil
-cdef npy_timedelta get_timedelta64_value(object obj) nogil
-cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil
-
-cdef int string_to_dts(
- str val,
- npy_datetimestruct* dts,
- NPY_DATETIMEUNIT* out_bestunit,
- int* out_local,
- int* out_tzoffset,
- bint want_exc,
- format: str | None = *,
- bint exact = *
-) except? -1
-
-cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)
-
-cpdef cnp.ndarray astype_overflowsafe(
- cnp.ndarray values, # ndarray[datetime64[anyunit]]
- cnp.dtype dtype, # ndarray[datetime64[anyunit]]
- bint copy=*,
- bint round_ok=*,
- bint is_coerce=*,
-)
-cdef int64_t get_conversion_factor(
- NPY_DATETIMEUNIT from_unit,
- NPY_DATETIMEUNIT to_unit,
-) except? -1
-
-cdef bint cmp_dtstructs(npy_datetimestruct* left, npy_datetimestruct* right, int op)
-cdef get_implementation_bounds(
- NPY_DATETIMEUNIT reso, npy_datetimestruct *lower, npy_datetimestruct *upper
-)
-
-cdef int64_t convert_reso(
- int64_t value,
- NPY_DATETIMEUNIT from_reso,
- NPY_DATETIMEUNIT to_reso,
- bint round_ok,
-) except? -1
-
-cdef extern from "src/datetime/np_datetime_strings.h":
- ctypedef enum FormatRequirement:
- PARTIAL_MATCH
- EXACT_MATCH
- INFER_FORMAT
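The PyArray_DatetimeMetaData struct declared above (a base unit plus a multiplier) is the same metadata numpy exposes through np.datetime_data; a quick way to inspect it from Python:

import numpy as np

print(np.datetime_data(np.dtype("datetime64[ns]")))         # ('ns', 1)
print(np.datetime_data(np.datetime64("2020-03-14").dtype))  # ('D', 1)
# The second element is the `num` multiplier, which is 1 for the dtypes pandas uses.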
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyi
deleted file mode 100644
index 0cb0e3b0237..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyi
+++ /dev/null
@@ -1,21 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-class OutOfBoundsDatetime(ValueError): ...
-class OutOfBoundsTimedelta(ValueError): ...
-
-# only exposed for testing
-def py_get_unit_from_dtype(dtype: np.dtype): ...
-def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ...
-def astype_overflowsafe(
- arr: np.ndarray,
- dtype: np.dtype,
- copy: bool = ...,
- round_ok: bool = ...,
- is_coerce: bool = ...,
-) -> np.ndarray: ...
-def is_unitless(dtype: np.dtype) -> bool: ...
-def compare_mismatched_resolutions(
- left: np.ndarray, right: np.ndarray, op
-) -> npt.NDArray[np.bool_]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyx
deleted file mode 100644
index aa341138559..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/np_datetime.pyx
+++ /dev/null
@@ -1,629 +0,0 @@
-cimport cython
-from cpython.datetime cimport (
- PyDateTime_CheckExact,
- PyDateTime_DATE_GET_HOUR,
- PyDateTime_DATE_GET_MICROSECOND,
- PyDateTime_DATE_GET_MINUTE,
- PyDateTime_DATE_GET_SECOND,
- PyDateTime_GET_DAY,
- PyDateTime_GET_MONTH,
- PyDateTime_GET_YEAR,
- import_datetime,
-)
-from cpython.object cimport (
- Py_EQ,
- Py_GE,
- Py_GT,
- Py_LE,
- Py_LT,
- Py_NE,
-)
-
-import_datetime()
-
-import numpy as np
-
-cimport numpy as cnp
-
-cnp.import_array()
-from numpy cimport (
- int64_t,
- ndarray,
- uint8_t,
-)
-
-from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
-
-
-cdef extern from "src/datetime/np_datetime.h":
- int cmp_npy_datetimestruct(npy_datetimestruct *a,
- npy_datetimestruct *b)
-
- # AS, FS, PS versions exist but are not imported because they are not used.
- npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
- npy_datetimestruct _US_MIN_DTS, _US_MAX_DTS
- npy_datetimestruct _MS_MIN_DTS, _MS_MAX_DTS
- npy_datetimestruct _S_MIN_DTS, _S_MAX_DTS
- npy_datetimestruct _M_MIN_DTS, _M_MAX_DTS
-
- PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(cnp.PyArray_Descr *dtype)
-
-cdef extern from "src/datetime/np_datetime_strings.h":
- int parse_iso_8601_datetime(const char *str, int len, int want_exc,
- npy_datetimestruct *out,
- NPY_DATETIMEUNIT *out_bestunit,
- int *out_local, int *out_tzoffset,
- const char *format, int format_len,
- FormatRequirement exact)
-
-
-# ----------------------------------------------------------------------
-# numpy object inspection
-
-cdef npy_datetime get_datetime64_value(object obj) nogil:
- """
- Return the int64 value underlying a scalar numpy datetime64 object.
-
- Note that to interpret this as a datetime, the corresponding unit is
- also needed. That can be found using `get_datetime64_unit`.
- """
- return (<PyDatetimeScalarObject*>obj).obval
-
-
-cdef npy_timedelta get_timedelta64_value(object obj) nogil:
- """
- Return the int64 value underlying a scalar numpy timedelta64 object.
- """
- return (<PyTimedeltaScalarObject*>obj).obval
-
-
-cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil:
- """
- Return the unit part of the dtype for a numpy datetime64 object.
- """
- return <NPY_DATETIMEUNIT>(<PyDatetimeScalarObject*>obj).obmeta.base
-
-
-cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype):
- # NB: caller is responsible for ensuring this is *some* datetime64 or
- # timedelta64 dtype, otherwise we can segfault
- cdef:
- cnp.PyArray_Descr* descr = <cnp.PyArray_Descr*>dtype
- PyArray_DatetimeMetaData meta
- meta = get_datetime_metadata_from_dtype(descr)
- return meta.base
-
-
-def py_get_unit_from_dtype(dtype):
- # for testing get_unit_from_dtype; adds 896 bytes to the .so file.
- return get_unit_from_dtype(dtype)
-
-
-def is_unitless(dtype: cnp.dtype) -> bool:
- """
- Check if a datetime64 or timedelta64 dtype has no attached unit.
- """
- if dtype.type_num not in [cnp.NPY_DATETIME, cnp.NPY_TIMEDELTA]:
- raise ValueError("is_unitless dtype must be datetime64 or timedelta64")
- cdef:
- NPY_DATETIMEUNIT unit = get_unit_from_dtype(dtype)
-
- return unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
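The scalar helpers above read the raw int64 payload and its unit straight from the numpy scalar structs. The same information is visible from Python by reinterpreting an array as int64 and asking for its dtype metadata:

import numpy as np

arr = np.array(["2020-03-14", "NaT"], dtype="datetime64[ns]")
print(arr.view("i8"))               # [1584144000000000000 -9223372036854775808]
print(np.datetime_data(arr.dtype))  # ('ns', 1) -> the unit attached to the dtype
# NaT is stored as the int64 sentinel -2**63 (NPY_NAT / NPY_DATETIME_NAT).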
-
-
-# ----------------------------------------------------------------------
-# Comparison
-
-
-cdef bint cmp_dtstructs(
- npy_datetimestruct* left, npy_datetimestruct* right, int op
-):
- cdef:
- int cmp_res
-
- cmp_res = cmp_npy_datetimestruct(left, right)
- if op == Py_EQ:
- return cmp_res == 0
- if op == Py_NE:
- return cmp_res != 0
- if op == Py_GT:
- return cmp_res == 1
- if op == Py_LT:
- return cmp_res == -1
- if op == Py_GE:
- return cmp_res == 1 or cmp_res == 0
- else:
- # i.e. op == Py_LE
- return cmp_res == -1 or cmp_res == 0
-
-
-cdef bint cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1:
- """
- cmp_scalar is a more performant version of PyObject_RichCompare
- typed for int64_t arguments.
- """
- if op == Py_EQ:
- return lhs == rhs
- elif op == Py_NE:
- return lhs != rhs
- elif op == Py_LT:
- return lhs < rhs
- elif op == Py_LE:
- return lhs <= rhs
- elif op == Py_GT:
- return lhs > rhs
- elif op == Py_GE:
- return lhs >= rhs
-
-
-class OutOfBoundsDatetime(ValueError):
- """
- Raised when the datetime is outside the range that can be represented.
- """
- pass
-
-
-class OutOfBoundsTimedelta(ValueError):
- """
- Raised when encountering a timedelta value that cannot be represented.
-
- Representation should be within a timedelta64[ns].
- """
- # Timedelta analogue to OutOfBoundsDatetime
- pass
-
-
-cdef get_implementation_bounds(
- NPY_DATETIMEUNIT reso,
- npy_datetimestruct *lower,
- npy_datetimestruct *upper,
-):
- if reso == NPY_FR_ns:
- upper[0] = _NS_MAX_DTS
- lower[0] = _NS_MIN_DTS
- elif reso == NPY_FR_us:
- upper[0] = _US_MAX_DTS
- lower[0] = _US_MIN_DTS
- elif reso == NPY_FR_ms:
- upper[0] = _MS_MAX_DTS
- lower[0] = _MS_MIN_DTS
- elif reso == NPY_FR_s:
- upper[0] = _S_MAX_DTS
- lower[0] = _S_MIN_DTS
- elif reso == NPY_FR_m:
- upper[0] = _M_MAX_DTS
- lower[0] = _M_MIN_DTS
- else:
- raise NotImplementedError(reso)
-
-
-cdef check_dts_bounds(npy_datetimestruct *dts, NPY_DATETIMEUNIT unit=NPY_FR_ns):
- """Raises OutOfBoundsDatetime if the given date is outside the range that
- can be represented by nanosecond-resolution 64-bit integers."""
- cdef:
- bint error = False
- npy_datetimestruct cmp_upper, cmp_lower
-
- get_implementation_bounds(unit, &cmp_lower, &cmp_upper)
-
- if cmp_npy_datetimestruct(dts, &cmp_lower) == -1:
- error = True
- elif cmp_npy_datetimestruct(dts, &cmp_upper) == 1:
- error = True
-
- if error:
- fmt = (f"{dts.year}-{dts.month:02d}-{dts.day:02d} "
- f"{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}")
- # TODO: "nanosecond" in the message assumes NPY_FR_ns
- raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {fmt}")
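The nanosecond bounds checked here correspond to the int64 range, shifted by one at the bottom so that -2**63 stays reserved for NaT; pd.Timestamp.min and pd.Timestamp.max expose them directly:

import numpy as np
import pandas as pd

print(pd.Timestamp.min, pd.Timestamp.min.value)  # 1677-09-21 ..., -(2**63) + 1
print(pd.Timestamp.max, pd.Timestamp.max.value)  # 2262-04-11 ...,   2**63 - 1
print(np.iinfo(np.int64).max)                    # 9223372036854775807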
-
-
-# ----------------------------------------------------------------------
-# Conversion
-
-
-# just exposed for testing at the moment
-def py_td64_to_tdstruct(int64_t td64, NPY_DATETIMEUNIT unit):
- cdef:
- pandas_timedeltastruct tds
- pandas_timedelta_to_timedeltastruct(td64, unit, &tds)
- return tds # <- returned as a dict to python
-
-
-cdef void pydatetime_to_dtstruct(datetime dt, npy_datetimestruct *dts):
- if PyDateTime_CheckExact(dt):
- dts.year = PyDateTime_GET_YEAR(dt)
- else:
- # We use dt.year instead of PyDateTime_GET_YEAR because with Timestamp
- # we override year such that PyDateTime_GET_YEAR is incorrect.
- dts.year = dt.year
-
- dts.month = PyDateTime_GET_MONTH(dt)
- dts.day = PyDateTime_GET_DAY(dt)
- dts.hour = PyDateTime_DATE_GET_HOUR(dt)
- dts.min = PyDateTime_DATE_GET_MINUTE(dt)
- dts.sec = PyDateTime_DATE_GET_SECOND(dt)
- dts.us = PyDateTime_DATE_GET_MICROSECOND(dt)
- dts.ps = dts.as = 0
-
-
-cdef int64_t pydatetime_to_dt64(datetime val,
- npy_datetimestruct *dts,
- NPY_DATETIMEUNIT reso=NPY_FR_ns):
- """
- Note we are assuming that the datetime object is timezone-naive.
- """
- pydatetime_to_dtstruct(val, dts)
- return npy_datetimestruct_to_datetime(reso, dts)
-
-
-cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts):
- dts.year = PyDateTime_GET_YEAR(val)
- dts.month = PyDateTime_GET_MONTH(val)
- dts.day = PyDateTime_GET_DAY(val)
- dts.hour = dts.min = dts.sec = dts.us = 0
- dts.ps = dts.as = 0
- return
-
-cdef int64_t pydate_to_dt64(
- date val, npy_datetimestruct *dts, NPY_DATETIMEUNIT reso=NPY_FR_ns
-):
- pydate_to_dtstruct(val, dts)
- return npy_datetimestruct_to_datetime(reso, dts)
-
-
-cdef int string_to_dts(
- str val,
- npy_datetimestruct* dts,
- NPY_DATETIMEUNIT* out_bestunit,
- int* out_local,
- int* out_tzoffset,
- bint want_exc,
- format: str | None=None,
- bint exact=True,
-) except? -1:
- cdef:
- Py_ssize_t length
- const char* buf
- Py_ssize_t format_length
- const char* format_buf
- FormatRequirement format_requirement
-
- buf = get_c_string_buf_and_size(val, &length)
- if format is None:
- format_buf = b""
- format_length = 0
- format_requirement = INFER_FORMAT
- else:
- format_buf = get_c_string_buf_and_size(format, &format_length)
- format_requirement = <FormatRequirement>exact
- return parse_iso_8601_datetime(buf, length, want_exc,
- dts, out_bestunit, out_local, out_tzoffset,
- format_buf, format_length,
- format_requirement)
-
-
-cpdef ndarray astype_overflowsafe(
- ndarray values,
- cnp.dtype dtype,
- bint copy=True,
- bint round_ok=True,
- bint is_coerce=False,
-):
- """
- Convert an ndarray with datetime64[X] to datetime64[Y]
- or timedelta64[X] to timedelta64[Y],
- raising on overflow.
- """
- if values.descr.type_num == dtype.type_num == cnp.NPY_DATETIME:
- # i.e. dtype.kind == "M"
- dtype_name = "datetime64"
- elif values.descr.type_num == dtype.type_num == cnp.NPY_TIMEDELTA:
- # i.e. dtype.kind == "m"
- dtype_name = "timedelta64"
- else:
- raise TypeError(
- "astype_overflowsafe values.dtype and dtype must be either "
- "both-datetime64 or both-timedelta64."
- )
-
- cdef:
- NPY_DATETIMEUNIT from_unit = get_unit_from_dtype(values.dtype)
- NPY_DATETIMEUNIT to_unit = get_unit_from_dtype(dtype)
-
- if from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
- raise TypeError(f"{dtype_name} values must have a unit specified")
-
- if to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
- # without raising explicitly here, we end up with a SystemError
- # built-in function [...] returned a result with an error
- raise ValueError(
- f"{dtype_name} dtype must have a unit specified"
- )
-
- if from_unit == to_unit:
- # Check this before allocating result for perf, might save some memory
- if copy:
- return values.copy()
- return values
-
- elif from_unit > to_unit:
- if round_ok:
- # e.g. ns -> us, so there is no risk of overflow, so we can use
- # numpy's astype safely. Note there _is_ risk of truncation.
- return values.astype(dtype)
- else:
- iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit)
- return iresult2.view(dtype)
-
- if (<object>values).dtype.byteorder == ">":
- # GH#29684 we incorrectly get OutOfBoundsDatetime if we don't swap
- values = values.astype(values.dtype.newbyteorder("<"))
-
- cdef:
- ndarray i8values = values.view("i8")
-
- # equiv: result = np.empty((<object>values).shape, dtype="i8")
- ndarray iresult = cnp.PyArray_EMPTY(
- values.ndim, values.shape, cnp.NPY_INT64, 0
- )
-
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)
- Py_ssize_t i, N = values.size
- int64_t value, new_value
- npy_datetimestruct dts
- bint is_td = dtype.type_num == cnp.NPY_TIMEDELTA
-
- for i in range(N):
- # Analogous to: item = values[i]
- value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if value == NPY_DATETIME_NAT:
- new_value = NPY_DATETIME_NAT
- else:
- pandas_datetime_to_datetimestruct(value, from_unit, &dts)
-
- try:
- check_dts_bounds(&dts, to_unit)
- except OutOfBoundsDatetime as err:
- if is_coerce:
- new_value = NPY_DATETIME_NAT
- elif is_td:
- from_abbrev = np.datetime_data(values.dtype)[0]
- np_val = np.timedelta64(value, from_abbrev)
- msg = (
- "Cannot convert {np_val} to {dtype} without overflow"
- .format(np_val=str(np_val), dtype=str(dtype))
- )
- raise OutOfBoundsTimedelta(msg) from err
- else:
- raise
- else:
- new_value = npy_datetimestruct_to_datetime(to_unit, &dts)
-
- # Analogous to: iresult[i] = new_value
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return iresult.view(dtype)
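For contrast with plain numpy: ndarray.astype does not check for overflow when narrowing datetime64 units, while the function above raises. The import below uses the private module path of this file, so treat it as an implementation detail of this pandas version.

import numpy as np
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime, astype_overflowsafe

arr = np.array(["2500-01-01"], dtype="datetime64[s]")   # outside the nanosecond range
print(arr.astype("datetime64[ns]"))                      # numpy silently wraps to a bogus date

try:
    astype_overflowsafe(arr, np.dtype("datetime64[ns]"))
except OutOfBoundsDatetime as exc:
    print(exc)                                           # the overflow is reported instead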
-
-
-# TODO: try to upstream this fix to numpy
-def compare_mismatched_resolutions(ndarray left, ndarray right, op):
- """
- Overflow-safe comparison of timedelta64/datetime64 with mismatched resolutions.
-
- >>> left = np.array([500], dtype="M8[Y]")
- >>> right = np.array([0], dtype="M8[ns]")
- >>> left < right # <- wrong!
- array([ True])
- """
-
- if left.dtype.kind != right.dtype.kind or left.dtype.kind not in ["m", "M"]:
- raise ValueError("left and right must both be timedelta64 or both datetime64")
-
- cdef:
- int op_code = op_to_op_code(op)
- NPY_DATETIMEUNIT left_unit = get_unit_from_dtype(left.dtype)
- NPY_DATETIMEUNIT right_unit = get_unit_from_dtype(right.dtype)
-
- # equiv: result = np.empty((<object>left).shape, dtype="bool")
- ndarray result = cnp.PyArray_EMPTY(
- left.ndim, left.shape, cnp.NPY_BOOL, 0
- )
-
- ndarray lvalues = left.view("i8")
- ndarray rvalues = right.view("i8")
-
- cnp.broadcast mi = cnp.PyArray_MultiIterNew3(result, lvalues, rvalues)
- int64_t lval, rval
- bint res_value
-
- Py_ssize_t i, N = left.size
- npy_datetimestruct ldts, rdts
-
- for i in range(N):
- # Analogous to: lval = lvalues[i]
- lval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- # Analogous to: rval = rvalues[i]
- rval = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 2))[0]
-
- if lval == NPY_DATETIME_NAT or rval == NPY_DATETIME_NAT:
- res_value = op_code == Py_NE
-
- else:
- pandas_datetime_to_datetimestruct(lval, left_unit, &ldts)
- pandas_datetime_to_datetimestruct(rval, right_unit, &rdts)
-
- res_value = cmp_dtstructs(&ldts, &rdts, op_code)
-
- # Analogous to: result[i] = res_value
- (<uint8_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_value
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return result
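Extending the doctest above: the helper gives the overflow-safe answer where the raw numpy comparison is wrong. As in the previous example, the import path is internal and may change between pandas versions.

import operator

import numpy as np
from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions

left = np.array([500], dtype="M8[Y]")    # the year 2470
right = np.array([0], dtype="M8[ns]")    # 1970-01-01
print(left < right)                                               # [ True]  <- overflow artefact
print(compare_mismatched_resolutions(left, right, operator.lt))   # [False]  <- correct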
-
-
-import operator
-
-
-cdef int op_to_op_code(op):
- # TODO: should exist somewhere?
- if op is operator.eq:
- return Py_EQ
- if op is operator.ne:
- return Py_NE
- if op is operator.le:
- return Py_LE
- if op is operator.lt:
- return Py_LT
- if op is operator.ge:
- return Py_GE
- if op is operator.gt:
- return Py_GT
-
-
-cdef ndarray astype_round_check(
- ndarray i8values,
- NPY_DATETIMEUNIT from_unit,
- NPY_DATETIMEUNIT to_unit
-):
- # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion
- # involves truncation, e.g. 1500ns->1us
- cdef:
- Py_ssize_t i, N = i8values.size
-
- # equiv: iresult = np.empty((<object>i8values).shape, dtype="i8")
- ndarray iresult = cnp.PyArray_EMPTY(
- i8values.ndim, i8values.shape, cnp.NPY_INT64, 0
- )
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)
-
- # Note the arguments to_unit, from_unit are swapped vs how they
- # are passed when going to a higher-frequency reso.
- int64_t mult = get_conversion_factor(to_unit, from_unit)
- int64_t value, mod
-
- for i in range(N):
- # Analogous to: item = i8values[i]
- value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if value == NPY_DATETIME_NAT:
- new_value = NPY_DATETIME_NAT
- else:
- new_value, mod = divmod(value, mult)
- if mod != 0:
- # TODO: avoid runtime import
- from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev
- from_abbrev = npy_unit_to_abbrev(from_unit)
- to_abbrev = npy_unit_to_abbrev(to_unit)
- raise ValueError(
- f"Cannot losslessly cast '{value} {from_abbrev}' to {to_abbrev}"
- )
-
- # Analogous to: iresult[i] = new_value
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return iresult
-
-
-@cython.overflowcheck(True)
-cdef int64_t get_conversion_factor(
- NPY_DATETIMEUNIT from_unit,
- NPY_DATETIMEUNIT to_unit
-) except? -1:
- """
- Find the factor by which we need to multiply to convert from from_unit to to_unit.
- """
- if (
- from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
- or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
- ):
- raise ValueError("unit-less resolutions are not supported")
- if from_unit > to_unit:
- raise ValueError
-
- if from_unit == to_unit:
- return 1
-
- if from_unit == NPY_DATETIMEUNIT.NPY_FR_W:
- return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D:
- return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h:
- return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m:
- return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s:
- return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms:
- return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us:
- return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns:
- return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps:
- return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
- elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
- return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
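A pure-Python sketch of the same cascading computation, using unit abbreviations instead of the NPY_DATETIMEUNIT enum. It only handles conversion toward finer units, mirroring the valid direction above (the real code raises ValueError otherwise); names here are illustrative.

_NEXT = {"W": ("D", 7), "D": ("h", 24), "h": ("m", 60), "m": ("s", 60),
         "s": ("ms", 1000), "ms": ("us", 1000), "us": ("ns", 1000),
         "ns": ("ps", 1000), "ps": ("fs", 1000), "fs": ("as", 1000)}

def conversion_factor(from_unit: str, to_unit: str) -> int:
    """Multiplier that converts a count of `from_unit` into the finer `to_unit`."""
    if from_unit == to_unit:
        return 1
    next_unit, step = _NEXT[from_unit]
    return step * conversion_factor(next_unit, to_unit)

assert conversion_factor("h", "ns") == 3_600_000_000_000
assert conversion_factor("s", "us") == 1_000_000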
-
-
-cdef int64_t convert_reso(
- int64_t value,
- NPY_DATETIMEUNIT from_reso,
- NPY_DATETIMEUNIT to_reso,
- bint round_ok,
-) except? -1:
- cdef:
- int64_t res_value, mult, div, mod
-
- if from_reso == to_reso:
- return value
-
- elif to_reso < from_reso:
- # e.g. ns -> us, no risk of overflow, but can be lossy rounding
- mult = get_conversion_factor(to_reso, from_reso)
- div, mod = divmod(value, mult)
- if mod > 0 and not round_ok:
- raise ValueError("Cannot losslessly convert units")
-
- # Note that when mod > 0, we follow np.timedelta64 in always
- # rounding down.
- res_value = div
-
- elif (
- from_reso == NPY_FR_Y
- or from_reso == NPY_FR_M
- or to_reso == NPY_FR_Y
- or to_reso == NPY_FR_M
- ):
- # Converting by multiplying isn't _quite_ right because the number of
- # seconds in a month/year isn't fixed.
- res_value = _convert_reso_with_dtstruct(value, from_reso, to_reso)
-
- else:
- # e.g. ns -> us, risk of overflow, but no risk of lossy rounding
- mult = get_conversion_factor(from_reso, to_reso)
- with cython.overflowcheck(True):
- # Note: caller is responsible for re-raising as OutOfBoundsTimedelta
- res_value = value * mult
-
- return res_value
-
-
-cdef int64_t _convert_reso_with_dtstruct(
- int64_t value,
- NPY_DATETIMEUNIT from_unit,
- NPY_DATETIMEUNIT to_unit,
-) except? -1:
- cdef:
- npy_datetimestruct dts
-
- pandas_datetime_to_datetimestruct(value, from_unit, &dts)
- check_dts_bounds(&dts, to_unit)
- return npy_datetimestruct_to_datetime(to_unit, &dts)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pxd
deleted file mode 100644
index 215c3f84928..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-from numpy cimport int64_t
-
-
-cpdef to_offset(object obj)
-cdef bint is_offset_object(object obj)
-cdef bint is_tick_object(object obj)
-
-cdef class BaseOffset:
- cdef readonly:
- int64_t n
- bint normalize
- dict _cache
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyi
deleted file mode 100644
index f1aca471766..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyi
+++ /dev/null
@@ -1,279 +0,0 @@
-from datetime import (
- datetime,
- timedelta,
-)
-from typing import (
- Any,
- Collection,
- Literal,
- TypeVar,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs.nattype import NaTType
-from pandas._typing import npt
-
-from .timedeltas import Timedelta
-
-_BaseOffsetT = TypeVar("_BaseOffsetT", bound=BaseOffset)
-_DatetimeT = TypeVar("_DatetimeT", bound=datetime)
-_TimedeltaT = TypeVar("_TimedeltaT", bound=timedelta)
-
-_relativedelta_kwds: set[str]
-prefix_mapping: dict[str, type]
-
-class ApplyTypeError(TypeError): ...
-
-class BaseOffset:
- n: int
- def __init__(self, n: int = ..., normalize: bool = ...) -> None: ...
- def __eq__(self, other) -> bool: ...
- def __ne__(self, other) -> bool: ...
- def __hash__(self) -> int: ...
- @property
- def kwds(self) -> dict: ...
- @property
- def base(self) -> BaseOffset: ...
- @overload
- def __add__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ...
- @overload
- def __add__(self: _BaseOffsetT, other: BaseOffset) -> _BaseOffsetT: ...
- @overload
- def __add__(self, other: _DatetimeT) -> _DatetimeT: ...
- @overload
- def __add__(self, other: _TimedeltaT) -> _TimedeltaT: ...
- @overload
- def __radd__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ...
- @overload
- def __radd__(self: _BaseOffsetT, other: BaseOffset) -> _BaseOffsetT: ...
- @overload
- def __radd__(self, other: _DatetimeT) -> _DatetimeT: ...
- @overload
- def __radd__(self, other: _TimedeltaT) -> _TimedeltaT: ...
- @overload
- def __radd__(self, other: NaTType) -> NaTType: ...
- def __sub__(self: _BaseOffsetT, other: BaseOffset) -> _BaseOffsetT: ...
- @overload
- def __rsub__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ...
- @overload
- def __rsub__(self: _BaseOffsetT, other: BaseOffset) -> _BaseOffsetT: ...
- @overload
- def __rsub__(self, other: _DatetimeT) -> _DatetimeT: ...
- @overload
- def __rsub__(self, other: _TimedeltaT) -> _TimedeltaT: ...
- @overload
- def __mul__(self, other: np.ndarray) -> np.ndarray: ...
- @overload
- def __mul__(self: _BaseOffsetT, other: int) -> _BaseOffsetT: ...
- @overload
- def __rmul__(self, other: np.ndarray) -> np.ndarray: ...
- @overload
- def __rmul__(self: _BaseOffsetT, other: int) -> _BaseOffsetT: ...
- def __neg__(self: _BaseOffsetT) -> _BaseOffsetT: ...
- def copy(self: _BaseOffsetT) -> _BaseOffsetT: ...
- @property
- def name(self) -> str: ...
- @property
- def rule_code(self) -> str: ...
- @property
- def freqstr(self) -> str: ...
- def _apply(self, other): ...
- def _apply_array(self, dtarr) -> None: ...
- def rollback(self, dt: datetime) -> datetime: ...
- def rollforward(self, dt: datetime) -> datetime: ...
- def is_on_offset(self, dt: datetime) -> bool: ...
- def __setstate__(self, state) -> None: ...
- def __getstate__(self): ...
- @property
- def nanos(self) -> int: ...
- def is_anchored(self) -> bool: ...
-
-def _get_offset(name: str) -> BaseOffset: ...
-
-class SingleConstructorOffset(BaseOffset):
- @classmethod
- def _from_name(cls, suffix: None = ...): ...
- def __reduce__(self): ...
-
-@overload
-def to_offset(freq: None) -> None: ...
-@overload
-def to_offset(freq: _BaseOffsetT) -> _BaseOffsetT: ...
-@overload
-def to_offset(freq: timedelta | str) -> BaseOffset: ...
-
-class Tick(SingleConstructorOffset):
- _creso: int
- _prefix: str
- _td64_unit: str
- def __init__(self, n: int = ..., normalize: bool = ...) -> None: ...
- @property
- def delta(self) -> Timedelta: ...
- @property
- def nanos(self) -> int: ...
-
-def delta_to_tick(delta: timedelta) -> Tick: ...
-
-class Day(Tick): ...
-class Hour(Tick): ...
-class Minute(Tick): ...
-class Second(Tick): ...
-class Milli(Tick): ...
-class Micro(Tick): ...
-class Nano(Tick): ...
-
-class RelativeDeltaOffset(BaseOffset):
- def __init__(self, n: int = ..., normalize: bool = ..., **kwds: Any) -> None: ...
-
-class BusinessMixin(SingleConstructorOffset):
- def __init__(
- self, n: int = ..., normalize: bool = ..., offset: timedelta = ...
- ) -> None: ...
-
-class BusinessDay(BusinessMixin): ...
-
-class BusinessHour(BusinessMixin):
- def __init__(
- self,
- n: int = ...,
- normalize: bool = ...,
- start: str | Collection[str] = ...,
- end: str | Collection[str] = ...,
- offset: timedelta = ...,
- ) -> None: ...
-
-class WeekOfMonthMixin(SingleConstructorOffset):
- def __init__(
- self, n: int = ..., normalize: bool = ..., weekday: int = ...
- ) -> None: ...
-
-class YearOffset(SingleConstructorOffset):
- def __init__(
- self, n: int = ..., normalize: bool = ..., month: int | None = ...
- ) -> None: ...
-
-class BYearEnd(YearOffset): ...
-class BYearBegin(YearOffset): ...
-class YearEnd(YearOffset): ...
-class YearBegin(YearOffset): ...
-
-class QuarterOffset(SingleConstructorOffset):
- def __init__(
- self, n: int = ..., normalize: bool = ..., startingMonth: int | None = ...
- ) -> None: ...
-
-class BQuarterEnd(QuarterOffset): ...
-class BQuarterBegin(QuarterOffset): ...
-class QuarterEnd(QuarterOffset): ...
-class QuarterBegin(QuarterOffset): ...
-class MonthOffset(SingleConstructorOffset): ...
-class MonthEnd(MonthOffset): ...
-class MonthBegin(MonthOffset): ...
-class BusinessMonthEnd(MonthOffset): ...
-class BusinessMonthBegin(MonthOffset): ...
-
-class SemiMonthOffset(SingleConstructorOffset):
- def __init__(
- self, n: int = ..., normalize: bool = ..., day_of_month: int | None = ...
- ) -> None: ...
-
-class SemiMonthEnd(SemiMonthOffset): ...
-class SemiMonthBegin(SemiMonthOffset): ...
-
-class Week(SingleConstructorOffset):
- def __init__(
- self, n: int = ..., normalize: bool = ..., weekday: int | None = ...
- ) -> None: ...
-
-class WeekOfMonth(WeekOfMonthMixin):
- def __init__(
- self, n: int = ..., normalize: bool = ..., week: int = ..., weekday: int = ...
- ) -> None: ...
-
-class LastWeekOfMonth(WeekOfMonthMixin): ...
-
-class FY5253Mixin(SingleConstructorOffset):
- def __init__(
- self,
- n: int = ...,
- normalize: bool = ...,
- weekday: int = ...,
- startingMonth: int = ...,
- variation: Literal["nearest", "last"] = ...,
- ) -> None: ...
-
-class FY5253(FY5253Mixin): ...
-
-class FY5253Quarter(FY5253Mixin):
- def __init__(
- self,
- n: int = ...,
- normalize: bool = ...,
- weekday: int = ...,
- startingMonth: int = ...,
- qtr_with_extra_week: int = ...,
- variation: Literal["nearest", "last"] = ...,
- ) -> None: ...
-
-class Easter(SingleConstructorOffset): ...
-
-class _CustomBusinessMonth(BusinessMixin):
- def __init__(
- self,
- n: int = ...,
- normalize: bool = ...,
- weekmask: str = ...,
- holidays: list | None = ...,
- calendar: np.busdaycalendar | None = ...,
- offset: timedelta = ...,
- ) -> None: ...
-
-class CustomBusinessDay(BusinessDay):
- def __init__(
- self,
- n: int = ...,
- normalize: bool = ...,
- weekmask: str = ...,
- holidays: list | None = ...,
- calendar: np.busdaycalendar | None = ...,
- offset: timedelta = ...,
- ) -> None: ...
-
-class CustomBusinessHour(BusinessHour):
- def __init__(
- self,
- n: int = ...,
- normalize: bool = ...,
- weekmask: str = ...,
- holidays: list | None = ...,
- calendar: np.busdaycalendar | None = ...,
- start: str = ...,
- end: str = ...,
- offset: timedelta = ...,
- ) -> None: ...
-
-class CustomBusinessMonthEnd(_CustomBusinessMonth): ...
-class CustomBusinessMonthBegin(_CustomBusinessMonth): ...
-class OffsetMeta(type): ...
-class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): ...
-
-BDay = BusinessDay
-BMonthEnd = BusinessMonthEnd
-BMonthBegin = BusinessMonthBegin
-CBMonthEnd = CustomBusinessMonthEnd
-CBMonthBegin = CustomBusinessMonthBegin
-CDay = CustomBusinessDay
-
-def roll_qtrday(
- other: datetime, n: int, month: int, day_opt: str, modby: int
-) -> int: ...
-
-INVALID_FREQ_ERR_MSG: Literal["Invalid frequency: {0}"]
-
-def shift_months(
- dtindex: npt.NDArray[np.int64], months: int, day_opt: str | None = ...
-) -> npt.NDArray[np.int64]: ...
-
-_offset_map: dict[str, BaseOffset]
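The stubs above describe the offset classes and the to_offset parser; a brief usage sketch with the public pandas API (exact reprs may vary slightly between versions):

import pandas as pd
from pandas.tseries.frequencies import to_offset

off = to_offset("1H30T")                   # frequency string -> DateOffset (a Tick here)
print(off)                                  # <90 * Minutes>
print(pd.Timestamp("2020-03-14 15:32") + off)
print(to_offset("B"))                       # <BusinessDay>
print(pd.offsets.BDay is pd.offsets.BusinessDay)   # the aliases at the bottom are re-exports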
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyx
deleted file mode 100644
index 9a9a325eb16..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/offsets.pyx
+++ /dev/null
@@ -1,4595 +0,0 @@
-import re
-import time
-
-cimport cython
-from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- PyDelta_Check,
- date,
- datetime,
- import_datetime,
- time as dt_time,
- timedelta,
-)
-
-import_datetime()
-
-from dateutil.easter import easter
-from dateutil.relativedelta import relativedelta
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- ndarray,
-)
-
-cnp.import_array()
-
-# TODO: formalize having _libs.properties "above" tslibs in the dependency structure
-
-from pandas._libs.properties import cache_readonly
-
-from pandas._libs.tslibs cimport util
-from pandas._libs.tslibs.util cimport (
- is_datetime64_object,
- is_float_object,
- is_integer_object,
-)
-
-from pandas._libs.tslibs.ccalendar import (
- MONTH_ALIASES,
- MONTH_TO_CAL_NUM,
- int_to_weekday,
- weekday_to_int,
-)
-
-from pandas._libs.tslibs.ccalendar cimport (
- dayofweek,
- get_days_in_month,
- get_firstbday,
- get_lastbday,
-)
-from pandas._libs.tslibs.conversion cimport localize_pydatetime
-from pandas._libs.tslibs.dtypes cimport periods_per_day
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
-)
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- get_unit_from_dtype,
- npy_datetimestruct,
- npy_datetimestruct_to_datetime,
- pandas_datetime_to_datetimestruct,
- pydate_to_dtstruct,
-)
-
-from .dtypes cimport PeriodDtypeCode
-from .timedeltas cimport (
- _Timedelta,
- delta_to_nanoseconds,
- is_any_td_scalar,
-)
-
-from .timedeltas import Timedelta
-
-from .timestamps cimport _Timestamp
-
-from .timestamps import Timestamp
-
-# ---------------------------------------------------------------------
-# Misc Helpers
-
-cdef bint is_offset_object(object obj):
- return isinstance(obj, BaseOffset)
-
-
-cdef bint is_tick_object(object obj):
- return isinstance(obj, Tick)
-
-
-cdef datetime _as_datetime(datetime obj):
- if isinstance(obj, _Timestamp):
- return obj.to_pydatetime()
- return obj
-
-
-cdef bint _is_normalized(datetime dt):
- if dt.hour != 0 or dt.minute != 0 or dt.second != 0 or dt.microsecond != 0:
- # Regardless of whether dt is datetime vs Timestamp
- return False
- if isinstance(dt, _Timestamp):
- return dt.nanosecond == 0
- return True
-
-
-def apply_wrapper_core(func, self, other) -> ndarray:
- result = func(self, other)
- result = np.asarray(result)
-
- if self.normalize:
- # TODO: Avoid circular/runtime import
- from .vectorized import normalize_i8_timestamps
- reso = get_unit_from_dtype(other.dtype)
- result = normalize_i8_timestamps(result.view("i8"), None, reso=reso)
-
- return result
-
-
-def apply_array_wraps(func):
- # Note: normally we would use `@functools.wraps(func)`, but this does
- # not play nicely with cython class methods
- def wrapper(self, other) -> np.ndarray:
- # other is a DatetimeArray
- result = apply_wrapper_core(func, self, other)
- return result
-
- # do @functools.wraps(func) manually since it doesn't work on cdef funcs
- wrapper.__name__ = func.__name__
- wrapper.__doc__ = func.__doc__
- return wrapper
-
-
-def apply_wraps(func):
- # Note: normally we would use `@functools.wraps(func)`, but this does
- # not play nicely with cython class methods
-
- def wrapper(self, other):
-
- if other is NaT:
- return NaT
- elif (
- isinstance(other, BaseOffset)
- or PyDelta_Check(other)
- or util.is_timedelta64_object(other)
- ):
- # timedelta path
- return func(self, other)
- elif is_datetime64_object(other) or PyDate_Check(other):
- # PyDate_Check includes date, datetime
- other = Timestamp(other)
- else:
- # This will end up returning NotImplemented back in __add__
- raise ApplyTypeError
-
- tz = other.tzinfo
- nano = other.nanosecond
-
- if self._adjust_dst:
- other = other.tz_localize(None)
-
- result = func(self, other)
-
- result2 = Timestamp(result).as_unit(other.unit)
- if result == result2:
- # i.e. the conversion is non-lossy, not the case for e.g.
- # test_milliseconds_combination
- result = result2
-
- if self._adjust_dst:
- result = result.tz_localize(tz)
-
- if self.normalize:
- result = result.normalize()
-
- # If the offset object does not have a nanoseconds component,
- # the result's nanosecond component may be lost.
- if not self.normalize and nano != 0 and not hasattr(self, "nanoseconds"):
- if result.nanosecond != nano:
- if result.tz is not None:
- # convert to UTC
- res = result.tz_localize(None)
- else:
- res = result
- value = res.as_unit("ns")._value
- result = Timestamp(value + nano)
-
- if tz is not None and result.tzinfo is None:
- result = result.tz_localize(tz)
-
- return result
-
- # do @functools.wraps(func) manually since it doesn't work on cdef funcs
- wrapper.__name__ = func.__name__
- wrapper.__doc__ = func.__doc__
- return wrapper
-
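# A minimal sketch (not part of the original module) of the behaviour the
# apply_wraps decorator above provides: NaT passes through untouched and plain
# dates are promoted to Timestamp before the offset is applied. Assumes pandas
# is installed; MonthEnd (defined elsewhere in pandas) is used only as a
# convenient concrete offset.
import datetime
import pandas as pd

off = pd.offsets.MonthEnd()
res_nat = pd.NaT + off                        # stays NaT
res_date = datetime.date(2023, 1, 15) + off   # Timestamp for 2023-01-31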
-
-cdef _wrap_timedelta_result(result):
- """
- Tick operations dispatch to their Timedelta counterparts. Wrap the result
- of these operations in a Tick if possible.
-
- Parameters
- ----------
- result : object
-
- Returns
- -------
- object
- """
- if PyDelta_Check(result):
- # convert Timedelta back to a Tick
- return delta_to_tick(result)
-
- return result
-
-# ---------------------------------------------------------------------
-# Business Helpers
-
-
-cdef _get_calendar(weekmask, holidays, calendar):
- """
- Generate busdaycalendar
- """
- if isinstance(calendar, np.busdaycalendar):
- if not holidays:
- holidays = tuple(calendar.holidays)
- elif not isinstance(holidays, tuple):
- holidays = tuple(holidays)
- else:
- # trust that calendar.holidays and holidays are
- # consistent
- pass
- return calendar, holidays
-
- if holidays is None:
- holidays = []
- try:
- holidays = holidays + calendar.holidays().tolist()
- except AttributeError:
- pass
- holidays = [_to_dt64D(dt) for dt in holidays]
- holidays = tuple(sorted(holidays))
-
- kwargs = {"weekmask": weekmask}
- if holidays:
- kwargs["holidays"] = holidays
-
- busdaycalendar = np.busdaycalendar(**kwargs)
- return busdaycalendar, holidays
-
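# Rough sketch of what _get_calendar above ultimately builds: the
# weekmask/holidays inputs collapse into a numpy busdaycalendar. Illustrative
# only; assumes numpy is installed.
import numpy as np

cal = np.busdaycalendar(weekmask="Mon Tue Wed Thu Fri",
                        holidays=["2024-01-01"])
np.is_busday("2024-01-01", busdaycal=cal)   # False: listed as a holiday
np.is_busday("2024-01-02", busdaycal=cal)   # True: a regular Tuesday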
-
-cdef _to_dt64D(dt):
- # Currently
- # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]')
- # numpy.datetime64('2013-05-01T02:00:00.000000+0200')
- # Thus astype is needed to cast datetime to datetime64[D]
- if getattr(dt, "tzinfo", None) is not None:
- # Get the nanosecond timestamp,
- # equiv `Timestamp(dt).value` or `dt.timestamp() * 10**9`
- # The `naive` must be the `dt` naive wall time
- # instead of the naive absolute time (GH#49441)
- naive = dt.replace(tzinfo=None)
- dt = np.datetime64(naive, "D")
- else:
- dt = np.datetime64(dt)
- if dt.dtype.name != "datetime64[D]":
- dt = dt.astype("datetime64[D]")
- return dt
-
-
-# ---------------------------------------------------------------------
-# Validation
-
-
-cdef _validate_business_time(t_input):
- if isinstance(t_input, str):
- try:
- t = time.strptime(t_input, "%H:%M")
- return dt_time(hour=t.tm_hour, minute=t.tm_min)
- except ValueError:
- raise ValueError("time data must match '%H:%M' format")
- elif isinstance(t_input, dt_time):
- if t_input.second != 0 or t_input.microsecond != 0:
- raise ValueError(
- "time data must be specified only with hour and minute")
- return t_input
- else:
- raise ValueError("time data must be string or datetime.time")
-
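# The validator above only accepts "%H:%M" strings or datetime.time values with
# zero seconds/microseconds; these are the inputs that BusinessHour's start/end
# parameters take. A small, purely illustrative sketch (assumes pandas is
# installed):
import datetime
import pandas as pd

bh_str = pd.offsets.BusinessHour(start="08:30", end="16:00")               # string form
bh_time = pd.offsets.BusinessHour(start=datetime.time(8, 30), end="16:00")  # time form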
-
-# ---------------------------------------------------------------------
-# Constructor Helpers
-
-_relativedelta_kwds = {"years", "months", "weeks", "days", "year", "month",
- "day", "weekday", "hour", "minute", "second",
- "microsecond", "millisecond", "nanosecond",
- "nanoseconds", "hours", "minutes", "seconds",
- "milliseconds", "microseconds"}
-
-
-cdef _determine_offset(kwds):
- if not kwds:
- # GH 45643/45890: (historically) defaults to 1 day
- return timedelta(days=1), False
-
- if "millisecond" in kwds:
- raise NotImplementedError(
- "Using DateOffset to replace `millisecond` component in "
- "datetime object is not supported. Use "
- "`microsecond=timestamp.microsecond % 1000 + ms * 1000` "
- "instead."
- )
-
- nanos = {"nanosecond", "nanoseconds"}
-
- # nanos are handled by apply_wraps
- if all(k in nanos for k in kwds):
- return timedelta(days=0), False
-
- kwds_no_nanos = {k: v for k, v in kwds.items() if k not in nanos}
-
- kwds_use_relativedelta = {
- "year", "month", "day", "hour", "minute",
- "second", "microsecond", "weekday", "years", "months", "weeks", "days",
- "hours", "minutes", "seconds", "microseconds"
- }
-
- # "weeks" and "days" are left out despite being valid args for timedelta,
- # because (historically) timedelta is used only for sub-daily.
- kwds_use_timedelta = {
- "seconds", "microseconds", "milliseconds", "minutes", "hours",
- }
-
- if all(k in kwds_use_timedelta for k in kwds_no_nanos):
- # Sub-daily offset - use timedelta (tz-aware)
- # This also handles "milliseconds" (plur): see GH 49897
- return timedelta(**kwds_no_nanos), False
-
- # convert milliseconds to microseconds, so relativedelta can parse it
- if "milliseconds" in kwds_no_nanos:
- micro = kwds_no_nanos.pop("milliseconds") * 1000
- kwds_no_nanos["microseconds"] = kwds_no_nanos.get("microseconds", 0) + micro
-
- if all(k in kwds_use_relativedelta for k in kwds_no_nanos):
- return relativedelta(**kwds_no_nanos), True
-
- raise ValueError(
- f"Invalid argument/s or bad combination of arguments: {list(kwds.keys())}"
- )
-
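# Sketch of the keyword routing performed by _determine_offset above: purely
# sub-daily keywords go through datetime.timedelta, while calendar-aware
# keywords go through dateutil's relativedelta. Illustrative only; assumes
# pandas is installed.
import pandas as pd

ts = pd.Timestamp("2017-01-31 12:00")
after_3h = ts + pd.DateOffset(hours=3)    # timedelta path: 2017-01-31 15:00
after_1mo = ts + pd.DateOffset(months=1)  # relativedelta path: 2017-02-28 12:00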
-# ---------------------------------------------------------------------
-# Mixins & Singletons
-
-
-class ApplyTypeError(TypeError):
- # sentinel class for catching the apply error to return NotImplemented
- pass
-
-
-# ---------------------------------------------------------------------
-# Base Classes
-
-cdef class BaseOffset:
- """
- Base class for DateOffset methods that are not overridden by subclasses.
-
- Parameters
- ----------
- n : int
- Number of multiples of the frequency.
-
- normalize : bool
- Whether the frequency can align with midnight.
-
- Examples
- --------
- >>> pd.offsets.Hour(5).n
- 5
- >>> pd.offsets.Hour(5).normalize
- False
- """
- # ensure that reversed-ops with numpy scalars return NotImplemented
- __array_priority__ = 1000
-
- _day_opt = None
- _attributes = tuple(["n", "normalize"])
- _use_relativedelta = False
- _adjust_dst = True
-
- # cdef readonly:
- # int64_t n
- # bint normalize
- # dict _cache
-
- def __init__(self, n=1, normalize=False):
- n = self._validate_n(n)
- self.n = n
- self.normalize = normalize
- self._cache = {}
-
- def __eq__(self, other) -> bool:
- if isinstance(other, str):
- try:
- # GH#23524 if to_offset fails, we are dealing with an
- # incomparable type so == is False and != is True
- other = to_offset(other)
- except ValueError:
- # e.g. "infer"
- return False
- try:
- return self._params == other._params
- except AttributeError:
- # other is not a DateOffset object
- return False
-
- def __ne__(self, other):
- return not self == other
-
- def __hash__(self) -> int:
- return hash(self._params)
-
- @cache_readonly
- def _params(self):
- """
- Returns a tuple containing all of the attributes needed to evaluate
- equality between two DateOffset objects.
- """
- d = getattr(self, "__dict__", {})
- all_paras = d.copy()
- all_paras["n"] = self.n
- all_paras["normalize"] = self.normalize
- for attr in self._attributes:
- if hasattr(self, attr) and attr not in d:
- # cython attributes are not in __dict__
- all_paras[attr] = getattr(self, attr)
-
- if "holidays" in all_paras and not all_paras["holidays"]:
- all_paras.pop("holidays")
- exclude = ["kwds", "name", "calendar"]
- attrs = [(k, v) for k, v in all_paras.items()
- if (k not in exclude) and (k[0] != "_")]
- attrs = sorted(set(attrs))
- params = tuple([str(type(self))] + attrs)
- return params
-
- @property
- def kwds(self) -> dict:
- """
- Return a dict of extra parameters for the offset.
-
- Examples
- --------
- >>> pd.DateOffset(5).kwds
- {}
-
- >>> pd.offsets.FY5253Quarter().kwds
- {'weekday': 0,
- 'startingMonth': 1,
- 'qtr_with_extra_week': 1,
- 'variation': 'nearest'}
- """
- # for backwards-compatibility
- kwds = {name: getattr(self, name, None) for name in self._attributes
- if name not in ["n", "normalize"]}
- return {name: kwds[name] for name in kwds if kwds[name] is not None}
-
- @property
- def base(self):
- """
- Returns a copy of the calling offset object with n=1 and all other
- attributes equal.
- """
- return type(self)(n=1, normalize=self.normalize, **self.kwds)
-
- def __add__(self, other):
- if not isinstance(self, BaseOffset):
- # cython semantics; this is __radd__
- # TODO(cython3): remove this, this moved to __radd__
- return other.__add__(self)
-
- elif util.is_array(other) and other.dtype == object:
- return np.array([self + x for x in other])
-
- try:
- return self._apply(other)
- except ApplyTypeError:
- return NotImplemented
-
- def __radd__(self, other):
- return self.__add__(other)
-
- def __sub__(self, other):
- if PyDateTime_Check(other):
- raise TypeError("Cannot subtract datetime from offset.")
- elif type(other) == type(self):
- return type(self)(self.n - other.n, normalize=self.normalize,
- **self.kwds)
- elif not isinstance(self, BaseOffset):
- # TODO(cython3): remove, this moved to __rsub__
- # cython semantics, this is __rsub__
- return (-other).__add__(self)
- else:
- # e.g. PeriodIndex
- return NotImplemented
-
- def __rsub__(self, other):
- return (-self).__add__(other)
-
- def __mul__(self, other):
- if util.is_array(other):
- return np.array([self * x for x in other])
- elif is_integer_object(other):
- return type(self)(n=other * self.n, normalize=self.normalize,
- **self.kwds)
- elif not isinstance(self, BaseOffset):
- # TODO(cython3): remove this, this moved to __rmul__
- # cython semantics, this is __rmul__
- return other.__mul__(self)
- return NotImplemented
-
- def __rmul__(self, other):
- return self.__mul__(other)
-
- def __neg__(self):
- # Note: we are deferring directly to __mul__ instead of __rmul__, as
- # that allows us to use methods that can go in a `cdef class`
- return self * -1
-
- def copy(self):
- # Note: we are deferring directly to __mul__ instead of __rmul__, as
- # that allows us to use methods that can go in a `cdef class`
- """
- Return a copy of the frequency.
-
- Examples
- --------
- >>> freq = pd.DateOffset(1)
- >>> freq_copy = freq.copy()
- >>> freq is freq_copy
- False
- """
- return self * 1
-
- # ------------------------------------------------------------------
- # Name and Rendering Methods
-
- def __repr__(self) -> str:
- # _output_name used by B(Year|Quarter)(End|Begin) to
- # expand "B" -> "Business"
- class_name = getattr(self, "_output_name", type(self).__name__)
-
- if abs(self.n) != 1:
- plural = "s"
- else:
- plural = ""
-
- n_str = ""
- if self.n != 1:
- n_str = f"{self.n} * "
-
- out = f"<{n_str}{class_name}{plural}{self._repr_attrs()}>"
- return out
-
- def _repr_attrs(self) -> str:
- exclude = {"n", "inc", "normalize"}
- attrs = []
- for attr in sorted(self._attributes):
- # _attributes instead of __dict__ because cython attrs are not in __dict__
- if attr.startswith("_") or attr == "kwds" or not hasattr(self, attr):
- # DateOffset may not have some of these attributes
- continue
- elif attr not in exclude:
- value = getattr(self, attr)
- attrs.append(f"{attr}={value}")
-
- out = ""
- if attrs:
- out += ": " + ", ".join(attrs)
- return out
-
- @property
- def name(self) -> str:
- """
- Return a string representing the base frequency.
-
- Examples
- --------
- >>> pd.offsets.Hour().name
- 'H'
-
- >>> pd.offsets.Hour(5).name
- 'H'
- """
- return self.rule_code
-
- @property
- def _prefix(self) -> str:
- raise NotImplementedError("Prefix not defined")
-
- @property
- def rule_code(self) -> str:
- return self._prefix
-
- @cache_readonly
- def freqstr(self) -> str:
- """
- Return a string representing the frequency.
-
- Examples
- --------
- >>> pd.DateOffset(5).freqstr
- '<5 * DateOffsets>'
-
- >>> pd.offsets.BusinessHour(2).freqstr
- '2BH'
-
- >>> pd.offsets.Nano().freqstr
- 'N'
-
- >>> pd.offsets.Nano(-3).freqstr
- '-3N'
- """
- try:
- code = self.rule_code
- except NotImplementedError:
- return str(repr(self))
-
- if self.n != 1:
- fstr = f"{self.n}{code}"
- else:
- fstr = code
-
- try:
- if self._offset:
- fstr += self._offset_str()
- except AttributeError:
- # TODO: standardize `_offset` vs `offset` naming convention
- pass
-
- return fstr
-
- def _offset_str(self) -> str:
- return ""
-
- # ------------------------------------------------------------------
-
- def _apply(self, other):
- raise NotImplementedError("implemented by subclasses")
-
- @apply_array_wraps
- def _apply_array(self, dtarr):
- raise NotImplementedError(
- f"DateOffset subclass {type(self).__name__} "
- "does not have a vectorized implementation"
- )
-
- def rollback(self, dt) -> datetime:
- """
- Roll provided date backward to next offset only if not on offset.
-
- Returns
- -------
- Timestamp
- Rolled timestamp if not on offset, otherwise unchanged timestamp.
- """
- dt = Timestamp(dt)
- if not self.is_on_offset(dt):
- dt = dt - type(self)(1, normalize=self.normalize, **self.kwds)
- return dt
-
- def rollforward(self, dt) -> datetime:
- """
- Roll provided date forward to next offset only if not on offset.
-
- Returns
- -------
- Timestamp
- Rolled timestamp if not on offset, otherwise unchanged timestamp.
- """
- dt = Timestamp(dt)
- if not self.is_on_offset(dt):
- dt = dt + type(self)(1, normalize=self.normalize, **self.kwds)
- return dt
-
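# Quick illustration of the rollback/rollforward semantics documented above
# (assumes pandas is installed): under BusinessDay, a Saturday rolls back to
# Friday and forward to Monday.
import pandas as pd

ts = pd.Timestamp("2022-08-06")   # a Saturday
bday = pd.offsets.BusinessDay()
prev_bday = bday.rollback(ts)     # Timestamp('2022-08-05 00:00:00')
next_bday = bday.rollforward(ts)  # Timestamp('2022-08-08 00:00:00')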
- def _get_offset_day(self, other: datetime) -> int:
- # subclass must implement `_day_opt`; calling from the base class
- # will implicitly assume day_opt = "business_end", see get_day_of_month.
- cdef:
- npy_datetimestruct dts
- pydate_to_dtstruct(other, &dts)
- return get_day_of_month(&dts, self._day_opt)
-
- def is_on_offset(self, dt: datetime) -> bool:
- """
- Return boolean whether a timestamp intersects with this frequency.
-
- Parameters
- ----------
- dt : datetime.datetime
- Timestamp to check intersections with frequency.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> freq = pd.offsets.Day(1)
- >>> freq.is_on_offset(ts)
- True
-
- >>> ts = pd.Timestamp(2022, 8, 6)
- >>> ts.day_name()
- 'Saturday'
- >>> freq = pd.offsets.BusinessDay(1)
- >>> freq.is_on_offset(ts)
- False
- """
- if self.normalize and not _is_normalized(dt):
- return False
-
- # Default (slow) method for determining if some date is a member of the
- # date range generated by this offset. Subclasses may have this
- # re-implemented in a nicer way.
- a = dt
- b = (dt + self) - self
- return a == b
-
- # ------------------------------------------------------------------
-
- # Staticmethod so we can call from Tick.__init__, will be unnecessary
- # once BaseOffset is a cdef class and is inherited by Tick
- @staticmethod
- def _validate_n(n) -> int:
- """
- Require that `n` be an integer.
-
- Parameters
- ----------
- n : int
-
- Returns
- -------
- nint : int
-
- Raises
- ------
- TypeError if `int(n)` raises
- ValueError if n != int(n)
- """
- if util.is_timedelta64_object(n):
- raise TypeError(f"`n` argument must be an integer, got {type(n)}")
- try:
- nint = int(n)
- except (ValueError, TypeError):
- raise TypeError(f"`n` argument must be an integer, got {type(n)}")
- if n != nint:
- raise ValueError(f"`n` argument must be an integer, got {n}")
- return nint
-
- def __setstate__(self, state):
- """
- Reconstruct an instance from a pickled state
- """
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self._cache = state.pop("_cache", {})
- # At this point we expect state to be empty
-
- def __getstate__(self):
- """
- Return a pickleable state
- """
- state = {}
- state["n"] = self.n
- state["normalize"] = self.normalize
-
- # we don't want to actually pickle the calendar object
- # as it's an np.busdaycalendar; we recreate it on deserialization
- state.pop("calendar", None)
- if "kwds" in state:
- state["kwds"].pop("calendar", None)
-
- return state
-
- @property
- def nanos(self):
- raise ValueError(f"{self} is a non-fixed frequency")
-
- def is_anchored(self) -> bool:
- # TODO: Does this make sense for the general case? It would help
- # if there were a canonical docstring for what is_anchored means.
- """
- Return boolean whether the frequency is a unit frequency (n=1).
-
- Examples
- --------
- >>> pd.DateOffset().is_anchored()
- True
- >>> pd.DateOffset(2).is_anchored()
- False
- """
- return self.n == 1
-
- # ------------------------------------------------------------------
-
- def is_month_start(self, _Timestamp ts):
- """
- Return boolean whether a timestamp occurs on the month start.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> freq = pd.offsets.Hour(5)
- >>> freq.is_month_start(ts)
- True
- """
- return ts._get_start_end_field("is_month_start", self)
-
- def is_month_end(self, _Timestamp ts):
- """
- Return boolean whether a timestamp occurs on the month end.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> freq = pd.offsets.Hour(5)
- >>> freq.is_month_end(ts)
- False
- """
- return ts._get_start_end_field("is_month_end", self)
-
- def is_quarter_start(self, _Timestamp ts):
- """
- Return boolean whether a timestamp occurs on the quarter start.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> freq = pd.offsets.Hour(5)
- >>> freq.is_quarter_start(ts)
- True
- """
- return ts._get_start_end_field("is_quarter_start", self)
-
- def is_quarter_end(self, _Timestamp ts):
- """
- Return boolean whether a timestamp occurs on the quarter end.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> freq = pd.offsets.Hour(5)
- >>> freq.is_quarter_end(ts)
- False
- """
- return ts._get_start_end_field("is_quarter_end", self)
-
- def is_year_start(self, _Timestamp ts):
- """
- Return boolean whether a timestamp occurs on the year start.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> freq = pd.offsets.Hour(5)
- >>> freq.is_year_start(ts)
- True
- """
- return ts._get_start_end_field("is_year_start", self)
-
- def is_year_end(self, _Timestamp ts):
- """
- Return boolean whether a timestamp occurs on the year end.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> freq = pd.offsets.Hour(5)
- >>> freq.is_year_end(ts)
- False
- """
- return ts._get_start_end_field("is_year_end", self)
-
-
-cdef class SingleConstructorOffset(BaseOffset):
- @classmethod
- def _from_name(cls, suffix=None):
- # default _from_name calls cls with no args
- if suffix:
- raise ValueError(f"Bad freq suffix {suffix}")
- return cls()
-
- def __reduce__(self):
- # This __reduce__ implementation is for all BaseOffset subclasses
- # except for RelativeDeltaOffset
- # np.busdaycalendar objects do not pickle nicely, but we can reconstruct
- # from attributes that do get pickled.
- tup = tuple(
- getattr(self, attr) if attr != "calendar" else None
- for attr in self._attributes
- )
- return type(self), tup
-
-
-# ---------------------------------------------------------------------
-# Tick Offsets
-
-cdef class Tick(SingleConstructorOffset):
- _adjust_dst = False
- _prefix = "undefined"
- _td64_unit = "undefined"
- _attributes = tuple(["n", "normalize"])
-
- def __init__(self, n=1, normalize=False):
- n = self._validate_n(n)
- self.n = n
- self.normalize = False
- self._cache = {}
- if normalize:
- # GH#21427
- raise ValueError(
- "Tick offset with `normalize=True` are not allowed."
- )
-
- # Note: Without making this cpdef, we get AttributeError when calling
- # from __mul__
- cpdef Tick _next_higher_resolution(Tick self):
- if type(self) is Day:
- return Hour(self.n * 24)
- if type(self) is Hour:
- return Minute(self.n * 60)
- if type(self) is Minute:
- return Second(self.n * 60)
- if type(self) is Second:
- return Milli(self.n * 1000)
- if type(self) is Milli:
- return Micro(self.n * 1000)
- if type(self) is Micro:
- return Nano(self.n * 1000)
- raise ValueError("Could not convert to integer offset at any resolution")
-
- # --------------------------------------------------------------------
-
- def _repr_attrs(self) -> str:
- # Since cdef classes have no __dict__, we need to override
- return ""
-
- @property
- def delta(self):
- return self.n * Timedelta(self._nanos_inc)
-
- @property
- def nanos(self) -> int64_t:
- """
- Return an integer of the total number of nanoseconds.
-
- Raises
- ------
- ValueError
- If the frequency is non-fixed.
-
- Examples
- --------
- >>> pd.offsets.Hour(5).nanos
- 18000000000000
- """
- return self.n * self._nanos_inc
-
- def is_on_offset(self, dt: datetime) -> bool:
- return True
-
- def is_anchored(self) -> bool:
- return False
-
- # This is identical to BaseOffset.__hash__, but has to be redefined here
- # for Python 3, because we've redefined __eq__.
- def __hash__(self) -> int:
- return hash(self._params)
-
- # --------------------------------------------------------------------
- # Comparison and Arithmetic Methods
-
- def __eq__(self, other):
- if isinstance(other, str):
- try:
- # GH#23524 if to_offset fails, we are dealing with an
- # incomparable type so == is False and != is True
- other = to_offset(other)
- except ValueError:
- # e.g. "infer"
- return False
- return self.delta == other
-
- def __ne__(self, other):
- return not (self == other)
-
- def __le__(self, other):
- return self.delta.__le__(other)
-
- def __lt__(self, other):
- return self.delta.__lt__(other)
-
- def __ge__(self, other):
- return self.delta.__ge__(other)
-
- def __gt__(self, other):
- return self.delta.__gt__(other)
-
- def __mul__(self, other):
- if not isinstance(self, Tick):
- # TODO(cython3), remove this, this moved to __rmul__
- # cython semantics, this is __rmul__
- return other.__mul__(self)
- if is_float_object(other):
- n = other * self.n
- # If the new `n` is an integer, we can represent it using the
- # same Tick subclass as self, otherwise we need to move up
- # to a higher-resolution subclass
- if np.isclose(n % 1, 0):
- return type(self)(int(n))
- new_self = self._next_higher_resolution()
- return new_self * other
- return BaseOffset.__mul__(self, other)
-
- def __rmul__(self, other):
- return self.__mul__(other)
-
- def __truediv__(self, other):
- if not isinstance(self, Tick):
- # cython semantics mean the args are sometimes swapped
- result = other.delta.__rtruediv__(self)
- else:
- result = self.delta.__truediv__(other)
- return _wrap_timedelta_result(result)
-
- def __rtruediv__(self, other):
- result = self.delta.__rtruediv__(other)
- return _wrap_timedelta_result(result)
-
- def __add__(self, other):
- if not isinstance(self, Tick):
- # cython semantics; this is __radd__
- # TODO(cython3): remove this, this moved to __radd__
- return other.__add__(self)
-
- if isinstance(other, Tick):
- if type(self) == type(other):
- return type(self)(self.n + other.n)
- else:
- return delta_to_tick(self.delta + other.delta)
- try:
- return self._apply(other)
- except ApplyTypeError:
- # Includes pd.Period
- return NotImplemented
- except OverflowError as err:
- raise OverflowError(
- f"the add operation between {self} and {other} will overflow"
- ) from err
-
- def __radd__(self, other):
- return self.__add__(other)
-
- def _apply(self, other):
- # Timestamp can handle tz and nanoseconds, so there is no need to use apply_wraps
- if isinstance(other, _Timestamp):
- # GH#15126
- return other + self.delta
- elif other is NaT:
- return NaT
- elif is_datetime64_object(other) or PyDate_Check(other):
- # PyDate_Check includes date, datetime
- return Timestamp(other) + self
-
- if util.is_timedelta64_object(other) or PyDelta_Check(other):
- return other + self.delta
-
- raise ApplyTypeError(f"Unhandled type: {type(other).__name__}")
-
- # --------------------------------------------------------------------
- # Pickle Methods
-
- def __setstate__(self, state):
- self.n = state["n"]
- self.normalize = False
-
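# Sketch of the fractional-multiplication promotion implemented in
# Tick.__mul__ above: a non-integer multiple is re-expressed using the next
# higher resolution subclass. Illustrative only; assumes pandas is installed.
import pandas as pd

half_hour = pd.offsets.Hour(1) * 0.5    # Minute(30): 0.5 hours is not a whole Hour
two_and_a_half = pd.offsets.Second(1) * 2.5  # Milli(2500)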
-
-cdef class Day(Tick):
- _nanos_inc = 24 * 3600 * 1_000_000_000
- _prefix = "D"
- _td64_unit = "D"
- _period_dtype_code = PeriodDtypeCode.D
- _creso = NPY_DATETIMEUNIT.NPY_FR_D
-
-
-cdef class Hour(Tick):
- _nanos_inc = 3600 * 1_000_000_000
- _prefix = "H"
- _td64_unit = "h"
- _period_dtype_code = PeriodDtypeCode.H
- _creso = NPY_DATETIMEUNIT.NPY_FR_h
-
-
-cdef class Minute(Tick):
- _nanos_inc = 60 * 1_000_000_000
- _prefix = "T"
- _td64_unit = "m"
- _period_dtype_code = PeriodDtypeCode.T
- _creso = NPY_DATETIMEUNIT.NPY_FR_m
-
-
-cdef class Second(Tick):
- _nanos_inc = 1_000_000_000
- _prefix = "S"
- _td64_unit = "s"
- _period_dtype_code = PeriodDtypeCode.S
- _creso = NPY_DATETIMEUNIT.NPY_FR_s
-
-
-cdef class Milli(Tick):
- _nanos_inc = 1_000_000
- _prefix = "L"
- _td64_unit = "ms"
- _period_dtype_code = PeriodDtypeCode.L
- _creso = NPY_DATETIMEUNIT.NPY_FR_ms
-
-
-cdef class Micro(Tick):
- _nanos_inc = 1000
- _prefix = "U"
- _td64_unit = "us"
- _period_dtype_code = PeriodDtypeCode.U
- _creso = NPY_DATETIMEUNIT.NPY_FR_us
-
-
-cdef class Nano(Tick):
- _nanos_inc = 1
- _prefix = "N"
- _td64_unit = "ns"
- _period_dtype_code = PeriodDtypeCode.N
- _creso = NPY_DATETIMEUNIT.NPY_FR_ns
-
-
-def delta_to_tick(delta: timedelta) -> Tick:
- if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0:
- # nanoseconds only for pd.Timedelta
- if delta.seconds == 0:
- return Day(delta.days)
- else:
- seconds = delta.days * 86400 + delta.seconds
- if seconds % 3600 == 0:
- return Hour(seconds / 3600)
- elif seconds % 60 == 0:
- return Minute(seconds / 60)
- else:
- return Second(seconds)
- else:
- nanos = delta_to_nanoseconds(delta)
- if nanos % 1_000_000 == 0:
- return Milli(nanos // 1_000_000)
- elif nanos % 1000 == 0:
- return Micro(nanos // 1000)
- else: # pragma: no cover
- return Nano(nanos)
-
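# Small check of the timedelta -> Tick mapping implemented by delta_to_tick
# above (illustrative; the import path assumes this module is built as
# pandas._libs.tslibs.offsets):
from datetime import timedelta
from pandas._libs.tslibs.offsets import delta_to_tick

delta_to_tick(timedelta(days=2))           # <2 * Days>
delta_to_tick(timedelta(minutes=90))       # <90 * Minutes>
delta_to_tick(timedelta(milliseconds=5))   # <5 * Millis>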
-
-# --------------------------------------------------------------------
-
-cdef class RelativeDeltaOffset(BaseOffset):
- """
- DateOffset subclass backed by a dateutil relativedelta object.
- """
- _attributes = tuple(["n", "normalize"] + list(_relativedelta_kwds))
- _adjust_dst = False
-
- def __init__(self, n=1, normalize=False, **kwds):
- BaseOffset.__init__(self, n, normalize)
- off, use_rd = _determine_offset(kwds)
- object.__setattr__(self, "_offset", off)
- object.__setattr__(self, "_use_relativedelta", use_rd)
- for key in kwds:
- val = kwds[key]
- object.__setattr__(self, key, val)
-
- def __getstate__(self):
- """
- Return a pickleable state
- """
- # RelativeDeltaOffset (technically DateOffset) is the only non-cdef
- # class, so the only one with __dict__
- state = self.__dict__.copy()
- state["n"] = self.n
- state["normalize"] = self.normalize
- return state
-
- def __setstate__(self, state):
- """
- Reconstruct an instance from a pickled state
- """
-
- if "offset" in state:
- # Older (<0.22.0) versions have offset attribute instead of _offset
- if "_offset" in state: # pragma: no cover
- raise AssertionError("Unexpected key `_offset`")
- state["_offset"] = state.pop("offset")
- state["kwds"]["offset"] = state["_offset"]
-
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self._cache = state.pop("_cache", {})
-
- self.__dict__.update(state)
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- if self._use_relativedelta:
- other = _as_datetime(other)
-
- if len(self.kwds) > 0:
- tzinfo = getattr(other, "tzinfo", None)
- if tzinfo is not None and self._use_relativedelta:
- # perform calculation in UTC
- other = other.replace(tzinfo=None)
-
- if hasattr(self, "nanoseconds"):
- td_nano = Timedelta(nanoseconds=self.nanoseconds)
- else:
- td_nano = Timedelta(0)
-
- if self.n > 0:
- for i in range(self.n):
- other = other + self._offset + td_nano
- else:
- for i in range(-self.n):
- other = other - self._offset - td_nano
-
- if tzinfo is not None and self._use_relativedelta:
- # bring tz back from UTC calculation
- other = localize_pydatetime(other, tzinfo)
-
- return Timestamp(other)
- else:
- return other + timedelta(self.n)
-
- @apply_array_wraps
- def _apply_array(self, dtarr):
- reso = get_unit_from_dtype(dtarr.dtype)
- dt64other = np.asarray(dtarr)
- kwds = self.kwds
- relativedelta_fast = {
- "years",
- "months",
- "weeks",
- "days",
- "hours",
- "minutes",
- "seconds",
- "microseconds",
- }
- # relativedelta/_offset path only valid for base DateOffset
- if self._use_relativedelta and set(kwds).issubset(relativedelta_fast):
-
- months = (kwds.get("years", 0) * 12 + kwds.get("months", 0)) * self.n
- if months:
- shifted = shift_months(dt64other.view("i8"), months, reso=reso)
- dt64other = shifted.view(dtarr.dtype)
-
- weeks = kwds.get("weeks", 0) * self.n
- if weeks:
- delta = Timedelta(days=7 * weeks)
- td = (<_Timedelta>delta)._as_creso(reso)
- dt64other = dt64other + td
-
- timedelta_kwds = {
- k: v
- for k, v in kwds.items()
- if k in ["days", "hours", "minutes", "seconds", "microseconds"]
- }
- if timedelta_kwds:
- delta = Timedelta(**timedelta_kwds)
- td = (<_Timedelta>delta)._as_creso(reso)
- dt64other = dt64other + (self.n * td)
- return dt64other
- elif not self._use_relativedelta and hasattr(self, "_offset"):
- # timedelta
- num_nano = getattr(self, "nanoseconds", 0)
- if num_nano != 0:
- rem_nano = Timedelta(nanoseconds=num_nano)
- delta = Timedelta((self._offset + rem_nano) * self.n)
- else:
- delta = Timedelta(self._offset * self.n)
- td = (<_Timedelta>delta)._as_creso(reso)
- return dt64other + td
- else:
- # relativedelta with other keywords
- kwd = set(kwds) - relativedelta_fast
- raise NotImplementedError(
- "DateOffset with relativedelta "
- f"keyword(s) {kwd} not able to be "
- "applied vectorized"
- )
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- return True
-
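# The _apply_array fast path above is what a DatetimeIndex + DateOffset
# addition uses when only the "fast" relativedelta keywords are present.
# Illustrative sketch (assumes pandas is installed):
import pandas as pd

idx = pd.date_range("2023-01-31", periods=3, freq="D")
shifted = idx + pd.DateOffset(months=1)
# DatetimeIndex(['2023-02-28', '2023-03-01', '2023-03-02'], dtype='datetime64[ns]', freq=None)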
-
-class OffsetMeta(type):
- """
- Metaclass that allows us to pretend that all BaseOffset subclasses
- inherit from DateOffset (which is needed for backward-compatibility).
- """
-
- @classmethod
- def __instancecheck__(cls, obj) -> bool:
- return isinstance(obj, BaseOffset)
-
- @classmethod
- def __subclasscheck__(cls, obj) -> bool:
- return issubclass(obj, BaseOffset)
-
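# Consequence of the metaclass above: every concrete offset still looks like a
# DateOffset to isinstance/issubclass checks, preserving the historical public
# API. Illustrative only; assumes pandas is installed.
import pandas as pd

isinstance(pd.offsets.BusinessDay(), pd.DateOffset)   # True
issubclass(pd.offsets.Hour, pd.DateOffset)            # True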
-
-# TODO: figure out a way to use a metaclass with a cdef class
-class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta):
- """
- Standard kind of date increment used for a date range.
-
- Works exactly like the keyword argument form of relativedelta.
- Note that the positional argument form of relativedelta is not
- supported. Use of the keyword n is discouraged; you would be better
- off specifying n in the keywords you use, but regardless it is
- there for you. n is needed for DateOffset subclasses.
-
- DateOffset works as follows. Each offset specifies a set of dates
- that conform to the DateOffset. For example, Bday defines this
- set to be the set of dates that are weekdays (M-F). To test whether a
- date is in the set of a DateOffset ``dateOffset``, we can use the
- is_on_offset method: dateOffset.is_on_offset(date).
-
- If a date is not itself a valid date, the rollback and rollforward
- methods can be used to roll it to the nearest valid date
- before/after the date.
-
- DateOffsets can be created to move dates forward a given number of
- valid dates. For example, Bday(2) can be added to a date to move
- it two business days forward. If the date does not start on a
- valid date, first it is moved to a valid date. Thus pseudo code
- is::
-
- def __add__(date):
- date = rollback(date) # does nothing if date is valid
- return date + <n number of periods>
-
- When a date offset is created for a negative number of periods,
- the date is first rolled forward. The pseudo code is::
-
- def __add__(date):
- date = rollforward(date) # does nothing if date is valid
- return date + <n number of periods>
-
- Zero presents a problem. Should it roll forward or back? We
- arbitrarily have it roll forward:
-
- date + BDay(0) == BDay.rollforward(date)
-
- Since 0 is a bit weird, we suggest avoiding its use.
-
- In addition, adding a DateOffset specified by the singular form of a date
- component can be used to replace that component of the timestamp.
-
- Parameters
- ----------
- n : int, default 1
- The number of time periods the offset represents.
- If specified without a temporal pattern, defaults to n days.
- normalize : bool, default False
- Whether to round the result of a DateOffset addition down to the
- previous midnight.
- **kwds
- Temporal parameters that add to or replace the offset value.
-
- Parameters that **add** to the offset (like Timedelta):
-
- - years
- - months
- - weeks
- - days
- - hours
- - minutes
- - seconds
- - milliseconds
- - microseconds
- - nanoseconds
-
- Parameters that **replace** the offset value:
-
- - year
- - month
- - day
- - weekday
- - hour
- - minute
- - second
- - microsecond
- - nanosecond
-
- See Also
- --------
- dateutil.relativedelta.relativedelta : The relativedelta type is designed
- to be applied to an existing datetime and can replace specific components of
- that datetime, or represent an interval of time.
-
- Examples
- --------
- >>> from pandas.tseries.offsets import DateOffset
- >>> ts = pd.Timestamp('2017-01-01 09:10:11')
- >>> ts + DateOffset(months=3)
- Timestamp('2017-04-01 09:10:11')
-
- >>> ts = pd.Timestamp('2017-01-01 09:10:11')
- >>> ts + DateOffset(months=2)
- Timestamp('2017-03-01 09:10:11')
- >>> ts + DateOffset(day=31)
- Timestamp('2017-01-31 09:10:11')
-
- >>> ts + pd.DateOffset(hour=8)
- Timestamp('2017-01-01 08:10:11')
- """
- def __setattr__(self, name, value):
- raise AttributeError("DateOffset objects are immutable.")
-
-# --------------------------------------------------------------------
-
-
-cdef class BusinessMixin(SingleConstructorOffset):
- """
- Mixin to business types to provide related functions.
- """
-
- cdef readonly:
- timedelta _offset
- # Only Custom subclasses use weekmask, holiday, calendar
- object weekmask, holidays, calendar
-
- def __init__(self, n=1, normalize=False, offset=timedelta(0)):
- BaseOffset.__init__(self, n, normalize)
- self._offset = offset
-
- cpdef _init_custom(self, weekmask, holidays, calendar):
- """
- Additional __init__ for Custom subclasses.
- """
- calendar, holidays = _get_calendar(
- weekmask=weekmask, holidays=holidays, calendar=calendar
- )
- # Custom offset instances are identified by the following two
- # attributes: holidays and weekmask. See DateOffset._params().
- self.weekmask = weekmask
- self.holidays = holidays
- self.calendar = calendar
-
- @property
- def offset(self):
- """
- Alias for self._offset.
- """
- # Alias for backward compat
- return self._offset
-
- def _repr_attrs(self) -> str:
- if self.offset:
- attrs = [f"offset={repr(self.offset)}"]
- else:
- attrs = []
- out = ""
- if attrs:
- out += ": " + ", ".join(attrs)
- return out
-
- cpdef __setstate__(self, state):
- # We need to use a cdef/cpdef method to set the readonly _offset attribute
- if "_offset" in state:
- self._offset = state.pop("_offset")
- elif "offset" in state:
- # Older (<0.22.0) versions have offset attribute instead of _offset
- self._offset = state.pop("offset")
-
- if self._prefix.startswith("C"):
- # i.e. this is a Custom class
- weekmask = state.pop("weekmask")
- holidays = state.pop("holidays")
- calendar, holidays = _get_calendar(weekmask=weekmask,
- holidays=holidays,
- calendar=None)
- self.weekmask = weekmask
- self.calendar = calendar
- self.holidays = holidays
-
- BaseOffset.__setstate__(self, state)
-
-
-cdef class BusinessDay(BusinessMixin):
- """
- DateOffset subclass representing possibly n business days.
-
- Parameters
- ----------
- n : int, default 1
- The number of days represented.
- normalize : bool, default False
- Normalize start/end dates to midnight.
-
- Examples
- --------
- You can use the parameter ``n`` to represent a shift of n business days.
-
- >>> ts = pd.Timestamp(2022, 12, 9, 15)
- >>> ts.strftime('%a %d %b %Y %H:%M')
- 'Fri 09 Dec 2022 15:00'
- >>> (ts + pd.offsets.BusinessDay(n=5)).strftime('%a %d %b %Y %H:%M')
- 'Fri 16 Dec 2022 15:00'
-
- Passing ``normalize=True`` shifts the start of the next business day
- to midnight.
-
- >>> ts = pd.Timestamp(2022, 12, 9, 15)
- >>> ts + pd.offsets.BusinessDay(normalize=True)
- Timestamp('2022-12-12 00:00:00')
- """
- _period_dtype_code = PeriodDtypeCode.B
- _prefix = "B"
- _attributes = tuple(["n", "normalize", "offset"])
-
- cpdef __setstate__(self, state):
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- if "_offset" in state:
- self._offset = state.pop("_offset")
- elif "offset" in state:
- self._offset = state.pop("offset")
- self._cache = state.pop("_cache", {})
-
- def _offset_str(self) -> str:
- def get_str(td):
- off_str = ""
- if td.days > 0:
- off_str += str(td.days) + "D"
- if td.seconds > 0:
- s = td.seconds
- hrs = int(s / 3600)
- if hrs != 0:
- off_str += str(hrs) + "H"
- s -= hrs * 3600
- mts = int(s / 60)
- if mts != 0:
- off_str += str(mts) + "Min"
- s -= mts * 60
- if s != 0:
- off_str += str(s) + "s"
- if td.microseconds > 0:
- off_str += str(td.microseconds) + "us"
- return off_str
-
- if PyDelta_Check(self.offset):
- zero = timedelta(0, 0, 0)
- if self.offset >= zero:
- off_str = "+" + get_str(self.offset)
- else:
- off_str = "-" + get_str(-self.offset)
- return off_str
- else:
- return "+" + repr(self.offset)
-
- @apply_wraps
- def _apply(self, other):
- if PyDateTime_Check(other):
- n = self.n
- wday = other.weekday()
-
- # avoid slowness below by operating on weeks first
- weeks = n // 5
- days = self._adjust_ndays(wday, weeks)
-
- result = other + timedelta(days=7 * weeks + days)
- if self.offset:
- result = result + self.offset
- return result
-
- elif is_any_td_scalar(other):
- td = Timedelta(self.offset) + other
- return BusinessDay(
- self.n, offset=td.to_pytimedelta(), normalize=self.normalize
- )
- else:
- raise ApplyTypeError(
- "Only know how to combine business day with datetime or timedelta."
- )
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef ndarray _shift_bdays(
- self,
- ndarray i8other,
- NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns,
- ):
- """
- Implementation used by BusinessDay._apply_array.
-
- Parameters
- ----------
- i8other : const int64_t[:]
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- ndarray[int64_t]
- """
- cdef:
- int periods = self.n
- Py_ssize_t i, n = i8other.size
- ndarray result = cnp.PyArray_EMPTY(
- i8other.ndim, i8other.shape, cnp.NPY_INT64, 0
- )
- int64_t val, res_val
- int wday, days
- npy_datetimestruct dts
- int64_t DAY_PERIODS = periods_per_day(reso)
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, i8other)
-
- for i in range(n):
- # Analogous to: val = i8other[i]
- val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if val == NPY_NAT:
- res_val = NPY_NAT
- else:
- # The rest of this is effectively a copy of BusinessDay.apply
- weeks = periods // 5
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- wday = dayofweek(dts.year, dts.month, dts.day)
-
- days = self._adjust_ndays(wday, weeks)
- res_val = val + (7 * weeks + days) * DAY_PERIODS
-
- # Analogous to: out[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return result
-
- cdef int _adjust_ndays(self, int wday, int weeks):
- cdef:
- int n = self.n
- int days
-
- if n <= 0 and wday > 4:
- # roll forward
- n += 1
-
- n -= 5 * weeks
-
- # n is always >= 0 at this point
- if n == 0 and wday > 4:
- # roll back
- days = 4 - wday
- elif wday > 4:
- # roll forward
- days = (7 - wday) + (n - 1)
- elif wday + n <= 4:
- # shift by n days without leaving the current week
- days = n
- else:
- # shift by n days plus 2 to get past the weekend
- days = n + 2
- return days
-
- @apply_array_wraps
- def _apply_array(self, dtarr):
- i8other = dtarr.view("i8")
- reso = get_unit_from_dtype(dtarr.dtype)
- res = self._shift_bdays(i8other, reso=reso)
- if self.offset:
- res = res.view(dtarr.dtype) + Timedelta(self.offset)
- res = res.view("i8")
- return res
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- return dt.weekday() < 5
-
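# Sketch of the vectorized shift implemented by _shift_bdays above: for n=1,
# a Friday and the following weekend dates all land on the same Monday.
# Illustrative only; assumes pandas is installed.
import pandas as pd

idx = pd.DatetimeIndex(["2023-09-01", "2023-09-02", "2023-09-03"])  # Fri, Sat, Sun
shifted = idx + pd.offsets.BusinessDay()
# DatetimeIndex(['2023-09-04', '2023-09-04', '2023-09-04'], dtype='datetime64[ns]', freq=None)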
-
-cdef class BusinessHour(BusinessMixin):
- """
- DateOffset subclass representing possibly n business hours.
-
- Parameters
- ----------
- n : int, default 1
- The number of hours represented.
- normalize : bool, default False
- Normalize start/end dates to midnight before generating date range.
- start : str, time, or list of str/time, default "09:00"
- Start time of your custom business hour in 24h format.
- end : str, time, or list of str/time, default "17:00"
- End time of your custom business hour in 24h format.
-
- Examples
- --------
- You can use the parameter ``n`` to represent a shift of n hours.
-
- >>> ts = pd.Timestamp(2022, 12, 9, 8)
- >>> ts + pd.offsets.BusinessHour(n=5)
- Timestamp('2022-12-09 14:00:00')
-
- You can also change the start and the end of business hours.
-
- >>> ts = pd.Timestamp(2022, 8, 5, 16)
- >>> ts + pd.offsets.BusinessHour(start="11:00")
- Timestamp('2022-08-08 11:00:00')
-
- >>> from datetime import time as dt_time
- >>> ts = pd.Timestamp(2022, 8, 5, 22)
- >>> ts + pd.offsets.BusinessHour(end=dt_time(19, 0))
- Timestamp('2022-08-08 10:00:00')
-
- Passing ``normalize=True`` shifts the start of the next business hour
- to midnight.
-
- >>> ts = pd.Timestamp(2022, 12, 9, 8)
- >>> ts + pd.offsets.BusinessHour(normalize=True)
- Timestamp('2022-12-09 00:00:00')
-
- You can divide your business day hours into several parts.
-
- >>> import datetime as dt
- >>> freq = pd.offsets.BusinessHour(start=["06:00", "10:00", "15:00"],
- ... end=["08:00", "12:00", "17:00"])
- >>> pd.date_range(dt.datetime(2022, 12, 9), dt.datetime(2022, 12, 13), freq=freq)
- DatetimeIndex(['2022-12-09 06:00:00', '2022-12-09 07:00:00',
- '2022-12-09 10:00:00', '2022-12-09 11:00:00',
- '2022-12-09 15:00:00', '2022-12-09 16:00:00',
- '2022-12-12 06:00:00', '2022-12-12 07:00:00',
- '2022-12-12 10:00:00', '2022-12-12 11:00:00',
- '2022-12-12 15:00:00', '2022-12-12 16:00:00'],
- dtype='datetime64[ns]', freq='BH')
- """
-
- _prefix = "BH"
- _anchor = 0
- _attributes = tuple(["n", "normalize", "start", "end", "offset"])
- _adjust_dst = False
-
- cdef readonly:
- tuple start, end
-
- def __init__(
- self, n=1, normalize=False, start="09:00", end="17:00", offset=timedelta(0)
- ):
- BusinessMixin.__init__(self, n, normalize, offset)
-
- # must be validated here so the equality check works
- if np.ndim(start) == 0:
- # i.e. not is_list_like
- start = [start]
- if not len(start):
- raise ValueError("Must include at least 1 start time")
-
- if np.ndim(end) == 0:
- # i.e. not is_list_like
- end = [end]
- if not len(end):
- raise ValueError("Must include at least 1 end time")
-
- start = np.array([_validate_business_time(x) for x in start])
- end = np.array([_validate_business_time(x) for x in end])
-
- # Validation of input
- if len(start) != len(end):
- raise ValueError("number of starting time and ending time must be the same")
- num_openings = len(start)
-
- # sort starting and ending time by starting time
- index = np.argsort(start)
-
- # convert to tuple so that start and end are hashable
- start = tuple(start[index])
- end = tuple(end[index])
-
- total_secs = 0
- for i in range(num_openings):
- total_secs += self._get_business_hours_by_sec(start[i], end[i])
- total_secs += self._get_business_hours_by_sec(
- end[i], start[(i + 1) % num_openings]
- )
- if total_secs != 24 * 60 * 60:
- raise ValueError(
- "invalid starting and ending time(s): "
- "opening hours should not touch or overlap with "
- "one another"
- )
-
- self.start = start
- self.end = end
-
- cpdef __setstate__(self, state):
- start = state.pop("start")
- start = (start,) if np.ndim(start) == 0 else tuple(start)
- end = state.pop("end")
- end = (end,) if np.ndim(end) == 0 else tuple(end)
- self.start = start
- self.end = end
-
- state.pop("kwds", {})
- state.pop("next_bday", None)
- BusinessMixin.__setstate__(self, state)
-
- def _repr_attrs(self) -> str:
- out = super()._repr_attrs()
- # Use python string formatting to be faster than strftime
- hours = ",".join(
- f"{st.hour:02d}:{st.minute:02d}-{en.hour:02d}:{en.minute:02d}"
- for st, en in zip(self.start, self.end)
- )
- attrs = [f"{self._prefix}={hours}"]
- out += ": " + ", ".join(attrs)
- return out
-
- def _get_business_hours_by_sec(self, start, end):
- """
- Return the business hours of one day, measured in seconds.
- """
- # create dummy datetime to calculate business hours in a day
- dtstart = datetime(2014, 4, 1, start.hour, start.minute)
- day = 1 if start < end else 2
- until = datetime(2014, 4, day, end.hour, end.minute)
- return int((until - dtstart).total_seconds())
-
- def _get_closing_time(self, dt: datetime) -> datetime:
- """
- Get the closing time of a business hour interval by its opening time.
-
- Parameters
- ----------
- dt : datetime
- Opening time of a business hour interval.
-
- Returns
- -------
- result : datetime
- Corresponding closing time.
- """
- for i, st in enumerate(self.start):
- if st.hour == dt.hour and st.minute == dt.minute:
- return dt + timedelta(
- seconds=self._get_business_hours_by_sec(st, self.end[i])
- )
- assert False
-
- @cache_readonly
- def next_bday(self):
- """
- Used for moving to the next business day.
- """
- if self.n >= 0:
- nb_offset = 1
- else:
- nb_offset = -1
- if self._prefix.startswith("C"):
- # CustomBusinessHour
- return CustomBusinessDay(
- n=nb_offset,
- weekmask=self.weekmask,
- holidays=self.holidays,
- calendar=self.calendar,
- )
- else:
- return BusinessDay(n=nb_offset)
-
- def _next_opening_time(self, other, sign=1):
- """
- If self.n and sign have the same sign, return the earliest opening time
- later than or equal to the current time.
- Otherwise, return the latest opening time earlier than or equal to the
- current time.
-
- The opening time always falls on a BusinessDay.
- However, the closing time may not if the business hours extend past midnight.
-
- Parameters
- ----------
- other : datetime
- Current time.
- sign : int, default 1.
- Either 1 or -1. Going forward in time if it has the same sign as
- self.n. Going backward in time otherwise.
-
- Returns
- -------
- result : datetime
- Next opening time.
- """
- earliest_start = self.start[0]
- latest_start = self.start[-1]
-
- if self.n == 0:
- is_same_sign = sign > 0
- else:
- is_same_sign = self.n * sign >= 0
-
- if not self.next_bday.is_on_offset(other):
- # today is not business day
- other = other + sign * self.next_bday
- if is_same_sign:
- hour, minute = earliest_start.hour, earliest_start.minute
- else:
- hour, minute = latest_start.hour, latest_start.minute
- else:
- if is_same_sign:
- if latest_start < other.time():
- # current time is after latest starting time in today
- other = other + sign * self.next_bday
- hour, minute = earliest_start.hour, earliest_start.minute
- else:
- # find earliest starting time no earlier than current time
- for st in self.start:
- if other.time() <= st:
- hour, minute = st.hour, st.minute
- break
- else:
- if other.time() < earliest_start:
- # current time is before earliest starting time in today
- other = other + sign * self.next_bday
- hour, minute = latest_start.hour, latest_start.minute
- else:
- # find latest starting time no later than current time
- for st in reversed(self.start):
- if other.time() >= st:
- hour, minute = st.hour, st.minute
- break
-
- return datetime(other.year, other.month, other.day, hour, minute)
-
- def _prev_opening_time(self, other: datetime) -> datetime:
- """
- If n is positive, return the latest opening time earlier than or equal
- to the current time.
- Otherwise, return the earliest opening time later than or equal to the
- current time.
-
- Parameters
- ----------
- other : datetime
- Current time.
-
- Returns
- -------
- result : datetime
- Previous opening time.
- """
- return self._next_opening_time(other, sign=-1)
-
- @apply_wraps
- def rollback(self, dt: datetime) -> datetime:
- """
- Roll provided date backward to next offset only if not on offset.
- """
- if not self.is_on_offset(dt):
- if self.n >= 0:
- dt = self._prev_opening_time(dt)
- else:
- dt = self._next_opening_time(dt)
- return self._get_closing_time(dt)
- return dt
-
- @apply_wraps
- def rollforward(self, dt: datetime) -> datetime:
- """
- Roll provided date forward to next offset only if not on offset.
- """
- if not self.is_on_offset(dt):
- if self.n >= 0:
- return self._next_opening_time(dt)
- else:
- return self._prev_opening_time(dt)
- return dt
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- # used for detecting edge condition
- nanosecond = getattr(other, "nanosecond", 0)
- # reset timezone and nanosecond
- # other may be a Timestamp, thus not use replace
- other = datetime(
- other.year,
- other.month,
- other.day,
- other.hour,
- other.minute,
- other.second,
- other.microsecond,
- )
- n = self.n
-
- # adjust other to reduce number of cases to handle
- if n >= 0:
- if other.time() in self.end or not self._is_on_offset(other):
- other = self._next_opening_time(other)
- else:
- if other.time() in self.start:
- # adjustment to move to previous business day
- other = other - timedelta(seconds=1)
- if not self._is_on_offset(other):
- other = self._next_opening_time(other)
- other = self._get_closing_time(other)
-
- # get total business hours by sec in one business day
- businesshours = sum(
- self._get_business_hours_by_sec(st, en)
- for st, en in zip(self.start, self.end)
- )
-
- bd, r = divmod(abs(n * 60), businesshours // 60)
- if n < 0:
- bd, r = -bd, -r
-
- # adjust by business days first
- if bd != 0:
- if self._prefix.startswith("C"):
- # GH#30593 this is a Custom offset
- skip_bd = CustomBusinessDay(
- n=bd,
- weekmask=self.weekmask,
- holidays=self.holidays,
- calendar=self.calendar,
- )
- else:
- skip_bd = BusinessDay(n=bd)
- # a midnight business hour may not fall on a BusinessDay
- if not self.next_bday.is_on_offset(other):
- prev_open = self._prev_opening_time(other)
- remain = other - prev_open
- other = prev_open + skip_bd + remain
- else:
- other = other + skip_bd
-
- # remaining business hours to adjust
- bhour_remain = timedelta(minutes=r)
-
- if n >= 0:
- while bhour_remain != timedelta(0):
- # business hour left in this business time interval
- bhour = (
- self._get_closing_time(self._prev_opening_time(other)) - other
- )
- if bhour_remain < bhour:
- # finish adjusting if possible
- other += bhour_remain
- bhour_remain = timedelta(0)
- else:
- # go to next business time interval
- bhour_remain -= bhour
- other = self._next_opening_time(other + bhour)
- else:
- while bhour_remain != timedelta(0):
- # business hour left in this business time interval
- bhour = self._next_opening_time(other) - other
- if (
- bhour_remain > bhour
- or bhour_remain == bhour
- and nanosecond != 0
- ):
- # finish adjusting if possible
- other += bhour_remain
- bhour_remain = timedelta(0)
- else:
- # go to next business time interval
- bhour_remain -= bhour
- other = self._get_closing_time(
- self._next_opening_time(
- other + bhour - timedelta(seconds=1)
- )
- )
-
- return other
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
-
- if dt.tzinfo is not None:
- dt = datetime(
- dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond
- )
- # A valid BH can fall on a different BusinessDay around midnight.
- # Distinguish by the time elapsed since the previous opening time.
- return self._is_on_offset(dt)
-
- def _is_on_offset(self, dt: datetime) -> bool:
- """
- Slight speedups using calculated values.
- """
- # if self.normalize and not _is_normalized(dt):
- # return False
- # A valid BH can fall on a different BusinessDay around midnight.
- # Distinguish by the time elapsed since the previous opening time.
- if self.n >= 0:
- op = self._prev_opening_time(dt)
- else:
- op = self._next_opening_time(dt)
- span = (dt - op).total_seconds()
- businesshours = 0
- for i, st in enumerate(self.start):
- if op.hour == st.hour and op.minute == st.minute:
- businesshours = self._get_business_hours_by_sec(st, self.end[i])
- if span <= businesshours:
- return True
- else:
- return False
-
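# Sketch of the opening-hours validation done in BusinessHour.__init__ above:
# several intervals per day are allowed, but they must not touch or overlap.
# Illustrative only; assumes pandas is installed.
import pandas as pd

split_day = pd.offsets.BusinessHour(start=["09:00", "14:00"], end=["12:00", "18:00"])
# <BusinessHour: BH=09:00-12:00,14:00-18:00>

try:
    pd.offsets.BusinessHour(start=["09:00", "12:00"], end=["12:00", "18:00"])
except ValueError:
    pass  # the intervals touch at 12:00, so the constructor rejects them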
-
-cdef class WeekOfMonthMixin(SingleConstructorOffset):
- """
- Mixin for methods common to WeekOfMonth and LastWeekOfMonth.
- """
-
- cdef readonly:
- int weekday, week
-
- def __init__(self, n=1, normalize=False, weekday=0):
- BaseOffset.__init__(self, n, normalize)
- self.weekday = weekday
-
- if weekday < 0 or weekday > 6:
- raise ValueError(f"Day must be 0<=day<=6, got {weekday}")
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- compare_day = self._get_offset_day(other)
-
- months = self.n
- months = roll_convention(other.day, months, compare_day)
-
- shifted = shift_month(other, months, "start")
- to_day = self._get_offset_day(shifted)
- return _shift_day(shifted, to_day - shifted.day)
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- return dt.day == self._get_offset_day(dt)
-
- @property
- def rule_code(self) -> str:
- weekday = int_to_weekday.get(self.weekday, "")
- if self.week == -1:
- # LastWeekOfMonth
- return f"{self._prefix}-{weekday}"
- return f"{self._prefix}-{self.week + 1}{weekday}"
-
-
-# ----------------------------------------------------------------------
-# Year-Based Offset Classes
-
-cdef class YearOffset(SingleConstructorOffset):
- """
- DateOffset that just needs a month.
- """
- _attributes = tuple(["n", "normalize", "month"])
-
- # FIXME(cython#4446): python annotation here gives compile-time errors
- # _default_month: int
-
- cdef readonly:
- int month
-
- def __init__(self, n=1, normalize=False, month=None):
- BaseOffset.__init__(self, n, normalize)
-
- month = month if month is not None else self._default_month
- self.month = month
-
- if month < 1 or month > 12:
- raise ValueError("Month must go from 1 to 12")
-
- cpdef __setstate__(self, state):
- self.month = state.pop("month")
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self._cache = {}
-
- @classmethod
- def _from_name(cls, suffix=None):
- kwargs = {}
- if suffix:
- kwargs["month"] = MONTH_TO_CAL_NUM[suffix]
- return cls(**kwargs)
-
- @property
- def rule_code(self) -> str:
- month = MONTH_ALIASES[self.month]
- return f"{self._prefix}-{month}"
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- return dt.month == self.month and dt.day == self._get_offset_day(dt)
-
- def _get_offset_day(self, other: datetime) -> int:
- # override BaseOffset method to use self.month instead of other.month
- cdef:
- npy_datetimestruct dts
- pydate_to_dtstruct(other, &dts)
- dts.month = self.month
- return get_day_of_month(&dts, self._day_opt)
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- years = roll_qtrday(other, self.n, self.month, self._day_opt, modby=12)
- months = years * 12 + (self.month - other.month)
- return shift_month(other, months, self._day_opt)
-
- @apply_array_wraps
- def _apply_array(self, dtarr):
- reso = get_unit_from_dtype(dtarr.dtype)
- shifted = shift_quarters(
- dtarr.view("i8"), self.n, self.month, self._day_opt, modby=12, reso=reso
- )
- return shifted
-
-
-cdef class BYearEnd(YearOffset):
- """
- DateOffset increments between the last business day of the year.
-
- Examples
- --------
- >>> from pandas.tseries.offsets import BYearEnd
- >>> ts = pd.Timestamp('2020-05-24 05:01:15')
- >>> ts - BYearEnd()
- Timestamp('2019-12-31 05:01:15')
- >>> ts + BYearEnd()
- Timestamp('2020-12-31 05:01:15')
- >>> ts + BYearEnd(3)
- Timestamp('2022-12-30 05:01:15')
- >>> ts + BYearEnd(-3)
- Timestamp('2017-12-29 05:01:15')
- >>> ts + BYearEnd(month=11)
- Timestamp('2020-11-30 05:01:15')
- """
-
- _outputName = "BusinessYearEnd"
- _default_month = 12
- _prefix = "BA"
- _day_opt = "business_end"
-
-
-cdef class BYearBegin(YearOffset):
- """
- DateOffset increments between the first business day of the year.
-
- Examples
- --------
- >>> from pandas.tseries.offsets import BYearBegin
- >>> ts = pd.Timestamp('2020-05-24 05:01:15')
- >>> ts + BYearBegin()
- Timestamp('2021-01-01 05:01:15')
- >>> ts - BYearBegin()
- Timestamp('2020-01-01 05:01:15')
- >>> ts + BYearBegin(-1)
- Timestamp('2020-01-01 05:01:15')
- >>> ts + BYearBegin(2)
- Timestamp('2022-01-03 05:01:15')
- """
-
- _outputName = "BusinessYearBegin"
- _default_month = 1
- _prefix = "BAS"
- _day_opt = "business_start"
-
-
-cdef class YearEnd(YearOffset):
- """
- DateOffset increments between calendar year ends.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.YearEnd()
- Timestamp('2022-12-31 00:00:00')
- """
-
- _default_month = 12
- _prefix = "A"
- _day_opt = "end"
-
- cdef readonly:
- int _period_dtype_code
-
- def __init__(self, n=1, normalize=False, month=None):
- # Because YearEnd can be the freq for a Period, define its
- # _period_dtype_code at construction for performance
- YearOffset.__init__(self, n, normalize, month)
- self._period_dtype_code = PeriodDtypeCode.A + self.month % 12
-
-
-cdef class YearBegin(YearOffset):
- """
- DateOffset of one year at beginning.
-
- YearBegin goes to the next date which is the start of the year.
-
- See Also
- --------
- :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 12, 1)
- >>> ts + pd.offsets.YearBegin()
- Timestamp('2023-01-01 00:00:00')
-
- >>> ts = pd.Timestamp(2023, 1, 1)
- >>> ts + pd.offsets.YearBegin()
- Timestamp('2024-01-01 00:00:00')
-
- If you want to get the start of the current year:
-
- >>> ts = pd.Timestamp(2023, 1, 1)
- >>> pd.offsets.YearBegin().rollback(ts)
- Timestamp('2023-01-01 00:00:00')
- """
-
- _default_month = 1
- _prefix = "AS"
- _day_opt = "start"
-
-
-# ----------------------------------------------------------------------
-# Quarter-Based Offset Classes
-
-cdef class QuarterOffset(SingleConstructorOffset):
- _attributes = tuple(["n", "normalize", "startingMonth"])
- # TODO: Consider combining QuarterOffset and YearOffset __init__ at some
- # point. Also apply_index, is_on_offset, rule_code if
- # startingMonth vs month attr names are resolved
-
- # FIXME(cython#4446): python annotation here gives compile-time errors
- # _default_starting_month: int
- # _from_name_starting_month: int
-
- cdef readonly:
- int startingMonth
-
- def __init__(self, n=1, normalize=False, startingMonth=None):
- BaseOffset.__init__(self, n, normalize)
-
- if startingMonth is None:
- startingMonth = self._default_starting_month
- self.startingMonth = startingMonth
-
- cpdef __setstate__(self, state):
- self.startingMonth = state.pop("startingMonth")
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
-
- @classmethod
- def _from_name(cls, suffix=None):
- kwargs = {}
- if suffix:
- kwargs["startingMonth"] = MONTH_TO_CAL_NUM[suffix]
- else:
- if cls._from_name_starting_month is not None:
- kwargs["startingMonth"] = cls._from_name_starting_month
- return cls(**kwargs)
-
- @property
- def rule_code(self) -> str:
- month = MONTH_ALIASES[self.startingMonth]
- return f"{self._prefix}-{month}"
-
- def is_anchored(self) -> bool:
- return self.n == 1 and self.startingMonth is not None
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- mod_month = (dt.month - self.startingMonth) % 3
- return mod_month == 0 and dt.day == self._get_offset_day(dt)
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- # months_since: find the calendar quarter containing other.month,
- # e.g. if other.month == 8, the calendar quarter is [Jul, Aug, Sep].
- # Then find the month in that quarter containing an is_on_offset date for
- # self. `months_since` is the number of months to shift other.month
- # to get to this on-offset month.
- months_since = other.month % 3 - self.startingMonth % 3
- qtrs = roll_qtrday(
- other, self.n, self.startingMonth, day_opt=self._day_opt, modby=3
- )
- months = qtrs * 3 - months_since
- return shift_month(other, months, self._day_opt)
-
- @apply_array_wraps
- def _apply_array(self, dtarr):
- reso = get_unit_from_dtype(dtarr.dtype)
- shifted = shift_quarters(
- dtarr.view("i8"),
- self.n,
- self.startingMonth,
- self._day_opt,
- modby=3,
- reso=reso,
- )
- return shifted
-
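- # A minimal hand-trace of the quarter arithmetic above (illustration only,
- # matching the QuarterEnd doctest below): applying QuarterEnd
- # (startingMonth=3, day_opt="end") to 2022-01-01 with n=1 gives
- # months_since = 1 % 3 - 3 % 3 = 1, the roll keeps qtrs = 1, so
- # months = 1 * 3 - 1 = 2 and shift_month lands on 2022-03-31.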
-
-cdef class BQuarterEnd(QuarterOffset):
- """
- DateOffset increments between the last business day of each Quarter.
-
- startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
- startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
- startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ...
-
- Examples
- --------
- >>> from pandas.tseries.offsets import BQuarterEnd
- >>> ts = pd.Timestamp('2020-05-24 05:01:15')
- >>> ts + BQuarterEnd()
- Timestamp('2020-06-30 05:01:15')
- >>> ts + BQuarterEnd(2)
- Timestamp('2020-09-30 05:01:15')
- >>> ts + BQuarterEnd(1, startingMonth=2)
- Timestamp('2020-05-29 05:01:15')
- >>> ts + BQuarterEnd(startingMonth=2)
- Timestamp('2020-05-29 05:01:15')
- """
- _output_name = "BusinessQuarterEnd"
- _default_starting_month = 3
- _from_name_starting_month = 12
- _prefix = "BQ"
- _day_opt = "business_end"
-
-
-cdef class BQuarterBegin(QuarterOffset):
- """
- DateOffset increments between the first business day of each Quarter.
-
- startingMonth = 1 corresponds to dates like 1/01/2007, 4/01/2007, ...
- startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ...
- startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ...
-
- Examples
- --------
- >>> from pandas.tseries.offsets import BQuarterBegin
- >>> ts = pd.Timestamp('2020-05-24 05:01:15')
- >>> ts + BQuarterBegin()
- Timestamp('2020-06-01 05:01:15')
- >>> ts + BQuarterBegin(2)
- Timestamp('2020-09-01 05:01:15')
- >>> ts + BQuarterBegin(startingMonth=2)
- Timestamp('2020-08-03 05:01:15')
- >>> ts + BQuarterBegin(-1)
- Timestamp('2020-03-02 05:01:15')
- """
- _output_name = "BusinessQuarterBegin"
- _default_starting_month = 3
- _from_name_starting_month = 1
- _prefix = "BQS"
- _day_opt = "business_start"
-
-
-cdef class QuarterEnd(QuarterOffset):
- """
- DateOffset increments between Quarter end dates.
-
- startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
- startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
- startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ...
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.QuarterEnd()
- Timestamp('2022-03-31 00:00:00')
- """
- _default_starting_month = 3
- _prefix = "Q"
- _day_opt = "end"
-
- cdef readonly:
- int _period_dtype_code
-
- def __init__(self, n=1, normalize=False, startingMonth=None):
- # Because QuarterEnd can be the freq for a Period, define its
- # _period_dtype_code at construction for performance
- QuarterOffset.__init__(self, n, normalize, startingMonth)
- self._period_dtype_code = PeriodDtypeCode.Q_DEC + self.startingMonth % 12
-
-
-cdef class QuarterBegin(QuarterOffset):
- """
- DateOffset increments between Quarter start dates.
-
- startingMonth = 1 corresponds to dates like 1/01/2007, 4/01/2007, ...
- startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ...
- startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ...
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.QuarterBegin()
- Timestamp('2022-03-01 00:00:00')
- """
- _default_starting_month = 3
- _from_name_starting_month = 1
- _prefix = "QS"
- _day_opt = "start"
-
-
-# ----------------------------------------------------------------------
-# Month-Based Offset Classes
-
-cdef class MonthOffset(SingleConstructorOffset):
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- return dt.day == self._get_offset_day(dt)
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- compare_day = self._get_offset_day(other)
- n = roll_convention(other.day, self.n, compare_day)
- return shift_month(other, n, self._day_opt)
-
- @apply_array_wraps
- def _apply_array(self, dtarr):
- reso = get_unit_from_dtype(dtarr.dtype)
- shifted = shift_months(dtarr.view("i8"), self.n, self._day_opt, reso=reso)
- return shifted
-
- cpdef __setstate__(self, state):
- state.pop("_use_relativedelta", False)
- state.pop("offset", None)
- state.pop("_offset", None)
- state.pop("kwds", {})
-
- BaseOffset.__setstate__(self, state)
-
-
-cdef class MonthEnd(MonthOffset):
- """
- DateOffset of one month end.
-
- MonthEnd goes to the next date which is the end of the month.
-
- See Also
- --------
- :class:`~pandas.tseries.offsets.DateOffset` : Standard kind of date increment.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 30)
- >>> ts + pd.offsets.MonthEnd()
- Timestamp('2022-01-31 00:00:00')
-
- >>> ts = pd.Timestamp(2022, 1, 31)
- >>> ts + pd.offsets.MonthEnd()
- Timestamp('2022-02-28 00:00:00')
-
- If you want to get the end of the current month:
-
- >>> ts = pd.Timestamp(2022, 1, 31)
- >>> pd.offsets.MonthEnd().rollforward(ts)
- Timestamp('2022-01-31 00:00:00')
- """
- _period_dtype_code = PeriodDtypeCode.M
- _prefix = "M"
- _day_opt = "end"
-
-
-cdef class MonthBegin(MonthOffset):
- """
- DateOffset of one month at beginning.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.MonthBegin()
- Timestamp('2022-02-01 00:00:00')
- """
- _prefix = "MS"
- _day_opt = "start"
-
-
-cdef class BusinessMonthEnd(MonthOffset):
- """
- DateOffset increments between the last business day of the month.
-
- BusinessMonthEnd goes to the next date which is the last business day of the month.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 11, 29)
- >>> ts + pd.offsets.BMonthEnd()
- Timestamp('2022-11-30 00:00:00')
-
- >>> ts = pd.Timestamp(2022, 11, 30)
- >>> ts + pd.offsets.BMonthEnd()
- Timestamp('2022-12-30 00:00:00')
-
- If you want to get the end of the current business month:
-
- >>> ts = pd.Timestamp(2022, 11, 30)
- >>> pd.offsets.BMonthEnd().rollforward(ts)
- Timestamp('2022-11-30 00:00:00')
- """
- _prefix = "BM"
- _day_opt = "business_end"
-
-
-cdef class BusinessMonthBegin(MonthOffset):
- """
- DateOffset of one month at the first business day.
-
- Examples
- --------
- >>> from pandas.tseries.offsets import BMonthBegin
- >>> ts=pd.Timestamp('2020-05-24 05:01:15')
- >>> ts + BMonthBegin()
- Timestamp('2020-06-01 05:01:15')
- >>> ts + BMonthBegin(2)
- Timestamp('2020-07-01 05:01:15')
- >>> ts + BMonthBegin(-3)
- Timestamp('2020-03-02 05:01:15')
- """
- _prefix = "BMS"
- _day_opt = "business_start"
-
-
-# ---------------------------------------------------------------------
-# Semi-Month Based Offsets
-
-cdef class SemiMonthOffset(SingleConstructorOffset):
- _default_day_of_month = 15
- _min_day_of_month = 2
- _attributes = tuple(["n", "normalize", "day_of_month"])
-
- cdef readonly:
- int day_of_month
-
- def __init__(self, n=1, normalize=False, day_of_month=None):
- BaseOffset.__init__(self, n, normalize)
-
- if day_of_month is None:
- day_of_month = self._default_day_of_month
-
- self.day_of_month = int(day_of_month)
- if not self._min_day_of_month <= self.day_of_month <= 27:
- raise ValueError(
- "day_of_month must be "
- f"{self._min_day_of_month}<=day_of_month<=27, "
- f"got {self.day_of_month}"
- )
-
- cpdef __setstate__(self, state):
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self.day_of_month = state.pop("day_of_month")
-
- @classmethod
- def _from_name(cls, suffix=None):
- return cls(day_of_month=suffix)
-
- @property
- def rule_code(self) -> str:
- suffix = f"-{self.day_of_month}"
- return self._prefix + suffix
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- is_start = isinstance(self, SemiMonthBegin)
-
- # shift `other` to self.day_of_month, incrementing `n` if necessary
- n = roll_convention(other.day, self.n, self.day_of_month)
-
- days_in_month = get_days_in_month(other.year, other.month)
- # For SemiMonthBegin on other.day == 1 and
- # SemiMonthEnd on other.day == days_in_month,
- # shifting `other` to `self.day_of_month` _always_ requires
- # incrementing/decrementing `n`, regardless of whether it is
- # initially positive.
- if is_start and (self.n <= 0 and other.day == 1):
- n -= 1
- elif (not is_start) and (self.n > 0 and other.day == days_in_month):
- n += 1
-
- if is_start:
- months = n // 2 + n % 2
- to_day = 1 if n % 2 else self.day_of_month
- else:
- months = n // 2
- to_day = 31 if n % 2 else self.day_of_month
-
- return shift_month(other, months, to_day)
-
- @apply_array_wraps
- @cython.wraparound(False)
- @cython.boundscheck(False)
- def _apply_array(self, dtarr):
- cdef:
- ndarray i8other = dtarr.view("i8")
- Py_ssize_t i, count = dtarr.size
- int64_t val, res_val
- ndarray out = cnp.PyArray_EMPTY(
- i8other.ndim, i8other.shape, cnp.NPY_INT64, 0
- )
- npy_datetimestruct dts
- int months, to_day, nadj, n = self.n
- int days_in_month, day, anchor_dom = self.day_of_month
- bint is_start = isinstance(self, SemiMonthBegin)
- NPY_DATETIMEUNIT reso = get_unit_from_dtype(dtarr.dtype)
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, i8other)
-
- with nogil:
- for i in range(count):
- # Analogous to: val = i8other[i]
- val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if val == NPY_NAT:
- res_val = NPY_NAT
-
- else:
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- day = dts.day
-
- # Adjust so that we are always looking at self.day_of_month,
- # incrementing/decrementing n if necessary.
- nadj = roll_convention(day, n, anchor_dom)
-
- days_in_month = get_days_in_month(dts.year, dts.month)
- # For SemiMonthBegin on other.day == 1 and
- # SemiMonthEnd on other.day == days_in_month,
- # shifting `other` to `self.day_of_month` _always_ requires
- # incrementing/decrementing `n`, regardless of whether it is
- # initially positive.
- if is_start and (n <= 0 and day == 1):
- nadj -= 1
- elif (not is_start) and (n > 0 and day == days_in_month):
- nadj += 1
-
- if is_start:
- # See also: SemiMonthBegin._apply
- months = nadj // 2 + nadj % 2
- to_day = 1 if nadj % 2 else anchor_dom
-
- else:
- # See also: SemiMonthEnd._apply
- months = nadj // 2
- to_day = 31 if nadj % 2 else anchor_dom
-
- dts.year = year_add_months(dts, months)
- dts.month = month_add_months(dts, months)
- days_in_month = get_days_in_month(dts.year, dts.month)
- dts.day = min(to_day, days_in_month)
-
- res_val = npy_datetimestruct_to_datetime(reso, &dts)
-
- # Analogous to: out[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return out
-
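- # A minimal hand-trace of the even/odd split above (illustration only,
- # matching the SemiMonthEnd doctests below): applying SemiMonthEnd() to
- # 2022-01-14 rolls n from 1 to 0 (14 < day_of_month=15), so months = 0 and
- # to_day = 15 -> 2022-01-15; applying it to 2022-01-15 keeps n=1, so
- # months = 0 and to_day = 31, i.e. the month end -> 2022-01-31.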
-
-cdef class SemiMonthEnd(SemiMonthOffset):
- """
- Two DateOffsets per month, repeating on the last day of the month and on day_of_month.
-
- Parameters
- ----------
- n : int
- normalize : bool, default False
- day_of_month : int, {1, 2, ..., 27}, default 15
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 14)
- >>> ts + pd.offsets.SemiMonthEnd()
- Timestamp('2022-01-15 00:00:00')
-
- >>> ts = pd.Timestamp(2022, 1, 15)
- >>> ts + pd.offsets.SemiMonthEnd()
- Timestamp('2022-01-31 00:00:00')
-
- >>> ts = pd.Timestamp(2022, 1, 31)
- >>> ts + pd.offsets.SemiMonthEnd()
- Timestamp('2022-02-15 00:00:00')
-
- If you want to get the result for the current month:
-
- >>> ts = pd.Timestamp(2022, 1, 15)
- >>> pd.offsets.SemiMonthEnd().rollforward(ts)
- Timestamp('2022-01-15 00:00:00')
- """
- _prefix = "SM"
- _min_day_of_month = 1
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- days_in_month = get_days_in_month(dt.year, dt.month)
- return dt.day in (self.day_of_month, days_in_month)
-
-
-cdef class SemiMonthBegin(SemiMonthOffset):
- """
- Two DateOffsets per month, repeating on the first day of the month and on day_of_month.
-
- Parameters
- ----------
- n : int
- normalize : bool, default False
- day_of_month : int, {2, 3, ..., 27}, default 15
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.SemiMonthBegin()
- Timestamp('2022-01-15 00:00:00')
- """
-
- _prefix = "SMS"
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- return dt.day in (1, self.day_of_month)
-
-
-# ---------------------------------------------------------------------
-# Week-Based Offset Classes
-
-
-cdef class Week(SingleConstructorOffset):
- """
- Weekly offset.
-
- Parameters
- ----------
- weekday : int or None, default None
- Always generate specific day of week.
- 0 for Monday and 6 for Sunday.
-
- See Also
- --------
- pd.tseries.offsets.WeekOfMonth :
- Describes monthly dates like the Tuesday of the
- 2nd week of each month.
-
- Examples
- --------
-
- >>> date_object = pd.Timestamp("2023-01-13")
- >>> date_object
- Timestamp('2023-01-13 00:00:00')
-
- >>> date_plus_one_week = date_object + pd.tseries.offsets.Week(n=1)
- >>> date_plus_one_week
- Timestamp('2023-01-20 00:00:00')
-
- >>> date_next_monday = date_object + pd.tseries.offsets.Week(weekday=0)
- >>> date_next_monday
- Timestamp('2023-01-16 00:00:00')
-
- >>> date_next_sunday = date_object + pd.tseries.offsets.Week(weekday=6)
- >>> date_next_sunday
- Timestamp('2023-01-15 00:00:00')
- """
-
- _inc = timedelta(weeks=1)
- _prefix = "W"
- _attributes = tuple(["n", "normalize", "weekday"])
-
- cdef readonly:
- object weekday # int or None
- int _period_dtype_code
-
- def __init__(self, n=1, normalize=False, weekday=None):
- BaseOffset.__init__(self, n, normalize)
- self.weekday = weekday
-
- if self.weekday is not None:
- if self.weekday < 0 or self.weekday > 6:
- raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}")
-
- self._period_dtype_code = PeriodDtypeCode.W_SUN + (weekday + 1) % 7
-
- cpdef __setstate__(self, state):
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self.weekday = state.pop("weekday")
- self._cache = state.pop("_cache", {})
-
- def is_anchored(self) -> bool:
- return self.n == 1 and self.weekday is not None
-
- @apply_wraps
- def _apply(self, other):
- if self.weekday is None:
- return other + self.n * self._inc
-
- if not PyDateTime_Check(other):
- raise TypeError(
- f"Cannot add {type(other).__name__} to {type(self).__name__}"
- )
-
- k = self.n
- otherDay = other.weekday()
- if otherDay != self.weekday:
- other = other + timedelta((self.weekday - otherDay) % 7)
- if k > 0:
- k -= 1
-
- return other + timedelta(weeks=k)
-
- @apply_array_wraps
- def _apply_array(self, dtarr):
- if self.weekday is None:
- td = timedelta(days=7 * self.n)
- td64 = np.timedelta64(td, "ns")
- return dtarr + td64
- else:
- reso = get_unit_from_dtype(dtarr.dtype)
- i8other = dtarr.view("i8")
- return self._end_apply_index(i8other, reso=reso)
-
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef ndarray _end_apply_index(self, ndarray i8other, NPY_DATETIMEUNIT reso):
- """
- Add self to the given DatetimeIndex, specialized for case where
- self.weekday is non-null.
-
- Parameters
- ----------
- i8other : const int64_t[:]
- reso : NPY_DATETIMEUNIT
-
- Returns
- -------
- ndarray[int64_t]
- """
- cdef:
- Py_ssize_t i, count = i8other.size
- int64_t val, res_val
- ndarray out = cnp.PyArray_EMPTY(
- i8other.ndim, i8other.shape, cnp.NPY_INT64, 0
- )
- npy_datetimestruct dts
- int wday, days, weeks, n = self.n
- int anchor_weekday = self.weekday
- int64_t DAY_PERIODS = periods_per_day(reso)
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, i8other)
-
- with nogil:
- for i in range(count):
- # Analogous to: val = i8other[i]
- val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if val == NPY_NAT:
- res_val = NPY_NAT
- else:
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- wday = dayofweek(dts.year, dts.month, dts.day)
-
- days = 0
- weeks = n
- if wday != anchor_weekday:
- days = (anchor_weekday - wday) % 7
- if weeks > 0:
- weeks -= 1
-
- res_val = val + (7 * weeks + days) * DAY_PERIODS
-
- # Analogous to: out[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return out
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- elif self.weekday is None:
- return True
- return dt.weekday() == self.weekday
-
- @property
- def rule_code(self) -> str:
- suffix = ""
- if self.weekday is not None:
- weekday = int_to_weekday[self.weekday]
- suffix = f"-{weekday}"
- return self._prefix + suffix
-
- @classmethod
- def _from_name(cls, suffix=None):
- if not suffix:
- weekday = None
- else:
- weekday = weekday_to_int[suffix]
- return cls(weekday=weekday)
-
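- # A pure-Python sketch of the anchored-weekday roll above (illustration only;
- # the helper name `next_anchor` is made up, stdlib only):
- #
- #   from datetime import date, timedelta
- #
- #   def next_anchor(d: date, weekday: int, n: int = 1) -> date:
- #       if d.weekday() != weekday:
- #           d += timedelta(days=(weekday - d.weekday()) % 7)
- #           if n > 0:
- #               n -= 1
- #       return d + timedelta(weeks=n)
- #
- #   next_anchor(date(2023, 1, 13), weekday=0)  # -> date(2023, 1, 16), as in
- #                                              #    the Week(weekday=0) doctest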
-
-cdef class WeekOfMonth(WeekOfMonthMixin):
- """
- Describes monthly dates like "the Tuesday of the 2nd week of each month".
-
- Parameters
- ----------
- n : int
- week : int {0, 1, 2, 3}, default 0
- A specific integer for the week of the month.
- e.g. 0 is 1st week of month, 1 is the 2nd week, etc.
- weekday : int {0, 1, ..., 6}, default 0
- A specific integer for the day of the week.
-
- - 0 is Monday
- - 1 is Tuesday
- - 2 is Wednesday
- - 3 is Thursday
- - 4 is Friday
- - 5 is Saturday
- - 6 is Sunday.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.WeekOfMonth()
- Timestamp('2022-01-03 00:00:00')
- """
-
- _prefix = "WOM"
- _attributes = tuple(["n", "normalize", "week", "weekday"])
-
- def __init__(self, n=1, normalize=False, week=0, weekday=0):
- WeekOfMonthMixin.__init__(self, n, normalize, weekday)
- self.week = week
-
- if self.week < 0 or self.week > 3:
- raise ValueError(f"Week must be 0<=week<=3, got {self.week}")
-
- cpdef __setstate__(self, state):
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self.weekday = state.pop("weekday")
- self.week = state.pop("week")
-
- def _get_offset_day(self, other: datetime) -> int:
- """
- Find the day in the same month as other that has the same
- weekday as self.weekday and is the self.week'th such day in the month.
-
- Parameters
- ----------
- other : datetime
-
- Returns
- -------
- day : int
- """
- mstart = datetime(other.year, other.month, 1)
- wday = mstart.weekday()
- shift_days = (self.weekday - wday) % 7
- return 1 + shift_days + self.week * 7
-
- @classmethod
- def _from_name(cls, suffix=None):
- if not suffix:
- raise ValueError(f"Prefix {repr(cls._prefix)} requires a suffix.")
- # only one digit weeks (1 --> week 0, 2 --> week 1, etc.)
- week = int(suffix[0]) - 1
- weekday = weekday_to_int[suffix[1:]]
- return cls(week=week, weekday=weekday)
-
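- # Hand-check of the formula above (illustration only): January 2022 starts
- # on a Saturday (weekday 5), so with week=0, weekday=0 we get
- # shift_days = (0 - 5) % 7 = 2 and the offset day is 1 + 2 + 0 * 7 = 3,
- # i.e. the first Monday, 2022-01-03, matching the doctest above.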
-
-cdef class LastWeekOfMonth(WeekOfMonthMixin):
- """
- Describes monthly dates in the last week of the month.
-
- For example "the last Tuesday of each month".
-
- Parameters
- ----------
- n : int, default 1
- weekday : int {0, 1, ..., 6}, default 0
- A specific integer for the day of the week.
-
- - 0 is Monday
- - 1 is Tuesday
- - 2 is Wednesday
- - 3 is Thursday
- - 4 is Friday
- - 5 is Saturday
- - 6 is Sunday.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.LastWeekOfMonth()
- Timestamp('2022-01-31 00:00:00')
- """
-
- _prefix = "LWOM"
- _attributes = tuple(["n", "normalize", "weekday"])
-
- def __init__(self, n=1, normalize=False, weekday=0):
- WeekOfMonthMixin.__init__(self, n, normalize, weekday)
- self.week = -1
-
- if self.n == 0:
- raise ValueError("N cannot be 0")
-
- cpdef __setstate__(self, state):
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self.weekday = state.pop("weekday")
- self.week = -1
-
- def _get_offset_day(self, other: datetime) -> int:
- """
- Find the day in the same month as other that has the same
- weekday as self.weekday and is the last such day in the month.
-
- Parameters
- ----------
- other : datetime
-
- Returns
- -------
- day : int
- """
- dim = get_days_in_month(other.year, other.month)
- mend = datetime(other.year, other.month, dim)
- wday = mend.weekday()
- shift_days = (wday - self.weekday) % 7
- return dim - shift_days
-
- @classmethod
- def _from_name(cls, suffix=None):
- if not suffix:
- raise ValueError(f"Prefix {repr(cls._prefix)} requires a suffix.")
- weekday = weekday_to_int[suffix]
- return cls(weekday=weekday)
-
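- # Hand-check of the formula above (illustration only): January 2022 ends on
- # Monday the 31st, so for weekday=0 shift_days = (0 - 0) % 7 = 0 and the last
- # Monday is day 31 (matching the doctest above); for weekday=4 (Friday) it
- # would be 31 - 3 = 28.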
-
-# ---------------------------------------------------------------------
-# Special Offset Classes
-
-cdef class FY5253Mixin(SingleConstructorOffset):
- cdef readonly:
- int startingMonth
- int weekday
- str variation
-
- def __init__(
- self, n=1, normalize=False, weekday=0, startingMonth=1, variation="nearest"
- ):
- BaseOffset.__init__(self, n, normalize)
- self.startingMonth = startingMonth
- self.weekday = weekday
- self.variation = variation
-
- if self.n == 0:
- raise ValueError("N cannot be 0")
-
- if self.variation not in ["nearest", "last"]:
- raise ValueError(f"{self.variation} is not a valid variation")
-
- cpdef __setstate__(self, state):
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
- self.weekday = state.pop("weekday")
- self.variation = state.pop("variation")
-
- def is_anchored(self) -> bool:
- return (
- self.n == 1 and self.startingMonth is not None and self.weekday is not None
- )
-
- # --------------------------------------------------------------------
- # Name-related methods
-
- @property
- def rule_code(self) -> str:
- prefix = self._prefix
- suffix = self.get_rule_code_suffix()
- return f"{prefix}-{suffix}"
-
- def _get_suffix_prefix(self) -> str:
- if self.variation == "nearest":
- return "N"
- else:
- return "L"
-
- def get_rule_code_suffix(self) -> str:
- prefix = self._get_suffix_prefix()
- month = MONTH_ALIASES[self.startingMonth]
- weekday = int_to_weekday[self.weekday]
- return f"{prefix}-{month}-{weekday}"
-
-
-cdef class FY5253(FY5253Mixin):
- """
- Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar.
-
- It is used by companies that want their fiscal year
- to always end on the same day of the week.
-
- It is a method of managing accounting periods, and a common
- calendar structure in industries such as retail, manufacturing
- and parking.
-
- For more information see:
- https://en.wikipedia.org/wiki/4-4-5_calendar
-
- The year may either:
-
- - end on the last X day of the Y month.
- - end on the last X day closest to the last day of the Y month.
-
- X is a specific day of the week.
- Y is a certain month of the year.
-
- Parameters
- ----------
- n : int
- weekday : int {0, 1, ..., 6}, default 0
- A specific integer for the day of the week.
-
- - 0 is Monday
- - 1 is Tuesday
- - 2 is Wednesday
- - 3 is Thursday
- - 4 is Friday
- - 5 is Saturday
- - 6 is Sunday.
-
- startingMonth : int {1, 2, ... 12}, default 1
- The month in which the fiscal year ends.
-
- variation : str, default "nearest"
- Method of employing 4-4-5 calendar.
-
- There are two options:
-
- - "nearest" means year end is **weekday** closest to last day of month in year.
- - "last" means year end is final **weekday** of the final month in fiscal year.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.FY5253()
- Timestamp('2022-01-31 00:00:00')
- """
-
- _prefix = "RE"
- _attributes = tuple(["n", "normalize", "weekday", "startingMonth", "variation"])
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- dt = datetime(dt.year, dt.month, dt.day)
- year_end = self.get_year_end(dt)
-
- if self.variation == "nearest":
- # We have to check the year end of "this" cal year AND the previous
- return year_end == dt or self.get_year_end(shift_month(dt, -1, None)) == dt
- else:
- return year_end == dt
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- norm = Timestamp(other).normalize()
-
- n = self.n
- prev_year = self.get_year_end(datetime(other.year - 1, self.startingMonth, 1))
- cur_year = self.get_year_end(datetime(other.year, self.startingMonth, 1))
- next_year = self.get_year_end(datetime(other.year + 1, self.startingMonth, 1))
-
- prev_year = localize_pydatetime(prev_year, other.tzinfo)
- cur_year = localize_pydatetime(cur_year, other.tzinfo)
- next_year = localize_pydatetime(next_year, other.tzinfo)
-
- # Note: next_year.year == other.year + 1, so we will always
- # have other < next_year
- if norm == prev_year:
- n -= 1
- elif norm == cur_year:
- pass
- elif n > 0:
- if norm < prev_year:
- n -= 2
- elif prev_year < norm < cur_year:
- n -= 1
- elif cur_year < norm < next_year:
- pass
- else:
- if cur_year < norm < next_year:
- n += 1
- elif prev_year < norm < cur_year:
- pass
- elif (
- norm.year == prev_year.year
- and norm < prev_year
- and prev_year - norm <= timedelta(6)
- ):
- # GH#14774, error when next_year.year == cur_year.year
- # e.g. prev_year == datetime(2004, 1, 3),
- # other == datetime(2004, 1, 1)
- n -= 1
- else:
- assert False
-
- shifted = datetime(other.year + n, self.startingMonth, 1)
- result = self.get_year_end(shifted)
- result = datetime(
- result.year,
- result.month,
- result.day,
- other.hour,
- other.minute,
- other.second,
- other.microsecond,
- )
- return result
-
- def get_year_end(self, dt: datetime) -> datetime:
- assert dt.tzinfo is None
-
- dim = get_days_in_month(dt.year, self.startingMonth)
- target_date = datetime(dt.year, self.startingMonth, dim)
- wkday_diff = self.weekday - target_date.weekday()
- if wkday_diff == 0:
- # year_end is the same for "last" and "nearest" cases
- return target_date
-
- if self.variation == "last":
- days_forward = (wkday_diff % 7) - 7
-
- # days_forward is always negative, so we always end up
- # in the same year as dt
- return target_date + timedelta(days=days_forward)
- else:
- # variation == "nearest":
- days_forward = wkday_diff % 7
- if days_forward <= 3:
- # The upcoming self.weekday is closer than the previous one
- return target_date + timedelta(days_forward)
- else:
- # The previous self.weekday is closer than the upcoming one
- return target_date + timedelta(days_forward - 7)
-
- @classmethod
- def _parse_suffix(cls, variation_code, startingMonth_code, weekday_code):
- if variation_code == "N":
- variation = "nearest"
- elif variation_code == "L":
- variation = "last"
- else:
- raise ValueError(f"Unable to parse variation_code: {variation_code}")
-
- startingMonth = MONTH_TO_CAL_NUM[startingMonth_code]
- weekday = weekday_to_int[weekday_code]
-
- return {
- "weekday": weekday,
- "startingMonth": startingMonth,
- "variation": variation,
- }
-
- @classmethod
- def _from_name(cls, *args):
- return cls(**cls._parse_suffix(*args))
-
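- # Illustration of the two variations in get_year_end above (hand-computed,
- # not a doctest from the module): with weekday=1 (Tuesday) and
- # startingMonth=12, December 2022 ends on a Saturday, so "last" picks the
- # preceding Tuesday 2022-12-27, while "nearest" takes the days_forward <= 3
- # branch and picks 2023-01-03, the Tuesday closest to the month end.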
-
-cdef class FY5253Quarter(FY5253Mixin):
- """
- DateOffset increments between business quarter dates for 52-53 week fiscal year.
-
- Also known as a 4-4-5 calendar.
-
- It is used by companies that want their fiscal year
- to always end on the same day of the week.
-
- It is a method of managing accounting periods, and a common
- calendar structure in industries such as retail, manufacturing
- and parking.
-
- For more information see:
- https://en.wikipedia.org/wiki/4-4-5_calendar
-
- The year may either:
-
- - end on the last X day of the Y month.
- - end on the last X day closest to the last day of the Y month.
-
- X is a specific day of the week.
- Y is a certain month of the year.
-
- startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
- startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
- startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ...
-
- Parameters
- ----------
- n : int
- weekday : int {0, 1, ..., 6}, default 0
- A specific integer for the day of the week.
-
- - 0 is Monday
- - 1 is Tuesday
- - 2 is Wednesday
- - 3 is Thursday
- - 4 is Friday
- - 5 is Saturday
- - 6 is Sunday.
-
- startingMonth : int {1, 2, ..., 12}, default 1
- The month in which fiscal years end.
-
- qtr_with_extra_week : int {1, 2, 3, 4}, default 1
- The quarter number that gets the extra, 14th week when needed.
-
- variation : str, default "nearest"
- Method of employing 4-4-5 calendar.
-
- There are two options:
-
- - "nearest" means year end is **weekday** closest to last day of month in year.
- - "last" means year end is final **weekday** of the final month in fiscal year.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.FY5253Quarter()
- Timestamp('2022-01-31 00:00:00')
- """
-
- _prefix = "REQ"
- _attributes = tuple(
- [
- "n",
- "normalize",
- "weekday",
- "startingMonth",
- "qtr_with_extra_week",
- "variation",
- ]
- )
-
- cdef readonly:
- int qtr_with_extra_week
-
- def __init__(
- self,
- n=1,
- normalize=False,
- weekday=0,
- startingMonth=1,
- qtr_with_extra_week=1,
- variation="nearest",
- ):
- FY5253Mixin.__init__(
- self, n, normalize, weekday, startingMonth, variation
- )
- self.qtr_with_extra_week = qtr_with_extra_week
-
- cpdef __setstate__(self, state):
- FY5253Mixin.__setstate__(self, state)
- self.qtr_with_extra_week = state.pop("qtr_with_extra_week")
-
- @cache_readonly
- def _offset(self):
- return FY5253(
- startingMonth=self.startingMonth,
- weekday=self.weekday,
- variation=self.variation,
- )
-
- def _rollback_to_year(self, other: datetime):
- """
- Roll `other` back to the most recent date that was on a fiscal year
- end.
-
- Return the date of that year-end, the number of full quarters
- elapsed between that year-end and other, and the remaining Timedelta
- since the most recent quarter-end.
-
- Parameters
- ----------
- other : datetime or Timestamp
-
- Returns
- -------
- tuple of
- prev_year_end : Timestamp giving most recent fiscal year end
- num_qtrs : int
- tdelta : Timedelta
- """
- num_qtrs = 0
-
- norm = Timestamp(other).tz_localize(None)
- start = self._offset.rollback(norm)
- # Note: start <= norm and self._offset.is_on_offset(start)
-
- if start < norm:
- # roll adjustment
- qtr_lens = self.get_weeks(norm)
-
- # check that qtr_lens is consistent with self._offset addition
- end = _shift_day(start, days=7 * sum(qtr_lens))
- assert self._offset.is_on_offset(end), (start, end, qtr_lens)
-
- tdelta = norm - start
- for qlen in qtr_lens:
- if qlen * 7 <= tdelta.days:
- num_qtrs += 1
- tdelta -= (
- <_Timedelta>Timedelta(days=qlen * 7)
- )._as_creso(norm._creso)
- else:
- break
- else:
- tdelta = Timedelta(0)
-
- # Note: we always have tdelta._value>= 0
- return start, num_qtrs, tdelta
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- # Note: self.n == 0 is not allowed.
-
- n = self.n
-
- prev_year_end, num_qtrs, tdelta = self._rollback_to_year(other)
- res = prev_year_end
- n += num_qtrs
- if self.n <= 0 and tdelta._value > 0:
- n += 1
-
- # Possible speedup by handling years first.
- years = n // 4
- if years:
- res += self._offset * years
- n -= years * 4
-
- # Add an extra day to make *sure* we are getting the quarter lengths
- # for the upcoming year, not the previous year
- qtr_lens = self.get_weeks(res + Timedelta(days=1))
-
- # Note: we always have 0 <= n < 4
- weeks = sum(qtr_lens[:n])
- if weeks:
- res = _shift_day(res, days=weeks * 7)
-
- return res
-
- def get_weeks(self, dt: datetime):
- ret = [13] * 4
-
- year_has_extra_week = self.year_has_extra_week(dt)
-
- if year_has_extra_week:
- ret[self.qtr_with_extra_week - 1] = 14
-
- return ret
-
- def year_has_extra_week(self, dt: datetime) -> bool:
- # Avoid round-down errors --> normalize to get
- # e.g. '370D' instead of '360D23H'
- norm = Timestamp(dt).normalize().tz_localize(None)
-
- next_year_end = self._offset.rollforward(norm)
- prev_year_end = norm - self._offset
- weeks_in_year = (next_year_end - prev_year_end).days / 7
- assert weeks_in_year in [52, 53], weeks_in_year
- return weeks_in_year == 53
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- if self._offset.is_on_offset(dt):
- return True
-
- next_year_end = dt - self._offset
-
- qtr_lens = self.get_weeks(dt)
-
- current = next_year_end
- for qtr_len in qtr_lens:
- current = _shift_day(current, days=qtr_len * 7)
- if dt == current:
- return True
- return False
-
- @property
- def rule_code(self) -> str:
- suffix = FY5253Mixin.rule_code.__get__(self)
- qtr = self.qtr_with_extra_week
- return f"{suffix}-{qtr}"
-
- @classmethod
- def _from_name(cls, *args):
- return cls(
- **dict(FY5253._parse_suffix(*args[:-1]), qtr_with_extra_week=int(args[-1]))
- )
-
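- # Quarter lengths used by get_weeks above (illustration only): a 52-week
- # fiscal year yields [13, 13, 13, 13]; in a 53-week year the quarter selected
- # by qtr_with_extra_week gets 14 weeks, e.g. qtr_with_extra_week=1 gives
- # [14, 13, 13, 13].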
-
-cdef class Easter(SingleConstructorOffset):
- """
- DateOffset for the Easter holiday using logic defined in dateutil.
-
- Right now uses the revised method which is valid in years 1583-4099.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 1, 1)
- >>> ts + pd.offsets.Easter()
- Timestamp('2022-04-17 00:00:00')
- """
-
- cpdef __setstate__(self, state):
- self.n = state.pop("n")
- self.normalize = state.pop("normalize")
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- current_easter = easter(other.year)
- current_easter = datetime(
- current_easter.year, current_easter.month, current_easter.day
- )
- current_easter = localize_pydatetime(current_easter, other.tzinfo)
-
- n = self.n
- if n >= 0 and other < current_easter:
- n -= 1
- elif n < 0 and other > current_easter:
- n += 1
- # TODO: Why does this handle the 0 case the opposite of others?
-
- # NOTE: easter returns a datetime.date so we have to convert to type of
- # other
- new = easter(other.year + n)
- new = datetime(
- new.year,
- new.month,
- new.day,
- other.hour,
- other.minute,
- other.second,
- other.microsecond,
- )
- return new
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- return date(dt.year, dt.month, dt.day) == easter(dt.year)
-
-
-# ----------------------------------------------------------------------
-# Custom Offset classes
-
-
-cdef class CustomBusinessDay(BusinessDay):
- """
- DateOffset subclass representing custom business days excluding holidays.
-
- Parameters
- ----------
- n : int, default 1
- The number of days represented.
- normalize : bool, default False
- Normalize start/end dates to midnight before generating date range.
- weekmask : str, Default 'Mon Tue Wed Thu Fri'
- Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
- holidays : list
- List/array of dates to exclude from the set of valid business days,
- passed to ``numpy.busdaycalendar``.
- calendar : np.busdaycalendar
- offset : timedelta, default timedelta(0)
-
- Examples
- --------
- >>> ts = pd.Timestamp(2022, 8, 5)
- >>> ts + pd.offsets.CustomBusinessDay(1)
- Timestamp('2022-08-08 00:00:00')
- """
-
- _prefix = "C"
- _attributes = tuple(
- ["n", "normalize", "weekmask", "holidays", "calendar", "offset"]
- )
-
- _apply_array = BaseOffset._apply_array
-
- def __init__(
- self,
- n=1,
- normalize=False,
- weekmask="Mon Tue Wed Thu Fri",
- holidays=None,
- calendar=None,
- offset=timedelta(0),
- ):
- BusinessDay.__init__(self, n, normalize, offset)
- self._init_custom(weekmask, holidays, calendar)
-
- cpdef __setstate__(self, state):
- self.holidays = state.pop("holidays")
- self.weekmask = state.pop("weekmask")
- BusinessDay.__setstate__(self, state)
-
- @apply_wraps
- def _apply(self, other):
- if self.n <= 0:
- roll = "forward"
- else:
- roll = "backward"
-
- if PyDateTime_Check(other):
- date_in = other
- np_dt = np.datetime64(date_in.date())
-
- np_incr_dt = np.busday_offset(
- np_dt, self.n, roll=roll, busdaycal=self.calendar
- )
-
- dt_date = np_incr_dt.astype(datetime)
- result = datetime.combine(dt_date, date_in.time())
-
- if self.offset:
- result = result + self.offset
- return result
-
- elif is_any_td_scalar(other):
- td = Timedelta(self.offset) + other
- return BDay(self.n, offset=td.to_pytimedelta(), normalize=self.normalize)
- else:
- raise ApplyTypeError(
- "Only know how to combine trading day with "
- "datetime, datetime64 or timedelta."
- )
-
- def is_on_offset(self, dt: datetime) -> bool:
- if self.normalize and not _is_normalized(dt):
- return False
- day64 = _to_dt64D(dt)
- return np.is_busday(day64, busdaycal=self.calendar)
-
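- # The hop above delegates to NumPy's business-day machinery.  A standalone
- # sketch of the same step (assumes only numpy; the holiday date is made up):
- #
- #   import numpy as np
- #
- #   cal = np.busdaycalendar(holidays=["2022-08-08"])
- #   np.busday_offset(np.datetime64("2022-08-05"), 1, roll="backward",
- #                    busdaycal=cal)
- #   # -> numpy.datetime64('2022-08-09'), skipping the weekend and the holiday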
-
-cdef class CustomBusinessHour(BusinessHour):
- """
- DateOffset subclass representing possibly n custom business hours.
-
- In CustomBusinessHour we can use custom weekmask, holidays, and calendar.
-
- Parameters
- ----------
- n : int, default 1
- The number of hours represented.
- normalize : bool, default False
- Normalize start/end dates to midnight before generating date range.
- weekmask : str, Default 'Mon Tue Wed Thu Fri'
- Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
- holidays : list
- List/array of dates to exclude from the set of valid business days,
- passed to ``numpy.busdaycalendar``.
- calendar : np.busdaycalendar
- Calendar to integrate.
- start : str, time, or list of str/time, default "09:00"
- Start time of your custom business hour in 24h format.
- end : str, time, or list of str/time, default "17:00"
- End time of your custom business hour in 24h format.
-
- Examples
- --------
- In the example below the default parameters give the next business hour.
-
- >>> ts = pd.Timestamp(2022, 8, 5, 16)
- >>> ts + pd.offsets.CustomBusinessHour()
- Timestamp('2022-08-08 09:00:00')
-
- We can also change the start and the end of business hours.
-
- >>> ts = pd.Timestamp(2022, 8, 5, 16)
- >>> ts + pd.offsets.CustomBusinessHour(start="11:00")
- Timestamp('2022-08-08 11:00:00')
-
- >>> from datetime import time as dt_time
- >>> ts = pd.Timestamp(2022, 8, 5, 16)
- >>> ts + pd.offsets.CustomBusinessHour(end=dt_time(19, 0))
- Timestamp('2022-08-05 17:00:00')
-
- >>> ts = pd.Timestamp(2022, 8, 5, 22)
- >>> ts + pd.offsets.CustomBusinessHour(end=dt_time(19, 0))
- Timestamp('2022-08-08 10:00:00')
-
- You can divide your business day hours into several parts.
-
- >>> import datetime as dt
- >>> freq = pd.offsets.CustomBusinessHour(start=["06:00", "10:00", "15:00"],
- ... end=["08:00", "12:00", "17:00"])
- >>> pd.date_range(dt.datetime(2022, 12, 9), dt.datetime(2022, 12, 13), freq=freq)
- DatetimeIndex(['2022-12-09 06:00:00', '2022-12-09 07:00:00',
- '2022-12-09 10:00:00', '2022-12-09 11:00:00',
- '2022-12-09 15:00:00', '2022-12-09 16:00:00',
- '2022-12-12 06:00:00', '2022-12-12 07:00:00',
- '2022-12-12 10:00:00', '2022-12-12 11:00:00',
- '2022-12-12 15:00:00', '2022-12-12 16:00:00'],
- dtype='datetime64[ns]', freq='CBH')
-
- Business days can be specified by the ``weekmask`` parameter. To convert
- the returned datetime object to its string representation
- the function strftime() is used in the next example.
-
- >>> import datetime as dt
- >>> freq = pd.offsets.CustomBusinessHour(weekmask="Mon Wed Fri",
- ... start="10:00", end="13:00")
- >>> pd.date_range(dt.datetime(2022, 12, 10), dt.datetime(2022, 12, 18),
- ... freq=freq).strftime('%a %d %b %Y %H:%M')
- Index(['Mon 12 Dec 2022 10:00', 'Mon 12 Dec 2022 11:00',
- 'Mon 12 Dec 2022 12:00', 'Wed 14 Dec 2022 10:00',
- 'Wed 14 Dec 2022 11:00', 'Wed 14 Dec 2022 12:00',
- 'Fri 16 Dec 2022 10:00', 'Fri 16 Dec 2022 11:00',
- 'Fri 16 Dec 2022 12:00'],
- dtype='object')
-
- Using NumPy business day calendar you can define custom holidays.
-
- >>> import datetime as dt
- >>> bdc = np.busdaycalendar(holidays=['2022-12-12', '2022-12-14'])
- >>> freq = pd.offsets.CustomBusinessHour(calendar=bdc, start="10:00", end="13:00")
- >>> pd.date_range(dt.datetime(2022, 12, 10), dt.datetime(2022, 12, 18), freq=freq)
- DatetimeIndex(['2022-12-13 10:00:00', '2022-12-13 11:00:00',
- '2022-12-13 12:00:00', '2022-12-15 10:00:00',
- '2022-12-15 11:00:00', '2022-12-15 12:00:00',
- '2022-12-16 10:00:00', '2022-12-16 11:00:00',
- '2022-12-16 12:00:00'],
- dtype='datetime64[ns]', freq='CBH')
- """
-
- _prefix = "CBH"
- _anchor = 0
- _attributes = tuple(
- ["n", "normalize", "weekmask", "holidays", "calendar", "start", "end", "offset"]
- )
-
- def __init__(
- self,
- n=1,
- normalize=False,
- weekmask="Mon Tue Wed Thu Fri",
- holidays=None,
- calendar=None,
- start="09:00",
- end="17:00",
- offset=timedelta(0),
- ):
- BusinessHour.__init__(self, n, normalize, start=start, end=end, offset=offset)
- self._init_custom(weekmask, holidays, calendar)
-
-
-cdef class _CustomBusinessMonth(BusinessMixin):
- """
- DateOffset subclass representing custom business month(s).
-
- Increments between beginning/end of month dates.
-
- Parameters
- ----------
- n : int, default 1
- The number of months represented.
- normalize : bool, default False
- Normalize start/end dates to midnight before generating date range.
- weekmask : str, Default 'Mon Tue Wed Thu Fri'
- Weekmask of valid business days, passed to ``numpy.busdaycalendar``.
- holidays : list
- List/array of dates to exclude from the set of valid business days,
- passed to ``numpy.busdaycalendar``.
- calendar : np.busdaycalendar
- Calendar to integrate.
- offset : timedelta, default timedelta(0)
- Time offset to apply.
- """
-
- _attributes = tuple(
- ["n", "normalize", "weekmask", "holidays", "calendar", "offset"]
- )
-
- def __init__(
- self,
- n=1,
- normalize=False,
- weekmask="Mon Tue Wed Thu Fri",
- holidays=None,
- calendar=None,
- offset=timedelta(0),
- ):
- BusinessMixin.__init__(self, n, normalize, offset)
- self._init_custom(weekmask, holidays, calendar)
-
- @cache_readonly
- def cbday_roll(self):
- """
- Define default roll function to be called in apply method.
- """
- cbday_kwds = self.kwds.copy()
- cbday_kwds["offset"] = timedelta(0)
-
- cbday = CustomBusinessDay(n=1, normalize=False, **cbday_kwds)
-
- if self._prefix.endswith("S"):
- # MonthBegin
- roll_func = cbday.rollforward
- else:
- # MonthEnd
- roll_func = cbday.rollback
- return roll_func
-
- @cache_readonly
- def m_offset(self):
- if self._prefix.endswith("S"):
- # MonthBegin
- moff = MonthBegin(n=1, normalize=False)
- else:
- # MonthEnd
- moff = MonthEnd(n=1, normalize=False)
- return moff
-
- @cache_readonly
- def month_roll(self):
- """
- Define default roll function to be called in apply method.
- """
- if self._prefix.endswith("S"):
- # MonthBegin
- roll_func = self.m_offset.rollback
- else:
- # MonthEnd
- roll_func = self.m_offset.rollforward
- return roll_func
-
- @apply_wraps
- def _apply(self, other: datetime) -> datetime:
- # First move to month offset
- cur_month_offset_date = self.month_roll(other)
-
- # Find this custom month offset
- compare_date = self.cbday_roll(cur_month_offset_date)
- n = roll_convention(other.day, self.n, compare_date.day)
-
- new = cur_month_offset_date + n * self.m_offset
- result = self.cbday_roll(new)
-
- if self.offset:
- result = result + self.offset
- return result
-
-
-cdef class CustomBusinessMonthEnd(_CustomBusinessMonth):
- _prefix = "CBM"
-
-
-cdef class CustomBusinessMonthBegin(_CustomBusinessMonth):
- _prefix = "CBMS"
-
-
-BDay = BusinessDay
-BMonthEnd = BusinessMonthEnd
-BMonthBegin = BusinessMonthBegin
-CBMonthEnd = CustomBusinessMonthEnd
-CBMonthBegin = CustomBusinessMonthBegin
-CDay = CustomBusinessDay
-
-# ----------------------------------------------------------------------
-# to_offset helpers
-
-prefix_mapping = {
- offset._prefix: offset
- for offset in [
- YearBegin, # 'AS'
- YearEnd, # 'A'
- BYearBegin, # 'BAS'
- BYearEnd, # 'BA'
- BusinessDay, # 'B'
- BusinessMonthBegin, # 'BMS'
- BusinessMonthEnd, # 'BM'
- BQuarterEnd, # 'BQ'
- BQuarterBegin, # 'BQS'
- BusinessHour, # 'BH'
- CustomBusinessDay, # 'C'
- CustomBusinessMonthEnd, # 'CBM'
- CustomBusinessMonthBegin, # 'CBMS'
- CustomBusinessHour, # 'CBH'
- MonthEnd, # 'M'
- MonthBegin, # 'MS'
- Nano, # 'N'
- SemiMonthEnd, # 'SM'
- SemiMonthBegin, # 'SMS'
- Week, # 'W'
- Second, # 'S'
- Minute, # 'T'
- Micro, # 'U'
- QuarterEnd, # 'Q'
- QuarterBegin, # 'QS'
- Milli, # 'L'
- Hour, # 'H'
- Day, # 'D'
- WeekOfMonth, # 'WOM'
- FY5253,
- FY5253Quarter,
- ]
-}
-
-# hack to handle WOM-1MON
-opattern = re.compile(
- r"([+\-]?\d*|[+\-]?\d*\.\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)"
-)
-
-_lite_rule_alias = {
- "W": "W-SUN",
- "Q": "Q-DEC",
-
- "A": "A-DEC", # YearEnd(month=12),
- "Y": "A-DEC",
- "AS": "AS-JAN", # YearBegin(month=1),
- "YS": "AS-JAN",
- "BA": "BA-DEC", # BYearEnd(month=12),
- "BY": "BA-DEC",
- "BAS": "BAS-JAN", # BYearBegin(month=1),
- "BYS": "BAS-JAN",
-
- "Min": "T",
- "min": "T",
- "ms": "L",
- "us": "U",
- "ns": "N",
-}
-
-_dont_uppercase = {"MS", "ms"}
-
-INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}"
-
-# TODO: still needed?
-# cache of previously seen offsets
-_offset_map = {}
-
-
-# TODO: better name?
-def _get_offset(name: str) -> BaseOffset:
- """
- Return DateOffset object associated with rule name.
-
- Examples
- --------
- _get_offset('EOM') --> BMonthEnd(1)
- """
- if name not in _dont_uppercase:
- name = name.upper()
- name = _lite_rule_alias.get(name, name)
- name = _lite_rule_alias.get(name.lower(), name)
- else:
- name = _lite_rule_alias.get(name, name)
-
- if name not in _offset_map:
- try:
- split = name.split("-")
- klass = prefix_mapping[split[0]]
- # handles case where there's no suffix (and will TypeError if too
- # many '-')
- offset = klass._from_name(*split[1:])
- except (ValueError, TypeError, KeyError) as err:
- # bad prefix or suffix
- raise ValueError(INVALID_FREQ_ERR_MSG.format(name)) from err
- # cache
- _offset_map[name] = offset
-
- return _offset_map[name]
-
-
-cpdef to_offset(freq):
- """
- Return DateOffset object from string or datetime.timedelta object.
-
- Parameters
- ----------
- freq : str, datetime.timedelta, BaseOffset or None
-
- Returns
- -------
- DateOffset or None
-
- Raises
- ------
- ValueError
- If freq is an invalid frequency
-
- See Also
- --------
- BaseOffset : Standard kind of date increment used for a date range.
-
- Examples
- --------
- >>> to_offset("5min")
- <5 * Minutes>
-
- >>> to_offset("1D1H")
- <25 * Hours>
-
- >>> to_offset("2W")
- <2 * Weeks: weekday=6>
-
- >>> to_offset("2B")
- <2 * BusinessDays>
-
- >>> to_offset(pd.Timedelta(days=1))
- <Day>
-
- >>> to_offset(Hour())
- <Hour>
- """
- if freq is None:
- return None
-
- if isinstance(freq, BaseOffset):
- return freq
-
- if isinstance(freq, tuple):
- raise TypeError(
- f"to_offset does not support tuples {freq}, pass as a string instead"
- )
-
- elif PyDelta_Check(freq):
- return delta_to_tick(freq)
-
- elif isinstance(freq, str):
- delta = None
- stride_sign = None
-
- try:
- split = opattern.split(freq)
- if split[-1] != "" and not split[-1].isspace():
- # the last element must be blank
- raise ValueError("last element must be blank")
-
- tups = zip(split[0::4], split[1::4], split[2::4])
- for n, (sep, stride, name) in enumerate(tups):
- if sep != "" and not sep.isspace():
- raise ValueError("separator must be spaces")
- prefix = _lite_rule_alias.get(name) or name
- if stride_sign is None:
- stride_sign = -1 if stride.startswith("-") else 1
- if not stride:
- stride = 1
-
- if prefix in {"D", "H", "T", "S", "L", "U", "N"}:
- # For these prefixes, we have something like "3H" or
- # "2.5T", so we can construct a Timedelta with the
- # matching unit and get our offset from delta_to_tick
- td = Timedelta(1, unit=prefix)
- off = delta_to_tick(td)
- offset = off * float(stride)
- if n != 0:
- # If n==0, then stride_sign is already incorporated
- # into the offset
- offset *= stride_sign
- else:
- stride = int(stride)
- offset = _get_offset(name)
- offset = offset * int(np.fabs(stride) * stride_sign)
-
- if delta is None:
- delta = offset
- else:
- delta = delta + offset
- except (ValueError, TypeError) as err:
- raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) from err
- else:
- delta = None
-
- if delta is None:
- raise ValueError(INVALID_FREQ_ERR_MSG.format(freq))
-
- return delta
-
-
-# ----------------------------------------------------------------------
-# RelativeDelta Arithmetic
-
-cdef datetime _shift_day(datetime other, int days):
- """
- Increment the datetime `other` by the given number of days, retaining
- the time-portion of the datetime. For tz-naive datetimes this is
- equivalent to adding a timedelta. For tz-aware datetimes it is similar to
- dateutil's relativedelta.__add__, but handles pytz tzinfo objects.
-
- Parameters
- ----------
- other : datetime or Timestamp
- days : int
-
- Returns
- -------
- shifted: datetime or Timestamp
- """
- if other.tzinfo is None:
- return other + timedelta(days=days)
-
- tz = other.tzinfo
- naive = other.replace(tzinfo=None)
- shifted = naive + timedelta(days=days)
- return localize_pydatetime(shifted, tz)
-
-
-cdef int year_add_months(npy_datetimestruct dts, int months) nogil:
- """
- New year number after shifting npy_datetimestruct number of months.
- """
- return dts.year + (dts.month + months - 1) // 12
-
-
-cdef int month_add_months(npy_datetimestruct dts, int months) nogil:
- """
- New month number after shifting npy_datetimestruct
- number of months.
- """
- cdef:
- int new_month = (dts.month + months) % 12
- return 12 if new_month == 0 else new_month
-
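- # Worked example for the two helpers above (illustration only): for a struct
- # at 2021-11 with months=3, year_add_months gives 2021 + (11 + 3 - 1) // 12
- # = 2022 and month_add_months gives (11 + 3) % 12 = 2, i.e. February 2022.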
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef ndarray shift_quarters(
- ndarray dtindex,
- int quarters,
- int q1start_month,
- str day_opt,
- int modby=3,
- NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns,
-):
- """
- Given an int64 array representing nanosecond timestamps, shift all elements
- by the specified number of quarters using DateOffset semantics.
-
- Parameters
- ----------
- dtindex : int64_t[:] timestamps for input dates
- quarters : int number of quarters to shift
- q1start_month : int month in which Q1 begins by convention
- day_opt : {'start', 'end', 'business_start', 'business_end'}
- modby : int (3 for quarters, 12 for years)
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- out : ndarray[int64_t]
- """
- if day_opt not in ["start", "end", "business_start", "business_end"]:
- raise ValueError("day must be None, 'start', 'end', "
- "'business_start', or 'business_end'")
-
- cdef:
- Py_ssize_t count = dtindex.size
- ndarray out = cnp.PyArray_EMPTY(dtindex.ndim, dtindex.shape, cnp.NPY_INT64, 0)
- Py_ssize_t i
- int64_t val, res_val
- int months_since, n
- npy_datetimestruct dts
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, dtindex)
-
- with nogil:
- for i in range(count):
- # Analogous to: val = dtindex[i]
- val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if val == NPY_NAT:
- res_val = NPY_NAT
- else:
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- n = quarters
-
- months_since = (dts.month - q1start_month) % modby
- n = _roll_qtrday(&dts, n, months_since, day_opt)
-
- dts.year = year_add_months(dts, modby * n - months_since)
- dts.month = month_add_months(dts, modby * n - months_since)
- dts.day = get_day_of_month(&dts, day_opt)
-
- res_val = npy_datetimestruct_to_datetime(reso, &dts)
-
- # Analogous to: out[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return out
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def shift_months(
- ndarray dtindex, # int64_t, arbitrary ndim
- int months,
- str day_opt=None,
- NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns,
-):
- """
- Given an int64-based datetime index, shift all elements by the
- specified number of months using DateOffset semantics.
-
- day_opt: {None, 'start', 'end', 'business_start', 'business_end'}
- * None: day of month
- * 'start': 1st day of month
- * 'end': last day of month
- * 'business_start': first business day of month
- * 'business_end': last business day of month
- """
- cdef:
- Py_ssize_t i
- npy_datetimestruct dts
- int count = dtindex.size
- ndarray out = cnp.PyArray_EMPTY(dtindex.ndim, dtindex.shape, cnp.NPY_INT64, 0)
- int months_to_roll
- int64_t val, res_val
-
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(out, dtindex)
-
- if day_opt is not None and day_opt not in {
- "start", "end", "business_start", "business_end"
- }:
- raise ValueError("day must be None, 'start', 'end', "
- "'business_start', or 'business_end'")
-
- if day_opt is None:
- # TODO: can we combine this with the non-None case?
- with nogil:
- for i in range(count):
- # Analogous to: val = i8other[i]
- val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if val == NPY_NAT:
- res_val = NPY_NAT
- else:
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- dts.year = year_add_months(dts, months)
- dts.month = month_add_months(dts, months)
-
- dts.day = min(dts.day, get_days_in_month(dts.year, dts.month))
- res_val = npy_datetimestruct_to_datetime(reso, &dts)
-
- # Analogous to: out[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- else:
- with nogil:
- for i in range(count):
-
- # Analogous to: val = i8other[i]
- val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if val == NPY_NAT:
- res_val = NPY_NAT
- else:
- pandas_datetime_to_datetimestruct(val, reso, &dts)
- months_to_roll = months
-
- months_to_roll = _roll_qtrday(&dts, months_to_roll, 0, day_opt)
-
- dts.year = year_add_months(dts, months_to_roll)
- dts.month = month_add_months(dts, months_to_roll)
- dts.day = get_day_of_month(&dts, day_opt)
-
- res_val = npy_datetimestruct_to_datetime(reso, &dts)
-
- # Analogous to: out[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return out
-
-
-def shift_month(stamp: datetime, months: int, day_opt: object = None) -> datetime:
- """
- Given a datetime (or Timestamp) `stamp`, an integer `months` and an
- option `day_opt`, return a new datetimelike that many months later,
- with day determined by `day_opt` using relativedelta semantics.
-
- Scalar analogue of shift_months.
-
- Parameters
- ----------
- stamp : datetime or Timestamp
- months : int
- day_opt : None, 'start', 'end', 'business_start', 'business_end', or int
- None: returned datetimelike has the same day as the input, or the
- last day of the month if the new month is too short
- 'start': returned datetimelike has day=1
- 'end': returned datetimelike has day on the last day of the month
- 'business_start': returned datetimelike has day on the first
- business day of the month
- 'business_end': returned datetimelike has day on the last
- business day of the month
- int: returned datetimelike has day equal to day_opt
-
- Returns
- -------
- shifted : datetime or Timestamp (same as input `stamp`)
- """
- cdef:
- int year, month, day
- int days_in_month, dy
-
- dy = (stamp.month + months) // 12
- month = (stamp.month + months) % 12
-
- if month == 0:
- month = 12
- dy -= 1
- year = stamp.year + dy
-
- if day_opt is None:
- days_in_month = get_days_in_month(year, month)
- day = min(stamp.day, days_in_month)
- elif day_opt == "start":
- day = 1
- elif day_opt == "end":
- day = get_days_in_month(year, month)
- elif day_opt == "business_start":
- # first business day of month
- day = get_firstbday(year, month)
- elif day_opt == "business_end":
- # last business day of month
- day = get_lastbday(year, month)
- elif is_integer_object(day_opt):
- days_in_month = get_days_in_month(year, month)
- day = min(day_opt, days_in_month)
- else:
- raise ValueError(day_opt)
- return stamp.replace(year=year, month=month, day=day)
-
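At the public level, the day_opt=None clamping described above is what Timestamp plus DateOffset(months=...) arithmetic produces (a hedged illustration of the semantics, not a claim about the exact call path):

    >>> import pandas as pd
    >>> pd.Timestamp("2017-01-31") + pd.DateOffset(months=1)
    Timestamp('2017-02-28 00:00:00')
    >>> pd.Timestamp("2017-01-31") + pd.DateOffset(months=2)
    Timestamp('2017-03-31 00:00:00')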
-
-cdef int get_day_of_month(npy_datetimestruct* dts, str day_opt) nogil:
- """
-    Find the day in `dts`'s month that satisfies a DateOffset's is_on_offset
- policy, as described by the `day_opt` argument.
-
- Parameters
- ----------
- dts : npy_datetimestruct*
- day_opt : {'start', 'end', 'business_start', 'business_end'}
- 'start': returns 1
- 'end': returns last day of the month
- 'business_start': returns the first business day of the month
- 'business_end': returns the last business day of the month
-
- Returns
- -------
- day_of_month : int
-
-    Examples
-    --------
- >>> other = datetime(2017, 11, 14)
- >>> get_day_of_month(other, 'start')
- 1
- >>> get_day_of_month(other, 'end')
- 30
-
- Notes
- -----
- Caller is responsible for ensuring one of the four accepted day_opt values
- is passed.
- """
-
- if day_opt == "start":
- return 1
- elif day_opt == "end":
- return get_days_in_month(dts.year, dts.month)
- elif day_opt == "business_start":
- # first business day of month
- return get_firstbday(dts.year, dts.month)
- else:
- # i.e. day_opt == "business_end":
- # last business day of month
- return get_lastbday(dts.year, dts.month)
-
-
-cpdef int roll_convention(int other, int n, int compare) nogil:
- """
- Possibly increment or decrement the number of periods to shift
- based on rollforward/rollbackward conventions.
-
- Parameters
- ----------
- other : int, generally the day component of a datetime
- n : number of periods to increment, before adjusting for rolling
- compare : int, generally the day component of a datetime, in the same
-        month as the datetime from which `other` was taken.
-
- Returns
- -------
- n : int number of periods to increment
- """
- if n > 0 and other < compare:
- n -= 1
- elif n <= 0 and other > compare:
- # as if rolled forward already
- n += 1
- return n
-
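A pure-Python port of the function above, with a couple of worked values, may make the convention easier to follow (illustration only; the name and values are local to this sketch):

    def roll_convention(other: int, n: int, compare: int) -> int:
        # Direct port of the cpdef above.
        if n > 0 and other < compare:
            # before this period's anchor: the first step lands on the anchor
            n -= 1
        elif n <= 0 and other > compare:
            # already past the anchor: behave as if rolled forward already
            n += 1
        return n

    >>> roll_convention(14, 1, 31)    # day 14, shifting forward, anchor day 31
    0
    >>> roll_convention(31, -1, 14)   # day 31, shifting backward, anchor day 14
    0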
-
-def roll_qtrday(other: datetime, n: int, month: int,
- day_opt: str, modby: int) -> int:
- """
- Possibly increment or decrement the number of periods to shift
- based on rollforward/rollbackward conventions.
-
- Parameters
- ----------
- other : datetime or Timestamp
- n : number of periods to increment, before adjusting for rolling
- month : int reference month giving the first month of the year
- day_opt : {'start', 'end', 'business_start', 'business_end'}
- The convention to use in finding the day in a given month against
- which to compare for rollforward/rollbackward decisions.
- modby : int 3 for quarters, 12 for years
-
- Returns
- -------
- n : int number of periods to increment
-
- See Also
- --------
- get_day_of_month : Find the day in a month provided an offset.
- """
- cdef:
- int months_since
- npy_datetimestruct dts
-
- if day_opt not in ["start", "end", "business_start", "business_end"]:
- raise ValueError(day_opt)
-
- pydate_to_dtstruct(other, &dts)
-
- if modby == 12:
- # We care about the month-of-year, not month-of-quarter, so skip mod
- months_since = other.month - month
- else:
- months_since = other.month % modby - month % modby
-
- return _roll_qtrday(&dts, n, months_since, day_opt)
-
-
-cdef int _roll_qtrday(npy_datetimestruct* dts,
- int n,
- int months_since,
- str day_opt) except? -1 nogil:
- """
- See roll_qtrday.__doc__
- """
-
- if n > 0:
- if months_since < 0 or (months_since == 0 and
- dts.day < get_day_of_month(dts, day_opt)):
- # pretend to roll back if on same month but
- # before compare_day
- n -= 1
- else:
- if months_since > 0 or (months_since == 0 and
- dts.day > get_day_of_month(dts, day_opt)):
- # make sure to roll forward, so negate
- n += 1
- return n
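The same rolling shows up in the public quarterly offsets; for example (hedged; the default QuarterEnd ends quarters in Mar/Jun/Sep/Dec):

    >>> import pandas as pd
    >>> pd.Timestamp("2017-02-01") + pd.offsets.QuarterEnd()   # roll forward inside the quarter
    Timestamp('2017-03-31 00:00:00')
    >>> pd.Timestamp("2017-03-31") + pd.offsets.QuarterEnd()   # already on offset: next quarter end
    Timestamp('2017-06-30 00:00:00')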
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pxd
deleted file mode 100644
index 8809c81b530..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pxd
+++ /dev/null
@@ -1,14 +0,0 @@
-from cpython.datetime cimport datetime
-
-from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
-
-
-cpdef str get_rule_month(str source)
-cpdef quarter_to_myear(int year, int quarter, str freq)
-
-cdef datetime parse_datetime_string(
- str date_string,
- bint dayfirst,
- bint yearfirst,
- NPY_DATETIMEUNIT* out_bestunit
-)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyi
deleted file mode 100644
index 83a5b0085f0..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyi
+++ /dev/null
@@ -1,38 +0,0 @@
-from datetime import datetime
-
-import numpy as np
-
-from pandas._typing import npt
-
-class DateParseError(ValueError): ...
-
-def py_parse_datetime_string(
- date_string: str,
- dayfirst: bool = ...,
- yearfirst: bool = ...,
-) -> datetime: ...
-def parse_datetime_string_with_reso(
- date_string: str,
- freq: str | None = ...,
- dayfirst: bool | None = ...,
- yearfirst: bool | None = ...,
-) -> tuple[datetime, str]: ...
-def _does_string_look_like_datetime(py_string: str) -> bool: ...
-def quarter_to_myear(year: int, quarter: int, freq: str) -> tuple[int, int]: ...
-def try_parse_dates(
- values: npt.NDArray[np.object_], # object[:]
- parser,
-) -> npt.NDArray[np.object_]: ...
-def try_parse_year_month_day(
- years: npt.NDArray[np.object_], # object[:]
- months: npt.NDArray[np.object_], # object[:]
- days: npt.NDArray[np.object_], # object[:]
-) -> npt.NDArray[np.object_]: ...
-def guess_datetime_format(
- dt_str,
- dayfirst: bool | None = ...,
-) -> str | None: ...
-def concat_date_cols(
- date_cols: tuple,
-) -> npt.NDArray[np.object_]: ...
-def get_rule_month(source: str) -> str: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyx
deleted file mode 100644
index 146e14f622c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/parsing.pyx
+++ /dev/null
@@ -1,1189 +0,0 @@
-"""
-Parsing functions for datetime and datetime-like strings.
-"""
-import re
-import time
-import warnings
-
-from pandas.util._exceptions import find_stack_level
-
-cimport cython
-from cpython.datetime cimport (
- datetime,
- datetime_new,
- import_datetime,
- timedelta,
- tzinfo,
-)
-
-from datetime import timezone
-
-from cpython.object cimport PyObject_Str
-from cython cimport Py_ssize_t
-from libc.string cimport strchr
-
-import_datetime()
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- PyArray_GETITEM,
- PyArray_ITER_DATA,
- PyArray_ITER_NEXT,
- PyArray_IterNew,
- flatiter,
- float64_t,
-)
-
-cnp.import_array()
-
-# dateutil compat
-
-from decimal import InvalidOperation
-
-from dateutil.parser import (
- DEFAULTPARSER,
- parse as du_parse,
-)
-from dateutil.relativedelta import relativedelta
-from dateutil.tz import (
- tzlocal as _dateutil_tzlocal,
- tzoffset,
- tzutc as _dateutil_tzutc,
-)
-
-from pandas._config import get_option
-
-from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
-from pandas._libs.tslibs.dtypes cimport (
- attrname_to_npy_unit,
- npy_unit_to_attrname,
-)
-from pandas._libs.tslibs.nattype cimport (
- c_NaT as NaT,
- c_nat_strings as nat_strings,
-)
-
-from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
-
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- npy_datetimestruct,
- string_to_dts,
-)
-
-from pandas._libs.tslibs.strptime import array_strptime
-
-from pandas._libs.tslibs.util cimport (
- get_c_string_buf_and_size,
- is_array,
-)
-
-
-cdef extern from "../src/headers/portable.h":
- int getdigit_ascii(char c, int default) nogil
-
-cdef extern from "../src/parser/tokenizer.h":
- double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
- int skip_trailing, int *error, int *maybe_int)
-
-
-# ----------------------------------------------------------------------
-# Constants
-
-
-class DateParseError(ValueError):
- pass
-
-
-_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
- second=0, microsecond=0)
-
-cdef:
- set _not_datelike_strings = {"a", "A", "m", "M", "p", "P", "t", "T"}
-
- # _timestamp_units -> units that we round to nanos
- set _timestamp_units = {
- NPY_DATETIMEUNIT.NPY_FR_ns,
- NPY_DATETIMEUNIT.NPY_FR_ps,
- NPY_DATETIMEUNIT.NPY_FR_fs,
- NPY_DATETIMEUNIT.NPY_FR_as,
- }
-
-# ----------------------------------------------------------------------
-cdef:
- const char* delimiters = " /-."
- int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12
-
-
-cdef bint _is_delimiter(const char ch):
- return strchr(delimiters, ch) != NULL
-
-
-cdef int _parse_1digit(const char* s):
- cdef int result = 0
- result += getdigit_ascii(s[0], -10) * 1
- return result
-
-
-cdef int _parse_2digit(const char* s):
- cdef int result = 0
- result += getdigit_ascii(s[0], -10) * 10
- result += getdigit_ascii(s[1], -100) * 1
- return result
-
-
-cdef int _parse_4digit(const char* s):
- cdef int result = 0
- result += getdigit_ascii(s[0], -10) * 1000
- result += getdigit_ascii(s[1], -100) * 100
- result += getdigit_ascii(s[2], -1000) * 10
- result += getdigit_ascii(s[3], -10000) * 1
- return result
-
-
-cdef datetime _parse_delimited_date(
- str date_string, bint dayfirst, NPY_DATETIMEUNIT* out_bestunit
-):
- """
- Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY.
-
-    With `dayfirst == False` (the default) the function first tries to parse
-    the date as MM/DD/YYYY, falling back to DD/MM/YYYY if month > 12.
-    With `dayfirst == True` it first tries DD/MM/YYYY and falls back to
-    MM/DD/YYYY if that attempt fails.
-
- For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-.
- For MM/YYYY: delimiter can be a space or one of /-
- If `date_string` can't be converted to date, then function returns
- None, None
-
- Parameters
- ----------
- date_string : str
- dayfirst : bool
- out_bestunit : NPY_DATETIMEUNIT*
- For specifying identified resolution.
-
-    Returns
-    -------
-    datetime or None
- """
- cdef:
- const char* buf
- Py_ssize_t length
- int day = 1, month = 1, year
- bint can_swap = 0
-
- buf = get_c_string_buf_and_size(date_string, &length)
- if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
- # parsing MM?DD?YYYY and DD?MM?YYYY dates
- month = _parse_2digit(buf)
- day = _parse_2digit(buf + 3)
- year = _parse_4digit(buf + 6)
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
- can_swap = 1
- elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
- # parsing M?DD?YYYY and D?MM?YYYY dates
- month = _parse_1digit(buf)
- day = _parse_2digit(buf + 2)
- year = _parse_4digit(buf + 5)
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
- can_swap = 1
- elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
- # parsing MM?D?YYYY and DD?M?YYYY dates
- month = _parse_2digit(buf)
- day = _parse_1digit(buf + 3)
- year = _parse_4digit(buf + 5)
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
- can_swap = 1
- elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
- # parsing M?D?YYYY and D?M?YYYY dates
- month = _parse_1digit(buf)
- day = _parse_1digit(buf + 2)
- year = _parse_4digit(buf + 4)
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_D
- can_swap = 1
- elif length == 7 and _is_delimiter(buf[2]):
- # parsing MM?YYYY dates
- if buf[2] == b".":
- # we cannot reliably tell whether e.g. 10.2010 is a float
- # or a date, thus we refuse to parse it here
- return None
- month = _parse_2digit(buf)
- year = _parse_4digit(buf + 3)
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
- else:
- return None
-
- if month < 0 or day < 0 or year < 1000:
- # some part is not an integer, so
- # date_string can't be converted to date, above format
- return None
-
- if 1 <= month <= MAX_DAYS_IN_MONTH and 1 <= day <= MAX_DAYS_IN_MONTH \
- and (month <= MAX_MONTH or day <= MAX_MONTH):
- if (month > MAX_MONTH or (day <= MAX_MONTH and dayfirst)) and can_swap:
- day, month = month, day
-        # In Python <= 3.6.0 there is no range checking for invalid dates
-        # in the C API, so we call the faster C version for 3.6.1 or newer
- return datetime_new(year, month, day, 0, 0, 0, 0, None)
-
- raise DateParseError(f"Invalid date specified ({month}/{day})")
-
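This fast path is what drives the dayfirst behavior of pd.to_datetime for purely delimited dates; a hedged example (the exact Timestamp repr may vary slightly between versions):

    >>> import pandas as pd
    >>> pd.to_datetime("02/03/2024")                  # month first by default
    Timestamp('2024-02-03 00:00:00')
    >>> pd.to_datetime("02/03/2024", dayfirst=True)   # day first
    Timestamp('2024-03-02 00:00:00')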
-
-cdef bint _does_string_look_like_time(str parse_string):
- """
-    Check whether the given string is a time: it has to start with either
-    H:MM or HH:MM, and the hour and minute values must be valid.
-
- Parameters
- ----------
- parse_string : str
-
-    Returns
-    -------
- bool
- Whether given string is potentially a time.
- """
- cdef:
- const char* buf
- Py_ssize_t length
- int hour = -1, minute = -1
-
- buf = get_c_string_buf_and_size(parse_string, &length)
- if length >= 4:
- if buf[1] == b":":
- # h:MM format
- hour = getdigit_ascii(buf[0], -1)
- minute = _parse_2digit(buf + 2)
- elif buf[2] == b":":
- # HH:MM format
- hour = _parse_2digit(buf)
- minute = _parse_2digit(buf + 3)
-
- return 0 <= hour <= 23 and 0 <= minute <= 59
-
-
-def py_parse_datetime_string(
- str date_string, bint dayfirst=False, bint yearfirst=False
-):
- # Python-accessible version for testing (we can't just make
- # parse_datetime_string cpdef bc it has a pointer argument)
- cdef:
- NPY_DATETIMEUNIT out_bestunit
-
- return parse_datetime_string(date_string, dayfirst, yearfirst, &out_bestunit)
-
-
-cdef datetime parse_datetime_string(
- # NB: This will break with np.str_ (GH#32264) even though
- # isinstance(npstrobj, str) evaluates to True, so caller must ensure
- # the argument is *exactly* 'str'
- str date_string,
- bint dayfirst,
- bint yearfirst,
- NPY_DATETIMEUNIT* out_bestunit
-):
- """
-    Parse a datetime string; always returns a datetime.
-    Also takes special care with strings that match time patterns.
-
- Returns
- -------
- datetime
-
- Notes
- -----
- Does not handle "today" or "now", which caller is responsible for handling.
- """
-
- cdef:
- datetime dt
- bint is_quarter = 0
-
- if not _does_string_look_like_datetime(date_string):
- raise ValueError(f'Given date string "{date_string}" not likely a datetime')
-
- if _does_string_look_like_time(date_string):
- # use current datetime as default, not pass _DEFAULT_DATETIME
- dt = du_parse(date_string, dayfirst=dayfirst,
- yearfirst=yearfirst)
- return dt
-
- dt = _parse_delimited_date(date_string, dayfirst, out_bestunit)
- if dt is not None:
- return dt
-
- try:
- dt = _parse_dateabbr_string(
- date_string, _DEFAULT_DATETIME, None, out_bestunit, &is_quarter
- )
- return dt
- except DateParseError:
- raise
- except ValueError:
- pass
-
- dt = dateutil_parse(date_string, default=_DEFAULT_DATETIME,
- dayfirst=dayfirst, yearfirst=yearfirst,
- ignoretz=False, out_bestunit=out_bestunit)
- return dt
-
-
-def parse_datetime_string_with_reso(
- str date_string, str freq=None, dayfirst=None, yearfirst=None
-):
- # NB: This will break with np.str_ (GH#45580) even though
- # isinstance(npstrobj, str) evaluates to True, so caller must ensure
- # the argument is *exactly* 'str'
- """
- Try hard to parse datetime string, leveraging dateutil plus some extra
- goodies like quarter recognition.
-
- Parameters
- ----------
- date_string : str
- freq : str or None, default None
-        Helps with interpreting time string if supplied.
-        Corresponds to `offset.rule_code`.
- dayfirst : bool, default None
- If None uses default from print_config
- yearfirst : bool, default None
- If None uses default from print_config
-
- Returns
- -------
- datetime
- str
- Describing resolution of parsed string.
-
- Raises
- ------
- ValueError : preliminary check suggests string is not datetime
- DateParseError : error within dateutil
- """
-
- if dayfirst is None:
- dayfirst = get_option("display.date_dayfirst")
- if yearfirst is None:
- yearfirst = get_option("display.date_yearfirst")
-
- cdef:
- datetime parsed
- str reso
- bint string_to_dts_failed
- npy_datetimestruct dts
- NPY_DATETIMEUNIT out_bestunit
- int out_local = 0
- int out_tzoffset
- tzinfo tz
- bint is_quarter = 0
-
- if not _does_string_look_like_datetime(date_string):
- raise ValueError(f'Given date string "{date_string}" not likely a datetime')
-
- # Try iso8601 first, as it handles nanoseconds
- string_to_dts_failed = string_to_dts(
- date_string, &dts, &out_bestunit, &out_local,
- &out_tzoffset, False
- )
- if not string_to_dts_failed:
- # Match Timestamp and drop picoseconds, femtoseconds, attoseconds
- # The new resolution will just be nano
- # GH#50417
- if out_bestunit in _timestamp_units:
- out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns
-
- if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns:
- # TODO: avoid circular import
- from pandas import Timestamp
- parsed = Timestamp(date_string)
- else:
- if out_local:
- tz = timezone(timedelta(minutes=out_tzoffset))
- else:
- tz = None
- parsed = datetime_new(
- dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz
- )
-
- reso = npy_unit_to_attrname[out_bestunit]
- return parsed, reso
-
- parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit)
- if parsed is not None:
- reso = npy_unit_to_attrname[out_bestunit]
- return parsed, reso
-
- try:
- parsed = _parse_dateabbr_string(
- date_string, _DEFAULT_DATETIME, freq, &out_bestunit, &is_quarter
- )
- except DateParseError:
- raise
- except ValueError:
- pass
- else:
- if is_quarter:
- reso = "quarter"
- else:
- reso = npy_unit_to_attrname[out_bestunit]
- return parsed, reso
-
- parsed = dateutil_parse(date_string, _DEFAULT_DATETIME,
- dayfirst=dayfirst, yearfirst=yearfirst,
- ignoretz=False, out_bestunit=&out_bestunit)
- reso = npy_unit_to_attrname[out_bestunit]
- return parsed, reso
-
-
-cpdef bint _does_string_look_like_datetime(str py_string):
- """
-    Check whether the given string looks like a datetime: it has to start
-    with '0' or, when parsed as a number, be greater than or equal to 1000.
-
- Parameters
- ----------
- py_string: str
-
- Returns
- -------
- bool
- Whether given string is potentially a datetime.
- """
- cdef:
- const char *buf
- char *endptr = NULL
- Py_ssize_t length = -1
- double converted_date
- char first
- int error = 0
-
- buf = get_c_string_buf_and_size(py_string, &length)
- if length >= 1:
- first = buf[0]
- if first == b"0":
- # Strings starting with 0 are more consistent with a
- # date-like string than a number
- return True
- elif py_string in _not_datelike_strings:
- return False
- else:
-            # xstrtod with these parameters mimics the behavior of Python's
-            # `float` cast; for example, " 35.e-1 " is a valid string for that
-            # cast, so xstrtod must be called with: b'.' - use a dot as the
-            # decimal separator, b'e' - allow an exponential form of a float
-            # number, b'\0' - do not use a thousands separator, 1 - skip
-            # extra spaces before and after,
- converted_date = xstrtod(buf, &endptr,
- b".", b"e", b"\0", 1, &error, NULL)
- # if there were no errors and the whole line was parsed, then ...
- if error == 0 and endptr == buf + length:
- return converted_date >= 1000
-
- return True
-
-
-cdef datetime _parse_dateabbr_string(str date_string, datetime default,
- str freq, NPY_DATETIMEUNIT* out_bestunit,
- bint* is_quarter):
- # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
- cdef:
- datetime ret
- # year initialized to prevent compiler warnings
- int year = -1, quarter = -1, month
- Py_ssize_t date_len
- const char* buf
-
- if date_string in nat_strings:
- # default to nanos, could also reasonably do NPY_FR_GENERIC
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_ns
- return NaT
-
- date_string = date_string.upper()
- date_len = len(date_string)
-
- if date_len == 4:
- # parse year only like 2000
- try:
- ret = default.replace(year=int(date_string))
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_Y
- return ret
- except ValueError:
- pass
-
- if 4 <= date_len <= 7:
- buf = get_c_string_buf_and_size(date_string, &date_len)
- try:
- i = date_string.index("Q", 1, 6)
- if i == 1:
- quarter = _parse_1digit(buf) # i.e. int(date_string[0])
- if date_len == 4 or (date_len == 5
- and date_string[i + 1] == "-"):
- # r'(\d)Q-?(\d\d)')
- year = 2000 + int(date_string[-2:])
- elif date_len == 6 or (date_len == 7
- and date_string[i + 1] == "-"):
- # r'(\d)Q-?(\d\d\d\d)')
- year = int(date_string[-4:])
- else:
- raise ValueError
- elif i == 2 or i == 3:
- # r'(\d\d)-?Q(\d)'
- if date_len == 4 or (date_len == 5
- and date_string[i - 1] == "-"):
- # i.e. quarter = int(date_string[-1])
- quarter = _parse_1digit(buf + date_len - 1)
- year = 2000 + int(date_string[:2])
- else:
- raise ValueError
- elif i == 4 or i == 5:
- if date_len == 6 or (date_len == 7
- and date_string[i - 1] == "-"):
- # r'(\d\d\d\d)-?Q(\d)'
- # i.e. quarter = int(date_string[-1])
- quarter = _parse_1digit(buf + date_len - 1)
- year = int(date_string[:4])
- else:
- raise ValueError
-
- if not (1 <= quarter <= 4):
- raise DateParseError(f"Incorrect quarterly string is given, "
- f"quarter must be "
- f"between 1 and 4: {date_string}")
-
- try:
- # GH#1228
- year, month = quarter_to_myear(year, quarter, freq)
- except KeyError:
- raise DateParseError("Unable to retrieve month "
- "information from given "
- f"freq: {freq}")
-
- ret = default.replace(year=year, month=month)
- # Monthly is as close as we can get to a non-existent NPY_FR_Q
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
- is_quarter[0] = 1
- return ret
-
- except DateParseError:
- raise
- except ValueError:
- # e.g. if "Q" is not in date_string and .index raised
- pass
-
- if date_len == 6 and freq == "M":
- year = int(date_string[:4])
- month = int(date_string[4:6])
- try:
- ret = default.replace(year=year, month=month)
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
- return ret
- except ValueError as err:
- # We can infer that none of the patterns below will match
- raise ValueError(f"Unable to parse {date_string}") from err
-
- for pat in ["%Y-%m", "%b %Y", "%b-%Y"]:
- try:
- ret = datetime.strptime(date_string, pat)
- out_bestunit[0] = NPY_DATETIMEUNIT.NPY_FR_M
- return ret
- except ValueError:
- pass
-
- raise ValueError(f"Unable to parse {date_string}")
-
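The quarter abbreviations handled here are the ones accepted by the Period constructor; for instance (a hedged illustration):

    >>> import pandas as pd
    >>> pd.Period("2005Q1")
    Period('2005Q1', 'Q-DEC')
    >>> pd.Period("4Q2005")
    Period('2005Q4', 'Q-DEC')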
-
-cpdef quarter_to_myear(int year, int quarter, str freq):
- """
- A quarterly frequency defines a "year" which may not coincide with
- the calendar-year. Find the calendar-year and calendar-month associated
- with the given year and quarter under the `freq`-derived calendar.
-
- Parameters
- ----------
- year : int
- quarter : int
- freq : str or None
-
- Returns
- -------
- year : int
- month : int
-
- See Also
- --------
- Period.qyear
- """
- if quarter <= 0 or quarter > 4:
- raise ValueError("Quarter must be 1 <= q <= 4")
-
- if freq is not None:
- mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1
- month = (mnum + (quarter - 1) * 3) % 12 + 1
- if month > mnum:
- year -= 1
- else:
- month = (quarter - 1) * 3 + 1
-
- return year, month
-
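Tracing the branches above: with a calendar-quarter freq the year is unchanged, while a fiscal freq can pull the quarter's starting month into the previous calendar year. Both results below follow directly from the code (freq strings are the usual 'Q-<MONTH>' rule codes):

    >>> quarter_to_myear(2005, 1, "Q-DEC")   # calendar quarters: Q1 starts Jan 2005
    (2005, 1)
    >>> quarter_to_myear(2005, 1, "Q-JAN")   # fiscal year ending Jan: Q1 starts Feb 2004
    (2004, 2)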
-
-cdef datetime dateutil_parse(
- str timestr,
- datetime default,
- bint ignoretz,
- bint dayfirst,
- bint yearfirst,
- NPY_DATETIMEUNIT* out_bestunit
-):
- """ lifted from dateutil to get resolution"""
-
- cdef:
- str attr
- datetime ret
- object res
- str reso = None
- dict repl = {}
-
- try:
- res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst)
- except InvalidOperation:
- # GH#51157 dateutil can raise decimal.InvalidOperation
- res = None
-
- if res is None:
- raise DateParseError(
- f"Unknown datetime string format, unable to parse: {timestr}"
- )
-
- for attr in ["year", "month", "day", "hour",
- "minute", "second", "microsecond"]:
- value = getattr(res, attr)
- if value is not None:
- repl[attr] = value
- reso = attr
-
- if reso is None:
- raise DateParseError(f"Unable to parse datetime string: {timestr}")
-
- if reso == "microsecond":
- if repl["microsecond"] == 0:
- reso = "second"
- elif repl["microsecond"] % 1000 == 0:
- reso = "millisecond"
-
- try:
- ret = default.replace(**repl)
- except ValueError as err:
- # e.g. "day is out of range for month"
- # we re-raise to match dateutil's exception message
- raise DateParseError(str(err) + ": " + timestr) from err
- except OverflowError as err:
- # with e.g. "08335394550" dateutil raises when trying to pass
- # year=8335394550 to datetime.replace
- raise OutOfBoundsDatetime(
- f'Parsing "{timestr}" to datetime overflows'
- ) from err
-
- if res.weekday is not None and not res.day:
- ret = ret + relativedelta.relativedelta(weekday=res.weekday)
- if not ignoretz:
- if res.tzname and res.tzname in time.tzname:
- # GH#50791
- if res.tzname != "UTC":
-                # If the system is localized to UTC (as many CI runs are)
-                # we get tzlocal; once the deprecation is enforced we will
-                # get timezone.utc rather than raising.
-                warnings.warn(
-                    f"Parsing '{res.tzname}' as tzlocal (dependent on system timezone) "
- "is deprecated and will raise in a future version. Pass the 'tz' "
- "keyword or call tz_localize after construction instead",
- FutureWarning,
- stacklevel=find_stack_level()
- )
- ret = ret.replace(tzinfo=_dateutil_tzlocal())
- elif res.tzoffset == 0:
- ret = ret.replace(tzinfo=_dateutil_tzutc())
- elif res.tzoffset:
- ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))
-
- # dateutil can return a datetime with a tzoffset outside of (-24H, 24H)
- # bounds, which is invalid (can be constructed, but raises if we call
- # str(ret)). Check that and raise here if necessary.
- try:
- ret.utcoffset()
- except ValueError as err:
- # offset must be a timedelta strictly between -timedelta(hours=24)
- # and timedelta(hours=24)
- raise ValueError(
- f'Parsed string "{timestr}" gives an invalid tzoffset, '
- "which must be between -timedelta(hours=24) and timedelta(hours=24)"
- )
-
- out_bestunit[0] = attrname_to_npy_unit[reso]
- return ret
-
-
-# ----------------------------------------------------------------------
-# Parsing for type-inference
-
-
-def try_parse_dates(object[:] values, parser) -> np.ndarray:
- cdef:
- Py_ssize_t i, n
- object[::1] result
-
- n = len(values)
- result = np.empty(n, dtype="O")
-
- for i in range(n):
- if values[i] == "":
- result[i] = np.nan
- else:
- result[i] = parser(values[i])
-
- return result.base # .base to access underlying ndarray
-
-
-def try_parse_year_month_day(
- object[:] years, object[:] months, object[:] days
-) -> np.ndarray:
- cdef:
- Py_ssize_t i, n
- object[::1] result
-
- n = len(years)
- # TODO(cython3): Use len instead of `shape[0]`
- if months.shape[0] != n or days.shape[0] != n:
- raise ValueError("Length of years/months/days must all be equal")
- result = np.empty(n, dtype="O")
-
- for i in range(n):
- result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))
-
- return result.base # .base to access underlying ndarray
-
-
-# ----------------------------------------------------------------------
-# Miscellaneous
-
-
-# Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
-#
-# We use this class to parse and tokenize date strings. However, as it is
-# a private class in the dateutil library, relying on backwards compatibility
-# is not practical. In fact, using this class issues warnings (xref gh-21322).
-# Thus, we port the class over so that both issues are resolved.
-#
-# Copyright (c) 2017 - dateutil contributors
-class _timelex:
- def __init__(self, instream):
- if getattr(instream, "decode", None) is not None:
- instream = instream.decode()
-
- if isinstance(instream, str):
- self.stream = instream
- elif getattr(instream, "read", None) is None:
- raise TypeError(
- "Parser must be a string or character stream, not "
- f"{type(instream).__name__}")
- else:
- self.stream = instream.read()
-
- def get_tokens(self):
- """
- This function breaks the time string into lexical units (tokens), which
- can be parsed by the parser. Lexical units are demarcated by changes in
- the character set, so any continuous string of letters is considered
- one unit, any continuous string of numbers is considered one unit.
- The main complication arises from the fact that dots ('.') can be used
- both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
- "4:30:21.447"). As such, it is necessary to read the full context of
- any dot-separated strings before breaking it into tokens; as such, this
- function maintains a "token stack", for when the ambiguous context
- demands that multiple tokens be parsed at once.
- """
- cdef:
- Py_ssize_t n
-
- stream = self.stream.replace("\x00", "")
-
- # TODO: Change \s --> \s+ (this doesn't match existing behavior)
- # TODO: change the punctuation block to punc+ (does not match existing)
- # TODO: can we merge the two digit patterns?
- tokens = re.findall(r"\s|"
- r"(?<![\.\d])\d+\.\d+(?![\.\d])"
- r"|\d+"
- r"|[a-zA-Z]+"
- r"|[\./:]+"
- r"|[^\da-zA-Z\./:\s]+", stream)
-
- # Re-combine token tuples of the form ["59", ",", "456"] because
- # in this context the "," is treated as a decimal
- # (e.g. in python's default logging format)
- for n, token in enumerate(tokens[:-2]):
- # Kludge to match ,-decimal behavior; it'd be better to do this
- # later in the process and have a simpler tokenization
- if (token is not None and token.isdigit() and
- tokens[n + 1] == "," and tokens[n + 2].isdigit()):
- # Have to check None b/c it might be replaced during the loop
-            # TODO: I _really_ don't like faking the value here
- tokens[n] = token + "." + tokens[n + 2]
- tokens[n + 1] = None
- tokens[n + 2] = None
-
- tokens = [x for x in tokens if x is not None]
- return tokens
-
- @classmethod
- def split(cls, s):
- return cls(s).get_tokens()
-
-
-_DATEUTIL_LEXER_SPLIT = _timelex.split
-
-
-def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
- """
- Guess the datetime format of a given datetime string.
-
- Parameters
- ----------
- dt_str : str
- Datetime string to guess the format of.
- dayfirst : bool, default False
-        If True, parses dates with the day first, e.g. 20/01/2005.
- Warning: dayfirst=True is not strict, but will prefer to parse
- with day first (this is a known bug).
-
- Returns
- -------
-    str or None
- datetime format string (for `strftime` or `strptime`),
- or None if it can't be guessed.
- """
- day_attribute_and_format = (("day",), "%d", 2)
-
- # attr name, format, padding (if any)
- datetime_attrs_to_format = [
- (("year", "month", "day", "hour", "minute", "second"), "%Y%m%d%H%M%S", 0),
- (("year", "month", "day", "hour", "minute"), "%Y%m%d%H%M", 0),
- (("year", "month", "day", "hour"), "%Y%m%d%H", 0),
- (("year", "month", "day"), "%Y%m%d", 0),
- (("hour", "minute", "second"), "%H%M%S", 0),
- (("hour", "minute"), "%H%M", 0),
- (("year",), "%Y", 0),
- (("month",), "%B", 0),
- (("month",), "%b", 0),
- (("month",), "%m", 2),
- day_attribute_and_format,
- (("hour",), "%H", 2),
- (("minute",), "%M", 2),
- (("second",), "%S", 2),
- (("second", "microsecond"), "%S.%f", 0),
- (("tzinfo",), "%z", 0),
- (("tzinfo",), "%Z", 0),
- (("day_of_week",), "%a", 0),
- (("day_of_week",), "%A", 0),
- (("meridiem",), "%p", 0),
- ]
-
- if dayfirst:
- datetime_attrs_to_format.remove(day_attribute_and_format)
- datetime_attrs_to_format.insert(0, day_attribute_and_format)
-
- try:
- parsed_datetime = du_parse(dt_str, dayfirst=dayfirst)
- except (ValueError, OverflowError, InvalidOperation):
- # In case the datetime can't be parsed, its format cannot be guessed
- return None
-
- if parsed_datetime is None:
- return None
-
- # _DATEUTIL_LEXER_SPLIT from dateutil will never raise here
- tokens = _DATEUTIL_LEXER_SPLIT(dt_str)
-
- # Normalize offset part of tokens.
- # There are multiple formats for the timezone offset.
- # To pass the comparison condition between the output of `strftime` and
- # joined tokens, which is carried out at the final step of the function,
- # the offset part of the tokens must match the '%z' format like '+0900'
-    # instead of '+09:00'.
- if parsed_datetime.tzinfo is not None:
- offset_index = None
- if len(tokens) > 0 and tokens[-1] == "Z":
- # the last 'Z' means zero offset
- offset_index = -1
- elif len(tokens) > 1 and tokens[-2] in ("+", "-"):
- # ex. [..., '+', '0900']
- offset_index = -2
- elif len(tokens) > 3 and tokens[-4] in ("+", "-"):
- # ex. [..., '+', '09', ':', '00']
- offset_index = -4
-
- if offset_index is not None:
- # If the input string has a timezone offset like '+0900',
-            # the offset is separated into two tokens, ex. ['+', '0900'].
- # This separation will prevent subsequent processing
- # from correctly parsing the time zone format.
-            # So in addition to the format normalization, we rejoin them here.
- try:
- tokens[offset_index] = parsed_datetime.strftime("%z")
- except ValueError:
- # Invalid offset might not have raised in du_parse
- # https://github.com/dateutil/dateutil/issues/188
- return None
- tokens = tokens[:offset_index + 1 or None]
-
- format_guess = [None] * len(tokens)
- found_attrs = set()
-
- for attrs, attr_format, padding in datetime_attrs_to_format:
- # If a given attribute has been placed in the format string, skip
-        # over other formats for that same underlying attribute (i.e., month
- # can be represented in multiple different ways)
- if set(attrs) & found_attrs:
- continue
-
- if parsed_datetime.tzinfo is None and attr_format in ("%Z", "%z"):
- continue
-
- parsed_formatted = parsed_datetime.strftime(attr_format)
- for i, token_format in enumerate(format_guess):
- token_filled = _fill_token(tokens[i], padding)
- if token_format is None and token_filled == parsed_formatted:
- format_guess[i] = attr_format
- tokens[i] = token_filled
- found_attrs.update(attrs)
- break
-
- # Only consider it a valid guess if we have a year, month and day.
- # We make exceptions for %Y and %Y-%m (only with the `-` separator)
- # as they conform with ISO8601.
- if (
- len({"year", "month", "day"} & found_attrs) != 3
- and format_guess != ["%Y"]
- and not (
- format_guess == ["%Y", None, "%m"] and tokens[1] == "-"
- )
- ):
- return None
-
- output_format = []
- for i, guess in enumerate(format_guess):
- if guess is not None:
- # Either fill in the format placeholder (like %Y)
- output_format.append(guess)
- else:
-            # Or just the token separator (i.e., the dashes in "01-01-2013")
- try:
- # If the token is numeric, then we likely didn't parse it
- # properly, so our guess is wrong
- float(tokens[i])
- return None
- except ValueError:
- pass
-
- output_format.append(tokens[i])
-
- # if am/pm token present, replace 24-hour %H, with 12-hour %I
- if "%p" in output_format and "%H" in output_format:
- i = output_format.index("%H")
- output_format[i] = "%I"
-
- guessed_format = "".join(output_format)
-
- try:
- array_strptime(np.asarray([dt_str], dtype=object), guessed_format)
- except ValueError:
- # Doesn't parse, so this can't be the correct format.
- return None
- # rebuild string, capturing any inferred padding
- dt_str = "".join(tokens)
- if parsed_datetime.strftime(guessed_format) == dt_str:
- _maybe_warn_about_dayfirst(guessed_format, dayfirst)
- return guessed_format
- else:
- return None
-
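guess_datetime_format is re-exported as a public helper in recent pandas (pandas.tseries.api.guess_datetime_format in 2.x); typical results look like this (hedged, behavior can differ across versions):

    >>> from pandas.tseries.api import guess_datetime_format
    >>> guess_datetime_format("2023-01-15 14:30:00")
    '%Y-%m-%d %H:%M:%S'
    >>> guess_datetime_format("15/01/2023", dayfirst=True)
    '%d/%m/%Y'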
-
-cdef str _fill_token(token: str, padding: int):
- cdef str token_filled
- if re.search(r"\d+\.\d+", token) is None:
- # For example: 98
- token_filled = token.zfill(padding)
- else:
- # For example: 00.123
- seconds, nanoseconds = token.split(".")
- seconds = f"{int(seconds):02d}"
- # right-pad so we get nanoseconds, then only take
- # first 6 digits (microseconds) as stdlib datetime
- # doesn't support nanoseconds
- nanoseconds = nanoseconds.ljust(9, "0")[:6]
- token_filled = f"{seconds}.{nanoseconds}"
- return token_filled
-
-
-cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst):
- """Warn if guessed datetime format doesn't respect dayfirst argument."""
- cdef:
- int day_index = format.find("%d")
- int month_index = format.find("%m")
-
- if (day_index != -1) and (month_index != -1):
- if (day_index > month_index) and dayfirst:
- warnings.warn(
- f"Parsing dates in {format} format when dayfirst=True was specified. "
- "Pass `dayfirst=False` or specify a format to silence this warning.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- if (day_index < month_index) and not dayfirst:
- warnings.warn(
- f"Parsing dates in {format} format when dayfirst=False (the default) "
- "was specified. "
- "Pass `dayfirst=True` or specify a format to silence this warning.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
- """
- Convert `item` to str.
-
- Parameters
- ----------
- item : object
- keep_trivial_numbers : bool
- if True, then conversion (to string from integer/float zero)
- is not performed
-
- Returns
- -------
- str or int or float
- """
- cdef:
- float64_t float_item
-
- if keep_trivial_numbers:
- if isinstance(item, int):
- if <int>item == 0:
- return item
- elif isinstance(item, float):
- float_item = item
- if float_item == 0.0 or float_item != float_item:
- return item
-
- if not isinstance(item, str):
- item = PyObject_Str(item)
-
- return item
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def concat_date_cols(tuple date_cols) -> np.ndarray:
- """
- Concatenates elements from numpy arrays in `date_cols` into strings.
-
- Parameters
- ----------
- date_cols : tuple[ndarray]
-
- Returns
- -------
- arr_of_rows : ndarray[object]
-
- Examples
- --------
- >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
- >>> times=np.array(['11:20', '10:45'], dtype=object)
- >>> result = concat_date_cols((dates, times))
- >>> result
- array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
- """
- cdef:
- Py_ssize_t rows_count = 0, col_count = len(date_cols)
- Py_ssize_t col_idx, row_idx
- list list_to_join
- cnp.ndarray[object] iters
- object[::1] iters_view
- flatiter it
- cnp.ndarray[object] result
- object[::1] result_view
-
- if col_count == 0:
- return np.zeros(0, dtype=object)
-
- if not all(is_array(array) for array in date_cols):
- raise ValueError("not all elements from date_cols are numpy arrays")
-
- rows_count = min(len(array) for array in date_cols)
- result = np.zeros(rows_count, dtype=object)
- result_view = result
-
- if col_count == 1:
- array = date_cols[0]
- it = <flatiter>PyArray_IterNew(array)
- for row_idx in range(rows_count):
- item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
- result_view[row_idx] = convert_to_unicode(item, True)
- PyArray_ITER_NEXT(it)
- else:
- # create fixed size list - more efficient memory allocation
- list_to_join = [None] * col_count
- iters = np.zeros(col_count, dtype=object)
-
- # create memoryview of iters ndarray, that will contain some
- # flatiter's for each array in `date_cols` - more efficient indexing
- iters_view = iters
- for col_idx, array in enumerate(date_cols):
- iters_view[col_idx] = PyArray_IterNew(array)
-
- # array elements that are on the same line are converted to one string
- for row_idx in range(rows_count):
- for col_idx, array in enumerate(date_cols):
- # this cast is needed, because we did not find a way
- # to efficiently store `flatiter` type objects in ndarray
- it = <flatiter>iters_view[col_idx]
- item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
- list_to_join[col_idx] = convert_to_unicode(item, False)
- PyArray_ITER_NEXT(it)
- result_view[row_idx] = " ".join(list_to_join)
-
- return result
-
-
-cpdef str get_rule_month(str source):
- """
- Return starting month of given freq, default is December.
-
- Parameters
- ----------
- source : str
- Derived from `freq.rule_code` or `freq.freqstr`.
-
- Returns
- -------
- rule_month: str
-
- Examples
- --------
- >>> get_rule_month('D')
- 'DEC'
-
- >>> get_rule_month('A-JAN')
- 'JAN'
- """
- source = source.upper()
- if "-" not in source:
- return "DEC"
- else:
- return source.split("-")[1]
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pxd
deleted file mode 100644
index 46c6e52cb91..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pxd
+++ /dev/null
@@ -1,7 +0,0 @@
-from numpy cimport int64_t
-
-from .np_datetime cimport npy_datetimestruct
-
-
-cdef bint is_period_object(object obj)
-cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyi
deleted file mode 100644
index 946ae1215f1..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyi
+++ /dev/null
@@ -1,127 +0,0 @@
-from datetime import timedelta
-from typing import Literal
-
-import numpy as np
-
-from pandas._libs.tslibs.nattype import NaTType
-from pandas._libs.tslibs.offsets import BaseOffset
-from pandas._libs.tslibs.timestamps import Timestamp
-from pandas._typing import (
- Frequency,
- npt,
-)
-
-INVALID_FREQ_ERR_MSG: str
-DIFFERENT_FREQ: str
-
-class IncompatibleFrequency(ValueError): ...
-
-def periodarr_to_dt64arr(
- periodarr: npt.NDArray[np.int64], # const int64_t[:]
- freq: int,
-) -> npt.NDArray[np.int64]: ...
-def period_asfreq_arr(
- arr: npt.NDArray[np.int64],
- freq1: int,
- freq2: int,
- end: bool,
-) -> npt.NDArray[np.int64]: ...
-def get_period_field_arr(
- field: str,
- arr: npt.NDArray[np.int64], # const int64_t[:]
- freq: int,
-) -> npt.NDArray[np.int64]: ...
-def from_ordinals(
- values: npt.NDArray[np.int64], # const int64_t[:]
- freq: timedelta | BaseOffset | str,
-) -> npt.NDArray[np.int64]: ...
-def extract_ordinals(
- values: npt.NDArray[np.object_],
- freq: Frequency | int,
-) -> npt.NDArray[np.int64]: ...
-def extract_freq(
- values: npt.NDArray[np.object_],
-) -> BaseOffset: ...
-
-# exposed for tests
-def period_asfreq(ordinal: int, freq1: int, freq2: int, end: bool) -> int: ...
-def period_ordinal(
- y: int, m: int, d: int, h: int, min: int, s: int, us: int, ps: int, freq: int
-) -> int: ...
-def freq_to_dtype_code(freq: BaseOffset) -> int: ...
-def validate_end_alias(how: str) -> Literal["E", "S"]: ...
-
-class PeriodMixin:
- @property
- def end_time(self) -> Timestamp: ...
- @property
- def start_time(self) -> Timestamp: ...
- def _require_matching_freq(self, other, base: bool = ...) -> None: ...
-
-class Period(PeriodMixin):
- ordinal: int # int64_t
- freq: BaseOffset
-
- # error: "__new__" must return a class instance (got "Union[Period, NaTType]")
- def __new__( # type: ignore[misc]
- cls,
- value=...,
- freq: int | str | BaseOffset | None = ...,
- ordinal: int | None = ...,
- year: int | None = ...,
- month: int | None = ...,
- quarter: int | None = ...,
- day: int | None = ...,
- hour: int | None = ...,
- minute: int | None = ...,
- second: int | None = ...,
- ) -> Period | NaTType: ...
- @classmethod
- def _maybe_convert_freq(cls, freq) -> BaseOffset: ...
- @classmethod
- def _from_ordinal(cls, ordinal: int, freq) -> Period: ...
- @classmethod
- def now(cls, freq: BaseOffset = ...) -> Period: ...
- def strftime(self, fmt: str) -> str: ...
- def to_timestamp(
- self,
- freq: str | BaseOffset | None = ...,
- how: str = ...,
- ) -> Timestamp: ...
- def asfreq(self, freq: str | BaseOffset, how: str = ...) -> Period: ...
- @property
- def freqstr(self) -> str: ...
- @property
- def is_leap_year(self) -> bool: ...
- @property
- def daysinmonth(self) -> int: ...
- @property
- def days_in_month(self) -> int: ...
- @property
- def qyear(self) -> int: ...
- @property
- def quarter(self) -> int: ...
- @property
- def day_of_year(self) -> int: ...
- @property
- def weekday(self) -> int: ...
- @property
- def day_of_week(self) -> int: ...
- @property
- def week(self) -> int: ...
- @property
- def weekofyear(self) -> int: ...
- @property
- def second(self) -> int: ...
- @property
- def minute(self) -> int: ...
- @property
- def hour(self) -> int: ...
- @property
- def day(self) -> int: ...
- @property
- def month(self) -> int: ...
- @property
- def year(self) -> int: ...
- def __sub__(self, other) -> Period | BaseOffset: ...
- def __add__(self, other) -> Period: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyx
deleted file mode 100644
index f4417870050..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/period.pyx
+++ /dev/null
@@ -1,2708 +0,0 @@
-import re
-
-cimport numpy as cnp
-from cpython.object cimport (
- Py_EQ,
- Py_NE,
- PyObject,
- PyObject_RichCompare,
- PyObject_RichCompareBool,
-)
-from numpy cimport (
- int32_t,
- int64_t,
- ndarray,
-)
-
-import numpy as np
-
-cnp.import_array()
-
-cimport cython
-from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- datetime,
- import_datetime,
-)
-from libc.stdlib cimport (
- free,
- malloc,
-)
-from libc.string cimport (
- memset,
- strlen,
-)
-from libc.time cimport (
- strftime,
- tm,
-)
-
-# import datetime C API
-import_datetime()
-
-cimport pandas._libs.tslibs.util as util
-from pandas._libs.missing cimport C_NA
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_D,
- astype_overflowsafe,
- check_dts_bounds,
- get_timedelta64_value,
- npy_datetimestruct,
- npy_datetimestruct_to_datetime,
- pandas_datetime_to_datetimestruct,
-)
-
-from pandas._libs.tslibs.timestamps import Timestamp
-
-from pandas._libs.tslibs.ccalendar cimport (
- dayofweek,
- get_day_of_year,
- get_days_in_month,
- get_week_of_year,
- is_leapyear,
-)
-from pandas._libs.tslibs.timedeltas cimport (
- delta_to_nanoseconds,
- is_any_td_scalar,
-)
-
-from pandas._libs.tslibs.conversion import DT64NS_DTYPE
-
-from pandas._libs.tslibs.dtypes cimport (
- FR_ANN,
- FR_BUS,
- FR_DAY,
- FR_HR,
- FR_MIN,
- FR_MS,
- FR_MTH,
- FR_NS,
- FR_QTR,
- FR_SEC,
- FR_UND,
- FR_US,
- FR_WK,
- PeriodDtypeBase,
- attrname_to_abbrevs,
- freq_group_code_to_npy_unit,
-)
-from pandas._libs.tslibs.parsing cimport quarter_to_myear
-
-from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso
-
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
- c_nat_strings as nat_strings,
- checknull_with_nat,
-)
-from pandas._libs.tslibs.offsets cimport (
- BaseOffset,
- is_offset_object,
- is_tick_object,
- to_offset,
-)
-
-from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG
-
-cdef:
- enum:
- INT32_MIN = -2_147_483_648LL
-
-
-ctypedef struct asfreq_info:
- int64_t intraday_conversion_factor
- int is_end
- int to_end
- int from_end
-
-ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) nogil
-
-
-cdef extern from *:
- """
- // must use npy typedef b/c int64_t is aliased in cython-generated c
- // unclear why we need LL for that row.
- // see https://github.com/pandas-dev/pandas/pull/34416/
- static npy_int64 daytime_conversion_factor_matrix[7][7] = {
- {1, 24, 1440, 86400, 86400000, 86400000000, 86400000000000},
- {0LL, 1LL, 60LL, 3600LL, 3600000LL, 3600000000LL, 3600000000000LL},
- {0, 0, 1, 60, 60000, 60000000, 60000000000},
- {0, 0, 0, 1, 1000, 1000000, 1000000000},
- {0, 0, 0, 0, 1, 1000, 1000000},
- {0, 0, 0, 0, 0, 1, 1000},
- {0, 0, 0, 0, 0, 0, 1}};
- """
- int64_t daytime_conversion_factor_matrix[7][7]
-
-
-cdef int max_value(int left, int right) nogil:
- if left > right:
- return left
- return right
-
-
-cdef int min_value(int left, int right) nogil:
- if left < right:
- return left
- return right
-
-
-cdef int64_t get_daytime_conversion_factor(int from_index, int to_index) nogil:
- cdef:
- int row = min_value(from_index, to_index)
- int col = max_value(from_index, to_index)
-    # row or col < 6 means a frequency strictly lower than Daily, which
-    # does not use daytime_conversion_factors
- if row < 6:
- return 0
- elif col < 6:
- return 0
- return daytime_conversion_factor_matrix[row - 6][col - 6]
-
-
-cdef int64_t nofunc(int64_t ordinal, asfreq_info *af_info) nogil:
- return INT32_MIN
-
-
-cdef int64_t no_op(int64_t ordinal, asfreq_info *af_info) nogil:
- return ordinal
-
-
-cdef freq_conv_func get_asfreq_func(int from_freq, int to_freq) nogil:
- cdef:
- int from_group = get_freq_group(from_freq)
- int to_group = get_freq_group(to_freq)
-
- if from_group == FR_UND:
- from_group = FR_DAY
-
- if from_group == FR_BUS:
- if to_group == FR_ANN:
- return <freq_conv_func>asfreq_BtoA
- elif to_group == FR_QTR:
- return <freq_conv_func>asfreq_BtoQ
- elif to_group == FR_MTH:
- return <freq_conv_func>asfreq_BtoM
- elif to_group == FR_WK:
- return <freq_conv_func>asfreq_BtoW
- elif to_group == FR_BUS:
- return <freq_conv_func>no_op
- elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- return <freq_conv_func>asfreq_BtoDT
- else:
- return <freq_conv_func>nofunc
-
- elif to_group == FR_BUS:
- if from_group == FR_ANN:
- return <freq_conv_func>asfreq_AtoB
- elif from_group == FR_QTR:
- return <freq_conv_func>asfreq_QtoB
- elif from_group == FR_MTH:
- return <freq_conv_func>asfreq_MtoB
- elif from_group == FR_WK:
- return <freq_conv_func>asfreq_WtoB
- elif from_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- return <freq_conv_func>asfreq_DTtoB
- else:
- return <freq_conv_func>nofunc
-
- elif from_group == FR_ANN:
- if to_group == FR_ANN:
- return <freq_conv_func>asfreq_AtoA
- elif to_group == FR_QTR:
- return <freq_conv_func>asfreq_AtoQ
- elif to_group == FR_MTH:
- return <freq_conv_func>asfreq_AtoM
- elif to_group == FR_WK:
- return <freq_conv_func>asfreq_AtoW
- elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- return <freq_conv_func>asfreq_AtoDT
- else:
- return <freq_conv_func>nofunc
-
- elif from_group == FR_QTR:
- if to_group == FR_ANN:
- return <freq_conv_func>asfreq_QtoA
- elif to_group == FR_QTR:
- return <freq_conv_func>asfreq_QtoQ
- elif to_group == FR_MTH:
- return <freq_conv_func>asfreq_QtoM
- elif to_group == FR_WK:
- return <freq_conv_func>asfreq_QtoW
- elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- return <freq_conv_func>asfreq_QtoDT
- else:
- return <freq_conv_func>nofunc
-
- elif from_group == FR_MTH:
- if to_group == FR_ANN:
- return <freq_conv_func>asfreq_MtoA
- elif to_group == FR_QTR:
- return <freq_conv_func>asfreq_MtoQ
- elif to_group == FR_MTH:
- return <freq_conv_func>no_op
- elif to_group == FR_WK:
- return <freq_conv_func>asfreq_MtoW
- elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- return <freq_conv_func>asfreq_MtoDT
- else:
- return <freq_conv_func>nofunc
-
- elif from_group == FR_WK:
- if to_group == FR_ANN:
- return <freq_conv_func>asfreq_WtoA
- elif to_group == FR_QTR:
- return <freq_conv_func>asfreq_WtoQ
- elif to_group == FR_MTH:
- return <freq_conv_func>asfreq_WtoM
- elif to_group == FR_WK:
- return <freq_conv_func>asfreq_WtoW
- elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- return <freq_conv_func>asfreq_WtoDT
- else:
- return <freq_conv_func>nofunc
-
- elif from_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- if to_group == FR_ANN:
- return <freq_conv_func>asfreq_DTtoA
- elif to_group == FR_QTR:
- return <freq_conv_func>asfreq_DTtoQ
- elif to_group == FR_MTH:
- return <freq_conv_func>asfreq_DTtoM
- elif to_group == FR_WK:
- return <freq_conv_func>asfreq_DTtoW
- elif to_group in [FR_DAY, FR_HR, FR_MIN, FR_SEC, FR_MS, FR_US, FR_NS]:
- if from_group > to_group:
- return <freq_conv_func>downsample_daytime
- else:
- return <freq_conv_func>upsample_daytime
-
- else:
- return <freq_conv_func>nofunc
-
- else:
- return <freq_conv_func>nofunc
-
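At the public level this dispatch table is what Period.asfreq / PeriodIndex.asfreq ultimately consult; for example (a hedged illustration):

    >>> import pandas as pd
    >>> p = pd.Period("2005Q1", freq="Q-DEC")
    >>> p.asfreq("M", how="start")
    Period('2005-01', 'M')
    >>> p.asfreq("M", how="end")
    Period('2005-03', 'M')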
-
-# --------------------------------------------------------------------
-# Frequency Conversion Helpers
-
-cdef int64_t DtoB_weekday(int64_t unix_date) nogil:
- return ((unix_date + 4) // 7) * 5 + ((unix_date + 4) % 7) - 4
-
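A pure-Python rendering of the formula, with a few spot checks against the epoch (1970-01-01 was a Thursday), may help (sketch only; the name dtob_weekday is local to this illustration):

    def dtob_weekday(unix_date: int) -> int:
        # Map a unix day count to a business-day ordinal: five business days
        # for each complete week, plus the offset within the week, re-based so
        # the epoch (a Thursday) maps to 0.
        return ((unix_date + 4) // 7) * 5 + ((unix_date + 4) % 7) - 4

    >>> dtob_weekday(0)   # Thu 1970-01-01
    0
    >>> dtob_weekday(1)   # Fri 1970-01-02
    1
    >>> dtob_weekday(4)   # Mon 1970-01-05 -> two business days after the epoch
    2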
-
-cdef int64_t DtoB(npy_datetimestruct *dts, int roll_back,
- int64_t unix_date) nogil:
- # calculate the current week (counting from 1970-01-01) treating
- # sunday as last day of a week
- cdef:
- int day_of_week = dayofweek(dts.year, dts.month, dts.day)
-
- if roll_back == 1:
- if day_of_week > 4:
- # change to friday before weekend
- unix_date -= (day_of_week - 4)
- else:
- if day_of_week > 4:
- # change to Monday after weekend
- unix_date += (7 - day_of_week)
-
- return DtoB_weekday(unix_date)
-
-
-cdef int64_t upsample_daytime(int64_t ordinal, asfreq_info *af_info) nogil:
- if af_info.is_end:
- return (ordinal + 1) * af_info.intraday_conversion_factor - 1
- else:
- return ordinal * af_info.intraday_conversion_factor
-
-
-cdef int64_t downsample_daytime(int64_t ordinal, asfreq_info *af_info) nogil:
- return ordinal // af_info.intraday_conversion_factor
-
-
-cdef int64_t transform_via_day(int64_t ordinal,
- asfreq_info *af_info,
- freq_conv_func first_func,
- freq_conv_func second_func) nogil:
- cdef:
- int64_t result
-
- result = first_func(ordinal, af_info)
- result = second_func(result, af_info)
- return result
-
-
-# --------------------------------------------------------------------
-# Conversion _to_ Daily Freq
-
-cdef int64_t asfreq_AtoDT(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int64_t unix_date
- npy_datetimestruct dts
-
- ordinal += af_info.is_end
-
- dts.year = ordinal + 1970
- dts.month = 1
- adjust_dts_for_month(&dts, af_info.from_end)
-
- unix_date = unix_date_from_ymd(dts.year, dts.month, 1)
- unix_date -= af_info.is_end
- return upsample_daytime(unix_date, af_info)
-
-
-cdef int64_t asfreq_QtoDT(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int64_t unix_date
- npy_datetimestruct dts
-
- ordinal += af_info.is_end
-
- dts.year = ordinal // 4 + 1970
- dts.month = (ordinal % 4) * 3 + 1
- adjust_dts_for_month(&dts, af_info.from_end)
-
- unix_date = unix_date_from_ymd(dts.year, dts.month, 1)
- unix_date -= af_info.is_end
- return upsample_daytime(unix_date, af_info)
-
-
-cdef int64_t asfreq_MtoDT(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int64_t unix_date
- int year, month
-
- ordinal += af_info.is_end
-
- year = ordinal // 12 + 1970
- month = ordinal % 12 + 1
-
- unix_date = unix_date_from_ymd(year, month, 1)
- unix_date -= af_info.is_end
- return upsample_daytime(unix_date, af_info)
-
-
-cdef int64_t asfreq_WtoDT(int64_t ordinal, asfreq_info *af_info) nogil:
- ordinal = (ordinal * 7 + af_info.from_end - 4 +
- (7 - 1) * (af_info.is_end - 1))
- return upsample_daytime(ordinal, af_info)
-
-
-# --------------------------------------------------------------------
-# Conversion _to_ BusinessDay Freq
-
-cdef int64_t asfreq_AtoB(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int roll_back
- npy_datetimestruct dts
- int64_t unix_date = asfreq_AtoDT(ordinal, af_info)
-
- pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
- roll_back = af_info.is_end
- return DtoB(&dts, roll_back, unix_date)
-
-
-cdef int64_t asfreq_QtoB(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int roll_back
- npy_datetimestruct dts
- int64_t unix_date = asfreq_QtoDT(ordinal, af_info)
-
- pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
- roll_back = af_info.is_end
- return DtoB(&dts, roll_back, unix_date)
-
-
-cdef int64_t asfreq_MtoB(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int roll_back
- npy_datetimestruct dts
- int64_t unix_date = asfreq_MtoDT(ordinal, af_info)
-
- pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
- roll_back = af_info.is_end
- return DtoB(&dts, roll_back, unix_date)
-
-
-cdef int64_t asfreq_WtoB(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int roll_back
- npy_datetimestruct dts
- int64_t unix_date = asfreq_WtoDT(ordinal, af_info)
-
- pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
- roll_back = af_info.is_end
- return DtoB(&dts, roll_back, unix_date)
-
-
-cdef int64_t asfreq_DTtoB(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int roll_back
- npy_datetimestruct dts
- int64_t unix_date = downsample_daytime(ordinal, af_info)
-
- pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, &dts)
- # This usage defines roll_back the opposite way from the others
- roll_back = 1 - af_info.is_end
- return DtoB(&dts, roll_back, unix_date)
-
-
-# ----------------------------------------------------------------------
-# Conversion _from_ Daily Freq
-
-cdef int64_t asfreq_DTtoA(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- npy_datetimestruct dts
-
- ordinal = downsample_daytime(ordinal, af_info)
- pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts)
- return dts_to_year_ordinal(&dts, af_info.to_end)
-
-
-cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, npy_datetimestruct* dts) nogil:
- cdef:
- int quarter
-
- pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, dts)
- adjust_dts_for_qtr(dts, af_info.to_end)
-
- quarter = month_to_quarter(dts.month)
- return quarter
-
-
-cdef int64_t asfreq_DTtoQ(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- int quarter
- npy_datetimestruct dts
-
- ordinal = downsample_daytime(ordinal, af_info)
-
- quarter = DtoQ_yq(ordinal, af_info, &dts)
- return <int64_t>((dts.year - 1970) * 4 + quarter - 1)
-
-
-cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info) nogil:
- cdef:
- npy_datetimestruct dts
-
- ordinal = downsample_daytime(ordinal, af_info)
- pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts)
- return dts_to_month_ordinal(&dts)
-
-
-cdef int64_t asfreq_DTtoW(int64_t ordinal, asfreq_info *af_info) nogil:
- ordinal = downsample_daytime(ordinal, af_info)
- return unix_date_to_week(ordinal, af_info.to_end)
-
-
-cdef int64_t unix_date_to_week(int64_t unix_date, int to_end) nogil:
- return (unix_date + 3 - to_end) // 7 + 1
-
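-# Worked example for unix_date_to_week (pure arithmetic; the exact meaning of
-# ``to_end`` depends on the weekly anchor): with to_end == 3, unix_dates 0..6
-# all give (d + 3 - 3) // 7 + 1 == 1, while unix_date 7 starts week 2, so
-# consecutive 7-day blocks share one week ordinal.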
-
-# --------------------------------------------------------------------
-# Conversion _from_ BusinessDay Freq
-
-cdef int64_t asfreq_BtoDT(int64_t ordinal, asfreq_info *af_info) nogil:
- ordinal = ((ordinal + 3) // 5) * 7 + (ordinal + 3) % 5 - 3
- return upsample_daytime(ordinal, af_info)
-
-
-cdef int64_t asfreq_BtoA(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_BtoDT,
- <freq_conv_func>asfreq_DTtoA)
-
-
-cdef int64_t asfreq_BtoQ(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_BtoDT,
- <freq_conv_func>asfreq_DTtoQ)
-
-
-cdef int64_t asfreq_BtoM(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_BtoDT,
- <freq_conv_func>asfreq_DTtoM)
-
-
-cdef int64_t asfreq_BtoW(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_BtoDT,
- <freq_conv_func>asfreq_DTtoW)
-
-
-# ----------------------------------------------------------------------
-# Conversion _from_ Annual Freq
-
-cdef int64_t asfreq_AtoA(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_AtoDT,
- <freq_conv_func>asfreq_DTtoA)
-
-
-cdef int64_t asfreq_AtoQ(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_AtoDT,
- <freq_conv_func>asfreq_DTtoQ)
-
-
-cdef int64_t asfreq_AtoM(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_AtoDT,
- <freq_conv_func>asfreq_DTtoM)
-
-
-cdef int64_t asfreq_AtoW(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_AtoDT,
- <freq_conv_func>asfreq_DTtoW)
-
-
-# ----------------------------------------------------------------------
-# Conversion _from_ Quarterly Freq
-
-cdef int64_t asfreq_QtoQ(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_QtoDT,
- <freq_conv_func>asfreq_DTtoQ)
-
-
-cdef int64_t asfreq_QtoA(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_QtoDT,
- <freq_conv_func>asfreq_DTtoA)
-
-
-cdef int64_t asfreq_QtoM(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_QtoDT,
- <freq_conv_func>asfreq_DTtoM)
-
-
-cdef int64_t asfreq_QtoW(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_QtoDT,
- <freq_conv_func>asfreq_DTtoW)
-
-
-# ----------------------------------------------------------------------
-# Conversion _from_ Monthly Freq
-
-cdef int64_t asfreq_MtoA(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_MtoDT,
- <freq_conv_func>asfreq_DTtoA)
-
-
-cdef int64_t asfreq_MtoQ(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_MtoDT,
- <freq_conv_func>asfreq_DTtoQ)
-
-
-cdef int64_t asfreq_MtoW(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_MtoDT,
- <freq_conv_func>asfreq_DTtoW)
-
-
-# ----------------------------------------------------------------------
-# Conversion _from_ Weekly Freq
-
-cdef int64_t asfreq_WtoA(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_WtoDT,
- <freq_conv_func>asfreq_DTtoA)
-
-
-cdef int64_t asfreq_WtoQ(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_WtoDT,
- <freq_conv_func>asfreq_DTtoQ)
-
-
-cdef int64_t asfreq_WtoM(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_WtoDT,
- <freq_conv_func>asfreq_DTtoM)
-
-
-cdef int64_t asfreq_WtoW(int64_t ordinal, asfreq_info *af_info) nogil:
- return transform_via_day(ordinal, af_info,
- <freq_conv_func>asfreq_WtoDT,
- <freq_conv_func>asfreq_DTtoW)
-
-
-# ----------------------------------------------------------------------
-
-@cython.cdivision
-cdef char* c_strftime(npy_datetimestruct *dts, char *fmt):
- """
- Generate a nice string representation of the period
- object, originally from DateObject_strftime
-
- Parameters
- ----------
- dts : npy_datetimestruct*
- fmt : char*
-
- Returns
- -------
- result : char*
- """
- cdef:
- tm c_date
- char *result
- int result_len = strlen(fmt) + 50
-
- c_date.tm_sec = dts.sec
- c_date.tm_min = dts.min
- c_date.tm_hour = dts.hour
- c_date.tm_mday = dts.day
- c_date.tm_mon = dts.month - 1
- c_date.tm_year = dts.year - 1900
- c_date.tm_wday = (dayofweek(dts.year, dts.month, dts.day) + 1) % 7
- c_date.tm_yday = get_day_of_year(dts.year, dts.month, dts.day) - 1
- c_date.tm_isdst = -1
-
- result = <char*>malloc(result_len * sizeof(char))
-
- strftime(result, result_len, fmt, &c_date)
-
- return result
-
-
-# ----------------------------------------------------------------------
-# Conversion between date_info and npy_datetimestruct
-
-cdef int get_freq_group(int freq) nogil:
- # See also FreqGroup.get_freq_group
- return (freq // 1000) * 1000
-
-
-cdef int get_freq_group_index(int freq) nogil:
- return freq // 1000
-
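-# Sketch of how the group helpers behave, assuming the usual period dtype
-# codes where each frequency family occupies a block of 1000 (e.g. quarterly
-# codes start at 2000 and FR_DAY == 6000, cf. the ``freq < 6000`` shortcut
-# further below):
-#   get_freq_group(2003)       -> 2000   (a Q-* code belongs to the Q group)
-#   get_freq_group_index(2003) -> 2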
-
-cdef void adjust_dts_for_month(npy_datetimestruct* dts, int from_end) nogil:
- if from_end != 12:
- dts.month += from_end
- if dts.month > 12:
- dts.month -= 12
- else:
- dts.year -= 1
-
-
-cdef void adjust_dts_for_qtr(npy_datetimestruct* dts, int to_end) nogil:
- if to_end != 12:
- dts.month -= to_end
- if dts.month <= 0:
- dts.month += 12
- else:
- dts.year += 1
-
-
-# Find the unix_date (days elapsed since datetime(1970, 1, 1))
-# for the given year/month/day.
-# Assumes GREGORIAN_CALENDAR.
-cdef int64_t unix_date_from_ymd(int year, int month, int day) nogil:
- # Calculate the absolute date
- cdef:
- npy_datetimestruct dts
- int64_t unix_date
-
- memset(&dts, 0, sizeof(npy_datetimestruct))
- dts.year = year
- dts.month = month
- dts.day = day
- unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, &dts)
- return unix_date
-
-
-cdef int64_t dts_to_month_ordinal(npy_datetimestruct* dts) nogil:
- # AKA: equivalent to npy_datetimestruct_to_datetime(NPY_FR_M, dts)
- return <int64_t>((dts.year - 1970) * 12 + dts.month - 1)
-
-
-cdef int64_t dts_to_year_ordinal(npy_datetimestruct *dts, int to_end) nogil:
- cdef:
- int64_t result
-
- result = npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_Y, dts)
- if dts.month > to_end:
- return result + 1
- else:
- return result
-
-
-cdef int64_t dts_to_qtr_ordinal(npy_datetimestruct* dts, int to_end) nogil:
- cdef:
- int quarter
-
- adjust_dts_for_qtr(dts, to_end)
- quarter = month_to_quarter(dts.month)
- return <int64_t>((dts.year - 1970) * 4 + quarter - 1)
-
-
-cdef int get_anchor_month(int freq, int freq_group) nogil:
- cdef:
- int fmonth
- fmonth = freq - freq_group
- if fmonth == 0:
- fmonth = 12
- return fmonth
-
-
-# specifically _don't_ use cdivision or else ordinals near -1 are assigned to
-# incorrect dates GH#19643
-@cython.cdivision(False)
-cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil:
- """
- Generate an ordinal in period space
-
- Parameters
- ----------
- dts : npy_datetimestruct*
- freq : int
-
- Returns
- -------
- period_ordinal : int64_t
- """
- cdef:
- int64_t unix_date
- int freq_group, fmonth
- NPY_DATETIMEUNIT unit
-
- freq_group = get_freq_group(freq)
-
- if freq_group == FR_ANN:
- fmonth = get_anchor_month(freq, freq_group)
- return dts_to_year_ordinal(dts, fmonth)
-
- elif freq_group == FR_QTR:
- fmonth = get_anchor_month(freq, freq_group)
- return dts_to_qtr_ordinal(dts, fmonth)
-
- elif freq_group == FR_WK:
- unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, dts)
- return unix_date_to_week(unix_date, freq - FR_WK)
-
- elif freq == FR_BUS:
- unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, dts)
- return DtoB(dts, 0, unix_date)
-
- unit = freq_group_code_to_npy_unit(freq)
- return npy_datetimestruct_to_datetime(unit, dts)
-
-
-cdef void get_date_info(int64_t ordinal, int freq, npy_datetimestruct *dts) nogil:
- cdef:
- int64_t unix_date, nanos
- npy_datetimestruct dts2
-
- unix_date = get_unix_date(ordinal, freq)
- nanos = get_time_nanos(freq, unix_date, ordinal)
-
- pandas_datetime_to_datetimestruct(unix_date, NPY_FR_D, dts)
-
- pandas_datetime_to_datetimestruct(nanos, NPY_DATETIMEUNIT.NPY_FR_ns, &dts2)
- dts.hour = dts2.hour
- dts.min = dts2.min
- dts.sec = dts2.sec
- dts.us = dts2.us
- dts.ps = dts2.ps
-
-
-cdef int64_t get_unix_date(int64_t period_ordinal, int freq) nogil:
- """
- Returns the proleptic Gregorian ordinal of the date, as an integer.
- This corresponds to the number of days since Jan. 1st, 1970 AD.
- When the instance has a frequency less than daily, the proleptic date
- is calculated for the last day of the period.
-
- Parameters
- ----------
- period_ordinal : int64_t
- freq : int
-
- Returns
- -------
- unix_date : int64_t number of days since datetime(1970, 1, 1)
- """
- cdef:
- asfreq_info af_info
- freq_conv_func toDaily = NULL
-
- if freq == FR_DAY:
- return period_ordinal
-
- toDaily = get_asfreq_func(freq, FR_DAY)
- get_asfreq_info(freq, FR_DAY, True, &af_info)
- return toDaily(period_ordinal, &af_info)
-
-
-@cython.cdivision
-cdef int64_t get_time_nanos(int freq, int64_t unix_date, int64_t ordinal) nogil:
- """
- Find the number of nanoseconds after midnight on the given unix_date
- that the ordinal represents in the given frequency.
-
- Parameters
- ----------
- freq : int
- unix_date : int64_t
- ordinal : int64_t
-
- Returns
- -------
- int64_t
- """
- cdef:
- int64_t sub, factor
- int64_t nanos_in_day = 24 * 3600 * 10**9
-
- freq = get_freq_group(freq)
-
- if freq <= FR_DAY:
- return 0
-
- elif freq == FR_NS:
- factor = 1
-
- elif freq == FR_US:
- factor = 10**3
-
- elif freq == FR_MS:
- factor = 10**6
-
- elif freq == FR_SEC:
- factor = 10**9
-
- elif freq == FR_MIN:
- factor = 10**9 * 60
-
- else:
- # We must have freq == FR_HR
- factor = 10**9 * 3600
-
- sub = ordinal - unix_date * (nanos_in_day / factor)
- return sub * factor
-
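-# Worked example for get_time_nanos, assuming FR_MIN denotes minutely
-# ordinals (minutes since the epoch): for unix_date == 1 and ordinal == 1500,
-# factor == 60 * 10**9 and nanos_in_day / factor == 1440, so
-# sub == 1500 - 1 * 1440 == 60 and the result is one hour (in nanoseconds)
-# past midnight of 1970-01-02.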
-
-cdef int get_yq(int64_t ordinal, int freq, npy_datetimestruct* dts):
- """
- Find the year and quarter of a Period with the given ordinal and frequency
-
- Parameters
- ----------
- ordinal : int64_t
- freq : int
- dts : *npy_datetimestruct
-
- Returns
- -------
- quarter : int
- describes the implied quarterly frequency associated with `freq`
-
- Notes
- -----
- Sets dts.year in-place.
- """
- cdef:
- asfreq_info af_info
- int qtr_freq
- int64_t unix_date
- int quarter
-
- unix_date = get_unix_date(ordinal, freq)
-
- if get_freq_group(freq) == FR_QTR:
- qtr_freq = freq
- else:
- qtr_freq = FR_QTR
-
- get_asfreq_info(FR_DAY, qtr_freq, True, &af_info)
-
- quarter = DtoQ_yq(unix_date, &af_info, dts)
- return quarter
-
-
-cdef int month_to_quarter(int month) nogil:
- return (month - 1) // 3 + 1
-
-
-# ----------------------------------------------------------------------
-# Period logic
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq):
- """
- Convert array to datetime64 values from a set of ordinals corresponding to
- periods per period convention.
- """
- cdef:
- int64_t[::1] out
- Py_ssize_t i, N
-
- if freq < 6000: # i.e. FR_DAY, hard-code to avoid need to cast
- N = len(periodarr)
- out = np.empty(N, dtype="i8")
-
- # We get here with freqs that do not correspond to a datetime64 unit
- for i in range(N):
- out[i] = period_ordinal_to_dt64(periodarr[i], freq)
-
- return out.base # .base to access underlying np.ndarray
-
- else:
- # Short-circuit for performance
- if freq == FR_NS:
- # TODO: copy?
- return periodarr.base
-
- if freq == FR_US:
- dta = periodarr.base.view("M8[us]")
- elif freq == FR_MS:
- dta = periodarr.base.view("M8[ms]")
- elif freq == FR_SEC:
- dta = periodarr.base.view("M8[s]")
- elif freq == FR_MIN:
- dta = periodarr.base.view("M8[m]")
- elif freq == FR_HR:
- dta = periodarr.base.view("M8[h]")
- elif freq == FR_DAY:
- dta = periodarr.base.view("M8[D]")
- return astype_overflowsafe(dta, dtype=DT64NS_DTYPE)
-
-
-cdef void get_asfreq_info(int from_freq, int to_freq,
- bint is_end, asfreq_info *af_info) nogil:
- """
- Construct the `asfreq_info` object used to convert an ordinal from
- `from_freq` to `to_freq`.
-
- Parameters
- ----------
- from_freq : int
- to_freq : int
- is_end : bool
- af_info : *asfreq_info
- """
- cdef:
- int from_group = get_freq_group(from_freq)
- int to_group = get_freq_group(to_freq)
-
- af_info.is_end = is_end
-
- af_info.intraday_conversion_factor = get_daytime_conversion_factor(
- get_freq_group_index(max_value(from_group, FR_DAY)),
- get_freq_group_index(max_value(to_group, FR_DAY)))
-
- if from_group == FR_WK:
- af_info.from_end = calc_week_end(from_freq, from_group)
- elif from_group == FR_ANN:
- af_info.from_end = calc_a_year_end(from_freq, from_group)
- elif from_group == FR_QTR:
- af_info.from_end = calc_a_year_end(from_freq, from_group)
-
- if to_group == FR_WK:
- af_info.to_end = calc_week_end(to_freq, to_group)
- elif to_group == FR_ANN:
- af_info.to_end = calc_a_year_end(to_freq, to_group)
- elif to_group == FR_QTR:
- af_info.to_end = calc_a_year_end(to_freq, to_group)
-
-
-@cython.cdivision
-cdef int calc_a_year_end(int freq, int group) nogil:
- cdef:
- int result = (freq - group) % 12
- if result == 0:
- return 12
- else:
- return result
-
-
-cdef int calc_week_end(int freq, int group) nogil:
- return freq - group
-
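-# Illustration of the anchor helpers above, assuming annual codes of the form
-# FR_ANN + month (so 1003 would be A-MAR): calc_a_year_end(1003, 1000) -> 3,
-# while the December anchor (code 1000) yields 0 from the modulo and is
-# mapped to 12. For weekly codes, calc_week_end simply returns the offset of
-# the code within the weekly block.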
-
-cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end):
- """
- Convert period ordinal from one frequency to another, and if upsampling,
- choose to use start ('S') or end ('E') of period.
- """
- cdef:
- int64_t retval
-
- _period_asfreq(&ordinal, &retval, 1, freq1, freq2, end)
- return retval
-
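-# A doctest-style sketch of period_asfreq, assuming the usual dtype codes
-# FR_MTH == 3000 and FR_ANN ("A-DEC") == 1000:
-#   >>> period_asfreq(11, 3000, 1000, end=True)  # ordinal 11 == Dec 1970
-#   0                                            # annual ordinal 0 == 1970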
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end):
- """
- Convert int64-array of period ordinals from one frequency to another, and
- if upsampling, choose to use start ('S') or end ('E') of period.
- """
- cdef:
- Py_ssize_t n = len(arr)
- Py_ssize_t increment = arr.strides[0] // 8
- ndarray[int64_t] result = cnp.PyArray_EMPTY(
- arr.ndim, arr.shape, cnp.NPY_INT64, 0
- )
-
- _period_asfreq(
- <int64_t*>cnp.PyArray_DATA(arr),
- <int64_t*>cnp.PyArray_DATA(result),
- n,
- freq1,
- freq2,
- end,
- increment,
- )
- return result
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef void _period_asfreq(
- int64_t* ordinals,
- int64_t* out,
- Py_ssize_t length,
- int freq1,
- int freq2,
- bint end,
- Py_ssize_t increment=1,
-):
- """See period_asfreq.__doc__"""
- cdef:
- Py_ssize_t i
- freq_conv_func func
- asfreq_info af_info
- int64_t val
-
- if length == 1 and ordinals[0] == NPY_NAT:
- # fastpath avoid calling get_asfreq_func
- out[0] = NPY_NAT
- return
-
- func = get_asfreq_func(freq1, freq2)
- get_asfreq_info(freq1, freq2, end, &af_info)
-
- for i in range(length):
- val = ordinals[i * increment]
- if val != NPY_NAT:
- val = func(val, &af_info)
- out[i] = val
-
-
-cpdef int64_t period_ordinal(int y, int m, int d, int h, int min,
- int s, int us, int ps, int freq):
- """
- Find the ordinal representation of the given datetime components at the
- frequency `freq`.
-
- Parameters
- ----------
- y : int
- m : int
- d : int
- h : int
- min : int
- s : int
- us : int
- ps : int
-
- Returns
- -------
- ordinal : int64_t
- """
- cdef:
- npy_datetimestruct dts
- dts.year = y
- dts.month = m
- dts.day = d
- dts.hour = h
- dts.min = min
- dts.sec = s
- dts.us = us
- dts.ps = ps
- return get_period_ordinal(&dts, freq)
-
-
-cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1:
- cdef:
- npy_datetimestruct dts
-
- if ordinal == NPY_NAT:
- return NPY_NAT
-
- get_date_info(ordinal, freq, &dts)
-
- check_dts_bounds(&dts)
- return npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT.NPY_FR_ns, &dts)
-
-
-cdef str period_format(int64_t value, int freq, object fmt=None):
- cdef:
- int freq_group
-
- if value == NPY_NAT:
- return "NaT"
-
- if isinstance(fmt, str):
- # Encode using current locale, in case fmt contains non-utf8 chars
- fmt = <bytes>util.string_encode_locale(fmt)
-
- if fmt is None:
- freq_group = get_freq_group(freq)
- if freq_group == FR_ANN:
- fmt = b"%Y"
- elif freq_group == FR_QTR:
- fmt = b"%FQ%q"
- elif freq_group == FR_MTH:
- fmt = b"%Y-%m"
- elif freq_group == FR_WK:
- left = period_asfreq(value, freq, FR_DAY, 0)
- right = period_asfreq(value, freq, FR_DAY, 1)
- return f"{period_format(left, FR_DAY)}/{period_format(right, FR_DAY)}"
- elif freq_group == FR_BUS or freq_group == FR_DAY:
- fmt = b"%Y-%m-%d"
- elif freq_group == FR_HR:
- fmt = b"%Y-%m-%d %H:00"
- elif freq_group == FR_MIN:
- fmt = b"%Y-%m-%d %H:%M"
- elif freq_group == FR_SEC:
- fmt = b"%Y-%m-%d %H:%M:%S"
- elif freq_group == FR_MS:
- fmt = b"%Y-%m-%d %H:%M:%S.%l"
- elif freq_group == FR_US:
- fmt = b"%Y-%m-%d %H:%M:%S.%u"
- elif freq_group == FR_NS:
- fmt = b"%Y-%m-%d %H:%M:%S.%n"
- else:
- raise ValueError(f"Unknown freq: {freq}")
-
- return _period_strftime(value, freq, fmt)
-
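-# At the Python level the default patterns above drive ``str(Period(...))``;
-# e.g. a quarterly period renders through b"%FQ%q" as "2018Q1" and a daily
-# one through b"%Y-%m-%d" as "2018-03-11" (a sketch of the expected output,
-# not an exhaustive spec).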
-
-cdef list extra_fmts = [(b"%q", b"^`AB`^"),
- (b"%f", b"^`CD`^"),
- (b"%F", b"^`EF`^"),
- (b"%l", b"^`GH`^"),
- (b"%u", b"^`IJ`^"),
- (b"%n", b"^`KL`^")]
-
-cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^",
- "^`GH`^", "^`IJ`^", "^`KL`^"]
-
-cdef str _period_strftime(int64_t value, int freq, bytes fmt):
- cdef:
- Py_ssize_t i
- npy_datetimestruct dts
- char *formatted
- bytes pat, brepl
- list found_pat = [False] * len(extra_fmts)
- int quarter
- int32_t us, ps
- str result, repl
-
- get_date_info(value, freq, &dts)
-
- # Find our additional directives in the pattern and replace them with
- # placeholders that are not processed by c_strftime
- for i in range(len(extra_fmts)):
- pat = extra_fmts[i][0]
- brepl = extra_fmts[i][1]
- if pat in fmt:
- fmt = fmt.replace(pat, brepl)
- found_pat[i] = True
-
- # Execute c_strftime to process the usual datetime directives
- formatted = c_strftime(&dts, <char*>fmt)
-
- # Decode result according to current locale
- result = util.char_to_string_locale(formatted)
- free(formatted)
-
- # Now we will fill the placeholders corresponding to our additional directives
-
- # First prepare the contents
- # Save these to local vars as dts can be modified by get_yq below
- us = dts.us
- ps = dts.ps
- if any(found_pat[0:3]):
- # Note: this modifies `dts` in-place so that year becomes fiscal year
- # However it loses the us and ps
- quarter = get_yq(value, freq, &dts)
- else:
- quarter = 0
-
- # Now do the filling per se
- for i in range(len(extra_fmts)):
- if found_pat[i]:
-
- if i == 0: # %q, 1-digit quarter.
- repl = f"{quarter}"
- elif i == 1: # %f, 2-digit 'Fiscal' year
- repl = f"{(dts.year % 100):02d}"
- elif i == 2: # %F, 'Fiscal' year with a century
- repl = str(dts.year)
- elif i == 3: # %l, milliseconds
- repl = f"{(us // 1_000):03d}"
- elif i == 4: # %u, microseconds
- repl = f"{(us):06d}"
- elif i == 5: # %n, nanoseconds
- repl = f"{((us * 1000) + (ps // 1000)):09d}"
-
- result = result.replace(str_extra_fmts[i], repl)
-
- return result
-
-
-# ----------------------------------------------------------------------
-# period accessors
-
-ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN
-
-
-cdef int pyear(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return dts.year
-
-
-cdef int pqyear(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
-
- get_yq(ordinal, freq, &dts)
- return dts.year
-
-
-cdef int pquarter(int64_t ordinal, int freq):
- cdef:
- int quarter
- npy_datetimestruct dts
- quarter = get_yq(ordinal, freq, &dts)
- return quarter
-
-
-cdef int pmonth(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return dts.month
-
-
-cdef int pday(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return dts.day
-
-
-cdef int pweekday(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return dayofweek(dts.year, dts.month, dts.day)
-
-
-cdef int pday_of_year(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return get_day_of_year(dts.year, dts.month, dts.day)
-
-
-cdef int pweek(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return get_week_of_year(dts.year, dts.month, dts.day)
-
-
-cdef int phour(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return dts.hour
-
-
-cdef int pminute(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return dts.min
-
-
-cdef int psecond(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return <int>dts.sec
-
-
-cdef int pdays_in_month(int64_t ordinal, int freq):
- cdef:
- npy_datetimestruct dts
- get_date_info(ordinal, freq, &dts)
- return get_days_in_month(dts.year, dts.month)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def get_period_field_arr(str field, const int64_t[:] arr, int freq):
- cdef:
- Py_ssize_t i, sz
- int64_t[::1] out
-
- func = _get_accessor_func(field)
- if func is NULL:
- raise ValueError(f"Unrecognized field name: {field}")
-
- sz = len(arr)
- out = np.empty(sz, dtype=np.int64)
-
- for i in range(sz):
- if arr[i] == NPY_NAT:
- out[i] = -1
- continue
- out[i] = func(arr[i], freq)
-
- return out.base # .base to access underlying np.ndarray
-
-
-cdef accessor _get_accessor_func(str field):
- if field == "year":
- return <accessor>pyear
- elif field == "qyear":
- return <accessor>pqyear
- elif field == "quarter":
- return <accessor>pquarter
- elif field == "month":
- return <accessor>pmonth
- elif field == "day":
- return <accessor>pday
- elif field == "hour":
- return <accessor>phour
- elif field == "minute":
- return <accessor>pminute
- elif field == "second":
- return <accessor>psecond
- elif field == "week":
- return <accessor>pweek
- elif field == "day_of_year":
- return <accessor>pday_of_year
- elif field == "weekday" or field == "day_of_week":
- return <accessor>pweekday
- elif field == "days_in_month":
- return <accessor>pdays_in_month
- return NULL
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def from_ordinals(const int64_t[:] values, freq):
- cdef:
- Py_ssize_t i, n = len(values)
- int64_t[::1] result = np.empty(len(values), dtype="i8")
- int64_t val
-
- freq = to_offset(freq)
- if not isinstance(freq, BaseOffset):
- raise ValueError("freq not specified and cannot be inferred")
-
- for i in range(n):
- val = values[i]
- if val == NPY_NAT:
- result[i] = NPY_NAT
- else:
- result[i] = Period(val, freq=freq).ordinal
-
- return result.base
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def extract_ordinals(ndarray values, freq) -> np.ndarray:
- # values is object-dtype, may be 2D
-
- cdef:
- Py_ssize_t i, n = values.size
- int64_t ordinal
- ndarray ordinals = cnp.PyArray_EMPTY(
- values.ndim, values.shape, cnp.NPY_INT64, 0
- )
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(ordinals, values)
- object p
-
- if values.descr.type_num != cnp.NPY_OBJECT:
- # if we don't raise here, we'll segfault later!
- raise TypeError("extract_ordinals values must be object-dtype")
-
- freqstr = Period._maybe_convert_freq(freq).freqstr
-
- for i in range(n):
- # Analogous to: p = values[i]
- p = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- ordinal = _extract_ordinal(p, freqstr, freq)
-
- # Analogous to: ordinals[i] = ordinal
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ordinal
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return ordinals
-
-
-cdef int64_t _extract_ordinal(object item, str freqstr, freq) except? -1:
- """
- See extract_ordinals.
- """
- cdef:
- int64_t ordinal
-
- if checknull_with_nat(item) or item is C_NA:
- ordinal = NPY_NAT
- elif util.is_integer_object(item):
- if item == NPY_NAT:
- ordinal = NPY_NAT
- else:
- raise TypeError(item)
- else:
- try:
- ordinal = item.ordinal
-
- if item.freqstr != freqstr:
- msg = DIFFERENT_FREQ.format(cls="PeriodIndex",
- own_freq=freqstr,
- other_freq=item.freqstr)
- raise IncompatibleFrequency(msg)
-
- except AttributeError:
- item = Period(item, freq=freq)
- if item is NaT:
- # input may contain NaT-like string
- ordinal = NPY_NAT
- else:
- ordinal = item.ordinal
-
- return ordinal
-
-
-def extract_freq(ndarray[object] values) -> BaseOffset:
- # TODO: Change type to const object[:] when Cython supports that.
-
- cdef:
- Py_ssize_t i, n = len(values)
- object value
-
- for i in range(n):
- value = values[i]
-
- if is_period_object(value):
- return value.freq
-
- raise ValueError("freq not specified and cannot be inferred")
-
-# -----------------------------------------------------------------------
-# period helpers
-
-
-DIFFERENT_FREQ = ("Input has different freq={other_freq} "
- "from {cls}(freq={own_freq})")
-
-
-class IncompatibleFrequency(ValueError):
- pass
-
-
-cdef class PeriodMixin:
- # Methods shared between Period and PeriodArray
-
- @property
- def start_time(self) -> Timestamp:
- """
- Get the Timestamp for the start of the period.
-
- Returns
- -------
- Timestamp
-
- See Also
- --------
- Period.end_time : Return the end Timestamp.
- Period.dayofyear : Return the day of year.
- Period.daysinmonth : Return the days in that month.
- Period.dayofweek : Return the day of the week.
-
- Examples
- --------
- >>> period = pd.Period('2012-1-1', freq='D')
- >>> period
- Period('2012-01-01', 'D')
-
- >>> period.start_time
- Timestamp('2012-01-01 00:00:00')
-
- >>> period.end_time
- Timestamp('2012-01-01 23:59:59.999999999')
- """
- return self.to_timestamp(how="start")
-
- @property
- def end_time(self) -> Timestamp:
- """
- Get the Timestamp for the end of the period.
-
- Returns
- -------
- Timestamp
-
- See Also
- --------
- Period.start_time : Return the start Timestamp.
- Period.dayofyear : Return the day of year.
- Period.daysinmonth : Return the days in that month.
- Period.dayofweek : Return the day of the week.
- """
- return self.to_timestamp(how="end")
-
- def _require_matching_freq(self, other, base=False):
- # See also arrays.period.raise_on_incompatible
- if is_offset_object(other):
- other_freq = other
- else:
- other_freq = other.freq
-
- if base:
- condition = self.freq.base != other_freq.base
- else:
- condition = self.freq != other_freq
-
- if condition:
- msg = DIFFERENT_FREQ.format(
- cls=type(self).__name__,
- own_freq=self.freqstr,
- other_freq=other_freq.freqstr,
- )
- raise IncompatibleFrequency(msg)
-
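-# Consequence of the freq check above (illustrative, using the public API):
-# comparing two Periods with different freqs returns False/True for ==/!=
-# via the Py_EQ/Py_NE shortcut in __richcmp__ below, while ordering
-# comparisons such as ``<`` go through _require_matching_freq and raise
-# IncompatibleFrequency.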
-
-cdef class _Period(PeriodMixin):
-
- cdef readonly:
- int64_t ordinal
- PeriodDtypeBase _dtype
- BaseOffset freq
-
- # higher than np.ndarray, np.matrix, np.timedelta64
- __array_priority__ = 100
-
- dayofweek = _Period.day_of_week
- dayofyear = _Period.day_of_year
-
- def __cinit__(self, int64_t ordinal, BaseOffset freq):
- self.ordinal = ordinal
- self.freq = freq
- # Note: this is more performant than PeriodDtype.from_date_offset(freq)
- # because from_date_offset cannot be made a cdef method (until cython
- # supported cdef classmethods)
- self._dtype = PeriodDtypeBase(freq._period_dtype_code)
-
- @classmethod
- def _maybe_convert_freq(cls, object freq) -> BaseOffset:
- """
- Internally we allow integer and tuple representations (for now) that
- are not recognized by to_offset, so we convert them here. Also, a
- Period's freq attribute must have `freq.n > 0`, which we check for here.
-
- Returns
- -------
- DateOffset
- """
- if isinstance(freq, int):
- # We already have a dtype code
- dtype = PeriodDtypeBase(freq)
- freq = dtype._freqstr
-
- freq = to_offset(freq)
-
- if freq.n <= 0:
- raise ValueError("Frequency must be positive, because it "
- f"represents span: {freq.freqstr}")
-
- return freq
-
- @classmethod
- def _from_ordinal(cls, ordinal: int64_t, freq) -> "Period":
- """
- Fast creation from an ordinal and freq that are already validated!
- """
- if ordinal == NPY_NAT:
- return NaT
- else:
- freq = cls._maybe_convert_freq(freq)
- self = _Period.__new__(cls, ordinal, freq)
- return self
-
- def __richcmp__(self, other, op):
- if is_period_object(other):
- if other.freq != self.freq:
- if op == Py_EQ:
- return False
- elif op == Py_NE:
- return True
- self._require_matching_freq(other)
- return PyObject_RichCompareBool(self.ordinal, other.ordinal, op)
- elif other is NaT:
- return op == Py_NE
- elif util.is_array(other):
- # GH#44285
- if cnp.PyArray_IsZeroDim(other):
- return PyObject_RichCompare(self, other.item(), op)
- else:
- # in particular ndarray[object]; see test_pi_cmp_period
- return np.array([PyObject_RichCompare(self, x, op) for x in other])
- return NotImplemented
-
- def __hash__(self):
- return hash((self.ordinal, self.freqstr))
-
- def _add_timedeltalike_scalar(self, other) -> "Period":
- cdef:
- int64_t inc
-
- if not is_tick_object(self.freq):
- raise IncompatibleFrequency("Input cannot be converted to "
- f"Period(freq={self.freqstr})")
-
- if (
- util.is_timedelta64_object(other) and
- get_timedelta64_value(other) == NPY_NAT
- ):
- # i.e. np.timedelta64("nat")
- return NaT
-
- try:
- inc = delta_to_nanoseconds(other, reso=self.freq._creso, round_ok=False)
- except ValueError as err:
- raise IncompatibleFrequency("Input cannot be converted to "
- f"Period(freq={self.freqstr})") from err
- # TODO: overflow-check here
- ordinal = self.ordinal + inc
- return Period(ordinal=ordinal, freq=self.freq)
-
- def _add_offset(self, other) -> "Period":
- # Non-Tick DateOffset other
- cdef:
- int64_t ordinal
-
- self._require_matching_freq(other, base=True)
-
- ordinal = self.ordinal + other.n
- return Period(ordinal=ordinal, freq=self.freq)
-
- def __add__(self, other):
- if not is_period_object(self):
- # cython semantics; this is analogous to a call to __radd__
- # TODO(cython3): remove this
- if self is NaT:
- return NaT
- return other.__add__(self)
-
- if is_any_td_scalar(other):
- return self._add_timedeltalike_scalar(other)
- elif is_offset_object(other):
- return self._add_offset(other)
- elif other is NaT:
- return NaT
- elif util.is_integer_object(other):
- ordinal = self.ordinal + other * self.freq.n
- return Period(ordinal=ordinal, freq=self.freq)
-
- elif is_period_object(other):
- # can't add datetime-like
- # GH#17983; can't just return NotImplemented bc we get a RecursionError
- # when called via np.add.reduce see TestNumpyReductions.test_add
- # in npdev build
- sname = type(self).__name__
- oname = type(other).__name__
- raise TypeError(f"unsupported operand type(s) for +: '{sname}' "
- f"and '{oname}'")
-
- elif util.is_array(other):
- if other.dtype == object:
- # GH#50162
- return np.array([self + x for x in other], dtype=object)
-
- return NotImplemented
-
- def __radd__(self, other):
- return self.__add__(other)
-
- def __sub__(self, other):
- if not is_period_object(self):
- # cython semantics; this is like a call to __rsub__
- # TODO(cython3): remove this
- if self is NaT:
- return NaT
- return NotImplemented
-
- elif (
- is_any_td_scalar(other)
- or is_offset_object(other)
- or util.is_integer_object(other)
- ):
- return self + (-other)
- elif is_period_object(other):
- self._require_matching_freq(other)
- # GH 23915 - mul by base freq since __add__ is agnostic of n
- return (self.ordinal - other.ordinal) * self.freq.base
- elif other is NaT:
- return NaT
-
- elif util.is_array(other):
- if other.dtype == object:
- # GH#50162
- return np.array([self - x for x in other], dtype=object)
-
- return NotImplemented
-
- def __rsub__(self, other):
- if other is NaT:
- return NaT
-
- elif util.is_array(other):
- if other.dtype == object:
- # GH#50162
- return np.array([x - self for x in other], dtype=object)
-
- return NotImplemented
-
- def asfreq(self, freq, how="E") -> "Period":
- """
- Convert Period to desired frequency, at the start or end of the interval.
-
- Parameters
- ----------
- freq : str, BaseOffset
- The desired frequency.
- how : {'E', 'S', 'end', 'start'}, default 'end'
- Start or end of the timespan.
-
- Returns
- -------
- resampled : Period
- """
- freq = self._maybe_convert_freq(freq)
- how = validate_end_alias(how)
- base1 = self._dtype._dtype_code
- base2 = freq_to_dtype_code(freq)
-
- # self.freq.n can't be negative or 0
- end = how == "E"
- if end:
- ordinal = self.ordinal + self.freq.n - 1
- else:
- ordinal = self.ordinal
- ordinal = period_asfreq(ordinal, base1, base2, end)
-
- return Period(ordinal=ordinal, freq=freq)
-
- def to_timestamp(self, freq=None, how="start") -> Timestamp:
- """
- Return the Timestamp representation of the Period.
-
- Uses the target frequency, anchored at the point of the period specified
- by `how`, which is either `Start` or `Finish`.
-
- Parameters
- ----------
- freq : str or DateOffset
- Target frequency. Default is 'D' if self.freq is week or
- longer and 'S' otherwise.
- how : str, default 'S' (start)
- One of 'S', 'E'. Can be aliased as case insensitive
- 'Start', 'Finish', 'Begin', 'End'.
-
- Returns
- -------
- Timestamp
-
- Examples
- --------
- >>> period = pd.Period('2023-1-1', freq='D')
- >>> timestamp = period.to_timestamp()
- >>> timestamp
- Timestamp('2023-01-01 00:00:00')
- """
- how = validate_end_alias(how)
-
- end = how == "E"
- if end:
- if freq == "B" or self.freq == "B":
- # roll forward to ensure we land on B date
- adjust = np.timedelta64(1, "D") - np.timedelta64(1, "ns")
- return self.to_timestamp(how="start") + adjust
- endpoint = (self + self.freq).to_timestamp(how="start")
- return endpoint - np.timedelta64(1, "ns")
-
- if freq is None:
- freq = self._dtype._get_to_timestamp_base()
- base = freq
- else:
- freq = self._maybe_convert_freq(freq)
- base = freq._period_dtype_code
-
- val = self.asfreq(freq, how)
-
- dt64 = period_ordinal_to_dt64(val.ordinal, base)
- return Timestamp(dt64)
-
- @property
- def year(self) -> int:
- """
- Return the year this Period falls on.
- """
- base = self._dtype._dtype_code
- return pyear(self.ordinal, base)
-
- @property
- def month(self) -> int:
- """
- Return the month this Period falls on.
- """
- base = self._dtype._dtype_code
- return pmonth(self.ordinal, base)
-
- @property
- def day(self) -> int:
- """
- Get day of the month that a Period falls on.
-
- Returns
- -------
- int
-
- See Also
- --------
- Period.dayofweek : Get the day of the week.
- Period.dayofyear : Get the day of the year.
-
- Examples
- --------
- >>> p = pd.Period("2018-03-11", freq='H')
- >>> p.day
- 11
- """
- base = self._dtype._dtype_code
- return pday(self.ordinal, base)
-
- @property
- def hour(self) -> int:
- """
- Get the hour of the day component of the Period.
-
- Returns
- -------
- int
- The hour as an integer, between 0 and 23.
-
- See Also
- --------
- Period.second : Get the second component of the Period.
- Period.minute : Get the minute component of the Period.
-
- Examples
- --------
- >>> p = pd.Period("2018-03-11 13:03:12.050000")
- >>> p.hour
- 13
-
- Period longer than a day
-
- >>> p = pd.Period("2018-03-11", freq="M")
- >>> p.hour
- 0
- """
- base = self._dtype._dtype_code
- return phour(self.ordinal, base)
-
- @property
- def minute(self) -> int:
- """
- Get minute of the hour component of the Period.
-
- Returns
- -------
- int
- The minute as an integer, between 0 and 59.
-
- See Also
- --------
- Period.hour : Get the hour component of the Period.
- Period.second : Get the second component of the Period.
-
- Examples
- --------
- >>> p = pd.Period("2018-03-11 13:03:12.050000")
- >>> p.minute
- 3
- """
- base = self._dtype._dtype_code
- return pminute(self.ordinal, base)
-
- @property
- def second(self) -> int:
- """
- Get the second component of the Period.
-
- Returns
- -------
- int
- The second of the Period (ranges from 0 to 59).
-
- See Also
- --------
- Period.hour : Get the hour component of the Period.
- Period.minute : Get the minute component of the Period.
-
- Examples
- --------
- >>> p = pd.Period("2018-03-11 13:03:12.050000")
- >>> p.second
- 12
- """
- base = self._dtype._dtype_code
- return psecond(self.ordinal, base)
-
- @property
- def weekofyear(self) -> int:
- """
- Get the week of the year on the given Period.
-
- Returns
- -------
- int
-
- See Also
- --------
- Period.dayofweek : Get the day component of the Period.
- Period.weekday : Get the day component of the Period.
-
- Examples
- --------
- >>> p = pd.Period("2018-03-11", "H")
- >>> p.weekofyear
- 10
-
- >>> p = pd.Period("2018-02-01", "D")
- >>> p.weekofyear
- 5
-
- >>> p = pd.Period("2018-01-06", "D")
- >>> p.weekofyear
- 1
- """
- base = self._dtype._dtype_code
- return pweek(self.ordinal, base)
-
- @property
- def week(self) -> int:
- """
- Get the week of the year on the given Period.
-
- Returns
- -------
- int
-
- See Also
- --------
- Period.dayofweek : Get the day component of the Period.
- Period.weekday : Get the day component of the Period.
-
- Examples
- --------
- >>> p = pd.Period("2018-03-11", "H")
- >>> p.week
- 10
-
- >>> p = pd.Period("2018-02-01", "D")
- >>> p.week
- 5
-
- >>> p = pd.Period("2018-01-06", "D")
- >>> p.week
- 1
- """
- return self.weekofyear
-
- @property
- def day_of_week(self) -> int:
- """
- Day of the week the period lies in, with Monday=0 and Sunday=6.
-
- If the period frequency is lower than daily (e.g. hourly), and the
- period spans over multiple days, the day at the start of the period is
- used.
-
- If the frequency is higher than daily (e.g. monthly), the last day
- of the period is used.
-
- Returns
- -------
- int
- Day of the week.
-
- See Also
- --------
- Period.dayofweek : Alias of Period.day_of_week.
- Period.weekday : Alias of Period.day_of_week.
- Period.day : Day of the month.
- Period.dayofyear : Day of the year.
-
- Examples
- --------
- >>> per = pd.Period('2017-12-31 22:00', 'H')
- >>> per.day_of_week
- 6
-
- For periods that span over multiple days, the day at the beginning of
- the period is returned.
-
- >>> per = pd.Period('2017-12-31 22:00', '4H')
- >>> per.day_of_week
- 6
- >>> per.start_time.day_of_week
- 6
-
- For periods with a frequency higher than days, the last day of the
- period is returned.
-
- >>> per = pd.Period('2018-01', 'M')
- >>> per.day_of_week
- 2
- >>> per.end_time.day_of_week
- 2
- """
- base = self._dtype._dtype_code
- return pweekday(self.ordinal, base)
-
- @property
- def weekday(self) -> int:
- """
- Day of the week the period lies in, with Monday=0 and Sunday=6.
-
- If the period frequency is lower than daily (e.g. hourly), and the
- period spans over multiple days, the day at the start of the period is
- used.
-
- If the frequency is higher than daily (e.g. monthly), the last day
- of the period is used.
-
- Returns
- -------
- int
- Day of the week.
-
- See Also
- --------
- Period.dayofweek : Day of the week the period lies in.
- Period.weekday : Alias of Period.dayofweek.
- Period.day : Day of the month.
- Period.dayofyear : Day of the year.
-
- Examples
- --------
- >>> per = pd.Period('2017-12-31 22:00', 'H')
- >>> per.dayofweek
- 6
-
- For periods that span over multiple days, the day at the beginning of
- the period is returned.
-
- >>> per = pd.Period('2017-12-31 22:00', '4H')
- >>> per.dayofweek
- 6
- >>> per.start_time.dayofweek
- 6
-
- For periods with a frequency higher than days, the last day of the
- period is returned.
-
- >>> per = pd.Period('2018-01', 'M')
- >>> per.dayofweek
- 2
- >>> per.end_time.dayofweek
- 2
- """
- # Docstring is a duplicate from dayofweek. Reusing docstrings with
- # Appender doesn't work for properties in Cython files, and setting
- # the __doc__ attribute is also not possible.
- return self.dayofweek
-
- @property
- def day_of_year(self) -> int:
- """
- Return the day of the year.
-
- This attribute returns the day of the year on which the particular
- date occurs. The return value ranges between 1 to 365 for regular
- years and 1 to 366 for leap years.
-
- Returns
- -------
- int
- The day of year.
-
- See Also
- --------
- Period.day : Return the day of the month.
- Period.day_of_week : Return the day of week.
- PeriodIndex.day_of_year : Return the day of year of all indexes.
-
- Examples
- --------
- >>> period = pd.Period("2015-10-23", freq='H')
- >>> period.day_of_year
- 296
- >>> period = pd.Period("2012-12-31", freq='D')
- >>> period.day_of_year
- 366
- >>> period = pd.Period("2013-01-01", freq='D')
- >>> period.day_of_year
- 1
- """
- base = self._dtype._dtype_code
- return pday_of_year(self.ordinal, base)
-
- @property
- def quarter(self) -> int:
- """
- Return the quarter this Period falls on.
- """
- base = self._dtype._dtype_code
- return pquarter(self.ordinal, base)
-
- @property
- def qyear(self) -> int:
- """
- Fiscal year the Period lies in according to its starting-quarter.
-
- The `year` and the `qyear` of the period will be the same if the fiscal
- and calendar years are the same. When they are not, the fiscal year
- can be different from the calendar year of the period.
-
- Returns
- -------
- int
- The fiscal year of the period.
-
- See Also
- --------
- Period.year : Return the calendar year of the period.
-
- Examples
- --------
- If the natural and fiscal year are the same, `qyear` and `year` will
- be the same.
-
- >>> per = pd.Period('2018Q1', freq='Q')
- >>> per.qyear
- 2018
- >>> per.year
- 2018
-
- If the fiscal year starts in April (`Q-MAR`), the first quarter of
- 2018 will start in April 2017. `year` will then be 2017, but `qyear`
- will be the fiscal year, 2018.
-
- >>> per = pd.Period('2018Q1', freq='Q-MAR')
- >>> per.start_time
- Timestamp('2017-04-01 00:00:00')
- >>> per.qyear
- 2018
- >>> per.year
- 2017
- """
- base = self._dtype._dtype_code
- return pqyear(self.ordinal, base)
-
- @property
- def days_in_month(self) -> int:
- """
- Get the total number of days in the month that this period falls on.
-
- Returns
- -------
- int
-
- See Also
- --------
- Period.daysinmonth : Gets the number of days in the month.
- DatetimeIndex.daysinmonth : Gets the number of days in the month.
- calendar.monthrange : Returns a tuple containing weekday
- (0-6 ~ Mon-Sun) and number of days (28-31).
-
- Examples
- --------
- >>> p = pd.Period('2018-2-17')
- >>> p.days_in_month
- 28
-
- >>> pd.Period('2018-03-01').days_in_month
- 31
-
- Handles the leap year case as well:
-
- >>> p = pd.Period('2016-2-17')
- >>> p.days_in_month
- 29
- """
- base = self._dtype._dtype_code
- return pdays_in_month(self.ordinal, base)
-
- @property
- def daysinmonth(self) -> int:
- """
- Get the total number of days of the month that this period falls on.
-
- Returns
- -------
- int
-
- See Also
- --------
- Period.days_in_month : Return the days of the month.
- Period.dayofyear : Return the day of the year.
-
- Examples
- --------
- >>> p = pd.Period("2018-03-11", freq='H')
- >>> p.daysinmonth
- 31
- """
- return self.days_in_month
-
- @property
- def is_leap_year(self) -> bool:
- """
- Return True if the period's year is in a leap year.
- """
- return bool(is_leapyear(self.year))
-
- @classmethod
- def now(cls, freq):
- """
- Return the period containing the current date and time.
-
- Parameters
- ----------
- freq : str, BaseOffset
- Frequency to use for the returned period.
- """
- return Period(datetime.now(), freq=freq)
-
- @property
- def freqstr(self) -> str:
- """
- Return a string representation of the frequency.
- """
- return self.freq.freqstr
-
- def __repr__(self) -> str:
- base = self._dtype._dtype_code
- formatted = period_format(self.ordinal, base)
- return f"Period('{formatted}', '{self.freqstr}')"
-
- def __str__(self) -> str:
- """
- Return a string representation for a particular Period
- """
- base = self._dtype._dtype_code
- formatted = period_format(self.ordinal, base)
- value = str(formatted)
- return value
-
- def __setstate__(self, state):
- self.freq = state[1]
- self.ordinal = state[2]
-
- def __reduce__(self):
- object_state = None, self.freq, self.ordinal
- return (Period, object_state)
-
- def strftime(self, fmt: str) -> str:
- r"""
- Returns a formatted string representation of the :class:`Period`.
-
- ``fmt`` must be a string containing one or several directives.
- The method recognizes the same directives as the :func:`time.strftime`
- function of the standard Python distribution, as well as the specific
- additional directives ``%f``, ``%F``, ``%q``, ``%l``, ``%u``, ``%n``.
- (formatting & docs originally from scikits.timeseries).
-
- +-----------+--------------------------------+-------+
- | Directive | Meaning | Notes |
- +===========+================================+=======+
- | ``%a`` | Locale's abbreviated weekday | |
- | | name. | |
- +-----------+--------------------------------+-------+
- | ``%A`` | Locale's full weekday name. | |
- +-----------+--------------------------------+-------+
- | ``%b`` | Locale's abbreviated month | |
- | | name. | |
- +-----------+--------------------------------+-------+
- | ``%B`` | Locale's full month name. | |
- +-----------+--------------------------------+-------+
- | ``%c`` | Locale's appropriate date and | |
- | | time representation. | |
- +-----------+--------------------------------+-------+
- | ``%d`` | Day of the month as a decimal | |
- | | number [01,31]. | |
- +-----------+--------------------------------+-------+
- | ``%f`` | 'Fiscal' year without a | \(1) |
- | | century as a decimal number | |
- | | [00,99] | |
- +-----------+--------------------------------+-------+
- | ``%F`` | 'Fiscal' year with a century | \(2) |
- | | as a decimal number | |
- +-----------+--------------------------------+-------+
- | ``%H`` | Hour (24-hour clock) as a | |
- | | decimal number [00,23]. | |
- +-----------+--------------------------------+-------+
- | ``%I`` | Hour (12-hour clock) as a | |
- | | decimal number [01,12]. | |
- +-----------+--------------------------------+-------+
- | ``%j`` | Day of the year as a decimal | |
- | | number [001,366]. | |
- +-----------+--------------------------------+-------+
- | ``%m`` | Month as a decimal number | |
- | | [01,12]. | |
- +-----------+--------------------------------+-------+
- | ``%M`` | Minute as a decimal number | |
- | | [00,59]. | |
- +-----------+--------------------------------+-------+
- | ``%p`` | Locale's equivalent of either | \(3) |
- | | AM or PM. | |
- +-----------+--------------------------------+-------+
- | ``%q`` | Quarter as a decimal number | |
- | | [1,4] | |
- +-----------+--------------------------------+-------+
- | ``%S`` | Second as a decimal number | \(4) |
- | | [00,61]. | |
- +-----------+--------------------------------+-------+
- | ``%l`` | Millisecond as a decimal number| |
- | | [000,999]. | |
- +-----------+--------------------------------+-------+
- | ``%u`` | Microsecond as a decimal number| |
- | | [000000,999999]. | |
- +-----------+--------------------------------+-------+
- | ``%n`` | Nanosecond as a decimal number | |
- | | [000000000,999999999]. | |
- +-----------+--------------------------------+-------+
- | ``%U`` | Week number of the year | \(5) |
- | | (Sunday as the first day of | |
- | | the week) as a decimal number | |
- | | [00,53]. All days in a new | |
- | | year preceding the first | |
- | | Sunday are considered to be in | |
- | | week 0. | |
- +-----------+--------------------------------+-------+
- | ``%w`` | Weekday as a decimal number | |
- | | [0(Sunday),6]. | |
- +-----------+--------------------------------+-------+
- | ``%W`` | Week number of the year | \(5) |
- | | (Monday as the first day of | |
- | | the week) as a decimal number | |
- | | [00,53]. All days in a new | |
- | | year preceding the first | |
- | | Monday are considered to be in | |
- | | week 0. | |
- +-----------+--------------------------------+-------+
- | ``%x`` | Locale's appropriate date | |
- | | representation. | |
- +-----------+--------------------------------+-------+
- | ``%X`` | Locale's appropriate time | |
- | | representation. | |
- +-----------+--------------------------------+-------+
- | ``%y`` | Year without century as a | |
- | | decimal number [00,99]. | |
- +-----------+--------------------------------+-------+
- | ``%Y`` | Year with century as a decimal | |
- | | number. | |
- +-----------+--------------------------------+-------+
- | ``%Z`` | Time zone name (no characters | |
- | | if no time zone exists). | |
- +-----------+--------------------------------+-------+
- | ``%%`` | A literal ``'%'`` character. | |
- +-----------+--------------------------------+-------+
-
- Notes
- -----
-
- (1)
- The ``%f`` directive is the same as ``%y`` if the frequency is
- not quarterly.
- Otherwise, it corresponds to the 'fiscal' year, as defined by
- the :attr:`qyear` attribute.
-
- (2)
- The ``%F`` directive is the same as ``%Y`` if the frequency is
- not quarterly.
- Otherwise, it corresponds to the 'fiscal' year, as defined by
- the :attr:`qyear` attribute.
-
- (3)
- The ``%p`` directive only affects the output hour field
- if the ``%I`` directive is used to parse the hour.
-
- (4)
- The range really is ``0`` to ``61``; this accounts for leap
- seconds and the (very rare) double leap seconds.
-
- (5)
- The ``%U`` and ``%W`` directives are only used in calculations
- when the day of the week and the year are specified.
-
- Examples
- --------
-
- >>> from pandas import Period
- >>> a = Period(freq='Q-JUL', year=2006, quarter=1)
- >>> a.strftime('%F-Q%q')
- '2006-Q1'
- >>> # Output the last month in the quarter of this date
- >>> a.strftime('%b-%Y')
- 'Oct-2005'
- >>>
- >>> a = Period(freq='D', year=2001, month=1, day=1)
- >>> a.strftime('%d-%b-%Y')
- '01-Jan-2001'
- >>> a.strftime('%b. %d, %Y was a %A')
- 'Jan. 01, 2001 was a Monday'
- """
- base = self._dtype._dtype_code
- return period_format(self.ordinal, base, fmt)
-
-
-class Period(_Period):
- """
- Represents a period of time.
-
- Parameters
- ----------
- value : Period or str, default None
- The time period represented (e.g., '4Q2005'). This represents neither
- the start nor the end of the period, but rather the entire period itself.
- freq : str, default None
- One of pandas period strings or corresponding objects. Accepted
- strings are listed in the
- :ref:`offset alias section <timeseries.offset_aliases>` in the user docs.
- ordinal : int, default None
- The period offset from the proleptic Gregorian epoch.
- year : int, default None
- Year value of the period.
- month : int, default 1
- Month value of the period.
- quarter : int, default None
- Quarter value of the period.
- day : int, default 1
- Day value of the period.
- hour : int, default 0
- Hour value of the period.
- minute : int, default 0
- Minute value of the period.
- second : int, default 0
- Second value of the period.
-
- Examples
- --------
- >>> period = pd.Period('2012-1-1', freq='D')
- >>> period
- Period('2012-01-01', 'D')
- """
-
- def __new__(cls, value=None, freq=None, ordinal=None,
- year=None, month=None, quarter=None, day=None,
- hour=None, minute=None, second=None):
- # freq points to a tuple (base, mult); base is one of the defined
- # periods such as A, Q, etc. Every five minutes would be, e.g.,
- # ('T', 5) but may be passed in as a string like '5T'
-
- # ordinal is the period offset from the gregorian proleptic epoch
-
- if freq is not None:
- freq = cls._maybe_convert_freq(freq)
- nanosecond = 0
-
- if ordinal is not None and value is not None:
- raise ValueError("Only value or ordinal but not both should be "
- "given but not both")
- elif ordinal is not None:
- if not util.is_integer_object(ordinal):
- raise ValueError("Ordinal must be an integer")
- if freq is None:
- raise ValueError("Must supply freq for ordinal value")
-
- elif value is None:
- if (year is None and month is None and
- quarter is None and day is None and
- hour is None and minute is None and second is None):
- ordinal = NPY_NAT
- else:
- if freq is None:
- raise ValueError("If value is None, freq cannot be None")
-
- # set defaults
- month = 1 if month is None else month
- day = 1 if day is None else day
- hour = 0 if hour is None else hour
- minute = 0 if minute is None else minute
- second = 0 if second is None else second
-
- ordinal = _ordinal_from_fields(year, month, quarter, day,
- hour, minute, second, freq)
-
- elif is_period_object(value):
- other = value
- if freq is None or freq._period_dtype_code == other.freq._period_dtype_code:
- ordinal = other.ordinal
- freq = other.freq
- else:
- converted = other.asfreq(freq)
- ordinal = converted.ordinal
-
- elif checknull_with_nat(value) or (isinstance(value, str) and
- (value in nat_strings or len(value) == 0)):
- # explicit str check is necessary to avoid raising incorrectly
- # if we have a non-hashable value.
- ordinal = NPY_NAT
-
- elif isinstance(value, str) or util.is_integer_object(value):
- if util.is_integer_object(value):
- if value == NPY_NAT:
- value = "NaT"
-
- value = str(value)
- value = value.upper()
-
- freqstr = freq.rule_code if freq is not None else None
- try:
- dt, reso = parse_datetime_string_with_reso(value, freqstr)
- except ValueError as err:
- match = re.search(r"^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}", value)
- if match:
- # Case that cannot be parsed (correctly) by our datetime
- # parsing logic
- dt, freq = _parse_weekly_str(value, freq)
- else:
- raise err
-
- else:
- if reso == "nanosecond":
- nanosecond = dt.nanosecond
- if dt is NaT:
- ordinal = NPY_NAT
-
- if freq is None and ordinal != NPY_NAT:
- # Skip NaT, since it doesn't have a resolution
- freq = attrname_to_abbrevs[reso]
- freq = to_offset(freq)
-
- elif PyDateTime_Check(value):
- dt = value
- if freq is None:
- raise ValueError("Must supply freq for datetime value")
- if isinstance(dt, Timestamp):
- nanosecond = dt.nanosecond
- elif util.is_datetime64_object(value):
- dt = Timestamp(value)
- if freq is None:
- raise ValueError("Must supply freq for datetime value")
- nanosecond = dt.nanosecond
- elif PyDate_Check(value):
- dt = datetime(year=value.year, month=value.month, day=value.day)
- if freq is None:
- raise ValueError("Must supply freq for datetime value")
- else:
- msg = "Value must be Period, string, integer, or datetime"
- raise ValueError(msg)
-
- if ordinal is None:
- base = freq_to_dtype_code(freq)
- ordinal = period_ordinal(dt.year, dt.month, dt.day,
- dt.hour, dt.minute, dt.second,
- dt.microsecond, 1000*nanosecond, base)
-
- return cls._from_ordinal(ordinal, freq)
-
-
-cdef bint is_period_object(object obj):
- return isinstance(obj, _Period)
-
-
-cpdef int freq_to_dtype_code(BaseOffset freq) except? -1:
- try:
- return freq._period_dtype_code
- except AttributeError as err:
- raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) from err
-
-
-cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day,
- int hour, int minute, int second,
- BaseOffset freq):
- base = freq_to_dtype_code(freq)
- if quarter is not None:
- year, month = quarter_to_myear(year, quarter, freq.freqstr)
-
- return period_ordinal(year, month, day, hour,
- minute, second, 0, 0, base)
-
-
-def validate_end_alias(how: str) -> str: # Literal["E", "S"]
- how_dict = {"S": "S", "E": "E",
- "START": "S", "FINISH": "E",
- "BEGIN": "S", "END": "E"}
- how = how_dict.get(str(how).upper())
- if how not in {"S", "E"}:
- raise ValueError("How must be one of S or E")
- return how
-
-
-cdef _parse_weekly_str(value, BaseOffset freq):
- """
- Parse e.g. "2017-01-23/2017-01-29", which cannot be parsed by the general
- datetime-parsing logic. This ensures that we can round-trip with
- Period.__str__ with weekly freq.
- """
- # GH#50803
- start, end = value.split("/")
- start = Timestamp(start)
- end = Timestamp(end)
-
- if (end - start).days != 6:
- # We are interested in cases where this is str(period)
- # of a Week-freq period
- raise ValueError("Could not parse as weekly-freq Period")
-
- if freq is None:
- day_name = end.day_name()[:3].upper()
- freqstr = f"W-{day_name}"
- freq = to_offset(freqstr)
- # We _should_ have freq.is_on_offset(end)
-
- return end, freq
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.c b/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.c
deleted file mode 100644
index 2bac6c720c3..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.c
+++ /dev/null
@@ -1,1093 +0,0 @@
-/*
-
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-
-Copyright (c) 2005-2011, NumPy Developers
-All rights reserved.
-
-This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
-
-*/
-
-#define NO_IMPORT
-
-#ifndef NPY_NO_DEPRECATED_API
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
-
-#include <Python.h>
-
-#include <numpy/arrayobject.h>
-#include <numpy/arrayscalars.h>
-#include <numpy/ndarraytypes.h>
-#include "np_datetime.h"
-
-
-const npy_datetimestruct _AS_MIN_DTS = {
- 1969, 12, 31, 23, 59, 50, 776627, 963145, 224193};
-const npy_datetimestruct _FS_MIN_DTS = {
- 1969, 12, 31, 21, 26, 16, 627963, 145224, 193000};
-const npy_datetimestruct _PS_MIN_DTS = {
- 1969, 9, 16, 5, 57, 7, 963145, 224193, 0};
-const npy_datetimestruct _NS_MIN_DTS = {
- 1677, 9, 21, 0, 12, 43, 145224, 193000, 0};
-const npy_datetimestruct _US_MIN_DTS = {
- -290308, 12, 21, 19, 59, 05, 224193, 0, 0};
-const npy_datetimestruct _MS_MIN_DTS = {
- -292275055, 5, 16, 16, 47, 4, 193000, 0, 0};
-const npy_datetimestruct _S_MIN_DTS = {
- -292277022657, 1, 27, 8, 29, 53, 0, 0, 0};
-const npy_datetimestruct _M_MIN_DTS = {
- -17536621475646, 5, 4, 5, 53, 0, 0, 0, 0};
-
-const npy_datetimestruct _AS_MAX_DTS = {
- 1970, 1, 1, 0, 0, 9, 223372, 36854, 775807};
-const npy_datetimestruct _FS_MAX_DTS = {
- 1970, 1, 1, 2, 33, 43, 372036, 854775, 807000};
-const npy_datetimestruct _PS_MAX_DTS = {
- 1970, 4, 17, 18, 2, 52, 36854, 775807, 0};
-const npy_datetimestruct _NS_MAX_DTS = {
- 2262, 4, 11, 23, 47, 16, 854775, 807000, 0};
-const npy_datetimestruct _US_MAX_DTS = {
- 294247, 1, 10, 4, 0, 54, 775807, 0, 0};
-const npy_datetimestruct _MS_MAX_DTS = {
- 292278994, 8, 17, 7, 12, 55, 807000, 0, 0};
-const npy_datetimestruct _S_MAX_DTS = {
- 292277026596, 12, 4, 15, 30, 7, 0, 0, 0};
-const npy_datetimestruct _M_MAX_DTS = {
- 17536621479585, 8, 30, 18, 7, 0, 0, 0, 0};
-
-
-const int days_per_month_table[2][12] = {
- {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
- {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
-
-/*
- * Returns 1 if the given year is a leap year, 0 otherwise.
- */
-int is_leapyear(npy_int64 year) {
- return (year & 0x3) == 0 && /* year % 4 == 0 */
- ((year % 100) != 0 || (year % 400) == 0);
-}
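-
-/*
- * Illustrative usage sketch (not part of the vendored pandas/NumPy
- * sources, and not referenced by the build): century years are leap
- * years only when divisible by 400.
- */
-static int example_is_leapyear_usage(void) {
-    /* 2000 is divisible by 400 -> leap; 1900 is not -> common year */
-    return is_leapyear(2000) && !is_leapyear(1900) && is_leapyear(2024);
-}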
-
-/*
- * Adjusts a datetimestruct based on a minutes offset. Assumes
- * the current values are valid.
- */
-void add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes) {
- int isleap;
-
- /* MINUTES */
- dts->min += minutes;
- while (dts->min < 0) {
- dts->min += 60;
- dts->hour--;
- }
- while (dts->min >= 60) {
- dts->min -= 60;
- dts->hour++;
- }
-
- /* HOURS */
- while (dts->hour < 0) {
- dts->hour += 24;
- dts->day--;
- }
- while (dts->hour >= 24) {
- dts->hour -= 24;
- dts->day++;
- }
-
- /* DAYS */
- if (dts->day < 1) {
- dts->month--;
- if (dts->month < 1) {
- dts->year--;
- dts->month = 12;
- }
- isleap = is_leapyear(dts->year);
- dts->day += days_per_month_table[isleap][dts->month - 1];
- } else if (dts->day > 28) {
- isleap = is_leapyear(dts->year);
- if (dts->day > days_per_month_table[isleap][dts->month - 1]) {
- dts->day -= days_per_month_table[isleap][dts->month - 1];
- dts->month++;
- if (dts->month > 12) {
- dts->year++;
- dts->month = 1;
- }
- }
- }
-}
-
-/*
- * Calculates the days offset from the 1970 epoch.
- */
-npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
- int i, month;
- npy_int64 year, days = 0;
- const int *month_lengths;
-
- year = dts->year - 1970;
- days = year * 365;
-
- /* Adjust for leap years */
- if (days >= 0) {
- /*
- * 1968 is the closest leap year before 1970.
- * Exclude the current year, so add 1.
- */
- year += 1;
- /* Add one day for each 4 years */
- days += year / 4;
- /* 1900 is the closest previous year divisible by 100 */
- year += 68;
- /* Subtract one day for each 100 years */
- days -= year / 100;
- /* 1600 is the closest previous year divisible by 400 */
- year += 300;
- /* Add one day for each 400 years */
- days += year / 400;
- } else {
- /*
-         * 1972 is the closest leap year after 1970.
- * Include the current year, so subtract 2.
- */
- year -= 2;
- /* Subtract one day for each 4 years */
- days += year / 4;
- /* 2000 is the closest later year divisible by 100 */
- year -= 28;
- /* Add one day for each 100 years */
- days -= year / 100;
- /* 2000 is also the closest later year divisible by 400 */
- /* Subtract one day for each 400 years */
- days += year / 400;
- }
-
- month_lengths = days_per_month_table[is_leapyear(dts->year)];
- month = dts->month - 1;
-
- /* Add the months */
- for (i = 0; i < month; ++i) {
- days += month_lengths[i];
- }
-
- /* Add the days */
- days += dts->day - 1;
-
- return days;
-}
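-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): 2000-01-01 lies 30 years after the epoch with 7 leap days
- * (1972..1996), so the expected offset is 30*365 + 7 = 10957 days.
- */
-static npy_int64 example_epoch_days_for_2000(void) {
-    npy_datetimestruct dts = {2000, 1, 1, 0, 0, 0, 0, 0, 0};
-    return get_datetimestruct_days(&dts);  /* expected: 10957 */
-}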
-
-/*
- * Modifies '*days_' to be the day offset within the year,
- * and returns the year.
- */
-static npy_int64 days_to_yearsdays(npy_int64 *days_) {
- const npy_int64 days_per_400years = (400 * 365 + 100 - 4 + 1);
- /* Adjust so it's relative to the year 2000 (divisible by 400) */
- npy_int64 days = (*days_) - (365 * 30 + 7);
- npy_int64 year;
-
- /* Break down the 400 year cycle to get the year and day within the year */
- if (days >= 0) {
- year = 400 * (days / days_per_400years);
- days = days % days_per_400years;
- } else {
- year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);
- days = days % days_per_400years;
- if (days < 0) {
- days += days_per_400years;
- }
- }
-
- /* Work out the year/day within the 400 year cycle */
- if (days >= 366) {
- year += 100 * ((days - 1) / (100 * 365 + 25 - 1));
- days = (days - 1) % (100 * 365 + 25 - 1);
- if (days >= 365) {
- year += 4 * ((days + 1) / (4 * 365 + 1));
- days = (days + 1) % (4 * 365 + 1);
- if (days >= 366) {
- year += (days - 1) / 365;
- days = (days - 1) % 365;
- }
- }
- }
-
- *days_ = days;
- return year + 2000;
-}
-
-/*
- * Adjusts a datetimestruct based on a seconds offset. Assumes
- * the current values are valid.
- */
-NPY_NO_EXPORT void add_seconds_to_datetimestruct(npy_datetimestruct *dts,
- int seconds) {
- int minutes;
-
- dts->sec += seconds;
- if (dts->sec < 0) {
- minutes = dts->sec / 60;
- dts->sec = dts->sec % 60;
- if (dts->sec < 0) {
- --minutes;
- dts->sec += 60;
- }
- add_minutes_to_datetimestruct(dts, minutes);
- } else if (dts->sec >= 60) {
- minutes = dts->sec / 60;
- dts->sec = dts->sec % 60;
- add_minutes_to_datetimestruct(dts, minutes);
- }
-}
-
-/*
- * Fills in the year, month, day in 'dts' based on the days
- * offset from 1970.
- */
-static void set_datetimestruct_days(npy_int64 days, npy_datetimestruct *dts) {
- const int *month_lengths;
- int i;
-
- dts->year = days_to_yearsdays(&days);
- month_lengths = days_per_month_table[is_leapyear(dts->year)];
-
- for (i = 0; i < 12; ++i) {
- if (days < month_lengths[i]) {
- dts->month = i + 1;
- dts->day = days + 1;
- return;
- } else {
- days -= month_lengths[i];
- }
- }
-}
-
-/*
- * Compares two npy_datetimestruct objects chronologically
- */
-int cmp_npy_datetimestruct(const npy_datetimestruct *a,
- const npy_datetimestruct *b) {
- if (a->year > b->year) {
- return 1;
- } else if (a->year < b->year) {
- return -1;
- }
-
- if (a->month > b->month) {
- return 1;
- } else if (a->month < b->month) {
- return -1;
- }
-
- if (a->day > b->day) {
- return 1;
- } else if (a->day < b->day) {
- return -1;
- }
-
- if (a->hour > b->hour) {
- return 1;
- } else if (a->hour < b->hour) {
- return -1;
- }
-
- if (a->min > b->min) {
- return 1;
- } else if (a->min < b->min) {
- return -1;
- }
-
- if (a->sec > b->sec) {
- return 1;
- } else if (a->sec < b->sec) {
- return -1;
- }
-
- if (a->us > b->us) {
- return 1;
- } else if (a->us < b->us) {
- return -1;
- }
-
- if (a->ps > b->ps) {
- return 1;
- } else if (a->ps < b->ps) {
- return -1;
- }
-
- if (a->as > b->as) {
- return 1;
- } else if (a->as < b->as) {
- return -1;
- }
-
- return 0;
-}
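-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): the comparison returns -1, 0 or 1 like strcmp, checking the
- * fields from year down to attoseconds.
- */
-static int example_cmp_usage(void) {
-    npy_datetimestruct a = {2023, 6, 1, 0, 0, 0, 0, 0, 0};
-    npy_datetimestruct b = {2023, 6, 2, 0, 0, 0, 0, 0, 0};
-    return cmp_npy_datetimestruct(&a, &b);  /* expected: -1 (a < b) */
-}
-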
-/*
- * Returns the offset from UTC of the timezone as a timedelta.
- * The caller is responsible for ensuring that the tzinfo
- * attribute exists on the datetime object.
- *
- * If the passed object is timezone naive, Py_None is returned.
- * If extraction of the offset fails, NULL is returned.
- *
- * NOTE: This function is not vendored from numpy.
- */
-PyObject *extract_utc_offset(PyObject *obj) {
- PyObject *tmp = PyObject_GetAttrString(obj, "tzinfo");
- if (tmp == NULL) {
- return NULL;
- }
- if (tmp != Py_None) {
- PyObject *offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj);
- if (offset == NULL) {
- Py_DECREF(tmp);
- return NULL;
- }
- return offset;
- }
- return tmp;
-}
-
-/*
- *
- * Converts a Python datetime.datetime or datetime.date
- * object into a NumPy npy_datetimestruct. Uses tzinfo (if present)
- * to convert to UTC time.
- *
- * The following implementation just asks for attributes, and thus
- * supports datetime duck typing. The tzinfo time zone conversion
- * requires this style of access as well.
- *
- * Returns -1 on error, 0 on success, and 1 (with no error set)
- * if obj doesn't have the needed date or datetime attributes.
- */
-int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
- npy_datetimestruct *out) {
- // Assumes that obj is a valid datetime object
- PyObject *tmp;
- PyObject *obj = (PyObject*)dtobj;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(npy_datetimestruct));
- out->month = 1;
- out->day = 1;
-
- out->year = PyLong_AsLong(PyObject_GetAttrString(obj, "year"));
- out->month = PyLong_AsLong(PyObject_GetAttrString(obj, "month"));
- out->day = PyLong_AsLong(PyObject_GetAttrString(obj, "day"));
-
- // TODO(anyone): If we can get PyDateTime_IMPORT to work, we could use
- // PyDateTime_Check here, and less verbose attribute lookups.
-
- /* Check for time attributes (if not there, return success as a date) */
- if (!PyObject_HasAttrString(obj, "hour") ||
- !PyObject_HasAttrString(obj, "minute") ||
- !PyObject_HasAttrString(obj, "second") ||
- !PyObject_HasAttrString(obj, "microsecond")) {
- return 0;
- }
-
- out->hour = PyLong_AsLong(PyObject_GetAttrString(obj, "hour"));
- out->min = PyLong_AsLong(PyObject_GetAttrString(obj, "minute"));
- out->sec = PyLong_AsLong(PyObject_GetAttrString(obj, "second"));
- out->us = PyLong_AsLong(PyObject_GetAttrString(obj, "microsecond"));
-
- if (PyObject_HasAttrString(obj, "tzinfo")) {
- PyObject *offset = extract_utc_offset(obj);
- /* Apply the time zone offset if datetime obj is tz-aware */
- if (offset != NULL) {
- if (offset == Py_None) {
- Py_DECREF(offset);
- return 0;
- }
- PyObject *tmp_int;
- int seconds_offset, minutes_offset;
- /*
- * The timedelta should have a function "total_seconds"
- * which contains the value we want.
- */
- tmp = PyObject_CallMethod(offset, "total_seconds", "");
- Py_DECREF(offset);
- if (tmp == NULL) {
- return -1;
- }
- tmp_int = PyNumber_Long(tmp);
- if (tmp_int == NULL) {
- Py_DECREF(tmp);
- return -1;
- }
- seconds_offset = PyLong_AsLong(tmp_int);
- if (seconds_offset == -1 && PyErr_Occurred()) {
- Py_DECREF(tmp_int);
- Py_DECREF(tmp);
- return -1;
- }
- Py_DECREF(tmp_int);
- Py_DECREF(tmp);
-
- /* Convert to a minutes offset and apply it */
- minutes_offset = seconds_offset / 60;
-
- add_minutes_to_datetimestruct(out, -minutes_offset);
- }
- }
-
- return 0;
-}
-
-
-/*
- * Converts a datetime from a datetimestruct to a datetime based
- * on a metadata unit. The date is assumed to be valid.
- */
-npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
- const npy_datetimestruct *dts) {
- npy_datetime ret;
-
- if (base == NPY_FR_Y) {
- /* Truncate to the year */
- ret = dts->year - 1970;
- } else if (base == NPY_FR_M) {
- /* Truncate to the month */
- ret = 12 * (dts->year - 1970) + (dts->month - 1);
- } else {
- /* Otherwise calculate the number of days to start */
- npy_int64 days = get_datetimestruct_days(dts);
-
- switch (base) {
- case NPY_FR_W:
- /* Truncate to weeks */
- if (days >= 0) {
- ret = days / 7;
- } else {
- ret = (days - 6) / 7;
- }
- break;
- case NPY_FR_D:
- ret = days;
- break;
- case NPY_FR_h:
- ret = days * 24 + dts->hour;
- break;
- case NPY_FR_m:
- ret = (days * 24 + dts->hour) * 60 + dts->min;
- break;
- case NPY_FR_s:
- ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec;
- break;
- case NPY_FR_ms:
- ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000 +
- dts->us / 1000;
- break;
- case NPY_FR_us:
- ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us;
- break;
- case NPY_FR_ns:
- ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000 +
- dts->ps / 1000;
- break;
- case NPY_FR_ps:
- ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000000 +
- dts->ps;
- break;
- case NPY_FR_fs:
- /* only 2.6 hours */
- ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000000 +
- dts->ps) *
- 1000 +
- dts->as / 1000;
- break;
- case NPY_FR_as:
- /* only 9.2 secs */
- ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 +
- dts->sec) *
- 1000000 +
- dts->us) *
- 1000000 +
- dts->ps) *
- 1000000 +
- dts->as;
- break;
- default:
- /* Something got corrupted */
- PyErr_SetString(
- PyExc_ValueError,
- "NumPy datetime metadata with corrupt unit value");
- return -1;
- }
- }
- return ret;
-}
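-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): converting a calendar date into an epoch count in a chosen
- * unit; 2000-01-01T00:00:00 is 10957 * 86400 = 946684800 seconds.
- */
-static npy_datetime example_dts_to_epoch_seconds(void) {
-    npy_datetimestruct dts = {2000, 1, 1, 0, 0, 0, 0, 0, 0};
-    return npy_datetimestruct_to_datetime(NPY_FR_s, &dts);  /* 946684800 */
-}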
-
-/*
- * Port numpy#13188 https://github.com/numpy/numpy/pull/13188/
- *
- * Computes the python `ret, d = divmod(d, unit)`.
- *
- * Note that GCC is smart enough at -O2 to eliminate the `if(*d < 0)` branch
- * for subsequent calls to this command - it is able to deduce that `*d >= 0`.
- */
-npy_int64 extract_unit(npy_datetime *d, npy_datetime unit) {
- assert(unit > 0);
- npy_int64 div = *d / unit;
- npy_int64 mod = *d % unit;
- if (mod < 0) {
- mod += unit;
- div -= 1;
- }
- assert(mod >= 0);
- *d = mod;
- return div;
-}
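-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): like Python's divmod, a negative input is folded back into a
- * non-negative remainder while the quotient is decremented.
- */
-static npy_int64 example_extract_unit_negative(void) {
-    npy_datetime d = -1;  /* e.g. -1 nanosecond */
-    npy_int64 q = extract_unit(&d, 1000000000LL);
-    /* afterwards q == -1 and d == 999999999 */
-    return q;
-}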
-
-/*
- * Converts a datetime based on the given metadata into a datetimestruct
- */
-void pandas_datetime_to_datetimestruct(npy_datetime dt,
- NPY_DATETIMEUNIT base,
- npy_datetimestruct *out) {
- npy_int64 perday;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(npy_datetimestruct));
- out->year = 1970;
- out->month = 1;
- out->day = 1;
-
- /*
- * Note that care must be taken with the / and % operators
- * for negative values.
- */
- switch (base) {
- case NPY_FR_Y:
- out->year = 1970 + dt;
- break;
-
- case NPY_FR_M:
- out->year = 1970 + extract_unit(&dt, 12);
- out->month = dt + 1;
- break;
-
- case NPY_FR_W:
- /* A week is 7 days */
- set_datetimestruct_days(dt * 7, out);
- break;
-
- case NPY_FR_D:
- set_datetimestruct_days(dt, out);
- break;
-
- case NPY_FR_h:
- perday = 24LL;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = dt;
- break;
-
- case NPY_FR_m:
- perday = 24LL * 60;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 60);
- out->min = (int)dt;
- break;
-
- case NPY_FR_s:
- perday = 24LL * 60 * 60;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 60 * 60);
- out->min = (int)extract_unit(&dt, 60);
- out->sec = (int)dt;
- break;
-
- case NPY_FR_ms:
- perday = 24LL * 60 * 60 * 1000;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 60);
- out->sec = (int)extract_unit(&dt, 1000LL);
- out->us = (int)(dt * 1000);
- break;
-
- case NPY_FR_us:
- perday = 24LL * 60LL * 60LL * 1000LL * 1000LL;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000);
- out->us = (int)dt;
- break;
-
- case NPY_FR_ns:
- perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000);
- out->us = (int)extract_unit(&dt, 1000LL);
- out->ps = (int)(dt * 1000);
- break;
-
- case NPY_FR_ps:
- perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000;
-
- set_datetimestruct_days(extract_unit(&dt, perday), out);
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60 * 60);
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000);
- out->us = (int)extract_unit(&dt, 1000LL);
- out->ps = (int)(dt * 1000);
- break;
-
- case NPY_FR_fs:
- /* entire range is only +- 2.6 hours */
- out->hour = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000 * 60 * 60);
- if (out->hour < 0) {
- out->year = 1969;
- out->month = 12;
- out->day = 31;
- out->hour += 24;
- assert(out->hour >= 0);
- }
- out->min = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000 * 60);
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000);
- out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000);
- out->ps = (int)extract_unit(&dt, 1000LL);
- out->as = (int)(dt * 1000);
- break;
-
- case NPY_FR_as:
- /* entire range is only +- 9.2 seconds */
- out->sec = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000 *
- 1000 * 1000);
- if (out->sec < 0) {
- out->year = 1969;
- out->month = 12;
- out->day = 31;
- out->hour = 23;
- out->min = 59;
- out->sec += 60;
- assert(out->sec >= 0);
- }
- out->us = (int)extract_unit(&dt, 1000LL * 1000 * 1000 * 1000);
- out->ps = (int)extract_unit(&dt, 1000LL * 1000);
- out->as = (int)dt;
- break;
-
- default:
- PyErr_SetString(PyExc_RuntimeError,
- "NumPy datetime metadata is corrupted with invalid "
- "base unit");
- }
-}
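-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): decomposing an epoch count back into calendar fields;
- * 1000000001 ns after the epoch is 1970-01-01T00:00:01.000000001.
- */
-static void example_ns_to_datetimestruct(npy_datetimestruct *out) {
-    pandas_datetime_to_datetimestruct(1000000001LL, NPY_FR_ns, out);
-    /* out->sec == 1, out->us == 0, out->ps == 1000 */
-}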
-
-/*
- * Converts a timedelta based on a metadata unit into a
- * pandas_timedeltastruct. The timedelta is assumed to be valid.
- */
-void pandas_timedelta_to_timedeltastruct(npy_timedelta td,
- NPY_DATETIMEUNIT base,
- pandas_timedeltastruct *out) {
- npy_int64 frac;
- npy_int64 sfrac;
- npy_int64 ifrac;
- int sign;
- npy_int64 per_day;
- npy_int64 per_sec;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(pandas_timedeltastruct));
-
- switch (base) {
- case NPY_FR_ns:
-
- per_day = 86400000000000LL;
- per_sec = 1000LL * 1000LL * 1000LL;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
-                // not a whole number of days: keep a positive remainder
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = ifrac / (1000LL * 1000LL);
- ifrac -= out->ms * 1000LL * 1000LL;
- out->us = ifrac / 1000LL;
- ifrac -= out->us * 1000LL;
- out->ns = ifrac;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_us:
-
- per_day = 86400000000LL;
- per_sec = 1000LL * 1000LL;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
-                // not a whole number of days: keep a positive remainder
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = ifrac / 1000LL;
- ifrac -= out->ms * 1000LL;
- out->us = ifrac / 1L;
- ifrac -= out->us * 1L;
- out->ns = ifrac;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_ms:
-
- per_day = 86400000LL;
- per_sec = 1000LL;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
-                // not a whole number of days: keep a positive remainder
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = ifrac;
- out->us = 0;
- out->ns = 0;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_s:
-            // special case where we can simplify many expressions because per_sec=1
-
- per_day = 86400LL;
- per_sec = 1L;
-
- // put frac in seconds
- if (td < 0 && td % per_sec != 0)
- frac = td / per_sec - 1;
- else
- frac = td / per_sec;
-
- if (frac < 0) {
- sign = -1;
-
-                // not a whole number of days: keep a positive remainder
- if ((-frac % 86400LL) != 0) {
- out->days = -frac / 86400LL + 1;
- frac += 86400LL * out->days;
- } else {
- frac = -frac;
- }
- } else {
- sign = 1;
- out->days = 0;
- }
-
- if (frac >= 86400) {
- out->days += frac / 86400LL;
- frac -= out->days * 86400LL;
- }
-
- if (frac >= 3600) {
- out->hrs = frac / 3600LL;
- frac -= out->hrs * 3600LL;
- } else {
- out->hrs = 0;
- }
-
- if (frac >= 60) {
- out->min = frac / 60LL;
- frac -= out->min * 60LL;
- } else {
- out->min = 0;
- }
-
- if (frac >= 0) {
- out->sec = frac;
- frac -= out->sec;
- } else {
- out->sec = 0;
- }
-
- sfrac = (out->hrs * 3600LL + out->min * 60LL
- + out->sec) * per_sec;
-
- if (sign < 0)
- out->days = -out->days;
-
- ifrac = td - (out->days * per_day + sfrac);
-
- if (ifrac != 0) {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- } else {
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- }
- break;
-
- case NPY_FR_m:
-
- out->days = td / 1440LL;
- td -= out->days * 1440LL;
- out->hrs = td / 60LL;
- td -= out->hrs * 60LL;
- out->min = td;
-
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- case NPY_FR_h:
- out->days = td / 24LL;
- td -= out->days * 24LL;
- out->hrs = td;
-
- out->min = 0;
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- case NPY_FR_D:
- out->days = td;
- out->hrs = 0;
- out->min = 0;
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- case NPY_FR_W:
- out->days = 7 * td;
- out->hrs = 0;
- out->min = 0;
- out->sec = 0;
- out->ms = 0;
- out->us = 0;
- out->ns = 0;
- break;
-
- default:
- PyErr_SetString(PyExc_RuntimeError,
- "NumPy timedelta metadata is corrupted with "
- "invalid base unit");
- }
-
- out->seconds = out->hrs * 3600 + out->min * 60 + out->sec;
- out->microseconds = out->ms * 1000 + out->us;
- out->nanoseconds = out->ns;
-}
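-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): negative timedeltas are normalized so that only the day
- * component is negative, matching the "-1 days +23:59:59.999999999"
- * representation pandas shows for a -1 ns Timedelta.
- */
-static void example_negative_ns_timedelta(pandas_timedeltastruct *out) {
-    pandas_timedelta_to_timedeltastruct(-1LL, NPY_FR_ns, out);
-    /* out->days == -1, out->hrs == 23, out->min == 59, out->sec == 59,
-       out->ms == 999, out->us == 999, out->ns == 999 */
-}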
-
-
-/*
- * This function returns a pointer to the DateTimeMetaData
- * contained within the provided datetime dtype.
- *
- * Copied near-verbatim from numpy/core/src/multiarray/datetime.c
- */
-PyArray_DatetimeMetaData
-get_datetime_metadata_from_dtype(PyArray_Descr *dtype) {
- return (((PyArray_DatetimeDTypeMetaData *)dtype->c_metadata)->meta);
-}
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.h b/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.h
deleted file mode 100644
index 6ab915e517c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
-
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-
-Copyright (c) 2005-2011, NumPy Developers
-All rights reserved.
-
-This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
-
-*/
-
-#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_
-#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_
-
-#ifndef NPY_NO_DEPRECATED_API
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
-
-#include <numpy/ndarraytypes.h>
-
-typedef struct {
- npy_int64 days;
- npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds;
-} pandas_timedeltastruct;
-
-extern const npy_datetimestruct _AS_MIN_DTS;
-extern const npy_datetimestruct _AS_MAX_DTS;
-extern const npy_datetimestruct _FS_MIN_DTS;
-extern const npy_datetimestruct _FS_MAX_DTS;
-extern const npy_datetimestruct _PS_MIN_DTS;
-extern const npy_datetimestruct _PS_MAX_DTS;
-extern const npy_datetimestruct _NS_MIN_DTS;
-extern const npy_datetimestruct _NS_MAX_DTS;
-extern const npy_datetimestruct _US_MIN_DTS;
-extern const npy_datetimestruct _US_MAX_DTS;
-extern const npy_datetimestruct _MS_MIN_DTS;
-extern const npy_datetimestruct _MS_MAX_DTS;
-extern const npy_datetimestruct _S_MIN_DTS;
-extern const npy_datetimestruct _S_MAX_DTS;
-extern const npy_datetimestruct _M_MIN_DTS;
-extern const npy_datetimestruct _M_MAX_DTS;
-
-// stuff pandas needs
-// ----------------------------------------------------------------------------
-
-PyObject *extract_utc_offset(PyObject *obj);
-
-int convert_pydatetime_to_datetimestruct(PyObject *dtobj,
- npy_datetimestruct *out);
-
-npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
- const npy_datetimestruct *dts);
-
-void pandas_datetime_to_datetimestruct(npy_datetime val, NPY_DATETIMEUNIT fr,
- npy_datetimestruct *result);
-
-void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
- NPY_DATETIMEUNIT fr,
- pandas_timedeltastruct *result);
-
-extern const int days_per_month_table[2][12];
-
-// stuff numpy-derived code needs in header
-// ----------------------------------------------------------------------------
-
-int is_leapyear(npy_int64 year);
-
-/*
- * Calculates the days offset from the 1970 epoch.
- */
-npy_int64
-get_datetimestruct_days(const npy_datetimestruct *dts);
-
-
-/*
- * Compares two npy_datetimestruct objects chronologically
- */
-int cmp_npy_datetimestruct(const npy_datetimestruct *a,
- const npy_datetimestruct *b);
-
-
-/*
- * Adjusts a datetimestruct based on a minutes offset. Assumes
- * the current values are valid.
- */
-void
-add_minutes_to_datetimestruct(npy_datetimestruct *dts, int minutes);
-
-/*
- * This function returns the DateTimeMetaData
- * contained within the provided datetime dtype.
- */
-PyArray_DatetimeMetaData get_datetime_metadata_from_dtype(
- PyArray_Descr *dtype);
-
-
-#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_H_
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
deleted file mode 100644
index f1f03e6467e..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
+++ /dev/null
@@ -1,1150 +0,0 @@
-/*
-
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-
-Written by Mark Wiebe (mwwiebe@gmail.com)
-Copyright (c) 2011 by Enthought, Inc.
-
-Copyright (c) 2005-2011, NumPy Developers
-All rights reserved.
-
-See NUMPY_LICENSE.txt for the license.
-
-This file implements string parsing and creation for NumPy datetime.
-
-*/
-
-#define PY_SSIZE_T_CLEAN
-#define NO_IMPORT
-
-#ifndef NPY_NO_DEPRECATED_API
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
-
-#include <Python.h>
-
-#include <time.h>
-
-#include <numpy/arrayobject.h>
-#include <numpy/arrayscalars.h>
-#include <numpy/ndarraytypes.h>
-
-#include "np_datetime.h"
-#include "np_datetime_strings.h"
-
-
-/*
- * Parses (almost) standard ISO 8601 date strings. The differences are:
- *
- * + Only seconds may have a decimal point, with up to 18 digits after it
- * (maximum attoseconds precision).
- * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate
- * the date and the time. Both are treated equivalently.
- * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats.
- * + Doesn't handle leap seconds (seconds value has 60 in these cases).
- * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow
- * + Accepts special values "NaT" (not a time), "Today", (current
- * day according to local time) and "Now" (current time in UTC).
- * + ':' separator between hours, minutes, and seconds is optional. When
- * omitted, each component must be 2 digits if it appears. (GH-10041)
- *
- * 'str' must be a NULL-terminated string, and 'len' must be its length.
- *
- * 'out' gets filled with the parsed date-time.
- * 'out_local' gets set to 1 if the parsed time contains timezone,
- * to 0 otherwise.
- * 'out_tzoffset' gets set to timezone offset by minutes
- * if the parsed time was in local time,
- * to 0 otherwise. The values 'now' and 'today' don't get counted
- * as local, and neither do UTC +/-#### timezone offsets, because
- * they aren't using the computer's local timezone offset.
- *
- * Returns 0 on success, -1 on failure.
- */
-
-typedef enum {
- COMPARISON_SUCCESS,
- COMPLETED_PARTIAL_MATCH,
- COMPARISON_ERROR
-} DatetimePartParseResult;
-// This function will advance the pointer on format
-// and decrement characters_remaining by n on success
-// On failure will return COMPARISON_ERROR without incrementing
-// If `format_requirement` is PARTIAL_MATCH, and the `format` string has
-// been exhausted, then return COMPLETED_PARTIAL_MATCH.
-static DatetimePartParseResult compare_format(
- const char **format,
- int *characters_remaining,
- const char *compare_to,
- int n,
- const FormatRequirement format_requirement
-) {
- if (format_requirement == INFER_FORMAT) {
- return COMPARISON_SUCCESS;
- }
- if (*characters_remaining < 0) {
- return COMPARISON_ERROR;
- }
- if (format_requirement == PARTIAL_MATCH && *characters_remaining == 0) {
- return COMPLETED_PARTIAL_MATCH;
- }
- if (*characters_remaining < n) {
- // TODO(pandas-dev): PyErr to differentiate what went wrong
- return COMPARISON_ERROR;
- } else {
- if (strncmp(*format, compare_to, n)) {
- // TODO(pandas-dev): PyErr to differentiate what went wrong
- return COMPARISON_ERROR;
- } else {
- *format += n;
- *characters_remaining -= n;
- return COMPARISON_SUCCESS;
- }
- }
- return COMPARISON_SUCCESS;
-}
-
-int parse_iso_8601_datetime(const char *str, int len, int want_exc,
- npy_datetimestruct *out,
- NPY_DATETIMEUNIT *out_bestunit,
- int *out_local, int *out_tzoffset,
- const char* format, int format_len,
- FormatRequirement format_requirement) {
- if (len < 0 || format_len < 0)
- goto parse_error;
- int year_leap = 0;
- int i, numdigits;
- const char *substr;
- int sublen;
- NPY_DATETIMEUNIT bestunit = NPY_FR_GENERIC;
- DatetimePartParseResult comparison;
-
- /* If year-month-day are separated by a valid separator,
- * months/days without leading zeroes will be parsed
- * (though not iso8601). If the components aren't separated,
- * 4 (YYYY) or 8 (YYYYMMDD) digits are expected. 6 digits are
- * forbidden here (but parsed as YYMMDD elsewhere).
- */
- int has_ymd_sep = 0;
- char ymd_sep = '\0';
- char valid_ymd_sep[] = {'-', '.', '/', '\\', ' '};
- int valid_ymd_sep_len = sizeof(valid_ymd_sep);
-
- /* hour-minute-second may or may not separated by ':'. If not, then
- * each component must be 2 digits. */
- int has_hms_sep = 0;
- int hour_was_2_digits = 0;
-
- /* Initialize the output to all zeros */
- memset(out, 0, sizeof(npy_datetimestruct));
- out->month = 1;
- out->day = 1;
-
- substr = str;
- sublen = len;
-
- /* Skip leading whitespace */
- while (sublen > 0 && isspace(*substr)) {
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- }
-
- /* Leading '-' sign for negative year */
- if (*substr == '-') {
- ++substr;
- --sublen;
- }
-
- if (sublen == 0) {
- goto parse_error;
- }
-
- /* PARSE THE YEAR (4 digits) */
- comparison = compare_format(&format, &format_len, "%Y", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
-
- out->year = 0;
- if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) &&
- isdigit(substr[2]) && isdigit(substr[3])) {
- out->year = 1000 * (substr[0] - '0') + 100 * (substr[1] - '0') +
- 10 * (substr[2] - '0') + (substr[3] - '0');
-
- substr += 4;
- sublen -= 4;
- }
-
- /* Negate the year if necessary */
- if (str[0] == '-') {
- out->year = -out->year;
- }
- /* Check whether it's a leap-year */
- year_leap = is_leapyear(out->year);
-
- /* Next character must be a separator, start of month, or end of string */
- if (sublen == 0) {
- if (out_local != NULL) {
- *out_local = 0;
- }
- if (format_len) {
- goto parse_error;
- }
- bestunit = NPY_FR_Y;
- goto finish;
- }
-
- if (!isdigit(*substr)) {
- for (i = 0; i < valid_ymd_sep_len; ++i) {
- if (*substr == valid_ymd_sep[i]) {
- break;
- }
- }
- if (i == valid_ymd_sep_len) {
- goto parse_error;
- }
- has_ymd_sep = 1;
- ymd_sep = valid_ymd_sep[i];
- ++substr;
- --sublen;
-
- comparison = compare_format(&format, &format_len, &ymd_sep, 1,
- format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* Cannot have trailing separator */
- if (sublen == 0 || !isdigit(*substr)) {
- goto parse_error;
- }
- }
-
- /* PARSE THE MONTH */
- comparison = compare_format(&format, &format_len, "%m", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* First digit required */
- out->month = (*substr - '0');
- ++substr;
- --sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->month = 10 * out->month + (*substr - '0');
- ++substr;
- --sublen;
- } else if (!has_ymd_sep) {
- goto parse_error;
- }
- if (out->month < 1 || out->month > 12) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Month out of range in datetime string \"%s\"", str);
- }
- goto error;
- }
-
- /* Next character must be the separator, start of day, or end of string */
- if (sublen == 0) {
- bestunit = NPY_FR_M;
- /* Forbid YYYYMM. Parsed instead as YYMMDD by someone else. */
- if (!has_ymd_sep) {
- goto parse_error;
- }
- if (format_len) {
- goto parse_error;
- }
- if (out_local != NULL) {
- *out_local = 0;
- }
- goto finish;
- }
-
- if (has_ymd_sep) {
- /* Must have separator, but cannot be trailing */
- if (*substr != ymd_sep || sublen == 1) {
- goto parse_error;
- }
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, &ymd_sep, 1,
- format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- }
-
- /* PARSE THE DAY */
- comparison = compare_format(&format, &format_len, "%d", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* First digit required */
- if (!isdigit(*substr)) {
- goto parse_error;
- }
- out->day = (*substr - '0');
- ++substr;
- --sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->day = 10 * out->day + (*substr - '0');
- ++substr;
- --sublen;
- } else if (!has_ymd_sep) {
- goto parse_error;
- }
- if (out->day < 1 ||
- out->day > days_per_month_table[year_leap][out->month - 1]) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Day out of range in datetime string \"%s\"", str);
- }
- goto error;
- }
-
- /* Next character must be a 'T', ' ', or end of string */
- if (sublen == 0) {
- if (out_local != NULL) {
- *out_local = 0;
- }
- if (format_len) {
- goto parse_error;
- }
- bestunit = NPY_FR_D;
- goto finish;
- }
-
- if ((*substr != 'T' && *substr != ' ') || sublen == 1) {
- goto parse_error;
- }
- comparison = compare_format(&format, &format_len, substr, 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- ++substr;
- --sublen;
-
- /* PARSE THE HOURS */
- comparison = compare_format(&format, &format_len, "%H", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* First digit required */
- if (!isdigit(*substr)) {
- goto parse_error;
- }
- out->hour = (*substr - '0');
- ++substr;
- --sublen;
- /* Second digit optional */
- if (isdigit(*substr)) {
- hour_was_2_digits = 1;
- out->hour = 10 * out->hour + (*substr - '0');
- ++substr;
- --sublen;
- if (out->hour >= 24) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Hours out of range in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- }
-
- /* Next character must be a ':' or the end of the string */
- if (sublen == 0) {
- if (!hour_was_2_digits) {
- goto parse_error;
- }
- if (format_len) {
- goto parse_error;
- }
- bestunit = NPY_FR_h;
- goto finish;
- }
-
- if (*substr == ':') {
- has_hms_sep = 1;
- ++substr;
- --sublen;
- /* Cannot have a trailing separator */
- if (sublen == 0 || !isdigit(*substr)) {
- goto parse_error;
- }
- comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- } else if (!isdigit(*substr)) {
- if (!hour_was_2_digits) {
- goto parse_error;
- }
- goto parse_timezone;
- }
-
- /* PARSE THE MINUTES */
- comparison = compare_format(&format, &format_len, "%M", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* First digit required */
- out->min = (*substr - '0');
- ++substr;
- --sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->min = 10 * out->min + (*substr - '0');
- ++substr;
- --sublen;
- if (out->min >= 60) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Minutes out of range in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (!has_hms_sep) {
- goto parse_error;
- }
-
- if (sublen == 0) {
- bestunit = NPY_FR_m;
- if (format_len) {
- goto parse_error;
- }
- goto finish;
- }
-
- /* If we make it through this condition block, then the next
- * character is a digit. */
- if (has_hms_sep && *substr == ':') {
- comparison = compare_format(&format, &format_len, ":", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- ++substr;
- --sublen;
- /* Cannot have a trailing ':' */
- if (sublen == 0 || !isdigit(*substr)) {
- goto parse_error;
- }
- } else if (!has_hms_sep && isdigit(*substr)) {
- } else {
- goto parse_timezone;
- }
-
- /* PARSE THE SECONDS */
- comparison = compare_format(&format, &format_len, "%S", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* First digit required */
- out->sec = (*substr - '0');
- ++substr;
- --sublen;
- /* Second digit optional if there was a separator */
- if (isdigit(*substr)) {
- out->sec = 10 * out->sec + (*substr - '0');
- ++substr;
- --sublen;
- if (out->sec >= 60) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Seconds out of range in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (!has_hms_sep) {
- goto parse_error;
- }
-
- /* Next character may be a '.' indicating fractional seconds */
- if (sublen > 0 && *substr == '.') {
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, ".", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- } else {
- bestunit = NPY_FR_s;
- goto parse_timezone;
- }
-
- /* PARSE THE MICROSECONDS (0 to 6 digits) */
- comparison = compare_format(&format, &format_len, "%f", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- numdigits = 0;
- for (i = 0; i < 6; ++i) {
- out->us *= 10;
- if (sublen > 0 && isdigit(*substr)) {
- out->us += (*substr - '0');
- ++substr;
- --sublen;
- ++numdigits;
- }
- }
-
- if (sublen == 0 || !isdigit(*substr)) {
- if (numdigits > 3) {
- bestunit = NPY_FR_us;
- } else {
- bestunit = NPY_FR_ms;
- }
- goto parse_timezone;
- }
-
- /* PARSE THE PICOSECONDS (0 to 6 digits) */
- numdigits = 0;
- for (i = 0; i < 6; ++i) {
- out->ps *= 10;
- if (sublen > 0 && isdigit(*substr)) {
- out->ps += (*substr - '0');
- ++substr;
- --sublen;
- ++numdigits;
- }
- }
-
- if (sublen == 0 || !isdigit(*substr)) {
- if (numdigits > 3) {
- bestunit = NPY_FR_ps;
- } else {
- bestunit = NPY_FR_ns;
- }
- goto parse_timezone;
- }
-
- /* PARSE THE ATTOSECONDS (0 to 6 digits) */
- numdigits = 0;
- for (i = 0; i < 6; ++i) {
- out->as *= 10;
- if (sublen > 0 && isdigit(*substr)) {
- out->as += (*substr - '0');
- ++substr;
- --sublen;
- ++numdigits;
- }
- }
-
- if (numdigits > 3) {
- bestunit = NPY_FR_as;
- } else {
- bestunit = NPY_FR_fs;
- }
-
-parse_timezone:
- /* trim any whitespace between time/timezone */
- while (sublen > 0 && isspace(*substr)) {
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- }
-
- if (sublen == 0) {
- // Unlike NumPy, treating no time zone as naive
- if (format_len > 0) {
- goto parse_error;
- }
- goto finish;
- }
-
- /* UTC specifier */
- if (*substr == 'Z') {
- comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* "Z" should be equivalent to tz offset "+00:00" */
- if (out_local != NULL) {
- *out_local = 1;
- }
-
- if (out_tzoffset != NULL) {
- *out_tzoffset = 0;
- }
-
- if (sublen == 1) {
- if (format_len > 0) {
- goto parse_error;
- }
- goto finish;
- } else {
- ++substr;
- --sublen;
- }
- } else if (*substr == '-' || *substr == '+') {
- comparison = compare_format(&format, &format_len, "%z", 2, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- /* Time zone offset */
- int offset_neg = 0, offset_hour = 0, offset_minute = 0;
-
- /*
- * Since "local" means local with respect to the current
- * machine, we say this is non-local.
- */
-
- if (*substr == '-') {
- offset_neg = 1;
- }
- ++substr;
- --sublen;
-
- /* The hours offset */
- if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
- offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0');
- substr += 2;
- sublen -= 2;
- if (offset_hour >= 24) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Timezone hours offset out of range "
- "in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (sublen >= 1 && isdigit(substr[0])) {
- offset_hour = substr[0] - '0';
- ++substr;
- --sublen;
- } else {
- goto parse_error;
- }
-
- /* The minutes offset is optional */
- if (sublen > 0) {
- /* Optional ':' */
- if (*substr == ':') {
- ++substr;
- --sublen;
- }
-
- /* The minutes offset (at the end of the string) */
- if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) {
- offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0');
- substr += 2;
- sublen -= 2;
- if (offset_minute >= 60) {
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Timezone minutes offset out of range "
- "in datetime string \"%s\"",
- str);
- }
- goto error;
- }
- } else if (sublen >= 1 && isdigit(substr[0])) {
- offset_minute = substr[0] - '0';
- ++substr;
- --sublen;
- } else {
- goto parse_error;
- }
- }
-
- /* Apply the time zone offset */
- if (offset_neg) {
- offset_hour = -offset_hour;
- offset_minute = -offset_minute;
- }
- if (out_local != NULL) {
- *out_local = 1;
- // Unlike NumPy, do not change internal value to local time
- *out_tzoffset = 60 * offset_hour + offset_minute;
- }
- }
-
- /* Skip trailing whitespace */
- while (sublen > 0 && isspace(*substr)) {
- ++substr;
- --sublen;
- comparison = compare_format(&format, &format_len, " ", 1, format_requirement);
- if (comparison == COMPARISON_ERROR) {
- goto parse_error;
- } else if (comparison == COMPLETED_PARTIAL_MATCH) {
- goto finish;
- }
- }
-
- if ((sublen != 0) || (format_len != 0)) {
- goto parse_error;
- }
-
-finish:
- if (out_bestunit != NULL) {
- *out_bestunit = bestunit;
- }
- return 0;
-
-parse_error:
- if (want_exc) {
- PyErr_Format(PyExc_ValueError,
- "Error parsing datetime string \"%s\" at position %d", str,
- (int)(substr - str));
- }
- return -1;
-
-error:
- return -1;
-}
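-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): parsing without a format string (INFER_FORMAT) and letting
- * the parser report the finest resolution it saw.
- */
-static int example_parse_iso(npy_datetimestruct *out) {
-    const char *s = "2020-01-01 05:00";   /* 16 characters */
-    NPY_DATETIMEUNIT bestunit;
-    int is_local = 0, tzoffset = 0;
-    /* on success returns 0 and sets bestunit to NPY_FR_m */
-    return parse_iso_8601_datetime(s, 16, 1, out, &bestunit,
-                                   &is_local, &tzoffset, NULL, 0,
-                                   INFER_FORMAT);
-}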
-
-/*
- * Provides a string length to use for converting datetime
- * objects with the given local and unit settings.
- */
-int get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base) {
- int len = 0;
-
- switch (base) {
- /* Generic units can only be used to represent NaT */
- /* return 4;*/
- case NPY_FR_as:
- len += 3; /* "###" */
- case NPY_FR_fs:
- len += 3; /* "###" */
- case NPY_FR_ps:
- len += 3; /* "###" */
- case NPY_FR_ns:
- len += 3; /* "###" */
- case NPY_FR_us:
- len += 3; /* "###" */
- case NPY_FR_ms:
- len += 4; /* ".###" */
- case NPY_FR_s:
- len += 3; /* ":##" */
- case NPY_FR_m:
- len += 3; /* ":##" */
- case NPY_FR_h:
- len += 3; /* "T##" */
- case NPY_FR_D:
- case NPY_FR_W:
- len += 3; /* "-##" */
- case NPY_FR_M:
- len += 3; /* "-##" */
- case NPY_FR_Y:
- len += 21; /* 64-bit year */
- break;
- default:
- len += 3; /* handle the now defunct NPY_FR_B */
- break;
- }
-
- if (base >= NPY_FR_h) {
- if (local) {
- len += 5; /* "+####" or "-####" */
- } else {
- len += 1; /* "Z" */
- }
- }
-
- len += 1; /* NULL terminator */
-
- return len;
-}
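-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): sizing a buffer before calling make_iso_8601_datetime below.
- */
-static int example_iso_buffer_size(void) {
-    /* room for "YYYY-MM-DDTHH:MM:SS" plus "Z" and the NUL terminator */
-    return get_datetime_iso_8601_strlen(0, NPY_FR_s);
-}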
-
-
-/*
- * Converts an npy_datetimestruct to an (almost) ISO 8601
- * NULL-terminated string using timezone Z (UTC). If the string fits in
- * the space exactly, it leaves out the NULL terminator and returns success.
- *
- * The differences from ISO 8601 are the 'NaT' string, and
- * the number of year digits is >= 4 instead of strictly 4.
- *
- * 'base' restricts the output to that unit. Set 'base' to
- * -1 to auto-detect a base after which all the values are zero.
- *
- * Returns 0 on success, -1 on failure (for example if the output
- * string was too short).
- */
-int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
- int utc, NPY_DATETIMEUNIT base) {
- char *substr = outstr;
- int sublen = outlen;
- int tmplen;
-
- /*
- * Print weeks with the same precision as days.
- *
- * TODO: Could print weeks with YYYY-Www format if the week
- * epoch is a Monday.
- */
- if (base == NPY_FR_W) {
- base = NPY_FR_D;
- }
-
-/* YEAR */
-/*
- * Can't use PyOS_snprintf, because it always produces a '\0'
- * character at the end, and NumPy string types are permitted
- * to have data all the way to the end of the buffer.
- */
-#ifdef _WIN32
- tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
-#else
- tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year);
-#endif // _WIN32
- /* If it ran out of space or there isn't space for the NULL terminator */
- if (tmplen < 0 || tmplen > sublen) {
- goto string_too_short;
- }
- substr += tmplen;
- sublen -= tmplen;
-
- /* Stop if the unit is years */
- if (base == NPY_FR_Y) {
- if (sublen > 0) {
- *substr = '\0';
- }
- return 0;
- }
-
- /* MONTH */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = '-';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->month / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->month % 10) + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is months */
- if (base == NPY_FR_M) {
- if (sublen > 0) {
- *substr = '\0';
- }
- return 0;
- }
-
- /* DAY */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = '-';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->day / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->day % 10) + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is days */
- if (base == NPY_FR_D) {
- if (sublen > 0) {
- *substr = '\0';
- }
- return 0;
- }
-
- /* HOUR */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = 'T';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->hour / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->hour % 10) + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is hours */
- if (base == NPY_FR_h) {
- goto add_time_zone;
- }
-
- /* MINUTE */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = ':';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->min / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->min % 10) + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is minutes */
- if (base == NPY_FR_m) {
- goto add_time_zone;
- }
-
- /* SECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = ':';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->sec / 10) + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->sec % 10) + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is seconds */
- if (base == NPY_FR_s) {
- goto add_time_zone;
- }
-
- /* MILLISECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = '.';
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->us / 100000) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->us / 10000) % 10 + '0');
- if (sublen < 4) {
- goto string_too_short;
- }
- substr[3] = (char)((dts->us / 1000) % 10 + '0');
- substr += 4;
- sublen -= 4;
-
- /* Stop if the unit is milliseconds */
- if (base == NPY_FR_ms) {
- goto add_time_zone;
- }
-
- /* MICROSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->us / 100) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->us / 10) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)(dts->us % 10 + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is microseconds */
- if (base == NPY_FR_us) {
- goto add_time_zone;
- }
-
- /* NANOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->ps / 100000) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->ps / 10000) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->ps / 1000) % 10 + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is nanoseconds */
- if (base == NPY_FR_ns) {
- goto add_time_zone;
- }
-
- /* PICOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->ps / 100) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->ps / 10) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)(dts->ps % 10 + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is picoseconds */
- if (base == NPY_FR_ps) {
- goto add_time_zone;
- }
-
- /* FEMTOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->as / 100000) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->as / 10000) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)((dts->as / 1000) % 10 + '0');
- substr += 3;
- sublen -= 3;
-
- /* Stop if the unit is femtoseconds */
- if (base == NPY_FR_fs) {
- goto add_time_zone;
- }
-
- /* ATTOSECOND */
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = (char)((dts->as / 100) % 10 + '0');
- if (sublen < 2) {
- goto string_too_short;
- }
- substr[1] = (char)((dts->as / 10) % 10 + '0');
- if (sublen < 3) {
- goto string_too_short;
- }
- substr[2] = (char)(dts->as % 10 + '0');
- substr += 3;
- sublen -= 3;
-
-add_time_zone:
- /* UTC "Zulu" time */
- if (utc) {
- if (sublen < 1) {
- goto string_too_short;
- }
- substr[0] = 'Z';
- substr += 1;
- sublen -= 1;
- }
- /* Add a NULL terminator, and return */
- if (sublen > 0) {
- substr[0] = '\0';
- }
-
- return 0;
-
-string_too_short:
- PyErr_Format(PyExc_RuntimeError,
- "The string provided for NumPy ISO datetime formatting "
- "was too short, with length %d",
- outlen);
- return -1;
-}
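-
-/*
- * Illustrative sketch (not part of the vendored sources, unused by the
- * build): writing "2020-01-01T05:00:00" at second resolution into a
- * caller-supplied buffer that is assumed to be large enough.
- */
-static int example_format_iso(char *buf, int buflen) {
-    npy_datetimestruct dts = {2020, 1, 1, 5, 0, 0, 0, 0, 0};
-    return make_iso_8601_datetime(&dts, buf, buflen, 0, NPY_FR_s);
-}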
-
-
-int make_iso_8601_timedelta(pandas_timedeltastruct *tds,
- char *outstr, size_t *outlen) {
- *outlen = 0;
- *outlen += snprintf(outstr, 60, // NOLINT
- "P%" NPY_INT64_FMT
- "DT%" NPY_INT32_FMT
- "H%" NPY_INT32_FMT
- "M%" NPY_INT32_FMT,
- tds->days, tds->hrs, tds->min, tds->sec);
- outstr += *outlen;
-
- if (tds->ns != 0) {
- *outlen += snprintf(outstr, 12, // NOLINT
- ".%03" NPY_INT32_FMT
- "%03" NPY_INT32_FMT
- "%03" NPY_INT32_FMT
- "S", tds->ms, tds->us, tds->ns);
- } else if (tds->us != 0) {
- *outlen += snprintf(outstr, 9, // NOLINT
- ".%03" NPY_INT32_FMT
- "%03" NPY_INT32_FMT
- "S", tds->ms, tds->us);
- } else if (tds->ms != 0) {
- *outlen += snprintf(outstr, 6, // NOLINT
- ".%03" NPY_INT32_FMT "S", tds->ms);
- } else {
- *outlen += snprintf(outstr, 2, // NOLINT
- "%s", "S");
- }
-
- return 0;
-}
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
deleted file mode 100644
index a635192d708..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
-
-Copyright (c) 2016, PyData Development Team
-All rights reserved.
-
-Distributed under the terms of the BSD Simplified License.
-
-The full license is in the LICENSE file, distributed with this software.
-
-Written by Mark Wiebe (mwwiebe@gmail.com)
-Copyright (c) 2011 by Enthought, Inc.
-
-Copyright (c) 2005-2011, NumPy Developers
-All rights reserved.
-
-See NUMPY_LICENSE.txt for the license.
-
-This file implements string parsing and creation for NumPy datetime.
-
-*/
-
-#ifndef PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
-#define PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
-
-#ifndef NPY_NO_DEPRECATED_API
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#endif // NPY_NO_DEPRECATED_API
-
-/* 'format_requirement' can be one of three values:
- * * PARTIAL_MATCH : Only require a partial match with 'format'.
- * For example, if the string is '2020-01-01 05:00:00' and
- * 'format' is '%Y-%m-%d', then parse '2020-01-01';
- * * EXACT_MATCH : require an exact match with 'format'. If the
- * string is '2020-01-01', then the only format which will
- * be able to parse it without error is '%Y-%m-%d';
- * * INFER_FORMAT: parse without comparing 'format' (i.e. infer it).
- */
-typedef enum {
- PARTIAL_MATCH,
- EXACT_MATCH,
- INFER_FORMAT
-} FormatRequirement;
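Roughly, these three modes surface through pandas.to_datetime as follows (a hedged sketch assuming pandas 2.x; the mapping is approximate, not a spec):

import pandas as pd

s = "2020-01-01 05:00:00"

# EXACT_MATCH: the whole string must satisfy the format, so this raises.
# pd.to_datetime(s, format="%Y-%m-%d")
# PARTIAL_MATCH: a prefix match is enough, per the comment above.
pd.to_datetime(s, format="%Y-%m-%d", exact=False)
# INFER_FORMAT: no format supplied, so the parser infers it.
pd.to_datetime(s)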
-
-/*
- * Parses (almost) standard ISO 8601 date strings. The differences are:
- *
- * + The date "20100312" is parsed as the year 20100312, not as
- * equivalent to "2010-03-12". The '-' in the dates are not optional.
- * + Only seconds may have a decimal point, with up to 18 digits after it
- * (maximum attoseconds precision).
- * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate
- * the date and the time. Both are treated equivalently.
- * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats.
- * + Doesn't handle leap seconds (seconds value has 60 in these cases).
- * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow
- * + Accepts special values "NaT" (not a time), "Today" (current
- * day according to local time) and "Now" (current time in UTC).
- *
- * 'str' must be a NULL-terminated string, and 'len' must be its length.
- *
- * 'out' gets filled with the parsed date-time.
- * 'out_local' gets whether returned value contains timezone. 0 for UTC, 1 for local time.
- * 'out_tzoffset' gets set to timezone offset by minutes
- * if the parsed time was in local time,
- * to 0 otherwise. The values 'now' and 'today' don't get counted
- * as local, and neither do UTC +/-#### timezone offsets, because
- * they aren't using the computer's local timezone offset.
- *
- * Returns 0 on success, -1 on failure.
- */
-int
-parse_iso_8601_datetime(const char *str, int len, int want_exc,
- npy_datetimestruct *out,
- NPY_DATETIMEUNIT *out_bestunit,
- int *out_local,
- int *out_tzoffset,
- const char* format,
- int format_len,
- FormatRequirement format_requirement);
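A few concrete inputs that follow from the rules listed above (data only; the comments restate the documented behavior rather than asserting anything new):

# Accepted by this parser:
accepted = [
    "2010-03-12T13:01:05.123456789",  # 'T' separator, fractional seconds
    "2010-03-12 13:01:05",            # ' ' separator, treated the same as 'T'
    "NaT", "Today", "Now",            # special values listed above
]
# Rejected, or parsed differently than plain ISO 8601 would suggest:
surprising = [
    "20100312",             # parsed as the year 20100312, not as 2010-03-12
    "2016-366",             # "YYYY-DDD" ordinal dates are not handled
    "2020-06-30T23:59:60",  # leap seconds are not handled
    "2020-01-01T24:00:00",  # 24:00:00 is not a synonym for midnight here
]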
-
-/*
- * Provides a string length to use for converting datetime
- * objects with the given local and unit settings.
- */
-int
-get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
-
-/*
- * Converts an npy_datetimestruct to an (almost) ISO 8601
- * NULL-terminated string using timezone Z (UTC).
- *
- * 'base' restricts the output to that unit. Set 'base' to
- * -1 to auto-detect a base after which all the values are zero.
- *
- * Returns 0 on success, -1 on failure (for example if the output
- * string was too short).
- */
-int
-make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
- int utc, NPY_DATETIMEUNIT base);
-
-/*
- * Converts an pandas_timedeltastruct to an ISO 8601 string.
- *
- * Mutates outlen to provide size of (non-NULL terminated) string.
- *
- * Currently has no error handling
- */
-int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr,
- size_t *outlen);
-#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pxd
deleted file mode 100644
index 175195d4362..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pxd
+++ /dev/null
@@ -1,4 +0,0 @@
-from numpy cimport int64_t
-
-
-cdef bint parse_today_now(str val, int64_t* iresult, bint utc)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyi
deleted file mode 100644
index 4565bb7ecf9..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyi
+++ /dev/null
@@ -1,13 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-def array_strptime(
- values: npt.NDArray[np.object_],
- fmt: str | None,
- exact: bool = ...,
- errors: str = ...,
- utc: bool = ...,
-) -> tuple[np.ndarray, np.ndarray]: ...
-
-# first ndarray is M8[ns], second is object ndarray of tzinfo | None
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyx
deleted file mode 100644
index cf847746f16..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/strptime.pyx
+++ /dev/null
@@ -1,700 +0,0 @@
-"""Strptime-related classes and functions.
-
-TimeRE, _calc_julian_from_U_or_W are vendored
-from the standard library, see
-https://github.com/python/cpython/blob/main/Lib/_strptime.py
-The original module-level docstring follows.
-
-Strptime-related classes and functions.
-CLASSES:
- LocaleTime -- Discovers and stores locale-specific time information
- TimeRE -- Creates regexes for pattern matching a string of text containing
- time information
-FUNCTIONS:
- _getlang -- Figure out what language is being used for the locale
- strptime -- Calculates the time struct represented by the passed-in string
-"""
-from datetime import timezone
-
-from cpython.datetime cimport (
- PyDate_Check,
- PyDateTime_Check,
- date,
- import_datetime,
- timedelta,
- tzinfo,
-)
-from _strptime import (
- TimeRE as _TimeRE,
- _getlang,
-)
-from _strptime import LocaleTime # no-cython-lint
-
-import_datetime()
-
-from _thread import allocate_lock as _thread_allocate_lock
-import re
-
-import numpy as np
-import pytz
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- ndarray,
-)
-
-from pandas._libs.missing cimport checknull_with_nat_and_na
-from pandas._libs.tslibs.conversion cimport (
- convert_timezone,
- get_datetime64_nanos,
-)
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_nat_strings as nat_strings,
-)
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- check_dts_bounds,
- npy_datetimestruct,
- npy_datetimestruct_to_datetime,
- pydate_to_dt64,
- pydatetime_to_dt64,
- string_to_dts,
-)
-from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
-from pandas._libs.tslibs.timestamps cimport _Timestamp
-from pandas._libs.util cimport (
- is_datetime64_object,
- is_float_object,
- is_integer_object,
-)
-
-from pandas._libs.tslibs.timestamps import Timestamp
-
-cnp.import_array()
-
-cdef bint format_is_iso(f: str):
- """
- Does format match the iso8601 set that can be handled by the C parser?
- Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
- but must be consistent. Leading 0s in dates and times are optional.
- """
- iso_regex = re.compile(
- r"""
- ^ # start of string
- %Y # Year
- (?:([-/ \\.]?)%m # month with or without separators
- (?: \1%d # day with same separator as for year-month
- (?:[ T]%H # hour with separator
- (?:\:%M # minute with separator
- (?:\:%S # second with separator
- (?:%z|\.%f(?:%z)? # timezone or fractional second
- )?)?)?)?)?)? # optional
- $ # end of string
- """,
- re.VERBOSE,
- )
- excluded_formats = ["%Y%m"]
- return re.match(iso_regex, f) is not None and f not in excluded_formats
-
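Illustrative format strings and how the regex above classifies them, assuming the semantics described in the docstring (not an exhaustive list):

# ISO-like, so the fast C parser can be used:
iso_like = [
    "%Y-%m-%d",
    "%Y/%m/%d",                 # any one separator from "-/ \." (or none)
    "%Y-%m-%dT%H:%M:%S",
    "%Y-%m-%d %H:%M:%S.%f%z",
]
# Not ISO-like, so the slower regex-matching path is taken:
not_iso = [
    "%d-%m-%Y",   # does not start with %Y
    "%Y-%m/%d",   # inconsistent date separators (the \1 backreference fails)
    "%Y%m",       # would match the regex but is explicitly excluded
]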
-
-def _test_format_is_iso(f: str) -> bool:
- """Only used in testing."""
- return format_is_iso(f)
-
-
-cdef bint parse_today_now(str val, int64_t* iresult, bint utc):
- # We delay this check for as long as possible
- # because it catches relatively rare cases
-
- # Multiply by 1000 to convert to nanos, since these methods naturally have
- # microsecond resolution
- if val == "now":
- if utc:
- iresult[0] = Timestamp.utcnow()._value * 1000
- else:
- # GH#18705 make sure to_datetime("now") matches Timestamp("now")
- # Note using Timestamp.now() is faster than Timestamp("now")
- iresult[0] = Timestamp.now()._value * 1000
- return True
- elif val == "today":
- iresult[0] = Timestamp.today()._value * 1000
- return True
- return False
-
-cdef dict _parse_code_table = {"y": 0,
- "Y": 1,
- "m": 2,
- "B": 3,
- "b": 4,
- "d": 5,
- "H": 6,
- "I": 7,
- "M": 8,
- "S": 9,
- "f": 10,
- "A": 11,
- "a": 12,
- "w": 13,
- "j": 14,
- "U": 15,
- "W": 16,
- "Z": 17,
- "p": 18, # an additional key, only with I
- "z": 19,
- "G": 20,
- "V": 21,
- "u": 22}
-
-
-def array_strptime(
- ndarray[object] values,
- str fmt,
- bint exact=True,
- errors="raise",
- bint utc=False,
-):
- """
- Calculates the datetime structs represented by the passed array of strings
-
- Parameters
- ----------
- values : ndarray of string-like objects
- fmt : string-like regex
- exact : matches must be exact if True, search if False
- errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
- """
-
- cdef:
- Py_ssize_t i, n = len(values)
- npy_datetimestruct dts
- int64_t[::1] iresult
- object[::1] result_timezone
- int year, month, day, minute, hour, second, weekday, julian
- int week_of_year, week_of_year_start, parse_code, ordinal
- int iso_week, iso_year
- int64_t us, ns
- object val, group_key, ampm, found, tz
- bint is_raise = errors=="raise"
- bint is_ignore = errors=="ignore"
- bint is_coerce = errors=="coerce"
- bint found_naive = False
- bint found_tz = False
- tzinfo tz_out = None
- bint iso_format = format_is_iso(fmt)
- NPY_DATETIMEUNIT out_bestunit
- int out_local = 0, out_tzoffset = 0
- bint string_to_dts_succeeded = 0
-
- assert is_raise or is_ignore or is_coerce
-
- if "%W" in fmt or "%U" in fmt:
- if "%Y" not in fmt and "%y" not in fmt:
- raise ValueError("Cannot use '%W' or '%U' without day and year")
- if "%A" not in fmt and "%a" not in fmt and "%w" not in fmt:
- raise ValueError("Cannot use '%W' or '%U' without day and year")
- elif "%Z" in fmt and "%z" in fmt:
- raise ValueError("Cannot parse both %Z and %z")
- elif "%j" in fmt and "%G" in fmt:
- raise ValueError("Day of the year directive '%j' is not "
- "compatible with ISO year directive '%G'. "
- "Use '%Y' instead.")
- elif "%G" in fmt and (
- "%V" not in fmt
- or not (
- "%A" in fmt
- or "%a" in fmt
- or "%w" in fmt
- or "%u" in fmt
- )
- ):
- raise ValueError("ISO year directive '%G' must be used with "
- "the ISO week directive '%V' and a weekday "
- "directive '%A', '%a', '%w', or '%u'.")
- elif "%V" in fmt and "%Y" in fmt:
- raise ValueError("ISO week directive '%V' is incompatible with "
- "the year directive '%Y'. Use the ISO year "
- "'%G' instead.")
- elif "%V" in fmt and (
- "%G" not in fmt
- or not (
- "%A" in fmt
- or "%a" in fmt
- or "%w" in fmt
- or "%u" in fmt
- )
- ):
- raise ValueError("ISO week directive '%V' must be used with "
- "the ISO year directive '%G' and a weekday "
- "directive '%A', '%a', '%w', or '%u'.")
-
- global _TimeRE_cache, _regex_cache
- with _cache_lock:
- if _getlang() != _TimeRE_cache.locale_time.lang:
- _TimeRE_cache = TimeRE()
- _regex_cache.clear()
- if len(_regex_cache) > _CACHE_MAX_SIZE:
- _regex_cache.clear()
- locale_time = _TimeRE_cache.locale_time
- format_regex = _regex_cache.get(fmt)
- if not format_regex:
- try:
- format_regex = _TimeRE_cache.compile(fmt)
- # KeyError raised when a bad format is found; can be specified as
- # \\, in which case it was a stray % but with a space after it
-            except KeyError as err:
- bad_directive = err.args[0]
- if bad_directive == "\\":
- bad_directive = "%"
- del err
- raise ValueError(f"'{bad_directive}' is a bad directive "
- f"in format '{fmt}'")
- # IndexError only occurs when the format string is "%"
- except IndexError:
- raise ValueError(f"stray % in format '{fmt}'")
- _regex_cache[fmt] = format_regex
-
- result = np.empty(n, dtype="M8[ns]")
- iresult = result.view("i8")
- result_timezone = np.empty(n, dtype="object")
-
- dts.us = dts.ps = dts.as = 0
-
- for i in range(n):
- val = values[i]
- try:
- if isinstance(val, str):
- if len(val) == 0 or val in nat_strings:
- iresult[i] = NPY_NAT
- continue
- elif checknull_with_nat_and_na(val):
- iresult[i] = NPY_NAT
- continue
- elif PyDateTime_Check(val):
- if val.tzinfo is not None:
- found_tz = True
- else:
- found_naive = True
- tz_out = convert_timezone(
- val.tzinfo,
- tz_out,
- found_naive,
- found_tz,
- utc,
- )
- if isinstance(val, _Timestamp):
- iresult[i] = val.tz_localize(None).as_unit("ns")._value
- else:
- iresult[i] = pydatetime_to_dt64(val.replace(tzinfo=None), &dts)
- check_dts_bounds(&dts)
- result_timezone[i] = val.tzinfo
- continue
- elif PyDate_Check(val):
- iresult[i] = pydate_to_dt64(val, &dts)
- check_dts_bounds(&dts)
- continue
- elif is_datetime64_object(val):
- iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
- continue
- elif (
- (is_integer_object(val) or is_float_object(val))
- and (val != val or val == NPY_NAT)
- ):
- iresult[i] = NPY_NAT
- continue
- else:
- val = str(val)
-
- if fmt == "ISO8601":
- string_to_dts_succeeded = not string_to_dts(
- val, &dts, &out_bestunit, &out_local,
- &out_tzoffset, False, None, False
- )
- elif iso_format:
- string_to_dts_succeeded = not string_to_dts(
- val, &dts, &out_bestunit, &out_local,
- &out_tzoffset, False, fmt, exact
- )
- if string_to_dts_succeeded:
- # No error reported by string_to_dts, pick back up
- # where we left off
- value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
- if out_local == 1:
- # Store the out_tzoffset in seconds
- # since we store the total_seconds of
- # dateutil.tz.tzoffset objects
- tz = timezone(timedelta(minutes=out_tzoffset))
- result_timezone[i] = tz
- out_local = 0
- out_tzoffset = 0
- iresult[i] = value
- check_dts_bounds(&dts)
- continue
-
- if parse_today_now(val, &iresult[i], utc):
- continue
-
- # Some ISO formats can't be parsed by string_to_dts
- # For example, 6-digit YYYYMD. So, if there's an error, and a format
- # was specified, then try the string-matching code below. If the format
- # specified was 'ISO8601', then we need to error, because
- # only string_to_dts handles mixed ISO8601 formats.
- if not string_to_dts_succeeded and fmt == "ISO8601":
- raise ValueError(f"Time data {val} is not ISO8601 format")
-
- # exact matching
- if exact:
- found = format_regex.match(val)
- if not found:
- raise ValueError(
- f"time data \"{val}\" doesn't match format \"{fmt}\""
- )
- if len(val) != found.end():
- raise ValueError(
- "unconverted data remains when parsing with "
- f"format \"{fmt}\": \"{val[found.end():]}\""
- )
-
- # search
- else:
- found = format_regex.search(val)
- if not found:
- raise ValueError(
- f"time data \"{val}\" doesn't match format \"{fmt}\""
- )
-
- iso_year = -1
- year = 1900
- month = day = 1
- hour = minute = second = ns = us = 0
- tz = None
- # Default to -1 to signify that values not known; not critical to have,
- # though
- iso_week = week_of_year = -1
- week_of_year_start = -1
- # weekday and julian defaulted to -1 so as to signal need to calculate
- # values
- weekday = julian = -1
- found_dict = found.groupdict()
-            for group_key in found_dict.keys():
- # Directives not explicitly handled below:
- # c, x, X
- # handled by making out of other directives
- # U, W
- # worthless without day of the week
- parse_code = _parse_code_table[group_key]
-
- if parse_code == 0:
- year = int(found_dict["y"])
- # Open Group specification for strptime() states that a %y
- # value in the range of [00, 68] is in the century 2000, while
- # [69,99] is in the century 1900
- if year <= 68:
- year += 2000
- else:
- year += 1900
- elif parse_code == 1:
- year = int(found_dict["Y"])
- elif parse_code == 2:
- month = int(found_dict["m"])
- # elif group_key == 'B':
- elif parse_code == 3:
- month = locale_time.f_month.index(found_dict["B"].lower())
- # elif group_key == 'b':
- elif parse_code == 4:
- month = locale_time.a_month.index(found_dict["b"].lower())
- # elif group_key == 'd':
- elif parse_code == 5:
- day = int(found_dict["d"])
- # elif group_key == 'H':
- elif parse_code == 6:
- hour = int(found_dict["H"])
- elif parse_code == 7:
- hour = int(found_dict["I"])
- ampm = found_dict.get("p", "").lower()
- # If there was no AM/PM indicator, we'll treat this like AM
- if ampm in ("", locale_time.am_pm[0]):
- # We're in AM so the hour is correct unless we're
- # looking at 12 midnight.
- # 12 midnight == 12 AM == hour 0
- if hour == 12:
- hour = 0
- elif ampm == locale_time.am_pm[1]:
- # We're in PM so we need to add 12 to the hour unless
- # we're looking at 12 noon.
- # 12 noon == 12 PM == hour 12
- if hour != 12:
- hour += 12
- elif parse_code == 8:
- minute = int(found_dict["M"])
- elif parse_code == 9:
- second = int(found_dict["S"])
- elif parse_code == 10:
- s = found_dict["f"]
- # Pad to always return nanoseconds
- s += "0" * (9 - len(s))
-                    us = int(s)
- ns = us % 1000
- us = us // 1000
- elif parse_code == 11:
- weekday = locale_time.f_weekday.index(found_dict["A"].lower())
- elif parse_code == 12:
- weekday = locale_time.a_weekday.index(found_dict["a"].lower())
- elif parse_code == 13:
- weekday = int(found_dict["w"])
- if weekday == 0:
- weekday = 6
- else:
- weekday -= 1
- elif parse_code == 14:
- julian = int(found_dict["j"])
- elif parse_code == 15 or parse_code == 16:
- week_of_year = int(found_dict[group_key])
- if group_key == "U":
- # U starts week on Sunday.
- week_of_year_start = 6
- else:
- # W starts week on Monday.
- week_of_year_start = 0
- elif parse_code == 17:
- tz = pytz.timezone(found_dict["Z"])
- elif parse_code == 19:
- tz = parse_timezone_directive(found_dict["z"])
- elif parse_code == 20:
- iso_year = int(found_dict["G"])
- elif parse_code == 21:
- iso_week = int(found_dict["V"])
- elif parse_code == 22:
- weekday = int(found_dict["u"])
- weekday -= 1
-
- # If we know the wk of the year and what day of that wk, we can figure
- # out the Julian day of the year.
- if julian == -1 and weekday != -1:
- if week_of_year != -1:
- week_starts_Mon = week_of_year_start == 0
- julian = _calc_julian_from_U_or_W(year, week_of_year, weekday,
- week_starts_Mon)
- elif iso_year != -1 and iso_week != -1:
- year, julian = _calc_julian_from_V(iso_year, iso_week,
- weekday + 1)
- # Cannot pre-calculate date() since can change in Julian
- # calculation and thus could have different value for the day of the wk
- # calculation.
- if julian == -1:
- # Need to add 1 to result since first day of the year is 1, not
- # 0.
- ordinal = date(year, month, day).toordinal()
- julian = ordinal - date(year, 1, 1).toordinal() + 1
- else:
- # Assume that if they bothered to include Julian day it will
- # be accurate.
- datetime_result = date.fromordinal(
- (julian - 1) + date(year, 1, 1).toordinal())
- year = datetime_result.year
- month = datetime_result.month
- day = datetime_result.day
- if weekday == -1:
- weekday = date(year, month, day).weekday()
-
- dts.year = year
- dts.month = month
- dts.day = day
- dts.hour = hour
- dts.min = minute
- dts.sec = second
- dts.us = us
- dts.ps = ns * 1000
-
- iresult[i] = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
- check_dts_bounds(&dts)
-
- result_timezone[i] = tz
-
- except (ValueError, OutOfBoundsDatetime) as ex:
- ex.args = (
- f"{str(ex)}, at position {i}. You might want to try:\n"
- " - passing `format` if your strings have a consistent format;\n"
- " - passing `format='ISO8601'` if your strings are "
- "all ISO8601 but not necessarily in exactly the same format;\n"
- " - passing `format='mixed'`, and the format will be "
- "inferred for each element individually. "
- "You might want to use `dayfirst` alongside this.",
- )
- if is_coerce:
- iresult[i] = NPY_NAT
- continue
- elif is_raise:
- raise
- return values, []
-
- return result, result_timezone.base
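In practice this routine sits behind pandas.to_datetime when a format is supplied; a hedged sketch of the errors handling described in the docstring (pandas 2.x assumed):

import numpy as np
import pandas as pd

vals = np.array(["2020-01-31", "not a date"], dtype=object)

# errors="raise" (the default) would raise on the second element;
# errors="coerce" turns unparseable entries into NaT instead.
pd.to_datetime(vals, format="%Y-%m-%d", errors="coerce")
# DatetimeIndex(['2020-01-31', 'NaT'], dtype='datetime64[ns]', freq=None)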
-
-
-class TimeRE(_TimeRE):
- """
- Handle conversion from format directives to regexes.
-
- Creates regexes for pattern matching a string of text containing
- time information
- """
-
- def __init__(self, locale_time=None):
- """
- Create keys/values.
-
- Order of execution is important for dependency reasons.
- """
- self._Z = None
- super().__init__(locale_time=locale_time)
- # GH 48767: Overrides for cpython's TimeRE
- # 1) Parse up to nanos instead of micros
-        self.update({"f": r"(?P<f>[0-9]{1,9})"})
-
- def __getitem__(self, key):
- if key == "Z":
- # lazy computation
- if self._Z is None:
- self._Z = self.__seqToRE(pytz.all_timezones, "Z")
- # Note: handling Z is the key difference vs using the stdlib
- # _strptime.TimeRE. test_to_datetime_parse_tzname_or_tzoffset with
- # fmt='%Y-%m-%d %H:%M:%S %Z' fails with the stdlib version.
- return self._Z
- return super().__getitem__(key)
-
-
-_cache_lock = _thread_allocate_lock()
-# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
-# first!
-_TimeRE_cache = TimeRE()
-_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
-_regex_cache = {}
-
-
-cdef int _calc_julian_from_U_or_W(int year, int week_of_year,
- int day_of_week, int week_starts_Mon):
- """
- Calculate the Julian day based on the year, week of the year, and day of
- the week, with week_start_day representing whether the week of the year
- assumes the week starts on Sunday or Monday (6 or 0).
-
- Parameters
- ----------
- year : int
- the year
- week_of_year : int
- week taken from format U or W
- week_starts_Mon : int
- represents whether the week of the year
- assumes the week starts on Sunday or Monday (6 or 0)
-
- Returns
- -------
- int
- converted julian day
- """
-
- cdef:
- int first_weekday, week_0_length, days_to_week
-
- first_weekday = date(year, 1, 1).weekday()
- # If we are dealing with the %U directive (week starts on Sunday), it's
- # easier to just shift the view to Sunday being the first day of the
- # week.
- if not week_starts_Mon:
- first_weekday = (first_weekday + 1) % 7
- day_of_week = (day_of_week + 1) % 7
-
- # Need to watch out for a week 0 (when the first day of the year is not
- # the same as that specified by %U or %W).
- week_0_length = (7 - first_weekday) % 7
- if week_of_year == 0:
- return 1 + day_of_week - first_weekday
- else:
- days_to_week = week_0_length + (7 * (week_of_year - 1))
- return 1 + days_to_week + day_of_week
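A pure-Python mirror of the helper above, shown only to make the week arithmetic concrete (not part of the deleted module):

from datetime import date

def calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
    first_weekday = date(year, 1, 1).weekday()
    if not week_starts_Mon:                  # %U: shift so Sunday is day 0
        first_weekday = (first_weekday + 1) % 7
        day_of_week = (day_of_week + 1) % 7
    week_0_length = (7 - first_weekday) % 7
    if week_of_year == 0:
        return 1 + day_of_week - first_weekday
    return 1 + week_0_length + 7 * (week_of_year - 1) + day_of_week

# "%W" week 1, Monday, in 2003 (Jan 1 was a Wednesday) -> ordinal day 6,
# i.e. Monday 2003-01-06, the first Monday of that year.
print(calc_julian_from_U_or_W(2003, 1, 0, True))  # 6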
-
-
-cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday):
- """
- Calculate the Julian day based on the ISO 8601 year, week, and weekday.
-
- ISO weeks start on Mondays, with week 01 being the week containing 4 Jan.
- ISO week days range from 1 (Monday) to 7 (Sunday).
-
- Parameters
- ----------
- iso_year : int
- the year taken from format %G
- iso_week : int
- the week taken from format %V
- iso_weekday : int
- weekday taken from format %u
-
- Returns
- -------
- (int, int)
- the iso year and the Gregorian ordinal date / julian date
- """
-
- cdef:
- int correction, ordinal
-
- correction = date(iso_year, 1, 4).isoweekday() + 3
- ordinal = (iso_week * 7) + iso_weekday - correction
- # ordinal may be negative or 0 now, which means the date is in the previous
- # calendar year
- if ordinal < 1:
- ordinal += date(iso_year, 1, 1).toordinal()
- iso_year -= 1
- ordinal -= date(iso_year, 1, 1).toordinal()
- return iso_year, ordinal
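Likewise, a pure-Python mirror of the ISO-week variant, with one worked example (illustration only):

from datetime import date

def calc_julian_from_V(iso_year, iso_week, iso_weekday):
    correction = date(iso_year, 1, 4).isoweekday() + 3
    ordinal = iso_week * 7 + iso_weekday - correction
    if ordinal < 1:                 # the date falls in the previous calendar year
        ordinal += date(iso_year, 1, 1).toordinal()
        iso_year -= 1
        ordinal -= date(iso_year, 1, 1).toordinal()
    return iso_year, ordinal

# ISO 2020-W01-1 is Monday 2019-12-30, i.e. ordinal day 364 of 2019.
print(calc_julian_from_V(2020, 1, 1))  # (2019, 364)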
-
-
-cdef tzinfo parse_timezone_directive(str z):
- """
- Parse the '%z' directive and return a datetime.timezone object.
-
- Parameters
- ----------
- z : string of the UTC offset
-
- Returns
- -------
- datetime.timezone
-
- Notes
- -----
- This is essentially similar to the cpython implementation
- https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479
- """
-
- cdef:
- int hours, minutes, seconds, pad_number, microseconds
- int total_minutes
- object gmtoff_remainder, gmtoff_remainder_padding
-
- if z == "Z":
- return timezone(timedelta(0))
- if z[3] == ":":
- z = z[:3] + z[4:]
- if len(z) > 5:
- if z[5] != ":":
- raise ValueError(f"Inconsistent use of : in {z}")
- z = z[:5] + z[6:]
- hours = int(z[1:3])
- minutes = int(z[3:5])
- seconds = int(z[5:7] or 0)
-
- # Pad to always return microseconds.
- gmtoff_remainder = z[8:]
- pad_number = 6 - len(gmtoff_remainder)
- gmtoff_remainder_padding = "0" * pad_number
- microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)
-
- total_minutes = ((hours * 60) + minutes + (seconds // 60) +
- (microseconds // 60_000_000))
- total_minutes = -total_minutes if z.startswith("-") else total_minutes
- return timezone(timedelta(minutes=total_minutes))
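As the notes say, this closely follows the stdlib; the same kind of fixed-offset tzinfo falls out of datetime.strptime with %z, shown here only for comparison:

from datetime import datetime

dt = datetime.strptime("2020-01-01 00:00 +05:30", "%Y-%m-%d %H:%M %z")
print(dt.utcoffset())  # 5:30:00, i.e. timezone(timedelta(minutes=330))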
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pxd
deleted file mode 100644
index fb6e29a8932..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pxd
+++ /dev/null
@@ -1,28 +0,0 @@
-from cpython.datetime cimport timedelta
-from numpy cimport int64_t
-
-from .np_datetime cimport NPY_DATETIMEUNIT
-
-
-# Exposed for tslib, not intended for outside use.
-cpdef int64_t delta_to_nanoseconds(
- delta, NPY_DATETIMEUNIT reso=*, bint round_ok=*
-) except? -1
-cdef convert_to_timedelta64(object ts, str unit)
-cdef bint is_any_td_scalar(object obj)
-
-
-cdef class _Timedelta(timedelta):
- cdef readonly:
- int64_t _value # nanoseconds
- bint _is_populated # are my components populated
- int64_t _d, _h, _m, _s, _ms, _us, _ns
- NPY_DATETIMEUNIT _creso
-
- cpdef timedelta to_pytimedelta(_Timedelta self)
- cdef bint _has_ns(self)
- cdef bint _is_in_pytimedelta_bounds(self)
- cdef _ensure_components(_Timedelta self)
- cdef bint _compare_mismatched_resos(self, _Timedelta other, op)
- cdef _Timedelta _as_creso(self, NPY_DATETIMEUNIT reso, bint round_ok=*)
- cpdef _maybe_cast_to_matching_resos(self, _Timedelta other)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyi
deleted file mode 100644
index d67a330e0b0..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyi
+++ /dev/null
@@ -1,163 +0,0 @@
-from datetime import timedelta
-from typing import (
- ClassVar,
- Literal,
- TypeVar,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs import (
- NaTType,
- Tick,
-)
-from pandas._typing import npt
-
-# This should be kept consistent with the keys in the dict timedelta_abbrevs
-# in pandas/_libs/tslibs/timedeltas.pyx
-UnitChoices = Literal[
- "Y",
- "y",
- "M",
- "W",
- "w",
- "D",
- "d",
- "days",
- "day",
- "hours",
- "hour",
- "hr",
- "h",
- "m",
- "minute",
- "min",
- "minutes",
- "t",
- "s",
- "seconds",
- "sec",
- "second",
- "ms",
- "milliseconds",
- "millisecond",
- "milli",
- "millis",
- "l",
- "us",
- "microseconds",
- "microsecond",
- "µs",
- "micro",
- "micros",
- "u",
- "ns",
- "nanoseconds",
- "nano",
- "nanos",
- "nanosecond",
- "n",
-]
-_S = TypeVar("_S", bound=timedelta)
-
-def ints_to_pytimedelta(
- arr: npt.NDArray[np.timedelta64],
- box: bool = ...,
-) -> npt.NDArray[np.object_]: ...
-def array_to_timedelta64(
- values: npt.NDArray[np.object_],
- unit: str | None = ...,
- errors: str = ...,
-) -> np.ndarray: ... # np.ndarray[m8ns]
-def parse_timedelta_unit(unit: str | None) -> UnitChoices: ...
-def delta_to_nanoseconds(
- delta: np.timedelta64 | timedelta | Tick,
- reso: int = ..., # NPY_DATETIMEUNIT
- round_ok: bool = ...,
-) -> int: ...
-def floordiv_object_array(
- left: np.ndarray, right: npt.NDArray[np.object_]
-) -> np.ndarray: ...
-def truediv_object_array(
- left: np.ndarray, right: npt.NDArray[np.object_]
-) -> np.ndarray: ...
-
-class Timedelta(timedelta):
- _creso: int
- min: ClassVar[Timedelta]
- max: ClassVar[Timedelta]
- resolution: ClassVar[Timedelta]
- value: int # np.int64
- _value: int # np.int64
- # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
- def __new__( # type: ignore[misc]
- cls: type[_S],
- value=...,
- unit: str | None = ...,
- **kwargs: float | np.integer | np.floating,
- ) -> _S | NaTType: ...
- @classmethod
- def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ...
- @property
- def days(self) -> int: ...
- @property
- def seconds(self) -> int: ...
- @property
- def microseconds(self) -> int: ...
- def total_seconds(self) -> float: ...
- def to_pytimedelta(self) -> timedelta: ...
- def to_timedelta64(self) -> np.timedelta64: ...
- @property
- def asm8(self) -> np.timedelta64: ...
- # TODO: round/floor/ceil could return NaT?
- def round(self: _S, freq: str) -> _S: ...
- def floor(self: _S, freq: str) -> _S: ...
- def ceil(self: _S, freq: str) -> _S: ...
- @property
- def resolution_string(self) -> str: ...
- def __add__(self, other: timedelta) -> Timedelta: ...
- def __radd__(self, other: timedelta) -> Timedelta: ...
- def __sub__(self, other: timedelta) -> Timedelta: ...
- def __rsub__(self, other: timedelta) -> Timedelta: ...
- def __neg__(self) -> Timedelta: ...
- def __pos__(self) -> Timedelta: ...
- def __abs__(self) -> Timedelta: ...
- def __mul__(self, other: float) -> Timedelta: ...
- def __rmul__(self, other: float) -> Timedelta: ...
- # error: Signature of "__floordiv__" incompatible with supertype "timedelta"
- @overload # type: ignore[override]
- def __floordiv__(self, other: timedelta) -> int: ...
- @overload
- def __floordiv__(self, other: float) -> Timedelta: ...
- @overload
- def __floordiv__(
- self, other: npt.NDArray[np.timedelta64]
- ) -> npt.NDArray[np.intp]: ...
- @overload
- def __floordiv__(
- self, other: npt.NDArray[np.number]
- ) -> npt.NDArray[np.timedelta64] | Timedelta: ...
- @overload
- def __rfloordiv__(self, other: timedelta | str) -> int: ...
- @overload
- def __rfloordiv__(self, other: None | NaTType) -> NaTType: ...
- @overload
- def __rfloordiv__(self, other: np.ndarray) -> npt.NDArray[np.timedelta64]: ...
- @overload
- def __truediv__(self, other: timedelta) -> float: ...
- @overload
- def __truediv__(self, other: float) -> Timedelta: ...
- def __mod__(self, other: timedelta) -> Timedelta: ...
- def __divmod__(self, other: timedelta) -> tuple[int, Timedelta]: ...
- def __le__(self, other: timedelta) -> bool: ...
- def __lt__(self, other: timedelta) -> bool: ...
- def __ge__(self, other: timedelta) -> bool: ...
- def __gt__(self, other: timedelta) -> bool: ...
- def __hash__(self) -> int: ...
- def isoformat(self) -> str: ...
- def to_numpy(self) -> np.timedelta64: ...
- def view(self, dtype: npt.DTypeLike = ...) -> object: ...
- @property
- def unit(self) -> str: ...
- def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyx
deleted file mode 100644
index 955e1cf95e0..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timedeltas.pyx
+++ /dev/null
@@ -1,2171 +0,0 @@
-import collections
-import warnings
-
-cimport cython
-from cpython.object cimport (
- Py_EQ,
- Py_GE,
- Py_GT,
- Py_LE,
- Py_LT,
- Py_NE,
- PyObject,
- PyObject_RichCompare,
-)
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- ndarray,
-)
-
-cnp.import_array()
-
-from cpython.datetime cimport (
- PyDateTime_Check,
- PyDelta_Check,
- import_datetime,
- timedelta,
-)
-
-import_datetime()
-
-
-cimport pandas._libs.tslibs.util as util
-from pandas._libs.tslibs.base cimport ABCTimestamp
-from pandas._libs.tslibs.conversion cimport (
- cast_from_unit,
- precision_from_unit,
-)
-from pandas._libs.tslibs.dtypes cimport (
- get_supported_reso,
- npy_unit_to_abbrev,
-)
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
- c_nat_strings as nat_strings,
- checknull_with_nat,
-)
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- cmp_dtstructs,
- cmp_scalar,
- convert_reso,
- get_datetime64_unit,
- get_timedelta64_value,
- get_unit_from_dtype,
- npy_datetimestruct,
- pandas_datetime_to_datetimestruct,
- pandas_timedelta_to_timedeltastruct,
- pandas_timedeltastruct,
-)
-
-from pandas._libs.tslibs.np_datetime import (
- OutOfBoundsDatetime,
- OutOfBoundsTimedelta,
-)
-
-from pandas._libs.tslibs.offsets cimport is_tick_object
-from pandas._libs.tslibs.util cimport (
- is_array,
- is_datetime64_object,
- is_float_object,
- is_integer_object,
- is_timedelta64_object,
-)
-
-from pandas._libs.tslibs.fields import (
- RoundTo,
- round_nsint64,
-)
-
-# ----------------------------------------------------------------------
-# Constants
-
-# components named tuple
-Components = collections.namedtuple(
- "Components",
- [
- "days",
- "hours",
- "minutes",
- "seconds",
- "milliseconds",
- "microseconds",
- "nanoseconds",
- ],
-)
-
-# This should be kept consistent with UnitChoices in pandas/_libs/tslibs/timedeltas.pyi
-cdef dict timedelta_abbrevs = {
- "Y": "Y",
- "y": "Y",
- "M": "M",
- "W": "W",
- "w": "W",
- "D": "D",
- "d": "D",
- "days": "D",
- "day": "D",
- "hours": "h",
- "hour": "h",
- "hr": "h",
- "h": "h",
- "m": "m",
- "minute": "m",
- "min": "m",
- "minutes": "m",
- "t": "m",
- "s": "s",
- "seconds": "s",
- "sec": "s",
- "second": "s",
- "ms": "ms",
- "milliseconds": "ms",
- "millisecond": "ms",
- "milli": "ms",
- "millis": "ms",
- "l": "ms",
- "us": "us",
- "microseconds": "us",
- "microsecond": "us",
- "µs": "us",
- "micro": "us",
- "micros": "us",
- "u": "us",
- "ns": "ns",
- "nanoseconds": "ns",
- "nano": "ns",
- "nanos": "ns",
- "nanosecond": "ns",
- "n": "ns",
-}
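This alias table is what lets several spellings of a unit collapse to one canonical abbreviation. A hedged example through the public constructor (the exact set of accepted aliases varies by pandas version):

import pandas as pd

pd.Timedelta(90, unit="min") == pd.Timedelta(90, unit="minutes") == pd.Timedelta("1.5 hr")
# True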
-
-_no_input = object()
-
-
-# ----------------------------------------------------------------------
-# API
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def ints_to_pytimedelta(ndarray m8values, box=False):
- """
- convert an i8 repr to an ndarray of timedelta or Timedelta (if box ==
- True)
-
- Parameters
- ----------
- arr : ndarray[timedelta64]
- box : bool, default False
-
- Returns
- -------
- result : ndarray[object]
- array of Timedelta or timedeltas objects
- """
- cdef:
- NPY_DATETIMEUNIT reso = get_unit_from_dtype(m8values.dtype)
- Py_ssize_t i, n = m8values.size
- int64_t value
- object res_val
-
- # Note that `result` (and thus `result_flat`) is C-order and
- # `it` iterates C-order as well, so the iteration matches
- # See discussion at
- # github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
- ndarray result = cnp.PyArray_EMPTY(
- m8values.ndim, m8values.shape, cnp.NPY_OBJECT, 0
- )
- object[::1] res_flat = result.ravel() # should NOT be a copy
-
- ndarray arr = m8values.view("i8")
- cnp.flatiter it = cnp.PyArray_IterNew(arr)
-
- for i in range(n):
- # Analogous to: value = arr[i]
- value = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0]
-
- if value == NPY_NAT:
- res_val = <object>NaT
- else:
- if box:
- res_val = _timedelta_from_value_and_reso(Timedelta, value, reso=reso)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_ns:
- res_val = timedelta(microseconds=int(value) / 1000)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
- res_val = timedelta(microseconds=value)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
- res_val = timedelta(milliseconds=value)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
- res_val = timedelta(seconds=value)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
- res_val = timedelta(minutes=value)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_h:
- res_val = timedelta(hours=value)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_D:
- res_val = timedelta(days=value)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_W:
- res_val = timedelta(weeks=value)
- else:
- # Month, Year, NPY_FR_GENERIC, pico, femto, atto
- raise NotImplementedError(reso)
-
- # Note: we can index result directly instead of using PyArray_MultiIter_DATA
- # like we do for the other functions because result is known C-contiguous
- # and is the first argument to PyArray_MultiIterNew2. The usual pattern
- # does not seem to work with object dtype.
- # See discussion at
- # github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
- res_flat[i] = res_val
-
- cnp.PyArray_ITER_NEXT(it)
-
- return result
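One public route into this helper is TimedeltaIndex.to_pytimedelta(), which (in pandas 2.x) boils down to the box=False path and therefore returns stdlib timedelta objects at microsecond precision, while box=True yields pandas Timedelta objects. A hedged example:

import numpy as np
import pandas as pd

tdi = pd.to_timedelta(np.array([86_400_000_000_000, 1_000], dtype="m8[ns]"))
tdi.to_pytimedelta()
# object ndarray: [datetime.timedelta(days=1), datetime.timedelta(microseconds=1)]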
-
-
-# ----------------------------------------------------------------------
-
-
-cpdef int64_t delta_to_nanoseconds(
- delta,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
- bint round_ok=True,
-) except? -1:
- # Note: this will raise on timedelta64 with Y or M unit
-
- cdef:
- NPY_DATETIMEUNIT in_reso
- int64_t n
-
- if is_tick_object(delta):
- n = delta.n
- in_reso = delta._creso
-
- elif isinstance(delta, _Timedelta):
- n = delta._value
- in_reso = delta._creso
-
- elif is_timedelta64_object(delta):
- in_reso = get_datetime64_unit(delta)
- if in_reso == NPY_DATETIMEUNIT.NPY_FR_Y or in_reso == NPY_DATETIMEUNIT.NPY_FR_M:
- raise ValueError(
- "delta_to_nanoseconds does not support Y or M units, "
- "as their duration in nanoseconds is ambiguous."
- )
- n = get_timedelta64_value(delta)
-
- elif PyDelta_Check(delta):
- in_reso = NPY_DATETIMEUNIT.NPY_FR_us
- try:
- n = (
- delta.days * 24 * 3600 * 1_000_000
- + delta.seconds * 1_000_000
- + delta.microseconds
- )
- except OverflowError as err:
- raise OutOfBoundsTimedelta(*err.args) from err
-
- else:
- raise TypeError(type(delta))
-
- try:
- return convert_reso(n, in_reso, reso, round_ok=round_ok)
- except (OutOfBoundsDatetime, OverflowError) as err:
- # Catch OutOfBoundsDatetime bc convert_reso can call check_dts_bounds
- # for Y/M-resolution cases
- unit_str = npy_unit_to_abbrev(reso)
- raise OutOfBoundsTimedelta(
- f"Cannot cast {str(delta)} to unit={unit_str} without overflow."
- ) from err
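Whatever the input type, the result is a plain integer count of the target unit; for the default nanosecond resolution that is what Timedelta.value reports (illustrative):

import numpy as np
import pandas as pd
from datetime import timedelta

# One day, spelled three ways, is 86_400_000_000_000 nanoseconds each time:
print(pd.Timedelta(days=1).value)
print(pd.Timedelta(np.timedelta64(1, "D")).value)
print(pd.Timedelta(timedelta(days=1)).value)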
-
-
-@cython.overflowcheck(True)
-cdef object ensure_td64ns(object ts):
- """
- Overflow-safe implementation of td64.astype("m8[ns]")
-
- Parameters
- ----------
- ts : np.timedelta64
-
- Returns
- -------
- np.timedelta64[ns]
- """
- cdef:
- NPY_DATETIMEUNIT td64_unit
- int64_t td64_value, mult
- str unitstr
-
- td64_unit = get_datetime64_unit(ts)
- if (
- td64_unit != NPY_DATETIMEUNIT.NPY_FR_ns
- and td64_unit != NPY_DATETIMEUNIT.NPY_FR_GENERIC
- ):
- unitstr = npy_unit_to_abbrev(td64_unit)
-
- td64_value = get_timedelta64_value(ts)
-
- mult = precision_from_unit(unitstr)[0]
- try:
- # NB: cython#1381 this cannot be *=
- td64_value = td64_value * mult
- except OverflowError as err:
- raise OutOfBoundsTimedelta(ts) from err
-
- return np.timedelta64(td64_value, "ns")
-
- return ts
-
-
-cdef convert_to_timedelta64(object ts, str unit):
- """
- Convert an incoming object to a timedelta64 if possible.
- Before calling, unit must be standardized to avoid repeated unit conversion
-
- Handle these types of objects:
- - timedelta/Timedelta
- - timedelta64
- - an offset
- - np.int64 (with unit providing a possible modifier)
- - None/NaT
-
- Return an ns based int64
- """
- # Caller is responsible for checking unit not in ["Y", "y", "M"]
-
- if checknull_with_nat(ts):
- return np.timedelta64(NPY_NAT, "ns")
- elif isinstance(ts, _Timedelta):
- # already in the proper format
- if ts._creso != NPY_FR_ns:
- ts = ts.as_unit("ns").asm8
- else:
- ts = np.timedelta64(ts._value, "ns")
- elif is_timedelta64_object(ts):
- ts = ensure_td64ns(ts)
- elif is_integer_object(ts):
- if ts == NPY_NAT:
- return np.timedelta64(NPY_NAT, "ns")
- else:
- ts = _maybe_cast_from_unit(ts, unit)
- elif is_float_object(ts):
- ts = _maybe_cast_from_unit(ts, unit)
- elif isinstance(ts, str):
- if (len(ts) > 0 and ts[0] == "P") or (len(ts) > 1 and ts[:2] == "-P"):
- ts = parse_iso_format_string(ts)
- else:
- ts = parse_timedelta_string(ts)
- ts = np.timedelta64(ts, "ns")
- elif is_tick_object(ts):
- ts = np.timedelta64(ts.nanos, "ns")
-
- if PyDelta_Check(ts):
- ts = np.timedelta64(delta_to_nanoseconds(ts), "ns")
- elif not is_timedelta64_object(ts):
- raise TypeError(f"Invalid type for timedelta scalar: {type(ts)}")
- return ts.astype("timedelta64[ns]")
-
-
-cdef _maybe_cast_from_unit(ts, str unit):
- # caller is responsible for checking
- # assert unit not in ["Y", "y", "M"]
- try:
- ts = cast_from_unit(ts, unit)
- except OutOfBoundsDatetime as err:
- raise OutOfBoundsTimedelta(
- f"Cannot cast {ts} from {unit} to 'ns' without overflow."
- ) from err
-
- ts = np.timedelta64(ts, "ns")
- return ts
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def array_to_timedelta64(
- ndarray values, str unit=None, str errors="raise"
-) -> ndarray:
- # values is object-dtype, may be 2D
- """
- Convert an ndarray to an array of timedeltas. If errors == 'coerce',
- coerce non-convertible objects to NaT. Otherwise, raise.
-
- Returns
- -------
- np.ndarray[timedelta64ns]
- """
- # Caller is responsible for checking
- assert unit not in ["Y", "y", "M"]
-
- cdef:
- Py_ssize_t i, n = values.size
- ndarray result = np.empty((<object>values).shape, dtype="m8[ns]")
- object item
- int64_t ival
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, values)
- cnp.flatiter it
-
- if values.descr.type_num != cnp.NPY_OBJECT:
- # raise here otherwise we segfault below
- raise TypeError("array_to_timedelta64 'values' must have object dtype")
-
- if errors not in {"ignore", "raise", "coerce"}:
- raise ValueError("errors must be one of {'ignore', 'raise', or 'coerce'}")
-
- if unit is not None and errors != "coerce":
- it = cnp.PyArray_IterNew(values)
- for i in range(n):
- # Analogous to: item = values[i]
- item = cnp.PyArray_GETITEM(values, cnp.PyArray_ITER_DATA(it))
- if isinstance(item, str):
- raise ValueError(
- "unit must not be specified if the input contains a str"
- )
- cnp.PyArray_ITER_NEXT(it)
-
- # Usually, we have all strings. If so, we hit the fast path.
- # If this path fails, we try conversion a different way, and
- # this is where all of the error handling will take place.
- try:
- for i in range(n):
- # Analogous to: item = values[i]
- item = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- ival = _item_to_timedelta64_fastpath(item)
-
- # Analogous to: iresult[i] = ival
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- except (TypeError, ValueError):
- cnp.PyArray_MultiIter_RESET(mi)
-
- parsed_unit = parse_timedelta_unit(unit or "ns")
- for i in range(n):
- item = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- ival = _item_to_timedelta64(item, parsed_unit, errors)
-
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = ival
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return result
-
-
-cdef int64_t _item_to_timedelta64_fastpath(object item) except? -1:
- """
- See array_to_timedelta64.
- """
- if item is NaT:
- # we allow this check in the fast-path because NaT is a C-object
- # so this is an inexpensive check
- return NPY_NAT
- else:
- return parse_timedelta_string(item)
-
-
-cdef int64_t _item_to_timedelta64(
- object item,
- str parsed_unit,
- str errors
-) except? -1:
- """
- See array_to_timedelta64.
- """
- try:
- return get_timedelta64_value(convert_to_timedelta64(item, parsed_unit))
- except ValueError as err:
- if errors == "coerce":
- return NPY_NAT
- elif "unit abbreviation w/o a number" in str(err):
- # re-raise with more pertinent message
- msg = f"Could not convert '{item}' to NumPy timedelta"
- raise ValueError(msg) from err
- else:
- raise
-
-
-@cython.cpow(True)
-cdef int64_t parse_timedelta_string(str ts) except? -1:
- """
- Parse a regular format timedelta string. Return an int64_t (in ns)
- or raise a ValueError on an invalid parse.
- """
-
- cdef:
- unicode c
- bint neg = 0, have_dot = 0, have_value = 0, have_hhmmss = 0
- object current_unit = None
- int64_t result = 0, m = 0, r
- list number = [], frac = [], unit = []
-
- # neg : tracks if we have a leading negative for the value
- # have_dot : tracks if we are processing a dot (either post hhmmss or
- # inside an expression)
- # have_value : track if we have at least 1 leading unit
- # have_hhmmss : tracks if we have a regular format hh:mm:ss
-
- if len(ts) == 0 or ts in nat_strings:
- return NPY_NAT
-
- for c in ts:
-
- # skip whitespace / commas
- if c == " " or c == ",":
- pass
-
- # positive signs are ignored
- elif c == "+":
- pass
-
- # neg
- elif c == "-":
-
- if neg or have_value or have_hhmmss:
- raise ValueError("only leading negative signs are allowed")
-
- neg = 1
-
- # number (ascii codes)
- elif ord(c) >= 48 and ord(c) <= 57:
-
- if have_dot:
-
-                # we found a dot, but now it's just a fraction
- if len(unit):
- number.append(c)
- have_dot = 0
- else:
- frac.append(c)
-
- elif not len(unit):
- number.append(c)
-
- else:
- r = timedelta_from_spec(number, frac, unit)
- unit, number, frac = [], [c], []
-
- result += timedelta_as_neg(r, neg)
-
- # hh:mm:ss.
- elif c == ":":
-
- # we flip this off if we have a leading value
- if have_value:
- neg = 0
-
- # we are in the pattern hh:mm:ss pattern
- if len(number):
- if current_unit is None:
- current_unit = "h"
- m = 1000000000 * 3600
- elif current_unit == "h":
- current_unit = "m"
- m = 1000000000 * 60
- elif current_unit == "m":
- current_unit = "s"
- m = 1000000000
- r = <int64_t>int("".join(number)) * m
- result += timedelta_as_neg(r, neg)
- have_hhmmss = 1
- else:
- raise ValueError(f"expecting hh:mm:ss format, received: {ts}")
-
- unit, number = [], []
-
- # after the decimal point
- elif c == ".":
-
- if len(number) and current_unit is not None:
-
-                # by definition we had something like hh:mm:ss.xxx,
-                # so we need to evaluate the final (seconds) field of
-                # that hh:mm:ss (current_unit is 'm' at this point)
- if current_unit != "m":
- raise ValueError("expected hh:mm:ss format before .")
- m = 1000000000
- r = <int64_t>int("".join(number)) * m
- result += timedelta_as_neg(r, neg)
- have_value = 1
- unit, number, frac = [], [], []
-
- have_dot = 1
-
- # unit
- else:
- unit.append(c)
- have_value = 1
- have_dot = 0
-
- # we had a dot, but we have a fractional
-    # value since we have a unit
- if have_dot and len(unit):
- r = timedelta_from_spec(number, frac, unit)
- result += timedelta_as_neg(r, neg)
-
- # we have a dot as part of a regular format
- # e.g. hh:mm:ss.fffffff
- elif have_dot:
-
- if ((len(number) or len(frac)) and not len(unit)
- and current_unit is None):
- raise ValueError("no units specified")
-
- if len(frac) > 0 and len(frac) <= 3:
- m = 10**(3 -len(frac)) * 1000 * 1000
- elif len(frac) > 3 and len(frac) <= 6:
- m = 10**(6 -len(frac)) * 1000
- elif len(frac) > 6 and len(frac) <= 9:
- m = 10**(9 -len(frac))
- else:
- m = 1
- frac = frac[:9]
- r = <int64_t>int("".join(frac)) * m
- result += timedelta_as_neg(r, neg)
-
- # we have a regular format
- # we must have seconds at this point (hence the unit is still 'm')
- elif current_unit is not None:
- if current_unit != "m":
- raise ValueError("expected hh:mm:ss format")
- m = 1000000000
- r = <int64_t>int("".join(number)) * m
- result += timedelta_as_neg(r, neg)
-
- # we have a last abbreviation
- elif len(unit):
- if len(number):
- r = timedelta_from_spec(number, frac, unit)
- result += timedelta_as_neg(r, neg)
- else:
- raise ValueError("unit abbreviation w/o a number")
-
- # we only have symbols and no numbers
- elif len(number) == 0:
- raise ValueError("symbols w/o a number")
-
- # treat as nanoseconds
- # but only if we don't have anything else
- else:
- if have_value:
- raise ValueError("have leftover units")
- if len(number):
- r = timedelta_from_spec(number, frac, "ns")
- result += timedelta_as_neg(r, neg)
-
- return result
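A few string shapes the hand-rolled grammar above accepts, exercised through the public constructor (illustrative, not an exhaustive grammar):

import pandas as pd

pd.Timedelta("1 days 02:30:45.5")   # hh:mm:ss with a fractional second
pd.Timedelta("3h30min")             # number/unit pairs, no separators required
pd.Timedelta("-1.5us")              # single leading sign, fractional value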
-
-
-cdef int64_t timedelta_as_neg(int64_t value, bint neg):
- """
-
- Parameters
- ----------
- value : int64_t of the timedelta value
-    neg : bool, whether the value is negative
- """
- if neg:
- return -value
- return value
-
-
-cdef timedelta_from_spec(object number, object frac, object unit):
- """
-
- Parameters
- ----------
- number : a list of number digits
- frac : a list of frac digits
- unit : a list of unit characters
- """
- cdef:
- str n
-
- unit = "".join(unit)
- if unit in ["M", "Y", "y"]:
- raise ValueError(
- "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta "
- "values and are not supported."
- )
-
- unit = parse_timedelta_unit(unit)
-
- n = "".join(number) + "." + "".join(frac)
- return cast_from_unit(float(n), unit)
-
-
-cpdef inline str parse_timedelta_unit(str unit):
- """
- Parameters
- ----------
- unit : str or None
-
- Returns
- -------
- str
- Canonical unit string.
-
- Raises
- ------
- ValueError : on non-parseable input
- """
- if unit is None:
- return "ns"
- elif unit == "M":
- return unit
- try:
- return timedelta_abbrevs[unit.lower()]
- except KeyError:
- raise ValueError(f"invalid unit abbreviation: {unit}")
-
-# ----------------------------------------------------------------------
-# Timedelta ops utilities
-
-cdef bint _validate_ops_compat(other):
- # return True if we are compat with operating
- if checknull_with_nat(other):
- return True
- elif is_any_td_scalar(other):
- return True
- elif isinstance(other, str):
- return True
- return False
-
-
-def _op_unary_method(func, name):
- def f(self):
- new_value = func(self._value)
- return _timedelta_from_value_and_reso(Timedelta, new_value, self._creso)
- f.__name__ = name
- return f
-
-
-def _binary_op_method_timedeltalike(op, name):
- # define a binary operation that only works if the other argument is
- # timedelta like or an array of timedeltalike
- def f(self, other):
- if other is NaT:
- return NaT
-
- elif is_datetime64_object(other) or (
- PyDateTime_Check(other) and not isinstance(other, ABCTimestamp)
- ):
- # this case is for a datetime object that is specifically
- # *not* a Timestamp, as the Timestamp case will be
- # handled after `_validate_ops_compat` returns False below
- from pandas._libs.tslibs.timestamps import Timestamp
- return op(self, Timestamp(other))
- # We are implicitly requiring the canonical behavior to be
- # defined by Timestamp methods.
-
- elif is_array(other):
- if other.ndim == 0:
- # see also: item_from_zerodim
- item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
- return f(self, item)
-
- elif other.dtype.kind in ["m", "M"]:
- return op(self.to_timedelta64(), other)
- elif other.dtype.kind == "O":
- return np.array([op(self, x) for x in other])
- else:
- return NotImplemented
-
- elif not _validate_ops_compat(other):
- # Includes any of our non-cython classes
- return NotImplemented
-
- try:
- other = Timedelta(other)
- except ValueError:
- # failed to parse as timedelta
- return NotImplemented
-
- if other is NaT:
- # e.g. if original other was timedelta64('NaT')
- return NaT
-
- # Matching numpy, we cast to the higher resolution. Unlike numpy,
- # we raise instead of silently overflowing during this casting.
- if self._creso < other._creso:
- self = (<_Timedelta>self)._as_creso(other._creso, round_ok=True)
- elif self._creso > other._creso:
- other = (<_Timedelta>other)._as_creso(self._creso, round_ok=True)
-
- res = op(self._value, other._value)
- if res == NPY_NAT:
- # e.g. test_implementation_limits
- # TODO: more generally could do an overflowcheck in op?
- return NaT
-
- return _timedelta_from_value_and_reso(Timedelta, res, reso=self._creso)
-
- f.__name__ = name
- return f
-
-
-# ----------------------------------------------------------------------
-# Timedelta Construction
-
-cdef int64_t parse_iso_format_string(str ts) except? -1:
- """
- Extracts and cleanses the appropriate values from a match object with
- groups for each component of an ISO 8601 duration
-
- Parameters
- ----------
- ts: str
- ISO 8601 Duration formatted string
-
- Returns
- -------
- ns: int64_t
- Precision in nanoseconds of matched ISO 8601 duration
-
- Raises
- ------
- ValueError
- If ``ts`` cannot be parsed
- """
-
- cdef:
- unicode c
- int64_t result = 0, r
- int p = 0, sign = 1
- object dec_unit = "ms", err_msg
- bint have_dot = 0, have_value = 0, neg = 0
- list number = [], unit = []
-
- err_msg = f"Invalid ISO 8601 Duration format - {ts}"
-
- if ts[0] == "-":
- sign = -1
- ts = ts[1:]
-
- for c in ts:
- # number (ascii codes)
- if 48 <= ord(c) <= 57:
-
- have_value = 1
- if have_dot:
- if p == 3 and dec_unit != "ns":
- unit.append(dec_unit)
- if dec_unit == "ms":
- dec_unit = "us"
- elif dec_unit == "us":
- dec_unit = "ns"
- p = 0
- p += 1
-
- if not len(unit):
- number.append(c)
- else:
- r = timedelta_from_spec(number, "0", unit)
- result += timedelta_as_neg(r, neg)
-
- neg = 0
- unit, number = [], [c]
- else:
- if c == "P" or c == "T":
- pass # ignore marking characters P and T
- elif c == "-":
- if neg or have_value:
- raise ValueError(err_msg)
- else:
- neg = 1
- elif c == "+":
- pass
- elif c in ["W", "D", "H", "M"]:
- if c in ["H", "M"] and len(number) > 2:
- raise ValueError(err_msg)
- if c == "M":
- c = "min"
- unit.append(c)
- r = timedelta_from_spec(number, "0", unit)
- result += timedelta_as_neg(r, neg)
-
- neg = 0
- unit, number = [], []
- elif c == ".":
- # append any seconds
- if len(number):
- r = timedelta_from_spec(number, "0", "S")
- result += timedelta_as_neg(r, neg)
- unit, number = [], []
- have_dot = 1
- elif c == "S":
- if have_dot: # ms, us, or ns
- if not len(number) or p > 3:
- raise ValueError(err_msg)
- # pad to 3 digits as required
- pad = 3 - p
- while pad > 0:
- number.append("0")
- pad -= 1
-
- r = timedelta_from_spec(number, "0", dec_unit)
- result += timedelta_as_neg(r, neg)
- else: # seconds
- r = timedelta_from_spec(number, "0", "S")
- result += timedelta_as_neg(r, neg)
- else:
- raise ValueError(err_msg)
-
- if not have_value:
- # Received string only - never parsed any values
- raise ValueError(err_msg)
-
- return sign*result
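ISO 8601 duration strings reach this parser through the Timedelta constructor; a hedged example:

import pandas as pd

pd.Timedelta("P1DT2H3M4.005S")
# Timedelta('1 days 02:03:04.005000')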
-
-
-cdef _to_py_int_float(v):
- # Note: This used to be defined inside Timedelta.__new__
- # but cython will not allow `cdef` functions to be defined dynamically.
- if is_integer_object(v):
- return int(v)
- elif is_float_object(v):
- return float(v)
- raise TypeError(f"Invalid type {type(v)}. Must be int or float.")
-
-
-def _timedelta_unpickle(value, reso):
- return _timedelta_from_value_and_reso(Timedelta, value, reso)
-
-
-cdef _timedelta_from_value_and_reso(cls, int64_t value, NPY_DATETIMEUNIT reso):
- # Could make this a classmethod if/when cython supports cdef classmethods
- cdef:
- _Timedelta td_base
-
- assert value != NPY_NAT
- # For millisecond and second resos, we cannot actually pass int(value) because
- # many cases would fall outside of the pytimedelta implementation bounds.
- # We pass 0 instead, and override seconds, microseconds, days.
- # In principle we could pass 0 for ns and us too.
- if reso == NPY_FR_ns:
- td_base = _Timedelta.__new__(cls, microseconds=int(value) // 1000)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
- td_base = _Timedelta.__new__(cls, microseconds=int(value))
- elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
- td_base = _Timedelta.__new__(cls, milliseconds=0)
- elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
- td_base = _Timedelta.__new__(cls, seconds=0)
- # Other resolutions are disabled but could potentially be implemented here:
- # elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
- # td_base = _Timedelta.__new__(Timedelta, minutes=int(value))
- # elif reso == NPY_DATETIMEUNIT.NPY_FR_h:
- # td_base = _Timedelta.__new__(Timedelta, hours=int(value))
- # elif reso == NPY_DATETIMEUNIT.NPY_FR_D:
- # td_base = _Timedelta.__new__(Timedelta, days=int(value))
- else:
- raise NotImplementedError(
- "Only resolutions 's', 'ms', 'us', 'ns' are supported."
- )
-
- td_base._value= value
- td_base._is_populated = 0
- td_base._creso = reso
- return td_base
-
-
-class MinMaxReso:
- """
- We need to define min/max/resolution on both the Timedelta _instance_
- and Timedelta class. On an instance, these depend on the object's _reso.
- On the class, we default to the values we would get with nanosecond _reso.
- """
- def __init__(self, name):
- self._name = name
-
- def __get__(self, obj, type=None):
- if self._name == "min":
- val = np.iinfo(np.int64).min + 1
- elif self._name == "max":
- val = np.iinfo(np.int64).max
- else:
- assert self._name == "resolution"
- val = 1
-
- if obj is None:
- # i.e. this is on the class, default to nanos
- return Timedelta(val)
- else:
- return Timedelta._from_value_and_reso(val, obj._creso)
-
- def __set__(self, obj, value):
- raise AttributeError(f"{self._name} is not settable.")
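On the class itself the descriptor falls back to nanosecond resolution, which pins min/max to the int64 bounds used above; a hedged check:

import numpy as np
import pandas as pd

print(pd.Timedelta.max.value == np.iinfo(np.int64).max)       # True
print(pd.Timedelta.min.value == np.iinfo(np.int64).min + 1)   # True
print(pd.Timedelta.resolution == pd.Timedelta(1, unit="ns"))  # True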
-
-
-# Similar to Timestamp/datetime, this is a construction requirement for
-# timedeltas that we need to do object instantiation in python. This will
-# serve as a C extension type that shadows the Python class, where we do any
-# heavy lifting.
-cdef class _Timedelta(timedelta):
- # cdef readonly:
- # int64_t value # nanoseconds
- # bint _is_populated # are my components populated
- # int64_t _d, _h, _m, _s, _ms, _us, _ns
- # NPY_DATETIMEUNIT _reso
-
- # higher than np.ndarray and np.matrix
- __array_priority__ = 100
- min = MinMaxReso("min")
- max = MinMaxReso("max")
- resolution = MinMaxReso("resolution")
-
- @property
- def value(self):
- try:
- return convert_reso(self._value, self._creso, NPY_FR_ns, False)
- except OverflowError:
- raise OverflowError(
- "Cannot convert Timedelta to nanoseconds without overflow. "
-                "Use `.asm8.view('i8')` to represent Timedelta in its own "
- f"unit (here, {self.unit})."
- )
-
- @property
- def _unit(self) -> str:
- """
- The abbreviation associated with self._creso.
- """
- return npy_unit_to_abbrev(self._creso)
-
- @property
- def days(self) -> int: # TODO(cython3): make cdef property
- """
- Returns the days of the timedelta.
-
- Returns
- -------
- int
-
- Examples
- --------
- >>> td = pd.Timedelta(1, "d")
- >>> td.days
- 1
-
- >>> td = pd.Timedelta('4 min 3 us 42 ns')
- >>> td.days
- 0
- """
- # NB: using the python C-API PyDateTime_DELTA_GET_DAYS will fail
- # (or be incorrect)
- self._ensure_components()
- return self._d
-
- @property
- def seconds(self) -> int: # TODO(cython3): make cdef property
- """
- Return the total hours, minutes, and seconds of the timedelta as seconds.
-
- Timedelta.seconds = hours * 3600 + minutes * 60 + seconds.
-
- Returns
- -------
- int
- Number of seconds.
-
- See Also
- --------
- Timedelta.components : Return all attributes with assigned values
- (i.e. days, hours, minutes, seconds, milliseconds, microseconds,
- nanoseconds).
-
- Examples
- --------
- **Using string input**
-
- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
- >>> td.seconds
- 120
-
- **Using integer input**
-
- >>> td = pd.Timedelta(42, unit='s')
- >>> td.seconds
- 42
- """
- # NB: using the python C-API PyDateTime_DELTA_GET_SECONDS will fail
- # (or be incorrect)
- self._ensure_components()
- return self._h * 3600 + self._m * 60 + self._s
-
- @property
- def microseconds(self) -> int: # TODO(cython3): make cdef property
- # NB: using the python C-API PyDateTime_DELTA_GET_MICROSECONDS will fail
- # (or be incorrect)
- self._ensure_components()
- return self._ms * 1000 + self._us
-
- def total_seconds(self) -> float:
- """Total seconds in the duration."""
- # We need to override because we overrode days/seconds/microseconds
- # TODO: add nanos/1e9?
- return self.days * 24 * 3600 + self.seconds + self.microseconds / 1_000_000
-
- @property
- def unit(self) -> str:
- return npy_unit_to_abbrev(self._creso)
-
- def __hash__(_Timedelta self):
- if self._has_ns():
- # Note: this does *not* satisfy the invariance
- # td1 == td2 \\Rightarrow hash(td1) == hash(td2)
- # if td1 and td2 have different _resos. timedelta64 also has this
- # non-invariant behavior.
- # see GH#44504
- return hash(self._value)
- elif self._is_in_pytimedelta_bounds() and (
- self._creso == NPY_FR_ns or self._creso == NPY_DATETIMEUNIT.NPY_FR_us
- ):
- # If we can defer to timedelta.__hash__, do so, as that
- # ensures the hash is invariant to our _reso.
- # We can only defer for ns and us, as for these two resos we
- # call _Timedelta.__new__ with the correct input in
- # _timedelta_from_value_and_reso; so timedelta.__hash__
- # will be correct
- return timedelta.__hash__(self)
- else:
- # We want to ensure that two equivalent Timedelta objects
- # have the same hash. So we try casting to the next finer
- # resolution.
- try:
- obj = (<_Timedelta>self)._as_creso(<NPY_DATETIMEUNIT>(self._creso + 1))
- except OverflowError:
- # Doesn't fit, so we're off the hook
- return hash(self._value)
- else:
- return hash(obj)
-
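A quick illustrative check, not from the deleted file, of the hash invariant the comments above describe: equal Timedeltas hash equal even across resolutions (pandas 2.x behaviour assumed).

>>> import pandas as pd
>>> td_s = pd.Timedelta(1, "s").as_unit("s")    # second resolution
>>> td_ns = pd.Timedelta(1, "s")                # nanosecond resolution
>>> td_s == td_ns and hash(td_s) == hash(td_ns)
True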
- def __richcmp__(_Timedelta self, object other, int op):
- cdef:
- _Timedelta ots
-
- if isinstance(other, _Timedelta):
- ots = other
- elif is_any_td_scalar(other):
- try:
- ots = Timedelta(other)
- except OutOfBoundsTimedelta as err:
- # GH#49021 pytimedelta.max overflows
- if not PyDelta_Check(other):
- # TODO: handle this case
- raise
- ltup = (self.days, self.seconds, self.microseconds, self.nanoseconds)
- rtup = (other.days, other.seconds, other.microseconds, 0)
- if op == Py_EQ:
- return ltup == rtup
- elif op == Py_NE:
- return ltup != rtup
- elif op == Py_LT:
- return ltup < rtup
- elif op == Py_LE:
- return ltup <= rtup
- elif op == Py_GT:
- return ltup > rtup
- elif op == Py_GE:
- return ltup >= rtup
-
- elif other is NaT:
- return op == Py_NE
-
- elif util.is_array(other):
- if other.dtype.kind == "m":
- return PyObject_RichCompare(self.asm8, other, op)
- elif other.dtype.kind == "O":
- # operate element-wise
- return np.array(
- [PyObject_RichCompare(self, x, op) for x in other],
- dtype=bool,
- )
- if op == Py_EQ:
- return np.zeros(other.shape, dtype=bool)
- elif op == Py_NE:
- return np.ones(other.shape, dtype=bool)
- return NotImplemented # let other raise TypeError
-
- else:
- return NotImplemented
-
- if self._creso == ots._creso:
- return cmp_scalar(self._value, ots._value, op)
- return self._compare_mismatched_resos(ots, op)
-
- # TODO: re-use/share with Timestamp
- cdef bint _compare_mismatched_resos(self, _Timedelta other, op):
- # Can't just dispatch to numpy as they silently overflow and get it wrong
- cdef:
- npy_datetimestruct dts_self
- npy_datetimestruct dts_other
-
- # dispatch to the datetimestruct utils instead of writing new ones!
- pandas_datetime_to_datetimestruct(self._value, self._creso, &dts_self)
- pandas_datetime_to_datetimestruct(other._value, other._creso, &dts_other)
- return cmp_dtstructs(&dts_self, &dts_other, op)
-
- cdef bint _has_ns(self):
- if self._creso == NPY_FR_ns:
- return self._value % 1000 != 0
- elif self._creso < NPY_FR_ns:
- # i.e. seconds, millisecond, microsecond
- return False
- else:
- raise NotImplementedError(self._creso)
-
- cdef bint _is_in_pytimedelta_bounds(self):
- """
- Check if we are within the bounds of datetime.timedelta.
- """
- self._ensure_components()
- return -999999999 <= self._d and self._d <= 999999999
-
- cdef _ensure_components(_Timedelta self):
- """
- compute the components
- """
- if self._is_populated:
- return
-
- cdef:
- pandas_timedeltastruct tds
-
- pandas_timedelta_to_timedeltastruct(self._value, self._creso, &tds)
- self._d = tds.days
- self._h = tds.hrs
- self._m = tds.min
- self._s = tds.sec
- self._ms = tds.ms
- self._us = tds.us
- self._ns = tds.ns
- self._seconds = tds.seconds
- self._microseconds = tds.microseconds
-
- self._is_populated = 1
-
- cpdef timedelta to_pytimedelta(_Timedelta self):
- """
- Convert a pandas Timedelta object into a python ``datetime.timedelta`` object.
-
- Timedelta objects are internally saved as numpy timedelta64[ns] dtype.
- Use to_pytimedelta() to convert to object dtype.
-
- Returns
- -------
- datetime.timedelta or numpy.array of datetime.timedelta
-
- See Also
- --------
- to_timedelta : Convert argument to Timedelta type.
-
- Notes
- -----
- Any nanosecond resolution will be lost.
- """
- if self._creso == NPY_FR_ns:
- return timedelta(microseconds=int(self._value) / 1000)
-
- # TODO(@WillAyd): is this the right way to use components?
- self._ensure_components()
- return timedelta(
- days=self._d, seconds=self._seconds, microseconds=self._microseconds
- )
-
- def to_timedelta64(self) -> np.timedelta64:
- """
- Return a numpy.timedelta64 object with the same resolution as the Timedelta.
- """
- cdef:
- str abbrev = npy_unit_to_abbrev(self._creso)
- # TODO: way to create a np.timedelta64 obj with the reso directly
- # instead of having to get the abbrev?
- return np.timedelta64(self._value, abbrev)
-
- def to_numpy(self, dtype=None, copy=False) -> np.timedelta64:
- """
- Convert the Timedelta to a NumPy timedelta64.
-
- This is an alias method for `Timedelta.to_timedelta64()`. The dtype and
- copy parameters are available here only for compatibility. Their values
- will not affect the return value.
-
- Returns
- -------
- numpy.timedelta64
-
- See Also
- --------
- Series.to_numpy : Similar method for Series.
- """
- if dtype is not None or copy is not False:
- raise ValueError(
- "Timedelta.to_numpy dtype and copy arguments are ignored"
- )
- return self.to_timedelta64()
-
- def view(self, dtype):
- """
- Array view compatibility.
-
- Parameters
- ----------
- dtype : str or dtype
- The dtype to view the underlying data as.
- """
- return np.timedelta64(self._value).view(dtype)
-
- @property
- def components(self):
- """
- Return a components namedtuple-like.
-
- Examples
- --------
- >>> td = pd.Timedelta('2 day 4 min 3 us 42 ns')
- >>> td.components
- Components(days=2, hours=0, minutes=4, seconds=0, milliseconds=0,
- microseconds=3, nanoseconds=42)
- """
- self._ensure_components()
- # return the named tuple
- return Components(self._d, self._h, self._m, self._s,
- self._ms, self._us, self._ns)
-
- @property
- def asm8(self) -> np.timedelta64:
- """
- Return a numpy timedelta64 array scalar view.
-
- Provides access to the array scalar view (i.e. a combination of the
- value and the units) associated with the numpy.timedelta64().view(),
- including a 64-bit integer representation of the timedelta in
- nanoseconds (Python int compatible).
-
- Returns
- -------
- numpy timedelta64 array scalar view
- Array scalar view of the timedelta in nanoseconds.
-
- Examples
- --------
- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
- >>> td.asm8
- numpy.timedelta64(86520000003042,'ns')
-
- >>> td = pd.Timedelta('2 min 3 s')
- >>> td.asm8
- numpy.timedelta64(123000000000,'ns')
-
- >>> td = pd.Timedelta('3 ms 5 us')
- >>> td.asm8
- numpy.timedelta64(3005000,'ns')
-
- >>> td = pd.Timedelta(42, unit='ns')
- >>> td.asm8
- numpy.timedelta64(42,'ns')
- """
- return self.to_timedelta64()
-
- @property
- def resolution_string(self) -> str:
- """
- Return a string representing the lowest timedelta resolution.
-
- Each timedelta has a defined resolution that represents its finest
- (most granular) level of precision. Each level of resolution is
- represented by a short string as defined below:
-
- Resolution: Return value
-
- * Days: 'D'
- * Hours: 'H'
- * Minutes: 'T'
- * Seconds: 'S'
- * Milliseconds: 'L'
- * Microseconds: 'U'
- * Nanoseconds: 'N'
-
- Returns
- -------
- str
- Timedelta resolution.
-
- Examples
- --------
- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
- >>> td.resolution_string
- 'N'
-
- >>> td = pd.Timedelta('1 days 2 min 3 us')
- >>> td.resolution_string
- 'U'
-
- >>> td = pd.Timedelta('2 min 3 s')
- >>> td.resolution_string
- 'S'
-
- >>> td = pd.Timedelta(36, unit='us')
- >>> td.resolution_string
- 'U'
- """
- self._ensure_components()
- if self._ns:
- return "N"
- elif self._us:
- return "U"
- elif self._ms:
- return "L"
- elif self._s:
- return "S"
- elif self._m:
- return "T"
- elif self._h:
- return "H"
- else:
- return "D"
-
- @property
- def nanoseconds(self):
- """
- Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
-
- Returns
- -------
- int
- Number of nanoseconds.
-
- See Also
- --------
- Timedelta.components : Return all attributes with assigned values
- (i.e. days, hours, minutes, seconds, milliseconds, microseconds,
- nanoseconds).
-
- Examples
- --------
- **Using string input**
-
- >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns')
-
- >>> td.nanoseconds
- 42
-
- **Using integer input**
-
- >>> td = pd.Timedelta(42, unit='ns')
- >>> td.nanoseconds
- 42
- """
- self._ensure_components()
- return self._ns
-
- def _repr_base(self, format=None) -> str:
- """
-
- Parameters
- ----------
- format : None|all|sub_day|long
-
- Returns
- -------
- converted : string of a Timedelta
-
- """
- cdef:
- str sign, fmt
- dict comp_dict
- object subs
-
- self._ensure_components()
-
- if self._d < 0:
- sign = " +"
- else:
- sign = " "
-
- if format == "all":
- fmt = ("{days} days{sign}{hours:02}:{minutes:02}:{seconds:02}."
- "{milliseconds:03}{microseconds:03}{nanoseconds:03}")
- else:
- # if we have a partial day
- subs = (self._h or self._m or self._s or
- self._ms or self._us or self._ns)
-
- if self._ms or self._us or self._ns:
- seconds_fmt = "{seconds:02}.{milliseconds:03}{microseconds:03}"
- if self._ns:
- # GH#9309
- seconds_fmt += "{nanoseconds:03}"
- else:
- seconds_fmt = "{seconds:02}"
-
- if format == "sub_day" and not self._d:
- fmt = "{hours:02}:{minutes:02}:" + seconds_fmt
- elif subs or format == "long":
- fmt = "{days} days{sign}{hours:02}:{minutes:02}:" + seconds_fmt
- else:
- fmt = "{days} days"
-
- comp_dict = self.components._asdict()
- comp_dict["sign"] = sign
-
- return fmt.format(**comp_dict)
-
- def __repr__(self) -> str:
- repr_based = self._repr_base(format="long")
- return f"Timedelta('{repr_based}')"
-
- def __str__(self) -> str:
- return self._repr_base(format="long")
-
- def __bool__(self) -> bool:
- return self._value != 0
-
- def isoformat(self) -> str:
- """
- Format the Timedelta as ISO 8601 Duration.
-
- ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the
- values. See https://en.wikipedia.org/wiki/ISO_8601#Durations.
-
- Returns
- -------
- str
-
- See Also
- --------
- Timestamp.isoformat : Function is used to convert the given
- Timestamp object into the ISO format.
-
- Notes
- -----
- The longest component is days, whose value may be larger than
- 365.
- Every component is always included, even if its value is 0.
- Pandas uses nanosecond precision, so up to 9 decimal places may
- be included in the seconds component.
- Trailing 0's are removed from the seconds component after the decimal.
- We do not 0 pad components, so it's `...T5H...`, not `...T05H...`
-
- Examples
- --------
- >>> td = pd.Timedelta(days=6, minutes=50, seconds=3,
- ... milliseconds=10, microseconds=10, nanoseconds=12)
-
- >>> td.isoformat()
- 'P6DT0H50M3.010010012S'
- >>> pd.Timedelta(hours=1, seconds=10).isoformat()
- 'P0DT1H0M10S'
- >>> pd.Timedelta(days=500.5).isoformat()
- 'P500DT12H0M0S'
- """
- components = self.components
- seconds = (f"{components.seconds}."
- f"{components.milliseconds:0>3}"
- f"{components.microseconds:0>3}"
- f"{components.nanoseconds:0>3}")
- # Trim unnecessary 0s, 1.000000000 -> 1
- seconds = seconds.rstrip("0").rstrip(".")
- tpl = (f"P{components.days}DT{components.hours}"
- f"H{components.minutes}M{seconds}S")
- return tpl
-
- # ----------------------------------------------------------------
- # Constructors
-
- @classmethod
- def _from_value_and_reso(cls, int64_t value, NPY_DATETIMEUNIT reso):
- # exposing as classmethod for testing
- return _timedelta_from_value_and_reso(cls, value, reso)
-
- def as_unit(self, str unit, bint round_ok=True):
- """
- Convert the underlying int64 representation to the given unit.
-
- Parameters
- ----------
- unit : {"ns", "us", "ms", "s"}
- round_ok : bool, default True
- If False and the conversion requires rounding, raise.
-
- Returns
- -------
- Timedelta
- """
- dtype = np.dtype(f"m8[{unit}]")
- reso = get_unit_from_dtype(dtype)
- return self._as_creso(reso, round_ok=round_ok)
-
- @cython.cdivision(False)
- cdef _Timedelta _as_creso(self, NPY_DATETIMEUNIT reso, bint round_ok=True):
- cdef:
- int64_t value
-
- if reso == self._creso:
- return self
-
- try:
- value = convert_reso(self._value, self._creso, reso, round_ok=round_ok)
- except OverflowError as err:
- unit = npy_unit_to_abbrev(reso)
- raise OutOfBoundsTimedelta(
- f"Cannot cast {self} to unit='{unit}' without overflow."
- ) from err
-
- return type(self)._from_value_and_reso(value, reso=reso)
-
- cpdef _maybe_cast_to_matching_resos(self, _Timedelta other):
- """
- If _resos do not match, cast to the higher resolution, raising on overflow.
- """
- if self._creso > other._creso:
- other = other._as_creso(self._creso)
- elif self._creso < other._creso:
- self = self._as_creso(other._creso)
- return self, other
-
-
-# Python front end to C extension type _Timedelta
-# This serves as the box for timedelta64
-
-class Timedelta(_Timedelta):
- """
- Represents a duration, the difference between two dates or times.
-
- Timedelta is the pandas equivalent of python's ``datetime.timedelta``
- and is interchangeable with it in most cases.
-
- Parameters
- ----------
- value : Timedelta, timedelta, np.timedelta64, str, or int
- unit : str, default 'ns'
- Denote the unit of the input, if input is an integer.
-
- Possible values:
-
- * 'W', 'D', 'T', 'S', 'L', 'U', or 'N'
- * 'days' or 'day'
- * 'hours', 'hour', 'hr', or 'h'
- * 'minutes', 'minute', 'min', or 'm'
- * 'seconds', 'second', or 'sec'
- * 'milliseconds', 'millisecond', 'millis', or 'milli'
- * 'microseconds', 'microsecond', 'micros', or 'micro'
- * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'.
-
- **kwargs
- Available kwargs: {days, seconds, microseconds,
- milliseconds, minutes, hours, weeks}.
- Values for construction, compatible with datetime.timedelta.
- NumPy ints and floats will be coerced to Python ints and floats.
-
- Notes
- -----
- The constructor may take either a value (with an optional unit) or
- kwargs as above; one of the two forms must be used during initialization.
-
- The ``.value`` attribute is always in ns.
-
- If the precision is higher than nanoseconds, the precision of the duration is
- truncated to nanoseconds.
-
- Examples
- --------
- Here we initialize the Timedelta object with both value and unit
-
- >>> td = pd.Timedelta(1, "d")
- >>> td
- Timedelta('1 days 00:00:00')
-
- Here we initialize the Timedelta object with kwargs
-
- >>> td2 = pd.Timedelta(days=1)
- >>> td2
- Timedelta('1 days 00:00:00')
-
- We see that either way we get the same result
- """
-
- _req_any_kwargs_new = {"weeks", "days", "hours", "minutes", "seconds",
- "milliseconds", "microseconds", "nanoseconds"}
-
- def __new__(cls, object value=_no_input, unit=None, **kwargs):
- if value is _no_input:
- if not len(kwargs):
- raise ValueError("cannot construct a Timedelta without a "
- "value/unit or descriptive keywords "
- "(days,seconds....)")
-
- kwargs = {key: _to_py_int_float(kwargs[key]) for key in kwargs}
-
- unsupported_kwargs = set(kwargs)
- unsupported_kwargs.difference_update(cls._req_any_kwargs_new)
- if unsupported_kwargs or not cls._req_any_kwargs_new.intersection(kwargs):
- raise ValueError(
- "cannot construct a Timedelta from the passed arguments, "
- "allowed keywords are "
- "[weeks, days, hours, minutes, seconds, "
- "milliseconds, microseconds, nanoseconds]"
- )
-
- # GH43764, convert any input to nanoseconds first and then
- # create the timedelta. This ensures that any potential
- # nanosecond contributions from kwargs parsed as floats
- # are taken into consideration.
- seconds = int((
- (
- (kwargs.get("days", 0) + kwargs.get("weeks", 0) * 7) * 24
- + kwargs.get("hours", 0)
- ) * 3600
- + kwargs.get("minutes", 0) * 60
- + kwargs.get("seconds", 0)
- ) * 1_000_000_000
- )
-
- value = np.timedelta64(
- int(kwargs.get("nanoseconds", 0))
- + int(kwargs.get("microseconds", 0) * 1_000)
- + int(kwargs.get("milliseconds", 0) * 1_000_000)
- + seconds
- )
-
- if unit in {"Y", "y", "M"}:
- raise ValueError(
- "Units 'M', 'Y', and 'y' are no longer supported, as they do not "
- "represent unambiguous timedelta durations."
- )
-
- # GH 30543 if pd.Timedelta already passed, return it
- # check that only value is passed
- if isinstance(value, _Timedelta):
- # 'unit' is benign in this case, but e.g. days or seconds
- # doesn't make sense here.
- if len(kwargs):
- # GH#48898
- raise ValueError(
- "Cannot pass both a Timedelta input and timedelta keyword "
- "arguments, got "
- f"{list(kwargs.keys())}"
- )
- return value
- elif isinstance(value, str):
- if unit is not None:
- raise ValueError("unit must not be specified if the value is a str")
- if (len(value) > 0 and value[0] == "P") or (
- len(value) > 1 and value[:2] == "-P"
- ):
- value = parse_iso_format_string(value)
- else:
- value = parse_timedelta_string(value)
- value = np.timedelta64(value)
- elif PyDelta_Check(value):
- # pytimedelta object -> microsecond resolution
- new_value = delta_to_nanoseconds(
- value, reso=NPY_DATETIMEUNIT.NPY_FR_us
- )
- return cls._from_value_and_reso(
- new_value, reso=NPY_DATETIMEUNIT.NPY_FR_us
- )
- elif is_timedelta64_object(value):
- # Retain the resolution if possible, otherwise cast to the nearest
- # supported resolution.
- new_value = get_timedelta64_value(value)
- if new_value == NPY_NAT:
- # i.e. np.timedelta64("NaT")
- return NaT
-
- reso = get_datetime64_unit(value)
- new_reso = get_supported_reso(reso)
- if reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC:
- try:
- new_value = convert_reso(
- new_value,
- reso,
- new_reso,
- round_ok=True,
- )
- except (OverflowError, OutOfBoundsDatetime) as err:
- raise OutOfBoundsTimedelta(value) from err
- return cls._from_value_and_reso(new_value, reso=new_reso)
-
- elif is_tick_object(value):
- new_reso = get_supported_reso(value._creso)
- new_value = delta_to_nanoseconds(value, reso=new_reso)
- return cls._from_value_and_reso(new_value, reso=new_reso)
-
- elif is_integer_object(value) or is_float_object(value):
- # unit=None is de-facto 'ns'
- unit = parse_timedelta_unit(unit)
- value = convert_to_timedelta64(value, unit)
- elif checknull_with_nat(value):
- return NaT
- else:
- raise ValueError(
- "Value must be Timedelta, string, integer, "
- f"float, timedelta or convertible, not {type(value).__name__}"
- )
-
- if is_timedelta64_object(value):
- value = value.view("i8")
-
- # nat
- if value == NPY_NAT:
- return NaT
-
- return _timedelta_from_value_and_reso(cls, value, NPY_FR_ns)
-
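Illustrative sketch, not part of the deleted source: the keyword path in __new__ above converts all kwargs to integer nanoseconds before building the value, so fractional keyword values keep nanosecond precision (pandas 2.x behaviour assumed).

>>> import pandas as pd
>>> pd.Timedelta(microseconds=1.5)   # 1.5 us contributes 500 ns
Timedelta('0 days 00:00:00.000001500')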
- def __setstate__(self, state):
- if len(state) == 1:
- # older pickle, only supported nanosecond
- value = state[0]
- reso = NPY_FR_ns
- else:
- value, reso = state
- self._value = value
- self._creso = reso
-
- def __reduce__(self):
- object_state = self._value, self._creso
- return (_timedelta_unpickle, object_state)
-
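Not from the deleted file: a sketch showing that __reduce__/__setstate__ above round-trip both the integer value and its resolution through pickle (pandas 2.x behaviour assumed).

>>> import pickle
>>> import pandas as pd
>>> td = pd.Timedelta(1, "s").as_unit("ms")
>>> rt = pickle.loads(pickle.dumps(td))
>>> rt == td, rt.unit
(True, 'ms')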
- @cython.cdivision(True)
- def _round(self, freq, mode):
- cdef:
- int64_t result, unit
- ndarray[int64_t] arr
-
- from pandas._libs.tslibs.offsets import to_offset
-
- to_offset(freq).nanos # raises on non-fixed freq
- unit = delta_to_nanoseconds(to_offset(freq), self._creso)
-
- arr = np.array([self._value], dtype="i8")
- result = round_nsint64(arr, mode, unit)[0]
- return Timedelta._from_value_and_reso(result, self._creso)
-
- def round(self, freq):
- """
- Round the Timedelta to the specified resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the rounding resolution.
-
- Returns
- -------
- a new Timedelta rounded to the given resolution of `freq`
-
- Raises
- ------
- ValueError if the freq cannot be converted
- """
- return self._round(freq, RoundTo.NEAREST_HALF_EVEN)
-
- def floor(self, freq):
- """
- Return a new Timedelta floored to this resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the flooring resolution.
- """
- return self._round(freq, RoundTo.MINUS_INFTY)
-
- def ceil(self, freq):
- """
- Return a new Timedelta ceiled to this resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the ceiling resolution.
- """
- return self._round(freq, RoundTo.PLUS_INFTY)
-
- # ----------------------------------------------------------------
- # Arithmetic Methods
- # TODO: Can some of these be defined in the cython class?
-
- __neg__ = _op_unary_method(lambda x: -x, "__neg__")
- __pos__ = _op_unary_method(lambda x: x, "__pos__")
- __abs__ = _op_unary_method(lambda x: abs(x), "__abs__")
-
- __add__ = _binary_op_method_timedeltalike(lambda x, y: x + y, "__add__")
- __radd__ = _binary_op_method_timedeltalike(lambda x, y: x + y, "__radd__")
- __sub__ = _binary_op_method_timedeltalike(lambda x, y: x - y, "__sub__")
- __rsub__ = _binary_op_method_timedeltalike(lambda x, y: y - x, "__rsub__")
-
- def __mul__(self, other):
- if is_integer_object(other) or is_float_object(other):
- if util.is_nan(other):
- # np.nan * timedelta -> np.timedelta64("NaT"), in this case NaT
- return NaT
-
- return _timedelta_from_value_and_reso(
- Timedelta,
- <int64_t>(other * self._value),
- reso=self._creso,
- )
-
- elif is_array(other):
- if other.ndim == 0:
- # see also: item_from_zerodim
- item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
- return self.__mul__(item)
- return other * self.to_timedelta64()
-
- return NotImplemented
-
- __rmul__ = __mul__
-
- def __truediv__(self, other):
- if _should_cast_to_timedelta(other):
- # We interpret NaT as timedelta64("NaT")
- other = Timedelta(other)
- if other is NaT:
- return np.nan
- if other._creso != self._creso:
- self, other = self._maybe_cast_to_matching_resos(other)
- return self._value / float(other._value)
-
- elif is_integer_object(other) or is_float_object(other):
- # integers or floats
- if util.is_nan(other):
- return NaT
- return Timedelta._from_value_and_reso(
- <int64_t>(self._value / other), self._creso
- )
-
- elif is_array(other):
- if other.ndim == 0:
- # see also: item_from_zerodim
- item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
- return self.__truediv__(item)
- return self.to_timedelta64() / other
-
- return NotImplemented
-
- def __rtruediv__(self, other):
- if _should_cast_to_timedelta(other):
- # We interpret NaT as timedelta64("NaT")
- other = Timedelta(other)
- if other is NaT:
- return np.nan
- if self._creso != other._creso:
- self, other = self._maybe_cast_to_matching_resos(other)
- return float(other._value) / self._value
-
- elif is_array(other):
- if other.ndim == 0:
- # see also: item_from_zerodim
- item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
- return self.__rtruediv__(item)
- elif other.dtype.kind == "O":
- # GH#31869
- return np.array([x / self for x in other])
-
- # TODO: if other.dtype.kind == "m" and other.dtype != self.asm8.dtype
- # then should disallow for consistency with scalar behavior; requires
- # deprecation cycle. (or changing scalar behavior)
- return other / self.to_timedelta64()
-
- return NotImplemented
-
- def __floordiv__(self, other):
- # numpy does not implement floordiv for timedelta64 dtype, so we cannot
- # just defer
- if _should_cast_to_timedelta(other):
- # We interpret NaT as timedelta64("NaT")
- other = Timedelta(other)
- if other is NaT:
- return np.nan
- if self._creso != other._creso:
- self, other = self._maybe_cast_to_matching_resos(other)
- return self._value // other._value
-
- elif is_integer_object(other) or is_float_object(other):
- if util.is_nan(other):
- return NaT
- return type(self)._from_value_and_reso(self._value // other, self._creso)
-
- elif is_array(other):
- if other.ndim == 0:
- # see also: item_from_zerodim
- item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
- return self.__floordiv__(item)
-
- if other.dtype.kind == "m":
- # also timedelta-like
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore",
- "invalid value encountered in floor_divide",
- RuntimeWarning
- )
- result = self.asm8 // other
- mask = other.view("i8") == NPY_NAT
- if mask.any():
- # We differ from numpy here
- result = result.astype("f8")
- result[mask] = np.nan
- return result
-
- elif other.dtype.kind in ["i", "u", "f"]:
- if other.ndim == 0:
- return self // other.item()
- else:
- return self.to_timedelta64() // other
-
- raise TypeError(f"Invalid dtype {other.dtype} for __floordiv__")
-
- return NotImplemented
-
- def __rfloordiv__(self, other):
- # numpy does not implement floordiv for timedelta64 dtype, so we cannot
- # just defer
- if _should_cast_to_timedelta(other):
- # We interpret NaT as timedelta64("NaT")
- other = Timedelta(other)
- if other is NaT:
- return np.nan
- if self._creso != other._creso:
- self, other = self._maybe_cast_to_matching_resos(other)
- return other._value // self._value
-
- elif is_array(other):
- if other.ndim == 0:
- # see also: item_from_zerodim
- item = cnp.PyArray_ToScalar(cnp.PyArray_DATA(other), other)
- return self.__rfloordiv__(item)
-
- if other.dtype.kind == "m":
- # also timedelta-like
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore",
- "invalid value encountered in floor_divide",
- RuntimeWarning
- )
- result = other // self.asm8
- mask = other.view("i8") == NPY_NAT
- if mask.any():
- # We differ from numpy here
- result = result.astype("f8")
- result[mask] = np.nan
- return result
-
- # Includes integer array // Timedelta, disallowed in GH#19761
- raise TypeError(f"Invalid dtype {other.dtype} for __floordiv__")
-
- return NotImplemented
-
- def __mod__(self, other):
- # Naive implementation, room for optimization
- return self.__divmod__(other)[1]
-
- def __rmod__(self, other):
- # Naive implementation, room for optimization
- return self.__rdivmod__(other)[1]
-
- def __divmod__(self, other):
- # Naive implementation, room for optimization
- div = self // other
- return div, self - div * other
-
- def __rdivmod__(self, other):
- # Naive implementation, room for optimization
- div = other // self
- return div, other - div * self
-
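Editorial example, not part of the deleted source: the naive __divmod__/__mod__ definitions above compute div = a // b and rem = a - div * b, so for two Timedeltas the quotient is a plain int and the remainder a Timedelta (pandas 2.x behaviour assumed).

>>> import pandas as pd
>>> a, b = pd.Timedelta(hours=1, minutes=15), pd.Timedelta(minutes=30)
>>> divmod(a, b)
(2, Timedelta('0 days 00:15:00'))
>>> a % b
Timedelta('0 days 00:15:00')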
-
-def truediv_object_array(ndarray left, ndarray right):
- cdef:
- ndarray[object] result = np.empty((<object>left).shape, dtype=object)
- object td64 # really timedelta64 if we find a way to declare that
- object obj, res_value
- _Timedelta td
- Py_ssize_t i
-
- for i in range(len(left)):
- td64 = left[i]
- obj = right[i]
-
- if get_timedelta64_value(td64) == NPY_NAT:
- # td here should be interpreted as a td64 NaT
- if _should_cast_to_timedelta(obj):
- res_value = np.nan
- else:
- # if it's a number then let numpy handle the division, otherwise
- # numpy will raise
- res_value = td64 / obj
- else:
- td = Timedelta(td64)
- res_value = td / obj
-
- result[i] = res_value
-
- return result
-
-
-def floordiv_object_array(ndarray left, ndarray right):
- cdef:
- ndarray[object] result = np.empty((<object>left).shape, dtype=object)
- object td64 # really timedelta64 if we find a way to declare that
- object obj, res_value
- _Timedelta td
- Py_ssize_t i
-
- for i in range(len(left)):
- td64 = left[i]
- obj = right[i]
-
- if get_timedelta64_value(td64) == NPY_NAT:
- # td here should be interpreted as a td64 NaT
- if _should_cast_to_timedelta(obj):
- res_value = np.nan
- else:
- # if it's a number then let numpy handle the division, otherwise
- # numpy will raise
- res_value = td64 // obj
- else:
- td = Timedelta(td64)
- res_value = td // obj
-
- result[i] = res_value
-
- return result
-
-
-cdef bint is_any_td_scalar(object obj):
- """
- Cython equivalent for `isinstance(obj, (timedelta, np.timedelta64, Tick))`
-
- Parameters
- ----------
- obj : object
-
- Returns
- -------
- bool
- """
- return (
- PyDelta_Check(obj) or is_timedelta64_object(obj) or is_tick_object(obj)
- )
-
-
-cdef bint _should_cast_to_timedelta(object obj):
- """
- Should we treat this object as a Timedelta for the purpose of a binary op
- """
- return (
- is_any_td_scalar(obj) or obj is None or obj is NaT or isinstance(obj, str)
- )
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pxd
deleted file mode 100644
index 26018cd9042..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pxd
+++ /dev/null
@@ -1,36 +0,0 @@
-from cpython.datetime cimport (
- datetime,
- tzinfo,
-)
-from numpy cimport int64_t
-
-from pandas._libs.tslibs.base cimport ABCTimestamp
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- npy_datetimestruct,
-)
-from pandas._libs.tslibs.offsets cimport BaseOffset
-
-
-cdef _Timestamp create_timestamp_from_ts(int64_t value,
- npy_datetimestruct dts,
- tzinfo tz,
- bint fold,
- NPY_DATETIMEUNIT reso=*)
-
-
-cdef class _Timestamp(ABCTimestamp):
- cdef readonly:
- int64_t _value, nanosecond, year
- NPY_DATETIMEUNIT _creso
-
- cdef bint _get_start_end_field(self, str field, freq)
- cdef _get_date_name_field(self, str field, object locale)
- cdef int64_t _maybe_convert_value_to_local(self)
- cdef bint _can_compare(self, datetime other)
- cpdef to_datetime64(self)
- cpdef datetime to_pydatetime(_Timestamp self, bint warn=*)
- cdef bint _compare_outside_nanorange(_Timestamp self, datetime other,
- int op) except -1
- cdef bint _compare_mismatched_resos(_Timestamp self, _Timestamp other, int op)
- cdef _Timestamp _as_creso(_Timestamp self, NPY_DATETIMEUNIT creso, bint round_ok=*)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyi
deleted file mode 100644
index 26b0c9170aa..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyi
+++ /dev/null
@@ -1,228 +0,0 @@
-from datetime import (
- date as _date,
- datetime,
- time as _time,
- timedelta,
- tzinfo as _tzinfo,
-)
-from time import struct_time
-from typing import (
- ClassVar,
- TypeVar,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs import (
- BaseOffset,
- NaTType,
- Period,
- Tick,
- Timedelta,
-)
-
-_DatetimeT = TypeVar("_DatetimeT", bound=datetime)
-
-def integer_op_not_supported(obj: object) -> TypeError: ...
-
-class Timestamp(datetime):
- _creso: int
- min: ClassVar[Timestamp]
- max: ClassVar[Timestamp]
-
- resolution: ClassVar[Timedelta]
- _value: int # np.int64
- # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
- def __new__( # type: ignore[misc]
- cls: type[_DatetimeT],
- ts_input: np.integer | float | str | _date | datetime | np.datetime64 = ...,
- year: int | None = ...,
- month: int | None = ...,
- day: int | None = ...,
- hour: int | None = ...,
- minute: int | None = ...,
- second: int | None = ...,
- microsecond: int | None = ...,
- tzinfo: _tzinfo | None = ...,
- *,
- nanosecond: int | None = ...,
- tz: str | _tzinfo | None | int = ...,
- unit: str | int | None = ...,
- fold: int | None = ...,
- ) -> _DatetimeT | NaTType: ...
- @classmethod
- def _from_value_and_reso(
- cls, value: int, reso: int, tz: _tzinfo | None
- ) -> Timestamp: ...
- @property
- def value(self) -> int: ... # np.int64
- @property
- def year(self) -> int: ...
- @property
- def month(self) -> int: ...
- @property
- def day(self) -> int: ...
- @property
- def hour(self) -> int: ...
- @property
- def minute(self) -> int: ...
- @property
- def second(self) -> int: ...
- @property
- def microsecond(self) -> int: ...
- @property
- def nanosecond(self) -> int: ...
- @property
- def tzinfo(self) -> _tzinfo | None: ...
- @property
- def tz(self) -> _tzinfo | None: ...
- @property
- def fold(self) -> int: ...
- @classmethod
- def fromtimestamp(
- cls: type[_DatetimeT], ts: float, tz: _tzinfo | None = ...
- ) -> _DatetimeT: ...
- @classmethod
- def utcfromtimestamp(cls: type[_DatetimeT], ts: float) -> _DatetimeT: ...
- @classmethod
- def today(cls: type[_DatetimeT], tz: _tzinfo | str | None = ...) -> _DatetimeT: ...
- @classmethod
- def fromordinal(
- cls: type[_DatetimeT],
- ordinal: int,
- tz: _tzinfo | str | None = ...,
- ) -> _DatetimeT: ...
- @classmethod
- def now(cls: type[_DatetimeT], tz: _tzinfo | str | None = ...) -> _DatetimeT: ...
- @classmethod
- def utcnow(cls: type[_DatetimeT]) -> _DatetimeT: ...
- # error: Signature of "combine" incompatible with supertype "datetime"
- @classmethod
- def combine( # type: ignore[override]
- cls, date: _date, time: _time
- ) -> datetime: ...
- @classmethod
- def fromisoformat(cls: type[_DatetimeT], date_string: str) -> _DatetimeT: ...
- def strftime(self, format: str) -> str: ...
- def __format__(self, fmt: str) -> str: ...
- def toordinal(self) -> int: ...
- def timetuple(self) -> struct_time: ...
- def timestamp(self) -> float: ...
- def utctimetuple(self) -> struct_time: ...
- def date(self) -> _date: ...
- def time(self) -> _time: ...
- def timetz(self) -> _time: ...
- # LSP violation: nanosecond is not present in datetime.datetime.replace
- # and has positional args following it
- def replace( # type: ignore[override]
- self: _DatetimeT,
- year: int | None = ...,
- month: int | None = ...,
- day: int | None = ...,
- hour: int | None = ...,
- minute: int | None = ...,
- second: int | None = ...,
- microsecond: int | None = ...,
- nanosecond: int | None = ...,
- tzinfo: _tzinfo | type[object] | None = ...,
- fold: int | None = ...,
- ) -> _DatetimeT: ...
- # LSP violation: datetime.datetime.astimezone has a default value for tz
- def astimezone( # type: ignore[override]
- self: _DatetimeT, tz: _tzinfo | None
- ) -> _DatetimeT: ...
- def ctime(self) -> str: ...
- def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ...
- @classmethod
- def strptime(cls, date_string: str, format: str) -> datetime: ...
- def utcoffset(self) -> timedelta | None: ...
- def tzname(self) -> str | None: ...
- def dst(self) -> timedelta | None: ...
- def __le__(self, other: datetime) -> bool: ... # type: ignore[override]
- def __lt__(self, other: datetime) -> bool: ... # type: ignore[override]
- def __ge__(self, other: datetime) -> bool: ... # type: ignore[override]
- def __gt__(self, other: datetime) -> bool: ... # type: ignore[override]
- # error: Signature of "__add__" incompatible with supertype "date"/"datetime"
- @overload # type: ignore[override]
- def __add__(self, other: np.ndarray) -> np.ndarray: ...
- @overload
- def __add__(
- self: _DatetimeT, other: timedelta | np.timedelta64 | Tick
- ) -> _DatetimeT: ...
- def __radd__(self: _DatetimeT, other: timedelta) -> _DatetimeT: ...
- @overload # type: ignore[override]
- def __sub__(self, other: datetime) -> Timedelta: ...
- @overload
- def __sub__(
- self: _DatetimeT, other: timedelta | np.timedelta64 | Tick
- ) -> _DatetimeT: ...
- def __hash__(self) -> int: ...
- def weekday(self) -> int: ...
- def isoweekday(self) -> int: ...
- def isocalendar(self) -> tuple[int, int, int]: ...
- @property
- def is_leap_year(self) -> bool: ...
- @property
- def is_month_start(self) -> bool: ...
- @property
- def is_quarter_start(self) -> bool: ...
- @property
- def is_year_start(self) -> bool: ...
- @property
- def is_month_end(self) -> bool: ...
- @property
- def is_quarter_end(self) -> bool: ...
- @property
- def is_year_end(self) -> bool: ...
- def to_pydatetime(self, warn: bool = ...) -> datetime: ...
- def to_datetime64(self) -> np.datetime64: ...
- def to_period(self, freq: BaseOffset | str = ...) -> Period: ...
- def to_julian_date(self) -> np.float64: ...
- @property
- def asm8(self) -> np.datetime64: ...
- def tz_convert(self: _DatetimeT, tz: _tzinfo | str | None) -> _DatetimeT: ...
- # TODO: could return NaT?
- def tz_localize(
- self: _DatetimeT,
- tz: _tzinfo | str | None,
- ambiguous: str = ...,
- nonexistent: str = ...,
- ) -> _DatetimeT: ...
- def normalize(self: _DatetimeT) -> _DatetimeT: ...
- # TODO: round/floor/ceil could return NaT?
- def round(
- self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ...
- ) -> _DatetimeT: ...
- def floor(
- self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ...
- ) -> _DatetimeT: ...
- def ceil(
- self: _DatetimeT, freq: str, ambiguous: bool | str = ..., nonexistent: str = ...
- ) -> _DatetimeT: ...
- def day_name(self, locale: str | None = ...) -> str: ...
- def month_name(self, locale: str | None = ...) -> str: ...
- @property
- def day_of_week(self) -> int: ...
- @property
- def dayofweek(self) -> int: ...
- @property
- def day_of_year(self) -> int: ...
- @property
- def dayofyear(self) -> int: ...
- @property
- def quarter(self) -> int: ...
- @property
- def week(self) -> int: ...
- def to_numpy(
- self, dtype: np.dtype | None = ..., copy: bool = ...
- ) -> np.datetime64: ...
- @property
- def _date_repr(self) -> str: ...
- @property
- def days_in_month(self) -> int: ...
- @property
- def daysinmonth(self) -> int: ...
- @property
- def unit(self) -> str: ...
- def as_unit(self, unit: str, round_ok: bool = ...) -> Timestamp: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyx
deleted file mode 100644
index 6b707d4158f..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timestamps.pyx
+++ /dev/null
@@ -1,2382 +0,0 @@
-"""
-_Timestamp is a c-defined subclass of datetime.datetime
-
-_Timestamp is a PITA. Because we inherit from datetime, which has very specific
-construction requirements, we need to do object instantiation in python
-(see Timestamp class below). This will serve as a C extension type that
-shadows the python class, where we do any heavy lifting.
-"""
-
-import warnings
-
-cimport cython
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- ndarray,
- uint8_t,
-)
-
-cnp.import_array()
-
-from cpython.datetime cimport ( # alias bc `tzinfo` is a kwarg below
- PyDate_Check,
- PyDateTime_Check,
- PyDelta_Check,
- PyTZInfo_Check,
- datetime,
- import_datetime,
- time as dt_time,
- tzinfo as tzinfo_type,
-)
-from cpython.object cimport (
- Py_EQ,
- Py_GE,
- Py_GT,
- Py_LE,
- Py_LT,
- Py_NE,
- PyObject_RichCompare,
- PyObject_RichCompareBool,
-)
-
-import_datetime()
-
-from pandas._libs.tslibs cimport ccalendar
-from pandas._libs.tslibs.base cimport ABCTimestamp
-
-from pandas.util._exceptions import find_stack_level
-
-from pandas._libs.tslibs.conversion cimport (
- _TSObject,
- convert_datetime_to_tsobject,
- convert_to_tsobject,
- maybe_localize_tso,
-)
-from pandas._libs.tslibs.dtypes cimport (
- npy_unit_to_abbrev,
- periods_per_day,
- periods_per_second,
-)
-from pandas._libs.tslibs.util cimport (
- is_array,
- is_datetime64_object,
- is_integer_object,
-)
-
-from pandas._libs.tslibs.fields import (
- RoundTo,
- get_date_name_field,
- get_start_end_field,
- round_nsint64,
-)
-
-from pandas._libs.tslibs.nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
-)
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- check_dts_bounds,
- cmp_dtstructs,
- cmp_scalar,
- convert_reso,
- get_datetime64_unit,
- get_datetime64_value,
- get_unit_from_dtype,
- npy_datetimestruct,
- npy_datetimestruct_to_datetime,
- pandas_datetime_to_datetimestruct,
- pydatetime_to_dtstruct,
-)
-
-from pandas._libs.tslibs.np_datetime import (
- OutOfBoundsDatetime,
- OutOfBoundsTimedelta,
-)
-
-from pandas._libs.tslibs.offsets cimport to_offset
-from pandas._libs.tslibs.timedeltas cimport (
- _Timedelta,
- delta_to_nanoseconds,
- is_any_td_scalar,
-)
-
-from pandas._libs.tslibs.timedeltas import Timedelta
-
-from pandas._libs.tslibs.timezones cimport (
- get_timezone,
- is_utc,
- maybe_get_tz,
- treat_tz_as_pytz,
- utc_stdlib as UTC,
-)
-from pandas._libs.tslibs.tzconversion cimport (
- tz_convert_from_utc_single,
- tz_localize_to_utc_single,
-)
-
-# ----------------------------------------------------------------------
-# Constants
-_zero_time = dt_time(0, 0)
-_no_input = object()
-
-# ----------------------------------------------------------------------
-
-
-cdef _Timestamp create_timestamp_from_ts(
- int64_t value,
- npy_datetimestruct dts,
- tzinfo tz,
- bint fold,
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-):
- """ convenience routine to construct a Timestamp from its parts """
- cdef:
- _Timestamp ts_base
- int64_t pass_year = dts.year
-
- # We pass year=1970/1972 here and set year below because with non-nanosecond
- # resolution we may have datetimes outside of the stdlib pydatetime
- # implementation bounds, which would raise.
- # NB: this means the C-API macro PyDateTime_GET_YEAR is unreliable.
- if 1 <= pass_year <= 9999:
- # we are in-bounds for pydatetime
- pass
- elif ccalendar.is_leapyear(dts.year):
- pass_year = 1972
- else:
- pass_year = 1970
-
- ts_base = _Timestamp.__new__(Timestamp, pass_year, dts.month,
- dts.day, dts.hour, dts.min,
- dts.sec, dts.us, tz, fold=fold)
-
- ts_base._value = value
- ts_base.year = dts.year
- ts_base.nanosecond = dts.ps // 1000
- ts_base._creso = reso
-
- return ts_base
-
-
-def _unpickle_timestamp(value, freq, tz, reso=NPY_FR_ns):
- # GH#41949 don't warn on unpickle if we have a freq
- ts = Timestamp._from_value_and_reso(value, reso, tz)
- return ts
-
-
-# ----------------------------------------------------------------------
-
-def integer_op_not_supported(obj):
- # GH#22535 add/sub of integers and int-arrays is no longer allowed
- # Note we return rather than raise the exception so we can raise in
- # the caller; mypy finds this more palatable.
- cls = type(obj).__name__
-
- # GH#30886 using an fstring raises SystemError
- int_addsub_msg = (
- f"Addition/subtraction of integers and integer-arrays with {cls} is "
- "no longer supported. Instead of adding/subtracting `n`, "
- "use `n * obj.freq`"
- )
- return TypeError(int_addsub_msg)
-
-
-class MinMaxReso:
- """
- We need to define min/max/resolution on both the Timestamp _instance_
- and Timestamp class. On an instance, these depend on the object's _reso.
- On the class, we default to the values we would get with nanosecond _reso.
-
- See also: timedeltas.MinMaxReso
- """
- def __init__(self, name):
- self._name = name
-
- def __get__(self, obj, type=None):
- cls = Timestamp
- if self._name == "min":
- val = np.iinfo(np.int64).min + 1
- elif self._name == "max":
- val = np.iinfo(np.int64).max
- else:
- assert self._name == "resolution"
- val = 1
- cls = Timedelta
-
- if obj is None:
- # i.e. this is on the class, default to nanos
- return cls(val)
- elif self._name == "resolution":
- return Timedelta._from_value_and_reso(val, obj._creso)
- else:
- return Timestamp._from_value_and_reso(val, obj._creso, tz=None)
-
- def __set__(self, obj, value):
- raise AttributeError(f"{self._name} is not settable.")
-
-
-# ----------------------------------------------------------------------
-
-cdef class _Timestamp(ABCTimestamp):
-
- # higher than np.ndarray and np.matrix
- __array_priority__ = 100
- dayofweek = _Timestamp.day_of_week
- dayofyear = _Timestamp.day_of_year
-
- min = MinMaxReso("min")
- max = MinMaxReso("max")
- resolution = MinMaxReso("resolution") # GH#21336, GH#21365
-
- @property
- def value(self) -> int:
- try:
- return convert_reso(self._value, self._creso, NPY_FR_ns, False)
- except OverflowError:
- raise OverflowError(
- "Cannot convert Timestamp to nanoseconds without overflow. "
- "Use `.asm8.view('i8')` to represent Timestamp in its own "
- f"unit (here, {self.unit})."
- )
-
- @property
- def unit(self) -> str:
- """
- The abbreviation associated with self._creso.
-
- Examples
- --------
- >>> pd.Timestamp("2020-01-01 12:34:56").unit
- 's'
-
- >>> pd.Timestamp("2020-01-01 12:34:56.123").unit
- 'ms'
-
- >>> pd.Timestamp("2020-01-01 12:34:56.123456").unit
- 'us'
-
- >>> pd.Timestamp("2020-01-01 12:34:56.123456789").unit
- 'ns'
- """
- return npy_unit_to_abbrev(self._creso)
-
- # -----------------------------------------------------------------
- # Constructors
-
- @classmethod
- def _from_value_and_reso(cls, int64_t value, NPY_DATETIMEUNIT reso, tzinfo tz):
- cdef:
- _TSObject obj = _TSObject()
-
- if value == NPY_NAT:
- return NaT
-
- if reso < NPY_DATETIMEUNIT.NPY_FR_s or reso > NPY_DATETIMEUNIT.NPY_FR_ns:
- raise NotImplementedError(
- "Only resolutions 's', 'ms', 'us', 'ns' are supported."
- )
-
- obj.value = value
- obj.creso = reso
- pandas_datetime_to_datetimestruct(value, reso, &obj.dts)
- maybe_localize_tso(obj, tz, reso)
-
- return create_timestamp_from_ts(
- value, obj.dts, tz=obj.tzinfo, fold=obj.fold, reso=reso
- )
-
- @classmethod
- def _from_dt64(cls, dt64: np.datetime64):
- # construct a Timestamp from a np.datetime64 object, keeping the
- # resolution of the input.
- # This is here mainly so we can incrementally implement non-nano
- # (e.g. only tznaive at first)
- cdef:
- int64_t value
- NPY_DATETIMEUNIT reso
-
- reso = get_datetime64_unit(dt64)
- value = get_datetime64_value(dt64)
- return cls._from_value_and_reso(value, reso, None)
-
- # -----------------------------------------------------------------
-
- def __hash__(_Timestamp self):
- if self.nanosecond:
- return hash(self._value)
- if not (1 <= self.year <= 9999):
- # out of bounds for pydatetime
- return hash(self._value)
- if self.fold:
- return datetime.__hash__(self.replace(fold=0))
- return datetime.__hash__(self)
-
- def __richcmp__(_Timestamp self, object other, int op):
- cdef:
- _Timestamp ots
-
- if isinstance(other, _Timestamp):
- ots = other
- elif other is NaT:
- return op == Py_NE
- elif is_datetime64_object(other):
- ots = Timestamp(other)
- elif PyDateTime_Check(other):
- if self.nanosecond == 0:
- val = self.to_pydatetime()
- return PyObject_RichCompareBool(val, other, op)
-
- try:
- ots = type(self)(other)
- except ValueError:
- return self._compare_outside_nanorange(other, op)
-
- elif is_array(other):
- # avoid recursion error GH#15183
- if other.dtype.kind == "M":
- if self.tz is None:
- return PyObject_RichCompare(self.asm8, other, op)
- elif op == Py_NE:
- return np.ones(other.shape, dtype=np.bool_)
- elif op == Py_EQ:
- return np.zeros(other.shape, dtype=np.bool_)
- raise TypeError(
- "Cannot compare tz-naive and tz-aware timestamps"
- )
- elif other.dtype.kind == "O":
- # Operate element-wise
- return np.array(
- [PyObject_RichCompare(self, x, op) for x in other],
- dtype=bool,
- )
- elif op == Py_NE:
- return np.ones(other.shape, dtype=np.bool_)
- elif op == Py_EQ:
- return np.zeros(other.shape, dtype=np.bool_)
- return NotImplemented
-
- elif PyDate_Check(other):
- # returning NotImplemented defers to the `date` implementation
- # which incorrectly drops tz and normalizes to midnight
- # before comparing
- # We follow the stdlib datetime behavior of never being equal
- if op == Py_EQ:
- return False
- elif op == Py_NE:
- return True
- raise TypeError(
- "Cannot compare Timestamp with datetime.date. "
- "Use ts == pd.Timestamp(date) or ts.date() == date instead."
- )
- else:
- return NotImplemented
-
- if not self._can_compare(ots):
- if op == Py_NE or op == Py_EQ:
- return NotImplemented
- raise TypeError(
- "Cannot compare tz-naive and tz-aware timestamps"
- )
- if self._creso == ots._creso:
- return cmp_scalar(self._value, ots._value, op)
- return self._compare_mismatched_resos(ots, op)
-
- # TODO: copied from Timedelta; try to de-duplicate
- cdef bint _compare_mismatched_resos(self, _Timestamp other, int op):
- # Can't just dispatch to numpy as they silently overflow and get it wrong
- cdef:
- npy_datetimestruct dts_self
- npy_datetimestruct dts_other
-
- # dispatch to the datetimestruct utils instead of writing new ones!
- pandas_datetime_to_datetimestruct(self._value, self._creso, &dts_self)
- pandas_datetime_to_datetimestruct(other._value, other._creso, &dts_other)
- return cmp_dtstructs(&dts_self, &dts_other, op)
-
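Illustrative only (not from the deleted file): comparisons between Timestamps with different resolutions go through _compare_mismatched_resos above rather than a raw integer compare, so they give unit-independent answers (pandas 2.x behaviour assumed).

>>> import pandas as pd
>>> a = pd.Timestamp("2020-01-01").as_unit("s")
>>> b = pd.Timestamp("2020-01-01").as_unit("ns")
>>> a == b, a < b
(True, False)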
- cdef bint _compare_outside_nanorange(_Timestamp self, datetime other,
- int op) except -1:
- cdef:
- datetime dtval = self.to_pydatetime(warn=False)
-
- if not self._can_compare(other):
- return NotImplemented
-
- if self.nanosecond == 0:
- return PyObject_RichCompareBool(dtval, other, op)
-
- # otherwise we have dtval < self
- if op == Py_NE:
- return True
- if op == Py_EQ:
- return False
- if op == Py_LE or op == Py_LT:
- return self.year <= other.year
- if op == Py_GE or op == Py_GT:
- return self.year >= other.year
-
- cdef bint _can_compare(self, datetime other):
- if self.tzinfo is not None:
- return other.tzinfo is not None
- return other.tzinfo is None
-
- @cython.overflowcheck(True)
- def __add__(self, other):
- cdef:
- int64_t nanos = 0
-
- if is_any_td_scalar(other):
- other = Timedelta(other)
-
- # TODO: share this with __sub__, Timedelta.__add__
- # Matching numpy, we cast to the higher resolution. Unlike numpy,
- # we raise instead of silently overflowing during this casting.
- if self._creso < other._creso:
- self = (<_Timestamp>self)._as_creso(other._creso, round_ok=True)
- elif self._creso > other._creso:
- other = (<_Timedelta>other)._as_creso(self._creso, round_ok=True)
-
- nanos = other._value
-
- try:
- new_value = self._value + nanos
- result = type(self)._from_value_and_reso(
- new_value, reso=self._creso, tz=self.tzinfo
- )
- except OverflowError as err:
- # TODO: don't hard-code nanosecond here
- new_value = int(self._value) + int(nanos)
- raise OutOfBoundsDatetime(
- f"Out of bounds nanosecond timestamp: {new_value}"
- ) from err
-
- return result
-
- elif is_integer_object(other):
- raise integer_op_not_supported(self)
-
- elif is_array(other):
- if other.dtype.kind in ["i", "u"]:
- raise integer_op_not_supported(self)
- if other.dtype.kind == "m":
- if self.tz is None:
- return self.asm8 + other
- return np.asarray(
- [self + other[n] for n in range(len(other))],
- dtype=object,
- )
-
- elif not isinstance(self, _Timestamp):
- # cython semantics, args have been switched and this is __radd__
- # TODO(cython3): remove this; it moved to __radd__
- return other.__add__(self)
-
- return NotImplemented
-
- def __radd__(self, other):
- # Have to duplicate checks to avoid infinite recursion due to NotImplemented
- if is_any_td_scalar(other) or is_integer_object(other) or is_array(other):
- return self.__add__(other)
- return NotImplemented
-
- def __sub__(self, other):
- if other is NaT:
- return NaT
-
- elif is_any_td_scalar(other) or is_integer_object(other):
- neg_other = -other
- return self + neg_other
-
- elif is_array(other):
- if other.dtype.kind in ["i", "u"]:
- raise integer_op_not_supported(self)
- if other.dtype.kind == "m":
- if self.tz is None:
- return self.asm8 - other
- return np.asarray(
- [self - other[n] for n in range(len(other))],
- dtype=object,
- )
- return NotImplemented
-
- # coerce if necessary if we are a Timestamp-like
- if (PyDateTime_Check(self)
- and (PyDateTime_Check(other) or is_datetime64_object(other))):
- # both_timestamps is to determine whether Timedelta(self - other)
- # should raise the OOB error, or fall back to returning a timedelta.
- # TODO(cython3): clean out the bits that moved to __rsub__
- both_timestamps = (isinstance(other, _Timestamp) and
- isinstance(self, _Timestamp))
- if isinstance(self, _Timestamp):
- other = type(self)(other)
- else:
- self = type(other)(self)
-
- if (self.tzinfo is None) ^ (other.tzinfo is None):
- raise TypeError(
- "Cannot subtract tz-naive and tz-aware datetime-like objects."
- )
-
- # Matching numpy, we cast to the higher resolution. Unlike numpy,
- # we raise instead of silently overflowing during this casting.
- if self._creso < other._creso:
- self = (<_Timestamp>self)._as_creso(other._creso, round_ok=True)
- elif self._creso > other._creso:
- other = (<_Timestamp>other)._as_creso(self._creso, round_ok=True)
-
- # scalar Timestamp/datetime - Timestamp/datetime -> yields a
- # Timedelta
- try:
- res_value = self._value - other._value
- return Timedelta._from_value_and_reso(res_value, self._creso)
- except (OverflowError, OutOfBoundsDatetime, OutOfBoundsTimedelta) as err:
- if isinstance(other, _Timestamp):
- if both_timestamps:
- raise OutOfBoundsDatetime(
- "Result is too large for pandas.Timedelta. Convert inputs "
- "to datetime.datetime with 'Timestamp.to_pydatetime()' "
- "before subtracting."
- ) from err
- # We get here in stata tests, fall back to stdlib datetime
- # method and return stdlib timedelta object
- pass
- elif is_datetime64_object(self):
- # GH#28286 cython semantics for __rsub__, `other` is actually
- # the Timestamp
- # TODO(cython3): remove this, this moved to __rsub__
- return type(other)(self) - other
-
- return NotImplemented
-
- def __rsub__(self, other):
- if PyDateTime_Check(other):
- try:
- return type(self)(other) - self
- except (OverflowError, OutOfBoundsDatetime) as err:
- # We get here in stata tests, fall back to stdlib datetime
- # method and return stdlib timedelta object
- pass
- elif is_datetime64_object(other):
- return type(self)(other) - self
- return NotImplemented
-
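Not part of the deleted source: as the comments in __add__/__sub__ above note, mixed-resolution arithmetic casts to the finer unit instead of silently overflowing (pandas 2.x behaviour assumed).

>>> import pandas as pd
>>> ts = pd.Timestamp("2020-01-01").as_unit("s")
>>> td = pd.Timedelta(1, "ms").as_unit("ms")
>>> (ts + td).unit
'ms'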
- # -----------------------------------------------------------------
-
- cdef int64_t _maybe_convert_value_to_local(self):
- """Convert UTC i8 value to local i8 value if tz exists"""
- cdef:
- int64_t val
- tzinfo own_tz = self.tzinfo
- npy_datetimestruct dts
-
- if own_tz is not None and not is_utc(own_tz):
- pydatetime_to_dtstruct(self, &dts)
- val = npy_datetimestruct_to_datetime(self._creso, &dts) + self.nanosecond
- else:
- val = self._value
- return val
-
- @cython.boundscheck(False)
- cdef bint _get_start_end_field(self, str field, freq):
- cdef:
- int64_t val
- dict kwds
- ndarray[uint8_t, cast=True] out
- int month_kw
-
- if freq:
- kwds = freq.kwds
- month_kw = kwds.get("startingMonth", kwds.get("month", 12))
- freqstr = freq.freqstr
- else:
- month_kw = 12
- freqstr = None
-
- val = self._maybe_convert_value_to_local()
-
- out = get_start_end_field(np.array([val], dtype=np.int64),
- field, freqstr, month_kw, self._creso)
- return out[0]
-
- @property
- def is_month_start(self) -> bool:
- """
- Check if the date is the first day of the month.
-
- Returns
- -------
- bool
- True if the date is the first day of the month.
-
- See Also
- --------
- Timestamp.is_month_end : Similar property indicating the last day of the month.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.is_month_start
- False
-
- >>> ts = pd.Timestamp(2020, 1, 1)
- >>> ts.is_month_start
- True
- """
- return self.day == 1
-
- @property
- def is_month_end(self) -> bool:
- """
- Check if the date is the last day of the month.
-
- Returns
- -------
- bool
- True if the date is the last day of the month.
-
- See Also
- --------
- Timestamp.is_month_start : Similar property indicating month start.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.is_month_end
- False
-
- >>> ts = pd.Timestamp(2020, 12, 31)
- >>> ts.is_month_end
- True
- """
- return self.day == self.days_in_month
-
- @property
- def is_quarter_start(self) -> bool:
- """
- Check if the date is the first day of the quarter.
-
- Returns
- -------
- bool
- True if date is first day of the quarter.
-
- See Also
- --------
- Timestamp.is_quarter_end : Similar property indicating the quarter end.
- Timestamp.quarter : Return the quarter of the date.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.is_quarter_start
- False
-
- >>> ts = pd.Timestamp(2020, 4, 1)
- >>> ts.is_quarter_start
- True
- """
- return self.day == 1 and self.month % 3 == 1
-
- @property
- def is_quarter_end(self) -> bool:
- """
- Check if date is last day of the quarter.
-
- Returns
- -------
- bool
- True if date is last day of the quarter.
-
- See Also
- --------
- Timestamp.is_quarter_start : Similar property indicating the quarter start.
- Timestamp.quarter : Return the quarter of the date.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.is_quarter_end
- False
-
- >>> ts = pd.Timestamp(2020, 3, 31)
- >>> ts.is_quarter_end
- True
- """
- return (self.month % 3) == 0 and self.day == self.days_in_month
-
- @property
- def is_year_start(self) -> bool:
- """
- Return True if date is first day of the year.
-
- Returns
- -------
- bool
-
- See Also
- --------
- Timestamp.is_year_end : Similar property indicating the end of the year.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.is_year_start
- False
-
- >>> ts = pd.Timestamp(2020, 1, 1)
- >>> ts.is_year_start
- True
- """
- return self.day == self.month == 1
-
- @property
- def is_year_end(self) -> bool:
- """
- Return True if date is last day of the year.
-
- Returns
- -------
- bool
-
- See Also
- --------
- Timestamp.is_year_start : Similar property indicating the start of the year.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.is_year_end
- False
-
- >>> ts = pd.Timestamp(2020, 12, 31)
- >>> ts.is_year_end
- True
- """
- return self.month == 12 and self.day == 31
-
- @cython.boundscheck(False)
- cdef _get_date_name_field(self, str field, object locale):
- cdef:
- int64_t val
- object[::1] out
-
- val = self._maybe_convert_value_to_local()
-
- out = get_date_name_field(np.array([val], dtype=np.int64),
- field, locale=locale, reso=self._creso)
- return out[0]
-
- def day_name(self, locale=None) -> str:
- """
- Return the day name of the Timestamp with specified locale.
-
- Parameters
- ----------
- locale : str, default None (English locale)
- Locale determining the language in which to return the day name.
-
- Returns
- -------
- str
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.day_name()
- 'Saturday'
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.day_name()
- nan
- """
- return self._get_date_name_field("day_name", locale)
-
- def month_name(self, locale=None) -> str:
- """
- Return the month name of the Timestamp with specified locale.
-
- Parameters
- ----------
- locale : str, default None (English locale)
- Locale determining the language in which to return the month name.
-
- Returns
- -------
- str
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.month_name()
- 'March'
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.month_name()
- nan
- """
- return self._get_date_name_field("month_name", locale)
-
- @property
- def is_leap_year(self) -> bool:
- """
- Return True if year is a leap year.
-
- Returns
- -------
- bool
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.is_leap_year
- True
- """
- return bool(ccalendar.is_leapyear(self.year))
-
- @property
- def day_of_week(self) -> int:
- """
- Return day of the week.
-
- Returns
- -------
- int
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.day_of_week
- 5
- """
- return self.weekday()
-
- @property
- def day_of_year(self) -> int:
- """
- Return the day of the year.
-
- Returns
- -------
- int
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.day_of_year
- 74
- """
- return ccalendar.get_day_of_year(self.year, self.month, self.day)
-
- @property
- def quarter(self) -> int:
- """
- Return the quarter of the year.
-
- Returns
- -------
- int
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.quarter
- 1
- """
- return ((self.month - 1) // 3) + 1
-
- @property
- def week(self) -> int:
- """
- Return the week number of the year.
-
- Returns
- -------
- int
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.week
- 11
- """
- return ccalendar.get_week_of_year(self.year, self.month, self.day)
-
- @property
- def days_in_month(self) -> int:
- """
- Return the number of days in the month.
-
- Returns
- -------
- int
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14)
- >>> ts.days_in_month
- 31
- """
- return ccalendar.get_days_in_month(self.year, self.month)
-
- # -----------------------------------------------------------------
- # Transformation Methods
-
- def normalize(self) -> "Timestamp":
- """
- Normalize Timestamp to midnight, preserving tz information.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14, 15, 30)
- >>> ts.normalize()
- Timestamp('2020-03-14 00:00:00')
- """
- cdef:
- local_val = self._maybe_convert_value_to_local()
- int64_t normalized
- int64_t ppd = periods_per_day(self._creso)
- _Timestamp ts
-
- normalized = normalize_i8_stamp(local_val, ppd)
- ts = type(self)._from_value_and_reso(normalized, reso=self._creso, tz=None)
- return ts.tz_localize(self.tzinfo)
-
- # -----------------------------------------------------------------
- # Pickle Methods
-
- def __reduce_ex__(self, protocol):
- # python 3.6 compat
- # https://bugs.python.org/issue28730
- # now __reduce_ex__ is defined and higher priority than __reduce__
- return self.__reduce__()
-
- def __setstate__(self, state):
- self._value = state[0]
- self.tzinfo = state[2]
-
- if len(state) == 3:
- # pre-non-nano pickle
- # TODO: no tests get here 2022-05-10
- reso = NPY_FR_ns
- else:
- reso = state[4]
- self._creso = reso
-
- def __reduce__(self):
- object_state = self._value, None, self.tzinfo, self._creso
- return (_unpickle_timestamp, object_state)
-
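The pickle hooks above round-trip the integer value, the tzinfo, and the unit resolution. A minimal round-trip sketch (illustrative only, not part of the deleted source):

    import pickle
    import pandas as pd

    ts = pd.Timestamp("2020-03-14 15:32:52.192548651", tz="UTC")
    restored = pickle.loads(pickle.dumps(ts))
    assert restored == ts          # value, tzinfo and resolution survive the round trip
    assert restored.tz is not None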
- # -----------------------------------------------------------------
- # Rendering Methods
-
- def isoformat(self, sep: str = "T", timespec: str = "auto") -> str:
- """
- Return the time formatted according to ISO 8601.
-
- The full format looks like 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn'.
- By default, the fractional part is omitted if self.microsecond == 0
- and self.nanosecond == 0.
-
- If self.tzinfo is not None, the UTC offset is also attached,
- giving a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn+HH:MM'.
-
- Parameters
- ----------
- sep : str, default 'T'
- String used as the separator between the date and time.
-
- timespec : str, default 'auto'
- Specifies the number of additional terms of the time to include.
- The valid values are 'auto', 'hours', 'minutes', 'seconds',
- 'milliseconds', 'microseconds', and 'nanoseconds'.
-
- Returns
- -------
- str
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.isoformat()
- '2020-03-14T15:32:52.192548651'
- >>> ts.isoformat(timespec='microseconds')
- '2020-03-14T15:32:52.192548'
- """
- base_ts = "microseconds" if timespec == "nanoseconds" else timespec
- base = super(_Timestamp, self).isoformat(sep=sep, timespec=base_ts)
- # We need to replace the fake year 1970 with our real year
- base = f"{self.year:04d}-" + base.split("-", 1)[1]
-
- if self.nanosecond == 0 and timespec != "nanoseconds":
- return base
-
- if self.tzinfo is not None:
- base1, base2 = base[:-6], base[-6:]
- else:
- base1, base2 = base, ""
-
- if timespec == "nanoseconds" or (timespec == "auto" and self.nanosecond):
- if self.microsecond:
- base1 += f"{self.nanosecond:03d}"
- else:
- base1 += f".{self.nanosecond:09d}"
-
- return base1 + base2
-
- def __repr__(self) -> str:
- stamp = self._repr_base
- zone = None
-
- if self.tzinfo is not None:
- try:
- stamp += self.strftime("%z")
- except ValueError:
- year2000 = self.replace(year=2000)
- stamp += year2000.strftime("%z")
-
- zone = get_timezone(self.tzinfo)
- try:
- stamp += zone.strftime(" %%Z")
- except AttributeError:
- # e.g. tzlocal has no `strftime`
- pass
-
- tz = f", tz='{zone}'" if zone is not None else ""
-
- return f"Timestamp('{stamp}'{tz})"
-
- @property
- def _repr_base(self) -> str:
- return f"{self._date_repr} {self._time_repr}"
-
- @property
- def _date_repr(self) -> str:
- # Ideal here would be self.strftime("%Y-%m-%d"), but
- # the datetime strftime() methods require year >= 1900 and are slower
- return f"{self.year}-{self.month:02d}-{self.day:02d}"
-
- @property
- def _time_repr(self) -> str:
- result = f"{self.hour:02d}:{self.minute:02d}:{self.second:02d}"
-
- if self.nanosecond != 0:
- result += f".{self.nanosecond + 1000 * self.microsecond:09d}"
- elif self.microsecond != 0:
- result += f".{self.microsecond:06d}"
-
- return result
-
- @property
- def _short_repr(self) -> str:
- # format a Timestamp with only _date_repr if possible
- # otherwise _repr_base
- if (self.hour == 0 and
- self.minute == 0 and
- self.second == 0 and
- self.microsecond == 0 and
- self.nanosecond == 0):
- return self._date_repr
- return self._repr_base
-
- # -----------------------------------------------------------------
- # Conversion Methods
-
- @cython.cdivision(False)
- cdef _Timestamp _as_creso(self, NPY_DATETIMEUNIT creso, bint round_ok=True):
- cdef:
- int64_t value
-
- if creso == self._creso:
- return self
-
- try:
- value = convert_reso(self._value, self._creso, creso, round_ok=round_ok)
- except OverflowError as err:
- unit = npy_unit_to_abbrev(creso)
- raise OutOfBoundsDatetime(
- f"Cannot cast {self} to unit='{unit}' without overflow."
- ) from err
-
- return type(self)._from_value_and_reso(value, reso=creso, tz=self.tzinfo)
-
- def as_unit(self, str unit, bint round_ok=True):
- """
- Convert the underlying int64 representation to the given unit.
-
- Parameters
- ----------
- unit : {"ns", "us", "ms", "s"}
- round_ok : bool, default True
- If False and the conversion requires rounding, raise.
-
- Returns
- -------
- Timestamp
- """
- dtype = np.dtype(f"M8[{unit}]")
- reso = get_unit_from_dtype(dtype)
- try:
- return self._as_creso(reso, round_ok=round_ok)
- except OverflowError as err:
- raise OutOfBoundsDatetime(
- f"Cannot cast {self} to unit='{unit}' without overflow."
- ) from err
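A brief usage sketch for ``as_unit`` (illustrative only; assumes a pandas build that ships this method, i.e. the 2.x line):

    import pandas as pd

    ts = pd.Timestamp("2020-03-14 15:32:52.192548651")   # stored at nanosecond resolution
    ms = ts.as_unit("ms")                                 # re-represent at millisecond resolution
    print(ms)                                             # sub-millisecond digits are discarded
    # With round_ok=False, a conversion that would need rounding raises instead:
    # ts.as_unit("ms", round_ok=False)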
-
- @property
- def asm8(self) -> np.datetime64:
- """
- Return numpy datetime64 format in nanoseconds.
-
- Examples
- --------
- >>> ts = pd.Timestamp(2020, 3, 14, 15)
- >>> ts.asm8
- numpy.datetime64('2020-03-14T15:00:00.000000')
- """
- return self.to_datetime64()
-
- def timestamp(self):
- """
- Return POSIX timestamp as float.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548')
- >>> ts.timestamp()
- 1584199972.192548
- """
- # GH 17329
- # Note: Naive timestamps will not match datetime.stdlib
-
- denom = periods_per_second(self._creso)
-
- return round(self._value / denom, 6)
-
- cpdef datetime to_pydatetime(_Timestamp self, bint warn=True):
- """
- Convert a Timestamp object to a native Python datetime object.
-
- If warn=True, issue a warning if nanoseconds is nonzero.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548')
- >>> ts.to_pydatetime()
- datetime.datetime(2020, 3, 14, 15, 32, 52, 192548)
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.to_pydatetime()
- NaT
- """
- if self.nanosecond != 0 and warn:
- warnings.warn("Discarding nonzero nanoseconds in conversion.",
- UserWarning, stacklevel=find_stack_level())
-
- return datetime(self.year, self.month, self.day,
- self.hour, self.minute, self.second,
- self.microsecond, self.tzinfo, fold=self.fold)
-
- cpdef to_datetime64(self):
- """
- Return a numpy.datetime64 object with the same precision.
- """
- # TODO: find a way to construct dt64 directly from _reso
- abbrev = npy_unit_to_abbrev(self._creso)
- return np.datetime64(self._value, abbrev)
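A short sketch of what the conversion yields (illustrative, not from the deleted file); ``asm8`` above is an alias for the same call:

    import numpy as np
    import pandas as pd

    ts = pd.Timestamp("2020-03-14 15:32:52.192548651")
    dt64 = ts.to_datetime64()
    assert isinstance(dt64, np.datetime64)
    assert dt64 == np.datetime64("2020-03-14T15:32:52.192548651")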
-
- def to_numpy(self, dtype=None, copy=False) -> np.datetime64:
- """
- Convert the Timestamp to a NumPy datetime64.
-
- This is an alias method for `Timestamp.to_datetime64()`. The dtype and
- copy parameters are available here only for compatibility. Their values
- will not affect the return value.
-
- Returns
- -------
- numpy.datetime64
-
- See Also
- --------
- DatetimeIndex.to_numpy : Similar method for DatetimeIndex.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.to_numpy()
- numpy.datetime64('2020-03-14T15:32:52.192548651')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.to_numpy()
- numpy.datetime64('NaT')
- """
- if dtype is not None or copy is not False:
- raise ValueError(
- "Timestamp.to_numpy dtype and copy arguments are ignored."
- )
- return self.to_datetime64()
-
- def to_period(self, freq=None):
- """
- Return a period of which this timestamp is an observation.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> # Year end frequency
- >>> ts.to_period(freq='Y')
- Period('2020', 'A-DEC')
-
- >>> # Month end frequency
- >>> ts.to_period(freq='M')
- Period('2020-03', 'M')
-
- >>> # Weekly frequency
- >>> ts.to_period(freq='W')
- Period('2020-03-09/2020-03-15', 'W-SUN')
-
- >>> # Quarter end frequency
- >>> ts.to_period(freq='Q')
- Period('2020Q1', 'Q-DEC')
- """
- from pandas import Period
-
- if self.tz is not None:
- # GH#21333
- warnings.warn(
- "Converting to Period representation will drop timezone information.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
-
- return Period(self, freq=freq)
-
-
-# ----------------------------------------------------------------------
-
-# Python front end to C extension type _Timestamp
-# This serves as the box for datetime64
-
-
-class Timestamp(_Timestamp):
- """
- Pandas replacement for python datetime.datetime object.
-
- Timestamp is the pandas equivalent of python's ``datetime.datetime``
- and is interchangeable with it in most cases. It's the type used
- for the entries that make up a DatetimeIndex, and other
- timeseries-oriented data structures in pandas.
-
- Parameters
- ----------
- ts_input : datetime-like, str, int, float
- Value to be converted to Timestamp.
- year, month, day : int
- hour, minute, second, microsecond : int, optional, default 0
- tzinfo : datetime.tzinfo, optional, default None
- nanosecond : int, optional, default 0
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone for time which Timestamp will have.
- unit : str
- Unit used for conversion if ts_input is of type int or float. The
- valid values are 'D', 'h', 'm', 's', 'ms', 'us', and 'ns'. For
- example, 's' means seconds and 'ms' means milliseconds.
-
- For float inputs, the result will be stored in nanoseconds, and
- the unit attribute will be set as ``'ns'``.
- fold : {0, 1}, default None, keyword-only
- Due to daylight saving time, one wall clock time can occur twice
- when shifting from summer to winter time; fold describes whether the
- datetime-like corresponds to the first (0) or the second time (1)
- the wall clock hits the ambiguous time.
-
- .. versionadded:: 1.1.0
-
- Notes
- -----
- There are essentially three calling conventions for the constructor. The
- primary form accepts four parameters. They can be passed by position or
- keyword.
-
- The other two forms mimic the parameters from ``datetime.datetime``. They
- can be passed by either position or keyword, but not both mixed together.
-
- Examples
- --------
- Using the primary calling convention:
-
- This converts a datetime-like string
-
- >>> pd.Timestamp('2017-01-01T12')
- Timestamp('2017-01-01 12:00:00')
-
- This converts a float representing a Unix epoch in units of seconds
-
- >>> pd.Timestamp(1513393355.5, unit='s')
- Timestamp('2017-12-16 03:02:35.500000')
-
- This converts an int representing a Unix-epoch in units of seconds
- and for a particular timezone
-
- >>> pd.Timestamp(1513393355, unit='s', tz='US/Pacific')
- Timestamp('2017-12-15 19:02:35-0800', tz='US/Pacific')
-
- Using the other two forms that mimic the API for ``datetime.datetime``:
-
- >>> pd.Timestamp(2017, 1, 1, 12)
- Timestamp('2017-01-01 12:00:00')
-
- >>> pd.Timestamp(year=2017, month=1, day=1, hour=12)
- Timestamp('2017-01-01 12:00:00')
- """
-
- @classmethod
- def fromordinal(cls, ordinal, tz=None):
- """
- Construct a timestamp from a proleptic Gregorian ordinal.
-
- Parameters
- ----------
- ordinal : int
- Date corresponding to a proleptic Gregorian ordinal.
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone for the Timestamp.
-
- Notes
- -----
- By definition there cannot be any tz info on the ordinal itself.
-
- Examples
- --------
- >>> pd.Timestamp.fromordinal(737425)
- Timestamp('2020-01-01 00:00:00')
- """
- return cls(datetime.fromordinal(ordinal), tz=tz)
-
- @classmethod
- def now(cls, tz=None):
- """
- Return new Timestamp object representing current time local to tz.
-
- Parameters
- ----------
- tz : str or timezone object, default None
- Timezone to localize to.
-
- Examples
- --------
- >>> pd.Timestamp.now() # doctest: +SKIP
- Timestamp('2020-11-16 22:06:16.378782')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.now()
- NaT
- """
- if isinstance(tz, str):
- tz = maybe_get_tz(tz)
- return cls(datetime.now(tz))
-
- @classmethod
- def today(cls, tz=None):
- """
- Return the current time in the local timezone.
-
- This differs from datetime.today() in that it can be localized to a
- passed timezone.
-
- Parameters
- ----------
- tz : str or timezone object, default None
- Timezone to localize to.
-
- Examples
- --------
- >>> pd.Timestamp.today() # doctest: +SKIP
- Timestamp('2020-11-16 22:37:39.969883')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.today()
- NaT
- """
- return cls.now(tz)
-
- @classmethod
- def utcnow(cls):
- """
- Timestamp.utcnow()
-
- Return a new Timestamp representing UTC day and time.
-
- Examples
- --------
- >>> pd.Timestamp.utcnow() # doctest: +SKIP
- Timestamp('2020-11-16 22:50:18.092888+0000', tz='UTC')
- """
- return cls.now(UTC)
-
- @classmethod
- def utcfromtimestamp(cls, ts):
- """
- Timestamp.utcfromtimestamp(ts)
-
- Construct a timezone-aware UTC datetime from a POSIX timestamp.
-
- Notes
- -----
- Timestamp.utcfromtimestamp behavior differs from datetime.utcfromtimestamp
- in returning a timezone-aware object.
-
- Examples
- --------
- >>> pd.Timestamp.utcfromtimestamp(1584199972)
- Timestamp('2020-03-14 15:32:52+0000', tz='UTC')
- """
- # GH#22451
- return cls.fromtimestamp(ts, tz="UTC")
-
- @classmethod
- def fromtimestamp(cls, ts, tz=None):
- """
- Timestamp.fromtimestamp(ts)
-
- Transform timestamp[, tz] to tz's local time from POSIX timestamp.
-
- Examples
- --------
- >>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP
- Timestamp('2020-03-14 15:32:52')
-
- Note that the output may change depending on your local time.
- """
- tz = maybe_get_tz(tz)
- return cls(datetime.fromtimestamp(ts, tz))
-
- def strftime(self, format):
- """
- Return a formatted string of the Timestamp.
-
- Parameters
- ----------
- format : str
- Format string to convert Timestamp to string.
- See strftime documentation for more information on the format string:
- https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts.strftime('%Y-%m-%d %X')
- '2020-03-14 15:32:52'
- """
- try:
- _dt = datetime(self.year, self.month, self.day,
- self.hour, self.minute, self.second,
- self.microsecond, self.tzinfo, fold=self.fold)
- except ValueError as err:
- raise NotImplementedError(
- "strftime not yet supported on Timestamps which "
- "are outside the range of Python's standard library. "
- "For now, please call the components you need (such as `.year` "
- "and `.month`) and construct your string from there."
- ) from err
- return _dt.strftime(format)
-
- # Issue 25016.
- @classmethod
- def strptime(cls, date_string, format):
- """
- Timestamp.strptime(string, format)
-
- Function is not implemented. Use pd.to_datetime().
- """
- raise NotImplementedError(
- "Timestamp.strptime() is not implemented. "
- "Use to_datetime() to parse date strings."
- )
-
- @classmethod
- def combine(cls, date, time):
- """
- Timestamp.combine(date, time)
-
- Combine date, time into datetime with same date and time fields.
-
- Examples
- --------
- >>> from datetime import date, time
- >>> pd.Timestamp.combine(date(2020, 3, 14), time(15, 30, 15))
- Timestamp('2020-03-14 15:30:15')
- """
- return cls(datetime.combine(date, time))
-
- def __new__(
- cls,
- object ts_input=_no_input,
- year=None,
- month=None,
- day=None,
- hour=None,
- minute=None,
- second=None,
- microsecond=None,
- tzinfo_type tzinfo=None,
- *,
- nanosecond=None,
- tz=None,
- unit=None,
- fold=None,
- ):
- # The parameter list folds together legacy parameter names (the first
- # four) and positional and keyword parameter names from pydatetime.
- #
- # There are three calling forms:
- #
- # - In the legacy form, the first parameter, ts_input, is required
- # and may be datetime-like, str, int, or float. The second
- # parameter, offset, is optional and may be str or DateOffset.
- #
- # - ints in the first, second, and third arguments indicate
- # pydatetime positional arguments. Only the first 8 arguments
- # (standing in for year, month, day, hour, minute, second,
- # microsecond, tzinfo) may be non-None. As a shortcut, we just
- # check that the second argument is an int.
- #
- # - Nones for the first four (legacy) arguments indicate pydatetime
- # keyword arguments. year, month, and day are required. As a
- # shortcut, we just check that the first argument was not passed.
- #
- # Mixing pydatetime positional and keyword arguments is forbidden!
-
- cdef:
- _TSObject ts
- tzinfo_type tzobj
-
- _date_attributes = [year, month, day, hour, minute, second,
- microsecond, nanosecond]
-
- if tzinfo is not None:
- # GH#17690 tzinfo must be a datetime.tzinfo object, ensured
- # by the cython annotation.
- if tz is not None:
- raise ValueError("Can provide at most one of tz, tzinfo")
-
- # User passed tzinfo instead of tz; avoid silently ignoring
- tz, tzinfo = tzinfo, None
-
- # Allow fold only for unambiguous input
- if fold is not None:
- if fold not in [0, 1]:
- raise ValueError(
- "Valid values for the fold argument are None, 0, or 1."
- )
-
- if (ts_input is not _no_input and not (
- PyDateTime_Check(ts_input) and
- getattr(ts_input, "tzinfo", None) is None)):
- raise ValueError(
- "Cannot pass fold with possibly unambiguous input: int, "
- "float, numpy.datetime64, str, or timezone-aware "
- "datetime-like. Pass naive datetime-like or build "
- "Timestamp from components."
- )
-
- if tz is not None and PyTZInfo_Check(tz) and treat_tz_as_pytz(tz):
- raise ValueError(
- "pytz timezones do not support fold. Please use dateutil "
- "timezones."
- )
-
- if hasattr(ts_input, "fold"):
- ts_input = ts_input.replace(fold=fold)
-
- # GH 30543 if pd.Timestamp already passed, return it
- # check that only ts_input is passed
- # checking verbosely, because cython doesn't optimize
- # list comprehensions (as of cython 0.29.x)
- if (isinstance(ts_input, _Timestamp) and
- tz is None and unit is None and year is None and
- month is None and day is None and hour is None and
- minute is None and second is None and
- microsecond is None and nanosecond is None and
- tzinfo is None):
- return ts_input
- elif isinstance(ts_input, str):
- # User passed a date string to parse.
- # Check that the user didn't also pass a date attribute kwarg.
- if any(arg is not None for arg in _date_attributes):
- raise ValueError(
- "Cannot pass a date attribute keyword "
- "argument when passing a date string; 'tz' is keyword-only"
- )
-
- elif ts_input is _no_input:
- # GH 31200
- # When year, month or day is not given, we call the datetime
- # constructor to make sure we get the same error message
- # since Timestamp inherits datetime
- datetime_kwargs = {
- "hour": hour or 0,
- "minute": minute or 0,
- "second": second or 0,
- "microsecond": microsecond or 0,
- "fold": fold or 0
- }
- if year is not None:
- datetime_kwargs["year"] = year
- if month is not None:
- datetime_kwargs["month"] = month
- if day is not None:
- datetime_kwargs["day"] = day
-
- ts_input = datetime(**datetime_kwargs)
-
- elif is_integer_object(year):
- # User passed positional arguments:
- # Timestamp(year, month, day[, hour[, minute[, second[,
- # microsecond[, tzinfo]]]]])
- ts_input = datetime(ts_input, year, month, day or 0,
- hour or 0, minute or 0, second or 0, fold=fold or 0)
- unit = None
-
- if getattr(ts_input, "tzinfo", None) is not None and tz is not None:
- raise ValueError("Cannot pass a datetime or Timestamp with tzinfo with "
- "the tz parameter. Use tz_convert instead.")
-
- tzobj = maybe_get_tz(tz)
- if tzobj is not None and is_datetime64_object(ts_input):
- # GH#24559, GH#42288 As of 2.0 we treat datetime64 as
- # wall-time (consistent with DatetimeIndex)
- return cls(ts_input).tz_localize(tzobj)
-
- if nanosecond is None:
- nanosecond = 0
- elif not (999 >= nanosecond >= 0):
- raise ValueError("nanosecond must be in 0..999")
-
- ts = convert_to_tsobject(ts_input, tzobj, unit, 0, 0, nanosecond)
-
- if ts.value == NPY_NAT:
- return NaT
-
- return create_timestamp_from_ts(ts.value, ts.dts, ts.tzinfo, ts.fold, ts.creso)
-
- def _round(self, freq, mode, ambiguous="raise", nonexistent="raise"):
- cdef:
- int64_t nanos
-
- freq = to_offset(freq)
- freq.nanos # raises on non-fixed freq
- nanos = delta_to_nanoseconds(freq, self._creso)
- if nanos == 0:
- if freq.nanos == 0:
- raise ValueError("Division by zero in rounding")
-
- # e.g. self.unit == "s" and sub-second freq
- return self
-
- # TODO: problem if nanos==0
-
- if self.tz is not None:
- value = self.tz_localize(None)._value
- else:
- value = self._value
-
- value = np.array([value], dtype=np.int64)
-
- # Will only ever contain 1 element for timestamp
- r = round_nsint64(value, mode, nanos)[0]
- result = Timestamp._from_value_and_reso(r, self._creso, None)
- if self.tz is not None:
- result = result.tz_localize(
- self.tz, ambiguous=ambiguous, nonexistent=nonexistent
- )
- return result
-
- def round(self, freq, ambiguous="raise", nonexistent="raise"):
- """
- Round the Timestamp to the specified resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the rounding resolution.
- ambiguous : bool or {'raise', 'NaT'}, default 'raise'
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
-timedelta}, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Returns
- -------
- a new Timestamp rounded to the given resolution of `freq`
-
- Raises
- ------
- ValueError if the freq cannot be converted
-
- Notes
- -----
- If the Timestamp has a timezone, rounding will take place relative to the
- local ("wall") time and re-localized to the same timezone. When rounding
- near daylight savings time, use ``nonexistent`` and ``ambiguous`` to
- control the re-localization behavior.
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
-
- A timestamp can be rounded using multiple frequency units:
-
- >>> ts.round(freq='H') # hour
- Timestamp('2020-03-14 16:00:00')
-
- >>> ts.round(freq='T') # minute
- Timestamp('2020-03-14 15:33:00')
-
- >>> ts.round(freq='S') # seconds
- Timestamp('2020-03-14 15:32:52')
-
- >>> ts.round(freq='L') # milliseconds
- Timestamp('2020-03-14 15:32:52.193000')
-
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
-
- >>> ts.round(freq='5T')
- Timestamp('2020-03-14 15:35:00')
-
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
-
- >>> ts.round(freq='1H30T')
- Timestamp('2020-03-14 15:00:00')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.round()
- NaT
-
- When rounding near a daylight savings time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam")
-
- >>> ts_tz.round("H", ambiguous=False)
- Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
-
- >>> ts_tz.round("H", ambiguous=True)
- Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
- """
- return self._round(
- freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent
- )
-
- def floor(self, freq, ambiguous="raise", nonexistent="raise"):
- """
- Return a new Timestamp floored to this resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the flooring resolution.
- ambiguous : bool or {'raise', 'NaT'}, default 'raise'
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
-timedelta}, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Raises
- ------
- ValueError if the freq cannot be converted.
-
- Notes
- -----
- If the Timestamp has a timezone, flooring will take place relative to the
- local ("wall") time and re-localized to the same timezone. When flooring
- near daylight savings time, use ``nonexistent`` and ``ambiguous`` to
- control the re-localization behavior.
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
-
- A timestamp can be floored using multiple frequency units:
-
- >>> ts.floor(freq='H') # hour
- Timestamp('2020-03-14 15:00:00')
-
- >>> ts.floor(freq='T') # minute
- Timestamp('2020-03-14 15:32:00')
-
- >>> ts.floor(freq='S') # seconds
- Timestamp('2020-03-14 15:32:52')
-
- >>> ts.floor(freq='N') # nanoseconds
- Timestamp('2020-03-14 15:32:52.192548651')
-
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
-
- >>> ts.floor(freq='5T')
- Timestamp('2020-03-14 15:30:00')
-
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
-
- >>> ts.floor(freq='1H30T')
- Timestamp('2020-03-14 15:00:00')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.floor()
- NaT
-
- When rounding near a daylight savings time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> ts_tz = pd.Timestamp("2021-10-31 03:30:00").tz_localize("Europe/Amsterdam")
-
- >>> ts_tz.floor("2H", ambiguous=False)
- Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
-
- >>> ts_tz.floor("2H", ambiguous=True)
- Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
- """
- return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent)
-
- def ceil(self, freq, ambiguous="raise", nonexistent="raise"):
- """
- Return a new Timestamp ceiled to this resolution.
-
- Parameters
- ----------
- freq : str
- Frequency string indicating the ceiling resolution.
- ambiguous : bool or {'raise', 'NaT'}, default 'raise'
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : {'raise', 'shift_forward', 'shift_backward', 'NaT', \
-timedelta}, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Raises
- ------
- ValueError if the freq cannot be converted.
-
- Notes
- -----
- If the Timestamp has a timezone, ceiling will take place relative to the
- local ("wall") time and re-localized to the same timezone. When ceiling
- near daylight savings time, use ``nonexistent`` and ``ambiguous`` to
- control the re-localization behavior.
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
-
- A timestamp can be ceiled using multiple frequency units:
-
- >>> ts.ceil(freq='H') # hour
- Timestamp('2020-03-14 16:00:00')
-
- >>> ts.ceil(freq='T') # minute
- Timestamp('2020-03-14 15:33:00')
-
- >>> ts.ceil(freq='S') # seconds
- Timestamp('2020-03-14 15:32:53')
-
- >>> ts.ceil(freq='U') # microseconds
- Timestamp('2020-03-14 15:32:52.192549')
-
- ``freq`` can also be a multiple of a single unit, like '5T' (i.e. 5 minutes):
-
- >>> ts.ceil(freq='5T')
- Timestamp('2020-03-14 15:35:00')
-
- or a combination of multiple units, like '1H30T' (i.e. 1 hour and 30 minutes):
-
- >>> ts.ceil(freq='1H30T')
- Timestamp('2020-03-14 16:30:00')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.ceil()
- NaT
-
- When rounding near a daylight savings time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> ts_tz = pd.Timestamp("2021-10-31 01:30:00").tz_localize("Europe/Amsterdam")
-
- >>> ts_tz.ceil("H", ambiguous=False)
- Timestamp('2021-10-31 02:00:00+0100', tz='Europe/Amsterdam')
-
- >>> ts_tz.ceil("H", ambiguous=True)
- Timestamp('2021-10-31 02:00:00+0200', tz='Europe/Amsterdam')
- """
- return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent)
-
- @property
- def tz(self):
- """
- Alias for tzinfo.
-
- Examples
- --------
- >>> ts = pd.Timestamp(1584226800, unit='s', tz='Europe/Stockholm')
- >>> ts.tz
- <DstTzInfo 'Europe/Stockholm' CET+1:00:00 STD>
- """
- return self.tzinfo
-
- @tz.setter
- def tz(self, value):
- # GH 3746: Prevent localizing or converting the index by setting tz
- raise AttributeError(
- "Cannot directly set timezone. "
- "Use tz_localize() or tz_convert() as appropriate"
- )
-
- def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"):
- """
- Localize the Timestamp to a timezone.
-
- Convert naive Timestamp to local time zone or remove
- timezone from timezone-aware Timestamp.
-
- Parameters
- ----------
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone for time which Timestamp will be converted to.
- None will remove timezone holding local time.
-
- ambiguous : bool, 'NaT', default 'raise'
- When clocks moved backward due to DST, ambiguous times may arise.
- For example in Central European Time (UTC+01), when going from
- 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
- 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
- `ambiguous` parameter dictates how ambiguous times should be
- handled.
-
- The behavior is as follows:
-
- * bool contains flags to determine if time is dst or not (note
- that this flag is only applicable for ambiguous fall dst dates).
- * 'NaT' will return NaT for an ambiguous time.
- * 'raise' will raise an AmbiguousTimeError for an ambiguous time.
-
- nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \
-default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- The behavior is as follows:
-
- * 'shift_forward' will shift the nonexistent time forward to the
- closest existing time.
- * 'shift_backward' will shift the nonexistent time backward to the
- closest existing time.
- * 'NaT' will return NaT where there are nonexistent times.
- * timedelta objects will shift nonexistent times by the timedelta.
- * 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Returns
- -------
- localized : Timestamp
-
- Raises
- ------
- TypeError
- If the Timestamp is tz-aware and tz is not None.
-
- Examples
- --------
- Create a naive timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651')
- >>> ts
- Timestamp('2020-03-14 15:32:52.192548651')
-
- Add 'Europe/Stockholm' as timezone:
-
- >>> ts.tz_localize(tz='Europe/Stockholm')
- Timestamp('2020-03-14 15:32:52.192548651+0100', tz='Europe/Stockholm')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.tz_localize()
- NaT
- """
- if not isinstance(ambiguous, bool) and ambiguous not in {"NaT", "raise"}:
- raise ValueError(
- "'ambiguous' parameter must be one of: "
- "True, False, 'NaT', 'raise' (default)"
- )
-
- nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
- if nonexistent not in nonexistent_options and not PyDelta_Check(nonexistent):
- raise ValueError(
- "The nonexistent argument must be one of 'raise', "
- "'NaT', 'shift_forward', 'shift_backward' or a timedelta object"
- )
-
- if self.tzinfo is None:
- # tz naive, localize
- tz = maybe_get_tz(tz)
- if not isinstance(ambiguous, str):
- ambiguous = [ambiguous]
- value = tz_localize_to_utc_single(self._value, tz,
- ambiguous=ambiguous,
- nonexistent=nonexistent,
- creso=self._creso)
- elif tz is None:
- # reset tz
- value = tz_convert_from_utc_single(self._value, self.tz, creso=self._creso)
-
- else:
- raise TypeError(
- "Cannot localize tz-aware Timestamp, use tz_convert for conversions"
- )
-
- out = type(self)._from_value_and_reso(value, self._creso, tz=tz)
- return out
-
- def tz_convert(self, tz):
- """
- Convert timezone-aware Timestamp to another time zone.
-
- Parameters
- ----------
- tz : str, pytz.timezone, dateutil.tz.tzfile or None
- Time zone for time which Timestamp will be converted to.
- None will remove timezone holding UTC time.
-
- Returns
- -------
- converted : Timestamp
-
- Raises
- ------
- TypeError
- If Timestamp is tz-naive.
-
- Examples
- --------
- Create a timestamp object with UTC timezone:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC')
- >>> ts
- Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC')
-
- Change to Tokyo timezone:
-
- >>> ts.tz_convert(tz='Asia/Tokyo')
- Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo')
-
- Can also use ``astimezone``:
-
- >>> ts.astimezone(tz='Asia/Tokyo')
- Timestamp('2020-03-15 00:32:52.192548651+0900', tz='Asia/Tokyo')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.tz_convert(tz='Asia/Tokyo')
- NaT
- """
- if self.tzinfo is None:
- # tz naive, use tz_localize
- raise TypeError(
- "Cannot convert tz-naive Timestamp, use tz_localize to localize"
- )
- else:
- # Same UTC timestamp, different time zone
- tz = maybe_get_tz(tz)
- out = type(self)._from_value_and_reso(self._value, reso=self._creso, tz=tz)
- return out
-
- astimezone = tz_convert
-
- def replace(
- self,
- year=None,
- month=None,
- day=None,
- hour=None,
- minute=None,
- second=None,
- microsecond=None,
- nanosecond=None,
- tzinfo=object,
- fold=None,
- ):
- """
- Implements datetime.replace, handles nanoseconds.
-
- Parameters
- ----------
- year : int, optional
- month : int, optional
- day : int, optional
- hour : int, optional
- minute : int, optional
- second : int, optional
- microsecond : int, optional
- nanosecond : int, optional
- tzinfo : tz-convertible, optional
- fold : int, optional
-
- Returns
- -------
- Timestamp with fields replaced
-
- Examples
- --------
- Create a timestamp object:
-
- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651', tz='UTC')
- >>> ts
- Timestamp('2020-03-14 15:32:52.192548651+0000', tz='UTC')
-
- Replace year and the hour:
-
- >>> ts.replace(year=1999, hour=10)
- Timestamp('1999-03-14 10:32:52.192548651+0000', tz='UTC')
-
- Replace timezone (not a conversion):
-
- >>> import pytz
- >>> ts.replace(tzinfo=pytz.timezone('US/Pacific'))
- Timestamp('2020-03-14 15:32:52.192548651-0700', tz='US/Pacific')
-
- Analogous for ``pd.NaT``:
-
- >>> pd.NaT.replace(tzinfo=pytz.timezone('US/Pacific'))
- NaT
- """
-
- cdef:
- npy_datetimestruct dts
- int64_t value
- object k, v
- datetime ts_input
- tzinfo_type tzobj
- _TSObject ts
-
- # set to naive if needed
- tzobj = self.tzinfo
- value = self._value
-
- # GH 37610. Preserve fold when replacing.
- if fold is None:
- fold = self.fold
-
- if tzobj is not None:
- value = tz_convert_from_utc_single(value, tzobj, creso=self._creso)
-
- # setup components
- pandas_datetime_to_datetimestruct(value, self._creso, &dts)
- dts.ps = self.nanosecond * 1000
-
- # replace
- def validate(k, v):
- """ validate integers """
- if not is_integer_object(v):
- raise ValueError(
- f"value must be an integer, received {type(v)} for {k}"
- )
- return v
-
- if year is not None:
- dts.year = validate("year", year)
- if month is not None:
- dts.month = validate("month", month)
- if day is not None:
- dts.day = validate("day", day)
- if hour is not None:
- dts.hour = validate("hour", hour)
- if minute is not None:
- dts.min = validate("minute", minute)
- if second is not None:
- dts.sec = validate("second", second)
- if microsecond is not None:
- dts.us = validate("microsecond", microsecond)
- if nanosecond is not None:
- dts.ps = validate("nanosecond", nanosecond) * 1000
- if tzinfo is not object:
- tzobj = tzinfo
-
- # reconstruct & check bounds
- if tzobj is None:
- # We can avoid going through pydatetime paths, which is robust
- # to datetimes outside of pydatetime range.
- ts = _TSObject()
- check_dts_bounds(&dts, self._creso)
- ts.value = npy_datetimestruct_to_datetime(self._creso, &dts)
- ts.dts = dts
- ts.creso = self._creso
- ts.fold = fold
- return create_timestamp_from_ts(
- ts.value, dts, tzobj, fold, reso=self._creso
- )
-
- elif tzobj is not None and treat_tz_as_pytz(tzobj):
- # replacing across a DST boundary may induce a new tzinfo object
- # see GH#18319
- ts_input = tzobj.localize(datetime(dts.year, dts.month, dts.day,
- dts.hour, dts.min, dts.sec,
- dts.us),
- is_dst=not bool(fold))
- tzobj = ts_input.tzinfo
- else:
- kwargs = {"year": dts.year, "month": dts.month, "day": dts.day,
- "hour": dts.hour, "minute": dts.min, "second": dts.sec,
- "microsecond": dts.us, "tzinfo": tzobj,
- "fold": fold}
- ts_input = datetime(**kwargs)
-
- ts = convert_datetime_to_tsobject(
- ts_input, tzobj, nanos=dts.ps // 1000, reso=self._creso
- )
- return create_timestamp_from_ts(
- ts.value, dts, tzobj, fold, reso=self._creso
- )
-
- def to_julian_date(self) -> np.float64:
- """
- Convert the Timestamp to a Julian date.
-
- Julian date 0 is noon on January 1, 4713 BC.
-
- Examples
- --------
- >>> ts = pd.Timestamp('2020-03-14T15:32:52')
- >>> ts.to_julian_date()
- 2458923.147824074
- """
- year = self.year
- month = self.month
- day = self.day
- if month <= 2:
- year -= 1
- month += 12
- return (day +
- np.fix((153 * month - 457) / 5) +
- 365 * year +
- np.floor(year / 4) -
- np.floor(year / 100) +
- np.floor(year / 400) +
- 1721118.5 +
- (self.hour +
- self.minute / 60.0 +
- self.second / 3600.0 +
- self.microsecond / 3600.0 / 1e+6 +
- self.nanosecond / 3600.0 / 1e+9
- ) / 24.0)
-
- def isoweekday(self):
- """
- Return the day of the week represented by the date.
-
- Monday == 1 ... Sunday == 7.
- """
- # same as super().isoweekday(), but that breaks because of how
- # we have overridden year, see note in create_timestamp_from_ts
- return self.weekday() + 1
-
- def weekday(self):
- """
- Return the day of the week represented by the date.
-
- Monday == 0 ... Sunday == 6.
- """
- # same as super().weekday(), but that breaks because of how
- # we have overridden year, see note in create_timestamp_from_ts
- return ccalendar.dayofweek(self.year, self.month, self.day)
-
-
-# Aliases
-Timestamp.weekofyear = Timestamp.week
-Timestamp.daysinmonth = Timestamp.days_in_month
-
-
-# ----------------------------------------------------------------------
-# Scalar analogues to functions in vectorized.pyx
-
-
-@cython.cdivision(False)
-cdef int64_t normalize_i8_stamp(int64_t local_val, int64_t ppd) nogil:
- """
- Round the localized nanosecond timestamp down to the previous midnight.
-
- Parameters
- ----------
- local_val : int64_t
- ppd : int64_t
- Periods per day in the Timestamp's resolution.
-
- Returns
- -------
- int64_t
- """
- return local_val - (local_val % ppd)
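The normalization above is plain modular arithmetic on the epoch value. A pure-Python sketch of the same calculation (illustrative only, not the Cython implementation):

    def normalize_i8_stamp(local_val: int, ppd: int) -> int:
        # Drop whatever part of the stamp falls after the most recent midnight.
        return local_val - (local_val % ppd)

    PPD_NS = 24 * 60 * 60 * 1_000_000_000        # periods per day at nanosecond resolution
    stamp = 1_584_199_972_192_548_651            # 2020-03-14 15:32:52.192548651 in ns since epoch
    assert normalize_i8_stamp(stamp, PPD_NS) == 1_584_144_000_000_000_000   # midnight that day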
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pxd
deleted file mode 100644
index 5629deb965a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pxd
+++ /dev/null
@@ -1,23 +0,0 @@
-from cpython.datetime cimport (
- datetime,
- timedelta,
- tzinfo,
-)
-
-
-cdef tzinfo utc_stdlib
-
-cpdef bint is_utc(tzinfo tz)
-cdef bint is_tzlocal(tzinfo tz)
-cdef bint is_zoneinfo(tzinfo tz)
-
-cdef bint treat_tz_as_pytz(tzinfo tz)
-
-cpdef bint tz_compare(tzinfo start, tzinfo end)
-cpdef object get_timezone(tzinfo tz)
-cpdef tzinfo maybe_get_tz(object tz)
-
-cdef timedelta get_utcoffset(tzinfo tz, datetime obj)
-cpdef bint is_fixed_offset(tzinfo tz)
-
-cdef object get_dst_info(tzinfo tz)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyi
deleted file mode 100644
index 4e9f0c6ae6c..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyi
+++ /dev/null
@@ -1,21 +0,0 @@
-from datetime import (
- datetime,
- tzinfo,
-)
-from typing import Callable
-
-import numpy as np
-
-# imported from dateutil.tz
-dateutil_gettz: Callable[[str], tzinfo]
-
-def tz_standardize(tz: tzinfo) -> tzinfo: ...
-def tz_compare(start: tzinfo | None, end: tzinfo | None) -> bool: ...
-def infer_tzinfo(
- start: datetime | None,
- end: datetime | None,
-) -> tzinfo | None: ...
-def maybe_get_tz(tz: str | int | np.int64 | tzinfo | None) -> tzinfo | None: ...
-def get_timezone(tz: tzinfo) -> tzinfo | str: ...
-def is_utc(tz: tzinfo | None) -> bool: ...
-def is_fixed_offset(tz: tzinfo) -> bool: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyx
deleted file mode 100644
index 6105f96a3b1..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/timezones.pyx
+++ /dev/null
@@ -1,448 +0,0 @@
-from datetime import (
- timedelta,
- timezone,
-)
-
-from pandas.compat._optional import import_optional_dependency
-
-try:
- # py39+
- import zoneinfo
- from zoneinfo import ZoneInfo
-except ImportError:
- zoneinfo = None
- ZoneInfo = None
-
-from cpython.datetime cimport (
- datetime,
- timedelta,
- tzinfo,
-)
-
-# dateutil compat
-
-from dateutil.tz import (
- gettz as dateutil_gettz,
- tzfile as _dateutil_tzfile,
- tzlocal as _dateutil_tzlocal,
- tzutc as _dateutil_tzutc,
-)
-import numpy as np
-import pytz
-from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo
-
-cimport numpy as cnp
-from numpy cimport int64_t
-
-cnp.import_array()
-
-# ----------------------------------------------------------------------
-from pandas._libs.tslibs.util cimport (
- get_nat,
- is_integer_object,
-)
-
-
-cdef int64_t NPY_NAT = get_nat()
-cdef tzinfo utc_stdlib = timezone.utc
-cdef tzinfo utc_pytz = pytz.utc
-cdef tzinfo utc_dateutil_str = dateutil_gettz("UTC") # NB: *not* the same as tzutc()
-
-cdef tzinfo utc_zoneinfo = None
-
-
-# ----------------------------------------------------------------------
-
-cdef bint is_utc_zoneinfo(tzinfo tz):
- # Workaround for cases with missing tzdata
- # https://github.com/pandas-dev/pandas/pull/46425#discussion_r830633025
- if tz is None or zoneinfo is None:
- return False
-
- global utc_zoneinfo
- if utc_zoneinfo is None:
- try:
- utc_zoneinfo = ZoneInfo("UTC")
- except zoneinfo.ZoneInfoNotFoundError:
- return False
- # Warn if tzdata is too old, even if there is a system tzdata to alert
- # users about the mismatch between local/system tzdata
- import_optional_dependency("tzdata", errors="warn", min_version="2022.1")
-
- return tz is utc_zoneinfo
-
-
-cpdef inline bint is_utc(tzinfo tz):
- return (
- tz is utc_pytz
- or tz is utc_stdlib
- or isinstance(tz, _dateutil_tzutc)
- or tz is utc_dateutil_str
- or is_utc_zoneinfo(tz)
- )
-
-
-cdef bint is_zoneinfo(tzinfo tz):
- if ZoneInfo is None:
- return False
- return isinstance(tz, ZoneInfo)
-
-
-cdef bint is_tzlocal(tzinfo tz):
- return isinstance(tz, _dateutil_tzlocal)
-
-
-cdef bint treat_tz_as_pytz(tzinfo tz):
- return (hasattr(tz, "_utc_transition_times") and
- hasattr(tz, "_transition_info"))
-
-
-cdef bint treat_tz_as_dateutil(tzinfo tz):
- return hasattr(tz, "_trans_list") and hasattr(tz, "_trans_idx")
-
-
-# Returns str or tzinfo object
-cpdef inline object get_timezone(tzinfo tz):
- """
- We need to do several things here:
- 1) Distinguish between pytz and dateutil timezones
- 2) Not be over-specific (e.g. US/Eastern with/without DST is the same *zone*
- but a different tz object)
- 3) Provide something to serialize when we're storing a datetime object
- in pytables.
-
- We return a string prefaced with dateutil if it's a dateutil tz, else just
- the tz name. It needs to be a string so that we can serialize it with
- UJSON/pytables. maybe_get_tz (below) is the inverse of this process.
- """
- if tz is None:
- raise TypeError("tz argument cannot be None")
- if is_utc(tz):
- return tz
- else:
- if treat_tz_as_dateutil(tz):
- if ".tar.gz" in tz._filename:
- raise ValueError(
- "Bad tz filename. Dateutil on python 3 on windows has a "
- "bug which causes tzfile._filename to be the same for all "
- "timezone files. Please construct dateutil timezones "
- 'implicitly by passing a string like "dateutil/Europe'
- '/London" when you construct your pandas objects instead '
- "of passing a timezone object. See "
- "https://github.com/pandas-dev/pandas/pull/7362")
- return "dateutil/" + tz._filename
- else:
- # tz is a pytz timezone or unknown.
- try:
- zone = tz.zone
- if zone is None:
- return tz
- return zone
- except AttributeError:
- return tz
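A small usage sketch of the contract described in the docstring (illustrative; it imports the private pandas extension module and assumes an environment where it is still available):

    import pytz
    from dateutil.tz import gettz
    from pandas._libs.tslibs.timezones import get_timezone

    assert get_timezone(pytz.timezone("Europe/Paris")) == "Europe/Paris"   # pytz -> zone name
    # dateutil zones come back prefixed with "dateutil/" so maybe_get_tz can reconstruct them,
    # e.g. 'dateutil//usr/share/zoneinfo/Europe/Paris' on a typical Linux install:
    print(get_timezone(gettz("Europe/Paris")))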
-
-
-cpdef inline tzinfo maybe_get_tz(object tz):
- """
- (Maybe) Construct a timezone object from a string. If tz is a string, use
- it to construct a timezone object. Otherwise, just return tz.
- """
- if isinstance(tz, str):
- if tz == "tzlocal()":
- tz = _dateutil_tzlocal()
- elif tz.startswith("dateutil/"):
- zone = tz[9:]
- tz = dateutil_gettz(zone)
- # On Python 3 on Windows, the filename is not always set correctly.
- if isinstance(tz, _dateutil_tzfile) and ".tar.gz" in tz._filename:
- tz._filename = zone
- elif tz[0] in {"-", "+"}:
- hours = int(tz[0:3])
- minutes = int(tz[0] + tz[4:6])
- tz = timezone(timedelta(hours=hours, minutes=minutes))
- elif tz[0:4] in {"UTC-", "UTC+"}:
- hours = int(tz[3:6])
- minutes = int(tz[3] + tz[7:9])
- tz = timezone(timedelta(hours=hours, minutes=minutes))
- elif tz == "UTC" or tz == "utc":
- tz = utc_stdlib
- else:
- tz = pytz.timezone(tz)
- elif is_integer_object(tz):
- tz = timezone(timedelta(seconds=tz))
- elif isinstance(tz, tzinfo):
- pass
- elif tz is None:
- pass
- else:
- raise TypeError(type(tz))
- return tz
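The fixed-offset branches above reuse the leading sign for the minutes field. A standalone sketch of that parsing (illustrative only, not the cdef function itself):

    from datetime import timedelta, timezone

    def parse_fixed_offset(tz: str) -> timezone:
        # Mirrors the "+HH:MM" / "-HH:MM" and "UTC+HH:MM" / "UTC-HH:MM" branches above.
        if tz[0] in {"-", "+"}:
            hours = int(tz[0:3])               # "+01" -> 1, "-05" -> -5
            minutes = int(tz[0] + tz[4:6])     # reuse the sign so minutes point the same way
        elif tz[0:4] in {"UTC-", "UTC+"}:
            hours = int(tz[3:6])
            minutes = int(tz[3] + tz[7:9])
        else:
            raise ValueError(f"not a fixed-offset string: {tz!r}")
        return timezone(timedelta(hours=hours, minutes=minutes))

    assert parse_fixed_offset("-05:30") == timezone(timedelta(hours=-5, minutes=-30))
    assert parse_fixed_offset("UTC+01:00") == timezone(timedelta(hours=1))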
-
-
-def _p_tz_cache_key(tz: tzinfo):
- """
- Python interface for cache function to facilitate testing.
- """
- return tz_cache_key(tz)
-
-
-# Timezone data caches, key is the pytz string or dateutil file name.
-dst_cache = {}
-
-
-cdef object tz_cache_key(tzinfo tz):
- """
- Return the key in the cache for the timezone info object or None
- if unknown.
-
- The key is currently the tz string for pytz timezones, the filename for
- dateutil timezones.
-
- Notes
- -----
- This cannot just be the hash of a timezone object. Unfortunately, the
- hashes of two dateutil tz objects which represent the same timezone are
- not equal (even though the tz objects will compare equal and represent
- the same tz file). Also, pytz objects are not always hashable so we use
- str(tz) instead.
- """
- if isinstance(tz, _pytz_BaseTzInfo):
- return tz.zone
- elif isinstance(tz, _dateutil_tzfile):
- if ".tar.gz" in tz._filename:
- raise ValueError("Bad tz filename. Dateutil on python 3 on "
- "windows has a bug which causes tzfile._filename "
- "to be the same for all timezone files. Please "
- "construct dateutil timezones implicitly by "
- 'passing a string like "dateutil/Europe/London" '
- "when you construct your pandas objects instead "
- "of passing a timezone object. See "
- "https://github.com/pandas-dev/pandas/pull/7362")
- return "dateutil" + tz._filename
- else:
- return None
-
-
-# ----------------------------------------------------------------------
-# UTC Offsets
-
-
-cdef timedelta get_utcoffset(tzinfo tz, datetime obj):
- try:
- return tz._utcoffset
- except AttributeError:
- return tz.utcoffset(obj)
-
-
-cpdef inline bint is_fixed_offset(tzinfo tz):
- if treat_tz_as_dateutil(tz):
- if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0:
- return 1
- else:
- return 0
- elif treat_tz_as_pytz(tz):
- if (len(tz._transition_info) == 0
- and len(tz._utc_transition_times) == 0):
- return 1
- else:
- return 0
- elif is_zoneinfo(tz):
- return 0
- # This also implicitly accepts datetime.timezone objects which are
- # considered fixed
- return 1
-
-
-cdef object _get_utc_trans_times_from_dateutil_tz(tzinfo tz):
- """
- Transition times in dateutil timezones are stored in local non-dst
- time. This code converts them to UTC. It's the reverse of the code
- in dateutil.tz.tzfile.__init__.
- """
- new_trans = list(tz._trans_list)
- last_std_offset = 0
- for i, (trans, tti) in enumerate(zip(tz._trans_list, tz._trans_idx)):
- if not tti.isdst:
- last_std_offset = tti.offset
- new_trans[i] = trans - last_std_offset
- return new_trans
-
-
-cdef int64_t[::1] unbox_utcoffsets(object transinfo):
- cdef:
- Py_ssize_t i, sz
- int64_t[::1] arr
-
- sz = len(transinfo)
- arr = np.empty(sz, dtype="i8")
-
- for i in range(sz):
- arr[i] = int(transinfo[i][0].total_seconds()) * 1_000_000_000
-
- return arr
-
-
-# ----------------------------------------------------------------------
-# Daylight Savings
-
-
-cdef object get_dst_info(tzinfo tz):
- """
- Returns
- -------
- ndarray[int64_t]
- Nanosecond UTC times of DST transitions.
- ndarray[int64_t]
- Nanosecond UTC offsets corresponding to DST transitions.
- str
- Describing the type of tzinfo object.
- """
- cache_key = tz_cache_key(tz)
- if cache_key is None:
- # e.g. pytz.FixedOffset, matplotlib.dates._UTC,
- # psycopg2.tz.FixedOffsetTimezone
- num = int(get_utcoffset(tz, None).total_seconds()) * 1_000_000_000
- # If we have e.g. ZoneInfo here, the get_utcoffset call will return None,
- # so the total_seconds() call will raise AttributeError.
- return (np.array([NPY_NAT + 1], dtype=np.int64),
- np.array([num], dtype=np.int64),
- "unknown")
-
- if cache_key not in dst_cache:
- if treat_tz_as_pytz(tz):
- trans = np.array(tz._utc_transition_times, dtype="M8[ns]")
- trans = trans.view("i8")
- if tz._utc_transition_times[0].year == 1:
- trans[0] = NPY_NAT + 1
- deltas = unbox_utcoffsets(tz._transition_info)
- typ = "pytz"
-
- elif treat_tz_as_dateutil(tz):
- if len(tz._trans_list):
- # get utc trans times
- trans_list = _get_utc_trans_times_from_dateutil_tz(tz)
- trans = np.hstack([
- np.array([0], dtype="M8[s]"), # place holder for 1st item
- np.array(trans_list, dtype="M8[s]")]).astype(
- "M8[ns]") # all trans listed
- trans = trans.view("i8")
- trans[0] = NPY_NAT + 1
-
- # deltas
- deltas = np.array([v.offset for v in (
- tz._ttinfo_before,) + tz._trans_idx], dtype="i8")
- deltas *= 1_000_000_000
- typ = "dateutil"
-
- elif is_fixed_offset(tz):
- trans = np.array([NPY_NAT + 1], dtype=np.int64)
- deltas = np.array([tz._ttinfo_std.offset],
- dtype="i8") * 1_000_000_000
- typ = "fixed"
- else:
- # 2018-07-12 this is not reached in the tests, and this case
- # is not handled in any of the functions that call
- # get_dst_info. If this case _were_ hit the calling
- # functions would then hit an IndexError because they assume
- # `deltas` is non-empty.
- # (under the just-deleted code that returned empty arrays)
- raise AssertionError("dateutil tzinfo is not a FixedOffset "
- "and has an empty `_trans_list`.", tz)
- else:
- # static tzinfo, we can get here with pytz.StaticTZInfo
- # which are not caught by treat_tz_as_pytz
- trans = np.array([NPY_NAT + 1], dtype=np.int64)
- num = int(get_utcoffset(tz, None).total_seconds()) * 1_000_000_000
- deltas = np.array([num], dtype=np.int64)
- typ = "static"
-
- dst_cache[cache_key] = (trans, deltas, typ)
-
- return dst_cache[cache_key]
-
-
-def infer_tzinfo(datetime start, datetime end):
- if start is not None and end is not None:
- tz = start.tzinfo
- if not tz_compare(tz, end.tzinfo):
- raise AssertionError(f"Inputs must both have the same timezone, "
- f"{tz} != {end.tzinfo}")
- elif start is not None:
- tz = start.tzinfo
- elif end is not None:
- tz = end.tzinfo
- else:
- tz = None
- return tz
-
-
-cpdef bint tz_compare(tzinfo start, tzinfo end):
- """
- Compare string representations of timezones
-
- The same timezone can be represented as different instances of
- timezones. For example
- `<DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>` and
- `<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>` are essentially the same
- timezone but do not compare equal, even though the string representation
- for both of them is `'Europe/Paris'`.
-
- This exists only to add a notion of equality to pytz-style zones
- that is compatible with the notion of equality expected of tzinfo
- subclasses.
-
- Parameters
- ----------
- start : tzinfo
- end : tzinfo
-
- Returns
- -------
- bool
- """
- # GH 18523
- if is_utc(start):
- # GH#38851 consider pytz/dateutil/stdlib UTCs as equivalent
- return is_utc(end)
- elif is_utc(end):
- # Ensure we don't treat tzlocal as equal to UTC when running in UTC
- return False
- elif start is None or end is None:
- return start is None and end is None
- return get_timezone(start) == get_timezone(end)
-
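To see the problem tz_compare works around, note that pytz hands out a different tzinfo instance per offset variant of the same zone, while the string form stays stable (a sketch using pytz, which the deleted code targets):

    from datetime import datetime
    import pytz

    paris = pytz.timezone("Europe/Paris")
    localized = paris.localize(datetime(2021, 7, 1)).tzinfo   # CEST-flavoured instance

    assert localized is not paris                             # distinct objects for one zone
    assert str(localized) == str(paris) == "Europe/Paris"     # same string key
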
-
-def tz_standardize(tz: tzinfo) -> tzinfo:
- """
- If the passed tz is a pytz timezone object, "normalize" it to a
- consistent version.
-
- Parameters
- ----------
- tz : tzinfo
-
- Returns
- -------
- tzinfo
-
- Examples
- --------
- >>> from datetime import datetime
- >>> import pytz
- >>> from pytz import timezone
- >>> tz = timezone('US/Pacific').normalize(
- ... datetime(2014, 1, 1, tzinfo=pytz.utc)
- ... ).tzinfo
- >>> tz
- <DstTzInfo 'US/Pacific' PST-1 day, 16:00:00 STD>
- >>> tz_standardize(tz)
- <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
-
- >>> tz = timezone('US/Pacific')
- >>> tz
- <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
- >>> tz_standardize(tz)
- <DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>
- """
- if treat_tz_as_pytz(tz):
- return pytz.timezone(str(tz))
- return tz
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pxd
deleted file mode 100644
index 7c1dd04e2b2..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pxd
+++ /dev/null
@@ -1,39 +0,0 @@
-from cpython.datetime cimport tzinfo
-from numpy cimport (
- int64_t,
- intp_t,
- ndarray,
-)
-
-from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
-
-
-cpdef int64_t tz_convert_from_utc_single(
- int64_t utc_val, tzinfo tz, NPY_DATETIMEUNIT creso=*
-) except? -1
-cdef int64_t tz_localize_to_utc_single(
- int64_t val,
- tzinfo tz,
- object ambiguous=*,
- object nonexistent=*,
- NPY_DATETIMEUNIT creso=*,
-) except? -1
-
-
-cdef class Localizer:
- cdef:
- tzinfo tz
- NPY_DATETIMEUNIT _creso
- bint use_utc, use_fixed, use_tzlocal, use_dst, use_pytz
- ndarray trans
- Py_ssize_t ntrans
- const int64_t[::1] deltas
- int64_t delta
- int64_t* tdata
-
- cdef int64_t utc_val_to_local_val(
- self,
- int64_t utc_val,
- Py_ssize_t* pos,
- bint* fold=?,
- ) except? -1
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyi
deleted file mode 100644
index a354765a348..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyi
+++ /dev/null
@@ -1,21 +0,0 @@
-from datetime import (
- timedelta,
- tzinfo,
-)
-from typing import Iterable
-
-import numpy as np
-
-from pandas._typing import npt
-
-# tz_convert_from_utc_single exposed for testing
-def tz_convert_from_utc_single(
- val: np.int64, tz: tzinfo, creso: int = ...
-) -> np.int64: ...
-def tz_localize_to_utc(
- vals: npt.NDArray[np.int64],
- tz: tzinfo | None,
- ambiguous: str | bool | Iterable[bool] | None = ...,
- nonexistent: str | timedelta | np.timedelta64 | None = ...,
- creso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.int64]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyx
deleted file mode 100644
index c5f3b0ab715..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/tzconversion.pyx
+++ /dev/null
@@ -1,816 +0,0 @@
-"""
-timezone conversion
-"""
-cimport cython
-from cpython.datetime cimport (
- PyDelta_Check,
- datetime,
- datetime_new,
- import_datetime,
- timedelta,
- tzinfo,
-)
-from cython cimport Py_ssize_t
-
-import_datetime()
-
-import numpy as np
-import pytz
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- intp_t,
- ndarray,
- uint8_t,
-)
-
-cnp.import_array()
-
-from pandas._libs.tslibs.dtypes cimport (
- periods_per_day,
- periods_per_second,
-)
-from pandas._libs.tslibs.nattype cimport NPY_NAT
-from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- npy_datetimestruct,
- pandas_datetime_to_datetimestruct,
- pydatetime_to_dt64,
-)
-from pandas._libs.tslibs.timezones cimport (
- get_dst_info,
- is_fixed_offset,
- is_tzlocal,
- is_utc,
- is_zoneinfo,
- utc_stdlib,
-)
-
-
-cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64)
-
-
-@cython.freelist(16)
-@cython.final
-cdef class Localizer:
- # cdef:
- # tzinfo tz
- # NPY_DATETIMEUNIT _creso
- # bint use_utc, use_fixed, use_tzlocal, use_dst, use_pytz
- # ndarray trans
- # Py_ssize_t ntrans
- # const int64_t[::1] deltas
- # int64_t delta
- # int64_t* tdata
-
- @cython.initializedcheck(False)
- @cython.boundscheck(False)
- def __cinit__(self, tzinfo tz, NPY_DATETIMEUNIT creso):
- self.tz = tz
- self._creso = creso
- self.use_utc = self.use_tzlocal = self.use_fixed = False
- self.use_dst = self.use_pytz = False
- self.ntrans = -1 # placeholder
- self.delta = -1 # placeholder
- self.deltas = _deltas_placeholder
- self.tdata = NULL
-
- if is_utc(tz) or tz is None:
- self.use_utc = True
-
- elif is_tzlocal(tz) or is_zoneinfo(tz):
- self.use_tzlocal = True
-
- else:
- trans, deltas, typ = get_dst_info(tz)
- if creso != NPY_DATETIMEUNIT.NPY_FR_ns:
- # NB: using floordiv here is implicitly assuming we will
- # never see trans or deltas that are not an integer number
- # of seconds.
- # TODO: avoid these np.array calls
- if creso == NPY_DATETIMEUNIT.NPY_FR_us:
- trans = np.array(trans) // 1_000
- deltas = np.array(deltas) // 1_000
- elif creso == NPY_DATETIMEUNIT.NPY_FR_ms:
- trans = np.array(trans) // 1_000_000
- deltas = np.array(deltas) // 1_000_000
- elif creso == NPY_DATETIMEUNIT.NPY_FR_s:
- trans = np.array(trans) // 1_000_000_000
- deltas = np.array(deltas) // 1_000_000_000
- else:
- raise NotImplementedError(creso)
-
- self.trans = trans
- self.ntrans = self.trans.shape[0]
- self.deltas = deltas
-
- if typ != "pytz" and typ != "dateutil":
- # static/fixed; in this case we know that len(delta) == 1
- self.use_fixed = True
- self.delta = deltas[0]
- else:
- self.use_dst = True
- if typ == "pytz":
- self.use_pytz = True
- self.tdata = <int64_t*>cnp.PyArray_DATA(trans)
-
- @cython.boundscheck(False)
- cdef int64_t utc_val_to_local_val(
- self, int64_t utc_val, Py_ssize_t* pos, bint* fold=NULL
- ) except? -1:
- if self.use_utc:
- return utc_val
- elif self.use_tzlocal:
- return utc_val + _tz_localize_using_tzinfo_api(
- utc_val, self.tz, to_utc=False, creso=self._creso, fold=fold
- )
- elif self.use_fixed:
- return utc_val + self.delta
- else:
- pos[0] = bisect_right_i8(self.tdata, utc_val, self.ntrans) - 1
- if fold is not NULL:
- fold[0] = _infer_dateutil_fold(
- utc_val, self.trans, self.deltas, pos[0]
- )
-
- return utc_val + self.deltas[pos[0]]
-
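The final branch of utc_val_to_local_val is a right-bisection into the transition table followed by an offset add. A pure-Python sketch of that lookup (illustrative only; the names are ours):

    import bisect

    def utc_to_local(utc_val, trans, deltas):
        # trans: sorted UTC transition instants; deltas[i]: offset in force
        # from trans[i] until the next transition.
        pos = bisect.bisect_right(trans, utc_val) - 1
        return utc_val + deltas[pos]

    trans = [0, 100, 200]
    deltas = [10, 20, 10]
    assert utc_to_local(150, trans, deltas) == 170   # 150 falls in the [100, 200) window
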
-
-cdef int64_t tz_localize_to_utc_single(
- int64_t val,
- tzinfo tz,
- object ambiguous=None,
- object nonexistent=None,
- NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns,
-) except? -1:
- """See tz_localize_to_utc.__doc__"""
- cdef:
- int64_t delta
- int64_t[::1] deltas
-
- if val == NPY_NAT:
- return val
-
- elif is_utc(tz) or tz is None:
- # TODO: test with non-nano
- return val
-
- elif is_tzlocal(tz):
- return val - _tz_localize_using_tzinfo_api(val, tz, to_utc=True, creso=creso)
-
- elif is_fixed_offset(tz):
- _, deltas, _ = get_dst_info(tz)
- delta = deltas[0]
- # TODO: de-duplicate with Localizer.__init__
- if creso != NPY_DATETIMEUNIT.NPY_FR_ns:
- if creso == NPY_DATETIMEUNIT.NPY_FR_us:
- delta = delta // 1000
- elif creso == NPY_DATETIMEUNIT.NPY_FR_ms:
- delta = delta // 1_000_000
- elif creso == NPY_DATETIMEUNIT.NPY_FR_s:
- delta = delta // 1_000_000_000
-
- return val - delta
-
- else:
- return tz_localize_to_utc(
- np.array([val], dtype="i8"),
- tz,
- ambiguous=ambiguous,
- nonexistent=nonexistent,
- creso=creso,
- )[0]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def tz_localize_to_utc(
- ndarray[int64_t] vals,
- tzinfo tz,
- object ambiguous=None,
- object nonexistent=None,
- NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns,
-):
- """
- Localize tzinfo-naive i8 to given time zone (using pytz). If
- there are ambiguities in the values, raise AmbiguousTimeError.
-
- Parameters
- ----------
- vals : ndarray[int64_t]
- tz : tzinfo or None
- ambiguous : str, bool, or arraylike
- When clocks moved backward due to DST, ambiguous times may arise.
- For example in Central European Time (UTC+01), when going from 03:00
- DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC
- and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter
- dictates how ambiguous times should be handled.
-
- - 'infer' will attempt to infer fall dst-transition hours based on
- order
- - bool-ndarray where True signifies a DST time, False signifies a
- non-DST time (note that this flag is only applicable for ambiguous
- times, but the array must have the same length as vals)
- - bool if True, treat all vals as DST. If False, treat them as non-DST
- - 'NaT' will return NaT where there are ambiguous times
-
- nonexistent : {None, "NaT", "shift_forward", "shift_backward", "raise", \
-timedelta-like}
- How to handle non-existent times when converting wall times to UTC
- creso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- localized : ndarray[int64_t]
- """
- cdef:
- ndarray[uint8_t, cast=True] ambiguous_array
- Py_ssize_t i, n = vals.shape[0]
- Py_ssize_t delta_idx_offset, delta_idx
- int64_t v, left, right, val, new_local, remaining_mins
- int64_t first_delta, delta
- int64_t shift_delta = 0
- ndarray[int64_t] result_a, result_b, dst_hours
- int64_t[::1] result
- bint is_zi = False
- bint infer_dst = False, is_dst = False, fill = False
- bint shift_forward = False, shift_backward = False
- bint fill_nonexist = False
- str stamp
- Localizer info = Localizer(tz, creso=creso)
- int64_t pph = periods_per_day(creso) // 24
- int64_t pps = periods_per_second(creso)
- npy_datetimestruct dts
-
- # Vectorized version of DstTzInfo.localize
- if info.use_utc:
- return vals.copy()
-
- # silence false-positive compiler warning
- ambiguous_array = np.empty(0, dtype=bool)
- if isinstance(ambiguous, str):
- if ambiguous == "infer":
- infer_dst = True
- elif ambiguous == "NaT":
- fill = True
- elif isinstance(ambiguous, bool):
- is_dst = True
- if ambiguous:
- ambiguous_array = np.ones(len(vals), dtype=bool)
- else:
- ambiguous_array = np.zeros(len(vals), dtype=bool)
- elif hasattr(ambiguous, "__iter__"):
- is_dst = True
- if len(ambiguous) != len(vals):
- raise ValueError("Length of ambiguous bool-array must be "
- "the same size as vals")
- ambiguous_array = np.asarray(ambiguous, dtype=bool)
-
- if nonexistent == "NaT":
- fill_nonexist = True
- elif nonexistent == "shift_forward":
- shift_forward = True
- elif nonexistent == "shift_backward":
- shift_backward = True
- elif PyDelta_Check(nonexistent):
- from .timedeltas import delta_to_nanoseconds
- shift_delta = delta_to_nanoseconds(nonexistent, reso=creso)
- elif nonexistent not in ("raise", None):
- msg = ("nonexistent must be one of {'NaT', 'raise', 'shift_forward', "
- "shift_backwards} or a timedelta object")
- raise ValueError(msg)
-
- result = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)
-
- if info.use_tzlocal and not is_zoneinfo(tz):
- for i in range(n):
- v = vals[i]
- if v == NPY_NAT:
- result[i] = NPY_NAT
- else:
- result[i] = v - _tz_localize_using_tzinfo_api(
- v, tz, to_utc=True, creso=creso
- )
- return result.base # to return underlying ndarray
-
- elif info.use_fixed:
- delta = info.delta
- for i in range(n):
- v = vals[i]
- if v == NPY_NAT:
- result[i] = NPY_NAT
- else:
- result[i] = v - delta
- return result.base # to return underlying ndarray
-
- # Determine whether each date lies left of the DST transition (store in
- # result_a) or right of the DST transition (store in result_b)
- if is_zoneinfo(tz):
- is_zi = True
- result_a, result_b = _get_utc_bounds_zoneinfo(
- vals, tz, creso=creso
- )
- else:
- result_a, result_b = _get_utc_bounds(
- vals, info.tdata, info.ntrans, info.deltas, creso=creso
- )
-
- # silence false-positive compiler warning
- dst_hours = np.empty(0, dtype=np.int64)
- if infer_dst:
- dst_hours = _get_dst_hours(vals, result_a, result_b, creso=creso)
-
- # Pre-compute delta_idx_offset that will be used if we go down non-existent
- # paths.
- # Shift the delta_idx by 1 if the UTC offset of
- # the target tz is greater than 0 and we're moving forward,
- # or vice versa
- first_delta = info.deltas[0]
- if (shift_forward or shift_delta > 0) and first_delta > 0:
- delta_idx_offset = 1
- elif (shift_backward or shift_delta < 0) and first_delta < 0:
- delta_idx_offset = 1
- else:
- delta_idx_offset = 0
-
- for i in range(n):
- val = vals[i]
- left = result_a[i]
- right = result_b[i]
- if val == NPY_NAT:
- # TODO: test with non-nano
- result[i] = val
- elif left != NPY_NAT and right != NPY_NAT:
- if left == right:
- # TODO: test with non-nano
- result[i] = left
- else:
- if infer_dst and dst_hours[i] != NPY_NAT:
- # TODO: test with non-nano
- result[i] = dst_hours[i]
- elif is_dst:
- if ambiguous_array[i]:
- result[i] = left
- else:
- result[i] = right
- elif fill:
- # TODO: test with non-nano; parametrize test_dt_round_tz_ambiguous
- result[i] = NPY_NAT
- else:
- stamp = _render_tstamp(val, creso=creso)
- raise pytz.AmbiguousTimeError(
- f"Cannot infer dst time from {stamp}, try using the "
- "'ambiguous' argument"
- )
- elif left != NPY_NAT:
- result[i] = left
- elif right != NPY_NAT:
- # TODO: test with non-nano
- result[i] = right
- else:
- # Handle nonexistent times
- if shift_forward or shift_backward or shift_delta != 0:
- # Shift the nonexistent time to the closest existing time
- remaining_mins = val % pph
- if shift_delta != 0:
- # Validate that we don't relocalize on another nonexistent
- # time
- if -1 < shift_delta + remaining_mins < pph:
- raise ValueError(
- "The provided timedelta will relocalize on a "
- f"nonexistent time: {nonexistent}"
- )
- new_local = val + shift_delta
- elif shift_forward:
- new_local = val + (pph - remaining_mins)
- else:
- # Subtract 1 since the beginning hour is _inclusive_ of
- # nonexistent times
- new_local = val - remaining_mins - 1
-
- if is_zi:
- # use the same construction as in _get_utc_bounds_zoneinfo
- pandas_datetime_to_datetimestruct(new_local, creso, &dts)
- extra = (dts.ps // 1000) * (pps // 1_000_000_000)
-
- dt = datetime_new(dts.year, dts.month, dts.day, dts.hour,
- dts.min, dts.sec, dts.us, None)
-
- if shift_forward or shift_delta > 0:
- dt = dt.replace(tzinfo=tz, fold=1)
- else:
- dt = dt.replace(tzinfo=tz, fold=0)
- dt = dt.astimezone(utc_stdlib)
- dt = dt.replace(tzinfo=None)
- result[i] = pydatetime_to_dt64(dt, &dts, creso) + extra
-
- else:
- delta_idx = bisect_right_i8(info.tdata, new_local, info.ntrans)
-
- delta_idx = delta_idx - delta_idx_offset
- result[i] = new_local - info.deltas[delta_idx]
- elif fill_nonexist:
- result[i] = NPY_NAT
- else:
- stamp = _render_tstamp(val, creso=creso)
- raise pytz.NonExistentTimeError(stamp)
-
- return result.base # .base to get underlying ndarray
-
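The ambiguity described in the docstring can be reproduced with stdlib `zoneinfo` alone, where `fold` plays the role that the `ambiguous` flag plays here (a sketch, not the deleted implementation; the Paris fall-back date is an assumption about the 2021 tz rules):

    from datetime import datetime, timezone
    from zoneinfo import ZoneInfo

    paris = ZoneInfo("Europe/Paris")
    # 2021-10-31 02:30 occurs twice in Paris; fold picks the occurrence.
    first = datetime(2021, 10, 31, 2, 30, tzinfo=paris, fold=0).astimezone(timezone.utc)
    second = datetime(2021, 10, 31, 2, 30, tzinfo=paris, fold=1).astimezone(timezone.utc)
    assert (first.hour, second.hour) == (0, 1)   # 00:30 UTC vs 01:30 UTC
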
-
-cdef Py_ssize_t bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
- # Caller is responsible for checking n > 0
- # This looks very similar to local_search_right in the ndarray.searchsorted
- # implementation.
- cdef:
- Py_ssize_t pivot, left = 0, right = n
-
- # edge cases
- if val > data[n - 1]:
- return n
-
- # Caller is responsible for ensuring 'val >= data[0]'. This is
- # ensured by the fact that 'data' comes from get_dst_info where data[0]
- # is *always* NPY_NAT+1. If that ever changes, we will need to restore
- # the following disabled check.
- # if val < data[0]:
- # return 0
-
- while left < right:
- pivot = left + (right - left) // 2
-
- if data[pivot] <= val:
- left = pivot + 1
- else:
- right = pivot
-
- return left
-
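For reference, the convention matches the stdlib right-bisection (elements equal to val sort to the left of the returned index):

    import bisect

    data = [5, 10, 20]
    assert bisect.bisect_right(data, 10) == 2    # equal elements count as "to the left"
    assert bisect.bisect_right(data, 21) == 3    # past the last element -> n
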
-
-cdef str _render_tstamp(int64_t val, NPY_DATETIMEUNIT creso):
- """ Helper function to render exception messages"""
- from pandas._libs.tslibs.timestamps import Timestamp
- ts = Timestamp._from_value_and_reso(val, creso, None)
- return str(ts)
-
-
-cdef _get_utc_bounds(
- ndarray vals,
- int64_t* tdata,
- Py_ssize_t ntrans,
- const int64_t[::1] deltas,
- NPY_DATETIMEUNIT creso,
-):
- # Determine whether each date lies left of the DST transition (store in
- # result_a) or right of the DST transition (store in result_b)
-
- cdef:
- ndarray result_a, result_b
- Py_ssize_t i, n = vals.size
- int64_t val, v_left, v_right
- Py_ssize_t isl, isr, pos_left, pos_right
- int64_t ppd = periods_per_day(creso)
-
- result_a = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)
- result_b = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)
-
- for i in range(n):
- # This loop resembles the "Find the two best possibilities" block
- # in pytz's DstTZInfo.localize method.
- result_a[i] = NPY_NAT
- result_b[i] = NPY_NAT
-
- val = vals[i]
- if val == NPY_NAT:
- continue
-
- # TODO: be careful of overflow in val-ppd
- isl = bisect_right_i8(tdata, val - ppd, ntrans) - 1
- if isl < 0:
- isl = 0
-
- v_left = val - deltas[isl]
- pos_left = bisect_right_i8(tdata, v_left, ntrans) - 1
- # timestamp falls to the left side of the DST transition
- if v_left + deltas[pos_left] == val:
- result_a[i] = v_left
-
- # TODO: be careful of overflow in val+ppd
- isr = bisect_right_i8(tdata, val + ppd, ntrans) - 1
- if isr < 0:
- isr = 0
-
- v_right = val - deltas[isr]
- pos_right = bisect_right_i8(tdata, v_right, ntrans) - 1
- # timestamp falls to the right side of the DST transition
- if v_right + deltas[pos_right] == val:
- result_b[i] = v_right
-
- return result_a, result_b
-
-
-cdef _get_utc_bounds_zoneinfo(ndarray vals, tz, NPY_DATETIMEUNIT creso):
- """
- For each point in 'vals', find the UTC time that it corresponds to
- with fold=0 and with fold=1. In non-ambiguous cases, these will match.
-
- Parameters
- ----------
- vals : ndarray[int64_t]
- tz : ZoneInfo
- creso : NPY_DATETIMEUNIT
-
- Returns
- -------
- ndarray[int64_t]
- ndarray[int64_t]
- """
- cdef:
- Py_ssize_t i, n = vals.size
- npy_datetimestruct dts
- datetime dt, rt, left, right, aware, as_utc
- int64_t val, pps = periods_per_second(creso)
- ndarray result_a, result_b
-
- result_a = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)
- result_b = cnp.PyArray_EMPTY(vals.ndim, vals.shape, cnp.NPY_INT64, 0)
-
- for i in range(n):
- val = vals[i]
- if val == NPY_NAT:
- result_a[i] = NPY_NAT
- result_b[i] = NPY_NAT
- continue
-
- pandas_datetime_to_datetimestruct(val, creso, &dts)
- # casting to pydatetime drops nanoseconds etc, which we will
- # need to re-add later as 'extra'
- extra = (dts.ps // 1000) * (pps // 1_000_000_000)
-
- dt = datetime_new(dts.year, dts.month, dts.day, dts.hour,
- dts.min, dts.sec, dts.us, None)
-
- aware = dt.replace(tzinfo=tz)
- as_utc = aware.astimezone(utc_stdlib)
- rt = as_utc.astimezone(tz)
- if aware != rt:
- # AFAICT this means that 'aware' is non-existent
- # TODO: better way to check this?
- # mail.python.org/archives/list/datetime-sig@python.org/
- # thread/57Y3IQAASJOKHX4D27W463XTZIS2NR3M/
- result_a[i] = NPY_NAT
- else:
- left = as_utc.replace(tzinfo=None)
- result_a[i] = pydatetime_to_dt64(left, &dts, creso) + extra
-
- aware = dt.replace(fold=1, tzinfo=tz)
- as_utc = aware.astimezone(utc_stdlib)
- rt = as_utc.astimezone(tz)
- if aware != rt:
- result_b[i] = NPY_NAT
- else:
- right = as_utc.replace(tzinfo=None)
- result_b[i] = pydatetime_to_dt64(right, &dts, creso) + extra
-
- return result_a, result_b
-
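The round-trip comparison used above to flag nonexistent wall times also works directly on stdlib objects, because two aware datetimes sharing the same tzinfo compare by wall clock (a sketch; the Paris spring-forward date is an assumption about the 2021 rules):

    from datetime import datetime, timezone
    from zoneinfo import ZoneInfo

    paris = ZoneInfo("Europe/Paris")
    # 02:30 on 2021-03-28 does not exist in Paris (clocks jump 02:00 -> 03:00).
    aware = datetime(2021, 3, 28, 2, 30, tzinfo=paris)
    roundtrip = aware.astimezone(timezone.utc).astimezone(paris)
    assert roundtrip != aware    # the round trip lands on a different wall time (03:30)
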
-
-@cython.boundscheck(False)
-cdef ndarray[int64_t] _get_dst_hours(
- # vals, creso only needed here to potentially render an exception message
- const int64_t[:] vals,
- ndarray[int64_t] result_a,
- ndarray[int64_t] result_b,
- NPY_DATETIMEUNIT creso,
-):
- cdef:
- Py_ssize_t i, n = vals.shape[0]
- ndarray[uint8_t, cast=True] mismatch
- ndarray[int64_t] delta, dst_hours
- ndarray[intp_t] switch_idxs, trans_idx, grp, a_idx, b_idx, one_diff
- list trans_grp
- intp_t switch_idx
- int64_t left, right
-
- dst_hours = cnp.PyArray_EMPTY(result_a.ndim, result_a.shape, cnp.NPY_INT64, 0)
- dst_hours[:] = NPY_NAT
-
- mismatch = cnp.PyArray_ZEROS(result_a.ndim, result_a.shape, cnp.NPY_BOOL, 0)
-
- for i in range(n):
- left = result_a[i]
- right = result_b[i]
-
- # Get the ambiguous hours (given the above, these are the hours
- # where result_a != result_b and neither of them are NAT)
- if left != right and left != NPY_NAT and right != NPY_NAT:
- mismatch[i] = 1
-
- trans_idx = mismatch.nonzero()[0]
-
- if trans_idx.size == 1:
- # see test_tz_localize_to_utc_ambiguous_infer
- stamp = _render_tstamp(vals[trans_idx[0]], creso=creso)
- raise pytz.AmbiguousTimeError(
- f"Cannot infer dst time from {stamp} as there "
- "are no repeated times"
- )
-
- # Split the array into contiguous chunks (where the difference between
- # indices is 1). These are effectively dst transitions in different
- # years which is useful for checking that there is not an ambiguous
- # transition in an individual year.
- if trans_idx.size > 0:
- one_diff = np.where(np.diff(trans_idx) != 1)[0] + 1
- trans_grp = np.array_split(trans_idx, one_diff)
-
- # Iterate through each group; if there are no hours where the
- # delta is negative (indicating a repeated hour), the switch
- # cannot be inferred
- for grp in trans_grp:
-
- delta = np.diff(result_a[grp])
- if grp.size == 1 or np.all(delta > 0):
- # see test_tz_localize_to_utc_ambiguous_infer
- stamp = _render_tstamp(vals[grp[0]], creso=creso)
- raise pytz.AmbiguousTimeError(stamp)
-
- # Find the index for the switch and pull from a for dst and b
- # for standard
- switch_idxs = (delta <= 0).nonzero()[0]
- if switch_idxs.size > 1:
- # see test_tz_localize_to_utc_ambiguous_infer
- raise pytz.AmbiguousTimeError(
- f"There are {switch_idxs.size} dst switches when "
- "there should only be 1."
- )
-
- switch_idx = switch_idxs[0] + 1
- # Pull the only index and adjust
- a_idx = grp[:switch_idx]
- b_idx = grp[switch_idx:]
- dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx]))
-
- return dst_hours
-
-
-# ----------------------------------------------------------------------
-# Timezone Conversion
-
-cpdef int64_t tz_convert_from_utc_single(
- int64_t utc_val, tzinfo tz, NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns
-) except? -1:
- """
- Convert the val (in i8) from UTC to tz
-
- This is a single value version of tz_convert_from_utc.
-
- Parameters
- ----------
- utc_val : int64
- tz : tzinfo
- creso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- converted : int64
- """
- cdef:
- Localizer info = Localizer(tz, creso=creso)
- Py_ssize_t pos
-
- # Note: caller is responsible for ensuring utc_val != NPY_NAT
- return info.utc_val_to_local_val(utc_val, &pos)
-
-
-# OSError may be thrown by tzlocal on windows at or close to 1970-01-01
-# see https://github.com/pandas-dev/pandas/pull/37591#issuecomment-720628241
-cdef int64_t _tz_localize_using_tzinfo_api(
- int64_t val,
- tzinfo tz,
- bint to_utc=True,
- NPY_DATETIMEUNIT creso=NPY_DATETIMEUNIT.NPY_FR_ns,
- bint* fold=NULL,
-) except? -1:
- """
- Convert the i8 representation of a datetime from a general-case timezone to
- UTC, or vice-versa using the datetime/tzinfo API.
-
- Private, not intended for use outside of tslibs.tzconversion.
-
- Parameters
- ----------
- val : int64_t
- tz : tzinfo
- to_utc : bint
- True if converting _to_ UTC, False if going the other direction.
- creso : NPY_DATETIMEUNIT
- fold : bint*, default NULL
- pointer to fold: whether datetime ends up in a fold or not
- after adjustment.
- Only passed with to_utc=False.
-
- Returns
- -------
- delta : int64_t
- Value to add when converting from utc, subtract when converting to utc.
-
- Notes
- -----
- Sets fold by pointer
- """
- cdef:
- npy_datetimestruct dts
- datetime dt
- int64_t delta
- timedelta td
- int64_t pps = periods_per_second(creso)
-
- pandas_datetime_to_datetimestruct(val, creso, &dts)
-
- # datetime_new is cython-optimized constructor
- if not to_utc:
- # tz.utcoffset only makes sense if datetime
- # is _wall time_, so if val is a UTC timestamp convert to wall time
- dt = _astimezone(dts, tz)
-
- if fold is not NULL:
- # NB: fold is only passed with to_utc=False
- fold[0] = dt.fold
- else:
- dt = datetime_new(dts.year, dts.month, dts.day, dts.hour,
- dts.min, dts.sec, dts.us, None)
-
- td = tz.utcoffset(dt)
- delta = int(td.total_seconds() * pps)
- return delta
-
-
-cdef datetime _astimezone(npy_datetimestruct dts, tzinfo tz):
- """
- Optimized equivalent to:
-
- dt = datetime(dts.year, dts.month, dts.day, dts.hour,
- dts.min, dts.sec, dts.us, utc_stdlib)
- dt = dt.astimezone(tz)
-
- Derived from the datetime.astimezone implementation at
- https://github.com/python/cpython/blob/main/Modules/_datetimemodule.c#L6187
-
- NB: we are assuming tz is not None.
- """
- cdef:
- datetime result
-
- result = datetime_new(dts.year, dts.month, dts.day, dts.hour,
- dts.min, dts.sec, dts.us, tz)
- return tz.fromutc(result)
-
-
-# NB: relies on dateutil internals, subject to change.
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cdef bint _infer_dateutil_fold(
- int64_t value,
- const int64_t[::1] trans,
- const int64_t[::1] deltas,
- Py_ssize_t pos,
-):
- """
- Infer _TSObject fold property from value by assuming 0 and then setting
- to 1 if necessary.
-
- Parameters
- ----------
- value : int64_t
- trans : ndarray[int64_t]
- ndarray of offset transition points in nanoseconds since epoch.
- deltas : int64_t[:]
- array of offsets corresponding to transition points in trans.
- pos : Py_ssize_t
- Position of the last transition point before taking fold into account.
-
- Returns
- -------
- bint
- Due to daylight saving time, one wall clock time can occur twice
- when shifting from summer to winter time; fold describes whether the
- datetime-like corresponds to the first (0) or the second time (1)
- the wall clock hits the ambiguous time
-
- References
- ----------
- .. [1] "PEP 495 - Local Time Disambiguation"
- https://www.python.org/dev/peps/pep-0495/#the-fold-attribute
- """
- cdef:
- bint fold = 0
- int64_t fold_delta
-
- if pos > 0:
- fold_delta = deltas[pos - 1] - deltas[pos]
- if value - fold_delta < trans[pos]:
- fold = 1
-
- return fold
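A toy numeric walk-through of the rule above, with invented transition data, showing when fold flips to 1:

    # At trans[1] the offset drops from 7200 to 3600 (clocks set back one hour),
    # so UTC values within 3600 units after the transition hit repeated wall times.
    trans = [0, 1_000]
    deltas = [7_200, 3_600]
    value, pos = 1_500, 1
    fold_delta = deltas[pos - 1] - deltas[pos]          # 3_600
    fold = 1 if value - fold_delta < trans[pos] else 0
    assert fold == 1                                    # value lies in the repeated hour
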
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/util.pxd b/contrib/python/pandas/py3/pandas/_libs/tslibs/util.pxd
deleted file mode 100644
index 4e55bc1c48f..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/util.pxd
+++ /dev/null
@@ -1,226 +0,0 @@
-
-from cpython.object cimport PyTypeObject
-
-
-cdef extern from "Python.h":
- # Note: importing extern-style allows us to declare these as nogil
- # functions, whereas `from cpython cimport` does not.
- bint PyBool_Check(object obj) nogil
- bint PyFloat_Check(object obj) nogil
- bint PyComplex_Check(object obj) nogil
- bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
-
- # Note that the following functions can potentially raise an exception,
- # thus they cannot be declared 'nogil'. Also, PyUnicode_AsUTF8AndSize() can
- # potentially allocate memory in the unlikely case that the underlying
- # unicode object was stored as non-utf8 and utf8 wasn't requested before.
- const char* PyUnicode_AsUTF8AndSize(object obj,
- Py_ssize_t* length) except NULL
-
- object PyUnicode_EncodeLocale(object obj, const char *errors) nogil
- object PyUnicode_DecodeLocale(const char *str, const char *errors) nogil
-
-
-from numpy cimport (
- float64_t,
- int64_t,
-)
-
-
-cdef extern from "numpy/arrayobject.h":
- PyTypeObject PyFloatingArrType_Type
-
-cdef extern from "numpy/ndarrayobject.h":
- PyTypeObject PyTimedeltaArrType_Type
- PyTypeObject PyDatetimeArrType_Type
- PyTypeObject PyComplexFloatingArrType_Type
- PyTypeObject PyBoolArrType_Type
-
- bint PyArray_IsIntegerScalar(obj) nogil
- bint PyArray_Check(obj) nogil
-
-cdef extern from "numpy/npy_common.h":
- int64_t NPY_MIN_INT64
-
-
-cdef inline int64_t get_nat():
- return NPY_MIN_INT64
-
-
-# --------------------------------------------------------------------
-# Type Checking
-
-cdef inline bint is_integer_object(object obj) nogil:
- """
- Cython equivalent of
-
- `isinstance(val, (int, long, np.integer)) and not isinstance(val, bool)`
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_integer : bool
-
- Notes
- -----
- np.timedelta64 scalars (which PyArray_IsIntegerScalar would otherwise
- accept as integers) are explicitly excluded.
- """
- return (not PyBool_Check(obj) and PyArray_IsIntegerScalar(obj)
- and not is_timedelta64_object(obj))
-
-
-cdef inline bint is_float_object(object obj) nogil:
- """
- Cython equivalent of `isinstance(val, (float, np.float_))`
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_float : bool
- """
- return (PyFloat_Check(obj) or
- (PyObject_TypeCheck(obj, &PyFloatingArrType_Type)))
-
-
-cdef inline bint is_complex_object(object obj) nogil:
- """
- Cython equivalent of `isinstance(val, (complex, np.complex_))`
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_complex : bool
- """
- return (PyComplex_Check(obj) or
- PyObject_TypeCheck(obj, &PyComplexFloatingArrType_Type))
-
-
-cdef inline bint is_bool_object(object obj) nogil:
- """
- Cython equivalent of `isinstance(val, (bool, np.bool_))`
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_bool : bool
- """
- return (PyBool_Check(obj) or
- PyObject_TypeCheck(obj, &PyBoolArrType_Type))
-
-
-cdef inline bint is_real_number_object(object obj) nogil:
- return is_bool_object(obj) or is_integer_object(obj) or is_float_object(obj)
-
-
-cdef inline bint is_timedelta64_object(object obj) nogil:
- """
- Cython equivalent of `isinstance(val, np.timedelta64)`
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_timedelta64 : bool
- """
- return PyObject_TypeCheck(obj, &PyTimedeltaArrType_Type)
-
-
-cdef inline bint is_datetime64_object(object obj) nogil:
- """
- Cython equivalent of `isinstance(val, np.datetime64)`
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_datetime64 : bool
- """
- return PyObject_TypeCheck(obj, &PyDatetimeArrType_Type)
-
-
-cdef inline bint is_array(object val):
- """
- Cython equivalent of `isinstance(val, np.ndarray)`
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_ndarray : bool
- """
- return PyArray_Check(val)
-
-
-cdef inline bint is_nan(object val):
- """
- Check if val is a Not-A-Number float or complex, including
- float('NaN') and np.nan.
-
- Parameters
- ----------
- val : object
-
- Returns
- -------
- is_nan : bool
- """
- cdef float64_t fval
- if is_float_object(val):
- fval = val
- return fval != fval
- return is_complex_object(val) and val != val
-
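The `val != val` trick above is the standard IEEE-754 NaN test; it extends to complex values whose real or imaginary part is NaN:

    nan = float("nan")
    assert nan != nan                                # only NaN is unequal to itself
    assert complex(1.0, nan) != complex(1.0, nan)    # complex comparison propagates it
    assert 1.0 == 1.0                                # ordinary floats compare normally
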
-
-cdef inline const char* get_c_string_buf_and_size(str py_string,
- Py_ssize_t *length) except NULL:
- """
- Extract the internal char* buffer of the str object `py_string` and
- save the length of that buffer in `length`.
-
- Notes
- -----
- Python object owns memory, thus returned char* must not be freed.
- `length` can be NULL if getting buffer length is not needed.
-
- Parameters
- ----------
- py_string : str
- length : Py_ssize_t*
-
- Returns
- -------
- buf : const char*
- """
- return PyUnicode_AsUTF8AndSize(py_string, length)
-
-
-cdef inline const char* get_c_string(str py_string) except NULL:
- return get_c_string_buf_and_size(py_string, NULL)
-
-
-cdef inline bytes string_encode_locale(str py_string):
- """As opposed to PyUnicode_Encode, use current system locale to encode."""
- return PyUnicode_EncodeLocale(py_string, NULL)
-
-
-cdef inline object char_to_string_locale(const char* data):
- """As opposed to PyUnicode_FromString, use current system locale to decode."""
- return PyUnicode_DecodeLocale(data, NULL)
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyi b/contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyi
deleted file mode 100644
index 3fd9e2501e6..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyi
+++ /dev/null
@@ -1,43 +0,0 @@
-"""
-For cython types that cannot be represented precisely, closest-available
-python equivalents are used, and the precise types kept as adjacent comments.
-"""
-from datetime import tzinfo
-
-import numpy as np
-
-from pandas._libs.tslibs.dtypes import Resolution
-from pandas._typing import npt
-
-def dt64arr_to_periodarr(
- stamps: npt.NDArray[np.int64],
- freq: int,
- tz: tzinfo | None,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.int64]: ...
-def is_date_array_normalized(
- stamps: npt.NDArray[np.int64],
- tz: tzinfo | None,
- reso: int, # NPY_DATETIMEUNIT
-) -> bool: ...
-def normalize_i8_timestamps(
- stamps: npt.NDArray[np.int64],
- tz: tzinfo | None,
- reso: int, # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.int64]: ...
-def get_resolution(
- stamps: npt.NDArray[np.int64],
- tz: tzinfo | None = ...,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> Resolution: ...
-def ints_to_pydatetime(
- arr: npt.NDArray[np.int64],
- tz: tzinfo | None = ...,
- box: str = ...,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.object_]: ...
-def tz_convert_from_utc(
- stamps: npt.NDArray[np.int64],
- tz: tzinfo | None,
- reso: int = ..., # NPY_DATETIMEUNIT
-) -> npt.NDArray[np.int64]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyx b/contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyx
deleted file mode 100644
index 06e09d890de..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/tslibs/vectorized.pyx
+++ /dev/null
@@ -1,379 +0,0 @@
-cimport cython
-from cpython.datetime cimport (
- date,
- datetime,
- time,
- tzinfo,
-)
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- int64_t,
- ndarray,
-)
-
-cnp.import_array()
-
-from .dtypes import Resolution
-
-from .dtypes cimport (
- c_Resolution,
- periods_per_day,
-)
-from .nattype cimport (
- NPY_NAT,
- c_NaT as NaT,
-)
-from .np_datetime cimport (
- NPY_DATETIMEUNIT,
- NPY_FR_ns,
- npy_datetimestruct,
- pandas_datetime_to_datetimestruct,
-)
-from .period cimport get_period_ordinal
-from .timestamps cimport create_timestamp_from_ts
-from .timezones cimport is_utc
-from .tzconversion cimport Localizer
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def tz_convert_from_utc(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso=NPY_FR_ns):
- # stamps is int64_t, arbitrary ndim
- """
- Convert the values (in i8) from UTC to tz
-
- Parameters
- ----------
- stamps : ndarray[int64]
- tz : tzinfo
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- ndarray[int64]
- """
- cdef:
- Localizer info = Localizer(tz, creso=reso)
- int64_t utc_val, local_val
- Py_ssize_t pos, i, n = stamps.size
-
- ndarray result
- cnp.broadcast mi
-
- if tz is None or is_utc(tz) or stamps.size == 0:
- # Much faster than going through the "standard" pattern below
- return stamps.copy()
-
- result = cnp.PyArray_EMPTY(stamps.ndim, stamps.shape, cnp.NPY_INT64, 0)
- mi = cnp.PyArray_MultiIterNew2(result, stamps)
-
- for i in range(n):
- # Analogous to: utc_val = stamps[i]
- utc_val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if utc_val == NPY_NAT:
- local_val = NPY_NAT
- else:
- local_val = info.utc_val_to_local_val(utc_val, &pos)
-
- # Analogous to: result[i] = local_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = local_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return result
-
-
-# -------------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def ints_to_pydatetime(
- ndarray stamps,
- tzinfo tz=None,
- str box="datetime",
- NPY_DATETIMEUNIT reso=NPY_FR_ns,
-) -> np.ndarray:
- # stamps is int64, arbitrary ndim
- """
- Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp.
-
- Parameters
- ----------
- stamps : array of i8
- tz : tzinfo or None, default None
- convert to this timezone
- box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime'
- * If datetime, convert to datetime.datetime
- * If date, convert to datetime.date
- * If time, convert to datetime.time
- * If Timestamp, convert to pandas.Timestamp
-
- reso : NPY_DATETIMEUNIT, default NPY_FR_ns
-
- Returns
- -------
- ndarray[object] of type specified by box
- """
- cdef:
- Localizer info = Localizer(tz, creso=reso)
- int64_t utc_val, local_val
- Py_ssize_t i, n = stamps.size
- Py_ssize_t pos = -1 # unused, avoid not-initialized warning
-
- npy_datetimestruct dts
- tzinfo new_tz
- bint use_date = False, use_ts = False, use_pydt = False
- object res_val
- bint fold = 0
-
- # Note that `result` (and thus `result_flat`) is C-order and
- # `it` iterates C-order as well, so the iteration matches
- # See discussion at
- # github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
- ndarray result = cnp.PyArray_EMPTY(stamps.ndim, stamps.shape, cnp.NPY_OBJECT, 0)
- object[::1] res_flat = result.ravel() # should NOT be a copy
- cnp.flatiter it = cnp.PyArray_IterNew(stamps)
-
- if box == "date":
- assert (tz is None), "tz should be None when converting to date"
- use_date = True
- elif box == "timestamp":
- use_ts = True
- elif box == "datetime":
- use_pydt = True
- elif box != "time":
- raise ValueError(
- "box must be one of 'datetime', 'date', 'time' or 'timestamp'"
- )
-
- for i in range(n):
- # Analogous to: utc_val = stamps[i]
- utc_val = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0]
-
- new_tz = tz
-
- if utc_val == NPY_NAT:
- res_val = <object>NaT
-
- else:
-
- local_val = info.utc_val_to_local_val(utc_val, &pos, &fold)
- if info.use_pytz:
- # find right representation of dst etc in pytz timezone
- new_tz = tz._tzinfos[tz._transition_info[pos]]
-
- pandas_datetime_to_datetimestruct(local_val, reso, &dts)
-
- if use_ts:
- res_val = create_timestamp_from_ts(
- utc_val, dts, new_tz, fold, reso=reso
- )
- elif use_pydt:
- res_val = datetime(
- dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us,
- new_tz, fold=fold,
- )
- elif use_date:
- res_val = date(dts.year, dts.month, dts.day)
- else:
- res_val = time(dts.hour, dts.min, dts.sec, dts.us, new_tz, fold=fold)
-
- # Note: we can index result directly instead of using PyArray_MultiIter_DATA
- # like we do for the other functions because result is known C-contiguous
- # and is the first argument to PyArray_MultiIterNew2. The usual pattern
- # does not seem to work with object dtype.
- # See discussion at
- # github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
- res_flat[i] = res_val
-
- cnp.PyArray_ITER_NEXT(it)
-
- return result
-
-
-# -------------------------------------------------------------------------
-
-
-cdef c_Resolution _reso_stamp(npy_datetimestruct *dts):
- if dts.ps != 0:
- return c_Resolution.RESO_NS
- elif dts.us != 0:
- if dts.us % 1000 == 0:
- return c_Resolution.RESO_MS
- return c_Resolution.RESO_US
- elif dts.sec != 0:
- return c_Resolution.RESO_SEC
- elif dts.min != 0:
- return c_Resolution.RESO_MIN
- elif dts.hour != 0:
- return c_Resolution.RESO_HR
- return c_Resolution.RESO_DAY
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def get_resolution(
- ndarray stamps, tzinfo tz=None, NPY_DATETIMEUNIT reso=NPY_FR_ns
-) -> Resolution:
- # stamps is int64_t, any ndim
- cdef:
- Localizer info = Localizer(tz, creso=reso)
- int64_t utc_val, local_val
- Py_ssize_t i, n = stamps.size
- Py_ssize_t pos = -1 # unused, avoid not-initialized warning
- cnp.flatiter it = cnp.PyArray_IterNew(stamps)
-
- npy_datetimestruct dts
- c_Resolution pd_reso = c_Resolution.RESO_DAY, curr_reso
-
- for i in range(n):
- # Analogous to: utc_val = stamps[i]
- utc_val = cnp.PyArray_GETITEM(stamps, cnp.PyArray_ITER_DATA(it))
-
- if utc_val == NPY_NAT:
- pass
- else:
- local_val = info.utc_val_to_local_val(utc_val, &pos)
-
- pandas_datetime_to_datetimestruct(local_val, reso, &dts)
- curr_reso = _reso_stamp(&dts)
- if curr_reso < pd_reso:
- pd_reso = curr_reso
-
- cnp.PyArray_ITER_NEXT(it)
-
- return Resolution(pd_reso)
-
-
-# -------------------------------------------------------------------------
-
-
-@cython.cdivision(False)
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef ndarray normalize_i8_timestamps(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso):
- # stamps is int64_t, arbitrary ndim
- """
- Normalize each of the (nanosecond) timezone aware timestamps in the given
- array by rounding down to the beginning of the day (i.e. midnight).
- This is midnight for timezone, `tz`.
-
- Parameters
- ----------
- stamps : int64 ndarray
- tz : tzinfo or None
- reso : NPY_DATETIMEUNIT
-
- Returns
- -------
- result : int64 ndarray of normalized nanosecond timestamps
- """
- cdef:
- Localizer info = Localizer(tz, creso=reso)
- int64_t utc_val, local_val, res_val
- Py_ssize_t i, n = stamps.size
- Py_ssize_t pos = -1 # unused, avoid not-initialized warning
-
- ndarray result = cnp.PyArray_EMPTY(stamps.ndim, stamps.shape, cnp.NPY_INT64, 0)
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, stamps)
- int64_t ppd = periods_per_day(reso)
-
- for i in range(n):
- # Analogous to: utc_val = stamps[i]
- utc_val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if utc_val == NPY_NAT:
- res_val = NPY_NAT
- else:
- local_val = info.utc_val_to_local_val(utc_val, &pos)
- res_val = local_val - (local_val % ppd)
-
- # Analogous to: result[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return result
-
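Rounding a local-time i8 value down to midnight is plain modular arithmetic on the periods-per-day figure, e.g. at nanosecond resolution:

    ppd = 86_400 * 1_000_000_000             # periods (ns) per day
    local_val = 1_700_000_123_456_789_012    # some local-time value in ns since epoch
    midnight = local_val - (local_val % ppd)
    assert midnight % ppd == 0 and 0 <= local_val - midnight < ppd
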
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def is_date_array_normalized(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso) -> bool:
- # stamps is int64_t, arbitrary ndim
- """
- Check if all of the given (nanosecond) timestamps are normalized to
- midnight, i.e. hour == minute == second == 0. If the optional timezone
- `tz` is not None, then this is midnight for this timezone.
-
- Parameters
- ----------
- stamps : int64 ndarray
- tz : tzinfo or None
- reso : NPY_DATETIMEUNIT
-
- Returns
- -------
- is_normalized : bool
- True if all stamps are normalized
- """
- cdef:
- Localizer info = Localizer(tz, creso=reso)
- int64_t utc_val, local_val
- Py_ssize_t i, n = stamps.size
- Py_ssize_t pos = -1 # unused, avoid not-initialized warning
- cnp.flatiter it = cnp.PyArray_IterNew(stamps)
- int64_t ppd = periods_per_day(reso)
-
- for i in range(n):
- # Analogous to: utc_val = stamps[i]
- utc_val = cnp.PyArray_GETITEM(stamps, cnp.PyArray_ITER_DATA(it))
-
- local_val = info.utc_val_to_local_val(utc_val, &pos)
-
- if local_val % ppd != 0:
- return False
-
- cnp.PyArray_ITER_NEXT(it)
-
- return True
-
-
-# -------------------------------------------------------------------------
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def dt64arr_to_periodarr(
- ndarray stamps, int freq, tzinfo tz, NPY_DATETIMEUNIT reso=NPY_FR_ns
-):
- # stamps is int64_t, arbitrary ndim
- cdef:
- Localizer info = Localizer(tz, creso=reso)
- Py_ssize_t i, n = stamps.size
- Py_ssize_t pos = -1 # unused, avoid not-initialized warning
- int64_t utc_val, local_val, res_val
-
- npy_datetimestruct dts
- ndarray result = cnp.PyArray_EMPTY(stamps.ndim, stamps.shape, cnp.NPY_INT64, 0)
- cnp.broadcast mi = cnp.PyArray_MultiIterNew2(result, stamps)
-
- for i in range(n):
- # Analogous to: utc_val = stamps[i]
- utc_val = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
-
- if utc_val == NPY_NAT:
- res_val = NPY_NAT
- else:
- local_val = info.utc_val_to_local_val(utc_val, &pos)
- pandas_datetime_to_datetimestruct(local_val, reso, &dts)
- res_val = get_period_ordinal(&dts, freq)
-
- # Analogous to: result[i] = res_val
- (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = res_val
-
- cnp.PyArray_MultiIter_NEXT(mi)
-
- return result
diff --git a/contrib/python/pandas/py3/pandas/_libs/util.pxd b/contrib/python/pandas/py3/pandas/_libs/util.pxd
deleted file mode 100644
index 18009a1a25d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/util.pxd
+++ /dev/null
@@ -1,17 +0,0 @@
-cimport numpy as cnp
-from libc.stdint cimport (
- INT8_MAX,
- INT8_MIN,
- INT16_MAX,
- INT16_MIN,
- INT32_MAX,
- INT32_MIN,
- INT64_MAX,
- INT64_MIN,
- UINT8_MAX,
- UINT16_MAX,
- UINT32_MAX,
- UINT64_MAX,
-)
-
-from pandas._libs.tslibs.util cimport *
diff --git a/contrib/python/pandas/py3/pandas/_libs/window/__init__.py b/contrib/python/pandas/py3/pandas/_libs/window/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/window/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyi b/contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyi
deleted file mode 100644
index b926a7cb734..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyi
+++ /dev/null
@@ -1,127 +0,0 @@
-from typing import (
- Any,
- Callable,
- Literal,
-)
-
-import numpy as np
-
-from pandas._typing import (
- WindowingRankType,
- npt,
-)
-
-def roll_sum(
- values: np.ndarray, # const float64_t[:]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_mean(
- values: np.ndarray, # const float64_t[:]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_var(
- values: np.ndarray, # const float64_t[:]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
- ddof: int = ...,
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_skew(
- values: np.ndarray, # np.ndarray[np.float64]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_kurt(
- values: np.ndarray, # np.ndarray[np.float64]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_median_c(
- values: np.ndarray, # np.ndarray[np.float64]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_max(
- values: np.ndarray, # np.ndarray[np.float64]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_min(
- values: np.ndarray, # np.ndarray[np.float64]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_quantile(
- values: np.ndarray, # const float64_t[:]
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
- quantile: float, # float64_t
- interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_rank(
- values: np.ndarray,
- start: np.ndarray,
- end: np.ndarray,
- minp: int,
- percentile: bool,
- method: WindowingRankType,
- ascending: bool,
-) -> np.ndarray: ... # np.ndarray[float]
-def roll_apply(
- obj: object,
- start: np.ndarray, # np.ndarray[np.int64]
- end: np.ndarray, # np.ndarray[np.int64]
- minp: int, # int64_t
- function: Callable[..., Any],
- raw: bool,
- args: tuple[Any, ...],
- kwargs: dict[str, Any],
-) -> npt.NDArray[np.float64]: ...
-def roll_weighted_sum(
- values: np.ndarray, # const float64_t[:]
- weights: np.ndarray, # const float64_t[:]
- minp: int,
-) -> np.ndarray: ... # np.ndarray[np.float64]
-def roll_weighted_mean(
- values: np.ndarray, # const float64_t[:]
- weights: np.ndarray, # const float64_t[:]
- minp: int,
-) -> np.ndarray: ... # np.ndarray[np.float64]
-def roll_weighted_var(
- values: np.ndarray, # const float64_t[:]
- weights: np.ndarray, # const float64_t[:]
- minp: int, # int64_t
- ddof: int, # unsigned int
-) -> np.ndarray: ... # np.ndarray[np.float64]
-def ewm(
- vals: np.ndarray, # const float64_t[:]
- start: np.ndarray, # const int64_t[:]
- end: np.ndarray, # const int64_t[:]
- minp: int,
- com: float, # float64_t
- adjust: bool,
- ignore_na: bool,
- deltas: np.ndarray, # const float64_t[:]
- normalize: bool,
-) -> np.ndarray: ... # np.ndarray[np.float64]
-def ewmcov(
- input_x: np.ndarray, # const float64_t[:]
- start: np.ndarray, # const int64_t[:]
- end: np.ndarray, # const int64_t[:]
- minp: int,
- input_y: np.ndarray, # const float64_t[:]
- com: float, # float64_t
- adjust: bool,
- ignore_na: bool,
- bias: bool,
-) -> np.ndarray: ... # np.ndarray[np.float64]
diff --git a/contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyx b/contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyx
deleted file mode 100644
index 511df25c3a8..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/window/aggregations.pyx
+++ /dev/null
@@ -1,1953 +0,0 @@
-# cython: boundscheck=False, wraparound=False, cdivision=True
-
-from libc.math cimport (
- round,
- signbit,
- sqrt,
-)
-from libcpp.deque cimport deque
-
-from pandas._libs.algos cimport TiebreakEnumType
-
-import numpy as np
-
-cimport numpy as cnp
-from numpy cimport (
- float32_t,
- float64_t,
- int64_t,
- ndarray,
-)
-
-cnp.import_array()
-
-import cython
-
-from pandas._libs.algos import is_monotonic
-
-
-cdef extern from "../src/skiplist.h":
- ctypedef struct node_t:
- node_t **next
- int *width
- double value
- int is_nil
- int levels
- int ref_count
-
- ctypedef struct skiplist_t:
- node_t *head
- node_t **tmp_chain
- int *tmp_steps
- int size
- int maxlevels
-
- skiplist_t* skiplist_init(int) nogil
- void skiplist_destroy(skiplist_t*) nogil
- double skiplist_get(skiplist_t*, int, int*) nogil
- int skiplist_insert(skiplist_t*, double) nogil
- int skiplist_remove(skiplist_t*, double) nogil
- int skiplist_rank(skiplist_t*, double) nogil
- int skiplist_min_rank(skiplist_t*, double) nogil
-
-cdef:
- float32_t MINfloat32 = np.NINF
- float64_t MINfloat64 = np.NINF
-
- float32_t MAXfloat32 = np.inf
- float64_t MAXfloat64 = np.inf
-
- float64_t NaN = <float64_t>np.NaN
-
-cdef bint is_monotonic_increasing_start_end_bounds(
- ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end
-):
- return is_monotonic(start, False)[0] and is_monotonic(end, False)[0]
-
-# ----------------------------------------------------------------------
-# Rolling sum
-
-
-cdef float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x,
- int64_t num_consecutive_same_value, float64_t prev_value
- ) nogil:
- cdef:
- float64_t result
-
- if nobs == 0 == minp:
- result = 0
- elif nobs >= minp:
- if num_consecutive_same_value >= nobs:
- result = prev_value * nobs
- else:
- result = sum_x
- else:
- result = NaN
-
- return result
-
-
-cdef void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
- float64_t *compensation, int64_t *num_consecutive_same_value,
- float64_t *prev_value) nogil:
- """ add a value from the sum calc using Kahan summation """
-
- cdef:
- float64_t y, t
-
- # Not NaN
- if val == val:
- nobs[0] = nobs[0] + 1
- y = val - compensation[0]
- t = sum_x[0] + y
- compensation[0] = t - sum_x[0] - y
- sum_x[0] = t
-
- # GH#42064, record num of same values to remove floating point artifacts
- if val == prev_value[0]:
- num_consecutive_same_value[0] += 1
- else:
- # reset to 1 (include current value itself)
- num_consecutive_same_value[0] = 1
- prev_value[0] = val
-
-
-cdef void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
- float64_t *compensation) nogil:
- """ remove a value from the sum calc using Kahan summation """
-
- cdef:
- float64_t y, t
-
- # Not NaN
- if val == val:
- nobs[0] = nobs[0] - 1
- y = - val - compensation[0]
- t = sum_x[0] + y
- compensation[0] = t - sum_x[0] - y
- sum_x[0] = t
-
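add_sum/remove_sum above implement Kahan (compensated) summation so the running window sum does not accumulate round-off as values enter and leave. A standalone sketch of the same scheme in plain Python (names are ours):

    import math

    def kahan_sum(values):
        total = comp = 0.0               # comp carries the low-order bits lost by total
        for val in values:
            y = val - comp
            t = total + y
            comp = (t - total) - y
            total = t
        return total

    vals = [0.1] * 1000
    # The compensated sum stays closer to the exactly rounded result than a plain sum.
    assert abs(kahan_sum(vals) - math.fsum(vals)) < abs(sum(vals) - math.fsum(vals))
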
-
-def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp) -> np.ndarray:
- cdef:
- Py_ssize_t i, j
- float64_t sum_x, compensation_add, compensation_remove, prev_value
- int64_t s, e, num_consecutive_same_value
- int64_t nobs = 0, N = len(start)
- ndarray[float64_t] output
- bint is_monotonic_increasing_bounds
-
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
- output = np.empty(N, dtype=np.float64)
-
- with nogil:
-
- for i in range(0, N):
- s = start[i]
- e = end[i]
-
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
- # setup
- prev_value = values[s]
- num_consecutive_same_value = 0
- sum_x = compensation_add = compensation_remove = 0
- nobs = 0
- for j in range(s, e):
- add_sum(values[j], &nobs, &sum_x, &compensation_add,
- &num_consecutive_same_value, &prev_value)
-
- else:
-
- # calculate deletes
- for j in range(start[i - 1], s):
- remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
-
- # calculate adds
- for j in range(end[i - 1], e):
- add_sum(values[j], &nobs, &sum_x, &compensation_add,
- &num_consecutive_same_value, &prev_value)
-
- output[i] = calc_sum(
- minp, nobs, sum_x, num_consecutive_same_value, prev_value
- )
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- sum_x = 0.0
- compensation_remove = 0.0
-
- return output
-
-
-# ----------------------------------------------------------------------
-# Rolling mean
-
-
-cdef float64_t calc_mean(int64_t minp, Py_ssize_t nobs, Py_ssize_t neg_ct,
- float64_t sum_x, int64_t num_consecutive_same_value,
- float64_t prev_value) nogil:
- cdef:
- float64_t result
-
- if nobs >= minp and nobs > 0:
- result = sum_x / <float64_t>nobs
- if num_consecutive_same_value >= nobs:
- result = prev_value
- elif neg_ct == 0 and result < 0:
- # all positive
- result = 0
- elif neg_ct == nobs and result > 0:
- # all negative
- result = 0
- else:
- pass
- else:
- result = NaN
- return result
-
-
-cdef void add_mean(
- float64_t val,
- Py_ssize_t *nobs,
- float64_t *sum_x,
- Py_ssize_t *neg_ct,
- float64_t *compensation,
- int64_t *num_consecutive_same_value,
- float64_t *prev_value
-) nogil:
- """ add a value from the mean calc using Kahan summation """
- cdef:
- float64_t y, t
-
- # Not NaN
- if val == val:
- nobs[0] = nobs[0] + 1
- y = val - compensation[0]
- t = sum_x[0] + y
- compensation[0] = t - sum_x[0] - y
- sum_x[0] = t
- if signbit(val):
- neg_ct[0] = neg_ct[0] + 1
-
- # GH#42064, record num of same values to remove floating point artifacts
- if val == prev_value[0]:
- num_consecutive_same_value[0] += 1
- else:
- # reset to 1 (include current value itself)
- num_consecutive_same_value[0] = 1
- prev_value[0] = val
-
-
-cdef void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
- Py_ssize_t *neg_ct, float64_t *compensation) nogil:
- """ remove a value from the mean calc using Kahan summation """
- cdef:
- float64_t y, t
-
- if val == val:
- nobs[0] = nobs[0] - 1
- y = - val - compensation[0]
- t = sum_x[0] + y
- compensation[0] = t - sum_x[0] - y
- sum_x[0] = t
- if signbit(val):
- neg_ct[0] = neg_ct[0] - 1
-
-
-def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp) -> np.ndarray:
- cdef:
- float64_t val, compensation_add, compensation_remove, sum_x, prev_value
- int64_t s, e, num_consecutive_same_value
- Py_ssize_t nobs, i, j, neg_ct, N = len(start)
- ndarray[float64_t] output
- bint is_monotonic_increasing_bounds
-
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
- output = np.empty(N, dtype=np.float64)
-
- with nogil:
-
- for i in range(0, N):
- s = start[i]
- e = end[i]
-
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
- # setup
- compensation_add = compensation_remove = sum_x = 0
- nobs = neg_ct = 0
- prev_value = values[s]
- num_consecutive_same_value = 0
- for j in range(s, e):
- val = values[j]
- add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
- &num_consecutive_same_value, &prev_value)
-
- else:
-
- # calculate deletes
- for j in range(start[i - 1], s):
- val = values[j]
- remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
-
- # calculate adds
- for j in range(end[i - 1], e):
- val = values[j]
- add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
- &num_consecutive_same_value, &prev_value)
-
- output[i] = calc_mean(
- minp, nobs, neg_ct, sum_x, num_consecutive_same_value, prev_value
- )
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- neg_ct = 0
- sum_x = 0.0
- compensation_remove = 0.0
- return output
-
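Editorial note: roll_mean above updates each window incrementally (remove the values that left, add the values that entered) rather than recomputing it. A naive NumPy reference with the same NaN handling, useful only to illustrate the start/end contract; every name below is illustrative:

import numpy as np

def naive_roll_mean(values, start, end, minp):
    # O(n * window) reference: recompute each [start[i], end[i]) window from scratch.
    out = np.full(len(start), np.nan)
    for i, (s, e) in enumerate(zip(start, end)):
        window = values[s:e]
        window = window[~np.isnan(window)]
        if window.size >= minp:
            out[i] = window.mean()
    return out

values = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
start = np.array([0, 0, 1, 2, 3])   # roughly what a fixed window of 2 produces
end = np.array([1, 2, 3, 4, 5])
print(naive_roll_mean(values, start, end, minp=1))  # 1.0, 1.5, 2.0, 4.0, 4.5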
-# ----------------------------------------------------------------------
-# Rolling variance
-
-
-cdef float64_t calc_var(
- int64_t minp,
- int ddof,
- float64_t nobs,
- float64_t ssqdm_x,
- int64_t num_consecutive_same_value
-) nogil:
- cdef:
- float64_t result
-
- # Variance is unchanged if no observation is added or removed
- if (nobs >= minp) and (nobs > ddof):
-
- # pathological case & repeatedly same values case
- if nobs == 1 or num_consecutive_same_value >= nobs:
- result = 0
- else:
- result = ssqdm_x / (nobs - <float64_t>ddof)
- else:
- result = NaN
-
- return result
-
-
-cdef void add_var(
- float64_t val,
- float64_t *nobs,
- float64_t *mean_x,
- float64_t *ssqdm_x,
- float64_t *compensation,
- int64_t *num_consecutive_same_value,
- float64_t *prev_value,
-) nogil:
- """ add a value from the var calc """
- cdef:
- float64_t delta, prev_mean, y, t
-
- # GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
- if val != val:
- return
-
- nobs[0] = nobs[0] + 1
-
- # GH#42064, record num of same values to remove floating point artifacts
- if val == prev_value[0]:
- num_consecutive_same_value[0] += 1
- else:
- # reset to 1 (include current value itself)
- num_consecutive_same_value[0] = 1
- prev_value[0] = val
-
- # Welford's method for the online variance-calculation
- # using Kahan summation
- # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- prev_mean = mean_x[0] - compensation[0]
- y = val - compensation[0]
- t = y - mean_x[0]
- compensation[0] = t + mean_x[0] - y
- delta = t
- if nobs[0]:
- mean_x[0] = mean_x[0] + delta / nobs[0]
- else:
- mean_x[0] = 0
- ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])
-
-
-cdef void remove_var(
- float64_t val,
- float64_t *nobs,
- float64_t *mean_x,
- float64_t *ssqdm_x,
- float64_t *compensation
-) nogil:
- """ remove a value from the var calc """
- cdef:
- float64_t delta, prev_mean, y, t
- if val == val:
- nobs[0] = nobs[0] - 1
- if nobs[0]:
- # Welford's method for the online variance-calculation
- # using Kahan summation
- # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- prev_mean = mean_x[0] - compensation[0]
- y = val - compensation[0]
- t = y - mean_x[0]
- compensation[0] = t + mean_x[0] - y
- delta = t
- mean_x[0] = mean_x[0] - delta / nobs[0]
- ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0])
- else:
- mean_x[0] = 0
- ssqdm_x[0] = 0
-
-
-def roll_var(const float64_t[:] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp, int ddof=1) -> np.ndarray:
- """
- Numerically stable implementation using Welford's method.
- """
- cdef:
- float64_t mean_x, ssqdm_x, nobs, compensation_add,
- float64_t compensation_remove, prev_value
- int64_t s, e, num_consecutive_same_value
- Py_ssize_t i, j, N = len(start)
- ndarray[float64_t] output
- bint is_monotonic_increasing_bounds
-
- minp = max(minp, 1)
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
- output = np.empty(N, dtype=np.float64)
-
- with nogil:
-
- for i in range(0, N):
-
- s = start[i]
- e = end[i]
-
- # Over the first window, observations can only be added
- # never removed
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
- prev_value = values[s]
- num_consecutive_same_value = 0
-
- mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
- for j in range(s, e):
- add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
- &num_consecutive_same_value, &prev_value)
-
- else:
-
- # After the first window, observations can both be added
- # and removed
-
- # calculate deletes
- for j in range(start[i - 1], s):
- remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
- &compensation_remove)
-
- # calculate adds
- for j in range(end[i - 1], e):
- add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
- &num_consecutive_same_value, &prev_value)
-
- output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)
-
- if not is_monotonic_increasing_bounds:
- nobs = 0.0
- mean_x = 0.0
- ssqdm_x = 0.0
- compensation_remove = 0.0
-
- return output
-
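Editorial note: add_var/remove_var implement Welford's online update (with an extra Kahan compensation term). A stripped-down pure-Python version of the add step, shown only to make the recurrence explicit; the names are illustrative:

def welford_add(count, mean, m2, x):
    # One Welford step: m2 accumulates the sum of squared deviations (ssqdm_x above).
    count += 1
    delta = x - mean
    mean += delta / count
    m2 += delta * (x - mean)   # note: uses the *updated* mean
    return count, mean, m2

count, mean, m2 = 0, 0.0, 0.0
for x in [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]:
    count, mean, m2 = welford_add(count, mean, m2, x)

ddof = 1
print(mean)                 # ~5.0
print(m2 / (count - ddof))  # sample variance, ~4.57 for this data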
-# ----------------------------------------------------------------------
-# Rolling skewness
-
-
-cdef float64_t calc_skew(int64_t minp, int64_t nobs,
- float64_t x, float64_t xx, float64_t xxx,
- int64_t num_consecutive_same_value
- ) nogil:
- cdef:
- float64_t result, dnobs
- float64_t A, B, C, R
-
- if nobs >= minp:
- dnobs = <float64_t>nobs
- A = x / dnobs
- B = xx / dnobs - A * A
- C = xxx / dnobs - A * A * A - 3 * A * B
-
- if nobs < 3:
- result = NaN
- # GH 42064 46431
- # uniform case, force result to be 0
- elif num_consecutive_same_value >= nobs:
- result = 0.0
- # GH#18044: with a uniform distribution, floating-point error can
- # leave B slightly nonzero, which blows the result up into a very
- # large number.
- #
- # In core/nanops.py, nanskew/nankurt call _zero_out_fperr(m2) to
- # correct this: a variance below 1e-14 can be treated as zero, so
- # here we follow the original skew/kurt behaviour and check B <= 1e-14
- elif B <= 1e-14:
- result = NaN
- else:
- R = sqrt(B)
- result = ((sqrt(dnobs * (dnobs - 1.)) * C) /
- ((dnobs - 2) * R * R * R))
- else:
- result = NaN
-
- return result
-
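Editorial note: calc_skew evaluates the bias-adjusted sample skewness from the raw power sums x, xx, xxx. The same A/B/C moment algebra written directly in NumPy, as an illustrative cross-check; the function name and data are made up:

import numpy as np

def sample_skew(a):
    a = np.asarray(a, dtype=np.float64)
    n = a.size
    A = a.mean()                              # x / n
    B = (a ** 2).mean() - A ** 2              # second central moment
    C = (a ** 3).mean() - A ** 3 - 3 * A * B  # third central moment
    # same adjustment as calc_skew: sqrt(n(n-1)) * C / ((n-2) * B**1.5)
    return np.sqrt(n * (n - 1.0)) * C / ((n - 2.0) * B ** 1.5)

print(sample_skew([1.0, 2.0, 2.0, 3.0, 10.0]))  # positive: long right tail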
-
-cdef void add_skew(float64_t val, int64_t *nobs,
- float64_t *x, float64_t *xx,
- float64_t *xxx,
- float64_t *compensation_x,
- float64_t *compensation_xx,
- float64_t *compensation_xxx,
- int64_t *num_consecutive_same_value,
- float64_t *prev_value,
- ) nogil:
- """ add a value from the skew calc """
- cdef:
- float64_t y, t
-
- # Not NaN
- if val == val:
- nobs[0] = nobs[0] + 1
-
- y = val - compensation_x[0]
- t = x[0] + y
- compensation_x[0] = t - x[0] - y
- x[0] = t
- y = val * val - compensation_xx[0]
- t = xx[0] + y
- compensation_xx[0] = t - xx[0] - y
- xx[0] = t
- y = val * val * val - compensation_xxx[0]
- t = xxx[0] + y
- compensation_xxx[0] = t - xxx[0] - y
- xxx[0] = t
-
- # GH#42064, record num of same values to remove floating point artifacts
- if val == prev_value[0]:
- num_consecutive_same_value[0] += 1
- else:
- # reset to 1 (include current value itself)
- num_consecutive_same_value[0] = 1
- prev_value[0] = val
-
-
-cdef void remove_skew(float64_t val, int64_t *nobs,
- float64_t *x, float64_t *xx,
- float64_t *xxx,
- float64_t *compensation_x,
- float64_t *compensation_xx,
- float64_t *compensation_xxx) nogil:
- """ remove a value from the skew calc """
- cdef:
- float64_t y, t
-
- # Not NaN
- if val == val:
- nobs[0] = nobs[0] - 1
-
- y = - val - compensation_x[0]
- t = x[0] + y
- compensation_x[0] = t - x[0] - y
- x[0] = t
- y = - val * val - compensation_xx[0]
- t = xx[0] + y
- compensation_xx[0] = t - xx[0] - y
- xx[0] = t
- y = - val * val * val - compensation_xxx[0]
- t = xxx[0] + y
- compensation_xxx[0] = t - xxx[0] - y
- xxx[0] = t
-
-
-def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp) -> np.ndarray:
- cdef:
- Py_ssize_t i, j
- float64_t val, min_val, mean_val, sum_val = 0
- float64_t compensation_xxx_add, compensation_xxx_remove
- float64_t compensation_xx_add, compensation_xx_remove
- float64_t compensation_x_add, compensation_x_remove
- float64_t x, xx, xxx
- float64_t prev_value
- int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
- int64_t s, e, num_consecutive_same_value
- ndarray[float64_t] output, values_copy
- bint is_monotonic_increasing_bounds
-
- minp = max(minp, 3)
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
- output = np.empty(N, dtype=np.float64)
- min_val = np.nanmin(values)
- values_copy = np.copy(values)
-
- with nogil:
- for i in range(0, V):
- val = values_copy[i]
- if val == val:
- nobs_mean += 1
- sum_val += val
- mean_val = sum_val / nobs_mean
- # Other cases would lead to imprecision for smallest values
- if min_val - mean_val > -1e5:
- mean_val = round(mean_val)
- for i in range(0, V):
- values_copy[i] = values_copy[i] - mean_val
-
- for i in range(0, N):
-
- s = start[i]
- e = end[i]
-
- # Over the first window, observations can only be added
- # never removed
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
- prev_value = values[s]
- num_consecutive_same_value = 0
-
- compensation_xxx_add = compensation_xxx_remove = 0
- compensation_xx_add = compensation_xx_remove = 0
- compensation_x_add = compensation_x_remove = 0
- x = xx = xxx = 0
- nobs = 0
- for j in range(s, e):
- val = values_copy[j]
- add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
- &compensation_xx_add, &compensation_xxx_add,
- &num_consecutive_same_value, &prev_value)
-
- else:
-
- # After the first window, observations can both be added
- # and removed
- # calculate deletes
- for j in range(start[i - 1], s):
- val = values_copy[j]
- remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
- &compensation_xx_remove, &compensation_xxx_remove)
-
- # calculate adds
- for j in range(end[i - 1], e):
- val = values_copy[j]
- add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
- &compensation_xx_add, &compensation_xxx_add,
- &num_consecutive_same_value, &prev_value)
-
- output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- x = 0.0
- xx = 0.0
- xxx = 0.0
-
- return output
-
-# ----------------------------------------------------------------------
-# Rolling kurtosis
-
-
-cdef float64_t calc_kurt(int64_t minp, int64_t nobs,
- float64_t x, float64_t xx,
- float64_t xxx, float64_t xxxx,
- int64_t num_consecutive_same_value,
- ) nogil:
- cdef:
- float64_t result, dnobs
- float64_t A, B, C, D, R, K
-
- if nobs >= minp:
- if nobs < 4:
- result = NaN
- # GH 42064 46431
- # uniform case, force result to be -3.
- elif num_consecutive_same_value >= nobs:
- result = -3.
- else:
- dnobs = <float64_t>nobs
- A = x / dnobs
- R = A * A
- B = xx / dnobs - R
- R = R * A
- C = xxx / dnobs - R - 3 * A * B
- R = R * A
- D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
-
- # GH#18044: with a uniform distribution, floating-point error can
- # leave B slightly nonzero, which blows the result up into a very
- # large number.
- #
- # In core/nanops.py, nanskew/nankurt call _zero_out_fperr(m2) to
- # correct this: a variance below 1e-14 can be treated as zero, so
- # here we follow the original skew/kurt behaviour and check B <= 1e-14
- if B <= 1e-14:
- result = NaN
- else:
- K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
- result = K / ((dnobs - 2.) * (dnobs - 3.))
- else:
- result = NaN
-
- return result
-
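Editorial note: calc_kurt does the analogous reduction for excess kurtosis, building the fourth central moment D from the power sums and then applying the small-sample adjustment K / ((n-2)(n-3)). A NumPy version of the same formula, purely for illustration; names and data are made up:

import numpy as np

def sample_kurt(a):
    a = np.asarray(a, dtype=np.float64)
    n = a.size
    A = a.mean()
    B = (a ** 2).mean() - A ** 2                               # m2
    C = (a ** 3).mean() - A ** 3 - 3 * A * B                   # m3
    D = (a ** 4).mean() - A ** 4 - 6 * B * A ** 2 - 4 * C * A  # m4
    K = (n * n - 1.0) * D / (B * B) - 3 * (n - 1.0) ** 2
    return K / ((n - 2.0) * (n - 3.0))                         # excess kurtosis

print(sample_kurt([1.0, 2.0, 2.0, 3.0, 10.0]))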
-
-cdef void add_kurt(float64_t val, int64_t *nobs,
- float64_t *x, float64_t *xx,
- float64_t *xxx, float64_t *xxxx,
- float64_t *compensation_x,
- float64_t *compensation_xx,
- float64_t *compensation_xxx,
- float64_t *compensation_xxxx,
- int64_t *num_consecutive_same_value,
- float64_t *prev_value
- ) nogil:
- """ add a value from the kurtosis calc """
- cdef:
- float64_t y, t
-
- # Not NaN
- if val == val:
- nobs[0] = nobs[0] + 1
-
- y = val - compensation_x[0]
- t = x[0] + y
- compensation_x[0] = t - x[0] - y
- x[0] = t
- y = val * val - compensation_xx[0]
- t = xx[0] + y
- compensation_xx[0] = t - xx[0] - y
- xx[0] = t
- y = val * val * val - compensation_xxx[0]
- t = xxx[0] + y
- compensation_xxx[0] = t - xxx[0] - y
- xxx[0] = t
- y = val * val * val * val - compensation_xxxx[0]
- t = xxxx[0] + y
- compensation_xxxx[0] = t - xxxx[0] - y
- xxxx[0] = t
-
- # GH#42064, record num of same values to remove floating point artifacts
- if val == prev_value[0]:
- num_consecutive_same_value[0] += 1
- else:
- # reset to 1 (include current value itself)
- num_consecutive_same_value[0] = 1
- prev_value[0] = val
-
-
-cdef void remove_kurt(float64_t val, int64_t *nobs,
- float64_t *x, float64_t *xx,
- float64_t *xxx, float64_t *xxxx,
- float64_t *compensation_x,
- float64_t *compensation_xx,
- float64_t *compensation_xxx,
- float64_t *compensation_xxxx) nogil:
- """ remove a value from the kurtosis calc """
- cdef:
- float64_t y, t
-
- # Not NaN
- if val == val:
- nobs[0] = nobs[0] - 1
-
- y = - val - compensation_x[0]
- t = x[0] + y
- compensation_x[0] = t - x[0] - y
- x[0] = t
- y = - val * val - compensation_xx[0]
- t = xx[0] + y
- compensation_xx[0] = t - xx[0] - y
- xx[0] = t
- y = - val * val * val - compensation_xxx[0]
- t = xxx[0] + y
- compensation_xxx[0] = t - xxx[0] - y
- xxx[0] = t
- y = - val * val * val * val - compensation_xxxx[0]
- t = xxxx[0] + y
- compensation_xxxx[0] = t - xxxx[0] - y
- xxxx[0] = t
-
-
-def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp) -> np.ndarray:
- cdef:
- Py_ssize_t i, j
- float64_t val, mean_val, min_val, sum_val = 0
- float64_t compensation_xxxx_add, compensation_xxxx_remove
- float64_t compensation_xxx_remove, compensation_xxx_add
- float64_t compensation_xx_remove, compensation_xx_add
- float64_t compensation_x_remove, compensation_x_add
- float64_t x, xx, xxx, xxxx
- float64_t prev_value
- int64_t nobs, s, e, num_consecutive_same_value
- int64_t N = len(start), V = len(values), nobs_mean = 0
- ndarray[float64_t] output, values_copy
- bint is_monotonic_increasing_bounds
-
- minp = max(minp, 4)
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
- output = np.empty(N, dtype=np.float64)
- values_copy = np.copy(values)
- min_val = np.nanmin(values)
-
- with nogil:
- for i in range(0, V):
- val = values_copy[i]
- if val == val:
- nobs_mean += 1
- sum_val += val
- mean_val = sum_val / nobs_mean
- # Other cases would lead to imprecision for smallest values
- if min_val - mean_val > -1e4:
- mean_val = round(mean_val)
- for i in range(0, V):
- values_copy[i] = values_copy[i] - mean_val
-
- for i in range(0, N):
-
- s = start[i]
- e = end[i]
-
- # Over the first window, observations can only be added
- # never removed
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
- prev_value = values[s]
- num_consecutive_same_value = 0
-
- compensation_xxxx_add = compensation_xxxx_remove = 0
- compensation_xxx_remove = compensation_xxx_add = 0
- compensation_xx_remove = compensation_xx_add = 0
- compensation_x_remove = compensation_x_add = 0
- x = xx = xxx = xxxx = 0
- nobs = 0
- for j in range(s, e):
- add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
- &compensation_x_add, &compensation_xx_add,
- &compensation_xxx_add, &compensation_xxxx_add,
- &num_consecutive_same_value, &prev_value)
-
- else:
-
- # After the first window, observations can both be added
- # and removed
- # calculate deletes
- for j in range(start[i - 1], s):
- remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
- &compensation_x_remove, &compensation_xx_remove,
- &compensation_xxx_remove, &compensation_xxxx_remove)
-
- # calculate adds
- for j in range(end[i - 1], e):
- add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
- &compensation_x_add, &compensation_xx_add,
- &compensation_xxx_add, &compensation_xxxx_add,
- &num_consecutive_same_value, &prev_value)
-
- output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx,
- num_consecutive_same_value)
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- x = 0.0
- xx = 0.0
- xxx = 0.0
- xxxx = 0.0
-
- return output
-
-
-# ----------------------------------------------------------------------
-# Rolling median, min, max
-
-
-def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp) -> np.ndarray:
- cdef:
- Py_ssize_t i, j
- bint err = False, is_monotonic_increasing_bounds
- int midpoint, ret = 0
- int64_t nobs = 0, N = len(start), s, e, win
- float64_t val, res
- skiplist_t *sl
- ndarray[float64_t] output
-
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
-
- # we use the Fixed/Variable Indexer here as the
- # actual skiplist ops outweigh any window computation costs
- output = np.empty(N, dtype=np.float64)
-
- if (end - start).max() == 0:
- output[:] = NaN
- return output
- win = (end - start).max()
- sl = skiplist_init(<int>win)
- if sl == NULL:
- raise MemoryError("skiplist_init failed")
-
- with nogil:
-
- for i in range(0, N):
- s = start[i]
- e = end[i]
-
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
- if i != 0:
- skiplist_destroy(sl)
- sl = skiplist_init(<int>win)
- nobs = 0
- # setup
- for j in range(s, e):
- val = values[j]
- if val == val:
- nobs += 1
- err = skiplist_insert(sl, val) == -1
- if err:
- break
-
- else:
-
- # calculate adds
- for j in range(end[i - 1], e):
- val = values[j]
- if val == val:
- nobs += 1
- err = skiplist_insert(sl, val) == -1
- if err:
- break
-
- # calculate deletes
- for j in range(start[i - 1], s):
- val = values[j]
- if val == val:
- skiplist_remove(sl, val)
- nobs -= 1
- if nobs >= minp:
- midpoint = <int>(nobs / 2)
- if nobs % 2:
- res = skiplist_get(sl, midpoint, &ret)
- else:
- res = (skiplist_get(sl, midpoint, &ret) +
- skiplist_get(sl, (midpoint - 1), &ret)) / 2
- if ret == 0:
- res = NaN
- else:
- res = NaN
-
- output[i] = res
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- skiplist_destroy(sl)
- sl = skiplist_init(<int>win)
-
- skiplist_destroy(sl)
- if err:
- raise MemoryError("skiplist_insert failed")
- return output
-
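Editorial note: roll_median_c keeps the window in a skiplist so that insert, remove, and rank lookups are O(log window). A much simpler pure-Python sketch (re-sorting each window, so O(window log window) per step) conveys the same idea of reading the middle element(s) of a sorted window; all names and data are illustrative:

import math

def naive_roll_median(values, start, end, minp):
    out = []
    for s, e in zip(start, end):
        window = sorted(v for v in values[s:e] if not math.isnan(v))
        n = len(window)
        if n >= minp and n > 0:
            mid = n // 2
            out.append(window[mid] if n % 2 else (window[mid] + window[mid - 1]) / 2)
        else:
            out.append(float("nan"))
    return out

values = [1.0, 3.0, 2.0, float("nan"), 8.0]
print(naive_roll_median(values, start=[0, 0, 1, 2, 3], end=[1, 2, 3, 4, 5], minp=1))
# [1.0, 2.0, 2.5, 2.0, 8.0]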
-
-# ----------------------------------------------------------------------
-
-# Moving maximum / minimum code taken from Bottleneck under the terms
-# of its Simplified BSD license
-# https://github.com/pydata/bottleneck
-
-
-cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) nogil:
-
- if ai == ai:
- nobs[0] = nobs[0] + 1
- elif is_max:
- ai = MINfloat64
- else:
- ai = MAXfloat64
-
- return ai
-
-
-cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) nogil:
- """ remove a value from the mm calc """
- if aold == aold:
- nobs[0] = nobs[0] - 1
-
-
-cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs,
- float64_t value) nogil:
- cdef:
- float64_t result
-
- if nobs >= minp:
- result = value
- else:
- result = NaN
-
- return result
-
-
-def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp) -> np.ndarray:
- """
- Moving max of 1d array of any numeric type along axis=0 ignoring NaNs.
-
- Parameters
- ----------
- values : np.ndarray[np.float64]
- start : np.ndarray[np.int64]
- starting index of each window
- end : np.ndarray[np.int64]
- ending index (exclusive) of each window
- minp : int
- if the number of observations in a window
- is below this, output a NaN
-
- Returns
- -------
- np.ndarray[float]
- """
- return _roll_min_max(values, start, end, minp, is_max=1)
-
-
-def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp) -> np.ndarray:
- """
- Moving min of 1d array of any numeric type along axis=0 ignoring NaNs.
-
- Parameters
- ----------
- values : np.ndarray[np.float64]
- start : np.ndarray[np.int64]
- starting index of each window
- end : np.ndarray[np.int64]
- ending index (exclusive) of each window
- minp : int
- if the number of observations in a window
- is below this, output a NaN
-
- Returns
- -------
- np.ndarray[float]
- """
- return _roll_min_max(values, start, end, minp, is_max=0)
-
-
-cdef _roll_min_max(ndarray[float64_t] values,
- ndarray[int64_t] starti,
- ndarray[int64_t] endi,
- int64_t minp,
- bint is_max):
- cdef:
- float64_t ai
- int64_t curr_win_size, start
- Py_ssize_t i, k, nobs = 0, N = len(starti)
- deque Q[int64_t] # min/max always the front
- deque W[int64_t] # track the whole window for nobs compute
- ndarray[float64_t, ndim=1] output
-
- output = np.empty(N, dtype=np.float64)
- Q = deque[int64_t]()
- W = deque[int64_t]()
-
- with nogil:
-
- # This is using a modified version of the C++ code in this
- # SO post: https://stackoverflow.com/a/12239580
- # The original impl didn't deal with variable window sizes,
- # so the code here was adapted to handle them
-
- # first window's size
- curr_win_size = endi[0] - starti[0]
- # GH 32865
- # Anchor output index to values index to provide custom
- # BaseIndexer support
- for i in range(N):
-
- curr_win_size = endi[i] - starti[i]
- if i == 0:
- start = starti[i]
- else:
- start = endi[i - 1]
-
- for k in range(start, endi[i]):
- ai = init_mm(values[k], &nobs, is_max)
- # Discard previous entries if we find new min or max
- if is_max:
- while not Q.empty() and ((ai >= values[Q.back()]) or
- values[Q.back()] != values[Q.back()]):
- Q.pop_back()
- else:
- while not Q.empty() and ((ai <= values[Q.back()]) or
- values[Q.back()] != values[Q.back()]):
- Q.pop_back()
- Q.push_back(k)
- W.push_back(k)
-
- # Discard entries outside and left of current window
- while not Q.empty() and Q.front() <= starti[i] - 1:
- Q.pop_front()
- while not W.empty() and W.front() <= starti[i] - 1:
- remove_mm(values[W.front()], &nobs)
- W.pop_front()
-
- # Save output based on index in input value array
- if not Q.empty() and curr_win_size > 0:
- output[i] = calc_mm(minp, nobs, values[Q.front()])
- else:
- output[i] = NaN
-
- return output
-
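Editorial note: _roll_min_max uses the classic monotonic-deque trick, where indices whose values can never become the extreme again are popped from the back, so the front of the deque always holds the current max (or min). A pure-Python sketch for a fixed-width max window, assuming no NaNs; the names are illustrative:

from collections import deque

def rolling_max(values, window):
    # Monotonically decreasing deque of indices; the front is always the window max.
    q = deque()
    out = []
    for i, v in enumerate(values):
        while q and values[q[-1]] <= v:   # drop entries dominated by the new value
            q.pop()
        q.append(i)
        if q[0] <= i - window:            # drop entries that slid out of the window
            q.popleft()
        if i >= window - 1:
            out.append(values[q[0]])
    return out

print(rolling_max([3, 1, 4, 1, 5, 9, 2, 6], window=3))  # [4, 4, 5, 9, 9, 9]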
-
-cdef enum InterpolationType:
- LINEAR,
- LOWER,
- HIGHER,
- NEAREST,
- MIDPOINT
-
-
-interpolation_types = {
- "linear": LINEAR,
- "lower": LOWER,
- "higher": HIGHER,
- "nearest": NEAREST,
- "midpoint": MIDPOINT,
-}
-
-
-def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp,
- float64_t quantile, str interpolation) -> np.ndarray:
- """
- O(N log(window)) implementation using skip list
- """
- cdef:
- Py_ssize_t i, j, s, e, N = len(start), idx
- int ret = 0
- int64_t nobs = 0, win
- float64_t val, idx_with_fraction
- float64_t vlow, vhigh
- skiplist_t *skiplist
- InterpolationType interpolation_type
- ndarray[float64_t] output
-
- if quantile <= 0.0 or quantile >= 1.0:
- raise ValueError(f"quantile value {quantile} not in (0, 1)")
-
- try:
- interpolation_type = interpolation_types[interpolation]
- except KeyError:
- raise ValueError(f"Interpolation '{interpolation}' is not supported")
-
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
- # we use the Fixed/Variable Indexer here as the
- # actual skiplist ops outweigh any window computation costs
- output = np.empty(N, dtype=np.float64)
-
- win = (end - start).max()
- if win == 0:
- output[:] = NaN
- return output
- skiplist = skiplist_init(<int>win)
- if skiplist == NULL:
- raise MemoryError("skiplist_init failed")
-
- with nogil:
- for i in range(0, N):
- s = start[i]
- e = end[i]
-
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
- if i != 0:
- nobs = 0
- skiplist_destroy(skiplist)
- skiplist = skiplist_init(<int>win)
-
- # setup
- for j in range(s, e):
- val = values[j]
- if val == val:
- nobs += 1
- skiplist_insert(skiplist, val)
-
- else:
- # calculate adds
- for j in range(end[i - 1], e):
- val = values[j]
- if val == val:
- nobs += 1
- skiplist_insert(skiplist, val)
-
- # calculate deletes
- for j in range(start[i - 1], s):
- val = values[j]
- if val == val:
- skiplist_remove(skiplist, val)
- nobs -= 1
- if nobs >= minp:
- if nobs == 1:
- # Single value in skip list
- output[i] = skiplist_get(skiplist, 0, &ret)
- else:
- idx_with_fraction = quantile * (nobs - 1)
- idx = <int>idx_with_fraction
-
- if idx_with_fraction == idx:
- # no need to interpolate
- output[i] = skiplist_get(skiplist, idx, &ret)
- continue
-
- if interpolation_type == LINEAR:
- vlow = skiplist_get(skiplist, idx, &ret)
- vhigh = skiplist_get(skiplist, idx + 1, &ret)
- output[i] = ((vlow + (vhigh - vlow) *
- (idx_with_fraction - idx)))
- elif interpolation_type == LOWER:
- output[i] = skiplist_get(skiplist, idx, &ret)
- elif interpolation_type == HIGHER:
- output[i] = skiplist_get(skiplist, idx + 1, &ret)
- elif interpolation_type == NEAREST:
- # the same behaviour as round()
- if idx_with_fraction - idx == 0.5:
- if idx % 2 == 0:
- output[i] = skiplist_get(skiplist, idx, &ret)
- else:
- output[i] = skiplist_get(
- skiplist, idx + 1, &ret)
- elif idx_with_fraction - idx < 0.5:
- output[i] = skiplist_get(skiplist, idx, &ret)
- else:
- output[i] = skiplist_get(skiplist, idx + 1, &ret)
- elif interpolation_type == MIDPOINT:
- vlow = skiplist_get(skiplist, idx, &ret)
- vhigh = skiplist_get(skiplist, idx + 1, &ret)
- output[i] = <float64_t>(vlow + vhigh) / 2
-
- if ret == 0:
- output[i] = NaN
- else:
- output[i] = NaN
-
- skiplist_destroy(skiplist)
-
- return output
-
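Editorial note: the LINEAR branch of roll_quantile interpolates between the two order statistics around position quantile * (nobs - 1). The same computation on a plain sorted list, as a worked illustration; names and data are made up:

def linear_quantile(sorted_vals, q):
    # Position between order statistics, as in roll_quantile's LINEAR branch.
    pos = q * (len(sorted_vals) - 1)
    idx = int(pos)
    frac = pos - idx
    if frac == 0:
        return sorted_vals[idx]
    return sorted_vals[idx] + (sorted_vals[idx + 1] - sorted_vals[idx]) * frac

window = [1.0, 2.0, 4.0, 8.0]
print(linear_quantile(window, 0.5))   # 3.0: halfway between 2.0 and 4.0
print(linear_quantile(window, 0.25))  # 1.75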
-
-rolling_rank_tiebreakers = {
- "average": TiebreakEnumType.TIEBREAK_AVERAGE,
- "min": TiebreakEnumType.TIEBREAK_MIN,
- "max": TiebreakEnumType.TIEBREAK_MAX,
-}
-
-
-def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
- ndarray[int64_t] end, int64_t minp, bint percentile,
- str method, bint ascending) -> np.ndarray:
- """
- O(N log(window)) implementation using skip list
-
- derived from roll_quantile
- """
- cdef:
- Py_ssize_t i, j, s, e, N = len(start)
- float64_t rank_min = 0, rank = 0
- int64_t nobs = 0, win
- float64_t val
- skiplist_t *skiplist
- float64_t[::1] output
- TiebreakEnumType rank_type
-
- try:
- rank_type = rolling_rank_tiebreakers[method]
- except KeyError:
- raise ValueError(f"Method '{method}' is not supported")
-
- is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
- start, end
- )
- # we use the Fixed/Variable Indexer here as the
- # actual skiplist ops outweigh any window computation costs
- output = np.empty(N, dtype=np.float64)
-
- win = (end - start).max()
- if win == 0:
- output[:] = NaN
- return np.asarray(output)
- skiplist = skiplist_init(<int>win)
- if skiplist == NULL:
- raise MemoryError("skiplist_init failed")
-
- with nogil:
- for i in range(N):
- s = start[i]
- e = end[i]
-
- if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
- if i != 0:
- nobs = 0
- skiplist_destroy(skiplist)
- skiplist = skiplist_init(<int>win)
-
- # setup
- for j in range(s, e):
- val = values[j] if ascending else -values[j]
- if val == val:
- nobs += 1
- rank = skiplist_insert(skiplist, val)
- if rank == -1:
- raise MemoryError("skiplist_insert failed")
- if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
- # The average rank of `val` is the sum of the ranks of all
- # instances of `val` in the skip list divided by the number
- # of instances. The sum of consecutive integers from 1 to N
- # is N * (N + 1) / 2.
- # The sum of the ranks is the sum of integers from the
- # lowest rank to the highest rank, which is the sum of
- # integers from 1 to the highest rank minus the sum of
- # integers from 1 to one less than the lowest rank.
- rank_min = skiplist_min_rank(skiplist, val)
- rank = (((rank * (rank + 1) / 2)
- - ((rank_min - 1) * rank_min / 2))
- / (rank - rank_min + 1))
- elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
- rank = skiplist_min_rank(skiplist, val)
- else:
- rank = NaN
-
- else:
- # calculate deletes
- for j in range(start[i - 1], s):
- val = values[j] if ascending else -values[j]
- if val == val:
- skiplist_remove(skiplist, val)
- nobs -= 1
-
- # calculate adds
- for j in range(end[i - 1], e):
- val = values[j] if ascending else -values[j]
- if val == val:
- nobs += 1
- rank = skiplist_insert(skiplist, val)
- if rank == -1:
- raise MemoryError("skiplist_insert failed")
- if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
- rank_min = skiplist_min_rank(skiplist, val)
- rank = (((rank * (rank + 1) / 2)
- - ((rank_min - 1) * rank_min / 2))
- / (rank - rank_min + 1))
- elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
- rank = skiplist_min_rank(skiplist, val)
- else:
- rank = NaN
- if nobs >= minp:
- output[i] = rank / nobs if percentile else rank
- else:
- output[i] = NaN
-
- skiplist_destroy(skiplist)
-
- return np.asarray(output)
-
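Editorial note: the TIEBREAK_AVERAGE branch above derives the average rank of a run of ties as a difference of two triangular numbers divided by the number of tied values. A quick arithmetic check of that identity in plain Python; the helper and values are made up:

def average_tied_rank(rank_min, rank_max):
    # Sum of ranks rank_min..rank_max divided by the number of tied values,
    # written as in roll_rank: difference of two triangular numbers.
    count = rank_max - rank_min + 1
    return (rank_max * (rank_max + 1) / 2 - (rank_min - 1) * rank_min / 2) / count

# Three values tied at ranks 4, 5 and 6 all get rank (4 + 5 + 6) / 3 = 5.0
print(average_tied_rank(4, 6))  # 5.0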
-
-def roll_apply(object obj,
- ndarray[int64_t] start, ndarray[int64_t] end,
- int64_t minp,
- object function, bint raw,
- tuple args, dict kwargs) -> np.ndarray:
- cdef:
- ndarray[float64_t] output, counts
- ndarray[float64_t, cast=True] arr
- Py_ssize_t i, s, e, N = len(start), n = len(obj)
-
- if n == 0:
- return np.array([], dtype=np.float64)
-
- arr = np.asarray(obj)
-
- # ndarray input
- if raw and not arr.flags.c_contiguous:
- arr = arr.copy("C")
-
- counts = roll_sum(np.isfinite(arr).astype(float), start, end, minp)
-
- output = np.empty(N, dtype=np.float64)
-
- for i in range(N):
-
- s = start[i]
- e = end[i]
-
- if counts[i] >= minp:
- if raw:
- output[i] = function(arr[s:e], *args, **kwargs)
- else:
- output[i] = function(obj.iloc[s:e], *args, **kwargs)
- else:
- output[i] = NaN
-
- return output
-
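Editorial note: roll_apply counts finite values per window (via roll_sum over an indicator) and calls the Python function only on slices that meet minp; the slice itself is passed unfiltered. A pure-Python sketch of that contract; all names are illustrative:

import numpy as np

def naive_roll_apply(arr, start, end, minp, func):
    out = np.full(len(start), np.nan)
    for i, (s, e) in enumerate(zip(start, end)):
        window = arr[s:e]
        if np.isfinite(window).sum() >= minp:
            out[i] = func(window)
    return out

arr = np.array([1.0, 2.0, np.nan, 4.0])
print(naive_roll_apply(arr, [0, 0, 1, 2], [1, 2, 3, 4], minp=1, func=np.nansum))
# [1.0, 3.0, 2.0, 4.0]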
-
-# ----------------------------------------------------------------------
-# Rolling sum and mean for weighted window
-
-
-def roll_weighted_sum(
- const float64_t[:] values, const float64_t[:] weights, int minp
-) -> np.ndarray:
- return _roll_weighted_sum_mean(values, weights, minp, avg=0)
-
-
-def roll_weighted_mean(
- const float64_t[:] values, const float64_t[:] weights, int minp
-) -> np.ndarray:
- return _roll_weighted_sum_mean(values, weights, minp, avg=1)
-
-
-cdef float64_t[:] _roll_weighted_sum_mean(const float64_t[:] values,
- const float64_t[:] weights,
- int minp, bint avg):
- """
- Assume len(weights) << len(values)
- """
- cdef:
- float64_t[:] output, tot_wgt, counts
- Py_ssize_t in_i, win_i, win_n, in_n
- float64_t val_in, val_win, c, w
-
- in_n = len(values)
- win_n = len(weights)
-
- output = np.zeros(in_n, dtype=np.float64)
- counts = np.zeros(in_n, dtype=np.float64)
- if avg:
- tot_wgt = np.zeros(in_n, dtype=np.float64)
-
- elif minp > in_n:
- minp = in_n + 1
-
- minp = max(minp, 1)
-
- with nogil:
- if avg:
- for win_i in range(win_n):
- val_win = weights[win_i]
- if val_win != val_win:
- continue
-
- for in_i in range(in_n - (win_n - win_i) + 1):
- val_in = values[in_i]
- if val_in == val_in:
- output[in_i + (win_n - win_i) - 1] += val_in * val_win
- counts[in_i + (win_n - win_i) - 1] += 1
- tot_wgt[in_i + (win_n - win_i) - 1] += val_win
-
- for in_i in range(in_n):
- c = counts[in_i]
- if c < minp:
- output[in_i] = NaN
- else:
- w = tot_wgt[in_i]
- if w == 0:
- output[in_i] = NaN
- else:
- output[in_i] /= tot_wgt[in_i]
-
- else:
- for win_i in range(win_n):
- val_win = weights[win_i]
- if val_win != val_win:
- continue
-
- for in_i in range(in_n - (win_n - win_i) + 1):
- val_in = values[in_i]
-
- if val_in == val_in:
- output[in_i + (win_n - win_i) - 1] += val_in * val_win
- counts[in_i + (win_n - win_i) - 1] += 1
-
- for in_i in range(in_n):
- c = counts[in_i]
- if c < minp:
- output[in_i] = NaN
-
- return output
-
-
-# ----------------------------------------------------------------------
-# Rolling var for weighted window
-
-
-cdef float64_t calc_weighted_var(float64_t t,
- float64_t sum_w,
- Py_ssize_t win_n,
- unsigned int ddof,
- float64_t nobs,
- int64_t minp) nogil:
- """
- Calculate weighted variance for a window using West's method.
-
- Paper: https://dl.acm.org/citation.cfm?id=359153
-
- Parameters
- ----------
- t: float64_t
- sum of weighted squared differences
- sum_w: float64_t
- sum of weights
- win_n: Py_ssize_t
- window size
- ddof: unsigned int
- delta degrees of freedom
- nobs: float64_t
- number of observations
- minp: int64_t
- minimum number of observations
-
- Returns
- -------
- result : float64_t
- weighted variance of the window
- """
-
- cdef:
- float64_t result
-
- # Variance is unchanged if no observation is added or removed
- if (nobs >= minp) and (nobs > ddof):
-
- # pathological case
- if nobs == 1:
- result = 0
- else:
- result = t * win_n / ((win_n - ddof) * sum_w)
- if result < 0:
- result = 0
- else:
- result = NaN
-
- return result
-
-
-cdef void add_weighted_var(float64_t val,
- float64_t w,
- float64_t *t,
- float64_t *sum_w,
- float64_t *mean,
- float64_t *nobs) nogil:
- """
- Update weighted mean, sum of weights and sum of weighted squared
- differences to include value and weight pair in weighted variance
- calculation using West's method.
-
- Paper: https://dl.acm.org/citation.cfm?id=359153
-
- Parameters
- ----------
- val: float64_t
- window values
- w: float64_t
- window weights
- t: float64_t
- sum of weighted squared differences
- sum_w: float64_t
- sum of weights
- mean: float64_t
- weighted mean
- nobs: float64_t
- number of observations
- """
-
- cdef:
- float64_t temp, q, r
-
- if val != val:
- return
-
- nobs[0] = nobs[0] + 1
-
- q = val - mean[0]
- temp = sum_w[0] + w
- r = q * w / temp
-
- mean[0] = mean[0] + r
- t[0] = t[0] + r * sum_w[0] * q
- sum_w[0] = temp
-
-
-cdef void remove_weighted_var(float64_t val,
- float64_t w,
- float64_t *t,
- float64_t *sum_w,
- float64_t *mean,
- float64_t *nobs) nogil:
- """
- Update weighted mean, sum of weights and sum of weighted squared
- differences to remove value and weight pair from weighted variance
- calculation using West's method.
-
- Paper: https://dl.acm.org/citation.cfm?id=359153
-
- Parameters
- ----------
- val: float64_t
- window values
- w: float64_t
- window weights
- t: float64_t
- sum of weighted squared differences
- sum_w: float64_t
- sum of weights
- mean: float64_t
- weighted mean
- nobs: float64_t
- number of observations
- """
-
- cdef:
- float64_t temp, q, r
-
- if val == val:
- nobs[0] = nobs[0] - 1
-
- if nobs[0]:
- q = val - mean[0]
- temp = sum_w[0] - w
- r = q * w / temp
-
- mean[0] = mean[0] - r
- t[0] = t[0] - r * sum_w[0] * q
- sum_w[0] = temp
-
- else:
- t[0] = 0
- sum_w[0] = 0
- mean[0] = 0
-
-
-def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
- int64_t minp, unsigned int ddof):
- """
- Calculates weighted rolling variance using West's online algorithm.
-
- Paper: https://dl.acm.org/citation.cfm?id=359153
-
- Parameters
- ----------
- values: float64_t[:]
- values to roll window over
- weights: float64_t[:]
- array of weights whose length is window size
- minp: int64_t
- minimum number of observations to calculate
- variance of a window
- ddof: unsigned int
- the divisor used in variance calculations
- is the window size - ddof
-
- Returns
- -------
- output: float64_t[:]
- weighted variances of windows
- """
-
- cdef:
- float64_t t = 0, sum_w = 0, mean = 0, nobs = 0
- float64_t val, pre_val, w, pre_w
- Py_ssize_t i, n, win_n
- float64_t[:] output
-
- n = len(values)
- win_n = len(weights)
- output = np.empty(n, dtype=np.float64)
-
- with nogil:
-
- for i in range(min(win_n, n)):
- add_weighted_var(values[i], weights[i], &t,
- &sum_w, &mean, &nobs)
-
- output[i] = calc_weighted_var(t, sum_w, win_n,
- ddof, nobs, minp)
-
- for i in range(win_n, n):
- val = values[i]
- pre_val = values[i - win_n]
-
- w = weights[i % win_n]
- pre_w = weights[(i - win_n) % win_n]
-
- if val == val:
- if pre_val == pre_val:
- remove_weighted_var(pre_val, pre_w, &t,
- &sum_w, &mean, &nobs)
-
- add_weighted_var(val, w, &t, &sum_w, &mean, &nobs)
-
- elif pre_val == pre_val:
- remove_weighted_var(pre_val, pre_w, &t,
- &sum_w, &mean, &nobs)
-
- output[i] = calc_weighted_var(t, sum_w, win_n,
- ddof, nobs, minp)
-
- return output
-
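Editorial note: add_weighted_var/remove_weighted_var implement West's incremental update for a weighted mean and the weighted sum of squared deviations t. A compact pure-Python version of the add step, for illustration only; names and data are made up:

def west_add(t, sum_w, mean, x, w):
    # One update of West's weighted-variance recurrence (see add_weighted_var above).
    q = x - mean
    temp = sum_w + w
    r = q * w / temp
    mean += r
    t += r * sum_w * q
    sum_w = temp
    return t, sum_w, mean

t = sum_w = mean = 0.0
for x, w in [(1.0, 1.0), (2.0, 1.0), (3.0, 1.0), (4.0, 1.0)]:
    t, sum_w, mean = west_add(t, sum_w, mean, x, w)

print(mean)       # 2.5
print(t / sum_w)  # 1.25: population variance with equal weights
# calc_weighted_var above then rescales this ratio by win_n / (win_n - ddof).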
-
-# ----------------------------------------------------------------------
-# Exponentially weighted moving
-@cython.cpow(True)
-def ewm(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
- int minp, float64_t com, bint adjust, bint ignore_na,
- const float64_t[:] deltas=None, bint normalize=True) -> np.ndarray:
- """
- Compute exponentially-weighted moving average or sum using center-of-mass.
-
- Parameters
- ----------
- vals : ndarray (float64 type)
- start: ndarray (int64 type)
- end: ndarray (int64 type)
- minp : int
- com : float64
- adjust : bool
- ignore_na : bool
- deltas : ndarray (float64 type), optional. If None, implicitly assumes equally
- spaced points (used when `times` is not passed)
- normalize : bool, optional.
- If True, calculate the mean. If False, calculate the sum.
-
- Returns
- -------
- np.ndarray[float64_t]
- """
-
- cdef:
- Py_ssize_t i, j, s, e, nobs, win_size, N = len(vals), M = len(start)
- const float64_t[:] sub_vals
- const float64_t[:] sub_deltas=None
- ndarray[float64_t] sub_output, output = np.empty(N, dtype=np.float64)
- float64_t alpha, old_wt_factor, new_wt, weighted, old_wt, cur
- bint is_observation, use_deltas
-
- if N == 0:
- return output
-
- use_deltas = deltas is not None
-
- alpha = 1. / (1. + com)
- old_wt_factor = 1. - alpha
- new_wt = 1. if adjust else alpha
-
- for j in range(M):
- s = start[j]
- e = end[j]
- sub_vals = vals[s:e]
- # note that len(deltas) = len(vals) - 1 and deltas[i] is to be used in
- # conjunction with vals[i+1]
- if use_deltas:
- sub_deltas = deltas[s:e - 1]
- win_size = len(sub_vals)
- sub_output = np.empty(win_size, dtype=np.float64)
-
- weighted = sub_vals[0]
- is_observation = weighted == weighted
- nobs = int(is_observation)
- sub_output[0] = weighted if nobs >= minp else NaN
- old_wt = 1.
-
- with nogil:
- for i in range(1, win_size):
- cur = sub_vals[i]
- is_observation = cur == cur
- nobs += is_observation
- if weighted == weighted:
-
- if is_observation or not ignore_na:
- if normalize:
- if use_deltas:
- old_wt *= old_wt_factor ** sub_deltas[i - 1]
- else:
- old_wt *= old_wt_factor
- else:
- weighted = old_wt_factor * weighted
- if is_observation:
- if normalize:
- # avoid numerical errors on constant series
- if weighted != cur:
- weighted = old_wt * weighted + new_wt * cur
- weighted /= (old_wt + new_wt)
- if adjust:
- old_wt += new_wt
- else:
- old_wt = 1.
- else:
- weighted += cur
- elif is_observation:
- weighted = cur
-
- sub_output[i] = weighted if nobs >= minp else NaN
-
- output[s:e] = sub_output
-
- return output
-
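Editorial note: ewm converts the center-of-mass into alpha = 1 / (1 + com); with adjust=True, the accumulated weight of past observations decays by (1 - alpha) while each new point enters with weight 1. A small pure-Python sketch of the adjusted mean for a NaN-free series, ignoring the constant-series and deltas handling above; the function name is made up:

def ewma_adjusted(values, com):
    alpha = 1.0 / (1.0 + com)
    old_wt_factor = 1.0 - alpha
    weighted = values[0]
    old_wt = 1.0
    out = [weighted]
    for cur in values[1:]:
        old_wt *= old_wt_factor   # decay the weight of everything seen so far
        weighted = (old_wt * weighted + 1.0 * cur) / (old_wt + 1.0)
        old_wt += 1.0             # adjust=True: total weight keeps growing
        out.append(weighted)
    return out

print(ewma_adjusted([1.0, 2.0, 3.0], com=1.0))
# Matches pandas.Series([1, 2, 3]).ewm(com=1.0, adjust=True).mean() up to float rounding.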
-
-def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:] end,
- int minp, const float64_t[:] input_y, float64_t com, bint adjust,
- bint ignore_na, bint bias) -> np.ndarray:
- """
- Compute exponentially-weighted moving covariance using center-of-mass.
-
- Parameters
- ----------
- input_x : ndarray (float64 type)
- start: ndarray (int64 type)
- end: ndarray (int64 type)
- minp : int
- input_y : ndarray (float64 type)
- com : float64
- adjust : bool
- ignore_na : bool
- bias : bool
-
- Returns
- -------
- np.ndarray[float64_t]
- """
-
- cdef:
- Py_ssize_t i, j, s, e, win_size, nobs
- Py_ssize_t N = len(input_x), M = len(input_y), L = len(start)
- float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov
- float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y
- float64_t numerator, denominator
- const float64_t[:] sub_x_vals, sub_y_vals
- ndarray[float64_t] sub_out, output = np.empty(N, dtype=np.float64)
- bint is_observation
-
- if M != N:
- raise ValueError(f"arrays are of different lengths ({N} and {M})")
-
- if N == 0:
- return output
-
- alpha = 1. / (1. + com)
- old_wt_factor = 1. - alpha
- new_wt = 1. if adjust else alpha
-
- for j in range(L):
- s = start[j]
- e = end[j]
- sub_x_vals = input_x[s:e]
- sub_y_vals = input_y[s:e]
- win_size = len(sub_x_vals)
- sub_out = np.empty(win_size, dtype=np.float64)
-
- mean_x = sub_x_vals[0]
- mean_y = sub_y_vals[0]
- is_observation = (mean_x == mean_x) and (mean_y == mean_y)
- nobs = int(is_observation)
- if not is_observation:
- mean_x = NaN
- mean_y = NaN
- sub_out[0] = (0. if bias else NaN) if nobs >= minp else NaN
- cov = 0.
- sum_wt = 1.
- sum_wt2 = 1.
- old_wt = 1.
-
- with nogil:
- for i in range(1, win_size):
- cur_x = sub_x_vals[i]
- cur_y = sub_y_vals[i]
- is_observation = (cur_x == cur_x) and (cur_y == cur_y)
- nobs += is_observation
- if mean_x == mean_x:
- if is_observation or not ignore_na:
- sum_wt *= old_wt_factor
- sum_wt2 *= (old_wt_factor * old_wt_factor)
- old_wt *= old_wt_factor
- if is_observation:
- old_mean_x = mean_x
- old_mean_y = mean_y
-
- # avoid numerical errors on constant series
- if mean_x != cur_x:
- mean_x = ((old_wt * old_mean_x) +
- (new_wt * cur_x)) / (old_wt + new_wt)
-
- # avoid numerical errors on constant series
- if mean_y != cur_y:
- mean_y = ((old_wt * old_mean_y) +
- (new_wt * cur_y)) / (old_wt + new_wt)
- cov = ((old_wt * (cov + ((old_mean_x - mean_x) *
- (old_mean_y - mean_y)))) +
- (new_wt * ((cur_x - mean_x) *
- (cur_y - mean_y)))) / (old_wt + new_wt)
- sum_wt += new_wt
- sum_wt2 += (new_wt * new_wt)
- old_wt += new_wt
- if not adjust:
- sum_wt /= old_wt
- sum_wt2 /= (old_wt * old_wt)
- old_wt = 1.
- elif is_observation:
- mean_x = cur_x
- mean_y = cur_y
-
- if nobs >= minp:
- if not bias:
- numerator = sum_wt * sum_wt
- denominator = numerator - sum_wt2
- if denominator > 0:
- sub_out[i] = (numerator / denominator) * cov
- else:
- sub_out[i] = NaN
- else:
- sub_out[i] = cov
- else:
- sub_out[i] = NaN
-
- output[s:e] = sub_out
-
- return output
diff --git a/contrib/python/pandas/py3/pandas/_libs/window/indexers.pyi b/contrib/python/pandas/py3/pandas/_libs/window/indexers.pyi
deleted file mode 100644
index c9bc64be34a..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/window/indexers.pyi
+++ /dev/null
@@ -1,12 +0,0 @@
-import numpy as np
-
-from pandas._typing import npt
-
-def calculate_variable_window_bounds(
- num_values: int, # int64_t
- window_size: int, # int64_t
- min_periods,
- center: bool,
- closed: str | None,
- index: np.ndarray, # const int64_t[:]
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/window/indexers.pyx b/contrib/python/pandas/py3/pandas/_libs/window/indexers.pyx
deleted file mode 100644
index 02934346130..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/window/indexers.pyx
+++ /dev/null
@@ -1,149 +0,0 @@
-# cython: boundscheck=False, wraparound=False, cdivision=True
-
-import numpy as np
-
-from numpy cimport (
- int64_t,
- ndarray,
-)
-
-# Cython routines for window indexers
-
-
-def calculate_variable_window_bounds(
- int64_t num_values,
- int64_t window_size,
- object min_periods, # unused but here to match get_window_bounds signature
- bint center,
- str closed,
- const int64_t[:] index
-):
- """
- Calculate window boundaries for rolling windows from a time offset.
-
- Parameters
- ----------
- num_values : int64
- total number of values
-
- window_size : int64
- window size calculated from the offset
-
- min_periods : object
- ignored, exists for compatibility
-
- center : bint
- center the rolling window on the current observation
-
- closed : str
- string of side of the window that should be closed
-
- index : ndarray[int64]
- time series index to roll over
-
- Returns
- -------
- (ndarray[int64], ndarray[int64])
- """
- cdef:
- bint left_closed = False
- bint right_closed = False
- ndarray[int64_t, ndim=1] start, end
- int64_t start_bound, end_bound, index_growth_sign = 1
- Py_ssize_t i, j
-
- if num_values <= 0:
- return np.empty(0, dtype="int64"), np.empty(0, dtype="int64")
-
- # default is 'right'
- if closed is None:
- closed = "right"
-
- if closed in ["right", "both"]:
- right_closed = True
-
- if closed in ["left", "both"]:
- left_closed = True
-
- # GH 43997:
- # If the forward and the backward facing windows
- # would result in a fraction of 1/2 a nanosecond
- # we need to make both interval ends inclusive.
- if center and window_size % 2 == 1:
- right_closed = True
- left_closed = True
-
- if index[num_values - 1] < index[0]:
- index_growth_sign = -1
-
- start = np.empty(num_values, dtype="int64")
- start.fill(-1)
- end = np.empty(num_values, dtype="int64")
- end.fill(-1)
-
- start[0] = 0
-
- # right endpoint is closed
- if right_closed:
- end[0] = 1
- # right endpoint is open
- else:
- end[0] = 0
- if center:
- end_bound = index[0] + index_growth_sign * window_size / 2
- for j in range(0, num_values):
- if (index[j] - end_bound) * index_growth_sign < 0:
- end[0] = j + 1
- elif (index[j] - end_bound) * index_growth_sign == 0 and right_closed:
- end[0] = j + 1
- elif (index[j] - end_bound) * index_growth_sign >= 0:
- end[0] = j
- break
-
- with nogil:
-
- # start is start of slice interval (including)
- # end is end of slice interval (not including)
- for i in range(1, num_values):
- if center:
- end_bound = index[i] + index_growth_sign * window_size / 2
- start_bound = index[i] - index_growth_sign * window_size / 2
- else:
- end_bound = index[i]
- start_bound = index[i] - index_growth_sign * window_size
-
- # left endpoint is closed
- if left_closed:
- start_bound -= 1 * index_growth_sign
-
- # advance the start bound until we are
- # within the constraint
- start[i] = i
- for j in range(start[i - 1], i):
- if (index[j] - start_bound) * index_growth_sign > 0:
- start[i] = j
- break
-
- # for centered window advance the end bound until we are
- # outside the constraint
- if center:
- for j in range(end[i - 1], num_values + 1):
- if j == num_values:
- end[i] = j
- elif ((index[j] - end_bound) * index_growth_sign == 0 and
- right_closed):
- end[i] = j + 1
- elif (index[j] - end_bound) * index_growth_sign >= 0:
- end[i] = j
- break
- # end bound is previous end
- # or current index
- elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
- end[i] = i + 1
- else:
- end[i] = end[i - 1]
-
- # right endpoint is open
- if not right_closed and not center:
- end[i] -= 1
- return start, end
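Editorial note: calculate_variable_window_bounds turns a time-based offset into per-row [start, end) integer bounds over a possibly irregular index. A tiny pure-Python illustration of the right-closed, non-centered case it implements; the function below is a simplification with a made-up name, not the real signature:

def variable_bounds(index, window_size):
    # For each i, the window covers index values in (index[i] - window_size, index[i]].
    start, end = [], []
    for i, t in enumerate(index):
        s = 0
        while index[s] <= t - window_size:
            s += 1
        start.append(s)
        end.append(i + 1)
    return start, end

# Irregular "timestamps" 0, 1, 2, 5, 6 with a window of 3 time units:
print(variable_bounds([0, 1, 2, 5, 6], window_size=3))
# ([0, 0, 0, 3, 3], [1, 2, 3, 4, 5])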
diff --git a/contrib/python/pandas/py3/pandas/_libs/writers.pyi b/contrib/python/pandas/py3/pandas/_libs/writers.pyi
deleted file mode 100644
index 7b41856525d..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/writers.pyi
+++ /dev/null
@@ -1,20 +0,0 @@
-import numpy as np
-
-from pandas._typing import ArrayLike
-
-def write_csv_rows(
- data: list[ArrayLike],
- data_index: np.ndarray,
- nlevels: int,
- cols: np.ndarray,
- writer: object, # _csv.writer
-) -> None: ...
-def convert_json_to_lines(arr: str) -> str: ...
-def max_len_string_array(
- arr: np.ndarray, # pandas_string[:]
-) -> int: ...
-def word_len(val: object) -> int: ...
-def string_array_replace_from_nan_rep(
- arr: np.ndarray, # np.ndarray[object, ndim=1]
- nan_rep: object,
-) -> None: ...
diff --git a/contrib/python/pandas/py3/pandas/_libs/writers.pyx b/contrib/python/pandas/py3/pandas/_libs/writers.pyx
deleted file mode 100644
index bd5c0290b28..00000000000
--- a/contrib/python/pandas/py3/pandas/_libs/writers.pyx
+++ /dev/null
@@ -1,173 +0,0 @@
-cimport cython
-import numpy as np
-
-from cpython cimport (
- PyBytes_GET_SIZE,
- PyUnicode_GET_LENGTH,
-)
-from numpy cimport (
- ndarray,
- uint8_t,
-)
-
-ctypedef fused pandas_string:
- str
- bytes
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def write_csv_rows(
- list data,
- ndarray data_index,
- Py_ssize_t nlevels,
- ndarray cols,
- object writer
-) -> None:
- """
- Write the given data to the writer object, pre-allocating where possible
- for performance improvements.
-
- Parameters
- ----------
- data : list[ArrayLike]
- data_index : ndarray
- nlevels : int
- cols : ndarray
- writer : _csv.writer
- """
- # In crude testing, N>100 yields little marginal improvement
- cdef:
- Py_ssize_t i, j = 0, k = len(data_index), N = 100, ncols = len(cols)
- list rows
-
- # pre-allocate rows
- rows = [[None] * (nlevels + ncols) for _ in range(N)]
-
- if nlevels == 1:
- for j in range(k):
- row = rows[j % N]
- row[0] = data_index[j]
- for i in range(ncols):
- row[1 + i] = data[i][j]
-
- if j >= N - 1 and j % N == N - 1:
- writer.writerows(rows)
- elif nlevels > 1:
- for j in range(k):
- row = rows[j % N]
- row[:nlevels] = list(data_index[j])
- for i in range(ncols):
- row[nlevels + i] = data[i][j]
-
- if j >= N - 1 and j % N == N - 1:
- writer.writerows(rows)
- else:
- for j in range(k):
- row = rows[j % N]
- for i in range(ncols):
- row[i] = data[i][j]
-
- if j >= N - 1 and j % N == N - 1:
- writer.writerows(rows)
-
- if j >= 0 and (j < N - 1 or (j % N) != N - 1):
- writer.writerows(rows[:((j + 1) % N)])
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def convert_json_to_lines(arr: str) -> str:
- """
- Replace comma-separated JSON with line feeds, paying special attention
- to quotes & brackets.
- """
- cdef:
- Py_ssize_t i = 0, num_open_brackets_seen = 0, length
- bint in_quotes = False, is_escaping = False
- ndarray[uint8_t, ndim=1] narr
- unsigned char val, newline, comma, left_bracket, right_bracket, quote
- unsigned char backslash
-
- newline = ord("\n")
- comma = ord(",")
- left_bracket = ord("{")
- right_bracket = ord("}")
- quote = ord('"')
- backslash = ord("\\")
-
- narr = np.frombuffer(arr.encode("utf-8"), dtype="u1").copy()
- length = narr.shape[0]
- for i in range(length):
- val = narr[i]
- if val == quote and i > 0 and not is_escaping:
- in_quotes = ~in_quotes
- if val == backslash or is_escaping:
- is_escaping = ~is_escaping
- if val == comma: # commas that should be \n
- if num_open_brackets_seen == 0 and not in_quotes:
- narr[i] = newline
- elif val == left_bracket:
- if not in_quotes:
- num_open_brackets_seen += 1
- elif val == right_bracket:
- if not in_quotes:
- num_open_brackets_seen -= 1
-
- return narr.tobytes().decode("utf-8") + "\n" # GH:36888
-
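Editorial note: convert_json_to_lines rewrites the commas *between* top-level JSON objects as newlines while leaving commas inside braces or quoted strings alone. A rough pure-Python equivalent for well-formed input (simplified: no backslash-escape handling), purely as an illustration; the name and sample are made up:

def json_to_lines(s):
    out = []
    depth = 0
    in_quotes = False
    for ch in s:
        if ch == '"':
            in_quotes = not in_quotes
        elif not in_quotes:
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
            elif ch == "," and depth == 0:
                out.append("\n")
                continue
        out.append(ch)
    return "".join(out) + "\n"

print(json_to_lines('{"a": 1, "b": "x,y"},{"a": 2}'))
# {"a": 1, "b": "x,y"}
# {"a": 2}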
-
-# stata, pytables
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t:
- """
- Return the maximum size of elements in a 1-dim string array.
- """
- cdef:
- Py_ssize_t i, m = 0, wlen = 0, length = arr.shape[0]
- pandas_string val
-
- for i in range(length):
- val = arr[i]
- wlen = word_len(val)
-
- if wlen > m:
- m = wlen
-
- return m
-
-
-cpdef inline Py_ssize_t word_len(object val):
- """
- Return the length of a string or bytes value, or 0 for other types.
- """
- cdef:
- Py_ssize_t wlen = 0
-
- if isinstance(val, str):
- wlen = PyUnicode_GET_LENGTH(val)
- elif isinstance(val, bytes):
- wlen = PyBytes_GET_SIZE(val)
-
- return wlen
-
-# ------------------------------------------------------------------
-# PyTables Helpers
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def string_array_replace_from_nan_rep(
- ndarray[object, ndim=1] arr,
- object nan_rep,
-) -> None:
- """
- Replace the values in the array with np.nan if they are nan_rep.
- """
- cdef:
- Py_ssize_t length = len(arr), i = 0
-
- for i in range(length):
- if arr[i] == nan_rep:
- arr[i] = np.nan
diff --git a/contrib/python/pandas/py3/pandas/_testing/__init__.py b/contrib/python/pandas/py3/pandas/_testing/__init__.py
deleted file mode 100644
index f9add5c2c5d..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/__init__.py
+++ /dev/null
@@ -1,1168 +0,0 @@
-from __future__ import annotations
-
-import collections
-from datetime import datetime
-from decimal import Decimal
-import operator
-import os
-import re
-import string
-from sys import byteorder
-from typing import (
- TYPE_CHECKING,
- Callable,
- ContextManager,
- Counter,
- Iterable,
- cast,
-)
-
-import numpy as np
-
-from pandas._config.localization import (
- can_set_locale,
- get_locales,
- set_locale,
-)
-
-from pandas._typing import (
- Dtype,
- Frequency,
- NpDtype,
-)
-from pandas.compat import pa_version_under7p0
-
-from pandas.core.dtypes.common import (
- is_float_dtype,
- is_integer_dtype,
- is_sequence,
- is_signed_integer_dtype,
- is_unsigned_integer_dtype,
- pandas_dtype,
-)
-
-import pandas as pd
-from pandas import (
- ArrowDtype,
- Categorical,
- CategoricalIndex,
- DataFrame,
- DatetimeIndex,
- Index,
- IntervalIndex,
- MultiIndex,
- RangeIndex,
- Series,
- bdate_range,
-)
-from pandas._testing._io import (
- close,
- network,
- round_trip_localpath,
- round_trip_pathlib,
- round_trip_pickle,
- write_to_compressed,
-)
-from pandas._testing._random import (
- rands,
- rands_array,
-)
-from pandas._testing._warnings import (
- assert_produces_warning,
- maybe_produces_warning,
-)
-from pandas._testing.asserters import (
- assert_almost_equal,
- assert_attr_equal,
- assert_categorical_equal,
- assert_class_equal,
- assert_contains_all,
- assert_copy,
- assert_datetime_array_equal,
- assert_dict_equal,
- assert_equal,
- assert_extension_array_equal,
- assert_frame_equal,
- assert_index_equal,
- assert_indexing_slices_equivalent,
- assert_interval_array_equal,
- assert_is_sorted,
- assert_is_valid_plot_return_object,
- assert_metadata_equivalent,
- assert_numpy_array_equal,
- assert_period_array_equal,
- assert_series_equal,
- assert_sp_array_equal,
- assert_timedelta_array_equal,
- raise_assert_detail,
-)
-from pandas._testing.compat import (
- get_dtype,
- get_obj,
-)
-from pandas._testing.contexts import (
- decompress_file,
- ensure_clean,
- ensure_safe_environment_variables,
- raises_chained_assignment_error,
- set_timezone,
- use_numexpr,
- with_csv_dialect,
-)
-from pandas.core.arrays import (
- BaseMaskedArray,
- ExtensionArray,
- PandasArray,
-)
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-from pandas.core.construction import extract_array
-
-if TYPE_CHECKING:
- from pandas import (
- PeriodIndex,
- TimedeltaIndex,
- )
- from pandas.core.arrays import ArrowExtensionArray
-
-_N = 30
-_K = 4
-
-UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
-UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
-SIGNED_INT_NUMPY_DTYPES: list[NpDtype] = [int, "int8", "int16", "int32", "int64"]
-SIGNED_INT_EA_DTYPES: list[Dtype] = ["Int8", "Int16", "Int32", "Int64"]
-ALL_INT_NUMPY_DTYPES = UNSIGNED_INT_NUMPY_DTYPES + SIGNED_INT_NUMPY_DTYPES
-ALL_INT_EA_DTYPES = UNSIGNED_INT_EA_DTYPES + SIGNED_INT_EA_DTYPES
-ALL_INT_DTYPES: list[Dtype] = [*ALL_INT_NUMPY_DTYPES, *ALL_INT_EA_DTYPES]
-
-FLOAT_NUMPY_DTYPES: list[NpDtype] = [float, "float32", "float64"]
-FLOAT_EA_DTYPES: list[Dtype] = ["Float32", "Float64"]
-ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES]
-
-COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
-STRING_DTYPES: list[Dtype] = [str, "str", "U"]
-
-DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"]
-TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"]
-
-BOOL_DTYPES: list[Dtype] = [bool, "bool"]
-BYTES_DTYPES: list[Dtype] = [bytes, "bytes"]
-OBJECT_DTYPES: list[Dtype] = [object, "object"]
-
-ALL_REAL_NUMPY_DTYPES = FLOAT_NUMPY_DTYPES + ALL_INT_NUMPY_DTYPES
-ALL_REAL_EXTENSION_DTYPES = FLOAT_EA_DTYPES + ALL_INT_EA_DTYPES
-ALL_REAL_DTYPES: list[Dtype] = [*ALL_REAL_NUMPY_DTYPES, *ALL_REAL_EXTENSION_DTYPES]
-ALL_NUMERIC_DTYPES: list[Dtype] = [*ALL_REAL_DTYPES, *COMPLEX_DTYPES]
-
-ALL_NUMPY_DTYPES = (
- ALL_REAL_NUMPY_DTYPES
- + COMPLEX_DTYPES
- + STRING_DTYPES
- + DATETIME64_DTYPES
- + TIMEDELTA64_DTYPES
- + BOOL_DTYPES
- + OBJECT_DTYPES
- + BYTES_DTYPES
-)
-
-NARROW_NP_DTYPES = [
- np.float16,
- np.float32,
- np.int8,
- np.int16,
- np.int32,
- np.uint8,
- np.uint16,
- np.uint32,
-]
-
-ENDIAN = {"little": "<", "big": ">"}[byteorder]
-
-NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")]
-NP_NAT_OBJECTS = [
- cls("NaT", unit)
- for cls in [np.datetime64, np.timedelta64]
- for unit in [
- "Y",
- "M",
- "W",
- "D",
- "h",
- "m",
- "s",
- "ms",
- "us",
- "ns",
- "ps",
- "fs",
- "as",
- ]
-]
-
-if not pa_version_under7p0:
- import pyarrow as pa
-
- UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
- SIGNED_INT_PYARROW_DTYPES = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
- ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES
- ALL_INT_PYARROW_DTYPES_STR_REPR = [
- str(ArrowDtype(typ)) for typ in ALL_INT_PYARROW_DTYPES
- ]
-
- # pa.float16 doesn't seem supported
- # https://github.com/apache/arrow/blob/master/python/pyarrow/src/arrow/python/helpers.cc#L86
- FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
- FLOAT_PYARROW_DTYPES_STR_REPR = [
- str(ArrowDtype(typ)) for typ in FLOAT_PYARROW_DTYPES
- ]
- DECIMAL_PYARROW_DTYPES = [pa.decimal128(7, 3)]
- STRING_PYARROW_DTYPES = [pa.string()]
- BINARY_PYARROW_DTYPES = [pa.binary()]
-
- TIME_PYARROW_DTYPES = [
- pa.time32("s"),
- pa.time32("ms"),
- pa.time64("us"),
- pa.time64("ns"),
- ]
- DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
- DATETIME_PYARROW_DTYPES = [
- pa.timestamp(unit=unit, tz=tz)
- for unit in ["s", "ms", "us", "ns"]
- for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
- ]
- TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]]
-
- BOOL_PYARROW_DTYPES = [pa.bool_()]
-
- # TODO: Add container like pyarrow types:
- # https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
- ALL_PYARROW_DTYPES = (
- ALL_INT_PYARROW_DTYPES
- + FLOAT_PYARROW_DTYPES
- + DECIMAL_PYARROW_DTYPES
- + STRING_PYARROW_DTYPES
- + BINARY_PYARROW_DTYPES
- + TIME_PYARROW_DTYPES
- + DATE_PYARROW_DTYPES
- + DATETIME_PYARROW_DTYPES
- + TIMEDELTA_PYARROW_DTYPES
- + BOOL_PYARROW_DTYPES
- )
-else:
- FLOAT_PYARROW_DTYPES_STR_REPR = []
- ALL_INT_PYARROW_DTYPES_STR_REPR = []
- ALL_PYARROW_DTYPES = []
-
-
-EMPTY_STRING_PATTERN = re.compile("^$")
-
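These dtype lists are consumed by pandas' parametrized test fixtures, so a test can run once per dtype family instead of hand-listing every dtype. A minimal sketch of that pattern, assuming a pandas build that still exposes the constants through pandas._testing (the fixture and test names here are illustrative):

import pytest
import pandas as pd
import pandas._testing as tm

@pytest.fixture(params=tm.ALL_INT_NUMPY_DTYPES)
def any_int_numpy_dtype(request):
    # yields each signed and unsigned integer numpy dtype in turn
    return request.param

def test_construction(any_int_numpy_dtype):
    ser = pd.Series([1, 2, 3], dtype=any_int_numpy_dtype)
    assert len(ser) == 3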
-
-def reset_display_options() -> None:
- """
- Reset the display options for printing and representing objects.
- """
- pd.reset_option("^display.", silent=True)
-
-
-# -----------------------------------------------------------------------------
-# Comparators
-
-
-def equalContents(arr1, arr2) -> bool:
- """
-    Check whether the sets of unique elements of arr1 and arr2 are equivalent.
- """
- return frozenset(arr1) == frozenset(arr2)
-
-
-def box_expected(expected, box_cls, transpose: bool = True):
- """
- Helper function to wrap the expected output of a test in a given box_class.
-
- Parameters
- ----------
- expected : np.ndarray, Index, Series
- box_cls : {Index, Series, DataFrame}
-
- Returns
- -------
- subclass of box_cls
- """
- if box_cls is pd.array:
- if isinstance(expected, RangeIndex):
- # pd.array would return an IntegerArray
- expected = PandasArray(np.asarray(expected._values))
- else:
- expected = pd.array(expected, copy=False)
- elif box_cls is Index:
- expected = Index(expected)
- elif box_cls is Series:
- expected = Series(expected)
- elif box_cls is DataFrame:
- expected = Series(expected).to_frame()
- if transpose:
- # for vector operations, we need a DataFrame to be a single-row,
- # not a single-column, in order to operate against non-DataFrame
- # vectors of the same length. But convert to two rows to avoid
- # single-row special cases in datetime arithmetic
- expected = expected.T
- expected = pd.concat([expected] * 2, ignore_index=True)
- elif box_cls is np.ndarray or box_cls is np.array:
- expected = np.array(expected)
- elif box_cls is to_array:
- expected = to_array(expected)
- else:
- raise NotImplementedError(box_cls)
- return expected
-
-
-def to_array(obj):
- """
- Similar to pd.array, but does not cast numpy dtypes to nullable dtypes.
- """
- # temporary implementation until we get pd.array in place
- dtype = getattr(obj, "dtype", None)
-
- if dtype is None:
- return np.asarray(obj)
-
- return extract_array(obj, extract_numpy=True)
-
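A rough usage sketch of the box_expected helper removed above, assuming a pandas build that still ships these helpers under pandas._testing:

import numpy as np
import pandas as pd
import pandas._testing as tm

values = np.arange(5)
tm.box_expected(values, pd.Index)   # Index([0, 1, 2, 3, 4])
tm.box_expected(values, pd.Series)  # Series of length 5
# With the default transpose=True, a DataFrame comes back as two identical
# five-column rows, avoiding single-row special cases in arithmetic tests.
tm.box_expected(values, pd.DataFrame)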
-
-# -----------------------------------------------------------------------------
-# Others
-
-
-def getCols(k) -> str:
- return string.ascii_uppercase[:k]
-
-
-# make index
-def makeStringIndex(k: int = 10, name=None) -> Index:
- return Index(rands_array(nchars=10, size=k), name=name)
-
-
-def makeCategoricalIndex(
- k: int = 10, n: int = 3, name=None, **kwargs
-) -> CategoricalIndex:
- """make a length k index or n categories"""
- x = rands_array(nchars=4, size=n, replace=False)
- return CategoricalIndex(
- Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs
- )
-
-
-def makeIntervalIndex(k: int = 10, name=None, **kwargs) -> IntervalIndex:
- """make a length k IntervalIndex"""
- x = np.linspace(0, 100, num=(k + 1))
- return IntervalIndex.from_breaks(x, name=name, **kwargs)
-
-
-def makeBoolIndex(k: int = 10, name=None) -> Index:
- if k == 1:
- return Index([True], name=name)
- elif k == 2:
- return Index([False, True], name=name)
- return Index([False, True] + [False] * (k - 2), name=name)
-
-
-def makeNumericIndex(k: int = 10, *, name=None, dtype: Dtype | None) -> Index:
- dtype = pandas_dtype(dtype)
- assert isinstance(dtype, np.dtype)
-
- if is_integer_dtype(dtype):
- values = np.arange(k, dtype=dtype)
- if is_unsigned_integer_dtype(dtype):
- values += 2 ** (dtype.itemsize * 8 - 1)
- elif is_float_dtype(dtype):
- values = np.random.random_sample(k) - np.random.random_sample(1)
- values.sort()
- values = values * (10 ** np.random.randint(0, 9))
- else:
- raise NotImplementedError(f"wrong dtype {dtype}")
-
- return Index(values, dtype=dtype, name=name)
-
-
-def makeIntIndex(k: int = 10, *, name=None, dtype: Dtype = "int64") -> Index:
- dtype = pandas_dtype(dtype)
- if not is_signed_integer_dtype(dtype):
- raise TypeError(f"Wrong dtype {dtype}")
- return makeNumericIndex(k, name=name, dtype=dtype)
-
-
-def makeUIntIndex(k: int = 10, *, name=None, dtype: Dtype = "uint64") -> Index:
- dtype = pandas_dtype(dtype)
- if not is_unsigned_integer_dtype(dtype):
- raise TypeError(f"Wrong dtype {dtype}")
- return makeNumericIndex(k, name=name, dtype=dtype)
-
-
-def makeRangeIndex(k: int = 10, name=None, **kwargs) -> RangeIndex:
- return RangeIndex(0, k, 1, name=name, **kwargs)
-
-
-def makeFloatIndex(k: int = 10, *, name=None, dtype: Dtype = "float64") -> Index:
- dtype = pandas_dtype(dtype)
- if not is_float_dtype(dtype):
- raise TypeError(f"Wrong dtype {dtype}")
- return makeNumericIndex(k, name=name, dtype=dtype)
-
-
-def makeDateIndex(
- k: int = 10, freq: Frequency = "B", name=None, **kwargs
-) -> DatetimeIndex:
- dt = datetime(2000, 1, 1)
- dr = bdate_range(dt, periods=k, freq=freq, name=name)
- return DatetimeIndex(dr, name=name, **kwargs)
-
-
-def makeTimedeltaIndex(
- k: int = 10, freq: Frequency = "D", name=None, **kwargs
-) -> TimedeltaIndex:
- return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs)
-
-
-def makePeriodIndex(k: int = 10, name=None, **kwargs) -> PeriodIndex:
- dt = datetime(2000, 1, 1)
- return pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs)
-
-
-def makeMultiIndex(k: int = 10, names=None, **kwargs):
- N = (k // 2) + 1
- rng = range(N)
- mi = MultiIndex.from_product([("foo", "bar"), rng], names=names, **kwargs)
- assert len(mi) >= k # GH#38795
- return mi[:k]
-
-
-def index_subclass_makers_generator():
- make_index_funcs = [
- makeDateIndex,
- makePeriodIndex,
- makeTimedeltaIndex,
- makeRangeIndex,
- makeIntervalIndex,
- makeCategoricalIndex,
- makeMultiIndex,
- ]
- yield from make_index_funcs
-
-
-def all_timeseries_index_generator(k: int = 10) -> Iterable[Index]:
- """
- Generator which can be iterated over to get instances of all the classes
- which represent time-series.
-
- Parameters
- ----------
- k: length of each of the index instances
- """
- make_index_funcs: list[Callable[..., Index]] = [
- makeDateIndex,
- makePeriodIndex,
- makeTimedeltaIndex,
- ]
- for make_index_func in make_index_funcs:
- yield make_index_func(k=k)
-
-
-# make series
-def make_rand_series(name=None, dtype=np.float64) -> Series:
- index = makeStringIndex(_N)
- data = np.random.randn(_N)
- with np.errstate(invalid="ignore"):
- data = data.astype(dtype, copy=False)
- return Series(data, index=index, name=name)
-
-
-def makeFloatSeries(name=None) -> Series:
- return make_rand_series(name=name)
-
-
-def makeStringSeries(name=None) -> Series:
- return make_rand_series(name=name)
-
-
-def makeObjectSeries(name=None) -> Series:
- data = makeStringIndex(_N)
- data = Index(data, dtype=object)
- index = makeStringIndex(_N)
- return Series(data, index=index, name=name)
-
-
-def getSeriesData() -> dict[str, Series]:
- index = makeStringIndex(_N)
- return {c: Series(np.random.randn(_N), index=index) for c in getCols(_K)}
-
-
-def makeTimeSeries(nper=None, freq: Frequency = "B", name=None) -> Series:
- if nper is None:
- nper = _N
- return Series(
- np.random.randn(nper), index=makeDateIndex(nper, freq=freq), name=name
- )
-
-
-def makePeriodSeries(nper=None, name=None) -> Series:
- if nper is None:
- nper = _N
- return Series(np.random.randn(nper), index=makePeriodIndex(nper), name=name)
-
-
-def getTimeSeriesData(nper=None, freq: Frequency = "B") -> dict[str, Series]:
- return {c: makeTimeSeries(nper, freq) for c in getCols(_K)}
-
-
-def getPeriodData(nper=None) -> dict[str, Series]:
- return {c: makePeriodSeries(nper) for c in getCols(_K)}
-
-
-# make frame
-def makeTimeDataFrame(nper=None, freq: Frequency = "B") -> DataFrame:
- data = getTimeSeriesData(nper, freq)
- return DataFrame(data)
-
-
-def makeDataFrame() -> DataFrame:
- data = getSeriesData()
- return DataFrame(data)
-
-
-def getMixedTypeDict():
- index = Index(["a", "b", "c", "d", "e"])
-
- data = {
- "A": [0.0, 1.0, 2.0, 3.0, 4.0],
- "B": [0.0, 1.0, 0.0, 1.0, 0.0],
- "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
- "D": bdate_range("1/1/2009", periods=5),
- }
-
- return index, data
-
-
-def makeMixedDataFrame() -> DataFrame:
- return DataFrame(getMixedTypeDict()[1])
-
-
-def makePeriodFrame(nper=None) -> DataFrame:
- data = getPeriodData(nper)
- return DataFrame(data)
-
-
-def makeCustomIndex(
- nentries,
- nlevels,
- prefix: str = "#",
- names: bool | str | list[str] | None = False,
- ndupe_l=None,
- idx_type=None,
-) -> Index:
- """
-    Create an Index or MultiIndex with the given dimensions, levels, names, etc.
-
-    nentries - number of entries in the index
-    nlevels - number of levels (> 1 produces a MultiIndex)
-    prefix - a string prefix for labels
-    names - (Optional), bool or list of strings. If True, default names are
-       used; if False, no names are used; if a list is given, the name of
-       each level in the index is taken from the list.
-    ndupe_l - (Optional), list of ints, the number of rows for which the
-       label will be repeated at the corresponding level. You can specify just
-       the first few levels; the rest use the default ndupe_l of 1.
-       len(ndupe_l) <= nlevels.
-    idx_type - "i"/"f"/"s"/"dt"/"p"/"td".
-       If idx_type is not None, `nlevels` must be 1.
-       "i"/"f" creates an integer/float index,
-       "s" creates a string index,
-       "dt" creates a datetime index,
-       "p" creates a period index,
-       "td" creates a timedelta index.
-
-    If unspecified, string labels will be generated.
- """
- if ndupe_l is None:
- ndupe_l = [1] * nlevels
- assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels
- assert names is None or names is False or names is True or len(names) is nlevels
- assert idx_type is None or (
- idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1
- )
-
- if names is True:
- # build default names
- names = [prefix + str(i) for i in range(nlevels)]
- if names is False:
- # pass None to index constructor for no name
- names = None
-
- # make singleton case uniform
- if isinstance(names, str) and nlevels == 1:
- names = [names]
-
- # specific 1D index type requested?
- idx_func_dict: dict[str, Callable[..., Index]] = {
- "i": makeIntIndex,
- "f": makeFloatIndex,
- "s": makeStringIndex,
- "dt": makeDateIndex,
- "td": makeTimedeltaIndex,
- "p": makePeriodIndex,
- }
- idx_func = idx_func_dict.get(idx_type)
- if idx_func:
- idx = idx_func(nentries)
- # but we need to fill in the name
- if names:
- idx.name = names[0]
- return idx
- elif idx_type is not None:
- raise ValueError(
- f"{repr(idx_type)} is not a legal value for `idx_type`, "
- "use 'i'/'f'/'s'/'dt'/'p'/'td'."
- )
-
- if len(ndupe_l) < nlevels:
- ndupe_l.extend([1] * (nlevels - len(ndupe_l)))
- assert len(ndupe_l) == nlevels
-
- assert all(x > 0 for x in ndupe_l)
-
- list_of_lists = []
- for i in range(nlevels):
-
- def keyfunc(x):
- numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_")
- return [int(num) for num in numeric_tuple]
-
- # build a list of lists to create the index from
- div_factor = nentries // ndupe_l[i] + 1
-
-        # collections.Counter has supported subscripting (e.g. Counter[str])
-        # since Python 3.9; see PEP 585 and the Generic Alias Type docs.
- cnt: Counter[str] = collections.Counter()
- for j in range(div_factor):
- label = f"{prefix}_l{i}_g{j}"
- cnt[label] = ndupe_l[i]
- # cute Counter trick
- result = sorted(cnt.elements(), key=keyfunc)[:nentries]
- list_of_lists.append(result)
-
- tuples = list(zip(*list_of_lists))
-
- # convert tuples to index
- if nentries == 1:
- # we have a single level of tuples, i.e. a regular Index
- name = None if names is None else names[0]
- index = Index(tuples[0], name=name)
- elif nlevels == 1:
- name = None if names is None else names[0]
- index = Index((x[0] for x in tuples), name=name)
- else:
- index = MultiIndex.from_tuples(tuples, names=names)
- return index
-
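Two illustrative calls to the makeCustomIndex helper removed above (a REPL-style sketch, under the same pandas-build assumption):

import pandas._testing as tm

# single-level DatetimeIndex of 5 business days
tm.makeCustomIndex(nentries=5, nlevels=1, idx_type="dt")

# 2-level MultiIndex with 6 entries, default names "#0"/"#1",
# and each first-level label repeated twice
tm.makeCustomIndex(nentries=6, nlevels=2, names=True, ndupe_l=[2])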
-
-def makeCustomDataframe(
- nrows,
- ncols,
- c_idx_names: bool | list[str] = True,
- r_idx_names: bool | list[str] = True,
- c_idx_nlevels: int = 1,
- r_idx_nlevels: int = 1,
- data_gen_f=None,
- c_ndupe_l=None,
- r_ndupe_l=None,
- dtype=None,
- c_idx_type=None,
- r_idx_type=None,
-) -> DataFrame:
- """
- Create a DataFrame using supplied parameters.
-
- Parameters
- ----------
- nrows, ncols - number of data rows/cols
-    c_idx_names, r_idx_names - False/True/list of strings. Yields no names,
-        default names, or uses the provided names for the levels of the
-        corresponding index. You can provide a single string when
-        c_idx_nlevels == 1.
-    c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex
-    r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex
-    data_gen_f - a function f(row, col) which returns the data value
-        at that position; the default generator yields values of the form
-        "RxCy" based on position.
-    c_ndupe_l, r_ndupe_l - list of integers, determines the number
-        of duplicates for each label at a given level of the corresponding
-        index. The default `None` value produces a multiplicity of 1 across
-        all levels, i.e. a unique index. Will accept a partial list of length
-        N < idx_nlevels, for just the first N levels. If ndupe doesn't divide
-        nrows/ncols, the last label might have lower multiplicity.
-    dtype - passed to the DataFrame constructor as is, in case you wish to
-        have more control in conjunction with a custom `data_gen_f`.
-    r_idx_type, c_idx_type - "i"/"f"/"s"/"dt"/"p"/"td".
-        If idx_type is not None, the corresponding `idx_nlevels` must be 1.
-        "i"/"f" creates an integer/float index,
-        "s" creates a string index,
-        "dt" creates a datetime index,
-        "p" creates a period index,
-        "td" creates a timedelta index.
-
-        If unspecified, string labels will be generated.
-
- Examples
- --------
-    # 5 rows, 3 columns, default names on both, single index on both axes
-    >> makeCustomDataframe(5, 3)
-
-    # make the data a random int between 1 and 100
-    >> makeCustomDataframe(5, 3, data_gen_f=lambda r, c: randint(1, 100))
-
-    # 2-level MultiIndex on rows with each label duplicated
-    # twice on the first level, default names on both axes, single
-    # index on columns
-    >> a = makeCustomDataframe(5, 3, r_idx_nlevels=2, r_ndupe_l=[2])
-
-    # DatetimeIndex on rows, string labels on columns,
-    # no names on either axis
-    >> a = makeCustomDataframe(5, 3, c_idx_names=False, r_idx_names=False,
-                               r_idx_type="dt", c_idx_type="s")
-
-    # 4-level MultiIndex on rows with names provided, 2-level MultiIndex
-    # on columns with default labels and default names.
-    >> a = makeCustomDataframe(5, 3, r_idx_nlevels=4,
-                               r_idx_names=["FEE", "FIH", "FOH", "FUM"],
-                               c_idx_nlevels=2)
-
-    >> a = makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
- """
- assert c_idx_nlevels > 0
- assert r_idx_nlevels > 0
- assert r_idx_type is None or (
- r_idx_type in ("i", "f", "s", "dt", "p", "td") and r_idx_nlevels == 1
- )
- assert c_idx_type is None or (
- c_idx_type in ("i", "f", "s", "dt", "p", "td") and c_idx_nlevels == 1
- )
-
- columns = makeCustomIndex(
- ncols,
- nlevels=c_idx_nlevels,
- prefix="C",
- names=c_idx_names,
- ndupe_l=c_ndupe_l,
- idx_type=c_idx_type,
- )
- index = makeCustomIndex(
- nrows,
- nlevels=r_idx_nlevels,
- prefix="R",
- names=r_idx_names,
- ndupe_l=r_ndupe_l,
- idx_type=r_idx_type,
- )
-
- # by default, generate data based on location
- if data_gen_f is None:
- data_gen_f = lambda r, c: f"R{r}C{c}"
-
- data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)]
-
- return DataFrame(data, index, columns, dtype=dtype)
-
-
-def _create_missing_idx(nrows, ncols, density: float, random_state=None):
- if random_state is None:
- random_state = np.random
- else:
- random_state = np.random.RandomState(random_state)
-
- # below is cribbed from scipy.sparse
- size = round((1 - density) * nrows * ncols)
- # generate a few more to ensure unique values
- min_rows = 5
- fac = 1.02
- extra_size = min(size + min_rows, fac * size)
-
- def _gen_unique_rand(rng, _extra_size):
- ind = rng.rand(int(_extra_size))
- return np.unique(np.floor(ind * nrows * ncols))[:size]
-
- ind = _gen_unique_rand(random_state, extra_size)
- while ind.size < size:
- extra_size *= 1.05
- ind = _gen_unique_rand(random_state, extra_size)
-
- j = np.floor(ind * 1.0 / nrows).astype(int)
- i = (ind - j * nrows).astype(int)
- return i.tolist(), j.tolist()
-
-
-def makeMissingDataframe(density: float = 0.9, random_state=None) -> DataFrame:
- df = makeDataFrame()
- i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state)
- df.iloc[i, j] = np.nan
- return df
-
-
-class SubclassedSeries(Series):
- _metadata = ["testattr", "name"]
-
- @property
- def _constructor(self):
- # For testing, those properties return a generic callable, and not
- # the actual class. In this case that is equivalent, but it is to
- # ensure we don't rely on the property returning a class
- # See https://github.com/pandas-dev/pandas/pull/46018 and
- # https://github.com/pandas-dev/pandas/issues/32638 and linked issues
- return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs)
-
- @property
- def _constructor_expanddim(self):
- return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs)
-
-
-class SubclassedDataFrame(DataFrame):
- _metadata = ["testattr"]
-
- @property
- def _constructor(self):
- return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs)
-
- @property
- def _constructor_sliced(self):
- return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs)
-
-
-class SubclassedCategorical(Categorical):
- @property
- def _constructor(self):
- return SubclassedCategorical
-
-
-def _make_skipna_wrapper(alternative, skipna_alternative=None):
- """
- Create a function for calling on an array.
-
- Parameters
- ----------
- alternative : function
- The function to be called on the array with no NaNs.
- Only used when 'skipna_alternative' is None.
- skipna_alternative : function
- The function to be called on the original array
-
- Returns
- -------
- function
- """
- if skipna_alternative:
-
- def skipna_wrapper(x):
- return skipna_alternative(x.values)
-
- else:
-
- def skipna_wrapper(x):
- nona = x.dropna()
- if len(nona) == 0:
- return np.nan
- return alternative(nona)
-
- return skipna_wrapper
-
-
-def convert_rows_list_to_csv_str(rows_list: list[str]) -> str:
- """
- Convert list of CSV rows to single CSV-formatted string for current OS.
-
- This method is used for creating expected value of to_csv() method.
-
- Parameters
- ----------
- rows_list : List[str]
- Each element represents the row of csv.
-
- Returns
- -------
- str
- Expected output of to_csv() in current OS.
- """
- sep = os.linesep
- return sep.join(rows_list) + sep
-
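A small sketch of how convert_rows_list_to_csv_str is meant to be used when building to_csv expectations (same pandas-build assumption):

import os
import pandas._testing as tm

expected = tm.convert_rows_list_to_csv_str(["a,b", "1,2"])
# rows joined with the platform line separator, plus a trailing separator
assert expected == os.linesep.join(["a,b", "1,2"]) + os.linesep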
-
-def external_error_raised(expected_exception: type[Exception]) -> ContextManager:
- """
- Helper function to mark pytest.raises that have an external error message.
-
- Parameters
- ----------
- expected_exception : Exception
- Expected error to raise.
-
- Returns
- -------
- Callable
- Regular `pytest.raises` function with `match` equal to `None`.
- """
- import pytest
-
- return pytest.raises(expected_exception, match=None)
-
-
-cython_table = pd.core.common._cython_table.items()
-
-
-def get_cython_table_params(ndframe, func_names_and_expected):
- """
- Combine frame, functions from com._cython_table
- keys and expected result.
-
- Parameters
- ----------
- ndframe : DataFrame or Series
- func_names_and_expected : Sequence of two items
- The first item is a name of a NDFrame method ('sum', 'prod') etc.
- The second item is the expected return value.
-
- Returns
- -------
- list
- List of three items (DataFrame, function, expected result)
- """
- results = []
- for func_name, expected in func_names_and_expected:
- results.append((ndframe, func_name, expected))
- results += [
- (ndframe, func, expected)
- for func, name in cython_table
- if name == func_name
- ]
- return results
-
-
-def get_op_from_name(op_name: str) -> Callable:
- """
- The operator function for a given op name.
-
- Parameters
- ----------
- op_name : str
- The op name, in form of "add" or "__add__".
-
- Returns
- -------
- function
- A function performing the operation.
- """
- short_opname = op_name.strip("_")
- try:
- op = getattr(operator, short_opname)
- except AttributeError:
- # Assume it is the reverse operator
- rop = getattr(operator, short_opname[1:])
- op = lambda x, y: rop(y, x)
-
- return op
-
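A quick sketch of get_op_from_name, including the reversed-operator fallback implemented in the except branch above:

import pandas._testing as tm

add = tm.get_op_from_name("add")
add(2, 3)          # 5

# "__radd__" has no operator.radd, so the helper swaps the arguments
radd = tm.get_op_from_name("__radd__")
radd("a", "b")     # "ba"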
-
-# -----------------------------------------------------------------------------
-# Indexing test helpers
-
-
-def getitem(x):
- return x
-
-
-def setitem(x):
- return x
-
-
-def loc(x):
- return x.loc
-
-
-def iloc(x):
- return x.iloc
-
-
-def at(x):
- return x.at
-
-
-def iat(x):
- return x.iat
-
-
-# -----------------------------------------------------------------------------
-
-
-def shares_memory(left, right) -> bool:
- """
- Pandas-compat for np.shares_memory.
- """
- if isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
- return np.shares_memory(left, right)
- elif isinstance(left, np.ndarray):
- # Call with reversed args to get to unpacking logic below.
- return shares_memory(right, left)
-
- if isinstance(left, RangeIndex):
- return False
- if isinstance(left, MultiIndex):
- return shares_memory(left._codes, right)
- if isinstance(left, (Index, Series)):
- return shares_memory(left._values, right)
-
- if isinstance(left, NDArrayBackedExtensionArray):
- return shares_memory(left._ndarray, right)
- if isinstance(left, pd.core.arrays.SparseArray):
- return shares_memory(left.sp_values, right)
- if isinstance(left, pd.core.arrays.IntervalArray):
- return shares_memory(left._left, right) or shares_memory(left._right, right)
-
- if isinstance(left, ExtensionArray) and left.dtype == "string[pyarrow]":
- # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
- left = cast("ArrowExtensionArray", left)
- if isinstance(right, ExtensionArray) and right.dtype == "string[pyarrow]":
- right = cast("ArrowExtensionArray", right)
- left_pa_data = left._data
- right_pa_data = right._data
- left_buf1 = left_pa_data.chunk(0).buffers()[1]
- right_buf1 = right_pa_data.chunk(0).buffers()[1]
- return left_buf1 == right_buf1
-
- if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
- # By convention, we'll say these share memory if they share *either*
- # the _data or the _mask
- return np.shares_memory(left._data, right._data) or np.shares_memory(
- left._mask, right._mask
- )
-
- if isinstance(left, DataFrame) and len(left._mgr.arrays) == 1:
- arr = left._mgr.arrays[0]
- return shares_memory(arr, right)
-
- raise NotImplementedError(type(left), type(right))
-
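A short sketch of shares_memory on a numpy-backed Series (same pandas-build assumption):

import numpy as np
import pandas as pd
import pandas._testing as tm

ser = pd.Series(np.arange(4))
tm.shares_memory(ser, ser.iloc[:2])  # True: slicing returns a view on the same buffer
tm.shares_memory(ser, ser.copy())    # False: the copy owns fresh data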
-
-__all__ = [
- "ALL_INT_EA_DTYPES",
- "ALL_INT_NUMPY_DTYPES",
- "ALL_NUMPY_DTYPES",
- "ALL_REAL_NUMPY_DTYPES",
- "all_timeseries_index_generator",
- "assert_almost_equal",
- "assert_attr_equal",
- "assert_categorical_equal",
- "assert_class_equal",
- "assert_contains_all",
- "assert_copy",
- "assert_datetime_array_equal",
- "assert_dict_equal",
- "assert_equal",
- "assert_extension_array_equal",
- "assert_frame_equal",
- "assert_index_equal",
- "assert_indexing_slices_equivalent",
- "assert_interval_array_equal",
- "assert_is_sorted",
- "assert_is_valid_plot_return_object",
- "assert_metadata_equivalent",
- "assert_numpy_array_equal",
- "assert_period_array_equal",
- "assert_produces_warning",
- "assert_series_equal",
- "assert_sp_array_equal",
- "assert_timedelta_array_equal",
- "at",
- "BOOL_DTYPES",
- "box_expected",
- "BYTES_DTYPES",
- "can_set_locale",
- "close",
- "COMPLEX_DTYPES",
- "convert_rows_list_to_csv_str",
- "DATETIME64_DTYPES",
- "decompress_file",
- "EMPTY_STRING_PATTERN",
- "ENDIAN",
- "ensure_clean",
- "ensure_safe_environment_variables",
- "equalContents",
- "external_error_raised",
- "FLOAT_EA_DTYPES",
- "FLOAT_NUMPY_DTYPES",
- "getCols",
- "get_cython_table_params",
- "get_dtype",
- "getitem",
- "get_locales",
- "getMixedTypeDict",
- "get_obj",
- "get_op_from_name",
- "getPeriodData",
- "getSeriesData",
- "getTimeSeriesData",
- "iat",
- "iloc",
- "index_subclass_makers_generator",
- "loc",
- "makeBoolIndex",
- "makeCategoricalIndex",
- "makeCustomDataframe",
- "makeCustomIndex",
- "makeDataFrame",
- "makeDateIndex",
- "makeFloatIndex",
- "makeFloatSeries",
- "makeIntervalIndex",
- "makeIntIndex",
- "makeMissingDataframe",
- "makeMixedDataFrame",
- "makeMultiIndex",
- "makeNumericIndex",
- "makeObjectSeries",
- "makePeriodFrame",
- "makePeriodIndex",
- "makePeriodSeries",
- "make_rand_series",
- "makeRangeIndex",
- "makeStringIndex",
- "makeStringSeries",
- "makeTimeDataFrame",
- "makeTimedeltaIndex",
- "makeTimeSeries",
- "makeUIntIndex",
- "maybe_produces_warning",
- "NARROW_NP_DTYPES",
- "network",
- "NP_NAT_OBJECTS",
- "NULL_OBJECTS",
- "OBJECT_DTYPES",
- "raise_assert_detail",
- "rands",
- "reset_display_options",
- "raises_chained_assignment_error",
- "round_trip_localpath",
- "round_trip_pathlib",
- "round_trip_pickle",
- "setitem",
- "set_locale",
- "set_timezone",
- "shares_memory",
- "SIGNED_INT_EA_DTYPES",
- "SIGNED_INT_NUMPY_DTYPES",
- "STRING_DTYPES",
- "SubclassedCategorical",
- "SubclassedDataFrame",
- "SubclassedSeries",
- "TIMEDELTA64_DTYPES",
- "to_array",
- "UNSIGNED_INT_EA_DTYPES",
- "UNSIGNED_INT_NUMPY_DTYPES",
- "use_numexpr",
- "with_csv_dialect",
- "write_to_compressed",
-]
diff --git a/contrib/python/pandas/py3/pandas/_testing/_hypothesis.py b/contrib/python/pandas/py3/pandas/_testing/_hypothesis.py
deleted file mode 100644
index 5256a303de3..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/_hypothesis.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""
-Hypothesis data generator helpers.
-"""
-from datetime import datetime
-
-from hypothesis import strategies as st
-from hypothesis.extra.dateutil import timezones as dateutil_timezones
-from hypothesis.extra.pytz import timezones as pytz_timezones
-
-from pandas.compat import is_platform_windows
-
-import pandas as pd
-
-from pandas.tseries.offsets import (
- BMonthBegin,
- BMonthEnd,
- BQuarterBegin,
- BQuarterEnd,
- BYearBegin,
- BYearEnd,
- MonthBegin,
- MonthEnd,
- QuarterBegin,
- QuarterEnd,
- YearBegin,
- YearEnd,
-)
-
-OPTIONAL_INTS = st.lists(st.one_of(st.integers(), st.none()), max_size=10, min_size=3)
-
-OPTIONAL_FLOATS = st.lists(st.one_of(st.floats(), st.none()), max_size=10, min_size=3)
-
-OPTIONAL_TEXT = st.lists(st.one_of(st.none(), st.text()), max_size=10, min_size=3)
-
-OPTIONAL_DICTS = st.lists(
- st.one_of(st.none(), st.dictionaries(st.text(), st.integers())),
- max_size=10,
- min_size=3,
-)
-
-OPTIONAL_LISTS = st.lists(
- st.one_of(st.none(), st.lists(st.text(), max_size=10, min_size=3)),
- max_size=10,
- min_size=3,
-)
-
-OPTIONAL_ONE_OF_ALL = st.one_of(
- OPTIONAL_DICTS, OPTIONAL_FLOATS, OPTIONAL_INTS, OPTIONAL_LISTS, OPTIONAL_TEXT
-)
-
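A minimal sketch of how these strategies are meant to be consumed in a hypothesis-driven test (the test name and property are illustrative, not taken from the pandas test suite):

from hypothesis import given
import pandas as pd
from pandas._testing._hypothesis import OPTIONAL_INTS

@given(data=OPTIONAL_INTS)
def test_series_keeps_length(data):
    # data is a list of 3-10 elements, each an int or None
    assert len(pd.Series(data, dtype=object)) == len(data)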
-if is_platform_windows():
- DATETIME_NO_TZ = st.datetimes(min_value=datetime(1900, 1, 1))
-else:
- DATETIME_NO_TZ = st.datetimes()
-
-DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes(
- min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),
- max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),
- timezones=st.one_of(st.none(), dateutil_timezones(), pytz_timezones()),
-)
-
-DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ = st.datetimes(
- min_value=pd.Timestamp.min.to_pydatetime(warn=False),
- max_value=pd.Timestamp.max.to_pydatetime(warn=False),
-)
-
-INT_NEG_999_TO_POS_999 = st.integers(-999, 999)
-
-# The strategy for each type is registered in conftest.py, as they don't carry
-# enough runtime information (e.g. type hints) to infer how to build them.
-YQM_OFFSET = st.one_of(
- *map(
- st.from_type,
- [
- MonthBegin,
- MonthEnd,
- BMonthBegin,
- BMonthEnd,
- QuarterBegin,
- QuarterEnd,
- BQuarterBegin,
- BQuarterEnd,
- YearBegin,
- YearEnd,
- BYearBegin,
- BYearEnd,
- ],
- )
-)
diff --git a/contrib/python/pandas/py3/pandas/_testing/_io.py b/contrib/python/pandas/py3/pandas/_testing/_io.py
deleted file mode 100644
index 29618bdd649..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/_io.py
+++ /dev/null
@@ -1,435 +0,0 @@
-from __future__ import annotations
-
-import bz2
-from functools import wraps
-import gzip
-import io
-import socket
-import tarfile
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
-)
-import zipfile
-
-from pandas._typing import (
- FilePath,
- ReadPickleBuffer,
-)
-from pandas.compat import get_lzma_file
-from pandas.compat._optional import import_optional_dependency
-
-import pandas as pd
-from pandas._testing._random import rands
-from pandas._testing.contexts import ensure_clean
-
-from pandas.io.common import urlopen
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-# skip tests on exceptions with these messages
-_network_error_messages = (
- # 'urlopen error timed out',
- # 'timeout: timed out',
- # 'socket.timeout: timed out',
- "timed out",
- "Server Hangup",
- "HTTP Error 503: Service Unavailable",
- "502: Proxy Error",
- "HTTP Error 502: internal error",
- "HTTP Error 502",
- "HTTP Error 503",
- "HTTP Error 403",
- "HTTP Error 400",
- "Temporary failure in name resolution",
- "Name or service not known",
- "Connection refused",
- "certificate verify",
-)
-
-# or this e.errno/e.reason.errno
-_network_errno_vals = (
- 101, # Network is unreachable
- 111, # Connection refused
- 110, # Connection timed out
- 104, # Connection reset Error
- 54, # Connection reset by peer
- 60, # urllib.error.URLError: [Errno 60] Connection timed out
-)
-
-# Neither of the above should mask real issues such as 404s
-# or refused connections (changed DNS).
-# But some tests (e.g. test_data yahoo) contact incredibly flaky
-# servers.
-
-# and conditionally raise on exception types in _get_default_network_errors
-
-
-def _get_default_network_errors():
- # Lazy import for http.client & urllib.error
- # because it imports many things from the stdlib
- import http.client
- import urllib.error
-
- return (
- OSError,
- http.client.HTTPException,
- TimeoutError,
- urllib.error.URLError,
- socket.timeout,
- )
-
-
-def optional_args(decorator):
- """
- allows a decorator to take optional positional and keyword arguments.
- Assumes that taking a single, callable, positional argument means that
- it is decorating a function, i.e. something like this::
-
- @my_decorator
- def function(): pass
-
- Calls decorator with decorator(f, *args, **kwargs)
- """
-
- @wraps(decorator)
- def wrapper(*args, **kwargs):
- def dec(f):
- return decorator(f, *args, **kwargs)
-
- is_decorating = not kwargs and len(args) == 1 and callable(args[0])
- if is_decorating:
- f = args[0]
- args = ()
- return dec(f)
- else:
- return dec
-
- return wrapper
-
-
-# error: Untyped decorator makes function "network" untyped
-@optional_args # type: ignore[misc]
-def network(
- t,
- url: str = "https://www.google.com",
- raise_on_error: bool = False,
- check_before_test: bool = False,
- error_classes=None,
- skip_errnos=_network_errno_vals,
- _skip_on_messages=_network_error_messages,
-):
- """
- Label a test as requiring network connection and, if an error is
- encountered, only raise if it does not find a network connection.
-
-    Using this decorator adds a contract to your test:
- you must assert that, under normal conditions, your test will ONLY fail if
- it does not have network connectivity.
-
- You can call this in 3 ways: as a standard decorator, with keyword
- arguments, or with a positional argument that is the url to check.
-
- Parameters
- ----------
- t : callable
- The test requiring network connectivity.
- url : path
- The url to test via ``pandas.io.common.urlopen`` to check
- for connectivity. Defaults to 'https://www.google.com'.
- raise_on_error : bool
- If True, never catches errors.
- check_before_test : bool
- If True, checks connectivity before running the test case.
- error_classes : tuple or Exception
- error classes to ignore. If not in ``error_classes``, raises the error.
- defaults to OSError. Be careful about changing the error classes here.
- skip_errnos : iterable of int
-        Any exception that has .errno or .reason.errno set to one
- of these values will be skipped with an appropriate
- message.
- _skip_on_messages: iterable of string
- any exception e for which one of the strings is
- a substring of str(e) will be skipped with an appropriate
- message. Intended to suppress errors where an errno isn't available.
-
- Notes
- -----
- * ``raise_on_error`` supersedes ``check_before_test``
-
- Returns
- -------
- t : callable
- The decorated test ``t``, with checks for connectivity errors.
-
- Example
- -------
-
- Tests decorated with @network will fail if it's possible to make a network
- connection to another URL (defaults to google.com)::
-
- >>> from pandas import _testing as tm
- >>> @tm.network
- ... def test_network():
- ... with pd.io.common.urlopen("rabbit://bonanza.com"):
- ... pass
- >>> test_network() # doctest: +SKIP
- Traceback
- ...
- URLError: <urlopen error unknown url type: rabbit>
-
- You can specify alternative URLs::
-
- >>> @tm.network("https://www.yahoo.com")
- ... def test_something_with_yahoo():
- ... raise OSError("Failure Message")
- >>> test_something_with_yahoo() # doctest: +SKIP
- Traceback (most recent call last):
- ...
- OSError: Failure Message
-
- If you set check_before_test, it will check the url first and not run the
- test on failure::
-
- >>> @tm.network("failing://url.blaher", check_before_test=True)
- ... def test_something():
- ... print("I ran!")
- ... raise ValueError("Failure")
- >>> test_something() # doctest: +SKIP
- Traceback (most recent call last):
- ...
-
- Errors not related to networking will always be raised.
- """
- import pytest
-
- if error_classes is None:
- error_classes = _get_default_network_errors()
-
- t.network = True
-
- @wraps(t)
- def wrapper(*args, **kwargs):
- if (
- check_before_test
- and not raise_on_error
- and not can_connect(url, error_classes)
- ):
- pytest.skip(
- f"May not have network connectivity because cannot connect to {url}"
- )
- try:
- return t(*args, **kwargs)
- except Exception as err:
- errno = getattr(err, "errno", None)
-            if not errno and hasattr(err, "reason"):
- # error: "Exception" has no attribute "reason"
- errno = getattr(err.reason, "errno", None) # type: ignore[attr-defined]
-
- if errno in skip_errnos:
- pytest.skip(f"Skipping test due to known errno and error {err}")
-
- e_str = str(err)
-
- if any(m.lower() in e_str.lower() for m in _skip_on_messages):
- pytest.skip(
- f"Skipping test because exception message is known and error {err}"
- )
-
- if not isinstance(err, error_classes) or raise_on_error:
- raise
- pytest.skip(f"Skipping test due to lack of connectivity and error {err}")
-
- return wrapper
-
-
-def can_connect(url, error_classes=None) -> bool:
- """
- Try to connect to the given url. True if succeeds, False if OSError
- raised
-
- Parameters
- ----------
-    url : str
- The URL to try to connect to
-
- Returns
- -------
- connectable : bool
- Return True if no OSError (unable to connect) or URLError (bad url) was
- raised
- """
- if error_classes is None:
- error_classes = _get_default_network_errors()
-
- try:
- with urlopen(url, timeout=20) as response:
- # Timeout just in case rate-limiting is applied
- if response.status != 200:
- return False
- except error_classes:
- return False
- else:
- return True
-
-
-# ------------------------------------------------------------------
-# File-IO
-
-
-def round_trip_pickle(
- obj: Any, path: FilePath | ReadPickleBuffer | None = None
-) -> DataFrame | Series:
- """
- Pickle an object and then read it again.
-
- Parameters
- ----------
- obj : any object
- The object to pickle and then re-read.
- path : str, path object or file-like object, default None
- The path where the pickled object is written and then read.
-
- Returns
- -------
- pandas object
- The original object that was pickled and then re-read.
- """
- _path = path
- if _path is None:
- _path = f"__{rands(10)}__.pickle"
- with ensure_clean(_path) as temp_path:
- pd.to_pickle(obj, temp_path)
- return pd.read_pickle(temp_path)
-
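A short usage sketch of round_trip_pickle (same pandas-build assumption):

import pandas._testing as tm

df = tm.makeMixedDataFrame()
result = tm.round_trip_pickle(df)
tm.assert_frame_equal(result, df)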
-
-def round_trip_pathlib(writer, reader, path: str | None = None):
- """
- Write an object to file specified by a pathlib.Path and read it back
-
- Parameters
- ----------
- writer : callable bound to pandas object
- IO writing function (e.g. DataFrame.to_csv )
- reader : callable
- IO reading function (e.g. pd.read_csv )
- path : str, default None
- The path where the object is written and then read.
-
- Returns
- -------
- pandas object
- The original object that was serialized and then re-read.
- """
- import pytest
-
- Path = pytest.importorskip("pathlib").Path
- if path is None:
- path = "___pathlib___"
- with ensure_clean(path) as path:
- writer(Path(path))
- obj = reader(Path(path))
- return obj
-
-
-def round_trip_localpath(writer, reader, path: str | None = None):
- """
- Write an object to file specified by a py.path LocalPath and read it back.
-
- Parameters
- ----------
- writer : callable bound to pandas object
- IO writing function (e.g. DataFrame.to_csv )
- reader : callable
- IO reading function (e.g. pd.read_csv )
- path : str, default None
- The path where the object is written and then read.
-
- Returns
- -------
- pandas object
- The original object that was serialized and then re-read.
- """
- import pytest
-
- LocalPath = pytest.importorskip("py.path").local
- if path is None:
- path = "___localpath___"
- with ensure_clean(path) as path:
- writer(LocalPath(path))
- obj = reader(LocalPath(path))
- return obj
-
-
-def write_to_compressed(compression, path, data, dest: str = "test"):
- """
- Write data to a compressed file.
-
- Parameters
- ----------
- compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd'}
- The compression type to use.
- path : str
- The file path to write the data.
- data : str
- The data to write.
- dest : str, default "test"
- The destination file (for ZIP only)
-
- Raises
- ------
- ValueError : An invalid compression value was passed in.
- """
- args: tuple[Any, ...] = (data,)
- mode = "wb"
- method = "write"
- compress_method: Callable
-
- if compression == "zip":
- compress_method = zipfile.ZipFile
- mode = "w"
- args = (dest, data)
- method = "writestr"
- elif compression == "tar":
- compress_method = tarfile.TarFile
- mode = "w"
- file = tarfile.TarInfo(name=dest)
-        buf = io.BytesIO(data)  # avoid shadowing the built-in name "bytes"
-        file.size = len(data)
-        args = (file, buf)
- method = "addfile"
- elif compression == "gzip":
- compress_method = gzip.GzipFile
- elif compression == "bz2":
- compress_method = bz2.BZ2File
- elif compression == "zstd":
- compress_method = import_optional_dependency("zstandard").open
- elif compression == "xz":
- compress_method = get_lzma_file()
- else:
- raise ValueError(f"Unrecognized compression type: {compression}")
-
- with compress_method(path, mode=mode) as f:
- getattr(f, method)(*args)
-
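A gzip round-trip sketch for write_to_compressed, passing bytes since the binary writers expect them (same pandas-build assumption):

import gzip
import pandas._testing as tm

with tm.ensure_clean("payload.csv.gz") as path:
    tm.write_to_compressed("gzip", path, b"a,b\n1,2\n")
    with gzip.open(path, "rb") as fh:
        assert fh.read() == b"a,b\n1,2\n"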
-
-# ------------------------------------------------------------------
-# Plotting
-
-
-def close(fignum=None) -> None:
- from matplotlib.pyplot import (
- close as _close,
- get_fignums,
- )
-
- if fignum is None:
- for fignum in get_fignums():
- _close(fignum)
- else:
- _close(fignum)
diff --git a/contrib/python/pandas/py3/pandas/_testing/_random.py b/contrib/python/pandas/py3/pandas/_testing/_random.py
deleted file mode 100644
index 7cfd92efb5d..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/_random.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import string
-
-import numpy as np
-
-from pandas._typing import NpDtype
-
-RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1))
-
-
-def rands_array(nchars, size, dtype: NpDtype = "O", replace: bool = True) -> np.ndarray:
- """
-    Generate an array of random nchars-length strings.
- """
- retval = (
- np.random.choice(RANDS_CHARS, size=nchars * np.prod(size), replace=replace)
- .view((np.str_, nchars))
- .reshape(size)
- )
- return retval.astype(dtype)
-
-
-def rands(nchars) -> str:
- """
- Generate one random byte string.
-
- See `rands_array` if you want to create an array of random strings.
-
- """
- return "".join(np.random.choice(RANDS_CHARS, nchars))
diff --git a/contrib/python/pandas/py3/pandas/_testing/_warnings.py b/contrib/python/pandas/py3/pandas/_testing/_warnings.py
deleted file mode 100644
index 201aa811833..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/_warnings.py
+++ /dev/null
@@ -1,216 +0,0 @@
-from __future__ import annotations
-
-from contextlib import (
- contextmanager,
- nullcontext,
-)
-import re
-import sys
-from typing import (
- Generator,
- Literal,
- Sequence,
- Type,
- cast,
-)
-import warnings
-
-
-@contextmanager
-def assert_produces_warning(
- expected_warning: type[Warning] | bool | tuple[type[Warning], ...] | None = Warning,
- filter_level: Literal[
- "error", "ignore", "always", "default", "module", "once"
- ] = "always",
- check_stacklevel: bool = True,
- raise_on_extra_warnings: bool = True,
- match: str | None = None,
-) -> Generator[list[warnings.WarningMessage], None, None]:
- """
- Context manager for running code expected to either raise a specific warning,
- multiple specific warnings, or not raise any warnings. Verifies that the code
- raises the expected warning(s), and that it does not raise any other unexpected
- warnings. It is basically a wrapper around ``warnings.catch_warnings``.
-
- Parameters
- ----------
- expected_warning : {Warning, False, tuple[Warning, ...], None}, default Warning
-        The type of warning expected. ``Warning`` is the base
-        class for all warnings. To check for multiple warning types,
-        pass them as a tuple. To check that no warning is emitted,
-        specify ``False`` or ``None``.
- filter_level : str or None, default "always"
- Specifies whether warnings are ignored, displayed, or turned
- into errors.
- Valid values are:
-
- * "error" - turns matching warnings into exceptions
- * "ignore" - discard the warning
- * "always" - always emit a warning
- * "default" - print the warning the first time it is generated
- from each location
- * "module" - print the warning the first time it is generated
- from each module
- * "once" - print the warning the first time it is generated
-
- check_stacklevel : bool, default True
- If True, displays the line that called the function containing
-        the warning to show where the function is called. Otherwise, the
- line that implements the function is displayed.
- raise_on_extra_warnings : bool, default True
- Whether extra warnings not of the type `expected_warning` should
- cause the test to fail.
- match : str, optional
- Match warning message.
-
- Examples
- --------
- >>> import warnings
- >>> with assert_produces_warning():
- ... warnings.warn(UserWarning())
- ...
- >>> with assert_produces_warning(False):
- ... warnings.warn(RuntimeWarning())
- ...
- Traceback (most recent call last):
- ...
- AssertionError: Caused unexpected warning(s): ['RuntimeWarning'].
- >>> with assert_produces_warning(UserWarning):
- ... warnings.warn(RuntimeWarning())
- Traceback (most recent call last):
- ...
- AssertionError: Did not see expected warning of class 'UserWarning'.
-
-    .. warning:: This is *not* thread-safe.
- """
- __tracebackhide__ = True
-
- with warnings.catch_warnings(record=True) as w:
- warnings.simplefilter(filter_level)
- try:
- yield w
- finally:
- if expected_warning:
- expected_warning = cast(Type[Warning], expected_warning)
- _assert_caught_expected_warning(
- caught_warnings=w,
- expected_warning=expected_warning,
- match=match,
- check_stacklevel=check_stacklevel,
- )
- if raise_on_extra_warnings:
- _assert_caught_no_extra_warnings(
- caught_warnings=w,
- expected_warning=expected_warning,
- )
-
-
-def maybe_produces_warning(warning: type[Warning], condition: bool, **kwargs):
- """
- Return a context manager that possibly checks a warning based on the condition
- """
- if condition:
- return assert_produces_warning(warning, **kwargs)
- else:
- return nullcontext()
-
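A small sketch of maybe_produces_warning in both branches (same pandas-build assumption):

import warnings
import pandas._testing as tm

# condition=True behaves like assert_produces_warning(FutureWarning)
with tm.maybe_produces_warning(FutureWarning, condition=True):
    warnings.warn("deprecated", FutureWarning)

# condition=False returns a plain nullcontext: nothing is required or checked
with tm.maybe_produces_warning(FutureWarning, condition=False):
    pass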
-
-def _assert_caught_expected_warning(
- *,
- caught_warnings: Sequence[warnings.WarningMessage],
- expected_warning: type[Warning],
- match: str | None,
- check_stacklevel: bool,
-) -> None:
- """Assert that there was the expected warning among the caught warnings."""
- saw_warning = False
- matched_message = False
- unmatched_messages = []
-
- for actual_warning in caught_warnings:
- if issubclass(actual_warning.category, expected_warning):
- saw_warning = True
-
- if check_stacklevel:
- _assert_raised_with_correct_stacklevel(actual_warning)
-
- if match is not None:
- if re.search(match, str(actual_warning.message)):
- matched_message = True
- else:
- unmatched_messages.append(actual_warning.message)
-
- if not saw_warning:
- raise AssertionError(
- f"Did not see expected warning of class "
- f"{repr(expected_warning.__name__)}"
- )
-
- if match and not matched_message:
- raise AssertionError(
- f"Did not see warning {repr(expected_warning.__name__)} "
- f"matching '{match}'. The emitted warning messages are "
- f"{unmatched_messages}"
- )
-
-
-def _assert_caught_no_extra_warnings(
- *,
- caught_warnings: Sequence[warnings.WarningMessage],
- expected_warning: type[Warning] | bool | tuple[type[Warning], ...] | None,
-) -> None:
- """Assert that no extra warnings apart from the expected ones are caught."""
- extra_warnings = []
-
- for actual_warning in caught_warnings:
- if _is_unexpected_warning(actual_warning, expected_warning):
- # GH#38630 pytest.filterwarnings does not suppress these.
- if actual_warning.category == ResourceWarning:
- # GH 44732: Don't make the CI flaky by filtering SSL-related
- # ResourceWarning from dependencies
- if "unclosed <ssl.SSLSocket" in str(actual_warning.message):
- continue
- # GH 44844: Matplotlib leaves font files open during the entire process
- # upon import. Don't make CI flaky if ResourceWarning raised
- # due to these open files.
- if any("matplotlib" in mod for mod in sys.modules):
- continue
- extra_warnings.append(
- (
- actual_warning.category.__name__,
- actual_warning.message,
- actual_warning.filename,
- actual_warning.lineno,
- )
- )
-
- if extra_warnings:
- raise AssertionError(f"Caused unexpected warning(s): {repr(extra_warnings)}")
-
-
-def _is_unexpected_warning(
- actual_warning: warnings.WarningMessage,
- expected_warning: type[Warning] | bool | tuple[type[Warning], ...] | None,
-) -> bool:
- """Check if the actual warning issued is unexpected."""
- if actual_warning and not expected_warning:
- return True
- expected_warning = cast(Type[Warning], expected_warning)
- return bool(not issubclass(actual_warning.category, expected_warning))
-
-
-def _assert_raised_with_correct_stacklevel(
- actual_warning: warnings.WarningMessage,
-) -> None:
- from inspect import (
- getframeinfo,
- stack,
- )
-
- caller = getframeinfo(stack()[4][0])
- msg = (
- "Warning not set with correct stacklevel. "
- f"File where warning is raised: {actual_warning.filename} != "
- f"{caller.filename}. Warning message: {actual_warning.message}"
- )
- assert actual_warning.filename == caller.filename, msg
diff --git a/contrib/python/pandas/py3/pandas/_testing/asserters.py b/contrib/python/pandas/py3/pandas/_testing/asserters.py
deleted file mode 100644
index 196ebd6003d..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/asserters.py
+++ /dev/null
@@ -1,1378 +0,0 @@
-from __future__ import annotations
-
-import operator
-from typing import (
- Literal,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs.missing import is_matching_na
-from pandas._libs.sparse import SparseIndex
-import pandas._libs.testing as _testing
-from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions
-
-from pandas.core.dtypes.common import (
- is_bool,
- is_categorical_dtype,
- is_extension_array_dtype,
- is_integer_dtype,
- is_interval_dtype,
- is_number,
- is_numeric_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- DatetimeTZDtype,
- PandasDtype,
-)
-from pandas.core.dtypes.missing import array_equivalent
-
-import pandas as pd
-from pandas import (
- Categorical,
- DataFrame,
- DatetimeIndex,
- Index,
- IntervalIndex,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- Series,
- TimedeltaIndex,
-)
-from pandas.core.algorithms import take_nd
-from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
- IntervalArray,
- PeriodArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
-from pandas.core.arrays.string_ import StringDtype
-from pandas.core.indexes.api import safe_sort_index
-
-from pandas.io.formats.printing import pprint_thing
-
-
-def assert_almost_equal(
- left,
- right,
- check_dtype: bool | Literal["equiv"] = "equiv",
- rtol: float = 1.0e-5,
- atol: float = 1.0e-8,
- **kwargs,
-) -> None:
- """
- Check that the left and right objects are approximately equal.
-
- By approximately equal, we refer to objects that are numbers or that
- contain numbers which may be equivalent to specific levels of precision.
-
- Parameters
- ----------
- left : object
- right : object
- check_dtype : bool or {'equiv'}, default 'equiv'
-        Check dtype if both left and right are the same type. If 'equiv' is passed in,
- then `RangeIndex` and `Index` with int64 dtype are also considered
- equivalent when doing type checking.
- rtol : float, default 1e-5
- Relative tolerance.
-
- .. versionadded:: 1.1.0
- atol : float, default 1e-8
- Absolute tolerance.
-
- .. versionadded:: 1.1.0
- """
- if isinstance(left, Index):
- assert_index_equal(
- left,
- right,
- check_exact=False,
- exact=check_dtype,
- rtol=rtol,
- atol=atol,
- **kwargs,
- )
-
- elif isinstance(left, Series):
- assert_series_equal(
- left,
- right,
- check_exact=False,
- check_dtype=check_dtype,
- rtol=rtol,
- atol=atol,
- **kwargs,
- )
-
- elif isinstance(left, DataFrame):
- assert_frame_equal(
- left,
- right,
- check_exact=False,
- check_dtype=check_dtype,
- rtol=rtol,
- atol=atol,
- **kwargs,
- )
-
- else:
- # Other sequences.
- if check_dtype:
- if is_number(left) and is_number(right):
- # Do not compare numeric classes, like np.float64 and float.
- pass
- elif is_bool(left) and is_bool(right):
- # Do not compare bool classes, like np.bool_ and bool.
- pass
- else:
- if isinstance(left, np.ndarray) or isinstance(right, np.ndarray):
- obj = "numpy array"
- else:
- obj = "Input"
- assert_class_equal(left, right, obj=obj)
-
- # if we have "equiv", this becomes True
- _testing.assert_almost_equal(
- left, right, check_dtype=bool(check_dtype), rtol=rtol, atol=atol, **kwargs
- )
-
-
-def _check_isinstance(left, right, cls):
- """
- Helper method for our assert_* methods that ensures that
- the two objects being compared have the right type before
- proceeding with the comparison.
-
- Parameters
- ----------
- left : The first object being compared.
- right : The second object being compared.
- cls : The class type to check against.
-
- Raises
- ------
- AssertionError : Either `left` or `right` is not an instance of `cls`.
- """
- cls_name = cls.__name__
-
- if not isinstance(left, cls):
- raise AssertionError(
- f"{cls_name} Expected type {cls}, found {type(left)} instead"
- )
- if not isinstance(right, cls):
- raise AssertionError(
- f"{cls_name} Expected type {cls}, found {type(right)} instead"
- )
-
-
-def assert_dict_equal(left, right, compare_keys: bool = True) -> None:
- _check_isinstance(left, right, dict)
- _testing.assert_dict_equal(left, right, compare_keys=compare_keys)
-
-
-def assert_index_equal(
- left: Index,
- right: Index,
- exact: bool | str = "equiv",
- check_names: bool = True,
- check_exact: bool = True,
- check_categorical: bool = True,
- check_order: bool = True,
- rtol: float = 1.0e-5,
- atol: float = 1.0e-8,
- obj: str = "Index",
-) -> None:
- """
- Check that left and right Index are equal.
-
- Parameters
- ----------
- left : Index
- right : Index
- exact : bool or {'equiv'}, default 'equiv'
- Whether to check the Index class, dtype and inferred_type
- are identical. If 'equiv', then RangeIndex can be substituted for
- Index with an int64 dtype as well.
- check_names : bool, default True
- Whether to check the names attribute.
- check_exact : bool, default True
-        Whether to compare numbers exactly.
- check_categorical : bool, default True
- Whether to compare internal Categorical exactly.
- check_order : bool, default True
- Whether to compare the order of index entries as well as their values.
- If True, both indexes must contain the same elements, in the same order.
- If False, both indexes must contain the same elements, but in any order.
-
- .. versionadded:: 1.2.0
- rtol : float, default 1e-5
- Relative tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- atol : float, default 1e-8
- Absolute tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- obj : str, default 'Index'
- Specify object name being compared, internally used to show appropriate
- assertion message.
-
- Examples
- --------
- >>> from pandas import testing as tm
- >>> a = pd.Index([1, 2, 3])
- >>> b = pd.Index([1, 2, 3])
- >>> tm.assert_index_equal(a, b)
- """
- __tracebackhide__ = True
-
- def _check_types(left, right, obj: str = "Index") -> None:
- if not exact:
- return
-
- assert_class_equal(left, right, exact=exact, obj=obj)
- assert_attr_equal("inferred_type", left, right, obj=obj)
-
- # Skip exact dtype checking when `check_categorical` is False
- if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
- if check_categorical:
- assert_attr_equal("dtype", left, right, obj=obj)
- assert_index_equal(left.categories, right.categories, exact=exact)
- return
-
- assert_attr_equal("dtype", left, right, obj=obj)
-
- def _get_ilevel_values(index, level):
- # accept level number only
- unique = index.levels[level]
- level_codes = index.codes[level]
- filled = take_nd(unique._values, level_codes, fill_value=unique._na_value)
- return unique._shallow_copy(filled, name=index.names[level])
-
- # instance validation
- _check_isinstance(left, right, Index)
-
- # class / dtype comparison
- _check_types(left, right, obj=obj)
-
- # level comparison
- if left.nlevels != right.nlevels:
- msg1 = f"{obj} levels are different"
- msg2 = f"{left.nlevels}, {left}"
- msg3 = f"{right.nlevels}, {right}"
- raise_assert_detail(obj, msg1, msg2, msg3)
-
- # length comparison
- if len(left) != len(right):
- msg1 = f"{obj} length are different"
- msg2 = f"{len(left)}, {left}"
- msg3 = f"{len(right)}, {right}"
- raise_assert_detail(obj, msg1, msg2, msg3)
-
- # If order doesn't matter then sort the index entries
- if not check_order:
- left = safe_sort_index(left)
- right = safe_sort_index(right)
-
-    # MultiIndex special comparison for more helpful error messages
- if left.nlevels > 1:
- left = cast(MultiIndex, left)
- right = cast(MultiIndex, right)
-
- for level in range(left.nlevels):
- # cannot use get_level_values here because it can change dtype
- llevel = _get_ilevel_values(left, level)
- rlevel = _get_ilevel_values(right, level)
-
- lobj = f"MultiIndex level [{level}]"
- assert_index_equal(
- llevel,
- rlevel,
- exact=exact,
- check_names=check_names,
- check_exact=check_exact,
- rtol=rtol,
- atol=atol,
- obj=lobj,
- )
- # get_level_values may change dtype
- _check_types(left.levels[level], right.levels[level], obj=obj)
-
- # skip exact index checking when `check_categorical` is False
- if check_exact and check_categorical:
- if not left.equals(right):
- mismatch = left._values != right._values
-
- if is_extension_array_dtype(mismatch):
- mismatch = cast("ExtensionArray", mismatch).fillna(True)
-
- diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
- msg = f"{obj} values are different ({np.round(diff, 5)} %)"
- raise_assert_detail(obj, msg, left, right)
- else:
- # if we have "equiv", this becomes True
- exact_bool = bool(exact)
- _testing.assert_almost_equal(
- left.values,
- right.values,
- rtol=rtol,
- atol=atol,
- check_dtype=exact_bool,
- obj=obj,
- lobj=left,
- robj=right,
- )
-
- # metadata comparison
- if check_names:
- assert_attr_equal("names", left, right, obj=obj)
- if isinstance(left, PeriodIndex) or isinstance(right, PeriodIndex):
- assert_attr_equal("freq", left, right, obj=obj)
- if isinstance(left, IntervalIndex) or isinstance(right, IntervalIndex):
- assert_interval_array_equal(left._values, right._values)
-
- if check_categorical:
- if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
- assert_categorical_equal(left._values, right._values, obj=f"{obj} category")
-
-
-def assert_class_equal(
- left, right, exact: bool | str = True, obj: str = "Input"
-) -> None:
- """
- Checks classes are equal.
- """
- __tracebackhide__ = True
-
- def repr_class(x):
- if isinstance(x, Index):
- # return Index as it is to include values in the error message
- return x
-
- return type(x).__name__
-
- def is_class_equiv(idx: Index) -> bool:
- """Classes that are a RangeIndex (sub-)instance or exactly an `Index` .
-
- This only checks class equivalence. There is a separate check that the
- dtype is int64.
- """
- return type(idx) is Index or isinstance(idx, RangeIndex)
-
- if type(left) == type(right):
- return
-
- if exact == "equiv":
- if is_class_equiv(left) and is_class_equiv(right):
- return
-
- msg = f"{obj} classes are different"
- raise_assert_detail(obj, msg, repr_class(left), repr_class(right))
-
-
-def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None:
- """
- Check attributes are equal. Both objects must have the attribute.
-
- Parameters
- ----------
- attr : str
- Attribute name being compared.
- left : object
- right : object
- obj : str, default 'Attributes'
- Specify object name being compared, internally used to show appropriate
- assertion message
- """
- __tracebackhide__ = True
-
- left_attr = getattr(left, attr)
- right_attr = getattr(right, attr)
-
- if left_attr is right_attr or is_matching_na(left_attr, right_attr):
- # e.g. both np.nan, both NaT, both pd.NA, ...
- return None
-
- try:
- result = left_attr == right_attr
- except TypeError:
- # datetimetz on rhs may raise TypeError
- result = False
- if (left_attr is pd.NA) ^ (right_attr is pd.NA):
- result = False
- elif not isinstance(result, bool):
- result = result.all()
-
- if not result:
- msg = f'Attribute "{attr}" are different'
- raise_assert_detail(obj, msg, left_attr, right_attr)
- return None
-
-
-def assert_is_valid_plot_return_object(objs) -> None:
- import matplotlib.pyplot as plt
-
- if isinstance(objs, (Series, np.ndarray)):
- for el in objs.ravel():
- msg = (
- "one of 'objs' is not a matplotlib Axes instance, "
- f"type encountered {repr(type(el).__name__)}"
- )
- assert isinstance(el, (plt.Axes, dict)), msg
- else:
- msg = (
- "objs is neither an ndarray of Artist instances nor a single "
- "ArtistArtist instance, tuple, or dict, 'objs' is a "
- f"{repr(type(objs).__name__)}"
- )
- assert isinstance(objs, (plt.Artist, tuple, dict)), msg
-
-
-def assert_is_sorted(seq) -> None:
- """Assert that the sequence is sorted."""
- if isinstance(seq, (Index, Series)):
- seq = seq.values
- # sorting does not change precisions
- assert_numpy_array_equal(seq, np.sort(np.array(seq)))
-
-
-def assert_categorical_equal(
- left,
- right,
- check_dtype: bool = True,
- check_category_order: bool = True,
- obj: str = "Categorical",
-) -> None:
- """
- Test that Categoricals are equivalent.
-
- Parameters
- ----------
- left : Categorical
- right : Categorical
- check_dtype : bool, default True
- Check that the integer dtype of the codes is the same.
- check_category_order : bool, default True
- Whether the order of the categories should be compared, which
- implies identical integer codes. If False, only the resulting
- values are compared. The ordered attribute is
- checked regardless.
- obj : str, default 'Categorical'
- Specify object name being compared, internally used to show appropriate
- assertion message.
- """
- _check_isinstance(left, right, Categorical)
-
- exact: bool | str
- if isinstance(left.categories, RangeIndex) or isinstance(
- right.categories, RangeIndex
- ):
- exact = "equiv"
- else:
- # We still want to require exact matches for Index
- exact = True
-
- if check_category_order:
- assert_index_equal(
- left.categories, right.categories, obj=f"{obj}.categories", exact=exact
- )
- assert_numpy_array_equal(
- left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes"
- )
- else:
- try:
- lc = left.categories.sort_values()
- rc = right.categories.sort_values()
- except TypeError:
- # e.g. '<' not supported between instances of 'int' and 'str'
- lc, rc = left.categories, right.categories
- assert_index_equal(lc, rc, obj=f"{obj}.categories", exact=exact)
- assert_index_equal(
- left.categories.take(left.codes),
- right.categories.take(right.codes),
- obj=f"{obj}.values",
- exact=exact,
- )
-
- assert_attr_equal("ordered", left, right, obj=obj)
-
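For illustration, a sketch of assert_categorical_equal with check_category_order=False, as described in the docstring above; pandas._testing is an internal module, so the import is not a stable API.

import pandas as pd
import pandas._testing as tm  # internal module, not a stable API

left = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
right = pd.Categorical(["a", "b", "a"], categories=["c", "a", "b"])

# Same values but different category order: this passes only because
# check_category_order=False compares sorted categories and the taken values.
tm.assert_categorical_equal(left, right, check_category_order=False)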
-
-def assert_interval_array_equal(
- left, right, exact: bool | Literal["equiv"] = "equiv", obj: str = "IntervalArray"
-) -> None:
- """
- Test that two IntervalArrays are equivalent.
-
- Parameters
- ----------
- left, right : IntervalArray
- The IntervalArrays to compare.
- exact : bool or {'equiv'}, default 'equiv'
- Whether to check the Index class, dtype and inferred_type
- are identical. If 'equiv', then RangeIndex can be substituted for
- Index with an int64 dtype as well.
- obj : str, default 'IntervalArray'
- Specify object name being compared, internally used to show appropriate
- assertion message
- """
- _check_isinstance(left, right, IntervalArray)
-
- kwargs = {}
- if left._left.dtype.kind in ["m", "M"]:
- # We have a DatetimeArray or TimedeltaArray
- kwargs["check_freq"] = False
-
- assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs)
- assert_equal(left._right, right._right, obj=f"{obj}.right", **kwargs)
-
- assert_attr_equal("closed", left, right, obj=obj)
-
-
-def assert_period_array_equal(left, right, obj: str = "PeriodArray") -> None:
- _check_isinstance(left, right, PeriodArray)
-
- assert_numpy_array_equal(left._ndarray, right._ndarray, obj=f"{obj}._ndarray")
- assert_attr_equal("freq", left, right, obj=obj)
-
-
-def assert_datetime_array_equal(
- left, right, obj: str = "DatetimeArray", check_freq: bool = True
-) -> None:
- __tracebackhide__ = True
- _check_isinstance(left, right, DatetimeArray)
-
- assert_numpy_array_equal(left._ndarray, right._ndarray, obj=f"{obj}._ndarray")
- if check_freq:
- assert_attr_equal("freq", left, right, obj=obj)
- assert_attr_equal("tz", left, right, obj=obj)
-
-
-def assert_timedelta_array_equal(
- left, right, obj: str = "TimedeltaArray", check_freq: bool = True
-) -> None:
- __tracebackhide__ = True
- _check_isinstance(left, right, TimedeltaArray)
- assert_numpy_array_equal(left._ndarray, right._ndarray, obj=f"{obj}._ndarray")
- if check_freq:
- assert_attr_equal("freq", left, right, obj=obj)
-
-
-def raise_assert_detail(
- obj, message, left, right, diff=None, first_diff=None, index_values=None
-):
- __tracebackhide__ = True
-
- msg = f"""{obj} are different
-
-{message}"""
-
- if isinstance(index_values, np.ndarray):
- msg += f"\n[index]: {pprint_thing(index_values)}"
-
- if isinstance(left, np.ndarray):
- left = pprint_thing(left)
- elif isinstance(left, (CategoricalDtype, PandasDtype, StringDtype)):
- left = repr(left)
-
- if isinstance(right, np.ndarray):
- right = pprint_thing(right)
- elif isinstance(right, (CategoricalDtype, PandasDtype, StringDtype)):
- right = repr(right)
-
- msg += f"""
-[left]: {left}
-[right]: {right}"""
-
- if diff is not None:
- msg += f"\n[diff]: {diff}"
-
- if first_diff is not None:
- msg += f"\n{first_diff}"
-
- raise AssertionError(msg)
-
-
-def assert_numpy_array_equal(
- left,
- right,
- strict_nan: bool = False,
- check_dtype: bool | Literal["equiv"] = True,
- err_msg=None,
- check_same=None,
- obj: str = "numpy array",
- index_values=None,
-) -> None:
- """
- Check that two 'np.ndarray' objects are equivalent.
-
- Parameters
- ----------
- left, right : numpy.ndarray or iterable
- The two arrays to be compared.
- strict_nan : bool, default False
- If True, consider NaN and None to be different.
- check_dtype : bool, default True
- Check dtype if both left and right are np.ndarray.
- err_msg : str, default None
- If provided, used as assertion message.
- check_same : None|'copy'|'same', default None
- Ensure left and right refer/do not refer to the same memory area.
- obj : str, default 'numpy array'
- Specify object name being compared, internally used to show appropriate
- assertion message.
- index_values : numpy.ndarray, default None
- optional index (shared by both left and right), used in output.
- """
- __tracebackhide__ = True
-
- # instance validation
- # Show a detailed error message when classes are different
- assert_class_equal(left, right, obj=obj)
- # both classes must be an np.ndarray
- _check_isinstance(left, right, np.ndarray)
-
- def _get_base(obj):
- return obj.base if getattr(obj, "base", None) is not None else obj
-
- left_base = _get_base(left)
- right_base = _get_base(right)
-
- if check_same == "same":
- if left_base is not right_base:
- raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}")
- elif check_same == "copy":
- if left_base is right_base:
- raise AssertionError(f"{repr(left_base)} is {repr(right_base)}")
-
- def _raise(left, right, err_msg):
- if err_msg is None:
- if left.shape != right.shape:
- raise_assert_detail(
- obj, f"{obj} shapes are different", left.shape, right.shape
- )
-
- diff = 0
- for left_arr, right_arr in zip(left, right):
- # count up differences
- if not array_equivalent(left_arr, right_arr, strict_nan=strict_nan):
- diff += 1
-
- diff = diff * 100.0 / left.size
- msg = f"{obj} values are different ({np.round(diff, 5)} %)"
- raise_assert_detail(obj, msg, left, right, index_values=index_values)
-
- raise AssertionError(err_msg)
-
- # compare shape and values
- if not array_equivalent(left, right, strict_nan=strict_nan):
- _raise(left, right, err_msg)
-
- if check_dtype:
- if isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
- assert_attr_equal("dtype", left, right, obj=obj)
-
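A short sketch of the check_same option documented above, assuming numpy is installed; pandas._testing is internal, not a stable API.

import numpy as np
import pandas._testing as tm  # internal module, not a stable API

arr = np.array([1, 2, 3])
view = arr[:]       # shares its base with arr
copy = arr.copy()   # independent memory

tm.assert_numpy_array_equal(arr, view, check_same="same")  # must share memory
tm.assert_numpy_array_equal(arr, copy, check_same="copy")  # must not share memory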
-
-def assert_extension_array_equal(
- left,
- right,
- check_dtype: bool | Literal["equiv"] = True,
- index_values=None,
- check_exact: bool = False,
- rtol: float = 1.0e-5,
- atol: float = 1.0e-8,
- obj: str = "ExtensionArray",
-) -> None:
- """
- Check that left and right ExtensionArrays are equal.
-
- Parameters
- ----------
- left, right : ExtensionArray
- The two arrays to compare.
- check_dtype : bool, default True
- Whether to check if the ExtensionArray dtypes are identical.
- index_values : numpy.ndarray, default None
- Optional index (shared by both left and right), used in output.
- check_exact : bool, default False
- Whether to compare numbers exactly.
- rtol : float, default 1e-5
- Relative tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- atol : float, default 1e-8
- Absolute tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- obj : str, default 'ExtensionArray'
- Specify object name being compared, internally used to show appropriate
- assertion message.
-
- .. versionadded:: 2.0.0
-
- Notes
- -----
- Missing values are checked separately from valid values.
- A mask of missing values is computed for each and checked to match.
- The remaining all-valid values are cast to object dtype and checked.
-
- Examples
- --------
- >>> from pandas import testing as tm
- >>> a = pd.Series([1, 2, 3, 4])
- >>> b, c = a.array, a.array
- >>> tm.assert_extension_array_equal(b, c)
- """
- assert isinstance(left, ExtensionArray), "left is not an ExtensionArray"
- assert isinstance(right, ExtensionArray), "right is not an ExtensionArray"
- if check_dtype:
- assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
-
- if (
- isinstance(left, DatetimeLikeArrayMixin)
- and isinstance(right, DatetimeLikeArrayMixin)
- and type(right) == type(left)
- ):
- # GH 52449
- if not check_dtype and left.dtype.kind in "mM":
- if not isinstance(left.dtype, np.dtype):
- l_unit = cast(DatetimeTZDtype, left.dtype).unit
- else:
- l_unit = np.datetime_data(left.dtype)[0]
- if not isinstance(right.dtype, np.dtype):
- r_unit = cast(DatetimeTZDtype, right.dtype).unit
- else:
- r_unit = np.datetime_data(right.dtype)[0]
- if (
- l_unit != r_unit
- and compare_mismatched_resolutions(
- left._ndarray, right._ndarray, operator.eq
- ).all()
- ):
- return
- # Avoid slow object-dtype comparisons
- # np.asarray for case where we have a np.MaskedArray
- assert_numpy_array_equal(
- np.asarray(left.asi8),
- np.asarray(right.asi8),
- index_values=index_values,
- obj=obj,
- )
- return
-
- left_na = np.asarray(left.isna())
- right_na = np.asarray(right.isna())
- assert_numpy_array_equal(
- left_na, right_na, obj=f"{obj} NA mask", index_values=index_values
- )
-
- left_valid = left[~left_na].to_numpy(dtype=object)
- right_valid = right[~right_na].to_numpy(dtype=object)
- if check_exact:
- assert_numpy_array_equal(
- left_valid, right_valid, obj=obj, index_values=index_values
- )
- else:
- _testing.assert_almost_equal(
- left_valid,
- right_valid,
- check_dtype=bool(check_dtype),
- rtol=rtol,
- atol=atol,
- obj=obj,
- index_values=index_values,
- )
-
-
-# This could be refactored to use the NDFrame.equals method
-def assert_series_equal(
- left,
- right,
- check_dtype: bool | Literal["equiv"] = True,
- check_index_type: bool | Literal["equiv"] = "equiv",
- check_series_type: bool = True,
- check_names: bool = True,
- check_exact: bool = False,
- check_datetimelike_compat: bool = False,
- check_categorical: bool = True,
- check_category_order: bool = True,
- check_freq: bool = True,
- check_flags: bool = True,
- rtol: float = 1.0e-5,
- atol: float = 1.0e-8,
- obj: str = "Series",
- *,
- check_index: bool = True,
- check_like: bool = False,
-) -> None:
- """
- Check that left and right Series are equal.
-
- Parameters
- ----------
- left : Series
- right : Series
- check_dtype : bool, default True
- Whether to check the Series dtype is identical.
- check_index_type : bool or {'equiv'}, default 'equiv'
- Whether to check the Index class, dtype and inferred_type
- are identical.
- check_series_type : bool, default True
- Whether to check the Series class is identical.
- check_names : bool, default True
- Whether to check the Series and Index names attribute.
- check_exact : bool, default False
- Whether to compare numbers exactly.
- check_datetimelike_compat : bool, default False
- Compare datetime-like values that are comparable, ignoring dtype.
- check_categorical : bool, default True
- Whether to compare internal Categorical exactly.
- check_category_order : bool, default True
- Whether to compare category order of internal Categoricals.
-
- .. versionadded:: 1.0.2
- check_freq : bool, default True
- Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.
-
- .. versionadded:: 1.1.0
- check_flags : bool, default True
- Whether to check the `flags` attribute.
-
- .. versionadded:: 1.2.0
-
- rtol : float, default 1e-5
- Relative tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- atol : float, default 1e-8
- Absolute tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- obj : str, default 'Series'
- Specify object name being compared, internally used to show appropriate
- assertion message.
- check_index : bool, default True
- Whether to check index equivalence. If False, then compare only values.
-
- .. versionadded:: 1.3.0
- check_like : bool, default False
- If True, ignore the order of the index. Must be False if check_index is False.
- Note: the same labels must correspond to the same data.
-
- .. versionadded:: 1.5.0
-
- Examples
- --------
- >>> from pandas import testing as tm
- >>> a = pd.Series([1, 2, 3, 4])
- >>> b = pd.Series([1, 2, 3, 4])
- >>> tm.assert_series_equal(a, b)
- """
- __tracebackhide__ = True
-
- if not check_index and check_like:
- raise ValueError("check_like must be False if check_index is False")
-
- # instance validation
- _check_isinstance(left, right, Series)
-
- if check_series_type:
- assert_class_equal(left, right, obj=obj)
-
- # length comparison
- if len(left) != len(right):
- msg1 = f"{len(left)}, {left.index}"
- msg2 = f"{len(right)}, {right.index}"
- raise_assert_detail(obj, "Series length are different", msg1, msg2)
-
- if check_flags:
- assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}"
-
- if check_index:
- # GH #38183
- assert_index_equal(
- left.index,
- right.index,
- exact=check_index_type,
- check_names=check_names,
- check_exact=check_exact,
- check_categorical=check_categorical,
- check_order=not check_like,
- rtol=rtol,
- atol=atol,
- obj=f"{obj}.index",
- )
-
- if check_like:
- left = left.reindex_like(right)
-
- if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)):
- lidx = left.index
- ridx = right.index
- assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)
-
- if check_dtype:
- # We want to skip exact dtype checking when `check_categorical`
- # is False. We'll still raise if only one is a `Categorical`,
- # regardless of `check_categorical`
- if (
- isinstance(left.dtype, CategoricalDtype)
- and isinstance(right.dtype, CategoricalDtype)
- and not check_categorical
- ):
- pass
- else:
- assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
-
- if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype):
- left_values = left._values
- right_values = right._values
- # Only check exact if dtype is numeric
- if isinstance(left_values, ExtensionArray) and isinstance(
- right_values, ExtensionArray
- ):
- assert_extension_array_equal(
- left_values,
- right_values,
- check_dtype=check_dtype,
- index_values=np.asarray(left.index),
- obj=str(obj),
- )
- else:
- assert_numpy_array_equal(
- left_values,
- right_values,
- check_dtype=check_dtype,
- obj=str(obj),
- index_values=np.asarray(left.index),
- )
- elif check_datetimelike_compat and (
- needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)
- ):
- # we want to check only if we have compat dtypes
- # e.g. integer and M|m are NOT compat, but we can simply check
- # the values in that case
-
- # datetimelike may have different objects (e.g. datetime.datetime
- # vs Timestamp) but will compare equal
- if not Index(left._values).equals(Index(right._values)):
- msg = (
- f"[datetimelike_compat=True] {left._values} "
- f"is not equal to {right._values}."
- )
- raise AssertionError(msg)
- elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
- assert_interval_array_equal(left.array, right.array)
- elif isinstance(left.dtype, CategoricalDtype) or isinstance(
- right.dtype, CategoricalDtype
- ):
- _testing.assert_almost_equal(
- left._values,
- right._values,
- rtol=rtol,
- atol=atol,
- check_dtype=bool(check_dtype),
- obj=str(obj),
- index_values=np.asarray(left.index),
- )
- elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype):
- assert_extension_array_equal(
- left._values,
- right._values,
- rtol=rtol,
- atol=atol,
- check_dtype=check_dtype,
- index_values=np.asarray(left.index),
- obj=str(obj),
- )
- elif is_extension_array_dtype_and_needs_i8_conversion(
- left.dtype, right.dtype
- ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype):
- assert_extension_array_equal(
- left._values,
- right._values,
- check_dtype=check_dtype,
- index_values=np.asarray(left.index),
- obj=str(obj),
- )
- elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
- # DatetimeArray or TimedeltaArray
- assert_extension_array_equal(
- left._values,
- right._values,
- check_dtype=check_dtype,
- index_values=np.asarray(left.index),
- obj=str(obj),
- )
- else:
- _testing.assert_almost_equal(
- left._values,
- right._values,
- rtol=rtol,
- atol=atol,
- check_dtype=bool(check_dtype),
- obj=str(obj),
- index_values=np.asarray(left.index),
- )
-
- # metadata comparison
- if check_names:
- assert_attr_equal("name", left, right, obj=obj)
-
- if check_categorical:
- if isinstance(left.dtype, CategoricalDtype) or isinstance(
- right.dtype, CategoricalDtype
- ):
- assert_categorical_equal(
- left._values,
- right._values,
- obj=f"{obj} category",
- check_category_order=check_category_order,
- )
-
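A minimal sketch of the check_like behaviour described in the docstring above, assuming a pandas version (>= 1.5) that has the keyword.

import pandas as pd

left = pd.Series([1, 2, 3], index=["a", "b", "c"])
right = pd.Series([3, 1, 2], index=["c", "a", "b"])

# Index order is ignored, but each label must still map to the same value.
pd.testing.assert_series_equal(left, right, check_like=True)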
-
-# This could be refactored to use the NDFrame.equals method
-def assert_frame_equal(
- left,
- right,
- check_dtype: bool | Literal["equiv"] = True,
- check_index_type: bool | Literal["equiv"] = "equiv",
- check_column_type: bool | Literal["equiv"] = "equiv",
- check_frame_type: bool = True,
- check_names: bool = True,
- by_blocks: bool = False,
- check_exact: bool = False,
- check_datetimelike_compat: bool = False,
- check_categorical: bool = True,
- check_like: bool = False,
- check_freq: bool = True,
- check_flags: bool = True,
- rtol: float = 1.0e-5,
- atol: float = 1.0e-8,
- obj: str = "DataFrame",
-) -> None:
- """
- Check that left and right DataFrame are equal.
-
- This function is intended to compare two DataFrames and output any
- differences. It is mostly intended for use in unit tests.
- Additional parameters allow varying the strictness of the
- equality checks performed.
-
- Parameters
- ----------
- left : DataFrame
- First DataFrame to compare.
- right : DataFrame
- Second DataFrame to compare.
- check_dtype : bool, default True
- Whether to check the DataFrame dtype is identical.
- check_index_type : bool or {'equiv'}, default 'equiv'
- Whether to check the Index class, dtype and inferred_type
- are identical.
- check_column_type : bool or {'equiv'}, default 'equiv'
- Whether to check the columns class, dtype and inferred_type
- are identical. Is passed as the ``exact`` argument of
- :func:`assert_index_equal`.
- check_frame_type : bool, default True
- Whether to check the DataFrame class is identical.
- check_names : bool, default True
- Whether to check that the `names` attribute for both the `index`
- and `column` attributes of the DataFrame is identical.
- by_blocks : bool, default False
- Specify how to compare internal data. If False, compare by columns.
- If True, compare by blocks.
- check_exact : bool, default False
- Whether to compare numbers exactly.
- check_datetimelike_compat : bool, default False
- Compare datetime-like values that are comparable, ignoring dtype.
- check_categorical : bool, default True
- Whether to compare internal Categorical exactly.
- check_like : bool, default False
- If True, ignore the order of index & columns.
- Note: index labels must still match their respective rows (and likewise
- for columns) - the same labels must correspond to the same data.
- check_freq : bool, default True
- Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.
-
- .. versionadded:: 1.1.0
- check_flags : bool, default True
- Whether to check the `flags` attribute.
- rtol : float, default 1e-5
- Relative tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- atol : float, default 1e-8
- Absolute tolerance. Only used when check_exact is False.
-
- .. versionadded:: 1.1.0
- obj : str, default 'DataFrame'
- Specify object name being compared, internally used to show appropriate
- assertion message.
-
- See Also
- --------
- assert_series_equal : Equivalent method for asserting Series equality.
- DataFrame.equals : Check DataFrame equality.
-
- Examples
- --------
- This example shows comparing two DataFrames that are equal
- but with columns of differing dtypes.
-
- >>> from pandas.testing import assert_frame_equal
- >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
- >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})
-
- df1 equals itself.
-
- >>> assert_frame_equal(df1, df1)
-
- df1 differs from df2 as column 'b' is of a different type.
-
- >>> assert_frame_equal(df1, df2)
- Traceback (most recent call last):
- ...
- AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different
-
- Attribute "dtype" are different
- [left]: int64
- [right]: float64
-
- Ignore differing dtypes in columns with check_dtype.
-
- >>> assert_frame_equal(df1, df2, check_dtype=False)
- """
- __tracebackhide__ = True
-
- # instance validation
- _check_isinstance(left, right, DataFrame)
-
- if check_frame_type:
- assert isinstance(left, type(right))
- # assert_class_equal(left, right, obj=obj)
-
- # shape comparison
- if left.shape != right.shape:
- raise_assert_detail(
- obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}"
- )
-
- if check_flags:
- assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}"
-
- # index comparison
- assert_index_equal(
- left.index,
- right.index,
- exact=check_index_type,
- check_names=check_names,
- check_exact=check_exact,
- check_categorical=check_categorical,
- check_order=not check_like,
- rtol=rtol,
- atol=atol,
- obj=f"{obj}.index",
- )
-
- # column comparison
- assert_index_equal(
- left.columns,
- right.columns,
- exact=check_column_type,
- check_names=check_names,
- check_exact=check_exact,
- check_categorical=check_categorical,
- check_order=not check_like,
- rtol=rtol,
- atol=atol,
- obj=f"{obj}.columns",
- )
-
- if check_like:
- left = left.reindex_like(right)
-
- # compare by blocks
- if by_blocks:
- rblocks = right._to_dict_of_blocks()
- lblocks = left._to_dict_of_blocks()
- for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))):
- assert dtype in lblocks
- assert dtype in rblocks
- assert_frame_equal(
- lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj
- )
-
- # compare by columns
- else:
- for i, col in enumerate(left.columns):
- # We have already checked that columns match, so we can do
- # fast location-based lookups
- lcol = left._ixs(i, axis=1)
- rcol = right._ixs(i, axis=1)
-
- # GH #38183
- # use check_index=False, because we do not want to run
- # assert_index_equal for each column,
- # as we already checked it for the whole dataframe before.
- assert_series_equal(
- lcol,
- rcol,
- check_dtype=check_dtype,
- check_index_type=check_index_type,
- check_exact=check_exact,
- check_names=check_names,
- check_datetimelike_compat=check_datetimelike_compat,
- check_categorical=check_categorical,
- check_freq=check_freq,
- obj=f'{obj}.iloc[:, {i}] (column name="{col}")',
- rtol=rtol,
- atol=atol,
- check_index=False,
- check_flags=False,
- )
-
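The same check_like idea for DataFrames, as a short sketch assuming pandas 2.x.

import pandas as pd
from pandas.testing import assert_frame_equal

df1 = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}, index=[0, 1])
df2 = df1.loc[[1, 0], ["b", "a"]]  # same data, rows and columns reordered

assert_frame_equal(df1, df2, check_like=True)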
-
-def assert_equal(left, right, **kwargs) -> None:
- """
- Wrapper for tm.assert_*_equal to dispatch to the appropriate test function.
-
- Parameters
- ----------
- left, right : Index, Series, DataFrame, ExtensionArray, or np.ndarray
- The two items to be compared.
- **kwargs
- All keyword arguments are passed through to the underlying assert method.
- """
- __tracebackhide__ = True
-
- if isinstance(left, Index):
- assert_index_equal(left, right, **kwargs)
- if isinstance(left, (DatetimeIndex, TimedeltaIndex)):
- assert left.freq == right.freq, (left.freq, right.freq)
- elif isinstance(left, Series):
- assert_series_equal(left, right, **kwargs)
- elif isinstance(left, DataFrame):
- assert_frame_equal(left, right, **kwargs)
- elif isinstance(left, IntervalArray):
- assert_interval_array_equal(left, right, **kwargs)
- elif isinstance(left, PeriodArray):
- assert_period_array_equal(left, right, **kwargs)
- elif isinstance(left, DatetimeArray):
- assert_datetime_array_equal(left, right, **kwargs)
- elif isinstance(left, TimedeltaArray):
- assert_timedelta_array_equal(left, right, **kwargs)
- elif isinstance(left, ExtensionArray):
- assert_extension_array_equal(left, right, **kwargs)
- elif isinstance(left, np.ndarray):
- assert_numpy_array_equal(left, right, **kwargs)
- elif isinstance(left, str):
- assert kwargs == {}
- assert left == right
- else:
- assert kwargs == {}
- assert_almost_equal(left, right)
-
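A sketch of the dispatcher above; pandas._testing is internal and not a stable API.

import numpy as np
import pandas as pd
import pandas._testing as tm  # internal module, not a stable API

# One entry point that dispatches to the type-specific asserters above.
tm.assert_equal(pd.Index([1, 2]), pd.Index([1, 2]))
tm.assert_equal(pd.Series([1.0, 2.0]), pd.Series([1.0, 2.0]))
tm.assert_equal(np.array([1, 2]), np.array([1, 2]))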
-
-def assert_sp_array_equal(left, right) -> None:
- """
- Check that the left and right SparseArray are equal.
-
- Parameters
- ----------
- left : SparseArray
- right : SparseArray
- """
- _check_isinstance(left, right, pd.arrays.SparseArray)
-
- assert_numpy_array_equal(left.sp_values, right.sp_values)
-
- # SparseIndex comparison
- assert isinstance(left.sp_index, SparseIndex)
- assert isinstance(right.sp_index, SparseIndex)
-
- left_index = left.sp_index
- right_index = right.sp_index
-
- if not left_index.equals(right_index):
- raise_assert_detail(
- "SparseArray.index", "index are not equal", left_index, right_index
- )
- else:
- # indexes compared equal above; nothing further to check
- pass
-
- assert_attr_equal("fill_value", left, right)
- assert_attr_equal("dtype", left, right)
- assert_numpy_array_equal(left.to_dense(), right.to_dense())
-
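A minimal sketch of assert_sp_array_equal, again via the internal pandas._testing module (not a stable API).

import pandas as pd
import pandas._testing as tm  # internal module, not a stable API

left = pd.arrays.SparseArray([0, 0, 1, 2], fill_value=0)
right = pd.arrays.SparseArray([0, 0, 1, 2], fill_value=0)
tm.assert_sp_array_equal(left, right)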
-
-def assert_contains_all(iterable, dic) -> None:
- for k in iterable:
- assert k in dic, f"Did not contain item: {repr(k)}"
-
-
-def assert_copy(iter1, iter2, **eql_kwargs) -> None:
- """
- iter1, iter2: iterables that produce elements
- comparable with assert_almost_equal
-
- Checks that the elements are equal, but not
- the same object. (Does not check that items
- in sequences are also not the same object)
- """
- for elem1, elem2 in zip(iter1, iter2):
- assert_almost_equal(elem1, elem2, **eql_kwargs)
- msg = (
- f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be "
- "different objects, but they were the same object."
- )
- assert elem1 is not elem2, msg
-
-
-def is_extension_array_dtype_and_needs_i8_conversion(left_dtype, right_dtype) -> bool:
- """
- Checks that we have the combination of an ExtensionArray dtype and
- a dtype that should be converted to int64
-
- Returns
- -------
- bool
-
- Related to issue #37609
- """
- return is_extension_array_dtype(left_dtype) and needs_i8_conversion(right_dtype)
-
-
-def assert_indexing_slices_equivalent(ser: Series, l_slc: slice, i_slc: slice) -> None:
- """
- Check that ser.iloc[i_slc] matches ser.loc[l_slc] and, if applicable,
- ser[l_slc].
- """
- expected = ser.iloc[i_slc]
-
- assert_series_equal(ser.loc[l_slc], expected)
-
- if not is_integer_dtype(ser.index):
- # For integer indices, ser[l_slc] would be positional rather than label-based.
- assert_series_equal(ser[l_slc], expected)
-
-
-def assert_metadata_equivalent(
- left: DataFrame | Series, right: DataFrame | Series | None = None
-) -> None:
- """
- Check that ._metadata attributes are equivalent.
- """
- for attr in left._metadata:
- val = getattr(left, attr, None)
- if right is None:
- assert val is None
- else:
- assert val == getattr(right, attr, None)
diff --git a/contrib/python/pandas/py3/pandas/_testing/compat.py b/contrib/python/pandas/py3/pandas/_testing/compat.py
deleted file mode 100644
index bb3bb99a4c6..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/compat.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""
-Helpers for sharing tests between DataFrame/Series
-"""
-from pandas._typing import DtypeObj
-
-from pandas import DataFrame
-
-
-def get_dtype(obj) -> DtypeObj:
- if isinstance(obj, DataFrame):
- # Note: we are assuming only one column
- return obj.dtypes.iat[0]
- else:
- return obj.dtype
-
-
-def get_obj(df: DataFrame, klass):
- """
- For sharing tests using frame_or_series, either return the DataFrame
- unchanged or return its first column as a Series.
- """
- if klass is DataFrame:
- return df
- return df._ixs(0, axis=1)
diff --git a/contrib/python/pandas/py3/pandas/_testing/contexts.py b/contrib/python/pandas/py3/pandas/_testing/contexts.py
deleted file mode 100644
index 4479cfc06f8..00000000000
--- a/contrib/python/pandas/py3/pandas/_testing/contexts.py
+++ /dev/null
@@ -1,219 +0,0 @@
-from __future__ import annotations
-
-from contextlib import contextmanager
-import os
-from pathlib import Path
-import tempfile
-from typing import (
- IO,
- Any,
- Generator,
-)
-import uuid
-
-from pandas._typing import (
- BaseBuffer,
- CompressionOptions,
- FilePath,
-)
-from pandas.compat import PYPY
-from pandas.errors import ChainedAssignmentError
-
-from pandas import set_option
-
-from pandas.io.common import get_handle
-
-
-@contextmanager
-def decompress_file(
- path: FilePath | BaseBuffer, compression: CompressionOptions
-) -> Generator[IO[bytes], None, None]:
- """
- Open a compressed file and return a file object.
-
- Parameters
- ----------
- path : str
- The path where the file is read from.
-
- compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd', None}
- Name of the compression protocol used to decompress the file.
-
- Returns
- -------
- file object
- """
- with get_handle(path, "rb", compression=compression, is_text=False) as handle:
- yield handle.handle
-
-
-@contextmanager
-def set_timezone(tz: str) -> Generator[None, None, None]:
- """
- Context manager for temporarily setting a timezone.
-
- Parameters
- ----------
- tz : str
- A string representing a valid timezone.
-
- Examples
- --------
- >>> from datetime import datetime
- >>> from dateutil.tz import tzlocal
- >>> tzlocal().tzname(datetime(2021, 1, 1)) # doctest: +SKIP
- 'IST'
-
- >>> with set_timezone('US/Eastern'):
- ... tzlocal().tzname(datetime(2021, 1, 1))
- ...
- 'EST'
- """
- import time
-
- def setTZ(tz) -> None:
- if tz is None:
- try:
- del os.environ["TZ"]
- except KeyError:
- pass
- else:
- os.environ["TZ"] = tz
- time.tzset()
-
- orig_tz = os.environ.get("TZ")
- setTZ(tz)
- try:
- yield
- finally:
- setTZ(orig_tz)
-
-
-@contextmanager
-def ensure_clean(
- filename=None, return_filelike: bool = False, **kwargs: Any
-) -> Generator[Any, None, None]:
- """
- Get a temporary file path and remove the file when the context exits.
-
- This implementation does not use tempfile.mkstemp to avoid having a file handle.
- If the code using the returned path wants to delete the file itself, Windows
- requires that no program has a file handle to it.
-
- Parameters
- ----------
- filename : str (optional)
- suffix of the created file.
- return_filelike : bool (default False)
- if True, returns a file-like which is *always* cleaned. Necessary for
- savefig and other functions which want to append extensions.
- **kwargs
- Additional keywords are passed to open().
-
- """
- folder = Path(tempfile.gettempdir())
-
- if filename is None:
- filename = ""
- filename = str(uuid.uuid4()) + filename
- path = folder / filename
-
- path.touch()
-
- handle_or_str: str | IO = str(path)
- if return_filelike:
- kwargs.setdefault("mode", "w+b")
- handle_or_str = open(path, **kwargs)
-
- try:
- yield handle_or_str
- finally:
- if not isinstance(handle_or_str, str):
- handle_or_str.close()
- if path.is_file():
- path.unlink()
-
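A sketch of how ensure_clean is typically used; pandas._testing is internal, and the file name below is arbitrary.

import pandas as pd
import pandas._testing as tm  # internal module, not a stable API

df = pd.DataFrame({"a": [1, 2, 3]})
with tm.ensure_clean("roundtrip.csv") as path:
    df.to_csv(path, index=False)
    result = pd.read_csv(path)
# The temporary file is removed when the block exits.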
-
-@contextmanager
-def ensure_safe_environment_variables() -> Generator[None, None, None]:
- """
- Get a context manager to safely set environment variables
-
- All changes will be undone on close, hence environment variables set
- within this contextmanager will neither persist nor change global state.
- """
- saved_environ = dict(os.environ)
- try:
- yield
- finally:
- os.environ.clear()
- os.environ.update(saved_environ)
-
-
-@contextmanager
-def with_csv_dialect(name, **kwargs) -> Generator[None, None, None]:
- """
- Context manager to temporarily register a CSV dialect for parsing CSV.
-
- Parameters
- ----------
- name : str
- The name of the dialect.
- kwargs : mapping
- The parameters for the dialect.
-
- Raises
- ------
- ValueError : the name of the dialect conflicts with a builtin one.
-
- See Also
- --------
- csv : Python's CSV library.
- """
- import csv
-
- _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"}
-
- if name in _BUILTIN_DIALECTS:
- raise ValueError("Cannot override builtin dialect.")
-
- csv.register_dialect(name, **kwargs)
- try:
- yield
- finally:
- csv.unregister_dialect(name)
-
-
-@contextmanager
-def use_numexpr(use, min_elements=None) -> Generator[None, None, None]:
- from pandas.core.computation import expressions as expr
-
- if min_elements is None:
- min_elements = expr._MIN_ELEMENTS
-
- olduse = expr.USE_NUMEXPR
- oldmin = expr._MIN_ELEMENTS
- set_option("compute.use_numexpr", use)
- expr._MIN_ELEMENTS = min_elements
- try:
- yield
- finally:
- expr._MIN_ELEMENTS = oldmin
- set_option("compute.use_numexpr", olduse)
-
-
-def raises_chained_assignment_error():
- if PYPY:
- from contextlib import nullcontext
-
- return nullcontext()
- else:
- from pandas._testing import assert_produces_warning
-
- return assert_produces_warning(
- ChainedAssignmentError,
- match=(
- "A value is trying to be set on a copy of a DataFrame or Series "
- "through chained assignment"
- ),
- )
diff --git a/contrib/python/pandas/py3/pandas/_typing.py b/contrib/python/pandas/py3/pandas/_typing.py
deleted file mode 100644
index 9d8caf8744d..00000000000
--- a/contrib/python/pandas/py3/pandas/_typing.py
+++ /dev/null
@@ -1,373 +0,0 @@
-from __future__ import annotations
-
-from datetime import (
- datetime,
- timedelta,
- tzinfo,
-)
-from os import PathLike
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Dict,
- Hashable,
- Iterator,
- List,
- Literal,
- Mapping,
- Optional,
- Protocol,
- Sequence,
- Tuple,
- Type as type_t,
- TypeVar,
- Union,
-)
-
-import numpy as np
-
-# To prevent import cycles place any internal imports in the branch below
-# and use a string literal forward reference to it in subsequent types
-# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
-if TYPE_CHECKING:
- import numpy.typing as npt
-
- from pandas._libs import (
- NaTType,
- Period,
- Timedelta,
- Timestamp,
- )
- from pandas._libs.tslibs import BaseOffset
-
- from pandas.core.dtypes.dtypes import ExtensionDtype
-
- from pandas import Interval
- from pandas.arrays import (
- DatetimeArray,
- TimedeltaArray,
- )
- from pandas.core.arrays.base import ExtensionArray
- from pandas.core.frame import DataFrame
- from pandas.core.generic import NDFrame
- from pandas.core.groupby.generic import (
- DataFrameGroupBy,
- GroupBy,
- SeriesGroupBy,
- )
- from pandas.core.indexes.base import Index
- from pandas.core.internals import (
- ArrayManager,
- BlockManager,
- SingleArrayManager,
- SingleBlockManager,
- )
- from pandas.core.resample import Resampler
- from pandas.core.series import Series
- from pandas.core.window.rolling import BaseWindow
-
- from pandas.io.formats.format import EngFormatter
-
- ScalarLike_co = Union[
- int,
- float,
- complex,
- str,
- bytes,
- np.generic,
- ]
-
- # numpy compatible types
- NumpyValueArrayLike = Union[ScalarLike_co, npt.ArrayLike]
- # Name "npt._ArrayLikeInt_co" is not defined [name-defined]
- NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined]
-
-else:
- npt: Any = None
-
-HashableT = TypeVar("HashableT", bound=Hashable)
-
-# array-like
-
-ArrayLike = Union["ExtensionArray", np.ndarray]
-AnyArrayLike = Union[ArrayLike, "Index", "Series"]
-TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"]
-
-# scalars
-
-PythonScalar = Union[str, float, bool]
-DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"]
-PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"]
-Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, datetime]
-IntStrT = TypeVar("IntStrT", int, str)
-
-
-# timestamp and timedelta convertible types
-
-TimestampConvertibleTypes = Union[
- "Timestamp", datetime, np.datetime64, np.int64, float, str
-]
-TimedeltaConvertibleTypes = Union[
- "Timedelta", timedelta, np.timedelta64, np.int64, float, str
-]
-Timezone = Union[str, tzinfo]
-
-# NDFrameT is stricter and ensures that the same subclass of NDFrame always is
-# used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a
-# Series is passed into a function, a Series is always returned and if a DataFrame is
-# passed in, a DataFrame is always returned.
-NDFrameT = TypeVar("NDFrameT", bound="NDFrame")
-
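An illustrative use of the NDFrameT TypeVar described in the comment above; pandas._typing is internal, and first_two is a made-up helper name.

import pandas as pd
from pandas._typing import NDFrameT  # internal module, not a stable API

def first_two(obj: NDFrameT) -> NDFrameT:
    # A Series in gives a Series out; a DataFrame in gives a DataFrame out.
    return obj.head(2)

s: pd.Series = first_two(pd.Series([1, 2, 3]))
df: pd.DataFrame = first_two(pd.DataFrame({"a": [1, 2, 3]}))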
-NumpyIndexT = TypeVar("NumpyIndexT", np.ndarray, "Index")
-
-AxisInt = int
-Axis = Union[AxisInt, Literal["index", "columns", "rows"]]
-IndexLabel = Union[Hashable, Sequence[Hashable]]
-Level = Hashable
-Shape = Tuple[int, ...]
-Suffixes = Tuple[Optional[str], Optional[str]]
-Ordered = Optional[bool]
-JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
-Frequency = Union[str, "BaseOffset"]
-Axes = Union[AnyArrayLike, List, range]
-
-RandomState = Union[
- int,
- ArrayLike,
- np.random.Generator,
- np.random.BitGenerator,
- np.random.RandomState,
-]
-
-# dtypes
-NpDtype = Union[str, np.dtype, type_t[Union[str, complex, bool, object]]]
-Dtype = Union["ExtensionDtype", NpDtype]
-AstypeArg = Union["ExtensionDtype", "npt.DTypeLike"]
- # DtypeArg specifies all allowable dtypes in a function's dtype argument
-DtypeArg = Union[Dtype, Dict[Hashable, Dtype]]
-DtypeObj = Union[np.dtype, "ExtensionDtype"]
-
-# converters
-ConvertersArg = Dict[Hashable, Callable[[Dtype], Dtype]]
-
-# parse_dates
-ParseDatesArg = Union[
- bool, List[Hashable], List[List[Hashable]], Dict[Hashable, List[Hashable]]
-]
-
-# For functions like rename that convert one label to another
-Renamer = Union[Mapping[Any, Hashable], Callable[[Any], Hashable]]
-
-# to maintain type information across generic functions and parametrization
-T = TypeVar("T")
-
-# used in decorators to preserve the signature of the function it decorates
-# see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
-FuncType = Callable[..., Any]
-F = TypeVar("F", bound=FuncType)
-
-# types of vectorized key functions for DataFrame::sort_values and
-# DataFrame::sort_index, among others
-ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]]
-IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]]
-
-# types of `func` kwarg for DataFrame.aggregate and Series.aggregate
-AggFuncTypeBase = Union[Callable, str]
-AggFuncTypeDict = Dict[Hashable, Union[AggFuncTypeBase, List[AggFuncTypeBase]]]
-AggFuncType = Union[
- AggFuncTypeBase,
- List[AggFuncTypeBase],
- AggFuncTypeDict,
-]
-AggObjType = Union[
- "Series",
- "DataFrame",
- "GroupBy",
- "SeriesGroupBy",
- "DataFrameGroupBy",
- "BaseWindow",
- "Resampler",
-]
-
-PythonFuncType = Callable[[Any], Any]
-
-# filenames and file-like-objects
-AnyStr_co = TypeVar("AnyStr_co", str, bytes, covariant=True)
-AnyStr_contra = TypeVar("AnyStr_contra", str, bytes, contravariant=True)
-
-
-class BaseBuffer(Protocol):
- @property
- def mode(self) -> str:
- # for _get_filepath_or_buffer
- ...
-
- def seek(self, __offset: int, __whence: int = ...) -> int:
- # with one argument: gzip.GzipFile, bz2.BZ2File
- # with two arguments: zip.ZipFile, read_sas
- ...
-
- def seekable(self) -> bool:
- # for bz2.BZ2File
- ...
-
- def tell(self) -> int:
- # for zip.ZipFile, read_stata, to_stata
- ...
-
-
-class ReadBuffer(BaseBuffer, Protocol[AnyStr_co]):
- def read(self, __n: int = ...) -> AnyStr_co:
- # for BytesIOWrapper, gzip.GzipFile, bz2.BZ2File
- ...
-
-
-class WriteBuffer(BaseBuffer, Protocol[AnyStr_contra]):
- def write(self, __b: AnyStr_contra) -> Any:
- # for gzip.GzipFile, bz2.BZ2File
- ...
-
- def flush(self) -> Any:
- # for gzip.GzipFile, bz2.BZ2File
- ...
-
-
-class ReadPickleBuffer(ReadBuffer[bytes], Protocol):
- def readline(self) -> bytes:
- ...
-
-
-class WriteExcelBuffer(WriteBuffer[bytes], Protocol):
- def truncate(self, size: int | None = ...) -> int:
- ...
-
-
-class ReadCsvBuffer(ReadBuffer[AnyStr_co], Protocol):
- def __iter__(self) -> Iterator[AnyStr_co]:
- # for engine=python
- ...
-
- def fileno(self) -> int:
- # for _MMapWrapper
- ...
-
- def readline(self) -> AnyStr_co:
- # for engine=python
- ...
-
- @property
- def closed(self) -> bool:
- # for engine=pyarrow
- ...
-
-
-FilePath = Union[str, "PathLike[str]"]
-
-# for arbitrary kwargs passed during reading/writing files
-StorageOptions = Optional[Dict[str, Any]]
-
-
-# compression keywords and compression
-CompressionDict = Dict[str, Any]
-CompressionOptions = Optional[
- Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], CompressionDict]
-]
-
-# types in DataFrameFormatter
-FormattersType = Union[
- List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable]
-]
-ColspaceType = Mapping[Hashable, Union[str, int]]
-FloatFormatType = Union[str, Callable, "EngFormatter"]
-ColspaceArgType = Union[
- str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]]
-]
-
-# Arguments for fillna()
-FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"]
-
-# internals
-Manager = Union[
- "ArrayManager", "SingleArrayManager", "BlockManager", "SingleBlockManager"
-]
-SingleManager = Union["SingleArrayManager", "SingleBlockManager"]
-Manager2D = Union["ArrayManager", "BlockManager"]
-
-# indexing
-# PositionalIndexer -> valid 1D positional indexer, e.g. can pass
-# to ndarray.__getitem__
-# ScalarIndexer is for a single value as the index
-# SequenceIndexer is for list like or slices (but not tuples)
- # PositionalIndexerTuple extends the PositionalIndexer for 2D arrays
-# These are used in various __getitem__ overloads
-# TODO(typing#684): add Ellipsis, see
-# https://github.com/python/typing/issues/684#issuecomment-548203158
-# https://bugs.python.org/issue41810
-# Using List[int] here rather than Sequence[int] to disallow tuples.
-ScalarIndexer = Union[int, np.integer]
-SequenceIndexer = Union[slice, List[int], np.ndarray]
-PositionalIndexer = Union[ScalarIndexer, SequenceIndexer]
-PositionalIndexerTuple = Tuple[PositionalIndexer, PositionalIndexer]
-PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple]
-if TYPE_CHECKING:
- TakeIndexer = Union[Sequence[int], Sequence[np.integer], npt.NDArray[np.integer]]
-else:
- TakeIndexer = Any
-
-# Shared by functions such as drop and astype
-IgnoreRaise = Literal["ignore", "raise"]
-
-# Windowing rank methods
-WindowingRankType = Literal["average", "min", "max"]
-
-# read_csv engines
-CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
-
-# read_json engines
-JSONEngine = Literal["ujson", "pyarrow"]
-
-# read_xml parsers
-XMLParsers = Literal["lxml", "etree"]
-
-# Interval closed type
-IntervalLeftRight = Literal["left", "right"]
-IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]]
-
-# datetime and NaTType
-DatetimeNaTType = Union[datetime, "NaTType"]
-DateTimeErrorChoices = Union[IgnoreRaise, Literal["coerce"]]
-
-# sort_index
-SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
-NaPosition = Literal["first", "last"]
-
-# quantile interpolation
-QuantileInterpolation = Literal["linear", "lower", "higher", "midpoint", "nearest"]
-
-# plotting
-PlottingOrientation = Literal["horizontal", "vertical"]
-
-# dropna
-AnyAll = Literal["any", "all"]
-
-# merge
-MergeHow = Literal["left", "right", "inner", "outer", "cross"]
-
-# join
-JoinHow = Literal["left", "right", "inner", "outer"]
-
-MatplotlibColor = Union[str, Sequence[float]]
-TimeGrouperOrigin = Union[
- "Timestamp", Literal["epoch", "start", "start_day", "end", "end_day"]
-]
-TimeAmbiguous = Union[Literal["infer", "NaT", "raise"], "npt.NDArray[np.bool_]"]
-TimeNonexistent = Union[
- Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta
-]
-DropKeep = Literal["first", "last", False]
-CorrelationMethod = Union[
- Literal["pearson", "kendall", "spearman"], Callable[[np.ndarray, np.ndarray], float]
-]
-AlignJoin = Literal["outer", "inner", "left", "right"]
-DtypeBackend = Literal["pyarrow", "numpy_nullable"]
diff --git a/contrib/python/pandas/py3/pandas/_version.py b/contrib/python/pandas/py3/pandas/_version.py
deleted file mode 100644
index 60dfa3554d4..00000000000
--- a/contrib/python/pandas/py3/pandas/_version.py
+++ /dev/null
@@ -1,21 +0,0 @@
-
-# This file was generated by 'versioneer.py' (0.28) from
-# revision-control system data, or from the parent directory name of an
-# unpacked source archive. Distribution tarballs contain a pre-generated copy
-# of this file.
-
-import json
-
-version_json = '''
-{
- "date": "2023-06-28T14:27:24-0700",
- "dirty": false,
- "error": null,
- "full-revisionid": "0f437949513225922d851e9581723d82120684a6",
- "version": "2.0.3"
-}
-''' # END VERSION_JSON
-
-
-def get_versions():
- return json.loads(version_json)
diff --git a/contrib/python/pandas/py3/pandas/api/__init__.py b/contrib/python/pandas/py3/pandas/api/__init__.py
deleted file mode 100644
index 9d4f721225d..00000000000
--- a/contrib/python/pandas/py3/pandas/api/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-""" public toolkit API """
-from pandas.api import (
- extensions,
- indexers,
- interchange,
- types,
-)
-
-__all__ = [
- "interchange",
- "extensions",
- "indexers",
- "types",
-]
diff --git a/contrib/python/pandas/py3/pandas/api/extensions/__init__.py b/contrib/python/pandas/py3/pandas/api/extensions/__init__.py
deleted file mode 100644
index ea5f1ba9268..00000000000
--- a/contrib/python/pandas/py3/pandas/api/extensions/__init__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""
-Public API for extending pandas objects.
-"""
-
-from pandas._libs.lib import no_default
-
-from pandas.core.dtypes.base import (
- ExtensionDtype,
- register_extension_dtype,
-)
-
-from pandas.core.accessor import (
- register_dataframe_accessor,
- register_index_accessor,
- register_series_accessor,
-)
-from pandas.core.algorithms import take
-from pandas.core.arrays import (
- ExtensionArray,
- ExtensionScalarOpsMixin,
-)
-
-__all__ = [
- "no_default",
- "ExtensionDtype",
- "register_extension_dtype",
- "register_dataframe_accessor",
- "register_index_accessor",
- "register_series_accessor",
- "take",
- "ExtensionArray",
- "ExtensionScalarOpsMixin",
-]
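A minimal sketch of register_series_accessor from this public extensions API, assuming pandas 2.x; the accessor name "stats" is invented for the example.

import pandas as pd
from pandas.api.extensions import register_series_accessor

@register_series_accessor("stats")
class StatsAccessor:
    def __init__(self, series: pd.Series) -> None:
        self._s = series

    def zscore(self) -> pd.Series:
        # Standardize the Series against its own mean and std.
        return (self._s - self._s.mean()) / self._s.std()

pd.Series([1.0, 2.0, 3.0]).stats.zscore()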
diff --git a/contrib/python/pandas/py3/pandas/api/indexers/__init__.py b/contrib/python/pandas/py3/pandas/api/indexers/__init__.py
deleted file mode 100644
index 78357f11dc3..00000000000
--- a/contrib/python/pandas/py3/pandas/api/indexers/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-"""
-Public API for Rolling Window Indexers.
-"""
-
-from pandas.core.indexers import check_array_indexer
-from pandas.core.indexers.objects import (
- BaseIndexer,
- FixedForwardWindowIndexer,
- VariableOffsetWindowIndexer,
-)
-
-__all__ = [
- "check_array_indexer",
- "BaseIndexer",
- "FixedForwardWindowIndexer",
- "VariableOffsetWindowIndexer",
-]
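A short sketch of FixedForwardWindowIndexer from this public indexers API, assuming pandas 2.x.

import pandas as pd
from pandas.api.indexers import FixedForwardWindowIndexer

s = pd.Series([1, 2, 3, 4, 5])
indexer = FixedForwardWindowIndexer(window_size=2)
# Forward-looking windows: each value is summed with the one after it.
s.rolling(indexer, min_periods=1).sum()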
diff --git a/contrib/python/pandas/py3/pandas/api/interchange/__init__.py b/contrib/python/pandas/py3/pandas/api/interchange/__init__.py
deleted file mode 100644
index 2f3a73bc46b..00000000000
--- a/contrib/python/pandas/py3/pandas/api/interchange/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""
-Public API for DataFrame interchange protocol.
-"""
-
-from pandas.core.interchange.dataframe_protocol import DataFrame
-from pandas.core.interchange.from_dataframe import from_dataframe
-
-__all__ = ["from_dataframe", "DataFrame"]
diff --git a/contrib/python/pandas/py3/pandas/api/types/__init__.py b/contrib/python/pandas/py3/pandas/api/types/__init__.py
deleted file mode 100644
index fb1abdd5b18..00000000000
--- a/contrib/python/pandas/py3/pandas/api/types/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""
-Public toolkit API.
-"""
-
-from pandas._libs.lib import infer_dtype
-
-from pandas.core.dtypes.api import * # noqa: F401, F403
-from pandas.core.dtypes.concat import union_categoricals
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- DatetimeTZDtype,
- IntervalDtype,
- PeriodDtype,
-)
-
-__all__ = [
- "infer_dtype",
- "union_categoricals",
- "CategoricalDtype",
- "DatetimeTZDtype",
- "IntervalDtype",
- "PeriodDtype",
-]
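A minimal sketch of the public dtype helpers exported here, assuming pandas 2.x.

import pandas as pd
from pandas.api.types import infer_dtype, union_categoricals

infer_dtype(["a", "b"])  # 'string'
union_categoricals([pd.Categorical(["a", "b"]), pd.Categorical(["b", "c"])])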
diff --git a/contrib/python/pandas/py3/pandas/arrays/__init__.py b/contrib/python/pandas/py3/pandas/arrays/__init__.py
deleted file mode 100644
index 3a8e80a6b5d..00000000000
--- a/contrib/python/pandas/py3/pandas/arrays/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
-All of pandas' ExtensionArrays.
-
-See :ref:`extending.extension-types` for more.
-"""
-from pandas.core.arrays import (
- ArrowExtensionArray,
- ArrowStringArray,
- BooleanArray,
- Categorical,
- DatetimeArray,
- FloatingArray,
- IntegerArray,
- IntervalArray,
- PandasArray,
- PeriodArray,
- SparseArray,
- StringArray,
- TimedeltaArray,
-)
-
-__all__ = [
- "ArrowExtensionArray",
- "ArrowStringArray",
- "BooleanArray",
- "Categorical",
- "DatetimeArray",
- "FloatingArray",
- "IntegerArray",
- "IntervalArray",
- "PandasArray",
- "PeriodArray",
- "SparseArray",
- "StringArray",
- "TimedeltaArray",
-]
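A short sketch constructing some of these ExtensionArrays through the public constructors, assuming pandas 2.x.

import pandas as pd

pd.array([1, 2, None], dtype="Int64")   # IntegerArray with pd.NA
pd.array(["a", "b"], dtype="string")    # StringArray
pd.arrays.SparseArray([0, 0, 1, 0])     # SparseArray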
diff --git a/contrib/python/pandas/py3/pandas/compat/__init__.py b/contrib/python/pandas/py3/pandas/compat/__init__.py
deleted file mode 100644
index 60a9b3d4fd3..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/__init__.py
+++ /dev/null
@@ -1,169 +0,0 @@
-"""
-compat
-======
-
-Cross-compatible functions for different versions of Python.
-
-Other items:
-* platform checker
-"""
-from __future__ import annotations
-
-import os
-import platform
-import sys
-
-from pandas._typing import F
-from pandas.compat._constants import (
- IS64,
- PY39,
- PY310,
- PY311,
- PYPY,
-)
-import pandas.compat.compressors
-from pandas.compat.numpy import (
- is_numpy_dev,
- np_version_under1p21,
-)
-from pandas.compat.pyarrow import (
- pa_version_under7p0,
- pa_version_under8p0,
- pa_version_under9p0,
- pa_version_under11p0,
-)
-
-
-def set_function_name(f: F, name: str, cls) -> F:
- """
- Bind the name/qualname attributes of the function.
- """
- f.__name__ = name
- f.__qualname__ = f"{cls.__name__}.{name}"
- f.__module__ = cls.__module__
- return f
-
-
-def is_platform_little_endian() -> bool:
- """
- Checking if the running platform is little endian.
-
- Returns
- -------
- bool
- True if the running platform is little endian.
- """
- return sys.byteorder == "little"
-
-
-def is_platform_windows() -> bool:
- """
- Checking if the running platform is windows.
-
- Returns
- -------
- bool
- True if the running platform is windows.
- """
- return sys.platform in ["win32", "cygwin"]
-
-
-def is_platform_linux() -> bool:
- """
- Checking if the running platform is linux.
-
- Returns
- -------
- bool
- True if the running platform is linux.
- """
- return sys.platform == "linux"
-
-
-def is_platform_mac() -> bool:
- """
- Checking if the running platform is mac.
-
- Returns
- -------
- bool
- True if the running platform is mac.
- """
- return sys.platform == "darwin"
-
-
-def is_platform_arm() -> bool:
- """
- Checking if the running platform uses ARM architecture.
-
- Returns
- -------
- bool
- True if the running platform uses ARM architecture.
- """
- return platform.machine() in ("arm64", "aarch64") or platform.machine().startswith(
- "armv"
- )
-
-
-def is_platform_power() -> bool:
- """
- Checking if the running platform uses Power architecture.
-
- Returns
- -------
- bool
- True if the running platform uses Power architecture.
- """
- return platform.machine() in ("ppc64", "ppc64le")
-
-
-def is_ci_environment() -> bool:
- """
- Checking if running in a continuous integration environment by checking
- the PANDAS_CI environment variable.
-
- Returns
- -------
- bool
- True if running in a continuous integration environment.
- """
- return os.environ.get("PANDAS_CI", "0") == "1"
-
-
-def get_lzma_file() -> type[pandas.compat.compressors.LZMAFile]:
- """
- Importing the `LZMAFile` class from the `lzma` module.
-
- Returns
- -------
- class
- The `LZMAFile` class from the `lzma` module.
-
- Raises
- ------
- RuntimeError
- If the `lzma` module was not imported correctly, or didn't exist.
- """
- if not pandas.compat.compressors.has_lzma:
- raise RuntimeError(
- "lzma module not available. "
- "A Python re-install with the proper dependencies, "
- "might be required to solve this issue."
- )
- return pandas.compat.compressors.LZMAFile
-
-
-__all__ = [
- "is_numpy_dev",
- "np_version_under1p21",
- "pa_version_under7p0",
- "pa_version_under8p0",
- "pa_version_under9p0",
- "pa_version_under11p0",
- "IS64",
- "PY39",
- "PY310",
- "PY311",
- "PYPY",
-]
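A sketch of how the platform helpers above are typically used to guard platform-specific code paths, assuming pandas 2.x.

import pandas.compat as compat

if compat.is_platform_windows() and not compat.IS64:
    print("32-bit Windows")
print(compat.PYPY, compat.is_platform_arm())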
diff --git a/contrib/python/pandas/py3/pandas/compat/_constants.py b/contrib/python/pandas/py3/pandas/compat/_constants.py
deleted file mode 100644
index 75d99f5ae51..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/_constants.py
+++ /dev/null
@@ -1,27 +0,0 @@
-"""
-_constants
-======
-
-Constants relevant for the Python implementation.
-"""
-
-from __future__ import annotations
-
-import platform
-import sys
-
-IS64 = sys.maxsize > 2**32
-
-PY39 = sys.version_info >= (3, 9)
-PY310 = sys.version_info >= (3, 10)
-PY311 = sys.version_info >= (3, 11)
-PYPY = platform.python_implementation() == "PyPy"
-
-
-__all__ = [
- "IS64",
- "PY39",
- "PY310",
- "PY311",
- "PYPY",
-]
diff --git a/contrib/python/pandas/py3/pandas/compat/_optional.py b/contrib/python/pandas/py3/pandas/compat/_optional.py
deleted file mode 100644
index 15edb4836d2..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/_optional.py
+++ /dev/null
@@ -1,173 +0,0 @@
-from __future__ import annotations
-
-import importlib
-import sys
-import types
-import warnings
-
-from pandas.util._exceptions import find_stack_level
-
-from pandas.util.version import Version
-
-# Update install.rst & setup.cfg when updating versions!
-
-VERSIONS = {
- "bs4": "4.9.3",
- "blosc": "1.21.0",
- "bottleneck": "1.3.2",
- "brotli": "0.7.0",
- "fastparquet": "0.6.3",
- "fsspec": "2021.07.0",
- "html5lib": "1.1",
- "hypothesis": "6.34.2",
- "gcsfs": "2021.07.0",
- "jinja2": "3.0.0",
- "lxml.etree": "4.6.3",
- "matplotlib": "3.6.1",
- "numba": "0.53.1",
- "numexpr": "2.7.3",
- "odfpy": "1.4.1",
- "openpyxl": "3.0.7",
- "pandas_gbq": "0.15.0",
- "psycopg2": "2.8.6", # (dt dec pq3 ext lo64)
- "pymysql": "1.0.2",
- "pyarrow": "7.0.0",
- "pyreadstat": "1.1.2",
- "pytest": "7.3.2",
- "pyxlsb": "1.0.8",
- "s3fs": "2021.08.0",
- "scipy": "1.7.1",
- "snappy": "0.6.0",
- "sqlalchemy": "1.2.0",
- "tables": "3.6.1",
- "tabulate": "0.8.9",
- "xarray": "0.21.0",
- "xlrd": "2.0.1",
- "xlsxwriter": "1.4.3",
- "zstandard": "0.15.2",
- "tzdata": "2022.1",
- "qtpy": "2.2.0",
- "pyqt5": "5.15.1",
-}
-
-# A mapping from import name to package name (on PyPI) for packages where
-# these two names are different.
-
-INSTALL_MAPPING = {
- "bs4": "beautifulsoup4",
- "bottleneck": "Bottleneck",
- "brotli": "brotlipy",
- "jinja2": "Jinja2",
- "lxml.etree": "lxml",
- "odf": "odfpy",
- "pandas_gbq": "pandas-gbq",
- "snappy": "python-snappy",
- "sqlalchemy": "SQLAlchemy",
- "tables": "pytables",
-}
-
-
-def get_version(module: types.ModuleType) -> str:
- version = getattr(module, "__version__", None)
- if version is None:
- # xlrd uses a capitalized attribute name
- version = getattr(module, "__VERSION__", None)
-
- if version is None:
- if module.__name__ == "brotli":
- # brotli doesn't contain attributes to confirm its version
- return ""
- if module.__name__ == "snappy":
- # snappy doesn't contain attributes to confirm its version
- # See https://github.com/andrix/python-snappy/pull/119
- return ""
- raise ImportError(f"Can't determine version for {module.__name__}")
- if module.__name__ == "psycopg2":
- # psycopg2 appends " (dt dec pq3 ext lo64)" to its version
- version = version.split()[0]
- return version
-
-
-def import_optional_dependency(
- name: str,
- extra: str = "",
- errors: str = "raise",
- min_version: str | None = None,
-):
- """
- Import an optional dependency.
-
- By default, if a dependency is missing an ImportError with a nice
- message will be raised. If a dependency is present, but too old,
- we raise.
-
- Parameters
- ----------
- name : str
- The module name.
- extra : str
- Additional text to include in the ImportError message.
- errors : str {'raise', 'warn', 'ignore'}
- What to do when a dependency is not found or its version is too old.
-
- * raise : Raise an ImportError
- * warn : Only applicable when a module's version is too old.
- Warns that the version is too old and returns None
- * ignore: If the module is not installed, return None; otherwise,
- return the module, even if the version is too old.
- It's expected that users validate the version locally when
- using ``errors="ignore"`` (see ``io/html.py``)
- min_version : str, default None
- Specify a minimum version that is different from the global pandas
- minimum version required.
- Returns
- -------
- maybe_module : Optional[ModuleType]
- The imported module, when found and the version is correct.
- None is returned when the package is not found and `errors`
- is False, or when the package's version is too old and `errors`
- is ``'warn'``.
- """
-
- assert errors in {"warn", "raise", "ignore"}
-
- package_name = INSTALL_MAPPING.get(name)
- install_name = package_name if package_name is not None else name
-
- msg = (
- f"Missing optional dependency '{install_name}'. {extra} "
- f"Use pip or conda to install {install_name}."
- )
- try:
- module = importlib.import_module(name)
- except ImportError:
- if errors == "raise":
- raise ImportError(msg)
- return None
-
- # Handle submodules: if we have a submodule, grab the parent module from sys.modules
- parent = name.split(".")[0]
- if parent != name:
- install_name = parent
- module_to_get = sys.modules[install_name]
- else:
- module_to_get = module
- minimum_version = min_version if min_version is not None else VERSIONS.get(parent)
- if minimum_version:
- version = get_version(module_to_get)
- if version and Version(version) < Version(minimum_version):
- msg = (
- f"Pandas requires version '{minimum_version}' or newer of '{parent}' "
- f"(version '{version}' currently installed)."
- )
- if errors == "warn":
- warnings.warn(
- msg,
- UserWarning,
- stacklevel=find_stack_level(),
- )
- return None
- elif errors == "raise":
- raise ImportError(msg)
-
- return module
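
The module deleted above is a small but reusable pattern: import a dependency lazily, look up its minimum supported version, and either raise, warn, or return None. A minimal stdlib-only sketch of the same idea (the `MIN_VERSIONS` table and `optional_import` name are illustrative, not pandas API; naive version parsing stands in for a proper `Version` comparison):

    import importlib
    import warnings

    # Illustrative minimum-version table; the deleted module keeps the real one in VERSIONS.
    MIN_VERSIONS = {"lxml": "4.6.3"}


    def optional_import(name: str, errors: str = "raise"):
        """Import `name` if present and new enough; otherwise raise, warn, or return None."""
        assert errors in {"raise", "warn", "ignore"}
        try:
            module = importlib.import_module(name)
        except ImportError:
            if errors == "raise":
                raise ImportError(f"Missing optional dependency '{name}'.")
            return None
        minimum = MIN_VERSIONS.get(name)
        version = getattr(module, "__version__", "")
        if minimum and version:
            # Naive numeric comparison; a real implementation would use packaging.version.
            too_old = tuple(int(p) for p in version.split(".") if p.isdigit()) < tuple(
                int(p) for p in minimum.split(".") if p.isdigit()
            )
            if too_old:
                msg = f"'{name}' {version} is older than the required minimum {minimum}."
                if errors == "raise":
                    raise ImportError(msg)
                if errors == "warn":
                    warnings.warn(msg, UserWarning)
                    return None
                # errors == "ignore": return the module even though it is too old
        return module
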
diff --git a/contrib/python/pandas/py3/pandas/compat/compressors.py b/contrib/python/pandas/py3/pandas/compat/compressors.py
deleted file mode 100644
index a4f39c4e34b..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/compressors.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""
-Patched ``BZ2File`` and ``LZMAFile`` to handle pickle protocol 5.
-"""
-
-from __future__ import annotations
-
-import bz2
-from pickle import PickleBuffer
-
-from pandas.compat._constants import PY310
-
-try:
- import lzma
-
- has_lzma = True
-except ImportError:
- has_lzma = False
-
-
-def flatten_buffer(
- b: bytes | bytearray | memoryview | PickleBuffer,
-) -> bytes | bytearray | memoryview:
- """
- Return some 1-D `uint8` typed buffer.
-
- Coerces anything that does not match that description to one that does
- without copying if possible (otherwise will copy).
- """
-
- if isinstance(b, (bytes, bytearray)):
- return b
-
- if not isinstance(b, PickleBuffer):
- b = PickleBuffer(b)
-
- try:
- # coerce to 1-D `uint8` C-contiguous `memoryview` zero-copy
- return b.raw()
- except BufferError:
- # perform in-memory copy if buffer is not contiguous
- return memoryview(b).tobytes("A")
-
-
-class BZ2File(bz2.BZ2File):
- if not PY310:
-
- def write(self, b) -> int:
- # Workaround issue where `bz2.BZ2File` expects `len`
- # to return the number of bytes in `b` by converting
- # `b` into something that meets that constraint with
- # minimal copying.
- #
- # Note: This is fixed in Python 3.10.
- return super().write(flatten_buffer(b))
-
-
-if has_lzma:
-
- class LZMAFile(lzma.LZMAFile):
- if not PY310:
-
- def write(self, b) -> int:
- # Workaround issue where `lzma.LZMAFile` expects `len`
- # to return the number of bytes in `b` by converting
- # `b` into something that meets that constraint with
- # minimal copying.
- #
- # Note: This is fixed in Python 3.10.
- return super().write(flatten_buffer(b))
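
The `write` override above exists because, as its comments note, the pre-3.10 `bz2`/`lzma` file objects rely on `len()` returning the byte count of the buffer, which is not true for the multi-dimensional memoryviews that pickle protocol 5 can hand out. A small illustrative demonstration of the flattening it performs:

    import numpy as np
    from pickle import PickleBuffer

    arr = np.arange(6, dtype=np.uint8).reshape(2, 3)  # 2-D, C-contiguous buffer
    print(len(memoryview(arr)))                       # 2  -- the first dimension, not the byte count
    flat = PickleBuffer(arr).raw()                    # zero-copy 1-D uint8 view
    print(flat.ndim, flat.nbytes, len(flat))          # 1 6 6 -- now len() matches the bytes written
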
diff --git a/contrib/python/pandas/py3/pandas/compat/numpy/__init__.py b/contrib/python/pandas/py3/pandas/compat/numpy/__init__.py
deleted file mode 100644
index 0d86a63f6db..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/numpy/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-""" support numpy compatibility across versions """
-import numpy as np
-
-from pandas.util.version import Version
-
-# numpy versioning
-_np_version = np.__version__
-_nlv = Version(_np_version)
-np_version_under1p21 = _nlv < Version("1.21")
-np_version_under1p22 = _nlv < Version("1.22")
-np_version_gte1p22 = _nlv >= Version("1.22")
-np_version_gte1p24 = _nlv >= Version("1.24")
-np_version_gte1p24p3 = _nlv >= Version("1.24.3")
-np_version_gte1p25 = _nlv >= Version("1.25")
-is_numpy_dev = _nlv.dev is not None
-_min_numpy_ver = "1.20.3"
-
-if is_numpy_dev or not np_version_under1p22:
- np_percentile_argname = "method"
-else:
- np_percentile_argname = "interpolation"
-
-
-if _nlv < Version(_min_numpy_ver):
- raise ImportError(
- f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n"
- f"your numpy version is {_np_version}.\n"
- f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version"
- )
-
-
-__all__ = [
- "np",
- "_np_version",
- "is_numpy_dev",
-]
diff --git a/contrib/python/pandas/py3/pandas/compat/numpy/function.py b/contrib/python/pandas/py3/pandas/compat/numpy/function.py
deleted file mode 100644
index 2c71990d74c..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/numpy/function.py
+++ /dev/null
@@ -1,391 +0,0 @@
-"""
-For compatibility with numpy libraries, pandas functions or methods have to
-accept '*args' and '**kwargs' parameters to accommodate numpy arguments that
-are not actually used or respected in the pandas implementation.
-
-To ensure that users do not abuse these parameters, validation is performed in
-'validators.py' to make sure that any extra parameters passed correspond ONLY
-to those in the numpy signature. Part of that validation includes whether or
-not the user attempted to pass in non-default values for these extraneous
-parameters. As we want to discourage users from relying on these parameters
-when calling the pandas implementation, we want them only to pass in the
-default values for these parameters.
-
-This module provides a set of commonly used default arguments for functions and
-methods that are spread throughout the codebase. This module will make it
-easier to adjust to future upstream changes in the analogous numpy signatures.
-"""
-from __future__ import annotations
-
-from typing import (
- Any,
- TypeVar,
- cast,
- overload,
-)
-
-from numpy import ndarray
-
-from pandas._libs.lib import (
- is_bool,
- is_integer,
-)
-from pandas._typing import (
- Axis,
- AxisInt,
-)
-from pandas.errors import UnsupportedFunctionCall
-from pandas.util._validators import (
- validate_args,
- validate_args_and_kwargs,
- validate_kwargs,
-)
-
-AxisNoneT = TypeVar("AxisNoneT", Axis, None)
-
-
-class CompatValidator:
- def __init__(
- self,
- defaults,
- fname=None,
- method: str | None = None,
- max_fname_arg_count=None,
- ) -> None:
- self.fname = fname
- self.method = method
- self.defaults = defaults
- self.max_fname_arg_count = max_fname_arg_count
-
- def __call__(
- self,
- args,
- kwargs,
- fname=None,
- max_fname_arg_count=None,
- method: str | None = None,
- ) -> None:
- if args or kwargs:
- fname = self.fname if fname is None else fname
- max_fname_arg_count = (
- self.max_fname_arg_count
- if max_fname_arg_count is None
- else max_fname_arg_count
- )
- method = self.method if method is None else method
-
- if method == "args":
- validate_args(fname, args, max_fname_arg_count, self.defaults)
- elif method == "kwargs":
- validate_kwargs(fname, kwargs, self.defaults)
- elif method == "both":
- validate_args_and_kwargs(
- fname, args, kwargs, max_fname_arg_count, self.defaults
- )
- else:
- raise ValueError(f"invalid validation method '{method}'")
-
-
-ARGMINMAX_DEFAULTS = {"out": None}
-validate_argmin = CompatValidator(
- ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1
-)
-validate_argmax = CompatValidator(
- ARGMINMAX_DEFAULTS, fname="argmax", method="both", max_fname_arg_count=1
-)
-
-
-def process_skipna(skipna: bool | ndarray | None, args) -> tuple[bool, Any]:
- if isinstance(skipna, ndarray) or skipna is None:
- args = (skipna,) + args
- skipna = True
-
- return skipna, args
-
-
-def validate_argmin_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> bool:
- """
- If 'Series.argmin' is called via the 'numpy' library, the third parameter
- in its signature is 'out', which takes either an ndarray or 'None', so
- check if the 'skipna' parameter is either an instance of ndarray or is
- None, since 'skipna' itself should be a boolean
- """
- skipna, args = process_skipna(skipna, args)
- validate_argmin(args, kwargs)
- return skipna
-
-
-def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> bool:
- """
- If 'Series.argmax' is called via the 'numpy' library, the third parameter
- in its signature is 'out', which takes either an ndarray or 'None', so
- check if the 'skipna' parameter is either an instance of ndarray or is
- None, since 'skipna' itself should be a boolean
- """
- skipna, args = process_skipna(skipna, args)
- validate_argmax(args, kwargs)
- return skipna
-
-
-ARGSORT_DEFAULTS: dict[str, int | str | None] = {}
-ARGSORT_DEFAULTS["axis"] = -1
-ARGSORT_DEFAULTS["kind"] = "quicksort"
-ARGSORT_DEFAULTS["order"] = None
-ARGSORT_DEFAULTS["kind"] = None
-
-
-validate_argsort = CompatValidator(
- ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both"
-)
-
-# two different signatures of argsort, this second validation for when the
-# `kind` param is supported
-ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {}
-ARGSORT_DEFAULTS_KIND["axis"] = -1
-ARGSORT_DEFAULTS_KIND["order"] = None
-validate_argsort_kind = CompatValidator(
- ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both"
-)
-
-
-def validate_argsort_with_ascending(ascending: bool | int | None, args, kwargs) -> bool:
- """
- If 'Categorical.argsort' is called via the 'numpy' library, the first
- parameter in its signature is 'axis', which takes either an integer or
- 'None', so check if the 'ascending' parameter has either integer type or is
- None, since 'ascending' itself should be a boolean
- """
- if is_integer(ascending) or ascending is None:
- args = (ascending,) + args
- ascending = True
-
- validate_argsort_kind(args, kwargs, max_fname_arg_count=3)
- ascending = cast(bool, ascending)
- return ascending
-
-
-CLIP_DEFAULTS: dict[str, Any] = {"out": None}
-validate_clip = CompatValidator(
- CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3
-)
-
-
-@overload
-def validate_clip_with_axis(axis: ndarray, args, kwargs) -> None:
- ...
-
-
-@overload
-def validate_clip_with_axis(axis: AxisNoneT, args, kwargs) -> AxisNoneT:
- ...
-
-
-def validate_clip_with_axis(
- axis: ndarray | AxisNoneT, args, kwargs
-) -> AxisNoneT | None:
- """
- If 'NDFrame.clip' is called via the numpy library, the third parameter in
- its signature is 'out', which can take an ndarray, so check if the 'axis'
- parameter is an instance of ndarray, since 'axis' itself should either be
- an integer or None
- """
- if isinstance(axis, ndarray):
- args = (axis,) + args
- # error: Incompatible types in assignment (expression has type "None",
- # variable has type "Union[ndarray[Any, Any], str, int]")
- axis = None # type: ignore[assignment]
-
- validate_clip(args, kwargs)
- # error: Incompatible return value type (got "Union[ndarray[Any, Any],
- # str, int]", expected "Union[str, int, None]")
- return axis # type: ignore[return-value]
-
-
-CUM_FUNC_DEFAULTS: dict[str, Any] = {}
-CUM_FUNC_DEFAULTS["dtype"] = None
-CUM_FUNC_DEFAULTS["out"] = None
-validate_cum_func = CompatValidator(
- CUM_FUNC_DEFAULTS, method="both", max_fname_arg_count=1
-)
-validate_cumsum = CompatValidator(
- CUM_FUNC_DEFAULTS, fname="cumsum", method="both", max_fname_arg_count=1
-)
-
-
-def validate_cum_func_with_skipna(skipna, args, kwargs, name) -> bool:
- """
- If this function is called via the 'numpy' library, the third parameter in
- its signature is 'dtype', which takes either a 'numpy' dtype or 'None', so
- check if the 'skipna' parameter is a boolean or not
- """
- if not is_bool(skipna):
- args = (skipna,) + args
- skipna = True
-
- validate_cum_func(args, kwargs, fname=name)
- return skipna
-
-
-ALLANY_DEFAULTS: dict[str, bool | None] = {}
-ALLANY_DEFAULTS["dtype"] = None
-ALLANY_DEFAULTS["out"] = None
-ALLANY_DEFAULTS["keepdims"] = False
-ALLANY_DEFAULTS["axis"] = None
-validate_all = CompatValidator(
- ALLANY_DEFAULTS, fname="all", method="both", max_fname_arg_count=1
-)
-validate_any = CompatValidator(
- ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1
-)
-
-LOGICAL_FUNC_DEFAULTS = {"out": None, "keepdims": False}
-validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs")
-
-MINMAX_DEFAULTS = {"axis": None, "out": None, "keepdims": False}
-validate_min = CompatValidator(
- MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1
-)
-validate_max = CompatValidator(
- MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1
-)
-
-RESHAPE_DEFAULTS: dict[str, str] = {"order": "C"}
-validate_reshape = CompatValidator(
- RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1
-)
-
-REPEAT_DEFAULTS: dict[str, Any] = {"axis": None}
-validate_repeat = CompatValidator(
- REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1
-)
-
-ROUND_DEFAULTS: dict[str, Any] = {"out": None}
-validate_round = CompatValidator(
- ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1
-)
-
-SORT_DEFAULTS: dict[str, int | str | None] = {}
-SORT_DEFAULTS["axis"] = -1
-SORT_DEFAULTS["kind"] = "quicksort"
-SORT_DEFAULTS["order"] = None
-validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs")
-
-STAT_FUNC_DEFAULTS: dict[str, Any | None] = {}
-STAT_FUNC_DEFAULTS["dtype"] = None
-STAT_FUNC_DEFAULTS["out"] = None
-
-SUM_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
-SUM_DEFAULTS["axis"] = None
-SUM_DEFAULTS["keepdims"] = False
-SUM_DEFAULTS["initial"] = None
-
-PROD_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
-PROD_DEFAULTS["axis"] = None
-PROD_DEFAULTS["keepdims"] = False
-PROD_DEFAULTS["initial"] = None
-
-MEDIAN_DEFAULTS = STAT_FUNC_DEFAULTS.copy()
-MEDIAN_DEFAULTS["overwrite_input"] = False
-MEDIAN_DEFAULTS["keepdims"] = False
-
-STAT_FUNC_DEFAULTS["keepdims"] = False
-
-validate_stat_func = CompatValidator(STAT_FUNC_DEFAULTS, method="kwargs")
-validate_sum = CompatValidator(
- SUM_DEFAULTS, fname="sum", method="both", max_fname_arg_count=1
-)
-validate_prod = CompatValidator(
- PROD_DEFAULTS, fname="prod", method="both", max_fname_arg_count=1
-)
-validate_mean = CompatValidator(
- STAT_FUNC_DEFAULTS, fname="mean", method="both", max_fname_arg_count=1
-)
-validate_median = CompatValidator(
- MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1
-)
-
-STAT_DDOF_FUNC_DEFAULTS: dict[str, bool | None] = {}
-STAT_DDOF_FUNC_DEFAULTS["dtype"] = None
-STAT_DDOF_FUNC_DEFAULTS["out"] = None
-STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False
-validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs")
-
-TAKE_DEFAULTS: dict[str, str | None] = {}
-TAKE_DEFAULTS["out"] = None
-TAKE_DEFAULTS["mode"] = "raise"
-validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs")
-
-
-def validate_take_with_convert(convert: ndarray | bool | None, args, kwargs) -> bool:
- """
- If this function is called via the 'numpy' library, the third parameter in
- its signature is 'axis', which takes either an ndarray or 'None', so check
- if the 'convert' parameter is either an instance of ndarray or is None
- """
- if isinstance(convert, ndarray) or convert is None:
- args = (convert,) + args
- convert = True
-
- validate_take(args, kwargs, max_fname_arg_count=3, method="both")
- return convert
-
-
-TRANSPOSE_DEFAULTS = {"axes": None}
-validate_transpose = CompatValidator(
- TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0
-)
-
-
-def validate_groupby_func(name, args, kwargs, allowed=None) -> None:
- """
- 'args' and 'kwargs' should be empty, except for allowed kwargs because all
- of their necessary parameters are explicitly listed in the function
- signature
- """
- if allowed is None:
- allowed = []
-
- kwargs = set(kwargs) - set(allowed)
-
- if len(args) + len(kwargs) > 0:
- raise UnsupportedFunctionCall(
- "numpy operations are not valid with groupby. "
- f"Use .groupby(...).{name}() instead"
- )
-
-
-RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var")
-
-
-def validate_resampler_func(method: str, args, kwargs) -> None:
- """
- 'args' and 'kwargs' should be empty because all of their necessary
- parameters are explicitly listed in the function signature
- """
- if len(args) + len(kwargs) > 0:
- if method in RESAMPLER_NUMPY_OPS:
- raise UnsupportedFunctionCall(
- "numpy operations are not valid with resample. "
- f"Use .resample(...).{method}() instead"
- )
- raise TypeError("too many arguments passed in")
-
-
-def validate_minmax_axis(axis: AxisInt | None, ndim: int = 1) -> None:
- """
- Ensure that the axis argument passed to min, max, argmin, or argmax is zero
- or None, as otherwise it will be incorrectly ignored.
-
- Parameters
- ----------
- axis : int or None
- ndim : int, default 1
-
- Raises
- ------
- ValueError
- """
- if axis is None:
- return
- if axis >= ndim or (axis < 0 and ndim + axis < 0):
- raise ValueError(f"`axis` must be fewer than the number of dimensions ({ndim})")
diff --git a/contrib/python/pandas/py3/pandas/compat/pickle_compat.py b/contrib/python/pandas/py3/pandas/compat/pickle_compat.py
deleted file mode 100644
index 9800c960f03..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/pickle_compat.py
+++ /dev/null
@@ -1,249 +0,0 @@
-"""
-Support pre-0.12 series pickle compatibility.
-"""
-from __future__ import annotations
-
-import contextlib
-import copy
-import io
-import pickle as pkl
-from typing import Generator
-
-import numpy as np
-
-from pandas._libs.arrays import NDArrayBacked
-from pandas._libs.tslibs import BaseOffset
-
-from pandas import Index
-from pandas.core.arrays import (
- DatetimeArray,
- PeriodArray,
- TimedeltaArray,
-)
-from pandas.core.internals import BlockManager
-
-
-def load_reduce(self):
- stack = self.stack
- args = stack.pop()
- func = stack[-1]
-
- try:
- stack[-1] = func(*args)
- return
- except TypeError as err:
- # If we have a deprecated function,
- # try to replace and try again.
-
- msg = "_reconstruct: First argument must be a sub-type of ndarray"
-
- if msg in str(err):
- try:
- cls = args[0]
- stack[-1] = object.__new__(cls)
- return
- except TypeError:
- pass
- elif args and isinstance(args[0], type) and issubclass(args[0], BaseOffset):
- # TypeError: object.__new__(Day) is not safe, use Day.__new__()
- cls = args[0]
- stack[-1] = cls.__new__(*args)
- return
- elif args and issubclass(args[0], PeriodArray):
- cls = args[0]
- stack[-1] = NDArrayBacked.__new__(*args)
- return
-
- raise
-
-
-# If classes are moved, provide compat here.
-_class_locations_map = {
- ("pandas.core.sparse.array", "SparseArray"): ("pandas.core.arrays", "SparseArray"),
- # 15477
- ("pandas.core.base", "FrozenNDArray"): ("numpy", "ndarray"),
- ("pandas.core.indexes.frozen", "FrozenNDArray"): ("numpy", "ndarray"),
- ("pandas.core.base", "FrozenList"): ("pandas.core.indexes.frozen", "FrozenList"),
- # 10890
- ("pandas.core.series", "TimeSeries"): ("pandas.core.series", "Series"),
- ("pandas.sparse.series", "SparseTimeSeries"): (
- "pandas.core.sparse.series",
- "SparseSeries",
- ),
- # 12588, extensions moving
- ("pandas._sparse", "BlockIndex"): ("pandas._libs.sparse", "BlockIndex"),
- ("pandas.tslib", "Timestamp"): ("pandas._libs.tslib", "Timestamp"),
- # 18543 moving period
- ("pandas._period", "Period"): ("pandas._libs.tslibs.period", "Period"),
- ("pandas._libs.period", "Period"): ("pandas._libs.tslibs.period", "Period"),
- # 18014 moved __nat_unpickle from _libs.tslib-->_libs.tslibs.nattype
- ("pandas.tslib", "__nat_unpickle"): (
- "pandas._libs.tslibs.nattype",
- "__nat_unpickle",
- ),
- ("pandas._libs.tslib", "__nat_unpickle"): (
- "pandas._libs.tslibs.nattype",
- "__nat_unpickle",
- ),
- # 15998 top-level dirs moving
- ("pandas.sparse.array", "SparseArray"): (
- "pandas.core.arrays.sparse",
- "SparseArray",
- ),
- ("pandas.indexes.base", "_new_Index"): ("pandas.core.indexes.base", "_new_Index"),
- ("pandas.indexes.base", "Index"): ("pandas.core.indexes.base", "Index"),
- ("pandas.indexes.numeric", "Int64Index"): (
- "pandas.core.indexes.base",
- "Index", # updated in 50775
- ),
- ("pandas.indexes.range", "RangeIndex"): ("pandas.core.indexes.range", "RangeIndex"),
- ("pandas.indexes.multi", "MultiIndex"): ("pandas.core.indexes.multi", "MultiIndex"),
- ("pandas.tseries.index", "_new_DatetimeIndex"): (
- "pandas.core.indexes.datetimes",
- "_new_DatetimeIndex",
- ),
- ("pandas.tseries.index", "DatetimeIndex"): (
- "pandas.core.indexes.datetimes",
- "DatetimeIndex",
- ),
- ("pandas.tseries.period", "PeriodIndex"): (
- "pandas.core.indexes.period",
- "PeriodIndex",
- ),
- # 19269, arrays moving
- ("pandas.core.categorical", "Categorical"): ("pandas.core.arrays", "Categorical"),
- # 19939, add timedeltaindex, float64index compat from 15998 move
- ("pandas.tseries.tdi", "TimedeltaIndex"): (
- "pandas.core.indexes.timedeltas",
- "TimedeltaIndex",
- ),
- ("pandas.indexes.numeric", "Float64Index"): (
- "pandas.core.indexes.base",
- "Index", # updated in 50775
- ),
- # 50775, remove Int64Index, UInt64Index & Float64Index from codebase
- ("pandas.core.indexes.numeric", "Int64Index"): (
- "pandas.core.indexes.base",
- "Index",
- ),
- ("pandas.core.indexes.numeric", "UInt64Index"): (
- "pandas.core.indexes.base",
- "Index",
- ),
- ("pandas.core.indexes.numeric", "Float64Index"): (
- "pandas.core.indexes.base",
- "Index",
- ),
-}
-
-
-# Our Unpickler subclass overrides methods and some dispatcher functions
-# for compat; it relies on a non-public class of the pickle module.
-
-
-class Unpickler(pkl._Unpickler):
- def find_class(self, module, name):
- # override superclass
- key = (module, name)
- module, name = _class_locations_map.get(key, key)
- return super().find_class(module, name)
-
-
-Unpickler.dispatch = copy.copy(Unpickler.dispatch)
-Unpickler.dispatch[pkl.REDUCE[0]] = load_reduce
-
-
-def load_newobj(self) -> None:
- args = self.stack.pop()
- cls = self.stack[-1]
-
- # compat
- if issubclass(cls, Index):
- obj = object.__new__(cls)
- elif issubclass(cls, DatetimeArray) and not args:
- arr = np.array([], dtype="M8[ns]")
- obj = cls.__new__(cls, arr, arr.dtype)
- elif issubclass(cls, TimedeltaArray) and not args:
- arr = np.array([], dtype="m8[ns]")
- obj = cls.__new__(cls, arr, arr.dtype)
- elif cls is BlockManager and not args:
- obj = cls.__new__(cls, (), [], False)
- else:
- obj = cls.__new__(cls, *args)
-
- self.stack[-1] = obj
-
-
-Unpickler.dispatch[pkl.NEWOBJ[0]] = load_newobj
-
-
-def load_newobj_ex(self) -> None:
- kwargs = self.stack.pop()
- args = self.stack.pop()
- cls = self.stack.pop()
-
- # compat
- if issubclass(cls, Index):
- obj = object.__new__(cls)
- else:
- obj = cls.__new__(cls, *args, **kwargs)
- self.append(obj)
-
-
-try:
- Unpickler.dispatch[pkl.NEWOBJ_EX[0]] = load_newobj_ex
-except (AttributeError, KeyError):
- pass
-
-
-def load(fh, encoding: str | None = None, is_verbose: bool = False):
- """
- Load a pickle with a provided encoding.
-
- Parameters
- ----------
- fh : a filelike object
- encoding : an optional encoding
- is_verbose : show exception output
- """
- try:
- fh.seek(0)
- if encoding is not None:
- up = Unpickler(fh, encoding=encoding)
- else:
- up = Unpickler(fh)
- # "Unpickler" has no attribute "is_verbose" [attr-defined]
- up.is_verbose = is_verbose # type: ignore[attr-defined]
-
- return up.load()
- except (ValueError, TypeError):
- raise
-
-
-def loads(
- bytes_object: bytes,
- *,
- fix_imports: bool = True,
- encoding: str = "ASCII",
- errors: str = "strict",
-):
- """
- Analogous to pickle._loads.
- """
- fd = io.BytesIO(bytes_object)
- return Unpickler(
- fd, fix_imports=fix_imports, encoding=encoding, errors=errors
- ).load()
-
-
-@contextlib.contextmanager
-def patch_pickle() -> Generator[None, None, None]:
- """
- Temporarily patch pickle to use our unpickler.
- """
- orig_loads = pkl.loads
- try:
- setattr(pkl, "loads", loads)
- yield
- finally:
- setattr(pkl, "loads", orig_loads)
diff --git a/contrib/python/pandas/py3/pandas/compat/pyarrow.py b/contrib/python/pandas/py3/pandas/compat/pyarrow.py
deleted file mode 100644
index 020ec346490..00000000000
--- a/contrib/python/pandas/py3/pandas/compat/pyarrow.py
+++ /dev/null
@@ -1,22 +0,0 @@
-""" support pyarrow compatibility across versions """
-
-from __future__ import annotations
-
-from pandas.util.version import Version
-
-try:
- import pyarrow as pa
-
- _pa_version = pa.__version__
- _palv = Version(_pa_version)
- pa_version_under7p0 = _palv < Version("7.0.0")
- pa_version_under8p0 = _palv < Version("8.0.0")
- pa_version_under9p0 = _palv < Version("9.0.0")
- pa_version_under10p0 = _palv < Version("10.0.0")
- pa_version_under11p0 = _palv < Version("11.0.0")
-except ImportError:
- pa_version_under7p0 = True
- pa_version_under8p0 = True
- pa_version_under9p0 = True
- pa_version_under10p0 = True
- pa_version_under11p0 = True
diff --git a/contrib/python/pandas/py3/pandas/core/__init__.py b/contrib/python/pandas/py3/pandas/core/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/__init__.py b/contrib/python/pandas/py3/pandas/core/_numba/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/executor.py b/contrib/python/pandas/py3/pandas/core/_numba/executor.py
deleted file mode 100644
index 13d8b52bae3..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/executor.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from __future__ import annotations
-
-import functools
-from typing import (
- TYPE_CHECKING,
- Callable,
-)
-
-import numpy as np
-
-from pandas._typing import Scalar
-from pandas.compat._optional import import_optional_dependency
-
-
-@functools.lru_cache(maxsize=None)
-def generate_shared_aggregator(
- func: Callable[..., Scalar],
- nopython: bool,
- nogil: bool,
- parallel: bool,
-):
- """
- Generate a Numba function that loops over the columns of a 2D object and
- applies a 1D numba kernel over each column.
-
- Parameters
- ----------
- func : function
- aggregation function to be applied to each column
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
-
- Returns
- -------
- Numba function
- """
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def column_looper(
- values: np.ndarray,
- start: np.ndarray,
- end: np.ndarray,
- min_periods: int,
- *args,
- ):
- result = np.empty((len(start), values.shape[1]), dtype=np.float64)
- for i in numba.prange(values.shape[1]):
- result[:, i] = func(values[:, i], start, end, min_periods, *args)
- return result
-
- return column_looper
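
The factory deleted above simply closes a 1-D kernel over a column loop and lets numba JIT the result. Stripped of the JIT decorators and caching, the shape of what it builds is roughly this (illustrative, no numba required):

    import numpy as np


    def make_column_looper(kernel):
        """Return a function applying a 1-D windowed `kernel` to every column of a 2-D array."""

        def column_looper(values, start, end, min_periods, *args):
            result = np.empty((len(start), values.shape[1]), dtype=np.float64)
            for i in range(values.shape[1]):
                result[:, i] = kernel(values[:, i], start, end, min_periods, *args)
            return result

        return column_looper
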
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/kernels/__init__.py b/contrib/python/pandas/py3/pandas/core/_numba/kernels/__init__.py
deleted file mode 100644
index 219ff023bf7..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/kernels/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from pandas.core._numba.kernels.mean_ import sliding_mean
-from pandas.core._numba.kernels.min_max_ import sliding_min_max
-from pandas.core._numba.kernels.sum_ import sliding_sum
-from pandas.core._numba.kernels.var_ import sliding_var
-
-__all__ = ["sliding_mean", "sliding_sum", "sliding_var", "sliding_min_max"]
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/kernels/mean_.py b/contrib/python/pandas/py3/pandas/core/_numba/kernels/mean_.py
deleted file mode 100644
index 35a6e688d01..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/kernels/mean_.py
+++ /dev/null
@@ -1,150 +0,0 @@
-"""
-Numba 1D mean kernels that can be shared by
-* Dataframe / Series
-* groupby
-* rolling / expanding
-
-Mirrors pandas/_libs/window/aggregation.pyx
-"""
-from __future__ import annotations
-
-import numba
-import numpy as np
-
-from pandas.core._numba.kernels.shared import is_monotonic_increasing
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def add_mean(
- val: float,
- nobs: int,
- sum_x: float,
- neg_ct: int,
- compensation: float,
- num_consecutive_same_value: int,
- prev_value: float,
-) -> tuple[int, float, int, float, int, float]:
- if not np.isnan(val):
- nobs += 1
- y = val - compensation
- t = sum_x + y
- compensation = t - sum_x - y
- sum_x = t
- if val < 0:
- neg_ct += 1
-
- if val == prev_value:
- num_consecutive_same_value += 1
- else:
- num_consecutive_same_value = 1
- prev_value = val
-
- return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def remove_mean(
- val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
-) -> tuple[int, float, int, float]:
- if not np.isnan(val):
- nobs -= 1
- y = -val - compensation
- t = sum_x + y
- compensation = t - sum_x - y
- sum_x = t
- if val < 0:
- neg_ct -= 1
- return nobs, sum_x, neg_ct, compensation
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def sliding_mean(
- values: np.ndarray,
- start: np.ndarray,
- end: np.ndarray,
- min_periods: int,
-) -> np.ndarray:
- N = len(start)
- nobs = 0
- sum_x = 0.0
- neg_ct = 0
- compensation_add = 0.0
- compensation_remove = 0.0
-
- is_monotonic_increasing_bounds = is_monotonic_increasing(
- start
- ) and is_monotonic_increasing(end)
-
- output = np.empty(N, dtype=np.float64)
-
- for i in range(N):
- s = start[i]
- e = end[i]
- if i == 0 or not is_monotonic_increasing_bounds:
- prev_value = values[s]
- num_consecutive_same_value = 0
-
- for j in range(s, e):
- val = values[j]
- (
- nobs,
- sum_x,
- neg_ct,
- compensation_add,
- num_consecutive_same_value,
- prev_value,
- ) = add_mean(
- val,
- nobs,
- sum_x,
- neg_ct,
- compensation_add,
- num_consecutive_same_value,
- prev_value, # pyright: ignore[reportGeneralTypeIssues]
- )
- else:
- for j in range(start[i - 1], s):
- val = values[j]
- nobs, sum_x, neg_ct, compensation_remove = remove_mean(
- val, nobs, sum_x, neg_ct, compensation_remove
- )
-
- for j in range(end[i - 1], e):
- val = values[j]
- (
- nobs,
- sum_x,
- neg_ct,
- compensation_add,
- num_consecutive_same_value,
- prev_value,
- ) = add_mean(
- val,
- nobs,
- sum_x,
- neg_ct,
- compensation_add,
- num_consecutive_same_value,
- prev_value, # pyright: ignore[reportGeneralTypeIssues]
- )
-
- if nobs >= min_periods and nobs > 0:
- result = sum_x / nobs
- if num_consecutive_same_value >= nobs:
- result = prev_value
- elif neg_ct == 0 and result < 0:
- result = 0
- elif neg_ct == nobs and result > 0:
- result = 0
- else:
- result = np.nan
-
- output[i] = result
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- sum_x = 0.0
- neg_ct = 0
- compensation_remove = 0.0
-
- return output
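
All of these kernels share one calling contract: `start[i]:end[i]` bounds the window for output `i`, with the add/remove updates applied incrementally as the bounds advance. A plain-numpy, non-incremental reference of that contract (illustrative only; it recomputes each window and omits the compensation and same-value handling above):

    import numpy as np


    def naive_sliding_mean(values, start, end, min_periods):
        """Recompute each window from scratch; same start/end convention as the kernels."""
        out = np.empty(len(start), dtype=np.float64)
        for i, (s, e) in enumerate(zip(start, end)):
            window = values[s:e]
            window = window[~np.isnan(window)]
            out[i] = window.mean() if len(window) >= max(min_periods, 1) else np.nan
        return out


    values = np.array([1.0, 2.0, np.nan, 4.0])
    start = np.array([0, 0, 1, 2])
    end = np.array([1, 2, 3, 4])
    print(naive_sliding_mean(values, start, end, min_periods=1))  # [1.  1.5 2.  4. ]
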
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/kernels/min_max_.py b/contrib/python/pandas/py3/pandas/core/_numba/kernels/min_max_.py
deleted file mode 100644
index acba66a6e4f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/kernels/min_max_.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""
-Numba 1D min/max kernels that can be shared by
-* Dataframe / Series
-* groupby
-* rolling / expanding
-
-Mirrors pandas/_libs/window/aggregation.pyx
-"""
-from __future__ import annotations
-
-import numba
-import numpy as np
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def sliding_min_max(
- values: np.ndarray,
- start: np.ndarray,
- end: np.ndarray,
- min_periods: int,
- is_max: bool,
-) -> np.ndarray:
- N = len(start)
- nobs = 0
- output = np.empty(N, dtype=np.float64)
- # Use deque once numba supports it
- # https://github.com/numba/numba/issues/7417
- Q: list = []
- W: list = []
- for i in range(N):
- curr_win_size = end[i] - start[i]
- if i == 0:
- st = start[i]
- else:
- st = end[i - 1]
-
- for k in range(st, end[i]):
- ai = values[k]
- if not np.isnan(ai):
- nobs += 1
- elif is_max:
- ai = -np.inf
- else:
- ai = np.inf
- # Discard previous entries if we find new min or max
- if is_max:
- while Q and ((ai >= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
- Q.pop()
- else:
- while Q and ((ai <= values[Q[-1]]) or values[Q[-1]] != values[Q[-1]]):
- Q.pop()
- Q.append(k)
- W.append(k)
-
- # Discard entries outside and left of current window
- while Q and Q[0] <= start[i] - 1:
- Q.pop(0)
- while W and W[0] <= start[i] - 1:
- if not np.isnan(values[W[0]]):
- nobs -= 1
- W.pop(0)
-
- # Save output based on index in input value array
- if Q and curr_win_size > 0 and nobs >= min_periods:
- output[i] = values[Q[0]]
- else:
- output[i] = np.nan
-
- return output
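
The kernel deleted above uses plain lists as a stand-in for a deque (see the numba issue referenced in its comment); the underlying technique is the classic monotonic-deque sliding maximum. A fixed-window `collections.deque` sketch of that technique for orientation (not the kernel's exact semantics: NaN is treated as -inf and `min_periods` is ignored here):

    from collections import deque

    import numpy as np


    def sliding_max_fixed(values: np.ndarray, window: int) -> np.ndarray:
        """Classic monotonic-deque sliding maximum over a fixed-size window."""
        vals = np.where(np.isnan(values), -np.inf, values)
        q = deque()  # candidate indices; their values are kept in decreasing order
        out = np.full(len(vals), np.nan)
        for i, v in enumerate(vals):
            while q and vals[q[-1]] <= v:  # drop candidates dominated by the new value
                q.pop()
            q.append(i)
            if q[0] <= i - window:  # drop candidates that have left the window
                q.popleft()
            if i >= window - 1:
                out[i] = vals[q[0]]  # the front of the deque is the window maximum
        return out


    print(sliding_max_fixed(np.array([1.0, 3.0, 2.0, np.nan, 5.0]), window=2))
    # [nan  3.  3.  2.  5.]
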
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/kernels/shared.py b/contrib/python/pandas/py3/pandas/core/_numba/kernels/shared.py
deleted file mode 100644
index 6e6bcef590d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/kernels/shared.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import annotations
-
-import numba
-import numpy as np
-
-
-@numba.jit(
- # error: Any? not callable
- numba.boolean(numba.int64[:]), # type: ignore[misc]
- nopython=True,
- nogil=True,
- parallel=False,
-)
-def is_monotonic_increasing(bounds: np.ndarray) -> bool:
- """Check if int64 values are monotonically increasing."""
- n = len(bounds)
- if n < 2:
- return True
- prev = bounds[0]
- for i in range(1, n):
- cur = bounds[i]
- if cur < prev:
- return False
- prev = cur
- return True
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/kernels/sum_.py b/contrib/python/pandas/py3/pandas/core/_numba/kernels/sum_.py
deleted file mode 100644
index eb8846b1fa5..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/kernels/sum_.py
+++ /dev/null
@@ -1,138 +0,0 @@
-"""
-Numba 1D sum kernels that can be shared by
-* Dataframe / Series
-* groupby
-* rolling / expanding
-
-Mirrors pandas/_libs/window/aggregation.pyx
-"""
-from __future__ import annotations
-
-import numba
-import numpy as np
-
-from pandas.core._numba.kernels.shared import is_monotonic_increasing
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def add_sum(
- val: float,
- nobs: int,
- sum_x: float,
- compensation: float,
- num_consecutive_same_value: int,
- prev_value: float,
-) -> tuple[int, float, float, int, float]:
- if not np.isnan(val):
- nobs += 1
- y = val - compensation
- t = sum_x + y
- compensation = t - sum_x - y
- sum_x = t
-
- if val == prev_value:
- num_consecutive_same_value += 1
- else:
- num_consecutive_same_value = 1
- prev_value = val
-
- return nobs, sum_x, compensation, num_consecutive_same_value, prev_value
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def remove_sum(
- val: float, nobs: int, sum_x: float, compensation: float
-) -> tuple[int, float, float]:
- if not np.isnan(val):
- nobs -= 1
- y = -val - compensation
- t = sum_x + y
- compensation = t - sum_x - y
- sum_x = t
- return nobs, sum_x, compensation
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def sliding_sum(
- values: np.ndarray,
- start: np.ndarray,
- end: np.ndarray,
- min_periods: int,
-) -> np.ndarray:
- N = len(start)
- nobs = 0
- sum_x = 0.0
- compensation_add = 0.0
- compensation_remove = 0.0
-
- is_monotonic_increasing_bounds = is_monotonic_increasing(
- start
- ) and is_monotonic_increasing(end)
-
- output = np.empty(N, dtype=np.float64)
-
- for i in range(N):
- s = start[i]
- e = end[i]
- if i == 0 or not is_monotonic_increasing_bounds:
- prev_value = values[s]
- num_consecutive_same_value = 0
-
- for j in range(s, e):
- val = values[j]
- (
- nobs,
- sum_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value,
- ) = add_sum(
- val,
- nobs,
- sum_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value, # pyright: ignore[reportGeneralTypeIssues]
- )
- else:
- for j in range(start[i - 1], s):
- val = values[j]
- nobs, sum_x, compensation_remove = remove_sum(
- val, nobs, sum_x, compensation_remove
- )
-
- for j in range(end[i - 1], e):
- val = values[j]
- (
- nobs,
- sum_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value,
- ) = add_sum(
- val,
- nobs,
- sum_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value, # pyright: ignore[reportGeneralTypeIssues]
- )
-
- if nobs == 0 == min_periods:
- result = 0.0
- elif nobs >= min_periods:
- if num_consecutive_same_value >= nobs:
- result = prev_value * nobs
- else:
- result = sum_x
- else:
- result = np.nan
-
- output[i] = result
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- sum_x = 0.0
- compensation_remove = 0.0
-
- return output
diff --git a/contrib/python/pandas/py3/pandas/core/_numba/kernels/var_.py b/contrib/python/pandas/py3/pandas/core/_numba/kernels/var_.py
deleted file mode 100644
index 2c4559ddc21..00000000000
--- a/contrib/python/pandas/py3/pandas/core/_numba/kernels/var_.py
+++ /dev/null
@@ -1,157 +0,0 @@
-"""
-Numba 1D var kernels that can be shared by
-* Dataframe / Series
-* groupby
-* rolling / expanding
-
-Mirrors pandas/_libs/window/aggregation.pyx
-"""
-from __future__ import annotations
-
-import numba
-import numpy as np
-
-from pandas.core._numba.kernels.shared import is_monotonic_increasing
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def add_var(
- val: float,
- nobs: int,
- mean_x: float,
- ssqdm_x: float,
- compensation: float,
- num_consecutive_same_value: int,
- prev_value: float,
-) -> tuple[int, float, float, float, int, float]:
- if not np.isnan(val):
- if val == prev_value:
- num_consecutive_same_value += 1
- else:
- num_consecutive_same_value = 1
- prev_value = val
-
- nobs += 1
- prev_mean = mean_x - compensation
- y = val - compensation
- t = y - mean_x
- compensation = t + mean_x - y
- delta = t
- if nobs:
- mean_x += delta / nobs
- else:
- mean_x = 0
- ssqdm_x += (val - prev_mean) * (val - mean_x)
- return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def remove_var(
- val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
-) -> tuple[int, float, float, float]:
- if not np.isnan(val):
- nobs -= 1
- if nobs:
- prev_mean = mean_x - compensation
- y = val - compensation
- t = y - mean_x
- compensation = t + mean_x - y
- delta = t
- mean_x -= delta / nobs
- ssqdm_x -= (val - prev_mean) * (val - mean_x)
- else:
- mean_x = 0
- ssqdm_x = 0
- return nobs, mean_x, ssqdm_x, compensation
-
-
-@numba.jit(nopython=True, nogil=True, parallel=False)
-def sliding_var(
- values: np.ndarray,
- start: np.ndarray,
- end: np.ndarray,
- min_periods: int,
- ddof: int = 1,
-) -> np.ndarray:
- N = len(start)
- nobs = 0
- mean_x = 0.0
- ssqdm_x = 0.0
- compensation_add = 0.0
- compensation_remove = 0.0
-
- min_periods = max(min_periods, 1)
- is_monotonic_increasing_bounds = is_monotonic_increasing(
- start
- ) and is_monotonic_increasing(end)
-
- output = np.empty(N, dtype=np.float64)
-
- for i in range(N):
- s = start[i]
- e = end[i]
- if i == 0 or not is_monotonic_increasing_bounds:
- prev_value = values[s]
- num_consecutive_same_value = 0
-
- for j in range(s, e):
- val = values[j]
- (
- nobs,
- mean_x,
- ssqdm_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value,
- ) = add_var(
- val,
- nobs,
- mean_x,
- ssqdm_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value, # pyright: ignore[reportGeneralTypeIssues]
- )
- else:
- for j in range(start[i - 1], s):
- val = values[j]
- nobs, mean_x, ssqdm_x, compensation_remove = remove_var(
- val, nobs, mean_x, ssqdm_x, compensation_remove
- )
-
- for j in range(end[i - 1], e):
- val = values[j]
- (
- nobs,
- mean_x,
- ssqdm_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value,
- ) = add_var(
- val,
- nobs,
- mean_x,
- ssqdm_x,
- compensation_add,
- num_consecutive_same_value,
- prev_value, # pyright: ignore[reportGeneralTypeIssues]
- )
-
- if nobs >= min_periods and nobs > ddof:
- if nobs == 1 or num_consecutive_same_value >= nobs:
- result = 0.0
- else:
- result = ssqdm_x / (nobs - ddof)
- else:
- result = np.nan
-
- output[i] = result
-
- if not is_monotonic_increasing_bounds:
- nobs = 0
- mean_x = 0.0
- ssqdm_x = 0.0
- compensation_remove = 0.0
-
- return output
diff --git a/contrib/python/pandas/py3/pandas/core/accessor.py b/contrib/python/pandas/py3/pandas/core/accessor.py
deleted file mode 100644
index 58da2cd9947..00000000000
--- a/contrib/python/pandas/py3/pandas/core/accessor.py
+++ /dev/null
@@ -1,340 +0,0 @@
-"""
-
-accessor.py contains base classes for implementing accessor properties
-that can be mixed into or pinned onto other pandas classes.
-
-"""
-from __future__ import annotations
-
-from typing import (
- Callable,
- final,
-)
-import warnings
-
-from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
-
-
-class DirNamesMixin:
- _accessors: set[str] = set()
- _hidden_attrs: frozenset[str] = frozenset()
-
- @final
- def _dir_deletions(self) -> set[str]:
- """
- Delete unwanted __dir__ for this object.
- """
- return self._accessors | self._hidden_attrs
-
- def _dir_additions(self) -> set[str]:
- """
- Add additional __dir__ for this object.
- """
- return {accessor for accessor in self._accessors if hasattr(self, accessor)}
-
- def __dir__(self) -> list[str]:
- """
- Provide method name lookup and completion.
-
- Notes
- -----
- Only provide 'public' methods.
- """
- rv = set(super().__dir__())
- rv = (rv - self._dir_deletions()) | self._dir_additions()
- return sorted(rv)
-
-
-class PandasDelegate:
- """
- Abstract base class for delegating methods/properties.
- """
-
- def _delegate_property_get(self, name, *args, **kwargs):
- raise TypeError(f"You cannot access the property {name}")
-
- def _delegate_property_set(self, name, value, *args, **kwargs):
- raise TypeError(f"The property {name} cannot be set")
-
- def _delegate_method(self, name, *args, **kwargs):
- raise TypeError(f"You cannot call method {name}")
-
- @classmethod
- def _add_delegate_accessors(
- cls,
- delegate,
- accessors: list[str],
- typ: str,
- overwrite: bool = False,
- accessor_mapping: Callable[[str], str] = lambda x: x,
- raise_on_missing: bool = True,
- ) -> None:
- """
- Add accessors to cls from the delegate class.
-
- Parameters
- ----------
- cls
- Class to add the methods/properties to.
- delegate
- Class to get methods/properties and doc-strings.
- accessors : list of str
- List of accessors to add.
- typ : {'property', 'method'}
- overwrite : bool, default False
- Overwrite the method/property in the target class if it exists.
- accessor_mapping: Callable, default lambda x: x
- Callable to map the delegate's function to the cls' function.
- raise_on_missing: bool, default True
- Raise if an accessor does not exist on delegate.
- False skips the missing accessor.
- """
-
- def _create_delegator_property(name):
- def _getter(self):
- return self._delegate_property_get(name)
-
- def _setter(self, new_values):
- return self._delegate_property_set(name, new_values)
-
- _getter.__name__ = name
- _setter.__name__ = name
-
- return property(
- fget=_getter,
- fset=_setter,
- doc=getattr(delegate, accessor_mapping(name)).__doc__,
- )
-
- def _create_delegator_method(name):
- def f(self, *args, **kwargs):
- return self._delegate_method(name, *args, **kwargs)
-
- f.__name__ = name
- f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__
-
- return f
-
- for name in accessors:
- if (
- not raise_on_missing
- and getattr(delegate, accessor_mapping(name), None) is None
- ):
- continue
-
- if typ == "property":
- f = _create_delegator_property(name)
- else:
- f = _create_delegator_method(name)
-
- # don't overwrite existing methods/properties
- if overwrite or not hasattr(cls, name):
- setattr(cls, name, f)
-
-
-def delegate_names(
- delegate,
- accessors: list[str],
- typ: str,
- overwrite: bool = False,
- accessor_mapping: Callable[[str], str] = lambda x: x,
- raise_on_missing: bool = True,
-):
- """
- Add delegated names to a class using a class decorator. This provides
- an alternative to calling `_add_delegate_accessors` directly
- below a class definition.
-
- Parameters
- ----------
- delegate : object
- The class to get methods/properties & doc-strings.
- accessors : Sequence[str]
- List of accessor to add.
- typ : {'property', 'method'}
- overwrite : bool, default False
- Overwrite the method/property in the target class if it exists.
- accessor_mapping: Callable, default lambda x: x
- Callable to map the delegate's function to the cls' function.
- raise_on_missing: bool, default True
- Raise if an accessor does not exist on delegate.
- False skips the missing accessor.
-
- Returns
- -------
- callable
- A class decorator.
-
- Examples
- --------
- @delegate_names(Categorical, ["categories", "ordered"], "property")
- class CategoricalAccessor(PandasDelegate):
- [...]
- """
-
- def add_delegate_accessors(cls):
- cls._add_delegate_accessors(
- delegate,
- accessors,
- typ,
- overwrite=overwrite,
- accessor_mapping=accessor_mapping,
- raise_on_missing=raise_on_missing,
- )
- return cls
-
- return add_delegate_accessors
-
-
-# Ported with modifications from xarray
-# https://github.com/pydata/xarray/blob/master/xarray/core/extensions.py
-# 1. We don't need to catch and re-raise AttributeErrors as RuntimeErrors
-# 2. We use a UserWarning instead of a custom Warning
-
-
-class CachedAccessor:
- """
- Custom property-like object.
-
- A descriptor for caching accessors.
-
- Parameters
- ----------
- name : str
- Namespace that will be accessed under, e.g. ``df.foo``.
- accessor : cls
- Class with the extension methods.
-
- Notes
- -----
- The accessor class's __init__ method assumes that one of
- ``Series``, ``DataFrame`` or ``Index`` is passed as the
- single argument ``data``.
- """
-
- def __init__(self, name: str, accessor) -> None:
- self._name = name
- self._accessor = accessor
-
- def __get__(self, obj, cls):
- if obj is None:
- # we're accessing the attribute of the class, i.e., Dataset.geo
- return self._accessor
- accessor_obj = self._accessor(obj)
- # Replace the property with the accessor object. Inspired by:
- # https://www.pydanny.com/cached-property.html
- # We need to use object.__setattr__ because we overwrite __setattr__ on
- # NDFrame
- object.__setattr__(obj, self._name, accessor_obj)
- return accessor_obj
-
-
-@doc(klass="", others="")
-def _register_accessor(name, cls):
- """
- Register a custom accessor on {klass} objects.
-
- Parameters
- ----------
- name : str
- Name under which the accessor should be registered. A warning is issued
- if this name conflicts with a preexisting attribute.
-
- Returns
- -------
- callable
- A class decorator.
-
- See Also
- --------
- register_dataframe_accessor : Register a custom accessor on DataFrame objects.
- register_series_accessor : Register a custom accessor on Series objects.
- register_index_accessor : Register a custom accessor on Index objects.
-
- Notes
- -----
- When accessed, your accessor will be initialized with the pandas object
- the user is interacting with. So the signature must be
-
- .. code-block:: python
-
- def __init__(self, pandas_object): # noqa: E999
- ...
-
- For consistency with pandas methods, you should raise an ``AttributeError``
- if the data passed to your accessor has an incorrect dtype.
-
- >>> pd.Series(['a', 'b']).dt
- Traceback (most recent call last):
- ...
- AttributeError: Can only use .dt accessor with datetimelike values
-
- Examples
- --------
- In your library code::
-
- import pandas as pd
-
- @pd.api.extensions.register_dataframe_accessor("geo")
- class GeoAccessor:
- def __init__(self, pandas_obj):
- self._obj = pandas_obj
-
- @property
- def center(self):
- # return the geographic center point of this DataFrame
- lat = self._obj.latitude
- lon = self._obj.longitude
- return (float(lon.mean()), float(lat.mean()))
-
- def plot(self):
- # plot this array's data on a map, e.g., using Cartopy
- pass
-
- Back in an interactive IPython session:
-
- .. code-block:: ipython
-
- In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
- ...: "latitude": np.linspace(0, 20)}})
- In [2]: ds.geo.center
- Out[2]: (5.0, 10.0)
- In [3]: ds.geo.plot() # plots data on a map
- """
-
- def decorator(accessor):
- if hasattr(cls, name):
- warnings.warn(
- f"registration of accessor {repr(accessor)} under name "
- f"{repr(name)} for type {repr(cls)} is overriding a preexisting "
- f"attribute with the same name.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- setattr(cls, name, CachedAccessor(name, accessor))
- cls._accessors.add(name)
- return accessor
-
- return decorator
-
-
-@doc(_register_accessor, klass="DataFrame")
-def register_dataframe_accessor(name):
- from pandas import DataFrame
-
- return _register_accessor(name, DataFrame)
-
-
-@doc(_register_accessor, klass="Series")
-def register_series_accessor(name):
- from pandas import Series
-
- return _register_accessor(name, Series)
-
-
-@doc(_register_accessor, klass="Index")
-def register_index_accessor(name):
- from pandas import Index
-
- return _register_accessor(name, Index)
diff --git a/contrib/python/pandas/py3/pandas/core/algorithms.py b/contrib/python/pandas/py3/pandas/core/algorithms.py
deleted file mode 100644
index d312612cdc6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/algorithms.py
+++ /dev/null
@@ -1,1672 +0,0 @@
-"""
-Generic data algorithms. This module is experimental at the moment and not
-intended for public consumption
-"""
-from __future__ import annotations
-
-import operator
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Literal,
- cast,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- algos,
- hashtable as htable,
- iNaT,
- lib,
-)
-from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- AxisInt,
- DtypeObj,
- TakeIndexer,
- npt,
-)
-from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.cast import (
- construct_1d_object_array_from_listlike,
- infer_dtype_from_array,
- np_find_common_type,
-)
-from pandas.core.dtypes.common import (
- ensure_float64,
- ensure_object,
- ensure_platform_int,
- is_array_like,
- is_bool_dtype,
- is_categorical_dtype,
- is_complex_dtype,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- is_signed_integer_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import (
- BaseMaskedDtype,
- ExtensionDtype,
- PandasDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCDatetimeArray,
- ABCExtensionArray,
- ABCIndex,
- ABCMultiIndex,
- ABCSeries,
- ABCTimedeltaArray,
-)
-from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
-)
-
-from pandas.core.array_algos.take import take_nd
-from pandas.core.construction import (
- array as pd_array,
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
-from pandas.core.indexers import validate_indices
-
-if TYPE_CHECKING:
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
-
- from pandas import (
- Categorical,
- Index,
- Series,
- )
- from pandas.core.arrays import (
- BaseMaskedArray,
- ExtensionArray,
- )
-
-
-# --------------- #
-# dtype access #
-# --------------- #
-def _ensure_data(values: ArrayLike) -> np.ndarray:
- """
- routine to ensure that our data is of the correct
- input dtype for lower-level routines
-
- This will coerce:
- - ints -> int64
- - uint -> uint64
- - bool -> uint8
- - datetimelike -> i8
- - datetime64tz -> i8 (in local tz)
- - categorical -> codes
-
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
-
- Returns
- -------
- np.ndarray
- """
-
- if not isinstance(values, ABCMultiIndex):
- # extract_array would raise
- values = extract_array(values, extract_numpy=True)
-
- if is_object_dtype(values.dtype):
- return ensure_object(np.asarray(values))
-
- elif isinstance(values.dtype, BaseMaskedDtype):
- # i.e. BooleanArray, FloatingArray, IntegerArray
- values = cast("BaseMaskedArray", values)
- if not values._hasna:
- # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
- # recurse to avoid re-implementing logic for eg bool->uint8
- return _ensure_data(values._data)
- return np.asarray(values)
-
- elif is_categorical_dtype(values.dtype):
- # NB: cases that go through here should NOT be using _reconstruct_data
- # on the back-end.
- values = cast("Categorical", values)
- return values.codes
-
- elif is_bool_dtype(values.dtype):
- if isinstance(values, np.ndarray):
- # i.e. actually dtype == np.dtype("bool")
- return np.asarray(values).view("uint8")
- else:
- # e.g. Sparse[bool, False] # TODO: no test cases get here
- return np.asarray(values).astype("uint8", copy=False)
-
- elif is_integer_dtype(values.dtype):
- return np.asarray(values)
-
- elif is_float_dtype(values.dtype):
- # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
- # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
- # has no attribute "itemsize"
- if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr]
- # we don't (yet) have float128 hashtable support
- return ensure_float64(values)
- return np.asarray(values)
-
- elif is_complex_dtype(values.dtype):
- return cast(np.ndarray, values)
-
- # datetimelike
- elif needs_i8_conversion(values.dtype):
- npvalues = values.view("i8")
- npvalues = cast(np.ndarray, npvalues)
- return npvalues
-
- # we have failed, return object
- values = np.asarray(values, dtype=object)
- return ensure_object(values)
-
-
-def _reconstruct_data(
- values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
-) -> ArrayLike:
- """
- reverse of _ensure_data
-
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- dtype : np.dtype or ExtensionDtype
- original : AnyArrayLike
-
- Returns
- -------
- ExtensionArray or np.ndarray
- """
- if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
- # Catch DatetimeArray/TimedeltaArray
- return values
-
- if not isinstance(dtype, np.dtype):
- # i.e. ExtensionDtype; note we have ruled out above the possibility
- # that values.dtype == dtype
- cls = dtype.construct_array_type()
-
- values = cls._from_sequence(values, dtype=dtype)
-
- else:
- values = values.astype(dtype, copy=False)
-
- return values
-
-
-def _ensure_arraylike(values) -> ArrayLike:
- """
- ensure that we are arraylike if not already
- """
- if not is_array_like(values):
- inferred = lib.infer_dtype(values, skipna=False)
- if inferred in ["mixed", "string", "mixed-integer"]:
- # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
- if isinstance(values, tuple):
- values = list(values)
- values = construct_1d_object_array_from_listlike(values)
- else:
- values = np.asarray(values)
- return values
-
-
-_hashtables = {
- "complex128": htable.Complex128HashTable,
- "complex64": htable.Complex64HashTable,
- "float64": htable.Float64HashTable,
- "float32": htable.Float32HashTable,
- "uint64": htable.UInt64HashTable,
- "uint32": htable.UInt32HashTable,
- "uint16": htable.UInt16HashTable,
- "uint8": htable.UInt8HashTable,
- "int64": htable.Int64HashTable,
- "int32": htable.Int32HashTable,
- "int16": htable.Int16HashTable,
- "int8": htable.Int8HashTable,
- "string": htable.StringHashTable,
- "object": htable.PyObjectHashTable,
-}
-
-
-def _get_hashtable_algo(values: np.ndarray):
- """
- Parameters
- ----------
- values : np.ndarray
-
- Returns
- -------
- htable : HashTable subclass
- values : ndarray
- """
- values = _ensure_data(values)
-
- ndtype = _check_object_for_strings(values)
- hashtable = _hashtables[ndtype]
- return hashtable, values
-
-
-def _check_object_for_strings(values: np.ndarray) -> str:
- """
- Check if we can use string hashtable instead of object hashtable.
-
- Parameters
- ----------
- values : ndarray
-
- Returns
- -------
- str
- """
- ndtype = values.dtype.name
- if ndtype == "object":
- # it's cheaper to use a String Hash Table than Object; we infer
- # including nulls because that is the only difference between
- # StringHashTable and ObjectHashtable
- if lib.infer_dtype(values, skipna=False) in ["string"]:
- ndtype = "string"
- return ndtype
-
-
-# --------------- #
-# top-level algos #
-# --------------- #
-
-
-def unique(values):
- """
- Return unique values based on a hash table.
-
- Uniques are returned in order of appearance. This does NOT sort.
-
- Significantly faster than numpy.unique for long enough sequences.
- Includes NA values.
-
- Parameters
- ----------
- values : 1d array-like
-
- Returns
- -------
- numpy.ndarray or ExtensionArray
-
- The return can be:
-
- * Index : when the input is an Index
- * Categorical : when the input is a Categorical dtype
- * ndarray : when the input is a Series/ndarray
-
- See Also
- --------
- Index.unique : Return unique values from an Index.
- Series.unique : Return unique values of Series object.
-
- Examples
- --------
- >>> pd.unique(pd.Series([2, 1, 3, 3]))
- array([2, 1, 3])
-
- >>> pd.unique(pd.Series([2] + [1] * 5))
- array([2, 1])
-
- >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
- array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
-
- >>> pd.unique(
- ... pd.Series(
- ... [
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... ]
- ... )
- ... )
- <DatetimeArray>
- ['2016-01-01 00:00:00-05:00']
- Length: 1, dtype: datetime64[ns, US/Eastern]
-
- >>> pd.unique(
- ... pd.Index(
- ... [
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... ]
- ... )
- ... )
- DatetimeIndex(['2016-01-01 00:00:00-05:00'],
- dtype='datetime64[ns, US/Eastern]',
- freq=None)
-
- >>> pd.unique(list("baabc"))
- array(['b', 'a', 'c'], dtype=object)
-
- An unordered Categorical will return categories in the
- order of appearance.
-
- >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
- ['b', 'a', 'c']
- Categories (3, object): ['a', 'b', 'c']
-
- >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
- ['b', 'a', 'c']
- Categories (3, object): ['a', 'b', 'c']
-
- An ordered Categorical preserves the category ordering.
-
- >>> pd.unique(
- ... pd.Series(
- ... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
- ... )
- ... )
- ['b', 'a', 'c']
- Categories (3, object): ['a' < 'b' < 'c']
-
- An array of tuples
-
- >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
- array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
- """
- return unique_with_mask(values)
-
-
-def nunique_ints(values: ArrayLike) -> int:
- """
- Return the number of unique values for integer array-likes.
-
- Significantly faster than pandas.unique for long enough sequences.
- No checks are done to ensure input is integral.
-
- Parameters
- ----------
- values : 1d array-like
-
- Returns
- -------
- int : The number of unique values in ``values``
- """
- if len(values) == 0:
- return 0
- values = _ensure_data(values)
- # bincount requires intp
- result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
- return result
-
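# Minimal sketch of the bincount trick used by nunique_ints above: counting
# the non-empty bins gives the number of distinct values. This assumes small,
# non-negative integers, which is what np.bincount requires.
import numpy as np

vals = np.array([3, 1, 3, 7, 1])
print((np.bincount(vals) != 0).sum())  # 3 distinct values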
-
-def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
- """See algorithms.unique for docs. Takes a mask for masked arrays."""
- values = _ensure_arraylike(values)
-
- if is_extension_array_dtype(values.dtype):
- # Dispatch to extension dtype's unique.
- return values.unique()
-
- original = values
- hashtable, values = _get_hashtable_algo(values)
-
- table = hashtable(len(values))
- if mask is None:
- uniques = table.unique(values)
- uniques = _reconstruct_data(uniques, original.dtype, original)
- return uniques
-
- else:
- uniques, mask = table.unique(values, mask=mask)
- uniques = _reconstruct_data(uniques, original.dtype, original)
- assert mask is not None # for mypy
- return uniques, mask.astype("bool")
-
-
-unique1d = unique
-
-
-def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
- """
- Compute the isin boolean array.
-
- Parameters
- ----------
- comps : array-like
- values : array-like
-
- Returns
- -------
- ndarray[bool]
- Same length as `comps`.
- """
- if not is_list_like(comps):
- raise TypeError(
- "only list-like objects are allowed to be passed "
- f"to isin(), you passed a [{type(comps).__name__}]"
- )
- if not is_list_like(values):
- raise TypeError(
- "only list-like objects are allowed to be passed "
- f"to isin(), you passed a [{type(values).__name__}]"
- )
-
- if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
- orig_values = list(values)
- values = _ensure_arraylike(orig_values)
-
- if (
- len(values) > 0
- and is_numeric_dtype(values)
- and not is_signed_integer_dtype(comps)
- ):
- # GH#46485 Use object to avoid upcast to float64 later
- # TODO: Share with _find_common_type_compat
- values = construct_1d_object_array_from_listlike(orig_values)
-
- elif isinstance(values, ABCMultiIndex):
- # Avoid raising in extract_array
- values = np.array(values)
- else:
- values = extract_array(values, extract_numpy=True, extract_range=True)
-
- comps_array = _ensure_arraylike(comps)
- comps_array = extract_array(comps_array, extract_numpy=True)
- if not isinstance(comps_array, np.ndarray):
- # i.e. Extension Array
- return comps_array.isin(values)
-
- elif needs_i8_conversion(comps_array.dtype):
- # Dispatch to DatetimeLikeArrayMixin.isin
- return pd_array(comps_array).isin(values)
- elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
- # e.g. comps_array are integers and values are datetime64s
- return np.zeros(comps_array.shape, dtype=bool)
- # TODO: not quite right ... Sparse/Categorical
- elif needs_i8_conversion(values.dtype):
- return isin(comps_array, values.astype(object))
-
- elif isinstance(values.dtype, ExtensionDtype):
- return isin(np.asarray(comps_array), np.asarray(values))
-
- # GH16012
- # Ensure np.in1d doesn't get object types or it *may* throw an exception
-    # Albeit the hashmap has O(1) look-up (vs. O(log n) in a sorted array),
-    # in1d is faster for small sizes.
- if (
- len(comps_array) > 1_000_000
- and len(values) <= 26
- and not is_object_dtype(comps_array)
- ):
- # If the values include nan we need to check for nan explicitly
-        # since np.nan is not equal to np.nan
- if isna(values).any():
-
- def f(c, v):
- return np.logical_or(np.in1d(c, v), np.isnan(c))
-
- else:
- f = np.in1d
-
- else:
- common = np_find_common_type(values.dtype, comps_array.dtype)
- values = values.astype(common, copy=False)
- comps_array = comps_array.astype(common, copy=False)
- f = htable.ismember
-
- return f(comps_array, values)
-
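# Illustrative sketch of the size heuristic in isin above: for a very large
# haystack and at most 26 needles, np.in1d plus an explicit NaN check is
# used; otherwise both sides are cast to a common dtype for the hashtable
# membership routine (np.isin stands in for that path here).
import numpy as np

comps = np.random.default_rng(0).integers(0, 10, 2_000_000).astype("float64")
values = np.array([1.0, np.nan])

if len(comps) > 1_000_000 and len(values) <= 26 and comps.dtype != object:
    mask = np.in1d(comps, values) | np.isnan(comps)
else:
    mask = np.isin(comps, values)
print(mask.sum())  # number of elements equal to 1.0 or NaN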
-
-def factorize_array(
- values: np.ndarray,
- use_na_sentinel: bool = True,
- size_hint: int | None = None,
- na_value: object = None,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> tuple[npt.NDArray[np.intp], np.ndarray]:
- """
- Factorize a numpy array to codes and uniques.
-
- This doesn't do any coercion of types or unboxing before factorization.
-
- Parameters
- ----------
- values : ndarray
- use_na_sentinel : bool, default True
- If True, the sentinel -1 will be used for NaN values. If False,
- NaN values will be encoded as non-negative integers and will not drop the
- NaN from the uniques of the values.
- size_hint : int, optional
- Passed through to the hashtable's 'get_labels' method
- na_value : object, optional
- A value in `values` to consider missing. Note: only use this
- parameter when you know that you don't have any values pandas would
- consider missing in the array (NaN for float data, iNaT for
- datetimes, etc.).
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
-
- Returns
- -------
- codes : ndarray[np.intp]
- uniques : ndarray
- """
- original = values
- if values.dtype.kind in ["m", "M"]:
- # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
- # need to do the same to na_value. We are assuming here that the passed
- # na_value is an appropriately-typed NaT.
- # e.g. test_where_datetimelike_categorical
- na_value = iNaT
-
- hash_klass, values = _get_hashtable_algo(values)
-
- table = hash_klass(size_hint or len(values))
- uniques, codes = table.factorize(
- values,
- na_sentinel=-1,
- na_value=na_value,
- mask=mask,
- ignore_na=use_na_sentinel,
- )
-
- # re-cast e.g. i8->dt64/td64, uint8->bool
- uniques = _reconstruct_data(uniques, original.dtype, original)
-
- codes = ensure_platform_int(codes)
- return codes, uniques
-
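# Usage sketch: for plain ndarrays the public pd.factorize goes through
# factorize_array above, returning integer codes (-1 for missing values by
# default) and the uniques in order of appearance.
import numpy as np
import pandas as pd

codes, uniques = pd.factorize(np.array([3.0, 1.0, np.nan, 3.0]))
print(codes)    # [ 0  1 -1  0]
print(uniques)  # [3. 1.]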
-
-@doc(
- values=dedent(
- """\
- values : sequence
- A 1-D sequence. Sequences that aren't pandas objects are
- coerced to ndarrays before factorization.
- """
- ),
- sort=dedent(
- """\
- sort : bool, default False
- Sort `uniques` and shuffle `codes` to maintain the
- relationship.
- """
- ),
- size_hint=dedent(
- """\
- size_hint : int, optional
- Hint to the hashtable sizer.
- """
- ),
-)
-def factorize(
- values,
- sort: bool = False,
- use_na_sentinel: bool = True,
- size_hint: int | None = None,
-) -> tuple[np.ndarray, np.ndarray | Index]:
- """
- Encode the object as an enumerated type or categorical variable.
-
- This method is useful for obtaining a numeric representation of an
- array when all that matters is identifying distinct values. `factorize`
- is available as both a top-level function :func:`pandas.factorize`,
- and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
-
- Parameters
- ----------
- {values}{sort}
- use_na_sentinel : bool, default True
- If True, the sentinel -1 will be used for NaN values. If False,
- NaN values will be encoded as non-negative integers and will not drop the
- NaN from the uniques of the values.
-
- .. versionadded:: 1.5.0
- {size_hint}\
-
- Returns
- -------
- codes : ndarray
- An integer ndarray that's an indexer into `uniques`.
- ``uniques.take(codes)`` will have the same values as `values`.
- uniques : ndarray, Index, or Categorical
- The unique valid values. When `values` is Categorical, `uniques`
- is a Categorical. When `values` is some other pandas object, an
- `Index` is returned. Otherwise, a 1-D ndarray is returned.
-
- .. note::
-
- Even if there's a missing value in `values`, `uniques` will
- *not* contain an entry for it.
-
- See Also
- --------
- cut : Discretize continuous-valued array.
-    unique : Find the unique values in an array.
-
- Notes
- -----
- Reference :ref:`the user guide <reshaping.factorize>` for more examples.
-
- Examples
- --------
- These examples all show factorize as a top-level method like
- ``pd.factorize(values)``. The results are identical for methods like
- :meth:`Series.factorize`.
-
- >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
- >>> codes
- array([0, 0, 1, 2, 0])
- >>> uniques
- array(['b', 'a', 'c'], dtype=object)
-
- With ``sort=True``, the `uniques` will be sorted, and `codes` will be
-    shuffled so that the relationship is maintained.
-
- >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
- >>> codes
- array([1, 1, 0, 2, 1])
- >>> uniques
- array(['a', 'b', 'c'], dtype=object)
-
- When ``use_na_sentinel=True`` (the default), missing values are indicated in
- the `codes` with the sentinel value ``-1`` and missing values are not
- included in `uniques`.
-
- >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
- >>> codes
- array([ 0, -1, 1, 2, 0])
- >>> uniques
- array(['b', 'a', 'c'], dtype=object)
-
- Thus far, we've only factorized lists (which are internally coerced to
- NumPy arrays). When factorizing pandas objects, the type of `uniques`
- will differ. For Categoricals, a `Categorical` is returned.
-
- >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
- >>> codes, uniques = pd.factorize(cat)
- >>> codes
- array([0, 0, 1])
- >>> uniques
- ['a', 'c']
- Categories (3, object): ['a', 'b', 'c']
-
- Notice that ``'b'`` is in ``uniques.categories``, despite not being
- present in ``cat.values``.
-
- For all other pandas objects, an Index of the appropriate type is
- returned.
-
- >>> cat = pd.Series(['a', 'a', 'c'])
- >>> codes, uniques = pd.factorize(cat)
- >>> codes
- array([0, 0, 1])
- >>> uniques
- Index(['a', 'c'], dtype='object')
-
- If NaN is in the values, and we want to include NaN in the uniques of the
- values, it can be achieved by setting ``use_na_sentinel=False``.
-
- >>> values = np.array([1, 2, 1, np.nan])
- >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
- >>> codes
- array([ 0, 1, 0, -1])
- >>> uniques
- array([1., 2.])
-
- >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
- >>> codes
- array([0, 1, 0, 2])
- >>> uniques
- array([ 1., 2., nan])
- """
- # Implementation notes: This method is responsible for 3 things
- # 1.) coercing data to array-like (ndarray, Index, extension array)
- # 2.) factorizing codes and uniques
- # 3.) Maybe boxing the uniques in an Index
- #
- # Step 2 is dispatched to extension types (like Categorical). They are
- # responsible only for factorization. All data coercion, sorting and boxing
- # should happen here.
- if isinstance(values, (ABCIndex, ABCSeries)):
- return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
-
- values = _ensure_arraylike(values)
- original = values
-
- if (
- isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
- and values.freq is not None
- ):
- # The presence of 'freq' means we can fast-path sorting and know there
- # aren't NAs
- codes, uniques = values.factorize(sort=sort)
- return codes, uniques
-
- elif not isinstance(values, np.ndarray):
- # i.e. ExtensionArray
- codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
-
- else:
- values = np.asarray(values) # convert DTA/TDA/MultiIndex
-
- if not use_na_sentinel and is_object_dtype(values):
- # factorize can now handle differentiating various types of null values.
- # These can only occur when the array has object dtype.
- # However, for backwards compatibility we only use the null for the
- # provided dtype. This may be revisited in the future, see GH#48476.
- null_mask = isna(values)
- if null_mask.any():
- na_value = na_value_for_dtype(values.dtype, compat=False)
- # Don't modify (potentially user-provided) array
- values = np.where(null_mask, na_value, values)
-
- codes, uniques = factorize_array(
- values,
- use_na_sentinel=use_na_sentinel,
- size_hint=size_hint,
- )
-
- if sort and len(uniques) > 0:
- uniques, codes = safe_sort(
- uniques,
- codes,
- use_na_sentinel=use_na_sentinel,
- assume_unique=True,
- verify=False,
- )
-
- uniques = _reconstruct_data(uniques, original.dtype, original)
-
- return codes, uniques
-
-
-def value_counts(
- values,
- sort: bool = True,
- ascending: bool = False,
- normalize: bool = False,
- bins=None,
- dropna: bool = True,
-) -> Series:
- """
- Compute a histogram of the counts of non-null values.
-
- Parameters
- ----------
- values : ndarray (1-d)
- sort : bool, default True
- Sort by values
- ascending : bool, default False
- Sort in ascending order
-    normalize : bool, default False
- If True then compute a relative histogram
- bins : integer, optional
- Rather than count values, group them into half-open bins,
- convenience for pd.cut, only works with numeric data
- dropna : bool, default True
- Don't include counts of NaN
-
- Returns
- -------
- Series
- """
- from pandas import (
- Index,
- Series,
- )
-
- index_name = getattr(values, "name", None)
- name = "proportion" if normalize else "count"
-
- if bins is not None:
- from pandas.core.reshape.tile import cut
-
- values = Series(values, copy=False)
- try:
- ii = cut(values, bins, include_lowest=True)
- except TypeError as err:
- raise TypeError("bins argument only works with numeric data.") from err
-
-        # count, remove nulls (from the index), and use the interval bins as the index
- result = ii.value_counts(dropna=dropna)
- result.name = name
- result = result[result.index.notna()]
- result.index = result.index.astype("interval")
- result = result.sort_index()
-
- # if we are dropna and we have NO values
- if dropna and (result._values == 0).all():
- result = result.iloc[0:0]
-
- # normalizing is by len of all (regardless of dropna)
- counts = np.array([len(ii)])
-
- else:
- if is_extension_array_dtype(values):
- # handle Categorical and sparse,
- result = Series(values, copy=False)._values.value_counts(dropna=dropna)
- result.name = name
- result.index.name = index_name
- counts = result._values
- if not isinstance(counts, np.ndarray):
- # e.g. ArrowExtensionArray
- counts = np.asarray(counts)
-
- elif isinstance(values, ABCMultiIndex):
- # GH49558
- levels = list(range(values.nlevels))
- result = (
- Series(index=values, name=name)
- .groupby(level=levels, dropna=dropna)
- .size()
- )
- result.index.names = values.names
- counts = result._values
-
- else:
- values = _ensure_arraylike(values)
- keys, counts = value_counts_arraylike(values, dropna)
- if keys.dtype == np.float16:
- keys = keys.astype(np.float32)
-
- # For backwards compatibility, we let Index do its normal type
-            # inference, _except_ when it infers from object to bool.
- idx = Index(keys)
- if idx.dtype == bool and keys.dtype == object:
- idx = idx.astype(object)
- idx.name = index_name
-
- result = Series(counts, index=idx, name=name, copy=False)
-
- if sort:
- result = result.sort_values(ascending=ascending)
-
- if normalize:
- result = result / counts.sum()
-
- return result
-
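# Usage sketch for value_counts above: `bins` groups numeric values into
# half-open intervals via pd.cut, and `normalize=True` turns counts into
# proportions of the total length.
import pandas as pd

s = pd.Series([1, 2, 2, 3, 3, 3])
print(s.value_counts().to_dict())                # {3: 3, 2: 2, 1: 1}
print(s.value_counts(normalize=True).to_dict())  # {3: 0.5, 2: 0.333..., 1: 0.166...}
print(s.value_counts(bins=2))                    # counts per interval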
-
-# Called once from SparseArray, otherwise could be private
-def value_counts_arraylike(
- values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
-) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
- """
- Parameters
- ----------
- values : np.ndarray
- dropna : bool
- mask : np.ndarray[bool] or None, default None
-
- Returns
- -------
- uniques : np.ndarray
- counts : np.ndarray[np.int64]
- """
- original = values
- values = _ensure_data(values)
-
- keys, counts = htable.value_count(values, dropna, mask=mask)
-
- if needs_i8_conversion(original.dtype):
- # datetime, timedelta, or period
-
- if dropna:
- mask = keys != iNaT
- keys, counts = keys[mask], counts[mask]
-
- res_keys = _reconstruct_data(keys, original.dtype, original)
- return res_keys, counts
-
-
-def duplicated(
- values: ArrayLike, keep: Literal["first", "last", False] = "first"
-) -> npt.NDArray[np.bool_]:
- """
- Return boolean ndarray denoting duplicate values.
-
- Parameters
- ----------
-    values : np.ndarray, ExtensionArray or Series
- Array over which to check for duplicate values.
- keep : {'first', 'last', False}, default 'first'
- - ``first`` : Mark duplicates as ``True`` except for the first
- occurrence.
- - ``last`` : Mark duplicates as ``True`` except for the last
- occurrence.
- - False : Mark all duplicates as ``True``.
-
- Returns
- -------
- duplicated : ndarray[bool]
- """
- if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype):
- values = cast("BaseMaskedArray", values)
- return htable.duplicated(values._data, keep=keep, mask=values._mask)
-
- values = _ensure_data(values)
- return htable.duplicated(values, keep=keep)
-
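# Usage sketch for duplicated above, via the public Series.duplicated:
# `keep` controls which occurrence (if any) is not flagged.
import pandas as pd

s = pd.Series(["a", "b", "a", "a"])
print(s.duplicated(keep="first").tolist())  # [False, False, True, True]
print(s.duplicated(keep="last").tolist())   # [True, False, True, False]
print(s.duplicated(keep=False).tolist())    # [True, False, True, True]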
-
-def mode(
- values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
-) -> ArrayLike:
- """
- Returns the mode(s) of an array.
-
- Parameters
- ----------
- values : array-like
-        Array over which to compute the mode(s).
- dropna : bool, default True
- Don't consider counts of NaN/NaT.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- values = _ensure_arraylike(values)
- original = values
-
- if needs_i8_conversion(values.dtype):
- # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
- values = ensure_wrapped_if_datetimelike(values)
- values = cast("ExtensionArray", values)
- return values._mode(dropna=dropna)
-
- values = _ensure_data(values)
-
- npresult = htable.mode(values, dropna=dropna, mask=mask)
- try:
- npresult = np.sort(npresult)
- except TypeError as err:
- warnings.warn(
- f"Unable to sort modes: {err}",
- stacklevel=find_stack_level(),
- )
-
- result = _reconstruct_data(npresult, original.dtype, original)
- return result
-
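# Usage sketch for mode above, via the public Series.mode: every value tied
# for the highest count is returned, sorted when the values can be sorted.
import pandas as pd

s = pd.Series([1, 2, 2, 3, 3])
print(s.mode().tolist())                                        # [2, 3]
print(pd.Series([1, None, None]).mode(dropna=False).tolist())   # [nan]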
-
-def rank(
- values: ArrayLike,
- axis: AxisInt = 0,
- method: str = "average",
- na_option: str = "keep",
- ascending: bool = True,
- pct: bool = False,
-) -> npt.NDArray[np.float64]:
- """
- Rank the values along a given axis.
-
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- Array whose values will be ranked. The number of dimensions in this
- array must not exceed 2.
- axis : int, default 0
- Axis over which to perform rankings.
- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
- The method by which tiebreaks are broken during the ranking.
- na_option : {'keep', 'top'}, default 'keep'
- The method by which NaNs are placed in the ranking.
- - ``keep``: rank each NaN value with a NaN ranking
-        - ``top``: replace each NaN with either +/- inf so that they
-          are ranked at the top
- ascending : bool, default True
- Whether or not the elements should be ranked in ascending order.
- pct : bool, default False
-        Whether or not to display the returned rankings in integer form
- (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
- """
- is_datetimelike = needs_i8_conversion(values.dtype)
- values = _ensure_data(values)
-
- if values.ndim == 1:
- ranks = algos.rank_1d(
- values,
- is_datetimelike=is_datetimelike,
- ties_method=method,
- ascending=ascending,
- na_option=na_option,
- pct=pct,
- )
- elif values.ndim == 2:
- ranks = algos.rank_2d(
- values,
- axis=axis,
- is_datetimelike=is_datetimelike,
- ties_method=method,
- ascending=ascending,
- na_option=na_option,
- pct=pct,
- )
- else:
-        raise TypeError("Arrays with ndim > 2 are not supported.")
-
- return ranks
-
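# Usage sketch for rank above, via the public Series.rank: `method` decides
# how ties are broken and `pct` rescales ranks to (0, 1].
import pandas as pd

s = pd.Series([7, 7, 1, 9])
print(s.rank(method="average").tolist())  # [2.5, 2.5, 1.0, 4.0]
print(s.rank(method="dense").tolist())    # [2.0, 2.0, 1.0, 3.0]
print(s.rank(pct=True).tolist())          # [0.625, 0.625, 0.25, 1.0]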
-
-def checked_add_with_arr(
- arr: npt.NDArray[np.int64],
- b: int | npt.NDArray[np.int64],
- arr_mask: npt.NDArray[np.bool_] | None = None,
- b_mask: npt.NDArray[np.bool_] | None = None,
-) -> npt.NDArray[np.int64]:
- """
- Perform array addition that checks for underflow and overflow.
-
- Performs the addition of an int64 array and an int64 integer (or array)
- but checks that they do not result in overflow first. For elements that
- are indicated to be NaN, whether or not there is overflow for that element
- is automatically ignored.
-
- Parameters
- ----------
- arr : np.ndarray[int64] addend.
- b : array or scalar addend.
- arr_mask : np.ndarray[bool] or None, default None
- array indicating which elements to exclude from checking
- b_mask : np.ndarray[bool] or None, default None
- array or scalar indicating which element(s) to exclude from checking
-
- Returns
- -------
-    sum : An array of x + b for each element x in arr if b is a scalar,
-        or of x + y for each element pair (x, y) in (arr, b) if b is
-        an array.
-
- Raises
- ------
- OverflowError if any x + y exceeds the maximum or minimum int64 value.
- """
- # For performance reasons, we broadcast 'b' to the new array 'b2'
- # so that it has the same size as 'arr'.
- b2 = np.broadcast_to(b, arr.shape)
- if b_mask is not None:
- # We do the same broadcasting for b_mask as well.
- b2_mask = np.broadcast_to(b_mask, arr.shape)
- else:
- b2_mask = None
-
- # For elements that are NaN, regardless of their value, we should
- # ignore whether they overflow or not when doing the checked add.
- if arr_mask is not None and b2_mask is not None:
- not_nan = np.logical_not(arr_mask | b2_mask)
- elif arr_mask is not None:
- not_nan = np.logical_not(arr_mask)
- elif b_mask is not None:
- # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has
- # incompatible type "Optional[ndarray[Any, dtype[bool_]]]";
- # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence
- # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str
- # , bytes, _NestedSequence[Union[bool, int, float, complex, str
- # , bytes]]]"
- not_nan = np.logical_not(b2_mask) # type: ignore[arg-type]
- else:
- not_nan = np.empty(arr.shape, dtype=bool)
- not_nan.fill(True)
-
- # gh-14324: For each element in 'arr' and its corresponding element
- # in 'b2', we check the sign of the element in 'b2'. If it is positive,
- # we then check whether its sum with the element in 'arr' exceeds
-    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
-    # is negative, we then check whether its sum with the element in
- # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
- # error as well.
- i8max = lib.i8max
- i8min = iNaT
-
- mask1 = b2 > 0
- mask2 = b2 < 0
-
- if not mask1.any():
- to_raise = ((i8min - b2 > arr) & not_nan).any()
- elif not mask2.any():
- to_raise = ((i8max - b2 < arr) & not_nan).any()
- else:
- to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or (
- (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2]
- ).any()
-
- if to_raise:
- raise OverflowError("Overflow in int64 addition")
-
- result = arr + b
- if arr_mask is not None or b2_mask is not None:
- np.putmask(result, ~not_nan, iNaT)
-
- return result
-
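# Worked sketch of the overflow check above: before adding, a positive addend
# b is compared against i8max - arr (a negative addend would be compared
# against the minimum instead); any violation means int64 addition would wrap.
import numpy as np

i8max = np.iinfo(np.int64).max
arr = np.array([i8max - 1, 5], dtype=np.int64)
b = np.int64(2)

would_overflow = bool(((i8max - b) < arr).any())
print(would_overflow)  # True: (i8max - 1) + 2 exceeds the int64 maximum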
-
-# ---- #
-# take #
-# ---- #
-
-
-def take(
- arr,
- indices: TakeIndexer,
- axis: AxisInt = 0,
- allow_fill: bool = False,
- fill_value=None,
-):
- """
- Take elements from an array.
-
- Parameters
- ----------
- arr : array-like or scalar value
- Non array-likes (sequences/scalars without a dtype) are coerced
- to an ndarray.
- indices : sequence of int or one-dimensional np.ndarray of int
- Indices to be taken.
- axis : int, default 0
- The axis over which to select values.
- allow_fill : bool, default False
- How to handle negative values in `indices`.
-
- * False: negative values in `indices` indicate positional indices
- from the right (the default). This is similar to :func:`numpy.take`.
-
- * True: negative values in `indices` indicate
- missing values. These values are set to `fill_value`. Any other
- negative values raise a ``ValueError``.
-
- fill_value : any, optional
- Fill value to use for NA-indices when `allow_fill` is True.
- This may be ``None``, in which case the default NA value for
- the type (``self.dtype.na_value``) is used.
-
- For multi-dimensional `arr`, each *element* is filled with
- `fill_value`.
-
- Returns
- -------
- ndarray or ExtensionArray
- Same type as the input.
-
- Raises
- ------
- IndexError
- When `indices` is out of bounds for the array.
- ValueError
- When the indexer contains negative values other than ``-1``
- and `allow_fill` is True.
-
- Notes
- -----
- When `allow_fill` is False, `indices` may be whatever dimensionality
- is accepted by NumPy for `arr`.
-
- When `allow_fill` is True, `indices` should be 1-D.
-
- See Also
- --------
- numpy.take : Take elements from an array along an axis.
-
- Examples
- --------
- >>> import pandas as pd
-
- With the default ``allow_fill=False``, negative numbers indicate
- positional indices from the right.
-
- >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
- array([10, 10, 30])
-
- Setting ``allow_fill=True`` will place `fill_value` in those positions.
-
- >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
- array([10., 10., nan])
-
- >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
- ... fill_value=-10)
- array([ 10, 10, -10])
- """
- if not is_array_like(arr):
- arr = np.asarray(arr)
-
- indices = np.asarray(indices, dtype=np.intp)
-
- if allow_fill:
- # Pandas style, -1 means NA
- validate_indices(indices, arr.shape[axis])
- result = take_nd(
- arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
- )
- else:
- # NumPy style
- result = arr.take(indices, axis=axis)
- return result
-
-
-# ------------ #
-# searchsorted #
-# ------------ #
-
-
-def searchsorted(
- arr: ArrayLike,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
-) -> npt.NDArray[np.intp] | np.intp:
- """
- Find indices where elements should be inserted to maintain order.
-
-    Find the indices into a sorted array `arr` such that, if the
- corresponding elements in `value` were inserted before the indices,
- the order of `arr` would be preserved.
-
- Assuming that `arr` is sorted:
-
- ====== ================================
- `side` returned index `i` satisfies
- ====== ================================
-    left   ``arr[i-1] < value <= arr[i]``
-    right  ``arr[i-1] <= value < arr[i]``
- ====== ================================
-
- Parameters
- ----------
-    arr : np.ndarray, ExtensionArray, Series
- Input array. If `sorter` is None, then it must be sorted in
- ascending order, otherwise `sorter` must be an array of indices
- that sort it.
- value : array-like or scalar
- Values to insert into `arr`.
- side : {'left', 'right'}, optional
- If 'left', the index of the first suitable location found is given.
- If 'right', return the last such index. If there is no suitable
-        index, return either 0 or N (where N is the length of `arr`).
- sorter : 1-D array-like, optional
-        Optional array of integer indices that sort `arr` into ascending
- order. They are typically the result of argsort.
-
- Returns
- -------
- array of ints or int
- If value is array-like, array of insertion points.
- If value is scalar, a single integer.
-
- See Also
- --------
- numpy.searchsorted : Similar method from NumPy.
- """
- if sorter is not None:
- sorter = ensure_platform_int(sorter)
-
- if (
- isinstance(arr, np.ndarray)
- and is_integer_dtype(arr.dtype)
- and (is_integer(value) or is_integer_dtype(value))
- ):
- # if `arr` and `value` have different dtypes, `arr` would be
- # recast by numpy, causing a slow search.
- # Before searching below, we therefore try to give `value` the
- # same dtype as `arr`, while guarding against integer overflows.
- iinfo = np.iinfo(arr.dtype.type)
- value_arr = np.array([value]) if is_scalar(value) else np.array(value)
- if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
- # value within bounds, so no overflow, so can convert value dtype
- # to dtype of arr
- dtype = arr.dtype
- else:
- dtype = value_arr.dtype
-
- if is_scalar(value):
- # We know that value is int
- value = cast(int, dtype.type(value))
- else:
- value = pd_array(cast(ArrayLike, value), dtype=dtype)
- else:
- # E.g. if `arr` is an array with dtype='datetime64[ns]'
- # and `value` is a pd.Timestamp, we may need to convert value
- arr = ensure_wrapped_if_datetimelike(arr)
-
- # Argument 1 to "searchsorted" of "ndarray" has incompatible type
- # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
- return arr.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type]
-
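# Usage sketch for searchsorted above: when both the array and the searched
# value are integers, the value is cast to the array's dtype if it fits, so
# NumPy does not recast the whole array before the binary search.
import numpy as np
import pandas as pd

arr = np.arange(10, dtype=np.int8)
print(pd.Series(arr).searchsorted(7))          # 7: value cast down to int8
print(pd.Series(arr).searchsorted(1_000_000))  # 10: too big for int8, value dtype kept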
-
-# ---- #
-# diff #
-# ---- #
-
-_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}
-
-
-def diff(arr, n: int, axis: AxisInt = 0):
- """
-    difference of n periods between arr and itself,
-    analogous to s - s.shift(n)
-
- Parameters
- ----------
- arr : ndarray or ExtensionArray
- n : int
- number of periods
- axis : {0, 1}
- axis to shift on
-
- Returns
- -------
- shifted
- """
-
- n = int(n)
- na = np.nan
- dtype = arr.dtype
-
- is_bool = is_bool_dtype(dtype)
- if is_bool:
- op = operator.xor
- else:
- op = operator.sub
-
- if isinstance(dtype, PandasDtype):
- # PandasArray cannot necessarily hold shifted versions of itself.
- arr = arr.to_numpy()
- dtype = arr.dtype
-
- if not isinstance(dtype, np.dtype):
- # i.e ExtensionDtype
- if hasattr(arr, f"__{op.__name__}__"):
- if axis != 0:
- raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
- return op(arr, arr.shift(n))
- else:
- raise TypeError(
- f"{type(arr).__name__} has no 'diff' method. "
- "Convert to a suitable dtype prior to calling 'diff'."
- )
-
- is_timedelta = False
- if needs_i8_conversion(arr.dtype):
- dtype = np.int64
- arr = arr.view("i8")
- na = iNaT
- is_timedelta = True
-
- elif is_bool:
- # We have to cast in order to be able to hold np.nan
- dtype = np.object_
-
- elif is_integer_dtype(dtype):
- # We have to cast in order to be able to hold np.nan
-
- # int8, int16 are incompatible with float64,
- # see https://github.com/cython/cython/issues/2646
- if arr.dtype.name in ["int8", "int16"]:
- dtype = np.float32
- else:
- dtype = np.float64
-
- orig_ndim = arr.ndim
- if orig_ndim == 1:
- # reshape so we can always use algos.diff_2d
- arr = arr.reshape(-1, 1)
- # TODO: require axis == 0
-
- dtype = np.dtype(dtype)
- out_arr = np.empty(arr.shape, dtype=dtype)
-
- na_indexer = [slice(None)] * 2
- na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
- out_arr[tuple(na_indexer)] = na
-
- if arr.dtype.name in _diff_special:
- # TODO: can diff_2d dtype specialization troubles be fixed by defining
- # out_arr inside diff_2d?
- algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
- else:
- # To keep mypy happy, _res_indexer is a list while res_indexer is
- # a tuple, ditto for lag_indexer.
- _res_indexer = [slice(None)] * 2
- _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
- res_indexer = tuple(_res_indexer)
-
- _lag_indexer = [slice(None)] * 2
- _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
- lag_indexer = tuple(_lag_indexer)
-
- out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])
-
- if is_timedelta:
- out_arr = out_arr.view("timedelta64[ns]")
-
- if orig_ndim == 1:
- out_arr = out_arr[:, 0]
- return out_arr
-
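# Usage sketch for diff above, via the public Series.diff: numeric dtypes are
# upcast so NaN can be held, and boolean input effectively uses XOR.
import pandas as pd

print(pd.Series([1, 3, 6]).diff().tolist())            # [nan, 2.0, 3.0]
print(pd.Series([True, True, False]).diff().tolist())  # [nan, False, True]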
-
-# --------------------------------------------------------------------
-# Helper functions
-
-
-# Note: safe_sort is in algorithms.py instead of sorting.py because it is
-# low-dependency, is used in this module, and uses private methods from
-# this module.
-def safe_sort(
- values,
- codes=None,
- use_na_sentinel: bool = True,
- assume_unique: bool = False,
- verify: bool = True,
-) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
- """
- Sort ``values`` and reorder corresponding ``codes``.
-
- ``values`` should be unique if ``codes`` is not None.
- Safe for use with mixed types (int, str), orders ints before strs.
-
- Parameters
- ----------
- values : list-like
- Sequence; must be unique if ``codes`` is not None.
- codes : list_like, optional
- Indices to ``values``. All out of bound indices are treated as
- "not found" and will be masked with ``-1``.
- use_na_sentinel : bool, default True
- If True, the sentinel -1 will be used for NaN values. If False,
- NaN values will be encoded as non-negative integers and will not drop the
- NaN from the uniques of the values.
- assume_unique : bool, default False
- When True, ``values`` are assumed to be unique, which can speed up
- the calculation. Ignored when ``codes`` is None.
- verify : bool, default True
- Check if codes are out of bound for the values and put out of bound
- codes equal to ``-1``. If ``verify=False``, it is assumed there
- are no out of bound codes. Ignored when ``codes`` is None.
-
- Returns
- -------
- ordered : AnyArrayLike
- Sorted ``values``
- new_codes : ndarray
- Reordered ``codes``; returned when ``codes`` is not None.
-
- Raises
- ------
- TypeError
- * If ``values`` is not list-like or if ``codes`` is neither None
- nor list-like
- * If ``values`` cannot be sorted
- ValueError
- * If ``codes`` is not None and ``values`` contain duplicates.
- """
- if not is_list_like(values):
- raise TypeError(
- "Only list-like objects are allowed to be passed to safe_sort as values"
- )
-
- if not is_array_like(values):
- # don't convert to string types
- dtype, _ = infer_dtype_from_array(values)
- # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
- # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
- # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
- # _DTypeDict, Tuple[Any, Any]]]"
- values = np.asarray(values, dtype=dtype) # type: ignore[arg-type]
-
- sorter = None
- ordered: AnyArrayLike
-
- if (
- not is_extension_array_dtype(values)
- and lib.infer_dtype(values, skipna=False) == "mixed-integer"
- ):
- ordered = _sort_mixed(values)
- else:
- try:
- sorter = values.argsort()
- ordered = values.take(sorter)
- except TypeError:
- # Previous sorters failed or were not applicable, try `_sort_mixed`
- # which would work, but which fails for special case of 1d arrays
- # with tuples.
- if values.size and isinstance(values[0], tuple):
- ordered = _sort_tuples(values)
- else:
- ordered = _sort_mixed(values)
-
- # codes:
-
- if codes is None:
- return ordered
-
- if not is_list_like(codes):
- raise TypeError(
- "Only list-like objects or None are allowed to "
- "be passed to safe_sort as codes"
- )
- codes = ensure_platform_int(np.asarray(codes))
-
- if not assume_unique and not len(unique(values)) == len(values):
- raise ValueError("values should be unique if codes is not None")
-
- if sorter is None:
- # mixed types
- hash_klass, values = _get_hashtable_algo(values)
- t = hash_klass(len(values))
- t.map_locations(values)
- sorter = ensure_platform_int(t.lookup(ordered))
-
- if use_na_sentinel:
- # take_nd is faster, but only works for na_sentinels of -1
- order2 = sorter.argsort()
- new_codes = take_nd(order2, codes, fill_value=-1)
- if verify:
- mask = (codes < -len(values)) | (codes >= len(values))
- else:
- mask = None
- else:
- reverse_indexer = np.empty(len(sorter), dtype=np.int_)
- reverse_indexer.put(sorter, np.arange(len(sorter)))
- # Out of bound indices will be masked with `-1` next, so we
- # may deal with them here without performance loss using `mode='wrap'`
- new_codes = reverse_indexer.take(codes, mode="wrap")
-
- if use_na_sentinel:
- mask = codes == -1
- if verify:
- mask = mask | (codes < -len(values)) | (codes >= len(values))
-
- if use_na_sentinel and mask is not None:
- np.putmask(new_codes, mask, -1)
-
- return ordered, ensure_platform_int(new_codes)
-
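# Minimal sketch of what safe_sort above provides for the sort=True path of
# factorize: uniques are sorted and codes are remapped so that
# uniques.take(codes) still reproduces the original values.
import pandas as pd

codes, uniques = pd.factorize(["b", "b", "a", "c"], sort=True)
print(uniques)              # ['a' 'b' 'c']
print(codes)                # [1 1 0 2]
print(uniques.take(codes))  # ['b' 'b' 'a' 'c'], original order preserved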
-
-def _sort_mixed(values) -> AnyArrayLike:
- """order ints before strings before nulls in 1d arrays"""
- str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
- null_pos = np.array([isna(x) for x in values], dtype=bool)
- num_pos = ~str_pos & ~null_pos
- str_argsort = np.argsort(values[str_pos])
- num_argsort = np.argsort(values[num_pos])
- # convert boolean arrays to positional indices, then order by underlying values
- str_locs = str_pos.nonzero()[0].take(str_argsort)
- num_locs = num_pos.nonzero()[0].take(num_argsort)
- null_locs = null_pos.nonzero()[0]
- locs = np.concatenate([num_locs, str_locs, null_locs])
- return values.take(locs)
-
-
-def _sort_tuples(values: np.ndarray) -> np.ndarray:
- """
- Convert array of tuples (1d) to array of arrays (2d).
- We need to keep the columns separately as they contain different types and
- nans (can't use `np.sort` as it may fail when str and nan are mixed in a
- column as types cannot be compared).
- """
- from pandas.core.internals.construction import to_arrays
- from pandas.core.sorting import lexsort_indexer
-
- arrays, _ = to_arrays(values, None)
- indexer = lexsort_indexer(arrays, orders=True)
- return values[indexer]
-
-
-def union_with_duplicates(
- lvals: ArrayLike | Index, rvals: ArrayLike | Index
-) -> ArrayLike | Index:
- """
- Extracts the union from lvals and rvals with respect to duplicates and nans in
- both arrays.
-
- Parameters
- ----------
-    lvals : np.ndarray or ExtensionArray
-        Left values, which are ordered in front.
-    rvals : np.ndarray or ExtensionArray
-        Right values, ordered after lvals.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- Containing the unsorted union of both arrays.
-
- Notes
- -----
- Caller is responsible for ensuring lvals.dtype == rvals.dtype.
- """
- from pandas import Series
-
- l_count = value_counts(lvals, dropna=False)
- r_count = value_counts(rvals, dropna=False)
- l_count, r_count = l_count.align(r_count, fill_value=0)
- final_count = np.maximum(l_count.values, r_count.values)
- final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
- if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
- unique_vals = lvals.append(rvals).unique()
- else:
- if isinstance(lvals, ABCIndex):
- lvals = lvals._values
- if isinstance(rvals, ABCIndex):
- rvals = rvals._values
- unique_vals = unique(concat_compat([lvals, rvals]))
- unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
- repeats = final_count.reindex(unique_vals).values
- return np.repeat(unique_vals, repeats)
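
# Illustrative sketch of the multiset union computed by union_with_duplicates
# above: each value is repeated max(left count, right count) times.
import numpy as np
import pandas as pd

l_count = pd.Series([1, 1, 2]).value_counts()
r_count = pd.Series([2, 2, 3]).value_counts()
l_count, r_count = l_count.align(r_count, fill_value=0)
final = np.maximum(l_count.values, r_count.values).astype(int)
print(dict(zip(l_count.index, final)))  # {1: 2, 2: 2, 3: 1}
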
diff --git a/contrib/python/pandas/py3/pandas/core/api.py b/contrib/python/pandas/py3/pandas/core/api.py
deleted file mode 100644
index c0b828d9330..00000000000
--- a/contrib/python/pandas/py3/pandas/core/api.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from pandas._libs import (
- NaT,
- Period,
- Timedelta,
- Timestamp,
-)
-from pandas._libs.missing import NA
-
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- DatetimeTZDtype,
- IntervalDtype,
- PeriodDtype,
-)
-from pandas.core.dtypes.missing import (
- isna,
- isnull,
- notna,
- notnull,
-)
-
-from pandas.core.algorithms import (
- factorize,
- unique,
- value_counts,
-)
-from pandas.core.arrays import Categorical
-from pandas.core.arrays.arrow import ArrowDtype
-from pandas.core.arrays.boolean import BooleanDtype
-from pandas.core.arrays.floating import (
- Float32Dtype,
- Float64Dtype,
-)
-from pandas.core.arrays.integer import (
- Int8Dtype,
- Int16Dtype,
- Int32Dtype,
- Int64Dtype,
- UInt8Dtype,
- UInt16Dtype,
- UInt32Dtype,
- UInt64Dtype,
-)
-from pandas.core.arrays.string_ import StringDtype
-from pandas.core.construction import array
-from pandas.core.flags import Flags
-from pandas.core.groupby import (
- Grouper,
- NamedAgg,
-)
-from pandas.core.indexes.api import (
- CategoricalIndex,
- DatetimeIndex,
- Index,
- IntervalIndex,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- TimedeltaIndex,
-)
-from pandas.core.indexes.datetimes import (
- bdate_range,
- date_range,
-)
-from pandas.core.indexes.interval import (
- Interval,
- interval_range,
-)
-from pandas.core.indexes.period import period_range
-from pandas.core.indexes.timedeltas import timedelta_range
-from pandas.core.indexing import IndexSlice
-from pandas.core.series import Series
-from pandas.core.tools.datetimes import to_datetime
-from pandas.core.tools.numeric import to_numeric
-from pandas.core.tools.timedeltas import to_timedelta
-
-from pandas.io.formats.format import set_eng_float_format
-from pandas.tseries.offsets import DateOffset
-
-# DataFrame needs to be imported after NamedAgg to avoid a circular import
-from pandas.core.frame import DataFrame # isort:skip
-
-__all__ = [
- "array",
- "ArrowDtype",
- "bdate_range",
- "BooleanDtype",
- "Categorical",
- "CategoricalDtype",
- "CategoricalIndex",
- "DataFrame",
- "DateOffset",
- "date_range",
- "DatetimeIndex",
- "DatetimeTZDtype",
- "factorize",
- "Flags",
- "Float32Dtype",
- "Float64Dtype",
- "Grouper",
- "Index",
- "IndexSlice",
- "Int16Dtype",
- "Int32Dtype",
- "Int64Dtype",
- "Int8Dtype",
- "Interval",
- "IntervalDtype",
- "IntervalIndex",
- "interval_range",
- "isna",
- "isnull",
- "MultiIndex",
- "NA",
- "NamedAgg",
- "NaT",
- "notna",
- "notnull",
- "Period",
- "PeriodDtype",
- "PeriodIndex",
- "period_range",
- "RangeIndex",
- "Series",
- "set_eng_float_format",
- "StringDtype",
- "Timedelta",
- "TimedeltaIndex",
- "timedelta_range",
- "Timestamp",
- "to_datetime",
- "to_numeric",
- "to_timedelta",
- "UInt16Dtype",
- "UInt32Dtype",
- "UInt64Dtype",
- "UInt8Dtype",
- "unique",
- "value_counts",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/apply.py b/contrib/python/pandas/py3/pandas/core/apply.py
deleted file mode 100644
index da049218d51..00000000000
--- a/contrib/python/pandas/py3/pandas/core/apply.py
+++ /dev/null
@@ -1,1502 +0,0 @@
-from __future__ import annotations
-
-import abc
-from collections import defaultdict
-from contextlib import nullcontext
-from functools import partial
-import inspect
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- ContextManager,
- DefaultDict,
- Dict,
- Hashable,
- Iterable,
- Iterator,
- List,
- Sequence,
- cast,
-)
-
-import numpy as np
-
-from pandas._config import option_context
-
-from pandas._libs import lib
-from pandas._typing import (
- AggFuncType,
- AggFuncTypeBase,
- AggFuncTypeDict,
- AggObjType,
- Axis,
- AxisInt,
- NDFrameT,
- npt,
-)
-from pandas.errors import SpecificationError
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.cast import is_nested_object
-from pandas.core.dtypes.common import (
- is_dict_like,
- is_extension_array_dtype,
- is_list_like,
- is_sequence,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCNDFrame,
- ABCSeries,
-)
-
-from pandas.core.algorithms import safe_sort
-from pandas.core.base import SelectionMixin
-import pandas.core.common as com
-from pandas.core.construction import ensure_wrapped_if_datetimelike
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
- from pandas.core.groupby import GroupBy
- from pandas.core.resample import Resampler
- from pandas.core.window.rolling import BaseWindow
-
-
-ResType = Dict[int, Any]
-
-
-def frame_apply(
- obj: DataFrame,
- func: AggFuncType,
- axis: Axis = 0,
- raw: bool = False,
- result_type: str | None = None,
- args=None,
- kwargs=None,
-) -> FrameApply:
- """construct and return a row or column based frame apply object"""
- axis = obj._get_axis_number(axis)
- klass: type[FrameApply]
- if axis == 0:
- klass = FrameRowApply
- elif axis == 1:
- klass = FrameColumnApply
-
- return klass(
- obj,
- func,
- raw=raw,
- result_type=result_type,
- args=args,
- kwargs=kwargs,
- )
-
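# Usage sketch for frame_apply above: axis=0 applies the function to each
# column (FrameRowApply), axis=1 to each row (FrameColumnApply).
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [10, 20]})
print(df.apply(sum, axis=0).tolist())  # [3, 30]  per column
print(df.apply(sum, axis=1).tolist())  # [11, 22] per row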
-
-class Apply(metaclass=abc.ABCMeta):
- axis: AxisInt
-
- def __init__(
- self,
- obj: AggObjType,
- func,
- raw: bool,
- result_type: str | None,
- args,
- kwargs,
- ) -> None:
- self.obj = obj
- self.raw = raw
- self.args = args or ()
- self.kwargs = kwargs or {}
-
- if result_type not in [None, "reduce", "broadcast", "expand"]:
- raise ValueError(
- "invalid value for result_type, must be one "
- "of {None, 'reduce', 'broadcast', 'expand'}"
- )
-
- self.result_type = result_type
-
- # curry if needed
- if (
- (kwargs or args)
- and not isinstance(func, (np.ufunc, str))
- and not is_list_like(func)
- ):
-
- def f(x):
- return func(x, *args, **kwargs)
-
- else:
- f = func
-
- self.orig_f: AggFuncType = func
- self.f: AggFuncType = f
-
- @abc.abstractmethod
- def apply(self) -> DataFrame | Series:
- pass
-
- def agg(self) -> DataFrame | Series | None:
- """
- Provide an implementation for the aggregators.
-
- Returns
- -------
- Result of aggregation, or None if agg cannot be performed by
- this method.
- """
- obj = self.obj
- arg = self.f
- args = self.args
- kwargs = self.kwargs
-
- if isinstance(arg, str):
- return self.apply_str()
-
- if is_dict_like(arg):
- return self.agg_dict_like()
- elif is_list_like(arg):
- # we require a list, but not a 'str'
- return self.agg_list_like()
-
- if callable(arg):
- f = com.get_cython_func(arg)
- if f and not args and not kwargs:
- return getattr(obj, f)()
-
- # caller can react
- return None
-
- def transform(self) -> DataFrame | Series:
- """
- Transform a DataFrame or Series.
-
- Returns
- -------
- DataFrame or Series
- Result of applying ``func`` along the given axis of the
- Series or DataFrame.
-
- Raises
- ------
- ValueError
- If the transform function fails or does not transform.
- """
- obj = self.obj
- func = self.orig_f
- axis = self.axis
- args = self.args
- kwargs = self.kwargs
-
- is_series = obj.ndim == 1
-
- if obj._get_axis_number(axis) == 1:
- assert not is_series
- return obj.T.transform(func, 0, *args, **kwargs).T
-
- if is_list_like(func) and not is_dict_like(func):
- func = cast(List[AggFuncTypeBase], func)
-            # Convert func to an equivalent dict
- if is_series:
- func = {com.get_callable_name(v) or v: v for v in func}
- else:
- func = {col: func for col in obj}
-
- if is_dict_like(func):
- func = cast(AggFuncTypeDict, func)
- return self.transform_dict_like(func)
-
- # func is either str or callable
- func = cast(AggFuncTypeBase, func)
- try:
- result = self.transform_str_or_callable(func)
- except TypeError:
- raise
- except Exception as err:
- raise ValueError("Transform function failed") from err
-
- # Functions that transform may return empty Series/DataFrame
- # when the dtype is not appropriate
- if (
- isinstance(result, (ABCSeries, ABCDataFrame))
- and result.empty
- and not obj.empty
- ):
- raise ValueError("Transform function failed")
- # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type
- # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy,
- # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
- # Series]"
- if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
- obj.index # type:ignore[arg-type]
- ):
- raise ValueError("Function did not transform")
-
- return result
-
- def transform_dict_like(self, func):
- """
- Compute transform in the case of a dict-like func
- """
- from pandas.core.reshape.concat import concat
-
- obj = self.obj
- args = self.args
- kwargs = self.kwargs
-
- # transform is currently only for Series/DataFrame
- assert isinstance(obj, ABCNDFrame)
-
- if len(func) == 0:
- raise ValueError("No transform functions were provided")
-
- func = self.normalize_dictlike_arg("transform", obj, func)
-
- results: dict[Hashable, DataFrame | Series] = {}
- for name, how in func.items():
- colg = obj._gotitem(name, ndim=1)
- results[name] = colg.transform(how, 0, *args, **kwargs)
- return concat(results, axis=1)
-
- def transform_str_or_callable(self, func) -> DataFrame | Series:
- """
- Compute transform in the case of a string or callable func
- """
- obj = self.obj
- args = self.args
- kwargs = self.kwargs
-
- if isinstance(func, str):
- return self._try_aggregate_string_function(obj, func, *args, **kwargs)
-
- if not args and not kwargs:
- f = com.get_cython_func(func)
- if f:
- return getattr(obj, f)()
-
- # Two possible ways to use a UDF - apply or call directly
- try:
- return obj.apply(func, args=args, **kwargs)
- except Exception:
- return func(obj, *args, **kwargs)
-
- def agg_list_like(self) -> DataFrame | Series:
- """
- Compute aggregation in the case of a list-like argument.
-
- Returns
- -------
- Result of aggregation.
- """
- from pandas.core.groupby.generic import (
- DataFrameGroupBy,
- SeriesGroupBy,
- )
- from pandas.core.reshape.concat import concat
-
- obj = self.obj
- arg = cast(List[AggFuncTypeBase], self.f)
-
- if getattr(obj, "axis", 0) == 1:
- raise NotImplementedError("axis other than 0 is not supported")
-
- if not isinstance(obj, SelectionMixin):
- # i.e. obj is Series or DataFrame
- selected_obj = obj
- elif obj._selected_obj.ndim == 1:
- # For SeriesGroupBy this matches _obj_with_exclusions
- selected_obj = obj._selected_obj
- else:
- selected_obj = obj._obj_with_exclusions
-
- results = []
- keys = []
-
- is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
- context_manager: ContextManager
- if is_groupby:
- # When as_index=False, we combine all results using indices
- # and adjust index after
- context_manager = com.temp_setattr(obj, "as_index", True)
- else:
- context_manager = nullcontext()
- with context_manager:
- # degenerate case
- if selected_obj.ndim == 1:
- for a in arg:
- colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
- if isinstance(colg, (ABCSeries, ABCDataFrame)):
- new_res = colg.aggregate(
- a, self.axis, *self.args, **self.kwargs
- )
- else:
- new_res = colg.aggregate(a, *self.args, **self.kwargs)
- results.append(new_res)
-
- # make sure we find a good name
- name = com.get_callable_name(a) or a
- keys.append(name)
-
- else:
- indices = []
- for index, col in enumerate(selected_obj):
- colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
- if isinstance(colg, (ABCSeries, ABCDataFrame)):
- new_res = colg.aggregate(
- arg, self.axis, *self.args, **self.kwargs
- )
- else:
- new_res = colg.aggregate(arg, *self.args, **self.kwargs)
- results.append(new_res)
- indices.append(index)
- keys = selected_obj.columns.take(indices)
-
- try:
- return concat(results, keys=keys, axis=1, sort=False)
- except TypeError as err:
-            # we are concatenating non-NDFrame objects,
-            # e.g. a list of scalars
- from pandas import Series
-
- result = Series(results, index=keys, name=obj.name)
- if is_nested_object(result):
- raise ValueError(
- "cannot combine transform and aggregation operations"
- ) from err
- return result
-
- def agg_dict_like(self) -> DataFrame | Series:
- """
- Compute aggregation in the case of a dict-like argument.
-
- Returns
- -------
- Result of aggregation.
- """
- from pandas import Index
- from pandas.core.groupby.generic import (
- DataFrameGroupBy,
- SeriesGroupBy,
- )
- from pandas.core.reshape.concat import concat
-
- obj = self.obj
- arg = cast(AggFuncTypeDict, self.f)
-
- if getattr(obj, "axis", 0) == 1:
- raise NotImplementedError("axis other than 0 is not supported")
-
- if not isinstance(obj, SelectionMixin):
- # i.e. obj is Series or DataFrame
- selected_obj = obj
- selection = None
- else:
- selected_obj = obj._selected_obj
- selection = obj._selection
-
- arg = self.normalize_dictlike_arg("agg", selected_obj, arg)
-
- is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
- context_manager: ContextManager
- if is_groupby:
- # When as_index=False, we combine all results using indices
- # and adjust index after
- context_manager = com.temp_setattr(obj, "as_index", True)
- else:
- context_manager = nullcontext()
- with context_manager:
- if selected_obj.ndim == 1:
- # key only used for output
- colg = obj._gotitem(selection, ndim=1)
- results = {key: colg.agg(how) for key, how in arg.items()}
- else:
- # key used for column selection and output
- results = {
- key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
- }
-
- # set the final keys
- keys = list(arg.keys())
-
- # Avoid making two isinstance calls in all and any below
- is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]
-
- # combine results
- if all(is_ndframe):
- keys_to_use: Iterable[Hashable]
- keys_to_use = [k for k in keys if not results[k].empty]
- # Have to check, if at least one DataFrame is not empty.
- keys_to_use = keys_to_use if keys_to_use != [] else keys
- if selected_obj.ndim == 2:
- # keys are columns, so we can preserve names
- ktu = Index(keys_to_use)
- ktu._set_names(selected_obj.columns.names)
- keys_to_use = ktu
-
- axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
- result = concat(
- {k: results[k] for k in keys_to_use},
- axis=axis,
- keys=keys_to_use,
- )
- elif any(is_ndframe):
- # There is a mix of NDFrames and scalars
- raise ValueError(
- "cannot perform both aggregation "
- "and transformation operations "
- "simultaneously"
- )
- else:
- from pandas import Series
-
- # we have a dict of scalars
- # GH 36212 use name only if obj is a series
- if obj.ndim == 1:
- obj = cast("Series", obj)
- name = obj.name
- else:
- name = None
-
- result = Series(results, name=name)
-
- return result
-
- def apply_str(self) -> DataFrame | Series:
- """
- Compute apply in case of a string.
-
- Returns
- -------
- result: Series or DataFrame
- """
- # Caller is responsible for checking isinstance(self.f, str)
- f = cast(str, self.f)
-
- obj = self.obj
-
- # Support for `frame.transform('method')`
- # Some methods (shift, etc.) require the axis argument, others
- # don't, so inspect and insert if necessary.
- func = getattr(obj, f, None)
- if callable(func):
- sig = inspect.getfullargspec(func)
- arg_names = (*sig.args, *sig.kwonlyargs)
- if self.axis != 0 and (
- "axis" not in arg_names or f in ("corrwith", "skew")
- ):
- raise ValueError(f"Operation {f} does not support axis=1")
- if "axis" in arg_names:
- self.kwargs["axis"] = self.axis
- return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)
-
- def apply_multiple(self) -> DataFrame | Series:
- """
- Compute apply in case of a list-like or dict-like.
-
- Returns
- -------
- result: Series, DataFrame, or None
- Result when self.f is a list-like or dict-like, None otherwise.
- """
- return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)
-
- def normalize_dictlike_arg(
- self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict
- ) -> AggFuncTypeDict:
- """
- Handler for dict-like argument.
-
- Ensures that necessary columns exist if obj is a DataFrame, and
- that a nested renamer is not passed. Also normalizes to all lists
-        when the values consist of a mix of lists and non-lists.
- """
- assert how in ("apply", "agg", "transform")
-
- # Can't use func.values(); wouldn't work for a Series
- if (
- how == "agg"
- and isinstance(obj, ABCSeries)
- and any(is_list_like(v) for _, v in func.items())
- ) or (any(is_dict_like(v) for _, v in func.items())):
- # GH 15931 - deprecation of renaming keys
- raise SpecificationError("nested renamer is not supported")
-
- if obj.ndim != 1:
- # Check for missing columns on a frame
- cols = set(func.keys()) - set(obj.columns)
- if len(cols) > 0:
- cols_sorted = list(safe_sort(list(cols)))
- raise KeyError(f"Column(s) {cols_sorted} do not exist")
-
- aggregator_types = (list, tuple, dict)
-
- # if we have a dict of any non-scalars
- # eg. {'A' : ['mean']}, normalize all to
- # be list-likes
- # Cannot use func.values() because arg may be a Series
- if any(isinstance(x, aggregator_types) for _, x in func.items()):
- new_func: AggFuncTypeDict = {}
- for k, v in func.items():
- if not isinstance(v, aggregator_types):
- new_func[k] = [v]
- else:
- new_func[k] = v
- func = new_func
- return func
-
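For context, a minimal public-API sketch of the normalization and the nested-renamer check implemented above (illustrative only; the column names are made up):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

    # A mix of scalar and list specs is normalized to all-lists internally,
    # which is why the result below is a DataFrame indexed by function name.
    df.agg({"A": "min", "B": ["min", "max"]})

    # Nested renamers are rejected by the check above:
    # df.agg({"A": {"foo": "min"}})  # raises SpecificationError("nested renamer is not supported")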
- def _try_aggregate_string_function(self, obj, arg: str, *args, **kwargs):
- """
- if arg is a string, then try to operate on it:
- - try to find a function (or attribute) on ourselves
- - try to find a numpy function
- - raise
- """
- assert isinstance(arg, str)
-
- f = getattr(obj, arg, None)
- if f is not None:
- if callable(f):
- return f(*args, **kwargs)
-
- # people may try to aggregate on a non-callable attribute
- # but don't let them think they can pass args to it
- assert len(args) == 0
- assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
- return f
-
- f = getattr(np, arg, None)
- if f is not None and hasattr(obj, "__array__"):
- # in particular exclude Window
- return f(obj, *args, **kwargs)
-
- raise AttributeError(
- f"'{arg}' is not a valid function for '{type(obj).__name__}' object"
- )
-
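A minimal sketch of the resolution order described in the docstring above, using only public calls; the numpy fallback is inferred from the code and noted as an assumption:

    import pandas as pd

    ser = pd.Series([1, 4, 9])

    ser.agg("sum")     # resolves to the Series.sum method -> 14
    # ser.agg("sqrt")  # Series has no sqrt method, so per the code above this is
    #                  # expected to fall back to np.sqrt(ser) (element-wise roots)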
-
-class NDFrameApply(Apply):
- """
- Methods shared by FrameApply and SeriesApply but
- not GroupByApply or ResamplerWindowApply
- """
-
- obj: DataFrame | Series
-
- @property
- def index(self) -> Index:
- return self.obj.index
-
- @property
- def agg_axis(self) -> Index:
- return self.obj._get_agg_axis(self.axis)
-
-
-class FrameApply(NDFrameApply):
- obj: DataFrame
-
- # ---------------------------------------------------------------
- # Abstract Methods
-
- @property
- @abc.abstractmethod
- def result_index(self) -> Index:
- pass
-
- @property
- @abc.abstractmethod
- def result_columns(self) -> Index:
- pass
-
- @property
- @abc.abstractmethod
- def series_generator(self) -> Iterator[Series]:
- pass
-
- @abc.abstractmethod
- def wrap_results_for_axis(
- self, results: ResType, res_index: Index
- ) -> DataFrame | Series:
- pass
-
- # ---------------------------------------------------------------
-
- @property
- def res_columns(self) -> Index:
- return self.result_columns
-
- @property
- def columns(self) -> Index:
- return self.obj.columns
-
- @cache_readonly
- def values(self):
- return self.obj.values
-
- @cache_readonly
- def dtypes(self) -> Series:
- return self.obj.dtypes
-
- def apply(self) -> DataFrame | Series:
- """compute the results"""
- # dispatch to agg
- if is_list_like(self.f):
- return self.apply_multiple()
-
- # all empty
- if len(self.columns) == 0 and len(self.index) == 0:
- return self.apply_empty_result()
-
- # string dispatch
- if isinstance(self.f, str):
- return self.apply_str()
-
- # ufunc
- elif isinstance(self.f, np.ufunc):
- with np.errstate(all="ignore"):
- results = self.obj._mgr.apply("apply", func=self.f)
- # _constructor will retain self.index and self.columns
- return self.obj._constructor(data=results)
-
- # broadcasting
- if self.result_type == "broadcast":
- return self.apply_broadcast(self.obj)
-
- # one axis empty
- elif not all(self.obj.shape):
- return self.apply_empty_result()
-
- # raw
- elif self.raw:
- return self.apply_raw()
-
- return self.apply_standard()
-
- def agg(self):
- obj = self.obj
- axis = self.axis
-
- # TODO: Avoid having to change state
- self.obj = self.obj if self.axis == 0 else self.obj.T
- self.axis = 0
-
- result = None
- try:
- result = super().agg()
- finally:
- self.obj = obj
- self.axis = axis
-
- if axis == 1:
- result = result.T if result is not None else result
-
- if result is None:
- result = self.obj.apply(self.orig_f, axis, args=self.args, **self.kwargs)
-
- return result
-
- def apply_empty_result(self):
- """
- we have an empty result; at least 1 axis is 0
-
- we will try to apply the function to an empty
- series in order to see if this is a reduction function
- """
- assert callable(self.f)
-
- # we are not asked to reduce or infer reduction
- # so just return a copy of the existing object
- if self.result_type not in ["reduce", None]:
- return self.obj.copy()
-
- # we may need to infer
- should_reduce = self.result_type == "reduce"
-
- from pandas import Series
-
- if not should_reduce:
- try:
- if self.axis == 0:
- r = self.f(Series([], dtype=np.float64))
- else:
- r = self.f(Series(index=self.columns, dtype=np.float64))
- except Exception:
- pass
- else:
- should_reduce = not isinstance(r, Series)
-
- if should_reduce:
- if len(self.agg_axis):
- r = self.f(Series([], dtype=np.float64))
- else:
- r = np.nan
-
- return self.obj._constructor_sliced(r, index=self.agg_axis)
- else:
- return self.obj.copy()
-
- def apply_raw(self):
- """apply to the values as a numpy array"""
-
- def wrap_function(func):
- """
-            Wrap a user-supplied function to work around a numpy issue.
-
- see https://github.com/numpy/numpy/issues/8352
- """
-
- def wrapper(*args, **kwargs):
- result = func(*args, **kwargs)
- if isinstance(result, str):
- result = np.array(result, dtype=object)
- return result
-
- return wrapper
-
- result = np.apply_along_axis(wrap_function(self.f), self.axis, self.values)
-
- # TODO: mixed type case
- if result.ndim == 2:
- return self.obj._constructor(result, index=self.index, columns=self.columns)
- else:
- return self.obj._constructor_sliced(result, index=self.agg_axis)
-
- def apply_broadcast(self, target: DataFrame) -> DataFrame:
- assert callable(self.f)
-
- result_values = np.empty_like(target.values)
-
-        # length along the axis that any 1d result must match
- result_compare = target.shape[0]
-
- for i, col in enumerate(target.columns):
- res = self.f(target[col])
- ares = np.asarray(res).ndim
-
- # must be a scalar or 1d
- if ares > 1:
- raise ValueError("too many dims to broadcast")
- if ares == 1:
- # must match return dim
- if result_compare != len(res):
- raise ValueError("cannot broadcast result")
-
- result_values[:, i] = res
-
- # we *always* preserve the original index / columns
- result = self.obj._constructor(
- result_values, index=target.index, columns=target.columns
- )
- return result
-
- def apply_standard(self):
- results, res_index = self.apply_series_generator()
-
- # wrap results
- return self.wrap_results(results, res_index)
-
- def apply_series_generator(self) -> tuple[ResType, Index]:
- assert callable(self.f)
-
- series_gen = self.series_generator
- res_index = self.result_index
-
- results = {}
-
- with option_context("mode.chained_assignment", None):
- for i, v in enumerate(series_gen):
- # ignore SettingWithCopy here in case the user mutates
- results[i] = self.f(v)
- if isinstance(results[i], ABCSeries):
- # If we have a view on v, we need to make a copy because
- # series_generator will swap out the underlying data
- results[i] = results[i].copy(deep=False)
-
- return results, res_index
-
- def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series:
- from pandas import Series
-
- # see if we can infer the results
- if len(results) > 0 and 0 in results and is_sequence(results[0]):
- return self.wrap_results_for_axis(results, res_index)
-
- # dict of scalars
-
- # the default dtype of an empty Series is `object`, but this
- # code can be hit by df.mean() where the result should have dtype
- # float64 even if it's an empty Series.
- constructor_sliced = self.obj._constructor_sliced
- if len(results) == 0 and constructor_sliced is Series:
- result = constructor_sliced(results, dtype=np.float64)
- else:
- result = constructor_sliced(results)
- result.index = res_index
-
- return result
-
- def apply_str(self) -> DataFrame | Series:
- # Caller is responsible for checking isinstance(self.f, str)
- # TODO: GH#39993 - Avoid special-casing by replacing with lambda
- if self.f == "size":
- # Special-cased because DataFrame.size returns a single scalar
- obj = self.obj
- value = obj.shape[self.axis]
- return obj._constructor_sliced(value, index=self.agg_axis)
- return super().apply_str()
-
-
-class FrameRowApply(FrameApply):
- axis: AxisInt = 0
-
- @property
- def series_generator(self):
- return (self.obj._ixs(i, axis=1) for i in range(len(self.columns)))
-
- @property
- def result_index(self) -> Index:
- return self.columns
-
- @property
- def result_columns(self) -> Index:
- return self.index
-
- def wrap_results_for_axis(
- self, results: ResType, res_index: Index
- ) -> DataFrame | Series:
- """return the results for the rows"""
-
- if self.result_type == "reduce":
- # e.g. test_apply_dict GH#8735
- res = self.obj._constructor_sliced(results)
- res.index = res_index
- return res
-
- elif self.result_type is None and all(
- isinstance(x, dict) for x in results.values()
- ):
- # Our operation was a to_dict op e.g.
- # test_apply_dict GH#8735, test_apply_reduce_to_dict GH#25196 #37544
- res = self.obj._constructor_sliced(results)
- res.index = res_index
- return res
-
- try:
- result = self.obj._constructor(data=results)
- except ValueError as err:
- if "All arrays must be of the same length" in str(err):
- # e.g. result = [[2, 3], [1.5], ['foo', 'bar']]
- # see test_agg_listlike_result GH#29587
- res = self.obj._constructor_sliced(results)
- res.index = res_index
- return res
- else:
- raise
-
- if not isinstance(results[0], ABCSeries):
- if len(result.index) == len(self.res_columns):
- result.index = self.res_columns
-
- if len(result.columns) == len(res_index):
- result.columns = res_index
-
- return result
-
-
-class FrameColumnApply(FrameApply):
- axis: AxisInt = 1
-
- def apply_broadcast(self, target: DataFrame) -> DataFrame:
- result = super().apply_broadcast(target.T)
- return result.T
-
- @property
- def series_generator(self):
- values = self.values
- values = ensure_wrapped_if_datetimelike(values)
- assert len(values) > 0
-
- # We create one Series object, and will swap out the data inside
- # of it. Kids: don't do this at home.
- ser = self.obj._ixs(0, axis=0)
- mgr = ser._mgr
-
- if is_extension_array_dtype(ser.dtype):
- # values will be incorrect for this block
- # TODO(EA2D): special case would be unnecessary with 2D EAs
- obj = self.obj
- for i in range(len(obj)):
- yield obj._ixs(i, axis=0)
-
- else:
- for arr, name in zip(values, self.index):
- # GH#35462 re-pin mgr in case setitem changed it
- ser._mgr = mgr
- mgr.set_values(arr)
- object.__setattr__(ser, "_name", name)
- yield ser
-
- @property
- def result_index(self) -> Index:
- return self.index
-
- @property
- def result_columns(self) -> Index:
- return self.columns
-
- def wrap_results_for_axis(
- self, results: ResType, res_index: Index
- ) -> DataFrame | Series:
- """return the results for the columns"""
- result: DataFrame | Series
-
- # we have requested to expand
- if self.result_type == "expand":
- result = self.infer_to_same_shape(results, res_index)
-
- # we have a non-series and don't want inference
- elif not isinstance(results[0], ABCSeries):
- result = self.obj._constructor_sliced(results)
- result.index = res_index
-
- # we may want to infer results
- else:
- result = self.infer_to_same_shape(results, res_index)
-
- return result
-
- def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
- """infer the results to the same shape as the input object"""
- result = self.obj._constructor(data=results)
- result = result.T
-
- # set the index
- result.index = res_index
-
- # infer dtypes
- result = result.infer_objects(copy=False)
-
- return result
-
-
-class SeriesApply(NDFrameApply):
- obj: Series
- axis: AxisInt = 0
-
- def __init__(
- self,
- obj: Series,
- func: AggFuncType,
- convert_dtype: bool,
- args,
- kwargs,
- ) -> None:
- self.convert_dtype = convert_dtype
-
- super().__init__(
- obj,
- func,
- raw=False,
- result_type=None,
- args=args,
- kwargs=kwargs,
- )
-
- def apply(self) -> DataFrame | Series:
- obj = self.obj
-
- if len(obj) == 0:
- return self.apply_empty_result()
-
- # dispatch to agg
- if is_list_like(self.f):
- return self.apply_multiple()
-
- if isinstance(self.f, str):
- # if we are a string, try to dispatch
- return self.apply_str()
-
- # self.f is Callable
- return self.apply_standard()
-
- def agg(self):
- result = super().agg()
- if result is None:
- f = self.f
- kwargs = self.kwargs
-
- # string, list-like, and dict-like are entirely handled in super
- assert callable(f)
-
- # we can be called from an inner function which
- # passes this meta-data
- kwargs.pop("_level", None)
-
-            # try a regular apply, this evaluates lambdas
-            # row-by-row; however, if the lambda expects a Series
-            # expression, e.g. lambda x: x - x.quantile(0.25),
-            # this will fail, so we can try a vectorized evaluation
-
- # we cannot FIRST try the vectorized evaluation, because
- # then .agg and .apply would have different semantics if the
- # operation is actually defined on the Series, e.g. str
- try:
- result = self.obj.apply(f)
- except (ValueError, AttributeError, TypeError):
- result = f(self.obj)
-
- return result
-
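A minimal sketch of the fallback described in the comments above (behavior as of the pandas version vendored here; later versions may warn about or change it):

    import pandas as pd

    ser = pd.Series([1, 2, 3, 4])

    # The lambda needs the whole Series (element-wise apply raises
    # AttributeError on scalars), so agg falls back to f(ser):
    ser.agg(lambda x: x - x.quantile(0.25))   # Series([-0.75, 0.25, 1.25, 2.25])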
- def apply_empty_result(self) -> Series:
- obj = self.obj
- return obj._constructor(dtype=obj.dtype, index=obj.index).__finalize__(
- obj, method="apply"
- )
-
- def apply_standard(self) -> DataFrame | Series:
- # caller is responsible for ensuring that f is Callable
- f = cast(Callable, self.f)
- obj = self.obj
-
- with np.errstate(all="ignore"):
- if isinstance(f, np.ufunc):
- return f(obj)
-
- # row-wise access
- if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"):
- # GH#23179 some EAs do not have `map`
- mapped = obj._values.map(f)
- else:
- values = obj.astype(object)._values
- mapped = lib.map_infer(
- values,
- f,
- convert=self.convert_dtype,
- )
-
- if len(mapped) and isinstance(mapped[0], ABCSeries):
- # GH#43986 Need to do list(mapped) in order to get treated as nested
- # See also GH#25959 regarding EA support
- return obj._constructor_expanddim(list(mapped), index=obj.index)
- else:
- return obj._constructor(mapped, index=obj.index).__finalize__(
- obj, method="apply"
- )
-
-
-class GroupByApply(Apply):
- def __init__(
- self,
- obj: GroupBy[NDFrameT],
- func: AggFuncType,
- args,
- kwargs,
- ) -> None:
- kwargs = kwargs.copy()
- self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0))
- super().__init__(
- obj,
- func,
- raw=False,
- result_type=None,
- args=args,
- kwargs=kwargs,
- )
-
- def apply(self):
- raise NotImplementedError
-
- def transform(self):
- raise NotImplementedError
-
-
-class ResamplerWindowApply(Apply):
- axis: AxisInt = 0
- obj: Resampler | BaseWindow
-
- def __init__(
- self,
- obj: Resampler | BaseWindow,
- func: AggFuncType,
- args,
- kwargs,
- ) -> None:
- super().__init__(
- obj,
- func,
- raw=False,
- result_type=None,
- args=args,
- kwargs=kwargs,
- )
-
- def apply(self):
- raise NotImplementedError
-
- def transform(self):
- raise NotImplementedError
-
-
-def reconstruct_func(
- func: AggFuncType | None, **kwargs
-) -> tuple[bool, AggFuncType | None, list[str] | None, npt.NDArray[np.intp] | None]:
- """
-    Internal function to reconstruct ``func`` depending on whether relabeling is
-    applied, and to normalize the keyword arguments to get the new order of columns.
-
- If named aggregation is applied, `func` will be None, and kwargs contains the
- column and aggregation function information to be parsed;
- If named aggregation is not applied, `func` is either string (e.g. 'min') or
- Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name
- and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]})
-
- If relabeling is True, will return relabeling, reconstructed func, column
- names, and the reconstructed order of columns.
- If relabeling is False, the columns and order will be None.
-
- Parameters
- ----------
- func: agg function (e.g. 'min' or Callable) or list of agg functions
- (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}).
- **kwargs: dict, kwargs used in is_multi_agg_with_relabel and
- normalize_keyword_aggregation function for relabelling
-
- Returns
- -------
-    relabelling: bool, whether relabelling is applied
-    func: normalized and mangled func
-    columns: list of column names
-    order: array of column indices
-
- Examples
- --------
- >>> reconstruct_func(None, **{"foo": ("col", "min")})
- (True, defaultdict(<class 'list'>, {'col': ['min']}), ('foo',), array([0]))
-
- >>> reconstruct_func("min")
- (False, 'min', None, None)
- """
- relabeling = func is None and is_multi_agg_with_relabel(**kwargs)
- columns: list[str] | None = None
- order: npt.NDArray[np.intp] | None = None
-
- if not relabeling:
- if isinstance(func, list) and len(func) > len(set(func)):
- # GH 28426 will raise error if duplicated function names are used and
- # there is no reassigned name
- raise SpecificationError(
- "Function names must be unique if there is no new column names "
- "assigned"
- )
- if func is None:
- # nicer error message
- raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")
-
- if relabeling:
- func, columns, order = normalize_keyword_aggregation(kwargs)
-
- return relabeling, func, columns, order
-
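For context, the ``**kwargs`` form handled above comes from named aggregation on a groupby, e.g.:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "col": [1, 2, 3]})

    # Internally this becomes reconstruct_func(None, foo=("col", "min")), i.e.
    # relabeling=True, func={'col': ['min']}, columns=('foo',), order=[0].
    df.groupby("key").agg(foo=("col", "min"))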
-
-def is_multi_agg_with_relabel(**kwargs) -> bool:
- """
- Check whether kwargs passed to .agg look like multi-agg with relabeling.
-
- Parameters
- ----------
- **kwargs : dict
-
- Returns
- -------
- bool
-
- Examples
- --------
- >>> is_multi_agg_with_relabel(a="max")
- False
- >>> is_multi_agg_with_relabel(a_max=("a", "max"), a_min=("a", "min"))
- True
- >>> is_multi_agg_with_relabel()
- False
- """
- return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and (
- len(kwargs) > 0
- )
-
-
-def normalize_keyword_aggregation(
- kwargs: dict,
-) -> tuple[dict, list[str], npt.NDArray[np.intp]]:
- """
- Normalize user-provided "named aggregation" kwargs.
- Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs
-    to the old Dict[str, List[scalar]].
-
- Parameters
- ----------
- kwargs : dict
-
- Returns
- -------
- aggspec : dict
- The transformed kwargs.
- columns : List[str]
- The user-provided keys.
- col_idx_order : List[int]
-        List of column indices.
-
- Examples
- --------
- >>> normalize_keyword_aggregation({"output": ("input", "sum")})
- (defaultdict(<class 'list'>, {'input': ['sum']}), ('output',), array([0]))
- """
- from pandas.core.indexes.base import Index
-
- # Normalize the aggregation functions as Mapping[column, List[func]],
- # process normally, then fixup the names.
- # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
- # May be hitting https://github.com/python/mypy/issues/5958
- # saying it doesn't have an attribute __name__
- aggspec: DefaultDict = defaultdict(list)
- order = []
- columns, pairs = list(zip(*kwargs.items()))
-
- for column, aggfunc in pairs:
- aggspec[column].append(aggfunc)
- order.append((column, com.get_callable_name(aggfunc) or aggfunc))
-
- # uniquify aggfunc name if duplicated in order list
- uniquified_order = _make_unique_kwarg_list(order)
-
-    # GH 25719: because aggspec changes the order of assigned columns in aggregation,
-    # uniquified_aggspec stores the uniquified order list and compares it with `order`
-    # based on index
- aggspec_order = [
- (column, com.get_callable_name(aggfunc) or aggfunc)
- for column, aggfuncs in aggspec.items()
- for aggfunc in aggfuncs
- ]
- uniquified_aggspec = _make_unique_kwarg_list(aggspec_order)
-
- # get the new index of columns by comparison
- col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
- return aggspec, columns, col_idx_order
-
-
-def _make_unique_kwarg_list(
- seq: Sequence[tuple[Any, Any]]
-) -> Sequence[tuple[Any, Any]]:
- """
-    Uniquify the aggfunc names of the pairs in the order list
-
-    Examples
-    --------
- >>> kwarg_list = [('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')]
- >>> _make_unique_kwarg_list(kwarg_list)
- [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
- """
- return [
- (pair[0], f"{pair[1]}_{seq[:i].count(pair)}") if seq.count(pair) > 1 else pair
- for i, pair in enumerate(seq)
- ]
-
-
-def relabel_result(
- result: DataFrame | Series,
- func: dict[str, list[Callable | str]],
- columns: Iterable[Hashable],
- order: Iterable[int],
-) -> dict[Hashable, Series]:
- """
-    Internal function to reorder the result when relabelling is True for
-    dataframe.agg, and return the reordered result as a dict.
-
-    Parameters
-    ----------
-    result: Result from aggregation
-    func: Dict of (column name, funcs)
-    columns: New column names for relabelling
-    order: New order for relabelling
-
-    Examples
-    --------
-    >>> result = DataFrame({"A": [np.nan, 2, np.nan],
-    ...          "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]}) # doctest: +SKIP
-    >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
-    >>> columns = ("foo", "aab", "bar", "dat")
-    >>> order = [0, 1, 2, 3]
-    >>> relabel_result(result, funcs, columns, order) # doctest: +SKIP
- dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
- C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
- B=Series([NaN, NaN, 2.5, 4.0], index=["foo", "aab", "bar", "dat"]))
- """
- from pandas.core.indexes.base import Index
-
- reordered_indexes = [
- pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1])
- ]
- reordered_result_in_dict: dict[Hashable, Series] = {}
- idx = 0
-
- reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1
- for col, fun in func.items():
- s = result[col].dropna()
-
- # In the `_aggregate`, the callable names are obtained and used in `result`, and
- # these names are ordered alphabetically. e.g.
- # C2 C1
- # <lambda> 1 NaN
- # amax NaN 4.0
- # max NaN 4.0
- # sum 18.0 6.0
-        # Therefore, the order of functions for each column could be shuffled
-        # accordingly, so we need to get the callable name if it is not already a
-        # parsed name, and reorder the aggregated result for each column.
- # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is
- # [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need to
- # reorder so that aggregated values map to their functions regarding the order.
-
-        # However, if only one column is being used for aggregation, there is no need
-        # to reorder since the index is not sorted, and we keep it as is in `funcs`, e.g.
- # A
- # min 1.0
- # mean 1.5
- # mean 1.5
- if reorder_mask:
- fun = [
- com.get_callable_name(f) if not isinstance(f, str) else f for f in fun
- ]
- col_idx_order = Index(s.index).get_indexer(fun)
- s = s[col_idx_order]
-
- # assign the new user-provided "named aggregation" as index names, and reindex
- # it based on the whole user-provided names.
- s.index = reordered_indexes[idx : idx + len(fun)]
- reordered_result_in_dict[col] = s.reindex(columns, copy=False)
- idx = idx + len(fun)
- return reordered_result_in_dict
-
-
-# TODO: Can't use, because mypy doesn't like us setting __name__
-# error: "partial[Any]" has no attribute "__name__"
-# the type is:
-# typing.Sequence[Callable[..., ScalarResult]]
-# -> typing.Sequence[Callable[..., ScalarResult]]:
-
-
-def _mangle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
- """
- Possibly mangle a list of aggfuncs.
-
- Parameters
- ----------
- aggfuncs : Sequence
-
- Returns
- -------
- mangled: list-like
- A new AggSpec sequence, where lambdas have been converted
- to have unique names.
-
- Notes
- -----
- If just one aggfunc is passed, the name will not be mangled.
- """
- if len(aggfuncs) <= 1:
- # don't mangle for .agg([lambda x: .])
- return aggfuncs
- i = 0
- mangled_aggfuncs = []
- for aggfunc in aggfuncs:
- if com.get_callable_name(aggfunc) == "<lambda>":
- aggfunc = partial(aggfunc)
- aggfunc.__name__ = f"<lambda_{i}>"
- i += 1
- mangled_aggfuncs.append(aggfunc)
-
- return mangled_aggfuncs
-
-
-def maybe_mangle_lambdas(agg_spec: Any) -> Any:
- """
- Make new lambdas with unique names.
-
- Parameters
- ----------
- agg_spec : Any
- An argument to GroupBy.agg.
-        Non-dict-like `agg_spec` are passed through as is.
- For dict-like `agg_spec` a new spec is returned
- with name-mangled lambdas.
-
- Returns
- -------
- mangled : Any
- Same type as the input.
-
- Examples
- --------
- >>> maybe_mangle_lambdas('sum')
- 'sum'
- >>> maybe_mangle_lambdas([lambda: 1, lambda: 2]) # doctest: +SKIP
- [<function __main__.<lambda_0>,
- <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>]
- """
- is_dict = is_dict_like(agg_spec)
- if not (is_dict or is_list_like(agg_spec)):
- return agg_spec
- mangled_aggspec = type(agg_spec)() # dict or OrderedDict
-
- if is_dict:
- for key, aggfuncs in agg_spec.items():
- if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
-                mangled_aggfuncs = _mangle_lambda_list(aggfuncs)
- else:
- mangled_aggfuncs = aggfuncs
-
- mangled_aggspec[key] = mangled_aggfuncs
- else:
-        mangled_aggspec = _mangle_lambda_list(agg_spec)
-
- return mangled_aggspec
-
-
-def validate_func_kwargs(
- kwargs: dict,
-) -> tuple[list[str], list[str | Callable[..., Any]]]:
- """
- Validates types of user-provided "named aggregation" kwargs.
- `TypeError` is raised if aggfunc is not `str` or callable.
-
- Parameters
- ----------
- kwargs : dict
-
- Returns
- -------
- columns : List[str]
-        List of user-provided keys.
- func : List[Union[str, callable[...,Any]]]
- List of user-provided aggfuncs
-
- Examples
- --------
- >>> validate_func_kwargs({'one': 'min', 'two': 'max'})
- (['one', 'two'], ['min', 'max'])
- """
- tuple_given_message = "func is expected but received {} in **kwargs."
- columns = list(kwargs)
- func = []
- for col_func in kwargs.values():
- if not (isinstance(col_func, str) or callable(col_func)):
- raise TypeError(tuple_given_message.format(type(col_func).__name__))
- func.append(col_func)
- if not columns:
- no_arg_message = "Must provide 'func' or named aggregation **kwargs."
- raise TypeError(no_arg_message)
- return columns, func
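For context, this validation backs the keyword form of named aggregation on a SeriesGroupBy, e.g.:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "col": [1, 2, 3]})

    # kwargs values must be strings or callables; validate_func_kwargs returns
    # (['smallest', 'largest'], ['min', 'max']) for this call.
    df.groupby("key")["col"].agg(smallest="min", largest="max")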
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/__init__.py b/contrib/python/pandas/py3/pandas/core/array_algos/__init__.py
deleted file mode 100644
index a7655a013c6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""
-core.array_algos is for algorithms that operate on ndarray and ExtensionArray.
-These should:
-
-- Assume that any Index, Series, or DataFrame objects have already been unwrapped.
-- Assume that any list arguments have already been cast to ndarray/EA.
-- Not depend on Index, Series, or DataFrame, nor import any of these.
-- May dispatch to ExtensionArray methods, but should not import from core.arrays.
-"""
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/datetimelike_accumulations.py b/contrib/python/pandas/py3/pandas/core/array_algos/datetimelike_accumulations.py
deleted file mode 100644
index d0c62274212..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/datetimelike_accumulations.py
+++ /dev/null
@@ -1,67 +0,0 @@
-"""
-datetimelike_accumulations.py is for accumulations of datetimelike extension arrays
-"""
-
-from __future__ import annotations
-
-from typing import Callable
-
-import numpy as np
-
-from pandas._libs import iNaT
-
-from pandas.core.dtypes.missing import isna
-
-
-def _cum_func(
- func: Callable,
- values: np.ndarray,
- *,
- skipna: bool = True,
-):
- """
- Accumulations for 1D datetimelike arrays.
-
- Parameters
- ----------
- func : np.cumsum, np.maximum.accumulate, np.minimum.accumulate
- values : np.ndarray
-        Numpy array with the values (can be of any dtype that supports the
-        operation). Values are modified in place.
- skipna : bool, default True
- Whether to skip NA.
- """
- try:
- fill_value = {
- np.maximum.accumulate: np.iinfo(np.int64).min,
- np.cumsum: 0,
- np.minimum.accumulate: np.iinfo(np.int64).max,
- }[func]
- except KeyError:
- raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray")
-
- mask = isna(values)
- y = values.view("i8")
- y[mask] = fill_value
-
- if not skipna:
- mask = np.maximum.accumulate(mask)
-
- result = func(y)
- result[mask] = iNaT
-
- if values.dtype.kind in ["m", "M"]:
- return result.view(values.dtype.base)
- return result
-
-
-def cumsum(values: np.ndarray, *, skipna: bool = True) -> np.ndarray:
- return _cum_func(np.cumsum, values, skipna=skipna)
-
-
-def cummin(values: np.ndarray, *, skipna: bool = True):
- return _cum_func(np.minimum.accumulate, values, skipna=skipna)
-
-
-def cummax(values: np.ndarray, *, skipna: bool = True):
- return _cum_func(np.maximum.accumulate, values, skipna=skipna)
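A minimal sketch of the user-facing behavior these helpers support (assuming cummin/cummax on datetime-like Series dispatch here, as in the pandas version vendored above):

    import pandas as pd

    ser = pd.Series(pd.to_datetime(["2021-01-02", None, "2021-01-01"]))

    ser.cummax()              # [2021-01-02, NaT, 2021-01-02] -- NaT is skipped
    ser.cummax(skipna=False)  # [2021-01-02, NaT, NaT]        -- NaT propagates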
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/masked_accumulations.py b/contrib/python/pandas/py3/pandas/core/array_algos/masked_accumulations.py
deleted file mode 100644
index 07113128e09..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/masked_accumulations.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""
-masked_accumulations.py is for accumulation algorithms using a mask-based approach
-for missing values.
-"""
-
-from __future__ import annotations
-
-from typing import Callable
-
-import numpy as np
-
-from pandas._typing import npt
-
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
- is_integer_dtype,
-)
-
-
-def _cum_func(
- func: Callable,
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
-):
- """
- Accumulations for 1D masked array.
-
- We will modify values in place to replace NAs with the appropriate fill value.
-
- Parameters
- ----------
- func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate
- values : np.ndarray
-        Numpy array with the values (can be of any dtype that supports the
- operation).
- mask : np.ndarray
- Boolean numpy array (True values indicate missing values).
- skipna : bool, default True
- Whether to skip NA.
- """
- dtype_info: np.iinfo | np.finfo
- if is_float_dtype(values):
- dtype_info = np.finfo(values.dtype.type)
- elif is_integer_dtype(values):
- dtype_info = np.iinfo(values.dtype.type)
- elif is_bool_dtype(values):
- # Max value of bool is 1, but since we are setting into a boolean
- # array, 255 is fine as well. Min value has to be 0 when setting
- # into the boolean array.
- dtype_info = np.iinfo(np.uint8)
- else:
- raise NotImplementedError(
- f"No masked accumulation defined for dtype {values.dtype.type}"
- )
- try:
- fill_value = {
- np.cumprod: 1,
- np.maximum.accumulate: dtype_info.min,
- np.cumsum: 0,
- np.minimum.accumulate: dtype_info.max,
- }[func]
- except KeyError:
- raise NotImplementedError(
- f"No accumulation for {func} implemented on BaseMaskedArray"
- )
-
- values[mask] = fill_value
-
- if not skipna:
- mask = np.maximum.accumulate(mask)
-
- values = func(values)
- return values, mask
-
-
-def cumsum(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
- return _cum_func(np.cumsum, values, mask, skipna=skipna)
-
-
-def cumprod(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
- return _cum_func(np.cumprod, values, mask, skipna=skipna)
-
-
-def cummin(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
- return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna)
-
-
-def cummax(values: np.ndarray, mask: npt.NDArray[np.bool_], *, skipna: bool = True):
- return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna)
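A minimal sketch of the user-facing behavior for masked (nullable) dtypes (assuming cumulative ops on masked arrays dispatch here):

    import pandas as pd

    ser = pd.Series([1, None, 3], dtype="Int64")

    ser.cumsum()              # [1, <NA>, 4]     -- the missing entry is skipped
    ser.cumsum(skipna=False)  # [1, <NA>, <NA>]  -- the mask is propagated forward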
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/masked_reductions.py b/contrib/python/pandas/py3/pandas/core/array_algos/masked_reductions.py
deleted file mode 100644
index 3096e84bb7e..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/masked_reductions.py
+++ /dev/null
@@ -1,192 +0,0 @@
-"""
-masked_reductions.py is for reduction algorithms using a mask-based approach
-for missing values.
-"""
-from __future__ import annotations
-
-from typing import Callable
-import warnings
-
-import numpy as np
-
-from pandas._libs import missing as libmissing
-from pandas._typing import (
- AxisInt,
- npt,
-)
-
-from pandas.core.nanops import check_below_min_count
-
-
-def _reductions(
- func: Callable,
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- min_count: int = 0,
- axis: AxisInt | None = None,
- **kwargs,
-):
- """
- Sum, mean or product for 1D masked array.
-
- Parameters
- ----------
- func : np.sum or np.prod
- values : np.ndarray
-        Numpy array with the values (can be of any dtype that supports the
- operation).
- mask : np.ndarray[bool]
- Boolean numpy array (True values indicate missing values).
- skipna : bool, default True
- Whether to skip NA.
- min_count : int, default 0
- The required number of valid values to perform the operation. If fewer than
- ``min_count`` non-NA values are present the result will be NA.
- axis : int, optional, default None
- """
- if not skipna:
- if mask.any(axis=axis) or check_below_min_count(values.shape, None, min_count):
- return libmissing.NA
- else:
- return func(values, axis=axis, **kwargs)
- else:
- if check_below_min_count(values.shape, mask, min_count) and (
- axis is None or values.ndim == 1
- ):
- return libmissing.NA
-
- return func(values, where=~mask, axis=axis, **kwargs)
-
-
-def sum(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- min_count: int = 0,
- axis: AxisInt | None = None,
-):
- return _reductions(
- np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
- )
-
-
-def prod(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- min_count: int = 0,
- axis: AxisInt | None = None,
-):
- return _reductions(
- np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count, axis=axis
- )
-
-
-def _minmax(
- func: Callable,
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- axis: AxisInt | None = None,
-):
- """
- Reduction for 1D masked array.
-
- Parameters
- ----------
- func : np.min or np.max
- values : np.ndarray
-        Numpy array with the values (can be of any dtype that supports the
- operation).
- mask : np.ndarray[bool]
- Boolean numpy array (True values indicate missing values).
- skipna : bool, default True
- Whether to skip NA.
- axis : int, optional, default None
- """
- if not skipna:
- if mask.any() or not values.size:
- # min/max with empty array raise in numpy, pandas returns NA
- return libmissing.NA
- else:
- return func(values)
- else:
- subset = values[~mask]
- if subset.size:
- return func(subset)
- else:
- # min/max with empty array raise in numpy, pandas returns NA
- return libmissing.NA
-
-
-def min(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- axis: AxisInt | None = None,
-):
- return _minmax(np.min, values=values, mask=mask, skipna=skipna, axis=axis)
-
-
-def max(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- axis: AxisInt | None = None,
-):
- return _minmax(np.max, values=values, mask=mask, skipna=skipna, axis=axis)
-
-
-def mean(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- axis: AxisInt | None = None,
-):
- if not values.size or mask.all():
- return libmissing.NA
- return _reductions(np.mean, values=values, mask=mask, skipna=skipna, axis=axis)
-
-
-def var(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- axis: AxisInt | None = None,
- ddof: int = 1,
-):
- if not values.size or mask.all():
- return libmissing.NA
-
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", RuntimeWarning)
- return _reductions(
- np.var, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
- )
-
-
-def std(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- *,
- skipna: bool = True,
- axis: AxisInt | None = None,
- ddof: int = 1,
-):
- if not values.size or mask.all():
- return libmissing.NA
-
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", RuntimeWarning)
- return _reductions(
- np.std, values=values, mask=mask, skipna=skipna, axis=axis, ddof=ddof
- )
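A minimal sketch of how these reductions surface for nullable dtypes (public API only):

    import pandas as pd

    ser = pd.Series([1, None, 2], dtype="Int64")

    ser.sum()              # 3    -- the NA entry is masked out
    ser.sum(skipna=False)  # <NA> -- any masked value poisons the result
    ser.sum(min_count=3)   # <NA> -- fewer than min_count valid values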
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/putmask.py b/contrib/python/pandas/py3/pandas/core/array_algos/putmask.py
deleted file mode 100644
index 3e2c711d12f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/putmask.py
+++ /dev/null
@@ -1,152 +0,0 @@
-"""
-EA-compatible analogue to np.putmask
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Any,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- ArrayLike,
- npt,
-)
-from pandas.compat import np_version_under1p21
-
-from pandas.core.dtypes.cast import infer_dtype_from
-from pandas.core.dtypes.common import is_list_like
-
-from pandas.core.arrays import ExtensionArray
-
-if TYPE_CHECKING:
- from pandas import MultiIndex
-
-
-def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
- """
- ExtensionArray-compatible implementation of np.putmask. The main
- difference is we do not handle repeating or truncating like numpy.
-
- Parameters
- ----------
- values: np.ndarray or ExtensionArray
- mask : np.ndarray[bool]
- We assume extract_bool_array has already been called.
- value : Any
- """
-
- if (
- not isinstance(values, np.ndarray)
- or (values.dtype == object and not lib.is_scalar(value))
- # GH#43424: np.putmask raises TypeError if we cannot cast between types with
- # rule = "safe", a stricter guarantee we may not have here
- or (
- isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype)
- )
- ):
- # GH#19266 using np.putmask gives unexpected results with listlike value
- # along with object dtype
- if is_list_like(value) and len(value) == len(values):
- values[mask] = value[mask]
- else:
- values[mask] = value
- else:
- # GH#37833 np.putmask is more performant than __setitem__
- np.putmask(values, mask, value)
-
-
-def putmask_without_repeat(
- values: np.ndarray, mask: npt.NDArray[np.bool_], new: Any
-) -> None:
- """
- np.putmask will truncate or repeat if `new` is a listlike with
- len(new) != len(values). We require an exact match.
-
- Parameters
- ----------
- values : np.ndarray
- mask : np.ndarray[bool]
- new : Any
- """
- if np_version_under1p21:
- new = setitem_datetimelike_compat(values, mask.sum(), new)
-
- if getattr(new, "ndim", 0) >= 1:
- new = new.astype(values.dtype, copy=False)
-
- # TODO: this prob needs some better checking for 2D cases
- nlocs = mask.sum()
- if nlocs > 0 and is_list_like(new) and getattr(new, "ndim", 1) == 1:
- shape = np.shape(new)
-        # np.shape compat in case setitem_datetimelike_compat
-        # changed an arraylike to a list, e.g. test_where_dt64_2d
- if nlocs == shape[-1]:
- # GH#30567
-            # If the length of ``new`` is less than the length of ``values``,
-            # `np.putmask` would first repeat the ``new`` array and then
-            # assign the masked values, hence producing an incorrect result.
-            # `np.place`, on the other hand, uses the ``new`` values as they are
-            # to fill the masked locations of ``values``
- np.place(values, mask, new)
- # i.e. values[mask] = new
- elif mask.shape[-1] == shape[-1] or shape[-1] == 1:
- np.putmask(values, mask, new)
- else:
- raise ValueError("cannot assign mismatch length to masked array")
- else:
- np.putmask(values, mask, new)
-
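A small numpy-only sketch of why the helper above prefers ``np.place`` when the replacement has exactly ``mask.sum()`` elements:

    import numpy as np

    values = np.zeros(4, dtype=np.int64)
    mask = np.array([True, False, True, True])

    a = values.copy()
    np.putmask(a, mask, np.array([7, 8]))   # a -> [7, 0, 7, 8]: `new` is recycled

    b = values.copy()
    np.place(b, mask, np.array([7, 8, 9]))  # b -> [7, 0, 8, 9]: `new` used positionally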
-
-def validate_putmask(
- values: ArrayLike | MultiIndex, mask: np.ndarray
-) -> tuple[npt.NDArray[np.bool_], bool]:
- """
- Validate mask and check if this putmask operation is a no-op.
- """
- mask = extract_bool_array(mask)
- if mask.shape != values.shape:
- raise ValueError("putmask: mask and data must be the same size")
-
- noop = not mask.any()
- return mask, noop
-
-
-def extract_bool_array(mask: ArrayLike) -> npt.NDArray[np.bool_]:
- """
- If we have a SparseArray or BooleanArray, convert it to ndarray[bool].
- """
- if isinstance(mask, ExtensionArray):
- # We could have BooleanArray, Sparse[bool], ...
- # Except for BooleanArray, this is equivalent to just
- # np.asarray(mask, dtype=bool)
- mask = mask.to_numpy(dtype=bool, na_value=False)
-
- mask = np.asarray(mask, dtype=bool)
- return mask
-
-
-def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other):
- """
- Parameters
- ----------
- values : np.ndarray
- num_set : int
- For putmask, this is mask.sum()
- other : Any
- """
- if values.dtype == object:
- dtype, _ = infer_dtype_from(other, pandas_dtype=True)
-
- if isinstance(dtype, np.dtype) and dtype.kind in ["m", "M"]:
- # https://github.com/numpy/numpy/issues/12550
- # timedelta64 will incorrectly cast to int
- if not is_list_like(other):
- other = [other] * num_set
- else:
- other = list(other)
-
- return other
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/quantile.py b/contrib/python/pandas/py3/pandas/core/array_algos/quantile.py
deleted file mode 100644
index d3d9cb1b29b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/quantile.py
+++ /dev/null
@@ -1,224 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._typing import (
- ArrayLike,
- Scalar,
- npt,
-)
-from pandas.compat.numpy import np_percentile_argname
-
-from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
-)
-
-
-def quantile_compat(
- values: ArrayLike, qs: npt.NDArray[np.float64], interpolation: str
-) -> ArrayLike:
- """
- Compute the quantiles of the given values for each quantile in `qs`.
-
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- qs : np.ndarray[float64]
- interpolation : str
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- if isinstance(values, np.ndarray):
- fill_value = na_value_for_dtype(values.dtype, compat=False)
- mask = isna(values)
- return quantile_with_mask(values, mask, fill_value, qs, interpolation)
- else:
- return values._quantile(qs, interpolation)
-
-
-def quantile_with_mask(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- fill_value,
- qs: npt.NDArray[np.float64],
- interpolation: str,
-) -> np.ndarray:
- """
- Compute the quantiles of the given values for each quantile in `qs`.
-
- Parameters
- ----------
- values : np.ndarray
- For ExtensionArray, this is _values_for_factorize()[0]
- mask : np.ndarray[bool]
- mask = isna(values)
-        For ExtensionArray, this is computed before calling _values_for_factorize
- fill_value : Scalar
-        The value used to fill NA entries with
- For ExtensionArray, this is _values_for_factorize()[1]
- qs : np.ndarray[float64]
- interpolation : str
- Type of interpolation
-
- Returns
- -------
- np.ndarray
-
- Notes
- -----
- Assumes values is already 2D. For ExtensionArray this means np.atleast_2d
- has been called on _values_for_factorize()[0]
-
- Quantile is computed along axis=1.
- """
- assert values.shape == mask.shape
- if values.ndim == 1:
- # unsqueeze, operate, re-squeeze
- values = np.atleast_2d(values)
- mask = np.atleast_2d(mask)
- res_values = quantile_with_mask(values, mask, fill_value, qs, interpolation)
- return res_values[0]
-
- assert values.ndim == 2
-
- is_empty = values.shape[1] == 0
-
- if is_empty:
- # create the array of na_values
- # 2d len(values) * len(qs)
- flat = np.array([fill_value] * len(qs))
- result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
- else:
- result = _nanpercentile(
- values,
- qs * 100.0,
- na_value=fill_value,
- mask=mask,
- interpolation=interpolation,
- )
-
- result = np.array(result, copy=False)
- result = result.T
-
- return result
-
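For context, a minimal public-API example of the masked quantile path (NaN entries are dropped via the mask before the percentile is computed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})

    df.quantile([0.5, 1.0])   # column a: 0.5 -> 2.0, 1.0 -> 4.0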
-
-def _nanpercentile_1d(
- values: np.ndarray,
- mask: npt.NDArray[np.bool_],
- qs: npt.NDArray[np.float64],
- na_value: Scalar,
- interpolation: str,
-) -> Scalar | np.ndarray:
- """
- Wrapper for np.percentile that skips missing values, specialized to
- 1-dimensional case.
-
- Parameters
- ----------
- values : array over which to find quantiles
- mask : ndarray[bool]
- locations in values that should be considered missing
- qs : np.ndarray[float64] of quantile indices to find
- na_value : scalar
- value to return for empty or all-null values
- interpolation : str
-
- Returns
- -------
- quantiles : scalar or array
- """
- # mask is Union[ExtensionArray, ndarray]
- values = values[~mask]
-
- if len(values) == 0:
-        # Can't pass dtype=values.dtype here because we might have na_value=np.nan
-        # with values.dtype=int64; see test_quantile_empty
- # equiv: 'np.array([na_value] * len(qs))' but much faster
- return np.full(len(qs), na_value)
-
- return np.percentile(
- values,
- qs,
- # error: No overload variant of "percentile" matches argument
- # types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
- # , "Dict[str, str]" [call-overload]
- **{np_percentile_argname: interpolation}, # type: ignore[call-overload]
- )
-
-
-def _nanpercentile(
- values: np.ndarray,
- qs: npt.NDArray[np.float64],
- *,
- na_value,
- mask: npt.NDArray[np.bool_],
- interpolation: str,
-):
- """
- Wrapper for np.percentile that skips missing values.
-
- Parameters
- ----------
- values : np.ndarray[ndim=2] over which to find quantiles
- qs : np.ndarray[float64] of quantile indices to find
- na_value : scalar
- value to return for empty or all-null values
- mask : np.ndarray[bool]
- locations in values that should be considered missing
- interpolation : str
-
- Returns
- -------
- quantiles : scalar or array
- """
-
- if values.dtype.kind in ["m", "M"]:
- # need to cast to integer to avoid rounding errors in numpy
- result = _nanpercentile(
- values.view("i8"),
- qs=qs,
- na_value=na_value.view("i8"),
- mask=mask,
- interpolation=interpolation,
- )
-
- # Note: we have to do `astype` and not view because in general we
- # have float result at this point, not i8
- return result.astype(values.dtype)
-
- if mask.any():
-        # Caller is responsible for ensuring the mask shape matches
- assert mask.shape == values.shape
- result = [
- _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
- for (val, m) in zip(list(values), list(mask))
- ]
- if values.dtype.kind == "f":
- # preserve itemsize
- result = np.array(result, dtype=values.dtype, copy=False).T
- else:
- result = np.array(result, copy=False).T
- if (
- result.dtype != values.dtype
- and not mask.all()
- and (result == result.astype(values.dtype, copy=False)).all()
- ):
-                # an all-NA column (mask.all()) is never cast back to int;
-                # e.g. values is integer dtype and result is floating dtype,
-                # only cast back to integer dtype if result values are all-integer.
- result = result.astype(values.dtype, copy=False)
- return result
- else:
- return np.percentile(
- values,
- qs,
- axis=1,
- # error: No overload variant of "percentile" matches argument types
- # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
- # "int", "Dict[str, str]" [call-overload]
- **{np_percentile_argname: interpolation}, # type: ignore[call-overload]
- )
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/replace.py b/contrib/python/pandas/py3/pandas/core/array_algos/replace.py
deleted file mode 100644
index 14bf26f40ea..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/replace.py
+++ /dev/null
@@ -1,150 +0,0 @@
-"""
-Methods used by Block.replace and related methods.
-"""
-from __future__ import annotations
-
-import operator
-import re
-from typing import (
- Any,
- Pattern,
-)
-
-import numpy as np
-
-from pandas._typing import (
- ArrayLike,
- Scalar,
- npt,
-)
-
-from pandas.core.dtypes.common import (
- is_re,
- is_re_compilable,
- is_scalar,
-)
-from pandas.core.dtypes.missing import isna
-
-
-def should_use_regex(regex: bool, to_replace: Any) -> bool:
- """
- Decide whether to treat `to_replace` as a regular expression.
- """
- if is_re(to_replace):
- regex = True
-
- regex = regex and is_re_compilable(to_replace)
-
- # Don't use regex if the pattern is empty.
- regex = regex and re.compile(to_replace).pattern != ""
- return regex
-
-
-def compare_or_regex_search(
- a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
-) -> ArrayLike:
- """
- Compare two array-like inputs of the same shape or two scalar values
-
- Calls operator.eq or re.search, depending on regex argument. If regex is
- True, perform an element-wise regex matching.
-
- Parameters
- ----------
- a : array-like
- b : scalar or regex pattern
- regex : bool
- mask : np.ndarray[bool]
-
- Returns
- -------
- mask : array-like of bool
- """
- if isna(b):
- return ~mask
-
- def _check_comparison_types(
- result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
- ):
- """
- Raises an error if the two arrays (a,b) cannot be compared.
- Otherwise, returns the comparison result as expected.
- """
- if is_scalar(result) and isinstance(a, np.ndarray):
- type_names = [type(a).__name__, type(b).__name__]
-
- type_names[0] = f"ndarray(dtype={a.dtype})"
-
- raise TypeError(
- f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
- )
-
- if not regex or not should_use_regex(regex, b):
- # TODO: should use missing.mask_missing?
- op = lambda x: operator.eq(x, b)
- else:
- op = np.vectorize(
- lambda x: bool(re.search(b, x))
- if isinstance(x, str) and isinstance(b, (str, Pattern))
- else False
- )
-
- # GH#32621 use mask to avoid comparing to NAs
- if isinstance(a, np.ndarray):
- a = a[mask]
-
- result = op(a)
-
- if isinstance(result, np.ndarray) and mask is not None:
-        # The shape of the mask can differ from that of the result
- # since we may compare only a subset of a's or b's elements
- tmp = np.zeros(mask.shape, dtype=np.bool_)
- np.place(tmp, mask, result)
- result = tmp
-
- _check_comparison_types(result, a, b)
- return result
-
-
-def replace_regex(
- values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
-) -> None:
- """
- Parameters
- ----------
- values : ArrayLike
- Object dtype.
- rx : re.Pattern
- value : Any
- mask : np.ndarray[bool], optional
-
- Notes
- -----
- Alters values in-place.
- """
-
- # deal with replacing values with objects (strings) that match but
- # whose replacement is not a string (numeric, nan, object)
- if isna(value) or not isinstance(value, str):
-
- def re_replacer(s):
- if is_re(rx) and isinstance(s, str):
- return value if rx.search(s) is not None else s
- else:
- return s
-
- else:
-        # value is guaranteed to be a string here; s can be either a string
-        # or null - if it's null, it is returned as-is
- def re_replacer(s):
- if is_re(rx) and isinstance(s, str):
- return rx.sub(value, s)
- else:
- return s
-
- f = np.vectorize(re_replacer, otypes=[np.object_])
-
- if mask is None:
- values[:] = f(values)
- else:
- values[mask] = f(values[mask])
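For context, a public-API example of the regex replacement these helpers implement (note that ``should_use_regex`` above also disables regex for an empty pattern):

    import pandas as pd

    ser = pd.Series(["foo", "fuz", "bar"])

    ser.replace(to_replace=r"^f.*", value="X", regex=True)   # ["X", "X", "bar"]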
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/take.py b/contrib/python/pandas/py3/pandas/core/array_algos/take.py
deleted file mode 100644
index 7282b0729f7..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/take.py
+++ /dev/null
@@ -1,594 +0,0 @@
-from __future__ import annotations
-
-import functools
-from typing import (
- TYPE_CHECKING,
- cast,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import (
- algos as libalgos,
- lib,
-)
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- npt,
-)
-
-from pandas.core.dtypes.cast import maybe_promote
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- is_1d_only_ea_obj,
-)
-from pandas.core.dtypes.missing import na_value_for_dtype
-
-from pandas.core.construction import ensure_wrapped_if_datetimelike
-
-if TYPE_CHECKING:
- from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
- from pandas.core.arrays.base import ExtensionArray
-
-
-@overload
-def take_nd(
- arr: np.ndarray,
- indexer,
- axis: AxisInt = ...,
- fill_value=...,
- allow_fill: bool = ...,
-) -> np.ndarray:
- ...
-
-
-@overload
-def take_nd(
- arr: ExtensionArray,
- indexer,
- axis: AxisInt = ...,
- fill_value=...,
- allow_fill: bool = ...,
-) -> ArrayLike:
- ...
-
-
-def take_nd(
- arr: ArrayLike,
- indexer,
- axis: AxisInt = 0,
- fill_value=lib.no_default,
- allow_fill: bool = True,
-) -> ArrayLike:
- """
- Specialized Cython take which sets NaN values in one pass
-
- This dispatches to ``take`` defined on ExtensionArrays. It does not
- currently dispatch to ``SparseArray.take`` for sparse ``arr``.
-
- Note: this function assumes that the indexer is a valid(ated) indexer with
- no out of bound indices.
-
- Parameters
- ----------
- arr : np.ndarray or ExtensionArray
- Input array.
- indexer : ndarray
- 1-D array of indices to take, subarrays corresponding to -1 value
-        indices are filled with fill_value
- axis : int, default 0
- Axis to take from
- fill_value : any, default np.nan
- Fill value to replace -1 values with
- allow_fill : bool, default True
- If False, indexer is assumed to contain no -1 values so no filling
- will be done. This short-circuits computation of a mask. Result is
- undefined if allow_fill == False and -1 is present in indexer.
-
- Returns
- -------
- subarray : np.ndarray or ExtensionArray
- May be the same type as the input, or cast to an ndarray.
- """
- if fill_value is lib.no_default:
- fill_value = na_value_for_dtype(arr.dtype, compat=False)
- elif isinstance(arr.dtype, np.dtype) and arr.dtype.kind in "mM":
- dtype, fill_value = maybe_promote(arr.dtype, fill_value)
- if arr.dtype != dtype:
- # EA.take is strict about returning a new object of the same type
- # so for that case cast upfront
- arr = arr.astype(dtype)
-
- if not isinstance(arr, np.ndarray):
-        # i.e. ExtensionArray;
-        # this branch also catches DatetimeArray and TimedeltaArray
- if not is_1d_only_ea_obj(arr):
- # i.e. DatetimeArray, TimedeltaArray
- arr = cast("NDArrayBackedExtensionArray", arr)
- return arr.take(
- indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
- )
-
- return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
-
- arr = np.asarray(arr)
- return _take_nd_ndarray(arr, indexer, axis, fill_value, allow_fill)
-
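For context, the public ``pandas.api.extensions.take`` wrapper exposes the same fill semantics (``-1`` indices become ``fill_value`` and the dtype is promoted if needed):

    import numpy as np
    import pandas as pd

    pd.api.extensions.take(np.array([10, 20, 30]), [0, -1, 2], allow_fill=True)
    # -> array([10., nan, 30.])  (int64 promoted to float64 to hold NaN)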
-
-def _take_nd_ndarray(
- arr: np.ndarray,
- indexer: npt.NDArray[np.intp] | None,
- axis: AxisInt,
- fill_value,
- allow_fill: bool,
-) -> np.ndarray:
- if indexer is None:
- indexer = np.arange(arr.shape[axis], dtype=np.intp)
- dtype, fill_value = arr.dtype, arr.dtype.type()
- else:
- indexer = ensure_platform_int(indexer)
-
- dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
- arr, indexer, fill_value, allow_fill
- )
-
- flip_order = False
- if arr.ndim == 2 and arr.flags.f_contiguous:
- flip_order = True
-
- if flip_order:
- arr = arr.T
- axis = arr.ndim - axis - 1
-
- # at this point, it's guaranteed that dtype can hold both the arr values
- # and the fill_value
- out_shape_ = list(arr.shape)
- out_shape_[axis] = len(indexer)
- out_shape = tuple(out_shape_)
- if arr.flags.f_contiguous and axis == arr.ndim - 1:
- # minor tweak that can make an order-of-magnitude difference
- # for dataframes initialized directly from 2-d ndarrays
- # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
- # f-contiguous transpose)
- out = np.empty(out_shape, dtype=dtype, order="F")
- else:
- out = np.empty(out_shape, dtype=dtype)
-
- func = _get_take_nd_function(
- arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
- )
- func(arr, indexer, out, fill_value)
-
- if flip_order:
- out = out.T
- return out
-
-
-def take_1d(
- arr: ArrayLike,
- indexer: npt.NDArray[np.intp],
- fill_value=None,
- allow_fill: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> ArrayLike:
- """
- Specialized version for 1D arrays. Differences compared to `take_nd`:
-
- - Assumes input array has already been converted to numpy array / EA
- - Assumes indexer is already guaranteed to be intp dtype ndarray
- - Only works for 1D arrays
-
- To ensure the lowest possible overhead.
-
- Note: similarly to `take_nd`, this function assumes that the indexer is
- a valid(ated) indexer with no out of bound indices.
-
- Parameters
- ----------
- arr : np.ndarray or ExtensionArray
- Input array.
- indexer : ndarray
- 1-D array of indices to take (validated indices, intp dtype).
- fill_value : any, default np.nan
- Fill value to replace -1 values with
- allow_fill : bool, default True
- If False, indexer is assumed to contain no -1 values so no filling
- will be done. This short-circuits computation of a mask. Result is
- undefined if allow_fill == False and -1 is present in indexer.
- mask : np.ndarray, optional, default None
- If `allow_fill` is True, and the mask (where indexer == -1) is already
- known, it can be passed to avoid recomputation.
- """
- if not isinstance(arr, np.ndarray):
- # ExtensionArray -> dispatch to their method
- return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
-
- if not allow_fill:
- return arr.take(indexer)
-
- dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
- arr, indexer, fill_value, True, mask
- )
-
- # at this point, it's guaranteed that dtype can hold both the arr values
- # and the fill_value
- out = np.empty(indexer.shape, dtype=dtype)
-
- func = _get_take_nd_function(
- arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info
- )
- func(arr, indexer, out, fill_value)
-
- return out
-
-
-def take_2d_multi(
- arr: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- fill_value=np.nan,
-) -> np.ndarray:
- """
- Specialized Cython take which sets NaN values in one pass.
- """
- # This is only called from one place in DataFrame._reindex_multi,
- # so we know indexer is well-behaved.
- assert indexer is not None
- assert indexer[0] is not None
- assert indexer[1] is not None
-
- row_idx, col_idx = indexer
-
- row_idx = ensure_platform_int(row_idx)
- col_idx = ensure_platform_int(col_idx)
- indexer = row_idx, col_idx
- mask_info = None
-
- # check for promotion based on types only (do this first because
- # it's faster than computing a mask)
- dtype, fill_value = maybe_promote(arr.dtype, fill_value)
- if dtype != arr.dtype:
- # check if promotion is actually required based on indexer
- row_mask = row_idx == -1
- col_mask = col_idx == -1
- row_needs = row_mask.any()
- col_needs = col_mask.any()
- mask_info = (row_mask, col_mask), (row_needs, col_needs)
-
- if not (row_needs or col_needs):
- # if not, then depromote, set fill_value to dummy
- # (it won't be used but we don't want the cython code
- # to crash when trying to cast it to dtype)
- dtype, fill_value = arr.dtype, arr.dtype.type()
-
- # at this point, it's guaranteed that dtype can hold both the arr values
- # and the fill_value
- out_shape = len(row_idx), len(col_idx)
- out = np.empty(out_shape, dtype=dtype)
-
- func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
- if func is None and arr.dtype != out.dtype:
- func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
- if func is not None:
- func = _convert_wrapper(func, out.dtype)
-
- if func is not None:
- func(arr, indexer, out=out, fill_value=fill_value)
- else:
- # test_reindex_multi
- _take_2d_multi_object(
- arr, indexer, out, fill_value=fill_value, mask_info=mask_info
- )
-
- return out
-
-
-@functools.lru_cache(maxsize=128)
-def _get_take_nd_function_cached(
- ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: AxisInt
-):
- """
- Part of _get_take_nd_function below that doesn't need `mask_info` and thus
- can be cached (mask_info potentially contains a numpy ndarray which is not
- hashable and thus cannot be used as argument for cached function).
- """
- tup = (arr_dtype.name, out_dtype.name)
- if ndim == 1:
- func = _take_1d_dict.get(tup, None)
- elif ndim == 2:
- if axis == 0:
- func = _take_2d_axis0_dict.get(tup, None)
- else:
- func = _take_2d_axis1_dict.get(tup, None)
- if func is not None:
- return func
-
- # We get here with string, uint, float16, and complex dtypes that could
- # potentially be handled in algos_take_helper.
- # Also a couple with (M8[ns], object) and (m8[ns], object)
- tup = (out_dtype.name, out_dtype.name)
- if ndim == 1:
- func = _take_1d_dict.get(tup, None)
- elif ndim == 2:
- if axis == 0:
- func = _take_2d_axis0_dict.get(tup, None)
- else:
- func = _take_2d_axis1_dict.get(tup, None)
- if func is not None:
- func = _convert_wrapper(func, out_dtype)
- return func
-
- return None
-
-
-def _get_take_nd_function(
- ndim: int,
- arr_dtype: np.dtype,
- out_dtype: np.dtype,
- axis: AxisInt = 0,
- mask_info=None,
-):
- """
- Get the appropriate "take" implementation for the given dimension, axis
- and dtypes.
- """
- func = None
- if ndim <= 2:
- # for this part we don't need `mask_info` -> use the cached algo lookup
- func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis)
-
- if func is None:
-
- def func(arr, indexer, out, fill_value=np.nan) -> None:
- indexer = ensure_platform_int(indexer)
- _take_nd_object(
- arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
- )
-
- return func
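A toy, numpy-only version of the lookup-with-fallback pattern that _get_take_nd_function implements: the real code maps (arr dtype, out dtype) pairs to specialized Cython kernels (the dictionaries defined below); the kernel here is only a stand-in.

import numpy as np

def take_object_fallback(arr, indexer, out, fill_value=np.nan) -> None:
    # Generic path: plain ndarray.take, then mask the -1 positions.
    arr.take(indexer, out=out)
    out[indexer == -1] = fill_value

# Stand-in for the dtype-pair -> specialized-kernel tables below.
_TOY_KERNELS = {("float64", "float64"): take_object_fallback}

def get_take_function(arr_dtype: np.dtype, out_dtype: np.dtype):
    return _TOY_KERNELS.get((arr_dtype.name, out_dtype.name), take_object_fallback)

arr = np.array([1.0, 2.0, 3.0])
indexer = np.array([2, -1, 0])
out = np.empty(len(indexer))
get_take_function(arr.dtype, out.dtype)(arr, indexer, out)
print(out)  # [ 3. nan  1.]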
-
-
-def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
- def wrapper(
- arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
- ) -> None:
- if arr_dtype is not None:
- arr = arr.view(arr_dtype)
- if out_dtype is not None:
- out = out.view(out_dtype)
- if fill_wrap is not None:
- # FIXME: if we get here with dt64/td64 we need to be sure we have
- # matching resos
- if fill_value.dtype.kind == "m":
- fill_value = fill_value.astype("m8[ns]")
- else:
- fill_value = fill_value.astype("M8[ns]")
- fill_value = fill_wrap(fill_value)
-
- f(arr, indexer, out, fill_value=fill_value)
-
- return wrapper
-
-
-def _convert_wrapper(f, conv_dtype):
- def wrapper(
- arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
- ) -> None:
- if conv_dtype == object:
- # GH#39755 avoid casting dt64/td64 to integers
- arr = ensure_wrapped_if_datetimelike(arr)
- arr = arr.astype(conv_dtype)
- f(arr, indexer, out, fill_value=fill_value)
-
- return wrapper
-
-
-_take_1d_dict = {
- ("int8", "int8"): libalgos.take_1d_int8_int8,
- ("int8", "int32"): libalgos.take_1d_int8_int32,
- ("int8", "int64"): libalgos.take_1d_int8_int64,
- ("int8", "float64"): libalgos.take_1d_int8_float64,
- ("int16", "int16"): libalgos.take_1d_int16_int16,
- ("int16", "int32"): libalgos.take_1d_int16_int32,
- ("int16", "int64"): libalgos.take_1d_int16_int64,
- ("int16", "float64"): libalgos.take_1d_int16_float64,
- ("int32", "int32"): libalgos.take_1d_int32_int32,
- ("int32", "int64"): libalgos.take_1d_int32_int64,
- ("int32", "float64"): libalgos.take_1d_int32_float64,
- ("int64", "int64"): libalgos.take_1d_int64_int64,
- ("int64", "float64"): libalgos.take_1d_int64_float64,
- ("float32", "float32"): libalgos.take_1d_float32_float32,
- ("float32", "float64"): libalgos.take_1d_float32_float64,
- ("float64", "float64"): libalgos.take_1d_float64_float64,
- ("object", "object"): libalgos.take_1d_object_object,
- ("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8),
- ("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None),
- ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
- libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
- ),
- ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
- libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64
- ),
-}
-
-_take_2d_axis0_dict = {
- ("int8", "int8"): libalgos.take_2d_axis0_int8_int8,
- ("int8", "int32"): libalgos.take_2d_axis0_int8_int32,
- ("int8", "int64"): libalgos.take_2d_axis0_int8_int64,
- ("int8", "float64"): libalgos.take_2d_axis0_int8_float64,
- ("int16", "int16"): libalgos.take_2d_axis0_int16_int16,
- ("int16", "int32"): libalgos.take_2d_axis0_int16_int32,
- ("int16", "int64"): libalgos.take_2d_axis0_int16_int64,
- ("int16", "float64"): libalgos.take_2d_axis0_int16_float64,
- ("int32", "int32"): libalgos.take_2d_axis0_int32_int32,
- ("int32", "int64"): libalgos.take_2d_axis0_int32_int64,
- ("int32", "float64"): libalgos.take_2d_axis0_int32_float64,
- ("int64", "int64"): libalgos.take_2d_axis0_int64_int64,
- ("int64", "float64"): libalgos.take_2d_axis0_int64_float64,
- ("float32", "float32"): libalgos.take_2d_axis0_float32_float32,
- ("float32", "float64"): libalgos.take_2d_axis0_float32_float64,
- ("float64", "float64"): libalgos.take_2d_axis0_float64_float64,
- ("object", "object"): libalgos.take_2d_axis0_object_object,
- ("bool", "bool"): _view_wrapper(
- libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8
- ),
- ("bool", "object"): _view_wrapper(
- libalgos.take_2d_axis0_bool_object, np.uint8, None
- ),
- ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
- libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
- ),
- ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
- libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
- ),
-}
-
-_take_2d_axis1_dict = {
- ("int8", "int8"): libalgos.take_2d_axis1_int8_int8,
- ("int8", "int32"): libalgos.take_2d_axis1_int8_int32,
- ("int8", "int64"): libalgos.take_2d_axis1_int8_int64,
- ("int8", "float64"): libalgos.take_2d_axis1_int8_float64,
- ("int16", "int16"): libalgos.take_2d_axis1_int16_int16,
- ("int16", "int32"): libalgos.take_2d_axis1_int16_int32,
- ("int16", "int64"): libalgos.take_2d_axis1_int16_int64,
- ("int16", "float64"): libalgos.take_2d_axis1_int16_float64,
- ("int32", "int32"): libalgos.take_2d_axis1_int32_int32,
- ("int32", "int64"): libalgos.take_2d_axis1_int32_int64,
- ("int32", "float64"): libalgos.take_2d_axis1_int32_float64,
- ("int64", "int64"): libalgos.take_2d_axis1_int64_int64,
- ("int64", "float64"): libalgos.take_2d_axis1_int64_float64,
- ("float32", "float32"): libalgos.take_2d_axis1_float32_float32,
- ("float32", "float64"): libalgos.take_2d_axis1_float32_float64,
- ("float64", "float64"): libalgos.take_2d_axis1_float64_float64,
- ("object", "object"): libalgos.take_2d_axis1_object_object,
- ("bool", "bool"): _view_wrapper(
- libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8
- ),
- ("bool", "object"): _view_wrapper(
- libalgos.take_2d_axis1_bool_object, np.uint8, None
- ),
- ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
- libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
- ),
- ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
- libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
- ),
-}
-
-_take_2d_multi_dict = {
- ("int8", "int8"): libalgos.take_2d_multi_int8_int8,
- ("int8", "int32"): libalgos.take_2d_multi_int8_int32,
- ("int8", "int64"): libalgos.take_2d_multi_int8_int64,
- ("int8", "float64"): libalgos.take_2d_multi_int8_float64,
- ("int16", "int16"): libalgos.take_2d_multi_int16_int16,
- ("int16", "int32"): libalgos.take_2d_multi_int16_int32,
- ("int16", "int64"): libalgos.take_2d_multi_int16_int64,
- ("int16", "float64"): libalgos.take_2d_multi_int16_float64,
- ("int32", "int32"): libalgos.take_2d_multi_int32_int32,
- ("int32", "int64"): libalgos.take_2d_multi_int32_int64,
- ("int32", "float64"): libalgos.take_2d_multi_int32_float64,
- ("int64", "int64"): libalgos.take_2d_multi_int64_int64,
- ("int64", "float64"): libalgos.take_2d_multi_int64_float64,
- ("float32", "float32"): libalgos.take_2d_multi_float32_float32,
- ("float32", "float64"): libalgos.take_2d_multi_float32_float64,
- ("float64", "float64"): libalgos.take_2d_multi_float64_float64,
- ("object", "object"): libalgos.take_2d_multi_object_object,
- ("bool", "bool"): _view_wrapper(
- libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8
- ),
- ("bool", "object"): _view_wrapper(
- libalgos.take_2d_multi_bool_object, np.uint8, None
- ),
- ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
- libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
- ),
- ("timedelta64[ns]", "timedelta64[ns]"): _view_wrapper(
- libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
- ),
-}
-
-
-def _take_nd_object(
- arr: np.ndarray,
- indexer: npt.NDArray[np.intp],
- out: np.ndarray,
- axis: AxisInt,
- fill_value,
- mask_info,
-) -> None:
- if mask_info is not None:
- mask, needs_masking = mask_info
- else:
- mask = indexer == -1
- needs_masking = mask.any()
- if arr.dtype != out.dtype:
- arr = arr.astype(out.dtype)
- if arr.shape[axis] > 0:
- arr.take(indexer, axis=axis, out=out)
- if needs_masking:
- outindexer = [slice(None)] * arr.ndim
- outindexer[axis] = mask
- out[tuple(outindexer)] = fill_value
-
-
-def _take_2d_multi_object(
- arr: np.ndarray,
- indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
- out: np.ndarray,
- fill_value,
- mask_info,
-) -> None:
- # this is not ideal, performance-wise, but it's better than raising
- # an exception (best to optimize in Cython to avoid getting here)
- row_idx, col_idx = indexer # both np.intp
- if mask_info is not None:
- (row_mask, col_mask), (row_needs, col_needs) = mask_info
- else:
- row_mask = row_idx == -1
- col_mask = col_idx == -1
- row_needs = row_mask.any()
- col_needs = col_mask.any()
- if fill_value is not None:
- if row_needs:
- out[row_mask, :] = fill_value
- if col_needs:
- out[:, col_mask] = fill_value
- for i, u_ in enumerate(row_idx):
- if u_ != -1:
- for j, v in enumerate(col_idx):
- if v != -1:
- out[i, j] = arr[u_, v]
-
-
-def _take_preprocess_indexer_and_fill_value(
- arr: np.ndarray,
- indexer: npt.NDArray[np.intp],
- fill_value,
- allow_fill: bool,
- mask: npt.NDArray[np.bool_] | None = None,
-):
- mask_info: tuple[np.ndarray | None, bool] | None = None
-
- if not allow_fill:
- dtype, fill_value = arr.dtype, arr.dtype.type()
- mask_info = None, False
- else:
- # check for promotion based on types only (do this first because
- # it's faster than computing a mask)
- dtype, fill_value = maybe_promote(arr.dtype, fill_value)
- if dtype != arr.dtype:
- # check if promotion is actually required based on indexer
- if mask is not None:
- needs_masking = True
- else:
- mask = indexer == -1
- needs_masking = bool(mask.any())
- mask_info = mask, needs_masking
- if not needs_masking:
- # if not, then depromote, set fill_value to dummy
- # (it won't be used but we don't want the cython code
- # to crash when trying to cast it to dtype)
- dtype, fill_value = arr.dtype, arr.dtype.type()
-
- return dtype, fill_value, mask_info
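For readers skimming this removal, a minimal numpy-only sketch of the "-1 means fill" take contract documented above (promote the dtype, take, then overwrite the -1 positions); this is an illustration, not the Cython-backed implementation being deleted.

import numpy as np

def take_with_fill(arr: np.ndarray, indexer: np.ndarray, fill_value=np.nan) -> np.ndarray:
    # Promote so the result can hold both the original values and fill_value.
    dtype = np.result_type(arr.dtype, np.asarray(fill_value).dtype)
    out = arr.astype(dtype, copy=False).take(indexer)
    # Positions where the (validated) indexer is -1 receive fill_value.
    out[indexer == -1] = fill_value
    return out

print(take_with_fill(np.array([10, 20, 30], dtype=np.int64), np.array([0, -1, 2])))
# -> [10. nan 30.]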
diff --git a/contrib/python/pandas/py3/pandas/core/array_algos/transforms.py b/contrib/python/pandas/py3/pandas/core/array_algos/transforms.py
deleted file mode 100644
index 56648189f17..00000000000
--- a/contrib/python/pandas/py3/pandas/core/array_algos/transforms.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-transforms.py is for shape-preserving functions.
-"""
-
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._typing import AxisInt
-
-
-def shift(values: np.ndarray, periods: int, axis: AxisInt, fill_value) -> np.ndarray:
- new_values = values
-
- if periods == 0 or values.size == 0:
- return new_values.copy()
-
- # make sure array sent to np.roll is c_contiguous
- f_ordered = values.flags.f_contiguous
- if f_ordered:
- new_values = new_values.T
- axis = new_values.ndim - axis - 1
-
- if new_values.size:
- new_values = np.roll(
- new_values,
- np.intp(periods),
- axis=axis,
- )
-
- axis_indexer = [slice(None)] * values.ndim
- if periods > 0:
- axis_indexer[axis] = slice(None, periods)
- else:
- axis_indexer[axis] = slice(periods, None)
- new_values[tuple(axis_indexer)] = fill_value
-
- # restore original order
- if f_ordered:
- new_values = new_values.T
-
- return new_values
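A 1-D illustration of the shift above (np.roll, then fill the slots that wrapped around); it assumes a float ndarray and skips the 2-D and contiguity handling of the removed code.

import numpy as np

def shift_1d(values: np.ndarray, periods: int, fill_value=np.nan) -> np.ndarray:
    if periods == 0 or values.size == 0:
        return values.copy()
    out = np.roll(values, periods)
    # np.roll wraps values around; overwrite the wrapped slots with fill_value.
    if periods > 0:
        out[:periods] = fill_value
    else:
        out[periods:] = fill_value
    return out

print(shift_1d(np.array([1.0, 2.0, 3.0, 4.0]), 2))   # [nan nan  1.  2.]
print(shift_1d(np.array([1.0, 2.0, 3.0, 4.0]), -1))  # [ 2.  3.  4. nan]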
diff --git a/contrib/python/pandas/py3/pandas/core/arraylike.py b/contrib/python/pandas/py3/pandas/core/arraylike.py
deleted file mode 100644
index 1d10d797866..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arraylike.py
+++ /dev/null
@@ -1,527 +0,0 @@
-"""
-Methods that can be shared by many array-like classes or subclasses:
- Series
- Index
- ExtensionArray
-"""
-from __future__ import annotations
-
-import operator
-from typing import Any
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
-
-from pandas.core.dtypes.generic import ABCNDFrame
-
-from pandas.core import roperator
-from pandas.core.construction import extract_array
-from pandas.core.ops.common import unpack_zerodim_and_defer
-
-REDUCTION_ALIASES = {
- "maximum": "max",
- "minimum": "min",
- "add": "sum",
- "multiply": "prod",
-}
-
-
-class OpsMixin:
- # -------------------------------------------------------------
- # Comparisons
-
- def _cmp_method(self, other, op):
- return NotImplemented
-
- @unpack_zerodim_and_defer("__eq__")
- def __eq__(self, other):
- return self._cmp_method(other, operator.eq)
-
- @unpack_zerodim_and_defer("__ne__")
- def __ne__(self, other):
- return self._cmp_method(other, operator.ne)
-
- @unpack_zerodim_and_defer("__lt__")
- def __lt__(self, other):
- return self._cmp_method(other, operator.lt)
-
- @unpack_zerodim_and_defer("__le__")
- def __le__(self, other):
- return self._cmp_method(other, operator.le)
-
- @unpack_zerodim_and_defer("__gt__")
- def __gt__(self, other):
- return self._cmp_method(other, operator.gt)
-
- @unpack_zerodim_and_defer("__ge__")
- def __ge__(self, other):
- return self._cmp_method(other, operator.ge)
-
- # -------------------------------------------------------------
- # Logical Methods
-
- def _logical_method(self, other, op):
- return NotImplemented
-
- @unpack_zerodim_and_defer("__and__")
- def __and__(self, other):
- return self._logical_method(other, operator.and_)
-
- @unpack_zerodim_and_defer("__rand__")
- def __rand__(self, other):
- return self._logical_method(other, roperator.rand_)
-
- @unpack_zerodim_and_defer("__or__")
- def __or__(self, other):
- return self._logical_method(other, operator.or_)
-
- @unpack_zerodim_and_defer("__ror__")
- def __ror__(self, other):
- return self._logical_method(other, roperator.ror_)
-
- @unpack_zerodim_and_defer("__xor__")
- def __xor__(self, other):
- return self._logical_method(other, operator.xor)
-
- @unpack_zerodim_and_defer("__rxor__")
- def __rxor__(self, other):
- return self._logical_method(other, roperator.rxor)
-
- # -------------------------------------------------------------
- # Arithmetic Methods
-
- def _arith_method(self, other, op):
- return NotImplemented
-
- @unpack_zerodim_and_defer("__add__")
- def __add__(self, other):
- """
- Get Addition of DataFrame and other, column-wise.
-
- Equivalent to ``DataFrame.add(other)``.
-
- Parameters
- ----------
- other : scalar, sequence, Series, dict or DataFrame
- Object to be added to the DataFrame.
-
- Returns
- -------
- DataFrame
- The result of adding ``other`` to DataFrame.
-
- See Also
- --------
- DataFrame.add : Add a DataFrame and another object, with option for index-
- or column-oriented addition.
-
- Examples
- --------
- >>> df = pd.DataFrame({'height': [1.5, 2.6], 'weight': [500, 800]},
- ... index=['elk', 'moose'])
- >>> df
- height weight
- elk 1.5 500
- moose 2.6 800
-
- Adding a scalar affects all rows and columns.
-
- >>> df[['height', 'weight']] + 1.5
- height weight
- elk 3.0 501.5
- moose 4.1 801.5
-
- Each element of a list is added to a column of the DataFrame, in order.
-
- >>> df[['height', 'weight']] + [0.5, 1.5]
- height weight
- elk 2.0 501.5
- moose 3.1 801.5
-
- Keys of a dictionary are aligned to the DataFrame, based on column names;
- each value in the dictionary is added to the corresponding column.
-
- >>> df[['height', 'weight']] + {'height': 0.5, 'weight': 1.5}
- height weight
- elk 2.0 501.5
- moose 3.1 801.5
-
- When `other` is a :class:`Series`, the index of `other` is aligned with the
- columns of the DataFrame.
-
- >>> s1 = pd.Series([0.5, 1.5], index=['weight', 'height'])
- >>> df[['height', 'weight']] + s1
- height weight
- elk 3.0 500.5
- moose 4.1 800.5
-
- Even when the index of `other` is the same as the index of the DataFrame,
- the :class:`Series` will not be reoriented. If index-wise alignment is desired,
- :meth:`DataFrame.add` should be used with `axis='index'`.
-
- >>> s2 = pd.Series([0.5, 1.5], index=['elk', 'moose'])
- >>> df[['height', 'weight']] + s2
- elk height moose weight
- elk NaN NaN NaN NaN
- moose NaN NaN NaN NaN
-
- >>> df[['height', 'weight']].add(s2, axis='index')
- height weight
- elk 2.0 500.5
- moose 4.1 801.5
-
- When `other` is a :class:`DataFrame`, both columns names and the
- index are aligned.
-
- >>> other = pd.DataFrame({'height': [0.2, 0.4, 0.6]},
- ... index=['elk', 'moose', 'deer'])
- >>> df[['height', 'weight']] + other
- height weight
- deer NaN NaN
- elk 1.7 NaN
- moose 3.0 NaN
- """
- return self._arith_method(other, operator.add)
-
- @unpack_zerodim_and_defer("__radd__")
- def __radd__(self, other):
- return self._arith_method(other, roperator.radd)
-
- @unpack_zerodim_and_defer("__sub__")
- def __sub__(self, other):
- return self._arith_method(other, operator.sub)
-
- @unpack_zerodim_and_defer("__rsub__")
- def __rsub__(self, other):
- return self._arith_method(other, roperator.rsub)
-
- @unpack_zerodim_and_defer("__mul__")
- def __mul__(self, other):
- return self._arith_method(other, operator.mul)
-
- @unpack_zerodim_and_defer("__rmul__")
- def __rmul__(self, other):
- return self._arith_method(other, roperator.rmul)
-
- @unpack_zerodim_and_defer("__truediv__")
- def __truediv__(self, other):
- return self._arith_method(other, operator.truediv)
-
- @unpack_zerodim_and_defer("__rtruediv__")
- def __rtruediv__(self, other):
- return self._arith_method(other, roperator.rtruediv)
-
- @unpack_zerodim_and_defer("__floordiv__")
- def __floordiv__(self, other):
- return self._arith_method(other, operator.floordiv)
-
- @unpack_zerodim_and_defer("__rfloordiv")
- def __rfloordiv__(self, other):
- return self._arith_method(other, roperator.rfloordiv)
-
- @unpack_zerodim_and_defer("__mod__")
- def __mod__(self, other):
- return self._arith_method(other, operator.mod)
-
- @unpack_zerodim_and_defer("__rmod__")
- def __rmod__(self, other):
- return self._arith_method(other, roperator.rmod)
-
- @unpack_zerodim_and_defer("__divmod__")
- def __divmod__(self, other):
- return self._arith_method(other, divmod)
-
- @unpack_zerodim_and_defer("__rdivmod__")
- def __rdivmod__(self, other):
- return self._arith_method(other, roperator.rdivmod)
-
- @unpack_zerodim_and_defer("__pow__")
- def __pow__(self, other):
- return self._arith_method(other, operator.pow)
-
- @unpack_zerodim_and_defer("__rpow__")
- def __rpow__(self, other):
- return self._arith_method(other, roperator.rpow)
-
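A hypothetical, self-contained example of the pattern OpsMixin encodes: every arithmetic dunder funnels into a single _arith_method hook, so a subclass only implements that hook. The Boxed class is illustrative and not part of pandas.

import operator

class Boxed:
    """Toy value wrapper using the OpsMixin-style single hook."""

    def __init__(self, value):
        self.value = value

    def _arith_method(self, other, op):
        other = other.value if isinstance(other, Boxed) else other
        return Boxed(op(self.value, other))

    def __add__(self, other):
        return self._arith_method(other, operator.add)

    def __mul__(self, other):
        return self._arith_method(other, operator.mul)

print((Boxed(3) + Boxed(4)).value)  # 7
print((Boxed(3) * 5).value)         # 15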
-
-# -----------------------------------------------------------------------------
-# Helpers to implement __array_ufunc__
-
-
-def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any):
- """
- Compatibility with numpy ufuncs.
-
- See also
- --------
- numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
- """
- from pandas.core.frame import (
- DataFrame,
- Series,
- )
- from pandas.core.generic import NDFrame
- from pandas.core.internals import BlockManager
-
- cls = type(self)
-
- kwargs = _standardize_out_kwarg(**kwargs)
-
- # for binary ops, use our custom dunder methods
- result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
- if result is not NotImplemented:
- return result
-
- # Determine if we should defer.
- no_defer = (
- np.ndarray.__array_ufunc__,
- cls.__array_ufunc__,
- )
-
- for item in inputs:
- higher_priority = (
- hasattr(item, "__array_priority__")
- and item.__array_priority__ > self.__array_priority__
- )
- has_array_ufunc = (
- hasattr(item, "__array_ufunc__")
- and type(item).__array_ufunc__ not in no_defer
- and not isinstance(item, self._HANDLED_TYPES)
- )
- if higher_priority or has_array_ufunc:
- return NotImplemented
-
- # align all the inputs.
- types = tuple(type(x) for x in inputs)
- alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)]
-
- if len(alignable) > 1:
- # This triggers alignment.
- # At the moment, there aren't any ufuncs with more than two inputs
- # so this ends up just being x1.index | x2.index, but we write
- # it to handle *args.
- set_types = set(types)
- if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types):
- # We currently don't handle ufunc(DataFrame, Series)
- # well. Previously this raised an internal ValueError. We might
- # support it someday, so raise a NotImplementedError.
- raise NotImplementedError(
- f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
- )
- axes = self.axes
- for obj in alignable[1:]:
- # this relies on the fact that we aren't handling mixed
- # series / frame ufuncs.
- for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)):
- axes[i] = ax1.union(ax2)
-
- reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes))
- inputs = tuple(
- x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
- for x, t in zip(inputs, types)
- )
- else:
- reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes))
-
- if self.ndim == 1:
- names = [getattr(x, "name") for x in inputs if hasattr(x, "name")]
- name = names[0] if len(set(names)) == 1 else None
- reconstruct_kwargs = {"name": name}
- else:
- reconstruct_kwargs = {}
-
- def reconstruct(result):
- if ufunc.nout > 1:
- # np.modf, np.frexp, np.divmod
- return tuple(_reconstruct(x) for x in result)
-
- return _reconstruct(result)
-
- def _reconstruct(result):
- if lib.is_scalar(result):
- return result
-
- if result.ndim != self.ndim:
- if method == "outer":
- raise NotImplementedError
- return result
- if isinstance(result, BlockManager):
- # we went through BlockManager.apply e.g. np.sqrt
- result = self._constructor(result, **reconstruct_kwargs, copy=False)
- else:
- # we converted an array, lost our axes
- result = self._constructor(
- result, **reconstruct_axes, **reconstruct_kwargs, copy=False
- )
- # TODO: When we support multiple values in __finalize__, this
- # should pass alignable to `__finalize__` instead of self.
- # Then `np.add(a, b)` would consider attrs from both a and b
- # when a and b are NDFrames.
- if len(alignable) == 1:
- result = result.__finalize__(self)
- return result
-
- if "out" in kwargs:
- # e.g. test_multiindex_get_loc
- result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs)
- return reconstruct(result)
-
- if method == "reduce":
- # e.g. test.series.test_ufunc.test_reduce
- result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs)
- if result is not NotImplemented:
- return result
-
- # We still get here with kwargs `axis` for e.g. np.maximum.accumulate
- # and `dtype` and `keepdims` for np.ptp
-
- if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
- # Just give up on preserving types in the complex case.
-        # In theory we could preserve the types in these cases as well.
- # * nout>1 is doable if BlockManager.apply took nout and
- # returned a Tuple[BlockManager].
- # * len(inputs) > 1 is doable when we know that we have
- # aligned blocks / dtypes.
-
- # e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add
- inputs = tuple(np.asarray(x) for x in inputs)
- # Note: we can't use default_array_ufunc here bc reindexing means
- # that `self` may not be among `inputs`
- result = getattr(ufunc, method)(*inputs, **kwargs)
- elif self.ndim == 1:
- # ufunc(series, ...)
- inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
- result = getattr(ufunc, method)(*inputs, **kwargs)
- else:
- # ufunc(dataframe)
- if method == "__call__" and not kwargs:
- # for np.<ufunc>(..) calls
- # kwargs cannot necessarily be handled block-by-block, so only
- # take this path if there are no kwargs
- mgr = inputs[0]._mgr
- result = mgr.apply(getattr(ufunc, method))
- else:
- # otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
- # Those can have an axis keyword and thus can't be called block-by-block
- result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)
- # e.g. np.negative (only one reached), with "where" and "out" in kwargs
-
- result = reconstruct(result)
- return result
-
-
-def _standardize_out_kwarg(**kwargs) -> dict:
- """
-    If kwargs contain "out1" and "out2", replace them with a tuple "out".
-
-    np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or
-    `out1=out1, out2=out2`.
- """
- if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs:
- out1 = kwargs.pop("out1")
- out2 = kwargs.pop("out2")
- out = (out1, out2)
- kwargs["out"] = out
- return kwargs
-
-
-def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- """
- If we have an `out` keyword, then call the ufunc without `out` and then
- set the result into the given `out`.
- """
-
- # Note: we assume _standardize_out_kwarg has already been called.
- out = kwargs.pop("out")
- where = kwargs.pop("where", None)
-
- result = getattr(ufunc, method)(*inputs, **kwargs)
-
- if result is NotImplemented:
- return NotImplemented
-
- if isinstance(result, tuple):
- # i.e. np.divmod, np.modf, np.frexp
- if not isinstance(out, tuple) or len(out) != len(result):
- raise NotImplementedError
-
- for arr, res in zip(out, result):
- _assign_where(arr, res, where)
-
- return out
-
- if isinstance(out, tuple):
- if len(out) == 1:
- out = out[0]
- else:
- raise NotImplementedError
-
- _assign_where(out, result, where)
- return out
-
-
-def _assign_where(out, result, where) -> None:
- """
- Set a ufunc result into 'out', masking with a 'where' argument if necessary.
- """
- if where is None:
- # no 'where' arg passed to ufunc
- out[:] = result
- else:
- np.putmask(out, where, result)
-
-
-def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- """
- Fallback to the behavior we would get if we did not define __array_ufunc__.
-
- Notes
- -----
- We are assuming that `self` is among `inputs`.
- """
- if not any(x is self for x in inputs):
- raise NotImplementedError
-
- new_inputs = [x if x is not self else np.asarray(x) for x in inputs]
-
- return getattr(ufunc, method)(*new_inputs, **kwargs)
-
-
-def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- """
- Dispatch ufunc reductions to self's reduction methods.
- """
- assert method == "reduce"
-
- if len(inputs) != 1 or inputs[0] is not self:
- return NotImplemented
-
- if ufunc.__name__ not in REDUCTION_ALIASES:
- return NotImplemented
-
- method_name = REDUCTION_ALIASES[ufunc.__name__]
-
- # NB: we are assuming that min/max represent minimum/maximum methods,
- # which would not be accurate for e.g. Timestamp.min
- if not hasattr(self, method_name):
- return NotImplemented
-
- if self.ndim > 1:
- if isinstance(self, ABCNDFrame):
- # TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA
- kwargs["numeric_only"] = False
-
- if "axis" not in kwargs:
- # For DataFrame reductions we don't want the default axis=0
- # Note: np.min is not a ufunc, but uses array_function_dispatch,
- # so calls DataFrame.min (without ever getting here) with the np.min
- # default of axis=None, which DataFrame.min catches and changes to axis=0.
- # np.minimum.reduce(df) gets here bc axis is not in kwargs,
-            # so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
- kwargs["axis"] = 0
-
- # By default, numpy's reductions do not skip NaNs, so we have to
- # pass skipna=False
- return getattr(self, method_name)(skipna=False, **kwargs)
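A standalone sketch of the reduction-dispatch idea in dispatch_reduction_ufunc above: translate the ufunc name through REDUCTION_ALIASES and call the matching method with skipna=False. DummySeries is a hypothetical stand-in, not a pandas object.

import numpy as np

REDUCTION_ALIASES = {"maximum": "max", "minimum": "min", "add": "sum", "multiply": "prod"}

class DummySeries:
    def __init__(self, values: np.ndarray) -> None:
        self.values = values

    def sum(self, skipna: bool = True):
        return np.nansum(self.values) if skipna else self.values.sum()

def dispatch_reduction(obj, ufunc: np.ufunc):
    method_name = REDUCTION_ALIASES.get(ufunc.__name__)
    if method_name is None or not hasattr(obj, method_name):
        return NotImplemented
    # numpy reductions do not skip NaNs, so neither does the dispatched call.
    return getattr(obj, method_name)(skipna=False)

print(dispatch_reduction(DummySeries(np.array([1.0, 2.0, np.nan])), np.add))  # nan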
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/__init__.py b/contrib/python/pandas/py3/pandas/core/arrays/__init__.py
deleted file mode 100644
index 79be8760db9..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/__init__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from pandas.core.arrays.arrow import ArrowExtensionArray
-from pandas.core.arrays.base import (
- ExtensionArray,
- ExtensionOpsMixin,
- ExtensionScalarOpsMixin,
-)
-from pandas.core.arrays.boolean import BooleanArray
-from pandas.core.arrays.categorical import Categorical
-from pandas.core.arrays.datetimes import DatetimeArray
-from pandas.core.arrays.floating import FloatingArray
-from pandas.core.arrays.integer import IntegerArray
-from pandas.core.arrays.interval import IntervalArray
-from pandas.core.arrays.masked import BaseMaskedArray
-from pandas.core.arrays.numpy_ import PandasArray
-from pandas.core.arrays.period import (
- PeriodArray,
- period_array,
-)
-from pandas.core.arrays.sparse import SparseArray
-from pandas.core.arrays.string_ import StringArray
-from pandas.core.arrays.string_arrow import ArrowStringArray
-from pandas.core.arrays.timedeltas import TimedeltaArray
-
-__all__ = [
- "ArrowExtensionArray",
- "ExtensionArray",
- "ExtensionOpsMixin",
- "ExtensionScalarOpsMixin",
- "ArrowStringArray",
- "BaseMaskedArray",
- "BooleanArray",
- "Categorical",
- "DatetimeArray",
- "FloatingArray",
- "IntegerArray",
- "IntervalArray",
- "PandasArray",
- "PeriodArray",
- "period_array",
- "SparseArray",
- "StringArray",
- "TimedeltaArray",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/_mixins.py b/contrib/python/pandas/py3/pandas/core/arrays/_mixins.py
deleted file mode 100644
index 8804582798d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/_mixins.py
+++ /dev/null
@@ -1,496 +0,0 @@
-from __future__ import annotations
-
-from functools import wraps
-from typing import (
- TYPE_CHECKING,
- Any,
- Literal,
- Sequence,
- TypeVar,
- cast,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.arrays import NDArrayBacked
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- Dtype,
- F,
- PositionalIndexer2D,
- PositionalIndexerTuple,
- ScalarIndexer,
- SequenceIndexer,
- Shape,
- TakeIndexer,
- npt,
- type_t,
-)
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import doc
-from pandas.util._validators import (
- validate_bool_kwarg,
- validate_fillna_kwargs,
- validate_insert_loc,
-)
-
-from pandas.core.dtypes.common import (
- is_dtype_equal,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- DatetimeTZDtype,
- ExtensionDtype,
- PeriodDtype,
-)
-from pandas.core.dtypes.missing import array_equivalent
-
-from pandas.core import missing
-from pandas.core.algorithms import (
- take,
- unique,
- value_counts,
-)
-from pandas.core.array_algos.quantile import quantile_with_mask
-from pandas.core.array_algos.transforms import shift
-from pandas.core.arrays.base import ExtensionArray
-from pandas.core.construction import extract_array
-from pandas.core.indexers import check_array_indexer
-from pandas.core.sorting import nargminmax
-
-NDArrayBackedExtensionArrayT = TypeVar(
- "NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray"
-)
-
-if TYPE_CHECKING:
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
-
- from pandas import Series
-
-
-def ravel_compat(meth: F) -> F:
- """
- Decorator to ravel a 2D array before passing it to a cython operation,
- then reshape the result to our own shape.
- """
-
- @wraps(meth)
- def method(self, *args, **kwargs):
- if self.ndim == 1:
- return meth(self, *args, **kwargs)
-
- flags = self._ndarray.flags
- flat = self.ravel("K")
- result = meth(flat, *args, **kwargs)
- order = "F" if flags.f_contiguous else "C"
- return result.reshape(self.shape, order=order)
-
- return cast(F, method)
-
-
-class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
- """
- ExtensionArray that is backed by a single NumPy ndarray.
- """
-
- _ndarray: np.ndarray
-
- # scalar used to denote NA value inside our self._ndarray, e.g. -1
- # for Categorical, iNaT for Period. Outside of object dtype,
- # self.isna() should be exactly locations in self._ndarray with
- # _internal_fill_value.
- _internal_fill_value: Any
-
- def _box_func(self, x):
- """
- Wrap numpy type in our dtype.type if necessary.
- """
- return x
-
- def _validate_scalar(self, value):
- # used by NDArrayBackedExtensionIndex.insert
- raise AbstractMethodError(self)
-
- # ------------------------------------------------------------------------
-
- def view(self, dtype: Dtype | None = None) -> ArrayLike:
- # We handle datetime64, datetime64tz, timedelta64, and period
- # dtypes here. Everything else we pass through to the underlying
- # ndarray.
- if dtype is None or dtype is self.dtype:
- return self._from_backing_data(self._ndarray)
-
- if isinstance(dtype, type):
-            # we sometimes pass non-dtype objects, e.g. np.ndarray;
- # pass those through to the underlying ndarray
- return self._ndarray.view(dtype)
-
- dtype = pandas_dtype(dtype)
- arr = self._ndarray
-
- if isinstance(dtype, (PeriodDtype, DatetimeTZDtype)):
- cls = dtype.construct_array_type()
- return cls(arr.view("i8"), dtype=dtype)
- elif dtype == "M8[ns]":
- from pandas.core.arrays import DatetimeArray
-
- return DatetimeArray(arr.view("i8"), dtype=dtype)
- elif dtype == "m8[ns]":
- from pandas.core.arrays import TimedeltaArray
-
- return TimedeltaArray(arr.view("i8"), dtype=dtype)
-
- # error: Argument "dtype" to "view" of "_ArrayOrScalarCommon" has incompatible
- # type "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None,
- # type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int,
- # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
- return arr.view(dtype=dtype) # type: ignore[arg-type]
-
- def take(
- self: NDArrayBackedExtensionArrayT,
- indices: TakeIndexer,
- *,
- allow_fill: bool = False,
- fill_value: Any = None,
- axis: AxisInt = 0,
- ) -> NDArrayBackedExtensionArrayT:
- if allow_fill:
- fill_value = self._validate_scalar(fill_value)
-
- new_data = take(
- self._ndarray,
- indices,
- allow_fill=allow_fill,
- fill_value=fill_value,
- axis=axis,
- )
- return self._from_backing_data(new_data)
-
- # ------------------------------------------------------------------------
-
- def equals(self, other) -> bool:
- if type(self) is not type(other):
- return False
- if not is_dtype_equal(self.dtype, other.dtype):
- return False
- return bool(array_equivalent(self._ndarray, other._ndarray))
-
- @classmethod
- def _from_factorized(cls, values, original):
- assert values.dtype == original._ndarray.dtype
- return original._from_backing_data(values)
-
- def _values_for_argsort(self) -> np.ndarray:
- return self._ndarray
-
- def _values_for_factorize(self):
- return self._ndarray, self._internal_fill_value
-
- # Signature of "argmin" incompatible with supertype "ExtensionArray"
- def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
- # override base class by adding axis keyword
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return nargminmax(self, "argmin", axis=axis)
-
- # Signature of "argmax" incompatible with supertype "ExtensionArray"
- def argmax(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override]
- # override base class by adding axis keyword
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return nargminmax(self, "argmax", axis=axis)
-
- def unique(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT:
- new_data = unique(self._ndarray)
- return self._from_backing_data(new_data)
-
- @classmethod
- @doc(ExtensionArray._concat_same_type)
- def _concat_same_type(
- cls: type[NDArrayBackedExtensionArrayT],
- to_concat: Sequence[NDArrayBackedExtensionArrayT],
- axis: AxisInt = 0,
- ) -> NDArrayBackedExtensionArrayT:
- dtypes = {str(x.dtype) for x in to_concat}
- if len(dtypes) != 1:
- raise ValueError("to_concat must have the same dtype (tz)", dtypes)
-
- new_values = [x._ndarray for x in to_concat]
- new_arr = np.concatenate(new_values, axis=axis)
- return to_concat[0]._from_backing_data(new_arr)
-
- @doc(ExtensionArray.searchsorted)
- def searchsorted(
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- npvalue = self._validate_setitem_value(value)
- return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)
-
- @doc(ExtensionArray.shift)
- def shift(self, periods: int = 1, fill_value=None, axis: AxisInt = 0):
- fill_value = self._validate_scalar(fill_value)
- new_values = shift(self._ndarray, periods, axis, fill_value)
-
- return self._from_backing_data(new_values)
-
- def __setitem__(self, key, value) -> None:
- key = check_array_indexer(self, key)
- value = self._validate_setitem_value(value)
- self._ndarray[key] = value
-
- def _validate_setitem_value(self, value):
- return value
-
- @overload
- def __getitem__(self, key: ScalarIndexer) -> Any:
- ...
-
- @overload
- def __getitem__(
- self: NDArrayBackedExtensionArrayT,
- key: SequenceIndexer | PositionalIndexerTuple,
- ) -> NDArrayBackedExtensionArrayT:
- ...
-
- def __getitem__(
- self: NDArrayBackedExtensionArrayT,
- key: PositionalIndexer2D,
- ) -> NDArrayBackedExtensionArrayT | Any:
- if lib.is_integer(key):
- # fast-path
- result = self._ndarray[key]
- if self.ndim == 1:
- return self._box_func(result)
- return self._from_backing_data(result)
-
- # error: Incompatible types in assignment (expression has type "ExtensionArray",
- # variable has type "Union[int, slice, ndarray]")
- key = extract_array(key, extract_numpy=True) # type: ignore[assignment]
- key = check_array_indexer(self, key)
- result = self._ndarray[key]
- if lib.is_scalar(result):
- return self._box_func(result)
-
- result = self._from_backing_data(result)
- return result
-
- def _fill_mask_inplace(
- self, method: str, limit, mask: npt.NDArray[np.bool_]
- ) -> None:
- # (for now) when self.ndim == 2, we assume axis=0
- func = missing.get_fill_func(method, ndim=self.ndim)
- func(self._ndarray.T, limit=limit, mask=mask.T)
-
- @doc(ExtensionArray.fillna)
- def fillna(
- self: NDArrayBackedExtensionArrayT, value=None, method=None, limit=None
- ) -> NDArrayBackedExtensionArrayT:
- value, method = validate_fillna_kwargs(
- value, method, validate_scalar_dict_value=False
- )
-
- mask = self.isna()
- # error: Argument 2 to "check_value_size" has incompatible type
- # "ExtensionArray"; expected "ndarray"
- value = missing.check_value_size(
- value, mask, len(self) # type: ignore[arg-type]
- )
-
- if mask.any():
- if method is not None:
- # TODO: check value is None
- # (for now) when self.ndim == 2, we assume axis=0
- func = missing.get_fill_func(method, ndim=self.ndim)
- npvalues = self._ndarray.T.copy()
- func(npvalues, limit=limit, mask=mask.T)
- npvalues = npvalues.T
-
-                # TODO: PandasArray previously did not copy; need tests for this
- new_values = self._from_backing_data(npvalues)
- else:
- # fill with value
- new_values = self.copy()
- new_values[mask] = value
- else:
- # We validate the fill_value even if there is nothing to fill
- if value is not None:
- self._validate_setitem_value(value)
-
- new_values = self.copy()
- return new_values
-
- # ------------------------------------------------------------------------
- # Reductions
-
- def _wrap_reduction_result(self, axis: AxisInt | None, result):
- if axis is None or self.ndim == 1:
- return self._box_func(result)
- return self._from_backing_data(result)
-
- # ------------------------------------------------------------------------
- # __array_function__ methods
-
- def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
- """
- Analogue to np.putmask(self, mask, value)
-
- Parameters
- ----------
- mask : np.ndarray[bool]
- value : scalar or listlike
-
- Raises
- ------
- TypeError
- If value cannot be cast to self.dtype.
- """
- value = self._validate_setitem_value(value)
-
- np.putmask(self._ndarray, mask, value)
-
- def _where(
- self: NDArrayBackedExtensionArrayT, mask: npt.NDArray[np.bool_], value
- ) -> NDArrayBackedExtensionArrayT:
- """
- Analogue to np.where(mask, self, value)
-
- Parameters
- ----------
- mask : np.ndarray[bool]
- value : scalar or listlike
-
- Raises
- ------
- TypeError
- If value cannot be cast to self.dtype.
- """
- value = self._validate_setitem_value(value)
-
- res_values = np.where(mask, self._ndarray, value)
- return self._from_backing_data(res_values)
-
- # ------------------------------------------------------------------------
- # Index compat methods
-
- def insert(
- self: NDArrayBackedExtensionArrayT, loc: int, item
- ) -> NDArrayBackedExtensionArrayT:
- """
- Make new ExtensionArray inserting new item at location. Follows
- Python list.append semantics for negative values.
-
- Parameters
- ----------
- loc : int
- item : object
-
- Returns
- -------
- type(self)
- """
- loc = validate_insert_loc(loc, len(self))
-
- code = self._validate_scalar(item)
-
- new_vals = np.concatenate(
- (
- self._ndarray[:loc],
- np.asarray([code], dtype=self._ndarray.dtype),
- self._ndarray[loc:],
- )
- )
- return self._from_backing_data(new_vals)
-
- # ------------------------------------------------------------------------
- # Additional array methods
- # These are not part of the EA API, but we implement them because
- # pandas assumes they're there.
-
- def value_counts(self, dropna: bool = True) -> Series:
- """
- Return a Series containing counts of unique values.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include counts of NA values.
-
- Returns
- -------
- Series
- """
- if self.ndim != 1:
- raise NotImplementedError
-
- from pandas import (
- Index,
- Series,
- )
-
- if dropna:
- # error: Unsupported operand type for ~ ("ExtensionArray")
- values = self[~self.isna()]._ndarray # type: ignore[operator]
- else:
- values = self._ndarray
-
- result = value_counts(values, sort=False, dropna=dropna)
-
- index_arr = self._from_backing_data(np.asarray(result.index._data))
- index = Index(index_arr, name=result.index.name)
- return Series(result._values, index=index, name=result.name, copy=False)
-
- def _quantile(
- self: NDArrayBackedExtensionArrayT,
- qs: npt.NDArray[np.float64],
- interpolation: str,
- ) -> NDArrayBackedExtensionArrayT:
- # TODO: disable for Categorical if not ordered?
-
- mask = np.asarray(self.isna())
- arr = self._ndarray
- fill_value = self._internal_fill_value
-
- res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
-
- res_values = self._cast_quantile_result(res_values)
- return self._from_backing_data(res_values)
-
- # TODO: see if we can share this with other dispatch-wrapping methods
- def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
- """
- Cast the result of quantile_with_mask to an appropriate dtype
- to pass to _from_backing_data in _quantile.
- """
- return res_values
-
- # ------------------------------------------------------------------------
- # numpy-like methods
-
- @classmethod
- def _empty(
- cls: type_t[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype
- ) -> NDArrayBackedExtensionArrayT:
- """
- Analogous to np.empty(shape, dtype=dtype)
-
- Parameters
- ----------
- shape : tuple[int]
- dtype : ExtensionDtype
- """
- # The base implementation uses a naive approach to find the dtype
- # for the backing ndarray
- arr = cls._from_sequence([], dtype=dtype)
- backing = np.empty(shape, dtype=arr._ndarray.dtype)
- return arr._from_backing_data(backing)
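A numpy-only illustration of the insert() pattern above: slice the backing ndarray around loc and concatenate, preserving the backing dtype. It is not the NDArrayBackedExtensionArray class itself.

import numpy as np

def insert_into_backing(ndarray: np.ndarray, loc: int, item) -> np.ndarray:
    # Mirrors the concatenate-based insert used by the backed arrays.
    return np.concatenate(
        (ndarray[:loc], np.asarray([item], dtype=ndarray.dtype), ndarray[loc:])
    )

print(insert_into_backing(np.array([1, 2, 4], dtype=np.int64), 2, 3))  # [1 2 3 4]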
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/_ranges.py b/contrib/python/pandas/py3/pandas/core/arrays/_ranges.py
deleted file mode 100644
index c93fc946853..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/_ranges.py
+++ /dev/null
@@ -1,209 +0,0 @@
-"""
-Helper functions to generate range-like data for DatetimeArray
-(and possibly TimedeltaArray/PeriodArray)
-"""
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._libs.lib import i8max
-from pandas._libs.tslibs import (
- BaseOffset,
- OutOfBoundsDatetime,
- Timedelta,
- Timestamp,
- iNaT,
-)
-from pandas._typing import npt
-
-
-def generate_regular_range(
- start: Timestamp | Timedelta | None,
- end: Timestamp | Timedelta | None,
- periods: int | None,
- freq: BaseOffset,
- unit: str = "ns",
-) -> npt.NDArray[np.intp]:
- """
- Generate a range of dates or timestamps with the spans between dates
- described by the given `freq` DateOffset.
-
- Parameters
- ----------
- start : Timedelta, Timestamp or None
- First point of produced date range.
- end : Timedelta, Timestamp or None
- Last point of produced date range.
- periods : int or None
- Number of periods in produced date range.
- freq : Tick
- Describes space between dates in produced date range.
- unit : str, default "ns"
- The resolution the output is meant to represent.
-
- Returns
- -------
- ndarray[np.int64]
- Representing the given resolution.
- """
- istart = start._value if start is not None else None
- iend = end._value if end is not None else None
- freq.nanos # raises if non-fixed frequency
- td = Timedelta(freq)
- try:
- td = td.as_unit( # pyright: ignore[reportGeneralTypeIssues]
- unit, round_ok=False
- )
- except ValueError as err:
- raise ValueError(
- f"freq={freq} is incompatible with unit={unit}. "
- "Use a lower freq or a higher unit instead."
- ) from err
- stride = int(td._value)
-
- if periods is None and istart is not None and iend is not None:
- b = istart
- # cannot just use e = Timestamp(end) + 1 because arange breaks when
- # stride is too large, see GH10887
- e = b + (iend - b) // stride * stride + stride // 2 + 1
- elif istart is not None and periods is not None:
- b = istart
- e = _generate_range_overflow_safe(b, periods, stride, side="start")
- elif iend is not None and periods is not None:
- e = iend + stride
- b = _generate_range_overflow_safe(e, periods, stride, side="end")
- else:
- raise ValueError(
-            "at least 'start' or 'end' should be specified if 'periods' is given."
- )
-
- with np.errstate(over="raise"):
- # If the range is sufficiently large, np.arange may overflow
- # and incorrectly return an empty array if not caught.
- try:
- values = np.arange(b, e, stride, dtype=np.int64)
- except FloatingPointError:
- xdr = [b]
- while xdr[-1] != e:
- xdr.append(xdr[-1] + stride)
- values = np.array(xdr[:-1], dtype=np.int64)
- return values
-
-
-def _generate_range_overflow_safe(
- endpoint: int, periods: int, stride: int, side: str = "start"
-) -> int:
- """
- Calculate the second endpoint for passing to np.arange, checking
- to avoid an integer overflow. Catch OverflowError and re-raise
- as OutOfBoundsDatetime.
-
- Parameters
- ----------
- endpoint : int
- nanosecond timestamp of the known endpoint of the desired range
- periods : int
- number of periods in the desired range
- stride : int
- nanoseconds between periods in the desired range
- side : {'start', 'end'}
- which end of the range `endpoint` refers to
-
- Returns
- -------
- other_end : int
-
- Raises
- ------
- OutOfBoundsDatetime
- """
- # GH#14187 raise instead of incorrectly wrapping around
- assert side in ["start", "end"]
-
- i64max = np.uint64(i8max)
- msg = f"Cannot generate range with {side}={endpoint} and periods={periods}"
-
- with np.errstate(over="raise"):
- # if periods * strides cannot be multiplied within the *uint64* bounds,
- # we cannot salvage the operation by recursing, so raise
- try:
- addend = np.uint64(periods) * np.uint64(np.abs(stride))
- except FloatingPointError as err:
- raise OutOfBoundsDatetime(msg) from err
-
- if np.abs(addend) <= i64max:
- # relatively easy case without casting concerns
- return _generate_range_overflow_safe_signed(endpoint, periods, stride, side)
-
- elif (endpoint > 0 and side == "start" and stride > 0) or (
- endpoint < 0 < stride and side == "end"
- ):
- # no chance of not-overflowing
- raise OutOfBoundsDatetime(msg)
-
- elif side == "end" and endpoint - stride <= i64max < endpoint:
- # in _generate_regular_range we added `stride` thereby overflowing
- # the bounds. Adjust to fix this.
- return _generate_range_overflow_safe(
- endpoint - stride, periods - 1, stride, side
- )
-
- # split into smaller pieces
- mid_periods = periods // 2
- remaining = periods - mid_periods
- assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
-
- midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side)
- return _generate_range_overflow_safe(midpoint, remaining, stride, side)
-
-
-def _generate_range_overflow_safe_signed(
- endpoint: int, periods: int, stride: int, side: str
-) -> int:
- """
- A special case for _generate_range_overflow_safe where `periods * stride`
- can be calculated without overflowing int64 bounds.
- """
- assert side in ["start", "end"]
- if side == "end":
- stride *= -1
-
- with np.errstate(over="raise"):
- addend = np.int64(periods) * np.int64(stride)
- try:
- # easy case with no overflows
- result = np.int64(endpoint) + addend
- if result == iNaT:
- # Putting this into a DatetimeArray/TimedeltaArray
- # would incorrectly be interpreted as NaT
- raise OverflowError
- # error: Incompatible return value type (got "signedinteger[_64Bit]",
- # expected "int")
- return result # type: ignore[return-value]
- except (FloatingPointError, OverflowError):
- # with endpoint negative and addend positive we risk
- # FloatingPointError; with reversed signed we risk OverflowError
- pass
-
- # if stride and endpoint had opposite signs, then endpoint + addend
-    # should never overflow, so they must have the same signs
- assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
-
- if stride > 0:
- # watch out for very special case in which we just slightly
- # exceed implementation bounds, but when passing the result to
- # np.arange will get a result slightly within the bounds
-
- # error: Incompatible types in assignment (expression has type
- # "unsignedinteger[_64Bit]", variable has type "signedinteger[_64Bit]")
- result = np.uint64(endpoint) + np.uint64(addend) # type: ignore[assignment]
- i64max = np.uint64(i8max)
- assert result > i64max
- if result <= i64max + np.uint64(stride):
- # error: Incompatible return value type (got "unsignedinteger", expected
- # "int")
- return result # type: ignore[return-value]
-
- raise OutOfBoundsDatetime(
- f"Cannot generate range with {side}={endpoint} and periods={periods}"
- )
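A plain-integer sketch of the endpoint arithmetic generate_regular_range uses when both start and end are given (values are integer ticks, e.g. nanoseconds); it ignores the overflow-safe paths shown above.

import numpy as np

def regular_range_i8(istart: int, iend: int, stride: int) -> np.ndarray:
    # Round the span down to the stride grid, then pad by half a stride so
    # np.arange includes the last on-grid point without overshooting.
    e = istart + (iend - istart) // stride * stride + stride // 2 + 1
    return np.arange(istart, e, stride, dtype=np.int64)

print(regular_range_i8(0, 10, 3))  # [0 3 6 9]
print(regular_range_i8(0, 9, 3))   # [0 3 6 9]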
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/arrow/__init__.py b/contrib/python/pandas/py3/pandas/core/arrays/arrow/__init__.py
deleted file mode 100644
index e7fa6fae0a5..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/arrow/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from pandas.core.arrays.arrow.array import ArrowExtensionArray
-from pandas.core.arrays.arrow.dtype import ArrowDtype
-
-__all__ = ["ArrowDtype", "ArrowExtensionArray"]
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/arrow/_arrow_utils.py b/contrib/python/pandas/py3/pandas/core/arrays/arrow/_arrow_utils.py
deleted file mode 100644
index 6e6ef6a2c20..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/arrow/_arrow_utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from __future__ import annotations
-
-import warnings
-
-import numpy as np
-import pyarrow
-
-from pandas.errors import PerformanceWarning
-from pandas.util._exceptions import find_stack_level
-
-
-def fallback_performancewarning(version: str | None = None) -> None:
- """
- Raise a PerformanceWarning for falling back to ExtensionArray's
- non-pyarrow method
- """
- msg = "Falling back on a non-pyarrow code path which may decrease performance."
- if version is not None:
- msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning."
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
-
-
-def pyarrow_array_to_numpy_and_mask(
- arr, dtype: np.dtype
-) -> tuple[np.ndarray, np.ndarray]:
- """
- Convert a primitive pyarrow.Array to a numpy array and boolean mask based
- on the buffers of the Array.
-
- At the moment pyarrow.BooleanArray is not supported.
-
- Parameters
- ----------
- arr : pyarrow.Array
- dtype : numpy.dtype
-
- Returns
- -------
- (data, mask)
- Tuple of two numpy arrays with the raw data (with specified dtype) and
- a boolean mask (validity mask, so False means missing)
- """
- dtype = np.dtype(dtype)
-
- buflist = arr.buffers()
- # Since Arrow buffers might contain padding and the data might be offset,
- # the buffer gets sliced here before handing it to numpy.
- # See also https://github.com/pandas-dev/pandas/issues/40896
- offset = arr.offset * dtype.itemsize
- length = len(arr) * dtype.itemsize
- data_buf = buflist[1][offset : offset + length]
- data = np.frombuffer(data_buf, dtype=dtype)
- bitmask = buflist[0]
- if bitmask is not None:
- mask = pyarrow.BooleanArray.from_buffers(
- pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset
- )
- mask = np.asarray(mask)
- else:
- mask = np.ones(len(arr), dtype=bool)
- return data, mask
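A hedged usage sketch for pyarrow_array_to_numpy_and_mask, assuming pyarrow is installed and the function from the module above is in scope; the data value behind a null slot is unspecified, so the mask must be consulted.

import numpy as np
import pyarrow as pa

arr = pa.array([1, 2, None], type=pa.int64())
data, mask = pyarrow_array_to_numpy_and_mask(arr, np.dtype("int64"))
print(mask)        # [ True  True False]  -- validity mask, False means missing
print(data[mask])  # [1 2]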
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/arrow/array.py b/contrib/python/pandas/py3/pandas/core/arrays/arrow/array.py
deleted file mode 100644
index 445ec36135d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/arrow/array.py
+++ /dev/null
@@ -1,2206 +0,0 @@
-from __future__ import annotations
-
-from copy import deepcopy
-import functools
-import operator
-import re
-import sys
-import textwrap
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Literal,
- Sequence,
- TypeVar,
- cast,
-)
-import unicodedata
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- Dtype,
- FillnaOptions,
- Iterator,
- NpDtype,
- PositionalIndexer,
- Scalar,
- SortKind,
- TakeIndexer,
- TimeAmbiguous,
- TimeNonexistent,
- npt,
-)
-from pandas.compat import (
- pa_version_under7p0,
- pa_version_under8p0,
- pa_version_under9p0,
- pa_version_under11p0,
-)
-from pandas.util._decorators import doc
-from pandas.util._validators import validate_fillna_kwargs
-
-from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import roperator
-from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays.base import (
- ExtensionArray,
- ExtensionArraySupportsAnyAll,
-)
-import pandas.core.common as com
-from pandas.core.indexers import (
- check_array_indexer,
- unpack_tuple_and_ellipses,
- validate_indices,
-)
-from pandas.core.strings.base import BaseStringArrayMethods
-
-from pandas.tseries.frequencies import to_offset
-
-if not pa_version_under7p0:
- import pyarrow as pa
- import pyarrow.compute as pc
-
- from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
- from pandas.core.arrays.arrow.dtype import ArrowDtype
-
- ARROW_CMP_FUNCS = {
- "eq": pc.equal,
- "ne": pc.not_equal,
- "lt": pc.less,
- "gt": pc.greater,
- "le": pc.less_equal,
- "ge": pc.greater_equal,
- }
-
- ARROW_LOGICAL_FUNCS = {
- "and_": pc.and_kleene,
- "rand_": lambda x, y: pc.and_kleene(y, x),
- "or_": pc.or_kleene,
- "ror_": lambda x, y: pc.or_kleene(y, x),
- "xor": pc.xor,
- "rxor": lambda x, y: pc.xor(y, x),
- }
-
- def cast_for_truediv(
- arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
- ) -> pa.ChunkedArray:
- # Ensure int / int -> float mirroring Python/Numpy behavior
- # as pc.divide_checked(int, int) -> int
- if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
- pa_object.type
- ):
- return arrow_array.cast(pa.float64())
- return arrow_array
-
- def floordiv_compat(
- left: pa.ChunkedArray | pa.Array | pa.Scalar,
- right: pa.ChunkedArray | pa.Array | pa.Scalar,
- ) -> pa.ChunkedArray:
- # Ensure int // int -> int mirroring Python/Numpy behavior
- # as pc.floor(pc.divide_checked(int, int)) -> float
- result = pc.floor(pc.divide(left, right))
- if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
- result = result.cast(left.type)
- return result
-
- ARROW_ARITHMETIC_FUNCS = {
- "add": pc.add_checked,
- "radd": lambda x, y: pc.add_checked(y, x),
- "sub": pc.subtract_checked,
- "rsub": lambda x, y: pc.subtract_checked(y, x),
- "mul": pc.multiply_checked,
- "rmul": lambda x, y: pc.multiply_checked(y, x),
- "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
- "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
- "floordiv": lambda x, y: floordiv_compat(x, y),
- "rfloordiv": lambda x, y: floordiv_compat(y, x),
- "mod": NotImplemented,
- "rmod": NotImplemented,
- "divmod": NotImplemented,
- "rdivmod": NotImplemented,
- "pow": pc.power_checked,
- "rpow": lambda x, y: pc.power_checked(y, x),
- }
-
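The two helpers above work around Arrow's C-style integer division. A minimal
sketch of the difference, assuming pyarrow >= 7 is installed (the arrays and
values are made up for illustration):

import pyarrow as pa
import pyarrow.compute as pc

ints = pa.chunked_array([[7, 8, None]])
twos = pa.array([2, 2, 2])

# Two integer inputs give truncating integer division, unlike Python's `/`.
pc.divide(ints, twos)                     # -> [3, 4, null]

# Casting the dividend to float64 first, as cast_for_truediv does above,
# recovers true division.
pc.divide(ints.cast(pa.float64()), twos)  # -> [3.5, 4, null]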
-if TYPE_CHECKING:
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
-
- from pandas import Series
-
-ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
-
-
-def get_unit_from_pa_dtype(pa_dtype):
- # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
- if pa_version_under11p0:
- unit = str(pa_dtype).split("[", 1)[-1][:-1]
- if unit not in ["s", "ms", "us", "ns"]:
- raise ValueError(pa_dtype)
- return unit
- return pa_dtype.unit
-
-
-def to_pyarrow_type(
- dtype: ArrowDtype | pa.DataType | Dtype | None,
-) -> pa.DataType | None:
- """
- Convert dtype to a pyarrow type instance.
- """
- if isinstance(dtype, ArrowDtype):
- return dtype.pyarrow_dtype
- elif isinstance(dtype, pa.DataType):
- return dtype
- elif isinstance(dtype, DatetimeTZDtype):
- return pa.timestamp(dtype.unit, dtype.tz)
- elif dtype:
- try:
- # Accepts python types too
- # Doesn't handle all numpy types
- return pa.from_numpy_dtype(dtype)
- except pa.ArrowNotImplementedError:
- pass
- return None
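A short illustration of the conversion order above (ArrowDtype first, then raw
pyarrow types, then DatetimeTZDtype, then a best-effort numpy conversion),
assuming pyarrow is installed and this module's to_pyarrow_type is importable:

import numpy as np
import pandas as pd
import pyarrow as pa

# Each branch in order; inputs that cannot be converted fall through to None.
assert to_pyarrow_type(pd.ArrowDtype(pa.int64())) == pa.int64()
assert to_pyarrow_type(pa.string()) == pa.string()
assert to_pyarrow_type(pd.DatetimeTZDtype("ns", "UTC")) == pa.timestamp("ns", "UTC")
assert to_pyarrow_type(np.dtype("float64")) == pa.float64()
assert to_pyarrow_type(None) is None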
-
-
-class ArrowExtensionArray(
- OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
-):
- """
- Pandas ExtensionArray backed by a PyArrow ChunkedArray.
-
- .. warning::
-
- ArrowExtensionArray is considered experimental. The implementation and
- parts of the API may change without warning.
-
- Parameters
- ----------
- values : pyarrow.Array or pyarrow.ChunkedArray
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- Returns
- -------
- ArrowExtensionArray
-
- Notes
- -----
-    Most methods are implemented using `pyarrow compute functions <https://arrow.apache.org/docs/python/api/compute.html>`__.
- Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
- associated compute function is not available based on the installed version of PyArrow.
-
- Please install the latest version of PyArrow to enable the best functionality and avoid
- potential bugs in prior versions of PyArrow.
-
- Examples
- --------
- Create an ArrowExtensionArray with :func:`pandas.array`:
-
- >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
- <ArrowExtensionArray>
- [1, 1, <NA>]
- Length: 3, dtype: int64[pyarrow]
- """ # noqa: E501 (http link too long)
-
- _data: pa.ChunkedArray
- _dtype: ArrowDtype
-
- def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
- if pa_version_under7p0:
- msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray."
- raise ImportError(msg)
- if isinstance(values, pa.Array):
- self._data = pa.chunked_array([values])
- elif isinstance(values, pa.ChunkedArray):
- self._data = values
- else:
- raise ValueError(
- f"Unsupported type '{type(values)}' for ArrowExtensionArray"
- )
- self._dtype = ArrowDtype(self._data.type)
-
- @classmethod
- def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
- """
- Construct a new ExtensionArray from a sequence of scalars.
- """
- pa_dtype = to_pyarrow_type(dtype)
- if (
- isinstance(scalars, np.ndarray)
- and isinstance(dtype, ArrowDtype)
- and (
- pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
- )
- ):
- # See https://github.com/apache/arrow/issues/35289
- scalars = scalars.tolist()
-
- if isinstance(scalars, cls):
- scalars = scalars._data
- elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
- if copy and is_array_like(scalars):
- # pa array should not get updated when numpy array is updated
- scalars = deepcopy(scalars)
- try:
- scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
- except pa.ArrowInvalid:
- # GH50430: let pyarrow infer type, then cast
- scalars = pa.array(scalars, from_pandas=True)
- if pa_dtype:
- if pa.types.is_dictionary(pa_dtype):
- scalars = scalars.dictionary_encode()
- else:
- scalars = scalars.cast(pa_dtype)
- arr = cls(scalars)
- if pa.types.is_duration(scalars.type) and scalars.null_count > 0:
- # GH52843: upstream bug for duration types when originally
- # constructed with data containing numpy NaT.
- # https://github.com/apache/arrow/issues/35088
- arr = arr.fillna(arr.dtype.na_value)
- return arr
-
- @classmethod
- def _from_sequence_of_strings(
- cls, strings, *, dtype: Dtype | None = None, copy: bool = False
- ):
- """
- Construct a new ExtensionArray from a sequence of strings.
- """
- pa_type = to_pyarrow_type(dtype)
- if (
- pa_type is None
- or pa.types.is_binary(pa_type)
- or pa.types.is_string(pa_type)
- ):
- # pa_type is None: Let pa.array infer
- # pa_type is string/binary: scalars already correct type
- scalars = strings
- elif pa.types.is_timestamp(pa_type):
- from pandas.core.tools.datetimes import to_datetime
-
- scalars = to_datetime(strings, errors="raise")
- elif pa.types.is_date(pa_type):
- from pandas.core.tools.datetimes import to_datetime
-
- scalars = to_datetime(strings, errors="raise").date
- elif pa.types.is_duration(pa_type):
- from pandas.core.tools.timedeltas import to_timedelta
-
- scalars = to_timedelta(strings, errors="raise")
- if pa_type.unit != "ns":
- # GH51175: test_from_sequence_of_strings_pa_array
- # attempt to parse as int64 reflecting pyarrow's
- # duration to string casting behavior
- mask = isna(scalars)
- if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
- strings = pa.array(strings, type=pa.string(), from_pandas=True)
- strings = pc.if_else(mask, None, strings)
- try:
- scalars = strings.cast(pa.int64())
- except pa.ArrowInvalid:
- pass
- elif pa.types.is_time(pa_type):
- from pandas.core.tools.times import to_time
-
- # "coerce" to allow "null times" (None) to not raise
- scalars = to_time(strings, errors="coerce")
- elif pa.types.is_boolean(pa_type):
- from pandas.core.arrays import BooleanArray
-
- scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
- elif (
- pa.types.is_integer(pa_type)
- or pa.types.is_floating(pa_type)
- or pa.types.is_decimal(pa_type)
- ):
- from pandas.core.tools.numeric import to_numeric
-
- scalars = to_numeric(strings, errors="raise")
- else:
- raise NotImplementedError(
- f"Converting strings to {pa_type} is not implemented."
- )
- return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
-
- def __getitem__(self, item: PositionalIndexer):
- """Select a subset of self.
-
- Parameters
- ----------
- item : int, slice, or ndarray
- * int: The position in 'self' to get.
- * slice: A slice object, where 'start', 'stop', and 'step' are
- integers or None
- * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
-
- Returns
- -------
- item : scalar or ExtensionArray
-
- Notes
- -----
- For scalar ``item``, return a scalar value suitable for the array's
- type. This should be an instance of ``self.dtype.type``.
- For slice ``key``, return an instance of ``ExtensionArray``, even
- if the slice is length 0 or 1.
- For a boolean mask, return an instance of ``ExtensionArray``, filtered
- to the values where ``item`` is True.
- """
- item = check_array_indexer(self, item)
-
- if isinstance(item, np.ndarray):
- if not len(item):
- # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
- if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
- pa_dtype = pa.string()
- else:
- pa_dtype = self._dtype.pyarrow_dtype
- return type(self)(pa.chunked_array([], type=pa_dtype))
- elif is_integer_dtype(item.dtype):
- return self.take(item)
- elif is_bool_dtype(item.dtype):
- return type(self)(self._data.filter(item))
- else:
- raise IndexError(
- "Only integers, slices and integer or "
- "boolean arrays are valid indices."
- )
- elif isinstance(item, tuple):
- item = unpack_tuple_and_ellipses(item)
-
- if item is Ellipsis:
- # TODO: should be handled by pyarrow?
- item = slice(None)
-
- if is_scalar(item) and not is_integer(item):
- # e.g. "foo" or 2.5
- # exception message copied from numpy
- raise IndexError(
- r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
- r"(`None`) and integer or boolean arrays are valid indices"
- )
-        # Not an array indexer, so likely a slice or integer indexer;
-        # dispatch to pyarrow.
- value = self._data[item]
- if isinstance(value, pa.ChunkedArray):
- return type(self)(value)
- else:
- scalar = value.as_py()
- if scalar is None:
- return self._dtype.na_value
- else:
- return scalar
-
- def __iter__(self) -> Iterator[Any]:
- """
- Iterate over elements of the array.
- """
- na_value = self._dtype.na_value
- for value in self._data:
- val = value.as_py()
- if val is None:
- yield na_value
- else:
- yield val
-
- def __arrow_array__(self, type=None):
- """Convert myself to a pyarrow ChunkedArray."""
- return self._data
-
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- """Correctly construct numpy arrays when passed to `np.asarray()`."""
- return self.to_numpy(dtype=dtype)
-
- def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
- return type(self)(pc.invert(self._data))
-
- def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
- return type(self)(pc.negate_checked(self._data))
-
- def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
- return type(self)(self._data)
-
- def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
- return type(self)(pc.abs_checked(self._data))
-
- # GH 42600: __getstate__/__setstate__ not necessary once
- # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
- def __getstate__(self):
- state = self.__dict__.copy()
- state["_data"] = self._data.combine_chunks()
- return state
-
- def __setstate__(self, state) -> None:
- state["_data"] = pa.chunked_array(state["_data"])
- self.__dict__.update(state)
-
- def _cmp_method(self, other, op):
- from pandas.core.arrays.masked import BaseMaskedArray
-
- pc_func = ARROW_CMP_FUNCS[op.__name__]
- if isinstance(other, ArrowExtensionArray):
- result = pc_func(self._data, other._data)
- elif isinstance(other, (np.ndarray, list)):
- result = pc_func(self._data, other)
- elif isinstance(other, BaseMaskedArray):
- # GH 52625
- result = pc_func(self._data, other.__arrow_array__())
- elif is_scalar(other):
- try:
- result = pc_func(self._data, pa.scalar(other))
- except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
- mask = isna(self) | isna(other)
- valid = ~mask
- result = np.zeros(len(self), dtype="bool")
- result[valid] = op(np.array(self)[valid], other)
- result = pa.array(result, type=pa.bool_())
- result = pc.if_else(valid, result, None)
- else:
- raise NotImplementedError(
- f"{op.__name__} not implemented for {type(other)}"
- )
- return ArrowExtensionArray(result)
-
- def _evaluate_op_method(self, other, op, arrow_funcs):
- from pandas.core.arrays.masked import BaseMaskedArray
-
- pa_type = self._data.type
- if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
- operator.add,
- roperator.radd,
- ]:
- length = self._data.length()
-
- seps: list[str] | list[bytes]
- if pa.types.is_string(pa_type):
- seps = [""] * length
- else:
- seps = [b""] * length
-
- if is_scalar(other):
- other = [other] * length
- elif isinstance(other, type(self)):
- other = other._data
- if op is operator.add:
- result = pc.binary_join_element_wise(self._data, other, seps)
- else:
- result = pc.binary_join_element_wise(other, self._data, seps)
- return type(self)(result)
-
- pc_func = arrow_funcs[op.__name__]
- if pc_func is NotImplemented:
- raise NotImplementedError(f"{op.__name__} not implemented.")
- if isinstance(other, ArrowExtensionArray):
- result = pc_func(self._data, other._data)
- elif isinstance(other, (np.ndarray, list)):
- result = pc_func(self._data, pa.array(other, from_pandas=True))
- elif isinstance(other, BaseMaskedArray):
- # GH 52625
- result = pc_func(self._data, other.__arrow_array__())
- elif is_scalar(other):
- if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS:
- # pyarrow kleene ops require null to be typed
- pa_scalar = pa.scalar(None, type=self._data.type)
- else:
- pa_scalar = pa.scalar(other)
- result = pc_func(self._data, pa_scalar)
- else:
- raise NotImplementedError(
- f"{op.__name__} not implemented for {type(other)}"
- )
- return type(self)(result)
-
- def _logical_method(self, other, op):
- return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)
-
- def _arith_method(self, other, op):
- return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
-
- def equals(self, other) -> bool:
- if not isinstance(other, ArrowExtensionArray):
- return False
- # I'm told that pyarrow makes __eq__ behave like pandas' equals;
- # TODO: is this documented somewhere?
- return self._data == other._data
-
- @property
- def dtype(self) -> ArrowDtype:
- """
- An instance of 'ExtensionDtype'.
- """
- return self._dtype
-
- @property
- def nbytes(self) -> int:
- """
- The number of bytes needed to store this object in memory.
- """
- return self._data.nbytes
-
- def __len__(self) -> int:
- """
- Length of this array.
-
- Returns
- -------
- length : int
- """
- return len(self._data)
-
- def __contains__(self, key) -> bool:
- # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
- if isna(key) and key is not self.dtype.na_value:
- if self.dtype.kind == "f" and lib.is_float(key) and isna(key):
- return pc.any(pc.is_nan(self._data)).as_py()
-
- # e.g. date or timestamp types we do not allow None here to match pd.NA
- return False
- # TODO: maybe complex? object?
-
- return bool(super().__contains__(key))
-
- @property
- def _hasna(self) -> bool:
- return self._data.null_count > 0
-
- def isna(self) -> npt.NDArray[np.bool_]:
- """
- Boolean NumPy array indicating if each value is missing.
-
- This should return a 1-D array the same length as 'self'.
- """
- return self._data.is_null().to_numpy()
-
- def any(self, *, skipna: bool = True, **kwargs):
- """
- Return whether any element is truthy.
-
- Returns False unless there is at least one element that is truthy.
- By default, NAs are skipped. If ``skipna=False`` is specified and
- missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
- is used as for logical operations.
-
- Parameters
- ----------
- skipna : bool, default True
- Exclude NA values. If the entire array is NA and `skipna` is
- True, then the result will be False, as for an empty array.
- If `skipna` is False, the result will still be True if there is
- at least one element that is truthy, otherwise NA will be returned
-            if there are NAs present.
-
- Returns
- -------
- bool or :attr:`pandas.NA`
-
- See Also
- --------
- ArrowExtensionArray.all : Return whether all elements are truthy.
-
- Examples
- --------
- The result indicates whether any element is truthy (and by default
- skips NAs):
-
- >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
- True
- >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
- True
- >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
- False
- >>> pd.array([], dtype="boolean[pyarrow]").any()
- False
- >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
- False
- >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
- False
-
- With ``skipna=False``, the result can be NA if this is logically
- required (whether ``pd.NA`` is True or False influences the result):
-
- >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
- True
- >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
- True
- >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
- <NA>
- >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
- <NA>
- """
- return self._reduce("any", skipna=skipna, **kwargs)
-
- def all(self, *, skipna: bool = True, **kwargs):
- """
- Return whether all elements are truthy.
-
- Returns True unless there is at least one element that is falsey.
- By default, NAs are skipped. If ``skipna=False`` is specified and
- missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
- is used as for logical operations.
-
- Parameters
- ----------
- skipna : bool, default True
- Exclude NA values. If the entire array is NA and `skipna` is
- True, then the result will be True, as for an empty array.
- If `skipna` is False, the result will still be False if there is
- at least one element that is falsey, otherwise NA will be returned
-            if there are NAs present.
-
- Returns
- -------
- bool or :attr:`pandas.NA`
-
- See Also
- --------
- ArrowExtensionArray.any : Return whether any element is truthy.
-
- Examples
- --------
- The result indicates whether all elements are truthy (and by default
- skips NAs):
-
- >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
- True
- >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
- True
- >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
- False
- >>> pd.array([], dtype="boolean[pyarrow]").all()
- True
- >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
- True
- >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
- True
-
- With ``skipna=False``, the result can be NA if this is logically
- required (whether ``pd.NA`` is True or False influences the result):
-
- >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
- <NA>
- >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
- <NA>
- >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
- False
- >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
- False
- """
- return self._reduce("all", skipna=skipna, **kwargs)
-
- def argsort(
- self,
- *,
- ascending: bool = True,
- kind: SortKind = "quicksort",
- na_position: str = "last",
- **kwargs,
- ) -> np.ndarray:
- order = "ascending" if ascending else "descending"
- null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
- if null_placement is None:
- raise ValueError(f"invalid na_position: {na_position}")
-
- result = pc.array_sort_indices(
- self._data, order=order, null_placement=null_placement
- )
- np_result = result.to_numpy()
- return np_result.astype(np.intp, copy=False)
-
- def _argmin_max(self, skipna: bool, method: str) -> int:
- if self._data.length() in (0, self._data.null_count) or (
- self._hasna and not skipna
- ):
- # For empty or all null, pyarrow returns -1 but pandas expects TypeError
- # For skipna=False and data w/ null, pandas expects NotImplementedError
- # let ExtensionArray.arg{max|min} raise
- return getattr(super(), f"arg{method}")(skipna=skipna)
-
- data = self._data
- if pa.types.is_duration(data.type):
- data = data.cast(pa.int64())
-
- value = getattr(pc, method)(data, skip_nulls=skipna)
- return pc.index(data, value).as_py()
-
- def argmin(self, skipna: bool = True) -> int:
- return self._argmin_max(skipna, "min")
-
- def argmax(self, skipna: bool = True) -> int:
- return self._argmin_max(skipna, "max")
-
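For orientation, a small usage sketch of the pyarrow-backed argmin/argmax path
above, assuming pandas 2.0 with pyarrow installed (the data is illustrative):

import pandas as pd

arr = pd.array([3, 1, 2], dtype="int64[pyarrow]")
arr.argmin()   # 1, found via pc.min followed by pc.index
arr.argmax()   # 0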
- def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
- """
- Return a shallow copy of the array.
-
- Underlying ChunkedArray is immutable, so a deep copy is unnecessary.
-
- Returns
- -------
- type(self)
- """
- return type(self)(self._data)
-
- def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
- """
- Return ArrowExtensionArray without NA values.
-
- Returns
- -------
- ArrowExtensionArray
- """
- return type(self)(pc.drop_null(self._data))
-
- @doc(ExtensionArray.fillna)
- def fillna(
- self: ArrowExtensionArrayT,
- value: object | ArrayLike | None = None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- ) -> ArrowExtensionArrayT:
- value, method = validate_fillna_kwargs(value, method)
-
- if limit is not None:
- return super().fillna(value=value, method=method, limit=limit)
-
- if method is not None:
- fallback_performancewarning()
- return super().fillna(value=value, method=method, limit=limit)
-
- if is_array_like(value):
- value = cast(ArrayLike, value)
- if len(value) != len(self):
- raise ValueError(
-                    f"Length of 'value' does not match. Got ({len(value)}), "
-                    f"expected {len(self)}"
- )
-
- def convert_fill_value(value, pa_type, dtype):
- if value is None:
- return value
- if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
- return value
- if is_array_like(value):
- pa_box = pa.array
- else:
- pa_box = pa.scalar
- try:
- value = pa_box(value, type=pa_type, from_pandas=True)
- except pa.ArrowTypeError as err:
- msg = f"Invalid value '{str(value)}' for dtype {dtype}"
- raise TypeError(msg) from err
- return value
-
- fill_value = convert_fill_value(value, self._data.type, self.dtype)
-
- try:
- if method is None:
- return type(self)(pc.fill_null(self._data, fill_value=fill_value))
- elif method == "pad":
- return type(self)(pc.fill_null_forward(self._data))
- elif method == "backfill":
- return type(self)(pc.fill_null_backward(self._data))
- except pa.ArrowNotImplementedError:
- # ArrowNotImplementedError: Function 'coalesce' has no kernel
- # matching input types (duration[ns], duration[ns])
- # TODO: remove try/except wrapper if/when pyarrow implements
- # a kernel for duration types.
- pass
-
- return super().fillna(value=value, method=method, limit=limit)
-
- def isin(self, values) -> npt.NDArray[np.bool_]:
- # short-circuit to return all False array.
- if not len(values):
- return np.zeros(len(self), dtype=bool)
-
- result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True))
- # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
- # to False
- return np.array(result, dtype=np.bool_)
-
- def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
- """
- Return an array and missing value suitable for factorization.
-
- Returns
- -------
- values : ndarray
- na_value : pd.NA
-
- Notes
- -----
- The values returned by this method are also used in
- :func:`pandas.util.hash_pandas_object`.
- """
- values = self._data.to_numpy()
- return values, self.dtype.na_value
-
- @doc(ExtensionArray.factorize)
- def factorize(
- self,
- use_na_sentinel: bool = True,
- ) -> tuple[np.ndarray, ExtensionArray]:
- null_encoding = "mask" if use_na_sentinel else "encode"
-
- pa_type = self._data.type
- if pa.types.is_duration(pa_type):
- # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
- data = self._data.cast(pa.int64())
- else:
- data = self._data
-
- if pa.types.is_dictionary(data.type):
- encoded = data
- else:
- encoded = data.dictionary_encode(null_encoding=null_encoding)
- if encoded.length() == 0:
- indices = np.array([], dtype=np.intp)
- uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
- else:
- pa_indices = encoded.combine_chunks().indices
- if pa_indices.null_count > 0:
- pa_indices = pc.fill_null(pa_indices, -1)
- indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
- np.intp, copy=False
- )
- uniques = type(self)(encoded.chunk(0).dictionary)
-
- if pa.types.is_duration(pa_type):
- uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
- return indices, uniques
-
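A brief sketch of the dictionary-encode based factorization above, assuming
pandas 2.0 with pyarrow installed (the data is illustrative):

import pandas as pd

arr = pd.array([1, None, 1, 2], dtype="int64[pyarrow]")
codes, uniques = arr.factorize()
# codes   -> array([ 0, -1,  0,  1]); -1 is the NA sentinel (use_na_sentinel=True)
# uniques -> <ArrowExtensionArray> [1, 2], dtype: int64[pyarrow]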
- def reshape(self, *args, **kwargs):
- raise NotImplementedError(
- f"{type(self)} does not support reshape "
-            f"as it is backed by a 1D pyarrow.ChunkedArray."
- )
-
- def round(
- self: ArrowExtensionArrayT, decimals: int = 0, *args, **kwargs
- ) -> ArrowExtensionArrayT:
- """
-        Round each value in the array to the given number of decimals.
-
- Parameters
- ----------
- decimals : int, default 0
- Number of decimal places to round to. If decimals is negative,
- it specifies the number of positions to the left of the decimal point.
- *args, **kwargs
- Additional arguments and keywords have no effect.
-
- Returns
- -------
- ArrowExtensionArray
- Rounded values of the ArrowExtensionArray.
-
- See Also
- --------
- DataFrame.round : Round values of a DataFrame.
- Series.round : Round values of a Series.
- """
- return type(self)(pc.round(self._data, ndigits=decimals))
-
- @doc(ExtensionArray.searchsorted)
- def searchsorted(
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- if self._hasna:
- raise ValueError(
- "searchsorted requires array to be sorted, which is impossible "
- "with NAs present."
- )
- if isinstance(value, ExtensionArray):
- value = value.astype(object)
- # Base class searchsorted would cast to object, which is *much* slower.
- return self.to_numpy().searchsorted(value, side=side, sorter=sorter)
-
- def take(
- self,
- indices: TakeIndexer,
- allow_fill: bool = False,
- fill_value: Any = None,
- ) -> ArrowExtensionArray:
- """
- Take elements from an array.
-
- Parameters
- ----------
- indices : sequence of int or one-dimensional np.ndarray of int
- Indices to be taken.
- allow_fill : bool, default False
- How to handle negative values in `indices`.
-
- * False: negative values in `indices` indicate positional indices
- from the right (the default). This is similar to
- :func:`numpy.take`.
-
- * True: negative values in `indices` indicate
-              missing values. These values are set to `fill_value`. Any other
-              negative values raise a ``ValueError``.
-
- fill_value : any, optional
- Fill value to use for NA-indices when `allow_fill` is True.
- This may be ``None``, in which case the default NA value for
- the type, ``self.dtype.na_value``, is used.
-
- For many ExtensionArrays, there will be two representations of
- `fill_value`: a user-facing "boxed" scalar, and a low-level
- physical NA value. `fill_value` should be the user-facing version,
- and the implementation should handle translating that to the
- physical version for processing the take if necessary.
-
- Returns
- -------
- ExtensionArray
-
- Raises
- ------
- IndexError
- When the indices are out of bounds for the array.
- ValueError
- When `indices` contains negative values other than ``-1``
- and `allow_fill` is True.
-
- See Also
- --------
- numpy.take
- api.extensions.take
-
- Notes
- -----
- ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
- ``iloc``, when `indices` is a sequence of values. Additionally,
- it's called by :meth:`Series.reindex`, or any other method
- that causes realignment, with a `fill_value`.
- """
- # TODO: Remove once we got rid of the (indices < 0) check
- if not is_array_like(indices):
- indices_array = np.asanyarray(indices)
- else:
- # error: Incompatible types in assignment (expression has type
- # "Sequence[int]", variable has type "ndarray")
- indices_array = indices # type: ignore[assignment]
-
- if len(self._data) == 0 and (indices_array >= 0).any():
- raise IndexError("cannot do a non-empty take")
- if indices_array.size > 0 and indices_array.max() >= len(self._data):
- raise IndexError("out of bounds value in 'indices'.")
-
- if allow_fill:
- fill_mask = indices_array < 0
- if fill_mask.any():
- validate_indices(indices_array, len(self._data))
- # TODO(ARROW-9433): Treat negative indices as NULL
- indices_array = pa.array(indices_array, mask=fill_mask)
- result = self._data.take(indices_array)
- if isna(fill_value):
- return type(self)(result)
- # TODO: ArrowNotImplementedError: Function fill_null has no
- # kernel matching input types (array[string], scalar[string])
- result = type(self)(result)
- result[fill_mask] = fill_value
- return result
- # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
- else:
- # Nothing to fill
- return type(self)(self._data.take(indices))
- else: # allow_fill=False
- # TODO(ARROW-9432): Treat negative indices as indices from the right.
- if (indices_array < 0).any():
- # Don't modify in-place
- indices_array = np.copy(indices_array)
- indices_array[indices_array < 0] += len(self._data)
- return type(self)(self._data.take(indices_array))
-
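A short sketch of the two allow_fill modes described in the docstring above,
assuming a pyarrow-backed array (the data is illustrative):

import pandas as pd

arr = pd.array([10, 20, 30], dtype="int64[pyarrow]")

arr.take([0, -1])                   # -1 indexes from the end:  [10, 30]
arr.take([0, -1], allow_fill=True)  # -1 marks a missing slot:  [10, <NA>]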
- @doc(ExtensionArray.to_numpy)
- def to_numpy(
- self,
- dtype: npt.DTypeLike | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- if dtype is None and self._hasna:
- dtype = object
- if na_value is lib.no_default:
- na_value = self.dtype.na_value
-
- pa_type = self._data.type
- if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
- # temporal types with units and/or timezones currently
- # require pandas/python scalars to pass all tests
- # TODO: improve performance (this is slow)
- result = np.array(list(self), dtype=dtype)
- elif is_object_dtype(dtype) and self._hasna:
- result = np.empty(len(self), dtype=object)
- mask = ~self.isna()
- result[mask] = np.asarray(self[mask]._data)
- elif pa.types.is_null(self._data.type):
- result = np.asarray(self._data, dtype=dtype)
- if not isna(na_value):
- result[:] = na_value
- return result
- elif self._hasna:
- data = self.copy()
- data[self.isna()] = na_value
- return np.asarray(data._data, dtype=dtype)
- else:
- result = np.asarray(self._data, dtype=dtype)
- if copy:
- result = result.copy()
- if self._hasna:
- result[self.isna()] = na_value
- return result
-
- def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
- """
- Compute the ArrowExtensionArray of unique values.
-
- Returns
- -------
- ArrowExtensionArray
- """
- pa_type = self._data.type
-
- if pa.types.is_duration(pa_type):
- # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
- data = self._data.cast(pa.int64())
- else:
- data = self._data
-
- pa_result = pc.unique(data)
-
- if pa.types.is_duration(pa_type):
- pa_result = pa_result.cast(pa_type)
-
- return type(self)(pa_result)
-
- def value_counts(self, dropna: bool = True) -> Series:
- """
- Return a Series containing counts of each unique value.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include counts of missing values.
-
- Returns
- -------
- counts : Series
-
- See Also
- --------
- Series.value_counts
- """
- pa_type = self._data.type
- if pa.types.is_duration(pa_type):
- # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
- data = self._data.cast(pa.int64())
- else:
- data = self._data
-
- from pandas import (
- Index,
- Series,
- )
-
- vc = data.value_counts()
-
- values = vc.field(0)
- counts = vc.field(1)
- if dropna and data.null_count > 0:
- mask = values.is_valid()
- values = values.filter(mask)
- counts = counts.filter(mask)
-
- if pa.types.is_duration(pa_type):
- values = values.cast(pa_type)
-
- counts = ArrowExtensionArray(counts)
-
- index = Index(type(self)(values))
-
- return Series(counts, index=index, name="count", copy=False)
-
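A quick sketch of the Series returned by the Arrow-backed value_counts above,
assuming pandas 2.0 with pyarrow installed (the ordering of the unique values
may vary):

import pandas as pd
import pyarrow as pa

arr = pd.array(["a", "b", "a", None], dtype=pd.ArrowDtype(pa.string()))
arr.value_counts()
# a    2
# b    1
# Name: count, dtype: int64[pyarrow]   (the null is dropped because dropna=True)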
- @classmethod
- def _concat_same_type(
- cls: type[ArrowExtensionArrayT], to_concat
- ) -> ArrowExtensionArrayT:
- """
- Concatenate multiple ArrowExtensionArrays.
-
- Parameters
- ----------
- to_concat : sequence of ArrowExtensionArrays
-
- Returns
- -------
- ArrowExtensionArray
- """
- chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
- if to_concat[0].dtype == "string":
-            # StringDtype has no attribute pyarrow_dtype
- pa_dtype = pa.string()
- else:
- pa_dtype = to_concat[0].dtype.pyarrow_dtype
- arr = pa.chunked_array(chunks, type=pa_dtype)
- return cls(arr)
-
- def _accumulate(
- self, name: str, *, skipna: bool = True, **kwargs
- ) -> ArrowExtensionArray | ExtensionArray:
- """
- Return an ExtensionArray performing an accumulation operation.
-
- The underlying data type might change.
-
- Parameters
- ----------
- name : str
- Name of the function, supported values are:
- - cummin
- - cummax
- - cumsum
- - cumprod
- skipna : bool, default True
- If True, skip NA values.
- **kwargs
- Additional keyword arguments passed to the accumulation function.
- Currently, there is no supported kwarg.
-
- Returns
- -------
- array
-
- Raises
- ------
- NotImplementedError : subclass does not define accumulations
- """
- pyarrow_name = {
- "cumsum": "cumulative_sum_checked",
- }.get(name, name)
- pyarrow_meth = getattr(pc, pyarrow_name, None)
- if pyarrow_meth is None:
- return super()._accumulate(name, skipna=skipna, **kwargs)
-
- data_to_accum = self._data
-
- pa_dtype = data_to_accum.type
- if pa.types.is_duration(pa_dtype):
- data_to_accum = data_to_accum.cast(pa.int64())
-
- result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
-
- if pa.types.is_duration(pa_dtype):
- result = result.cast(pa_dtype)
-
- return type(self)(result)
-
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
- """
- Return a scalar result of performing the reduction operation.
-
- Parameters
- ----------
- name : str
- Name of the function, supported values are:
- { any, all, min, max, sum, mean, median, prod,
- std, var, sem, kurt, skew }.
- skipna : bool, default True
- If True, skip NaN values.
- **kwargs
- Additional keyword arguments passed to the reduction function.
- Currently, `ddof` is the only supported kwarg.
-
- Returns
- -------
- scalar
-
- Raises
- ------
- TypeError : subclass does not define reductions
- """
- pa_type = self._data.type
-
- data_to_reduce = self._data
-
- if name in ["any", "all"] and (
- pa.types.is_integer(pa_type)
- or pa.types.is_floating(pa_type)
- or pa.types.is_duration(pa_type)
- or pa.types.is_decimal(pa_type)
- ):
-            # pyarrow only supports any/all for boolean dtype; we allow
-            # other dtypes too, matching our non-pyarrow behavior
-
- if pa.types.is_duration(pa_type):
- data_to_cmp = self._data.cast(pa.int64())
- else:
- data_to_cmp = self._data
-
- not_eq = pc.not_equal(data_to_cmp, 0)
- data_to_reduce = not_eq
-
- elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
- data_to_reduce = self._data.cast(pa.int64())
-
- elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type):
- nbits = pa_type.bit_width
- if nbits == 32:
- data_to_reduce = self._data.cast(pa.int32())
- else:
- data_to_reduce = self._data.cast(pa.int64())
-
- if name == "sem":
-
- def pyarrow_meth(data, skip_nulls, **kwargs):
- numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
- denominator = pc.sqrt_checked(pc.count(self._data))
- return pc.divide_checked(numerator, denominator)
-
- else:
- pyarrow_name = {
- "median": "quantile",
- "prod": "product",
- "std": "stddev",
- "var": "variance",
- }.get(name, name)
- # error: Incompatible types in assignment
- # (expression has type "Optional[Any]", variable has type
- # "Callable[[Any, Any, KwArg(Any)], Any]")
- pyarrow_meth = getattr(pc, pyarrow_name, None) # type: ignore[assignment]
- if pyarrow_meth is None:
- # Let ExtensionArray._reduce raise the TypeError
- return super()._reduce(name, skipna=skipna, **kwargs)
-
- # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
- if name in ["any", "all"] and "min_count" not in kwargs:
- kwargs["min_count"] = 0
- elif name == "median":
- # GH 52679: Use quantile instead of approximate_median
- kwargs["q"] = 0.5
-
- try:
- result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
- except (AttributeError, NotImplementedError, TypeError) as err:
- msg = (
- f"'{type(self).__name__}' with dtype {self.dtype} "
- f"does not support reduction '{name}' with pyarrow "
- f"version {pa.__version__}. '{name}' may be supported by "
- f"upgrading pyarrow."
- )
- raise TypeError(msg) from err
- if name == "median":
- # GH 52679: Use quantile instead of approximate_median; returns array
- result = result[0]
- if pc.is_null(result).as_py():
- return self.dtype.na_value
-
- if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
- result = result.cast(pa_type)
- if name in ["median", "mean"] and pa.types.is_temporal(pa_type):
- result = result.cast(pa_type)
- if name in ["std", "sem"] and pa.types.is_temporal(pa_type):
- result = result.cast(pa.int64())
- if pa.types.is_duration(pa_type):
- result = result.cast(pa_type)
- elif pa.types.is_time(pa_type):
- unit = get_unit_from_pa_dtype(pa_type)
- result = result.cast(pa.duration(unit))
- elif pa.types.is_date(pa_type):
- # go with closest available unit, i.e. "s"
- result = result.cast(pa.duration("s"))
- else:
- # i.e. timestamp
- result = result.cast(pa.duration(pa_type.unit))
-
- return result.as_py()
-
- def __setitem__(self, key, value) -> None:
- """Set one or more values inplace.
-
- Parameters
- ----------
- key : int, ndarray, or slice
- When called from, e.g. ``Series.__setitem__``, ``key`` will be
- one of
-
- * scalar int
- * ndarray of integers.
- * boolean ndarray
- * slice object
-
- value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
-            value or values to be set at ``key``.
-
- Returns
- -------
- None
- """
- # GH50085: unwrap 1D indexers
- if isinstance(key, tuple) and len(key) == 1:
- key = key[0]
-
- key = check_array_indexer(self, key)
- value = self._maybe_convert_setitem_value(value)
-
- if com.is_null_slice(key):
- # fast path (GH50248)
- data = self._if_else(True, value, self._data)
-
- elif is_integer(key):
- # fast path
- key = cast(int, key)
- n = len(self)
- if key < 0:
- key += n
- if not 0 <= key < n:
- raise IndexError(
- f"index {key} is out of bounds for axis 0 with size {n}"
- )
- if is_list_like(value):
- raise ValueError("Length of indexer and values mismatch")
- elif isinstance(value, pa.Scalar):
- value = value.as_py()
- chunks = [
- *self._data[:key].chunks,
- pa.array([value], type=self._data.type, from_pandas=True),
- *self._data[key + 1 :].chunks,
- ]
- data = pa.chunked_array(chunks).combine_chunks()
-
- elif is_bool_dtype(key):
- key = np.asarray(key, dtype=np.bool_)
- data = self._replace_with_mask(self._data, key, value)
-
- elif is_scalar(value) or isinstance(value, pa.Scalar):
- mask = np.zeros(len(self), dtype=np.bool_)
- mask[key] = True
- data = self._if_else(mask, value, self._data)
-
- else:
- indices = np.arange(len(self))[key]
- if len(indices) != len(value):
- raise ValueError("Length of indexer and values mismatch")
- if len(indices) == 0:
- return
- argsort = np.argsort(indices)
- indices = indices[argsort]
- value = value.take(argsort)
- mask = np.zeros(len(self), dtype=np.bool_)
- mask[indices] = True
- data = self._replace_with_mask(self._data, mask, value)
-
- if isinstance(data, pa.Array):
- data = pa.chunked_array([data])
- self._data = data
-
- def _rank(
- self,
- *,
- axis: AxisInt = 0,
- method: str = "average",
- na_option: str = "keep",
- ascending: bool = True,
- pct: bool = False,
- ):
- """
- See Series.rank.__doc__.
- """
- if pa_version_under9p0 or axis != 0:
- ranked = super()._rank(
- axis=axis,
- method=method,
- na_option=na_option,
- ascending=ascending,
- pct=pct,
- )
- # keep dtypes consistent with the implementation below
- if method == "average" or pct:
- pa_type = pa.float64()
- else:
- pa_type = pa.uint64()
- result = pa.array(ranked, type=pa_type, from_pandas=True)
- return type(self)(result)
-
- data = self._data.combine_chunks()
- sort_keys = "ascending" if ascending else "descending"
- null_placement = "at_start" if na_option == "top" else "at_end"
- tiebreaker = "min" if method == "average" else method
-
- result = pc.rank(
- data,
- sort_keys=sort_keys,
- null_placement=null_placement,
- tiebreaker=tiebreaker,
- )
-
- if na_option == "keep":
- mask = pc.is_null(self._data)
- null = pa.scalar(None, type=result.type)
- result = pc.if_else(mask, null, result)
-
- if method == "average":
- result_max = pc.rank(
- data,
- sort_keys=sort_keys,
- null_placement=null_placement,
- tiebreaker="max",
- )
- result_max = result_max.cast(pa.float64())
- result_min = result.cast(pa.float64())
- result = pc.divide(pc.add(result_min, result_max), 2)
-
- if pct:
- if not pa.types.is_floating(result.type):
- result = result.cast(pa.float64())
- if method == "dense":
- divisor = pc.max(result)
- else:
- divisor = pc.count(result)
- result = pc.divide(result, divisor)
-
- return type(self)(result)
-
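Because pyarrow's rank kernel has no "average" tiebreaker, the method above
derives it from the min and max ranks. A hedged example of the resulting
values, assuming pyarrow >= 9 (the data is illustrative):

import pandas as pd

s = pd.Series([1, 2, 2, 3], dtype="int64[pyarrow]")
s.rank(method="average")
# the tied 2s occupy ranks 2 and 3, so each gets (2 + 3) / 2 == 2.5:
# roughly [1.0, 2.5, 2.5, 4.0], backed by a float64 (double[pyarrow]) result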
- def _quantile(
- self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
- ) -> ArrowExtensionArrayT:
- """
- Compute the quantiles of self for each quantile in `qs`.
-
- Parameters
- ----------
- qs : np.ndarray[float64]
- interpolation: str
-
- Returns
- -------
- same type as self
- """
- pa_dtype = self._data.type
-
- data = self._data
- if pa.types.is_temporal(pa_dtype):
- # https://github.com/apache/arrow/issues/33769 in these cases
- # we can cast to ints and back
- nbits = pa_dtype.bit_width
- if nbits == 32:
- data = data.cast(pa.int32())
- else:
- data = data.cast(pa.int64())
-
- result = pc.quantile(data, q=qs, interpolation=interpolation)
-
- if pa.types.is_temporal(pa_dtype):
- nbits = pa_dtype.bit_width
- if nbits == 32:
- result = result.cast(pa.int32())
- else:
- result = result.cast(pa.int64())
- result = result.cast(pa_dtype)
-
- return type(self)(result)
-
- def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
- """
- Returns the mode(s) of the ExtensionArray.
-
- Always returns `ExtensionArray` even if only one value.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't consider counts of NA values.
-
- Returns
- -------
- same type as self
- Sorted, if possible.
- """
- pa_type = self._data.type
- if pa.types.is_temporal(pa_type):
- nbits = pa_type.bit_width
- if nbits == 32:
- data = self._data.cast(pa.int32())
- elif nbits == 64:
- data = self._data.cast(pa.int64())
- else:
- raise NotImplementedError(pa_type)
- else:
- data = self._data
-
- if dropna:
- data = data.drop_null()
-
- res = pc.value_counts(data)
- most_common = res.field("values").filter(
- pc.equal(res.field("counts"), pc.max(res.field("counts")))
- )
-
- if pa.types.is_temporal(pa_type):
- most_common = most_common.cast(pa_type)
-
- return type(self)(most_common)
-
- def _maybe_convert_setitem_value(self, value):
- """Maybe convert value to be pyarrow compatible."""
- if value is None:
- return value
- if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
- return value
- if is_list_like(value):
- pa_box = pa.array
- else:
- pa_box = pa.scalar
- try:
- value = pa_box(value, type=self._data.type, from_pandas=True)
- except pa.ArrowTypeError as err:
- msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
- raise TypeError(msg) from err
- return value
-
- @classmethod
- def _if_else(
- cls,
- cond: npt.NDArray[np.bool_] | bool,
- left: ArrayLike | Scalar,
- right: ArrayLike | Scalar,
- ):
- """
- Choose values based on a condition.
-
- Analogous to pyarrow.compute.if_else, with logic
- to fallback to numpy for unsupported types.
-
- Parameters
- ----------
- cond : npt.NDArray[np.bool_] or bool
- left : ArrayLike | Scalar
- right : ArrayLike | Scalar
-
- Returns
- -------
- pa.Array
- """
- try:
- return pc.if_else(cond, left, right)
- except pa.ArrowNotImplementedError:
- pass
-
- def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
- if isinstance(value, (pa.Array, pa.ChunkedArray)):
- pa_type = value.type
- elif isinstance(value, pa.Scalar):
- pa_type = value.type
- value = value.as_py()
- else:
- pa_type = None
- return np.array(value, dtype=object), pa_type
-
- left, left_type = _to_numpy_and_type(left)
- right, right_type = _to_numpy_and_type(right)
- pa_type = left_type or right_type
- result = np.where(cond, left, right)
- return pa.array(result, type=pa_type, from_pandas=True)
-
- @classmethod
- def _replace_with_mask(
- cls,
- values: pa.Array | pa.ChunkedArray,
- mask: npt.NDArray[np.bool_] | bool,
- replacements: ArrayLike | Scalar,
- ):
- """
- Replace items selected with a mask.
-
- Analogous to pyarrow.compute.replace_with_mask, with logic
- to fallback to numpy for unsupported types.
-
- Parameters
- ----------
- values : pa.Array or pa.ChunkedArray
- mask : npt.NDArray[np.bool_] or bool
- replacements : ArrayLike or Scalar
- Replacement value(s)
-
- Returns
- -------
- pa.Array or pa.ChunkedArray
- """
- if isinstance(replacements, pa.ChunkedArray):
- # replacements must be array or scalar, not ChunkedArray
- replacements = replacements.combine_chunks()
- if pa_version_under8p0:
- # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
- # version <= 7: segfaults with various types
- # version <= 6: fails to replace nulls
- if isinstance(replacements, pa.Array):
- indices = np.full(len(values), None)
- indices[mask] = np.arange(len(replacements))
- indices = pa.array(indices, type=pa.int64())
- replacements = replacements.take(indices)
- return cls._if_else(mask, replacements, values)
- if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type):
- # GH#52059 replace_with_mask segfaults for chunked array
- # https://github.com/apache/arrow/issues/34634
- values = values.combine_chunks()
- try:
- return pc.replace_with_mask(values, mask, replacements)
- except pa.ArrowNotImplementedError:
- pass
- if isinstance(replacements, pa.Array):
- replacements = np.array(replacements, dtype=object)
- elif isinstance(replacements, pa.Scalar):
- replacements = replacements.as_py()
- result = np.array(values, dtype=object)
- result[mask] = replacements
- return pa.array(result, type=values.type, from_pandas=True)
-
- def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
- """Apply a callable to each element while maintaining the chunking structure."""
- return [
- [
- None if val is None else func(val)
- for val in chunk.to_numpy(zero_copy_only=False)
- ]
- for chunk in self._data.iterchunks()
- ]
-
- def _str_count(self, pat: str, flags: int = 0):
- if flags:
- raise NotImplementedError(f"count not implemented with {flags=}")
- return type(self)(pc.count_substring_regex(self._data, pat))
-
- def _str_pad(
- self,
- width: int,
- side: Literal["left", "right", "both"] = "left",
- fillchar: str = " ",
- ):
- if side == "left":
- pa_pad = pc.utf8_lpad
- elif side == "right":
- pa_pad = pc.utf8_rpad
- elif side == "both":
- pa_pad = pc.utf8_center
- else:
- raise ValueError(
- f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
- )
- return type(self)(pa_pad(self._data, width=width, padding=fillchar))
-
- def _str_contains(
- self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
- ):
- if flags:
- raise NotImplementedError(f"contains not implemented with {flags=}")
-
- if regex:
- pa_contains = pc.match_substring_regex
- else:
- pa_contains = pc.match_substring
- result = pa_contains(self._data, pat, ignore_case=not case)
- if not isna(na):
- result = result.fill_null(na)
- return type(self)(result)
-
- def _str_startswith(self, pat: str, na=None):
- result = pc.starts_with(self._data, pattern=pat)
- if not isna(na):
- result = result.fill_null(na)
- return type(self)(result)
-
- def _str_endswith(self, pat: str, na=None):
- result = pc.ends_with(self._data, pattern=pat)
- if not isna(na):
- result = result.fill_null(na)
- return type(self)(result)
-
- def _str_replace(
- self,
- pat: str | re.Pattern,
- repl: str | Callable,
- n: int = -1,
- case: bool = True,
- flags: int = 0,
- regex: bool = True,
- ):
- if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
- raise NotImplementedError(
- "replace is not supported with a re.Pattern, callable repl, "
- "case=False, or flags!=0"
- )
-
- func = pc.replace_substring_regex if regex else pc.replace_substring
- result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
- return type(self)(result)
-
- def _str_repeat(self, repeats: int | Sequence[int]):
- if not isinstance(repeats, int):
- raise NotImplementedError(
- f"repeat is not implemented when repeats is {type(repeats).__name__}"
- )
- elif pa_version_under7p0:
- raise NotImplementedError("repeat is not implemented for pyarrow < 7")
- else:
- return type(self)(pc.binary_repeat(self._data, repeats))
-
- def _str_match(
- self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.startswith("^"):
- pat = f"^{pat}"
- return self._str_contains(pat, case, flags, na, regex=True)
-
- def _str_fullmatch(
- self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
-        # append "$" unless the pattern already ends with an unescaped "$"
-        if not pat.endswith("$") or pat.endswith("\\$"):
- pat = f"{pat}$"
- return self._str_match(pat, case, flags, na)
-
- def _str_find(self, sub: str, start: int = 0, end: int | None = None):
- if start != 0 and end is not None:
- slices = pc.utf8_slice_codeunits(self._data, start, stop=end)
- result = pc.find_substring(slices, sub)
- not_found = pc.equal(result, -1)
- offset_result = pc.add(result, end - start)
- result = pc.if_else(not_found, result, offset_result)
- elif start == 0 and end is None:
- slices = self._data
- result = pc.find_substring(slices, sub)
- else:
- raise NotImplementedError(
- f"find not implemented with {sub=}, {start=}, {end=}"
- )
- return type(self)(result)
-
- def _str_get(self, i: int):
- lengths = pc.utf8_length(self._data)
- if i >= 0:
- out_of_bounds = pc.greater_equal(i, lengths)
- start = i
- stop = i + 1
- step = 1
- else:
- out_of_bounds = pc.greater(-i, lengths)
- start = i
- stop = i - 1
- step = -1
- not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
- selected = pc.utf8_slice_codeunits(
- self._data, start=start, stop=stop, step=step
- )
- result = pa.array([None] * self._data.length(), type=self._data.type)
- result = pc.if_else(not_out_of_bounds, selected, result)
- return type(self)(result)
-
- def _str_join(self, sep: str):
- return type(self)(pc.binary_join(self._data, sep))
-
- def _str_partition(self, sep: str, expand: bool):
- predicate = lambda val: val.partition(sep)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_rpartition(self, sep: str, expand: bool):
- predicate = lambda val: val.rpartition(sep)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_slice(
- self, start: int | None = None, stop: int | None = None, step: int | None = None
- ):
- if start is None:
- start = 0
- if step is None:
- step = 1
- return type(self)(
- pc.utf8_slice_codeunits(self._data, start=start, stop=stop, step=step)
- )
-
- def _str_slice_replace(
- self, start: int | None = None, stop: int | None = None, repl: str | None = None
- ):
- if repl is None:
- repl = ""
- if start is None:
- start = 0
- return type(self)(pc.utf8_replace_slice(self._data, start, stop, repl))
-
- def _str_isalnum(self):
- return type(self)(pc.utf8_is_alnum(self._data))
-
- def _str_isalpha(self):
- return type(self)(pc.utf8_is_alpha(self._data))
-
- def _str_isdecimal(self):
- return type(self)(pc.utf8_is_decimal(self._data))
-
- def _str_isdigit(self):
- return type(self)(pc.utf8_is_digit(self._data))
-
- def _str_islower(self):
- return type(self)(pc.utf8_is_lower(self._data))
-
- def _str_isnumeric(self):
- return type(self)(pc.utf8_is_numeric(self._data))
-
- def _str_isspace(self):
- return type(self)(pc.utf8_is_space(self._data))
-
- def _str_istitle(self):
- return type(self)(pc.utf8_is_title(self._data))
-
- def _str_capitalize(self):
- return type(self)(pc.utf8_capitalize(self._data))
-
- def _str_title(self):
- return type(self)(pc.utf8_title(self._data))
-
- def _str_isupper(self):
- return type(self)(pc.utf8_is_upper(self._data))
-
- def _str_swapcase(self):
- return type(self)(pc.utf8_swapcase(self._data))
-
- def _str_len(self):
- return type(self)(pc.utf8_length(self._data))
-
- def _str_lower(self):
- return type(self)(pc.utf8_lower(self._data))
-
- def _str_upper(self):
- return type(self)(pc.utf8_upper(self._data))
-
- def _str_strip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_trim_whitespace(self._data)
- else:
- result = pc.utf8_trim(self._data, characters=to_strip)
- return type(self)(result)
-
- def _str_lstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_ltrim_whitespace(self._data)
- else:
- result = pc.utf8_ltrim(self._data, characters=to_strip)
- return type(self)(result)
-
- def _str_rstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_rtrim_whitespace(self._data)
- else:
- result = pc.utf8_rtrim(self._data, characters=to_strip)
- return type(self)(result)
-
- def _str_removeprefix(self, prefix: str):
- # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
- # starts_with = pc.starts_with(self._data, pattern=prefix)
- # removed = pc.utf8_slice_codeunits(self._data, len(prefix))
- # result = pc.if_else(starts_with, removed, self._data)
- # return type(self)(result)
- if sys.version_info < (3, 9):
- # NOTE pyupgrade will remove this when we run it with --py39-plus
- # so don't remove the unnecessary `else` statement below
- from pandas.util._str_methods import removeprefix
-
- predicate = functools.partial(removeprefix, prefix=prefix)
- else:
- predicate = lambda val: val.removeprefix(prefix)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
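A short usage sketch of the element-wise fallback above (the commented-out
pyarrow path is pending apache/arrow#14991), assuming pandas 2.0 with pyarrow:

import pandas as pd
import pyarrow as pa

s = pd.Series(["abc", "abd", "bcd"], dtype=pd.ArrowDtype(pa.string()))
s.str.removeprefix("ab")   # -> ["c", "d", "bcd"], still Arrow-backed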
- def _str_removesuffix(self, suffix: str):
- ends_with = pc.ends_with(self._data, pattern=suffix)
- removed = pc.utf8_slice_codeunits(self._data, 0, stop=-len(suffix))
- result = pc.if_else(ends_with, removed, self._data)
- return type(self)(result)
-
- def _str_casefold(self):
- predicate = lambda val: val.casefold()
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_encode(self, encoding: str, errors: str = "strict"):
- predicate = lambda val: val.encode(encoding, errors)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
- raise NotImplementedError(
- "str.extract not supported with pd.ArrowDtype(pa.string())."
- )
-
- def _str_findall(self, pat: str, flags: int = 0):
- regex = re.compile(pat, flags=flags)
- predicate = lambda val: regex.findall(val)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_get_dummies(self, sep: str = "|"):
- split = pc.split_pattern(self._data, sep).combine_chunks()
- uniques = split.flatten().unique()
- uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
- result_data = []
- for lst in split.to_pylist():
- if lst is None:
- result_data.append([False] * len(uniques_sorted))
- else:
- res = pc.is_in(uniques_sorted, pa.array(set(lst)))
- result_data.append(res.to_pylist())
- result = type(self)(pa.array(result_data))
- return result, uniques_sorted.to_pylist()
-
- def _str_index(self, sub: str, start: int = 0, end: int | None = None):
- predicate = lambda val: val.index(sub, start, end)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
- predicate = lambda val: val.rindex(sub, start, end)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_normalize(self, form: str):
- predicate = lambda val: unicodedata.normalize(form, val)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_rfind(self, sub: str, start: int = 0, end=None):
- predicate = lambda val: val.rfind(sub, start, end)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_split(
- self,
- pat: str | None = None,
- n: int | None = -1,
- expand: bool = False,
- regex: bool | None = None,
- ):
- if n in {-1, 0}:
- n = None
- if regex:
- split_func = pc.split_pattern_regex
- else:
- split_func = pc.split_pattern
- return type(self)(split_func(self._data, pat, max_splits=n))
-
- def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
- if n in {-1, 0}:
- n = None
- return type(self)(pc.split_pattern(self._data, pat, max_splits=n, reverse=True))
-
- def _str_translate(self, table: dict[int, str]):
- predicate = lambda val: val.translate(table)
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- def _str_wrap(self, width: int, **kwargs):
- kwargs["width"] = width
- tw = textwrap.TextWrapper(**kwargs)
- predicate = lambda val: "\n".join(tw.wrap(val))
- result = self._apply_elementwise(predicate)
- return type(self)(pa.chunked_array(result))
-
- @property
- def _dt_year(self):
- return type(self)(pc.year(self._data))
-
- @property
- def _dt_day(self):
- return type(self)(pc.day(self._data))
-
- @property
- def _dt_day_of_week(self):
- return type(self)(pc.day_of_week(self._data))
-
- _dt_dayofweek = _dt_day_of_week
- _dt_weekday = _dt_day_of_week
-
- @property
- def _dt_day_of_year(self):
- return type(self)(pc.day_of_year(self._data))
-
- _dt_dayofyear = _dt_day_of_year
-
- @property
- def _dt_hour(self):
- return type(self)(pc.hour(self._data))
-
- def _dt_isocalendar(self):
- return type(self)(pc.iso_calendar(self._data))
-
- @property
- def _dt_is_leap_year(self):
- return type(self)(pc.is_leap_year(self._data))
-
- @property
- def _dt_microsecond(self):
- return type(self)(pc.microsecond(self._data))
-
- @property
- def _dt_minute(self):
- return type(self)(pc.minute(self._data))
-
- @property
- def _dt_month(self):
- return type(self)(pc.month(self._data))
-
- @property
- def _dt_nanosecond(self):
- return type(self)(pc.nanosecond(self._data))
-
- @property
- def _dt_quarter(self):
- return type(self)(pc.quarter(self._data))
-
- @property
- def _dt_second(self):
- return type(self)(pc.second(self._data))
-
- @property
- def _dt_date(self):
- return type(self)(self._data.cast(pa.date32()))
-
- @property
- def _dt_time(self):
- unit = (
- self.dtype.pyarrow_dtype.unit
- if self.dtype.pyarrow_dtype.unit in {"us", "ns"}
- else "ns"
- )
- return type(self)(self._data.cast(pa.time64(unit)))
-
- @property
- def _dt_tz(self):
- return self.dtype.pyarrow_dtype.tz
-
- def _dt_strftime(self, format: str):
- return type(self)(pc.strftime(self._data, format=format))
-
- def _round_temporally(
- self,
- method: Literal["ceil", "floor", "round"],
- freq,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- if ambiguous != "raise":
- raise NotImplementedError("ambiguous is not supported.")
- if nonexistent != "raise":
- raise NotImplementedError("nonexistent is not supported.")
- offset = to_offset(freq)
- if offset is None:
- raise ValueError(f"Must specify a valid frequency: {freq}")
- pa_supported_unit = {
- "A": "year",
- "AS": "year",
- "Q": "quarter",
- "QS": "quarter",
- "M": "month",
- "MS": "month",
- "W": "week",
- "D": "day",
- "H": "hour",
- "T": "minute",
- "S": "second",
- "L": "millisecond",
- "U": "microsecond",
- "N": "nanosecond",
- }
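- # e.g. freq="15T" resolves to prefix "T" and n=15, so the call below
- # becomes pc.<method>_temporal(self._data, multiple=15, unit="minute").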
- unit = pa_supported_unit.get(offset._prefix, None)
- if unit is None:
- raise ValueError(f"{freq=} is not supported")
- multiple = offset.n
- rounding_method = getattr(pc, f"{method}_temporal")
- return type(self)(rounding_method(self._data, multiple=multiple, unit=unit))
-
- def _dt_ceil(
- self,
- freq,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- return self._round_temporally("ceil", freq, ambiguous, nonexistent)
-
- def _dt_floor(
- self,
- freq,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- return self._round_temporally("floor", freq, ambiguous, nonexistent)
-
- def _dt_round(
- self,
- freq,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- return self._round_temporally("round", freq, ambiguous, nonexistent)
-
- def _dt_to_pydatetime(self):
- if pa.types.is_date(self.dtype.pyarrow_dtype):
- raise ValueError(
- f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
- "Convert to pyarrow timestamp type."
- )
- data = self._data.to_pylist()
- if self._dtype.pyarrow_dtype.unit == "ns":
- data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
- return np.array(data, dtype=object)
-
- def _dt_tz_localize(
- self,
- tz,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- if ambiguous != "raise":
- raise NotImplementedError(f"{ambiguous=} is not supported")
- nonexistent_pa = {
- "raise": "raise",
- "shift_backward": "earliest",
- "shift_forward": "latest",
- }.get(
- nonexistent, None # type: ignore[arg-type]
- )
- if nonexistent_pa is None:
- raise NotImplementedError(f"{nonexistent=} is not supported")
- if tz is None:
- result = self._data.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
- else:
- result = pc.assume_timezone(
- self._data, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
- )
- return type(self)(result)
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/arrow/dtype.py b/contrib/python/pandas/py3/pandas/core/arrays/arrow/dtype.py
deleted file mode 100644
index 7d4fbb788cc..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/arrow/dtype.py
+++ /dev/null
@@ -1,312 +0,0 @@
-from __future__ import annotations
-
-from datetime import (
- date,
- datetime,
- time,
- timedelta,
-)
-from decimal import Decimal
-import re
-
-import numpy as np
-
-from pandas._libs.tslibs import (
- Timedelta,
- Timestamp,
-)
-from pandas._typing import (
- TYPE_CHECKING,
- DtypeObj,
- type_t,
-)
-from pandas.compat import pa_version_under7p0
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.base import (
- StorageExtensionDtype,
- register_extension_dtype,
-)
-from pandas.core.dtypes.dtypes import CategoricalDtypeType
-
-if not pa_version_under7p0:
- import pyarrow as pa
-
-if TYPE_CHECKING:
- from pandas.core.arrays.arrow import ArrowExtensionArray
-
-
-@register_extension_dtype
-class ArrowDtype(StorageExtensionDtype):
- """
- An ExtensionDtype for PyArrow data types.
-
- .. warning::
-
- ArrowDtype is considered experimental. The implementation and
- parts of the API may change without warning.
-
- While most ``dtype`` arguments can accept the "string"
- constructor, e.g. ``"int64[pyarrow]"``, ArrowDtype is useful
- if the data type contains parameters like ``pyarrow.timestamp``.
-
- Parameters
- ----------
- pyarrow_dtype : pa.DataType
- An instance of a `pyarrow.DataType <https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions>`__.
-
- Attributes
- ----------
- pyarrow_dtype
-
- Methods
- -------
- None
-
- Returns
- -------
- ArrowDtype
-
- Examples
- --------
- >>> import pyarrow as pa
- >>> pd.ArrowDtype(pa.int64())
- int64[pyarrow]
-
- Types with parameters must be constructed with ArrowDtype.
-
- >>> pd.ArrowDtype(pa.timestamp("s", tz="America/New_York"))
- timestamp[s, tz=America/New_York][pyarrow]
- >>> pd.ArrowDtype(pa.list_(pa.int64()))
- list<item: int64>[pyarrow]
- """ # noqa: E501
-
- _metadata = ("storage", "pyarrow_dtype") # type: ignore[assignment]
-
- def __init__(self, pyarrow_dtype: pa.DataType) -> None:
- super().__init__("pyarrow")
- if pa_version_under7p0:
- raise ImportError("pyarrow>=7.0.0 is required for ArrowDtype")
- if not isinstance(pyarrow_dtype, pa.DataType):
- raise ValueError(
- f"pyarrow_dtype ({pyarrow_dtype}) must be an instance "
- f"of a pyarrow.DataType. Got {type(pyarrow_dtype)} instead."
- )
- self.pyarrow_dtype = pyarrow_dtype
-
- def __repr__(self) -> str:
- return self.name
-
- @property
- def type(self):
- """
- Returns associated scalar type.
- """
- pa_type = self.pyarrow_dtype
- if pa.types.is_integer(pa_type):
- return int
- elif pa.types.is_floating(pa_type):
- return float
- elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
- return str
- elif (
- pa.types.is_binary(pa_type)
- or pa.types.is_fixed_size_binary(pa_type)
- or pa.types.is_large_binary(pa_type)
- ):
- return bytes
- elif pa.types.is_boolean(pa_type):
- return bool
- elif pa.types.is_duration(pa_type):
- if pa_type.unit == "ns":
- return Timedelta
- else:
- return timedelta
- elif pa.types.is_timestamp(pa_type):
- if pa_type.unit == "ns":
- return Timestamp
- else:
- return datetime
- elif pa.types.is_date(pa_type):
- return date
- elif pa.types.is_time(pa_type):
- return time
- elif pa.types.is_decimal(pa_type):
- return Decimal
- elif pa.types.is_dictionary(pa_type):
- # TODO: Potentially change this & CategoricalDtype.type to
- # something more representative of the scalar
- return CategoricalDtypeType
- elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type):
- return list
- elif pa.types.is_map(pa_type):
- return list
- elif pa.types.is_struct(pa_type):
- return dict
- elif pa.types.is_null(pa_type):
- # TODO: None? pd.NA? pa.null?
- return type(pa_type)
- else:
- raise NotImplementedError(pa_type)
-
- @property
- def name(self) -> str: # type: ignore[override]
- """
- A string identifying the data type.
- """
- return f"{str(self.pyarrow_dtype)}[{self.storage}]"
-
- @cache_readonly
- def numpy_dtype(self) -> np.dtype:
- """Return an instance of the related numpy dtype"""
- if pa.types.is_string(self.pyarrow_dtype):
- # pa.string().to_pandas_dtype() = object which we don't want
- return np.dtype(str)
- try:
- return np.dtype(self.pyarrow_dtype.to_pandas_dtype())
- except (NotImplementedError, TypeError):
- return np.dtype(object)
-
- @cache_readonly
- def kind(self) -> str:
- if pa.types.is_timestamp(self.pyarrow_dtype):
- # To mirror DatetimeTZDtype
- return "M"
- return self.numpy_dtype.kind
-
- @cache_readonly
- def itemsize(self) -> int:
- """Return the number of bytes in this dtype"""
- return self.numpy_dtype.itemsize
-
- @classmethod
- def construct_array_type(cls) -> type_t[ArrowExtensionArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas.core.arrays.arrow import ArrowExtensionArray
-
- return ArrowExtensionArray
-
- @classmethod
- def construct_from_string(cls, string: str) -> ArrowDtype:
- """
- Construct this type from a string.
-
- Parameters
- ----------
- string : str
- string should follow the format f"{pyarrow_type}[pyarrow]"
- e.g. int64[pyarrow]
- """
- if not isinstance(string, str):
- raise TypeError(
- f"'construct_from_string' expects a string, got {type(string)}"
- )
- if not string.endswith("[pyarrow]"):
- raise TypeError(f"'{string}' must end with '[pyarrow]'")
- if string == "string[pyarrow]":
- # Ensure Registry.find skips ArrowDtype to use StringDtype instead
- raise TypeError("string[pyarrow] should be constructed by StringDtype")
-
- base_type = string[:-9] # get rid of "[pyarrow]"
- try:
- pa_dtype = pa.type_for_alias(base_type)
- except ValueError as err:
- has_parameters = re.search(r"[\[\(].*[\]\)]", base_type)
- if has_parameters:
- # Fallback to try common temporal types
- try:
- return cls._parse_temporal_dtype_string(base_type)
- except (NotImplementedError, ValueError):
- # Fall through to raise with nice exception message below
- pass
-
- raise NotImplementedError(
- "Passing pyarrow type specific parameters "
- f"({has_parameters.group()}) in the string is not supported. "
- "Please construct an ArrowDtype object with a pyarrow_dtype "
- "instance with specific parameters."
- ) from err
- raise TypeError(f"'{base_type}' is not a valid pyarrow data type.") from err
- return cls(pa_dtype)
-
- # TODO(arrow#33642): This can be removed once supported by pyarrow
- @classmethod
- def _parse_temporal_dtype_string(cls, string: str) -> ArrowDtype:
- """
- Construct a temporal ArrowDtype from string.
- """
- # we assume
- # 1) "[pyarrow]" has already been stripped from the end of our string.
- # 2) we know "[" is present
- head, tail = string.split("[", 1)
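- # e.g. "timestamp[s, tz=America/New_York]" gives head="timestamp" and
- # tail="s, tz=America/New_York]"; the trailing "]" is stripped next.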
-
- if not tail.endswith("]"):
- raise ValueError
- tail = tail[:-1]
-
- if head == "timestamp":
- assert "," in tail # otherwise type_for_alias should work
- unit, tz = tail.split(",", 1)
- unit = unit.strip()
- tz = tz.strip()
- if tz.startswith("tz="):
- tz = tz[3:]
-
- pa_type = pa.timestamp(unit, tz=tz)
- dtype = cls(pa_type)
- return dtype
-
- raise NotImplementedError(string)
-
- @property
- def _is_numeric(self) -> bool:
- """
- Whether columns with this dtype should be considered numeric.
- """
- # TODO: pa.types.is_boolean?
- return (
- pa.types.is_integer(self.pyarrow_dtype)
- or pa.types.is_floating(self.pyarrow_dtype)
- or pa.types.is_decimal(self.pyarrow_dtype)
- )
-
- @property
- def _is_boolean(self) -> bool:
- """
- Whether this dtype should be considered boolean.
- """
- return pa.types.is_boolean(self.pyarrow_dtype)
-
- def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
- # We unwrap any masked dtypes, find the common dtype we would use
- # for that, then re-mask the result.
- # Mirrors BaseMaskedDtype
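- # e.g. combining int64[pyarrow] with float64[pyarrow] unwraps to NumPy
- # int64/float64, whose common type is float64, which is re-wrapped below
- # as ArrowDtype(pa.float64()).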
- from pandas.core.dtypes.cast import find_common_type
-
- new_dtype = find_common_type(
- [
- dtype.numpy_dtype if isinstance(dtype, ArrowDtype) else dtype
- for dtype in dtypes
- ]
- )
- if not isinstance(new_dtype, np.dtype):
- return None
- try:
- pa_dtype = pa.from_numpy_dtype(new_dtype)
- return type(self)(pa_dtype)
- except NotImplementedError:
- return None
-
- def __from_arrow__(self, array: pa.Array | pa.ChunkedArray):
- """
- Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
- """
- array_class = self.construct_array_type()
- arr = array.cast(self.pyarrow_dtype, safe=True)
- return array_class(arr)
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/arrow/extension_types.py b/contrib/python/pandas/py3/pandas/core/arrays/arrow/extension_types.py
deleted file mode 100644
index 25f597af5e3..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/arrow/extension_types.py
+++ /dev/null
@@ -1,111 +0,0 @@
-from __future__ import annotations
-
-import json
-
-import pyarrow
-
-from pandas._typing import IntervalClosedType
-
-from pandas.core.arrays.interval import VALID_CLOSED
-
-
-class ArrowPeriodType(pyarrow.ExtensionType):
- def __init__(self, freq) -> None:
- # attributes need to be set first before calling
- # super init (as that calls serialize)
- self._freq = freq
- pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")
-
- @property
- def freq(self):
- return self._freq
-
- def __arrow_ext_serialize__(self) -> bytes:
- metadata = {"freq": self.freq}
- return json.dumps(metadata).encode()
-
- @classmethod
- def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowPeriodType:
- metadata = json.loads(serialized.decode())
- return ArrowPeriodType(metadata["freq"])
-
- def __eq__(self, other):
- if isinstance(other, pyarrow.BaseExtensionType):
- return type(self) == type(other) and self.freq == other.freq
- else:
- return NotImplemented
-
- def __ne__(self, other) -> bool:
- return not self == other
-
- def __hash__(self) -> int:
- return hash((str(self), self.freq))
-
- def to_pandas_dtype(self):
- import pandas as pd
-
- return pd.PeriodDtype(freq=self.freq)
-
-
-# register the type with a dummy instance
-_period_type = ArrowPeriodType("D")
-pyarrow.register_extension_type(_period_type)
-
-
-class ArrowIntervalType(pyarrow.ExtensionType):
- def __init__(self, subtype, closed: IntervalClosedType) -> None:
- # attributes need to be set first before calling
- # super init (as that calls serialize)
- assert closed in VALID_CLOSED
- self._closed: IntervalClosedType = closed
- if not isinstance(subtype, pyarrow.DataType):
- subtype = pyarrow.type_for_alias(str(subtype))
- self._subtype = subtype
-
- storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
- pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")
-
- @property
- def subtype(self):
- return self._subtype
-
- @property
- def closed(self) -> IntervalClosedType:
- return self._closed
-
- def __arrow_ext_serialize__(self) -> bytes:
- metadata = {"subtype": str(self.subtype), "closed": self.closed}
- return json.dumps(metadata).encode()
-
- @classmethod
- def __arrow_ext_deserialize__(cls, storage_type, serialized) -> ArrowIntervalType:
- metadata = json.loads(serialized.decode())
- subtype = pyarrow.type_for_alias(metadata["subtype"])
- closed = metadata["closed"]
- return ArrowIntervalType(subtype, closed)
-
- def __eq__(self, other):
- if isinstance(other, pyarrow.BaseExtensionType):
- return (
- type(self) == type(other)
- and self.subtype == other.subtype
- and self.closed == other.closed
- )
- else:
- return NotImplemented
-
- def __ne__(self, other) -> bool:
- return not self == other
-
- def __hash__(self) -> int:
- return hash((str(self), str(self.subtype), self.closed))
-
- def to_pandas_dtype(self):
- import pandas as pd
-
- return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)
-
-
-# register the type with a dummy instance
-_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
-pyarrow.register_extension_type(_interval_type)
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/base.py b/contrib/python/pandas/py3/pandas/core/arrays/base.py
deleted file mode 100644
index db8c87f0654..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/base.py
+++ /dev/null
@@ -1,1873 +0,0 @@
-"""
-An interface for extending pandas with custom arrays.
-
-.. warning::
-
- This is an experimental API and subject to breaking changes
- without warning.
-"""
-from __future__ import annotations
-
-import operator
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- ClassVar,
- Iterator,
- Literal,
- Sequence,
- TypeVar,
- cast,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- ArrayLike,
- AstypeArg,
- AxisInt,
- Dtype,
- FillnaOptions,
- PositionalIndexer,
- ScalarIndexer,
- SequenceIndexer,
- Shape,
- SortKind,
- TakeIndexer,
- npt,
-)
-from pandas.compat import set_function_name
-from pandas.compat.numpy import function as nv
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import (
- Appender,
- Substitution,
- cache_readonly,
-)
-from pandas.util._validators import (
- validate_bool_kwarg,
- validate_fillna_kwargs,
- validate_insert_loc,
-)
-
-from pandas.core.dtypes.cast import maybe_cast_to_extension_array
-from pandas.core.dtypes.common import (
- is_datetime64_dtype,
- is_dtype_equal,
- is_list_like,
- is_scalar,
- is_timedelta64_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import (
- arraylike,
- missing,
- roperator,
-)
-from pandas.core.algorithms import (
- factorize_array,
- isin,
- mode,
- rank,
- unique,
-)
-from pandas.core.array_algos.quantile import quantile_with_mask
-from pandas.core.sorting import (
- nargminmax,
- nargsort,
-)
-
-if TYPE_CHECKING:
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
-
-_extension_array_shared_docs: dict[str, str] = {}
-
-ExtensionArrayT = TypeVar("ExtensionArrayT", bound="ExtensionArray")
-
-
-class ExtensionArray:
- """
- Abstract base class for custom 1-D array types.
-
- pandas will recognize instances of this class as proper arrays
- with a custom type and will not attempt to coerce them to objects. They
- may be stored directly inside a :class:`DataFrame` or :class:`Series`.
-
- Attributes
- ----------
- dtype
- nbytes
- ndim
- shape
-
- Methods
- -------
- argsort
- astype
- copy
- dropna
- factorize
- fillna
- equals
- insert
- isin
- isna
- ravel
- repeat
- searchsorted
- shift
- take
- tolist
- unique
- view
- _accumulate
- _concat_same_type
- _formatter
- _from_factorized
- _from_sequence
- _from_sequence_of_strings
- _reduce
- _values_for_argsort
- _values_for_factorize
-
- Notes
- -----
- The interface includes the following abstract methods that must be
- implemented by subclasses:
-
- * _from_sequence
- * _from_factorized
- * __getitem__
- * __len__
- * __eq__
- * dtype
- * nbytes
- * isna
- * take
- * copy
- * _concat_same_type
-
- A default repr displaying the type, (truncated) data, length,
- and dtype is provided. It can be customized or replaced
- by overriding:
-
- * __repr__ : A default repr for the ExtensionArray.
- * _formatter : Print scalars inside a Series or DataFrame.
-
- Some methods require casting the ExtensionArray to an ndarray of Python
- objects with ``self.astype(object)``, which may be expensive. When
- performance is a concern, we highly recommend overriding the following
- methods:
-
- * fillna
- * dropna
- * unique
- * factorize / _values_for_factorize
- * argsort, argmax, argmin / _values_for_argsort
- * searchsorted
-
- The remaining methods implemented on this class should be performant,
- as they only compose abstract methods. Still, a more efficient
- implementation may be available, and these methods can be overridden.
-
- One can implement methods to handle array accumulations or reductions.
-
- * _accumulate
- * _reduce
-
- One can implement methods to handle parsing from strings that will be used
- in methods such as ``pandas.io.parsers.read_csv``.
-
- * _from_sequence_of_strings
-
- This class does not inherit from 'abc.ABCMeta' for performance reasons.
- Methods and properties required by the interface raise
- ``pandas.errors.AbstractMethodError`` and no ``register`` method is
- provided for registering virtual subclasses.
-
- ExtensionArrays are limited to 1 dimension.
-
- They may be backed by none, one, or many NumPy arrays. For example,
- ``pandas.Categorical`` is an extension array backed by two arrays,
- one for codes and one for categories. An array of IPv6 addresses may
- be backed by a NumPy structured array with two fields, one for the
- lower 64 bits and one for the upper 64 bits. Or they may be backed
- by some other storage type, like Python lists. Pandas makes no
- assumptions on how the data are stored, just that it can be converted
- to a NumPy array.
- The ExtensionArray interface does not impose any rules on how this data
- is stored. However, currently, the backing data cannot be stored in
- attributes called ``.values`` or ``._values`` to ensure full compatibility
- with pandas internals. But other names such as ``.data``, ``._data``,
- ``._items``, ... can be freely used.
-
- If implementing NumPy's ``__array_ufunc__`` interface, pandas expects
- that
-
- 1. You defer by returning ``NotImplemented`` when any Series are present
- in `inputs`. Pandas will extract the arrays and call the ufunc again.
- 2. You define a ``_HANDLED_TYPES`` tuple as an attribute on the class.
- Pandas inspects this to determine whether the ufunc is valid for the
- types present.
-
- See :ref:`extending.extension.ufunc` for more.
-
- By default, ExtensionArrays are not hashable. Immutable subclasses may
- override this behavior.
- """
-
- # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray.
- # Don't override this.
- _typ = "extension"
-
- # ------------------------------------------------------------------------
- # Constructors
- # ------------------------------------------------------------------------
-
- @classmethod
- def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
- """
- Construct a new ExtensionArray from a sequence of scalars.
-
- Parameters
- ----------
- scalars : Sequence
- Each element will be an instance of the scalar type for this
- array, ``cls.dtype.type`` or be converted into this type in this method.
- dtype : dtype, optional
- Construct for this particular dtype. This should be a Dtype
- compatible with the ExtensionArray.
- copy : bool, default False
- If True, copy the underlying data.
-
- Returns
- -------
- ExtensionArray
- """
- raise AbstractMethodError(cls)
-
- @classmethod
- def _from_sequence_of_strings(
- cls, strings, *, dtype: Dtype | None = None, copy: bool = False
- ):
- """
- Construct a new ExtensionArray from a sequence of strings.
-
- Parameters
- ----------
- strings : Sequence
- Each element will be an instance of the scalar type for this
- array, ``cls.dtype.type``.
- dtype : dtype, optional
- Construct for this particular dtype. This should be a Dtype
- compatible with the ExtensionArray.
- copy : bool, default False
- If True, copy the underlying data.
-
- Returns
- -------
- ExtensionArray
- """
- raise AbstractMethodError(cls)
-
- @classmethod
- def _from_factorized(cls, values, original):
- """
- Reconstruct an ExtensionArray after factorization.
-
- Parameters
- ----------
- values : ndarray
- An integer ndarray with the factorized values.
- original : ExtensionArray
- The original ExtensionArray that factorize was called on.
-
- See Also
- --------
- factorize : Top-level factorize method that dispatches here.
- ExtensionArray.factorize : Encode the extension array as an enumerated type.
- """
- raise AbstractMethodError(cls)
-
- # ------------------------------------------------------------------------
- # Must be a Sequence
- # ------------------------------------------------------------------------
- @overload
- def __getitem__(self, item: ScalarIndexer) -> Any:
- ...
-
- @overload
- def __getitem__(self: ExtensionArrayT, item: SequenceIndexer) -> ExtensionArrayT:
- ...
-
- def __getitem__(
- self: ExtensionArrayT, item: PositionalIndexer
- ) -> ExtensionArrayT | Any:
- """
- Select a subset of self.
-
- Parameters
- ----------
- item : int, slice, or ndarray
- * int: The position in 'self' to get.
-
- * slice: A slice object, where 'start', 'stop', and 'step' are
- integers or None
-
- * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
-
- * list[int]: A list of int
-
- Returns
- -------
- item : scalar or ExtensionArray
-
- Notes
- -----
- For scalar ``item``, return a scalar value suitable for the array's
- type. This should be an instance of ``self.dtype.type``.
-
- For slice ``key``, return an instance of ``ExtensionArray``, even
- if the slice is length 0 or 1.
-
- For a boolean mask, return an instance of ``ExtensionArray``, filtered
- to the values where ``item`` is True.
- """
- raise AbstractMethodError(self)
-
- def __setitem__(self, key, value) -> None:
- """
- Set one or more values inplace.
-
- This method is not required to satisfy the pandas extension array
- interface.
-
- Parameters
- ----------
- key : int, ndarray, or slice
- When called from, e.g. ``Series.__setitem__``, ``key`` will be
- one of
-
- * scalar int
- * ndarray of integers.
- * boolean ndarray
- * slice object
-
- value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
- value or values to be set for ``key``.
-
- Returns
- -------
- None
- """
- # Some notes to the ExtensionArray implementor who may have ended up
- # here. While this method is not required for the interface, if you
- # *do* choose to implement __setitem__, then some semantics should be
- # observed:
- #
- # * Setting multiple values : ExtensionArrays should support setting
- # multiple values at once, 'key' will be a sequence of integers and
- # 'value' will be a same-length sequence.
- #
- # * Broadcasting : For a sequence 'key' and a scalar 'value',
- # each position in 'key' should be set to 'value'.
- #
- # * Coercion : Most users will expect basic coercion to work. For
- # example, a string like '2018-01-01' is coerced to a datetime
- # when setting on a datetime64ns array. In general, if the
- # __init__ method coerces that value, then so should __setitem__
- # Note, also, that Series/DataFrame.where internally use __setitem__
- # on a copy of the data.
- raise NotImplementedError(f"{type(self)} does not implement __setitem__.")
-
- def __len__(self) -> int:
- """
- Length of this array
-
- Returns
- -------
- length : int
- """
- raise AbstractMethodError(self)
-
- def __iter__(self) -> Iterator[Any]:
- """
- Iterate over elements of the array.
- """
- # This needs to be implemented so that pandas recognizes extension
- # arrays as list-like. The default implementation makes successive
- # calls to ``__getitem__``, which may be slower than necessary.
- for i in range(len(self)):
- yield self[i]
-
- def __contains__(self, item: object) -> bool | np.bool_:
- """
- Return for `item in self`.
- """
- # GH37867
- # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA]
- # would raise a TypeError. The implementation below works around that.
- if is_scalar(item) and isna(item):
- if not self._can_hold_na:
- return False
- elif item is self.dtype.na_value or isinstance(item, self.dtype.type):
- return self._hasna
- else:
- return False
- else:
- # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no
- # attribute "any"
- return (item == self).any() # type: ignore[union-attr]
-
- # error: Signature of "__eq__" incompatible with supertype "object"
- def __eq__(self, other: Any) -> ArrayLike: # type: ignore[override]
- """
- Return for `self == other` (element-wise equality).
- """
- # Implementer note: this should return a boolean numpy ndarray or
- # a boolean ExtensionArray.
- # When `other` is one of Series, Index, or DataFrame, this method should
- # return NotImplemented (to ensure that those objects are responsible for
- # first unpacking the arrays, and then dispatch the operation to the
- # underlying arrays)
- raise AbstractMethodError(self)
-
- # error: Signature of "__ne__" incompatible with supertype "object"
- def __ne__(self, other: Any) -> ArrayLike: # type: ignore[override]
- """
- Return for `self != other` (element-wise inequality).
- """
- return ~(self == other)
-
- def to_numpy(
- self,
- dtype: npt.DTypeLike | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- """
- Convert to a NumPy ndarray.
-
- This is similar to :meth:`numpy.asarray`, but may provide additional control
- over how the conversion is done.
-
- Parameters
- ----------
- dtype : str or numpy.dtype, optional
- The dtype to pass to :meth:`numpy.asarray`.
- copy : bool, default False
- Whether to ensure that the returned value is not a view on
- another array. Note that ``copy=False`` does not *ensure* that
- ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
- a copy is made, even if not strictly necessary.
- na_value : Any, optional
- The value to use for missing values. The default value depends
- on `dtype` and the type of the array.
-
- Returns
- -------
- numpy.ndarray
- """
- result = np.asarray(self, dtype=dtype)
- if copy or na_value is not lib.no_default:
- result = result.copy()
- if na_value is not lib.no_default:
- result[self.isna()] = na_value
- return result
-
- # ------------------------------------------------------------------------
- # Required attributes
- # ------------------------------------------------------------------------
-
- @property
- def dtype(self) -> ExtensionDtype:
- """
- An instance of 'ExtensionDtype'.
- """
- raise AbstractMethodError(self)
-
- @property
- def shape(self) -> Shape:
- """
- Return a tuple of the array dimensions.
- """
- return (len(self),)
-
- @property
- def size(self) -> int:
- """
- The number of elements in the array.
- """
- # error: Incompatible return value type (got "signedinteger[_64Bit]",
- # expected "int") [return-value]
- return np.prod(self.shape) # type: ignore[return-value]
-
- @property
- def ndim(self) -> int:
- """
- Extension Arrays are only allowed to be 1-dimensional.
- """
- return 1
-
- @property
- def nbytes(self) -> int:
- """
- The number of bytes needed to store this object in memory.
- """
- # If this is expensive to compute, return an approximate lower bound
- # on the number of bytes needed.
- raise AbstractMethodError(self)
-
- # ------------------------------------------------------------------------
- # Additional Methods
- # ------------------------------------------------------------------------
-
- @overload
- def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
- ...
-
- @overload
- def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
- ...
-
- @overload
- def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
- ...
-
- def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
- """
- Cast to a NumPy array or ExtensionArray with 'dtype'.
-
- Parameters
- ----------
- dtype : str or dtype
- Typecode or data-type to which the array is cast.
- copy : bool, default True
- Whether to copy the data, even if not necessary. If False,
- a copy is made only if the old dtype does not match the
- new dtype.
-
- Returns
- -------
- np.ndarray or pandas.api.extensions.ExtensionArray
- An ExtensionArray if dtype is ExtensionDtype,
- Otherwise a NumPy ndarray with 'dtype' for its dtype.
- """
-
- dtype = pandas_dtype(dtype)
- if is_dtype_equal(dtype, self.dtype):
- if not copy:
- return self
- else:
- return self.copy()
-
- if isinstance(dtype, ExtensionDtype):
- cls = dtype.construct_array_type()
- return cls._from_sequence(self, dtype=dtype, copy=copy)
-
- elif is_datetime64_dtype(dtype):
- from pandas.core.arrays import DatetimeArray
-
- return DatetimeArray._from_sequence(self, dtype=dtype, copy=copy)
-
- elif is_timedelta64_dtype(dtype):
- from pandas.core.arrays import TimedeltaArray
-
- return TimedeltaArray._from_sequence(self, dtype=dtype, copy=copy)
-
- return np.array(self, dtype=dtype, copy=copy)
-
- def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll:
- """
- A 1-D array indicating if each value is missing.
-
- Returns
- -------
- numpy.ndarray or pandas.api.extensions.ExtensionArray
- In most cases, this should return a NumPy ndarray. For
- exceptional cases like ``SparseArray``, where returning
- an ndarray would be expensive, an ExtensionArray may be
- returned.
-
- Notes
- -----
- If returning an ExtensionArray, then
-
- * ``na_values._is_boolean`` should be True
- * `na_values` should implement :func:`ExtensionArray._reduce`
- * ``na_values.any`` and ``na_values.all`` should be implemented
- """
- raise AbstractMethodError(self)
-
- @property
- def _hasna(self) -> bool:
- # GH#22680
- """
- Equivalent to `self.isna().any()`.
-
- Some ExtensionArray subclasses may be able to optimize this check.
- """
- return bool(self.isna().any())
-
- def _values_for_argsort(self) -> np.ndarray:
- """
- Return values for sorting.
-
- Returns
- -------
- ndarray
- The transformed values should maintain the ordering between values
- within the array.
-
- See Also
- --------
- ExtensionArray.argsort : Return the indices that would sort this array.
-
- Notes
- -----
- The caller is responsible for *not* modifying these values in-place, so
- it is safe for implementors to give views on `self`.
-
- Functions that use this (e.g. ExtensionArray.argsort) should ignore
- entries with missing values in the original array (according to `self.isna()`).
- This means that the corresponding entries in the returned array don't need to
- be modified to sort correctly.
- """
- # Note: this is used in `ExtensionArray.argsort/argmin/argmax`.
- return np.array(self)
-
- def argsort(
- self,
- *,
- ascending: bool = True,
- kind: SortKind = "quicksort",
- na_position: str = "last",
- **kwargs,
- ) -> np.ndarray:
- """
- Return the indices that would sort this array.
-
- Parameters
- ----------
- ascending : bool, default True
- Whether the indices should result in an ascending
- or descending sort.
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
- Sorting algorithm.
- *args, **kwargs:
- Passed through to :func:`numpy.argsort`.
-
- Returns
- -------
- np.ndarray[np.intp]
- Array of indices that sort ``self``. If NaN values are contained,
- NaN values are placed at the end.
-
- See Also
- --------
- numpy.argsort : Sorting implementation used internally.
- """
- # Implementor note: You have two places to override the behavior of
- # argsort.
- # 1. _values_for_argsort : construct the values passed to np.argsort
- # 2. argsort : total control over sorting. In case of overriding this,
- # it is recommended to also override argmax/argmin
- ascending = nv.validate_argsort_with_ascending(ascending, (), kwargs)
-
- values = self._values_for_argsort()
- return nargsort(
- values,
- kind=kind,
- ascending=ascending,
- na_position=na_position,
- mask=np.asarray(self.isna()),
- )
-
- def argmin(self, skipna: bool = True) -> int:
- """
- Return the index of minimum value.
-
- In case of multiple occurrences of the minimum value, the index
- corresponding to the first occurrence is returned.
-
- Parameters
- ----------
- skipna : bool, default True
-
- Returns
- -------
- int
-
- See Also
- --------
- ExtensionArray.argmax
- """
- # Implementor note: You have two places to override the behavior of
- # argmin.
- # 1. _values_for_argsort : construct the values used in nargminmax
- # 2. argmin itself : total control over sorting.
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return nargminmax(self, "argmin")
-
- def argmax(self, skipna: bool = True) -> int:
- """
- Return the index of maximum value.
-
- In case of multiple occurrences of the maximum value, the index
- corresponding to the first occurrence is returned.
-
- Parameters
- ----------
- skipna : bool, default True
-
- Returns
- -------
- int
-
- See Also
- --------
- ExtensionArray.argmin
- """
- # Implementor note: You have two places to override the behavior of
- # argmax.
- # 1. _values_for_argsort : construct the values used in nargminmax
- # 2. argmax itself : total control over sorting.
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return nargminmax(self, "argmax")
-
- def fillna(
- self: ExtensionArrayT,
- value: object | ArrayLike | None = None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- ) -> ExtensionArrayT:
- """
- Fill NA/NaN values using the specified method.
-
- Parameters
- ----------
- value : scalar, array-like
- If a scalar value is passed it is used to fill all missing values.
- Alternatively, an array-like 'value' can be given. It's expected
- that the array-like have the same length as 'self'.
- method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
- Method to use for filling holes in reindexed Series:
-
- * pad / ffill: propagate last valid observation forward to next valid.
- * backfill / bfill: use NEXT valid observation to fill gap.
-
- limit : int, default None
- If method is specified, this is the maximum number of consecutive
- NaN values to forward/backward fill. In other words, if there is
- a gap with more than this number of consecutive NaNs, it will only
- be partially filled. If method is not specified, this is the
- maximum number of entries along the entire axis where NaNs will be
- filled.
-
- Returns
- -------
- ExtensionArray
- With NA/NaN filled.
- """
- value, method = validate_fillna_kwargs(value, method)
-
- mask = self.isna()
- # error: Argument 2 to "check_value_size" has incompatible type
- # "ExtensionArray"; expected "ndarray"
- value = missing.check_value_size(
- value, mask, len(self) # type: ignore[arg-type]
- )
-
- if mask.any():
- if method is not None:
- func = missing.get_fill_func(method)
- npvalues = self.astype(object)
- func(npvalues, limit=limit, mask=mask)
- new_values = self._from_sequence(npvalues, dtype=self.dtype)
- else:
- # fill with value
- new_values = self.copy()
- new_values[mask] = value
- else:
- new_values = self.copy()
- return new_values
-
- def dropna(self: ExtensionArrayT) -> ExtensionArrayT:
- """
- Return ExtensionArray without NA values.
-
- Returns
- -------
- pandas.api.extensions.ExtensionArray
- """
- # error: Unsupported operand type for ~ ("ExtensionArray")
- return self[~self.isna()] # type: ignore[operator]
-
- def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray:
- """
- Shift values by desired number.
-
- Newly introduced missing values are filled with
- ``self.dtype.na_value``.
-
- Parameters
- ----------
- periods : int, default 1
- The number of periods to shift. Negative values are allowed
- for shifting backwards.
-
- fill_value : object, optional
- The scalar value to use for newly introduced missing values.
- The default is ``self.dtype.na_value``.
-
- Returns
- -------
- ExtensionArray
- Shifted.
-
- Notes
- -----
- If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is
- returned.
-
- If ``periods > len(self)``, then an array of size
- len(self) is returned, with all values filled with
- ``self.dtype.na_value``.
- """
- # Note: this implementation assumes that `self.dtype.na_value` can be
- # stored in an instance of your ExtensionArray with `self.dtype`.
- if not len(self) or periods == 0:
- return self.copy()
-
- if isna(fill_value):
- fill_value = self.dtype.na_value
-
- empty = self._from_sequence(
- [fill_value] * min(abs(periods), len(self)), dtype=self.dtype
- )
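- # e.g. shifting [1, 2, 3, 4] by periods=2 yields a=[NA, NA], b=[1, 2]
- # and concatenates to [NA, NA, 1, 2]; periods=-2 yields [3, 4, NA, NA].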
- if periods > 0:
- a = empty
- b = self[:-periods]
- else:
- a = self[abs(periods) :]
- b = empty
- return self._concat_same_type([a, b])
-
- def unique(self: ExtensionArrayT) -> ExtensionArrayT:
- """
- Compute the ExtensionArray of unique values.
-
- Returns
- -------
- pandas.api.extensions.ExtensionArray
- """
- uniques = unique(self.astype(object))
- return self._from_sequence(uniques, dtype=self.dtype)
-
- def searchsorted(
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- """
- Find indices where elements should be inserted to maintain order.
-
- Find the indices into a sorted array `self` (a) such that, if the
- corresponding elements in `value` were inserted before the indices,
- the order of `self` would be preserved.
-
- Assuming that `self` is sorted:
-
- ====== ================================
- `side` returned index `i` satisfies
- ====== ================================
- left ``self[i-1] < value <= self[i]``
- right ``self[i-1] <= value < self[i]``
- ====== ================================
-
- Parameters
- ----------
- value : array-like, list or scalar
- Value(s) to insert into `self`.
- side : {'left', 'right'}, optional
- If 'left', the index of the first suitable location found is given.
- If 'right', return the last such index. If there is no suitable
- index, return either 0 or N (where N is the length of `self`).
- sorter : 1-D array-like, optional
- Optional array of integer indices that sort array a into ascending
- order. They are typically the result of argsort.
-
- Returns
- -------
- array of ints or int
- If value is array-like, array of insertion points.
- If value is scalar, a single integer.
-
- See Also
- --------
- numpy.searchsorted : Similar method from NumPy.
- """
- # Note: the base tests provided by pandas only test the basics.
- # We do not test
- # 1. Values outside the range of the `data_for_sorting` fixture
- # 2. Values between the values in the `data_for_sorting` fixture
- # 3. Missing values.
- arr = self.astype(object)
- if isinstance(value, ExtensionArray):
- value = value.astype(object)
- return arr.searchsorted(value, side=side, sorter=sorter)
-
- def equals(self, other: object) -> bool:
- """
- Return if another array is equivalent to this array.
-
- Equivalent means that both arrays have the same shape and dtype, and
- all values compare equal. Missing values in the same location are
- considered equal (in contrast with normal equality).
-
- Parameters
- ----------
- other : ExtensionArray
- Array to compare to this Array.
-
- Returns
- -------
- boolean
- Whether the arrays are equivalent.
- """
- if type(self) != type(other):
- return False
- other = cast(ExtensionArray, other)
- if not is_dtype_equal(self.dtype, other.dtype):
- return False
- elif len(self) != len(other):
- return False
- else:
- equal_values = self == other
- if isinstance(equal_values, ExtensionArray):
- # boolean array with NA -> fill with False
- equal_values = equal_values.fillna(False)
- # error: Unsupported left operand type for & ("ExtensionArray")
- equal_na = self.isna() & other.isna() # type: ignore[operator]
- return bool((equal_values | equal_na).all())
-
- def isin(self, values) -> npt.NDArray[np.bool_]:
- """
- Pointwise comparison for set containment in the given values.
-
- Roughly equivalent to `np.array([x in values for x in self])`
-
- Parameters
- ----------
- values : Sequence
-
- Returns
- -------
- np.ndarray[bool]
- """
- return isin(np.asarray(self), values)
-
- def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
- """
- Return an array and missing value suitable for factorization.
-
- Returns
- -------
- values : ndarray
-
- An array suitable for factorization. This should maintain order
- and be a supported dtype (Float64, Int64, UInt64, String, Object).
- By default, the extension array is cast to object dtype.
- na_value : object
- The value in `values` to consider missing. This will be treated
- as NA in the factorization routines, so it will be coded as
- `-1` and not included in `uniques`. By default,
- ``np.nan`` is used.
-
- Notes
- -----
- The values returned by this method are also used in
- :func:`pandas.util.hash_pandas_object`.
- """
- return self.astype(object), np.nan
-
- def factorize(
- self,
- use_na_sentinel: bool = True,
- ) -> tuple[np.ndarray, ExtensionArray]:
- """
- Encode the extension array as an enumerated type.
-
- Parameters
- ----------
- use_na_sentinel : bool, default True
- If True, the sentinel -1 will be used for NaN values. If False,
- NaN values will be encoded as non-negative integers and will not drop the
- NaN from the uniques of the values.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- codes : ndarray
- An integer NumPy array that's an indexer into the original
- ExtensionArray.
- uniques : ExtensionArray
- An ExtensionArray containing the unique values of `self`.
-
- .. note::
-
- uniques will *not* contain an entry for the NA value of
- the ExtensionArray if there are any missing values present
- in `self`.
-
- See Also
- --------
- factorize : Top-level factorize method that dispatches here.
-
- Notes
- -----
- :meth:`pandas.factorize` offers a `sort` keyword as well.
- """
- # Implementer note: There are two ways to override the behavior of
- # pandas.factorize
- # 1. _values_for_factorize and _from_factorize.
- # Specify the values passed to pandas' internal factorization
- # routines, and how to convert from those values back to the
- # original ExtensionArray.
- # 2. ExtensionArray.factorize.
- # Complete control over factorization.
- arr, na_value = self._values_for_factorize()
-
- codes, uniques = factorize_array(
- arr, use_na_sentinel=use_na_sentinel, na_value=na_value
- )
-
- uniques_ea = self._from_factorized(uniques, self)
- return codes, uniques_ea
-
- _extension_array_shared_docs[
- "repeat"
- ] = """
- Repeat elements of a %(klass)s.
-
- Returns a new %(klass)s where each element of the current %(klass)s
- is repeated consecutively a given number of times.
-
- Parameters
- ----------
- repeats : int or array of ints
- The number of repetitions for each element. This should be a
- non-negative integer. Repeating 0 times will return an empty
- %(klass)s.
- axis : None
- Must be ``None``. Has no effect but is accepted for compatibility
- with numpy.
-
- Returns
- -------
- %(klass)s
- Newly created %(klass)s with repeated elements.
-
- See Also
- --------
- Series.repeat : Equivalent function for Series.
- Index.repeat : Equivalent function for Index.
- numpy.repeat : Similar method for :class:`numpy.ndarray`.
- ExtensionArray.take : Take arbitrary positions.
-
- Examples
- --------
- >>> cat = pd.Categorical(['a', 'b', 'c'])
- >>> cat
- ['a', 'b', 'c']
- Categories (3, object): ['a', 'b', 'c']
- >>> cat.repeat(2)
- ['a', 'a', 'b', 'b', 'c', 'c']
- Categories (3, object): ['a', 'b', 'c']
- >>> cat.repeat([1, 2, 3])
- ['a', 'b', 'b', 'c', 'c', 'c']
- Categories (3, object): ['a', 'b', 'c']
- """
-
- @Substitution(klass="ExtensionArray")
- @Appender(_extension_array_shared_docs["repeat"])
- def repeat(
- self: ExtensionArrayT, repeats: int | Sequence[int], axis: AxisInt | None = None
- ) -> ExtensionArrayT:
- nv.validate_repeat((), {"axis": axis})
- ind = np.arange(len(self)).repeat(repeats)
- return self.take(ind)
-
- # ------------------------------------------------------------------------
- # Indexing methods
- # ------------------------------------------------------------------------
-
- def take(
- self: ExtensionArrayT,
- indices: TakeIndexer,
- *,
- allow_fill: bool = False,
- fill_value: Any = None,
- ) -> ExtensionArrayT:
- """
- Take elements from an array.
-
- Parameters
- ----------
- indices : sequence of int or one-dimensional np.ndarray of int
- Indices to be taken.
- allow_fill : bool, default False
- How to handle negative values in `indices`.
-
- * False: negative values in `indices` indicate positional indices
- from the right (the default). This is similar to
- :func:`numpy.take`.
-
- * True: negative values in `indices` indicate
- missing values. These values are set to `fill_value`. Any other
- negative values raise a ``ValueError``.
-
- fill_value : any, optional
- Fill value to use for NA-indices when `allow_fill` is True.
- This may be ``None``, in which case the default NA value for
- the type, ``self.dtype.na_value``, is used.
-
- For many ExtensionArrays, there will be two representations of
- `fill_value`: a user-facing "boxed" scalar, and a low-level
- physical NA value. `fill_value` should be the user-facing version,
- and the implementation should handle translating that to the
- physical version for processing the take if necessary.
-
- Returns
- -------
- ExtensionArray
-
- Raises
- ------
- IndexError
- When the indices are out of bounds for the array.
- ValueError
- When `indices` contains negative values other than ``-1``
- and `allow_fill` is True.
-
- See Also
- --------
- numpy.take : Take elements from an array along an axis.
- api.extensions.take : Take elements from an array.
-
- Notes
- -----
- ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
- ``iloc``, when `indices` is a sequence of values. Additionally,
- it's called by :meth:`Series.reindex`, or any other method
- that causes realignment, with a `fill_value`.
-
- Examples
- --------
- Here's an example implementation, which relies on casting the
- extension array to object dtype. This uses the helper method
- :func:`pandas.api.extensions.take`.
-
- .. code-block:: python
-
- def take(self, indices, allow_fill=False, fill_value=None):
- from pandas.core.algorithms import take
-
- # If the ExtensionArray is backed by an ndarray, then
- # just pass that here instead of coercing to object.
- data = self.astype(object)
-
- if allow_fill and fill_value is None:
- fill_value = self.dtype.na_value
-
- # fill value should always be translated from the scalar
- # type for the array, to the physical storage type for
- # the data, before passing to take.
-
- result = take(data, indices, fill_value=fill_value,
- allow_fill=allow_fill)
- return self._from_sequence(result, dtype=self.dtype)
- """
- # Implementer note: The `fill_value` parameter should be a user-facing
- # value, an instance of self.dtype.type. When passed `fill_value=None`,
- # the default of `self.dtype.na_value` should be used.
- # This may differ from the physical storage type your ExtensionArray
- # uses. In this case, your implementation is responsible for casting
- # the user-facing type to the storage type, before using
- # pandas.api.extensions.take
- raise AbstractMethodError(self)
-
- def copy(self: ExtensionArrayT) -> ExtensionArrayT:
- """
- Return a copy of the array.
-
- Returns
- -------
- ExtensionArray
- """
- raise AbstractMethodError(self)
-
- def view(self, dtype: Dtype | None = None) -> ArrayLike:
- """
- Return a view on the array.
-
- Parameters
- ----------
- dtype : str, np.dtype, or ExtensionDtype, optional
- Default None.
-
- Returns
- -------
- ExtensionArray or np.ndarray
- A view on the :class:`ExtensionArray`'s data.
- """
- # NB:
- # - This must return a *new* object referencing the same data, not self.
- # - The only case that *must* be implemented is with dtype=None,
- # giving a view with the same dtype as self.
- if dtype is not None:
- raise NotImplementedError(dtype)
- return self[:]
-
- # ------------------------------------------------------------------------
- # Printing
- # ------------------------------------------------------------------------
-
- def __repr__(self) -> str:
- if self.ndim > 1:
- return self._repr_2d()
-
- from pandas.io.formats.printing import format_object_summary
-
- # the short repr has no trailing newline, while the truncated
- # repr does. So we include a newline in our template, and strip
- # any trailing newlines from format_object_summary
- data = format_object_summary(
- self, self._formatter(), indent_for_name=False
- ).rstrip(", \n")
- class_name = f"<{type(self).__name__}>\n"
- return f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
-
- def _repr_2d(self) -> str:
- from pandas.io.formats.printing import format_object_summary
-
- # the short repr has no trailing newline, while the truncated
- # repr does. So we include a newline in our template, and strip
- # any trailing newlines from format_object_summary
- lines = [
- format_object_summary(x, self._formatter(), indent_for_name=False).rstrip(
- ", \n"
- )
- for x in self
- ]
- data = ",\n".join(lines)
- class_name = f"<{type(self).__name__}>"
- return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}"
-
- def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
- """
- Formatting function for scalar values.
-
- This is used in the default '__repr__'. The returned formatting
- function receives instances of your scalar type.
-
- Parameters
- ----------
- boxed : bool, default False
- An indicator for whether or not your array is being printed
- within a Series, DataFrame, or Index (True), or just by
- itself (False). This may be useful if you want scalar values
- to appear differently within a Series versus on its own (e.g.
- quoted or not).
-
- Returns
- -------
- Callable[[Any], str]
- A callable that gets instances of the scalar type and
- returns a string. By default, :func:`repr` is used
- when ``boxed=False`` and :func:`str` is used when
- ``boxed=True``.
- """
- if boxed:
- return str
- return repr
-
- # ------------------------------------------------------------------------
- # Reshaping
- # ------------------------------------------------------------------------
-
- def transpose(self, *axes: int) -> ExtensionArray:
- """
- Return a transposed view on this array.
-
- Because ExtensionArrays are always 1D, this is a no-op. It is included
- for compatibility with np.ndarray.
- """
- return self[:]
-
- @property
- def T(self) -> ExtensionArray:
- return self.transpose()
-
- def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> ExtensionArray:
- """
- Return a flattened view on this array.
-
- Parameters
- ----------
- order : {None, 'C', 'F', 'A', 'K'}, default 'C'
-
- Returns
- -------
- ExtensionArray
-
- Notes
- -----
- - Because ExtensionArrays are 1D-only, this is a no-op.
- - The "order" argument is ignored, is for compatibility with NumPy.
- """
- return self
-
- @classmethod
- def _concat_same_type(
- cls: type[ExtensionArrayT], to_concat: Sequence[ExtensionArrayT]
- ) -> ExtensionArrayT:
- """
-        Concatenate multiple arrays of this dtype.
-
- Parameters
- ----------
- to_concat : sequence of this type
-
- Returns
- -------
- ExtensionArray
- """
- # Implementer note: this method will only be called with a sequence of
- # ExtensionArrays of this class and with the same dtype as self. This
- # should allow "easy" concatenation (no upcasting needed), and result
- # in a new ExtensionArray of the same dtype.
- # Note: this strict behaviour is only guaranteed starting with pandas 1.1
- raise AbstractMethodError(cls)
-
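_concat_same_type is private, but the contract described in the implementer note is visible through the built-in nullable arrays; a small sketch using the public Int64 dtype:

    import pandas as pd

    a = pd.array([1, 2], dtype="Int64")
    b = pd.array([3, None], dtype="Int64")
    # both inputs already share a dtype, so no upcasting is needed
    combined = type(a)._concat_same_type([a, b])
    print(combined)  # <IntegerArray> [1, 2, 3, <NA>], Length: 4, dtype: Int64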
- # The _can_hold_na attribute is set to True so that pandas internals
- # will use the ExtensionDtype.na_value as the NA value in operations
- # such as take(), reindex(), shift(), etc. In addition, those results
- # will then be of the ExtensionArray subclass rather than an array
- # of objects
- @cache_readonly
- def _can_hold_na(self) -> bool:
- return self.dtype._can_hold_na
-
- def _accumulate(
- self, name: str, *, skipna: bool = True, **kwargs
- ) -> ExtensionArray:
- """
- Return an ExtensionArray performing an accumulation operation.
-
- The underlying data type might change.
-
- Parameters
- ----------
- name : str
-            Name of the function; supported values are:
- - cummin
- - cummax
- - cumsum
- - cumprod
- skipna : bool, default True
- If True, skip NA values.
- **kwargs
- Additional keyword arguments passed to the accumulation function.
- Currently, there is no supported kwarg.
-
- Returns
- -------
- array
-
- Raises
- ------
- NotImplementedError : subclass does not define accumulations
- """
- raise NotImplementedError(f"cannot perform {name} with type {self.dtype}")
-
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
- """
- Return a scalar result of performing the reduction operation.
-
- Parameters
- ----------
- name : str
-            Name of the function; supported values are:
- { any, all, min, max, sum, mean, median, prod,
- std, var, sem, kurt, skew }.
- skipna : bool, default True
- If True, skip NaN values.
- **kwargs
- Additional keyword arguments passed to the reduction function.
- Currently, `ddof` is the only supported kwarg.
-
- Returns
- -------
- scalar
-
- Raises
- ------
- TypeError : subclass does not define reductions
- """
- meth = getattr(self, name, None)
- if meth is None:
- raise TypeError(
- f"'{type(self).__name__}' with dtype {self.dtype} "
- f"does not support reduction '{name}'"
- )
- return meth(skipna=skipna, **kwargs)
-
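For context, Series reductions on extension dtypes are routed through the array's _reduce (subclasses typically override the generic lookup shown above); a small sketch with the nullable Int64 dtype:

    import pandas as pd

    s = pd.Series(pd.array([1, 2, None], dtype="Int64"))
    # Series.sum()/mean() end up in the array's _reduce, which dispatches
    # to a method of the same name on the array when one is defined.
    print(s.sum())   # 3
    print(s.mean())  # 1.5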
- # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
- # Incompatible types in assignment (expression has type "None", base class
- # "object" defined the type as "Callable[[object], int]")
- __hash__: ClassVar[None] # type: ignore[assignment]
-
- # ------------------------------------------------------------------------
- # Non-Optimized Default Methods; in the case of the private methods here,
- # these are not guaranteed to be stable across pandas versions.
-
- def tolist(self) -> list:
- """
- Return a list of the values.
-
- These are each a scalar type, which is a Python scalar
- (for str, int, float) or a pandas scalar
-        (for Timestamp/Timedelta/Interval/Period).
-
- Returns
- -------
- list
- """
- if self.ndim > 1:
- return [x.tolist() for x in self]
- return list(self)
-
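A quick sketch of the behaviour documented above, using a nullable integer array:

    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    # elements come back as Python/pandas scalars; missing entries stay pd.NA
    print(arr.tolist())  # [1, 2, <NA>]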
- def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT:
- indexer = np.delete(np.arange(len(self)), loc)
- return self.take(indexer)
-
- def insert(self: ExtensionArrayT, loc: int, item) -> ExtensionArrayT:
- """
- Insert an item at the given position.
-
- Parameters
- ----------
- loc : int
- item : scalar-like
-
- Returns
- -------
- same type as self
-
- Notes
- -----
- This method should be both type and dtype-preserving. If the item
- cannot be held in an array of this type/dtype, either ValueError or
- TypeError should be raised.
-
- The default implementation relies on _from_sequence to raise on invalid
- items.
- """
- loc = validate_insert_loc(loc, len(self))
-
- item_arr = type(self)._from_sequence([item], dtype=self.dtype)
-
- return type(self)._concat_same_type([self[:loc], item_arr, self[loc:]])
-
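A hedged sketch of the default behaviour: the item is wrapped via _from_sequence and spliced in with _concat_same_type, so the result keeps the original dtype (subclasses may override this with something faster, but the contract is the same):

    import pandas as pd

    arr = pd.array([1, 2], dtype="Int64")
    print(arr.insert(1, 5))  # <IntegerArray> [1, 5, 2], dtype: Int64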
- def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
- """
- Analogue to np.putmask(self, mask, value)
-
- Parameters
- ----------
- mask : np.ndarray[bool]
- value : scalar or listlike
- If listlike, must be arraylike with same length as self.
-
- Returns
- -------
- None
-
- Notes
- -----
- Unlike np.putmask, we do not repeat listlike values with mismatched length.
- 'value' should either be a scalar or an arraylike with the same length
- as self.
- """
- if is_list_like(value):
- val = value[mask]
- else:
- val = value
-
- self[mask] = val
-
- def _where(
- self: ExtensionArrayT, mask: npt.NDArray[np.bool_], value
- ) -> ExtensionArrayT:
- """
- Analogue to np.where(mask, self, value)
-
- Parameters
- ----------
- mask : np.ndarray[bool]
- value : scalar or listlike
-
- Returns
- -------
- same type as self
- """
- result = self.copy()
-
- if is_list_like(value):
- val = value[~mask]
- else:
- val = value
-
- result[~mask] = val
- return result
-
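These two private helpers mirror np.putmask and np.where; the call below is shown only to illustrate the semantics (the masked arrays override them, but the observable result is the same):

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, 3], dtype="Int64")
    mask = np.array([True, False, True])
    # keep values where mask is True, fill the rest with 0
    print(arr._where(mask, 0))  # <IntegerArray> [1, 0, 3]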
- def _fill_mask_inplace(
- self, method: str, limit, mask: npt.NDArray[np.bool_]
- ) -> None:
- """
- Replace values in locations specified by 'mask' using pad or backfill.
-
- See also
- --------
- ExtensionArray.fillna
- """
- func = missing.get_fill_func(method)
- npvalues = self.astype(object)
- # NB: if we don't copy mask here, it may be altered inplace, which
- # would mess up the `self[mask] = ...` below.
- func(npvalues, limit=limit, mask=mask.copy())
- new_values = self._from_sequence(npvalues, dtype=self.dtype)
- self[mask] = new_values[mask]
-
- def _rank(
- self,
- *,
- axis: AxisInt = 0,
- method: str = "average",
- na_option: str = "keep",
- ascending: bool = True,
- pct: bool = False,
- ):
- """
- See Series.rank.__doc__.
- """
- if axis != 0:
- raise NotImplementedError
-
- return rank(
- self,
- axis=axis,
- method=method,
- na_option=na_option,
- ascending=ascending,
- pct=pct,
- )
-
- @classmethod
- def _empty(cls, shape: Shape, dtype: ExtensionDtype):
- """
- Create an ExtensionArray with the given shape and dtype.
-
- See also
- --------
- ExtensionDtype.empty
- ExtensionDtype.empty is the 'official' public version of this API.
- """
- # Implementer note: while ExtensionDtype.empty is the public way to
- # call this method, it is still required to implement this `_empty`
- # method as well (it is called internally in pandas)
- obj = cls._from_sequence([], dtype=dtype)
-
- taker = np.broadcast_to(np.intp(-1), shape)
- result = obj.take(taker, allow_fill=True)
- if not isinstance(result, cls) or dtype != result.dtype:
- raise NotImplementedError(
- f"Default 'empty' implementation is invalid for dtype='{dtype}'"
- )
- return result
-
- def _quantile(
- self: ExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
- ) -> ExtensionArrayT:
- """
- Compute the quantiles of self for each quantile in `qs`.
-
- Parameters
- ----------
- qs : np.ndarray[float64]
- interpolation: str
-
- Returns
- -------
- same type as self
- """
- mask = np.asarray(self.isna())
- arr = np.asarray(self)
- fill_value = np.nan
-
- res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
- return type(self)._from_sequence(res_values)
-
- def _mode(self: ExtensionArrayT, dropna: bool = True) -> ExtensionArrayT:
- """
- Returns the mode(s) of the ExtensionArray.
-
- Always returns `ExtensionArray` even if only one value.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't consider counts of NA values.
-
- Returns
- -------
- same type as self
- Sorted, if possible.
- """
- # error: Incompatible return value type (got "Union[ExtensionArray,
- # ndarray[Any, Any]]", expected "ExtensionArrayT")
- return mode(self, dropna=dropna) # type: ignore[return-value]
-
- def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- if any(
- isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)) for other in inputs
- ):
- return NotImplemented
-
- result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- if "out" in kwargs:
- return arraylike.dispatch_ufunc_with_out(
- self, ufunc, method, *inputs, **kwargs
- )
-
- if method == "reduce":
- result = arraylike.dispatch_reduction_ufunc(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs)
-
-
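For orientation, this is the protocol that lets NumPy ufuncs return extension results; the nullable arrays follow the same dispatch pattern (dunder ops first, then the out/reduce paths, then the generic fallback):

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    # np.add is intercepted by __array_ufunc__ and routed to __add__/__radd__
    print(np.add(arr, 1))  # <IntegerArray> [2, 3, <NA>]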
-class ExtensionArraySupportsAnyAll(ExtensionArray):
- def any(self, *, skipna: bool = True) -> bool:
- raise AbstractMethodError(self)
-
- def all(self, *, skipna: bool = True) -> bool:
- raise AbstractMethodError(self)
-
-
-class ExtensionOpsMixin:
- """
- A base class for linking the operators to their dunder names.
-
- .. note::
-
- You may want to set ``__array_priority__`` if you want your
- implementation to be called when involved in binary operations
- with NumPy arrays.
- """
-
- @classmethod
- def _create_arithmetic_method(cls, op):
- raise AbstractMethodError(cls)
-
- @classmethod
- def _add_arithmetic_ops(cls) -> None:
- setattr(cls, "__add__", cls._create_arithmetic_method(operator.add))
- setattr(cls, "__radd__", cls._create_arithmetic_method(roperator.radd))
- setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub))
- setattr(cls, "__rsub__", cls._create_arithmetic_method(roperator.rsub))
- setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul))
- setattr(cls, "__rmul__", cls._create_arithmetic_method(roperator.rmul))
- setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow))
- setattr(cls, "__rpow__", cls._create_arithmetic_method(roperator.rpow))
- setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod))
- setattr(cls, "__rmod__", cls._create_arithmetic_method(roperator.rmod))
- setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv))
- setattr(
- cls, "__rfloordiv__", cls._create_arithmetic_method(roperator.rfloordiv)
- )
- setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv))
- setattr(cls, "__rtruediv__", cls._create_arithmetic_method(roperator.rtruediv))
- setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod))
- setattr(cls, "__rdivmod__", cls._create_arithmetic_method(roperator.rdivmod))
-
- @classmethod
- def _create_comparison_method(cls, op):
- raise AbstractMethodError(cls)
-
- @classmethod
- def _add_comparison_ops(cls) -> None:
- setattr(cls, "__eq__", cls._create_comparison_method(operator.eq))
- setattr(cls, "__ne__", cls._create_comparison_method(operator.ne))
- setattr(cls, "__lt__", cls._create_comparison_method(operator.lt))
- setattr(cls, "__gt__", cls._create_comparison_method(operator.gt))
- setattr(cls, "__le__", cls._create_comparison_method(operator.le))
- setattr(cls, "__ge__", cls._create_comparison_method(operator.ge))
-
- @classmethod
- def _create_logical_method(cls, op):
- raise AbstractMethodError(cls)
-
- @classmethod
- def _add_logical_ops(cls) -> None:
- setattr(cls, "__and__", cls._create_logical_method(operator.and_))
- setattr(cls, "__rand__", cls._create_logical_method(roperator.rand_))
- setattr(cls, "__or__", cls._create_logical_method(operator.or_))
- setattr(cls, "__ror__", cls._create_logical_method(roperator.ror_))
- setattr(cls, "__xor__", cls._create_logical_method(operator.xor))
- setattr(cls, "__rxor__", cls._create_logical_method(roperator.rxor))
-
-
-class ExtensionScalarOpsMixin(ExtensionOpsMixin):
- """
- A mixin for defining ops on an ExtensionArray.
-
- It is assumed that the underlying scalar objects have the operators
- already defined.
-
- Notes
- -----
- If you have defined a subclass MyExtensionArray(ExtensionArray), then
-    declare it as MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin) to
- get the arithmetic operators. After the definition of MyExtensionArray,
- insert the lines
-
- MyExtensionArray._add_arithmetic_ops()
- MyExtensionArray._add_comparison_ops()
-
- to link the operators to your class.
-
- .. note::
-
- You may want to set ``__array_priority__`` if you want your
- implementation to be called when involved in binary operations
- with NumPy arrays.
- """
-
- @classmethod
- def _create_method(cls, op, coerce_to_dtype: bool = True, result_dtype=None):
- """
- A class method that returns a method that will correspond to an
- operator for an ExtensionArray subclass, by dispatching to the
- relevant operator defined on the individual elements of the
- ExtensionArray.
-
- Parameters
- ----------
- op : function
- An operator that takes arguments op(a, b)
- coerce_to_dtype : bool, default True
- boolean indicating whether to attempt to convert
- the result to the underlying ExtensionArray dtype.
- If it's not possible to create a new ExtensionArray with the
- values, an ndarray is returned instead.
-
- Returns
- -------
- Callable[[Any, Any], Union[ndarray, ExtensionArray]]
- A method that can be bound to a class. When used, the method
- receives the two arguments, one of which is the instance of
- this class, and should return an ExtensionArray or an ndarray.
-
- Returning an ndarray may be necessary when the result of the
- `op` cannot be stored in the ExtensionArray. The dtype of the
- ndarray uses NumPy's normal inference rules.
-
- Examples
- --------
- Given an ExtensionArray subclass called MyExtensionArray, use
-
- __add__ = cls._create_method(operator.add)
-
- in the class definition of MyExtensionArray to create the operator
-        for addition, which will be based on the operator implementation
-        of the underlying elements of the ExtensionArray.
- """
-
- def _binop(self, other):
- def convert_values(param):
- if isinstance(param, ExtensionArray) or is_list_like(param):
- ovalues = param
-                else:  # Assume it's an object
- ovalues = [param] * len(self)
- return ovalues
-
- if isinstance(other, (ABCSeries, ABCIndex, ABCDataFrame)):
- # rely on pandas to unbox and dispatch to us
- return NotImplemented
-
- lvalues = self
- rvalues = convert_values(other)
-
- # If the operator is not defined for the underlying objects,
- # a TypeError should be raised
- res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]
-
- def _maybe_convert(arr):
- if coerce_to_dtype:
- # https://github.com/pandas-dev/pandas/issues/22850
- # We catch all regular exceptions here, and fall back
- # to an ndarray.
- res = maybe_cast_to_extension_array(type(self), arr)
- if not isinstance(res, type(self)):
- # exception raised in _from_sequence; ensure we have ndarray
- res = np.asarray(arr)
- else:
- res = np.asarray(arr, dtype=result_dtype)
- return res
-
- if op.__name__ in {"divmod", "rdivmod"}:
- a, b = zip(*res)
- return _maybe_convert(a), _maybe_convert(b)
-
- return _maybe_convert(res)
-
- op_name = f"__{op.__name__}__"
- return set_function_name(_binop, op_name, cls)
-
- @classmethod
- def _create_arithmetic_method(cls, op):
- return cls._create_method(op)
-
- @classmethod
- def _create_comparison_method(cls, op):
- return cls._create_method(op, coerce_to_dtype=False, result_dtype=bool)
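A minimal structural sketch of the pattern described in the mixin's docstring; MyExtensionArray is just the placeholder name from that docstring, and a real subclass must still implement the ExtensionArray interface (_from_sequence, dtype, __len__, __getitem__, isna, take, copy, _concat_same_type), which is omitted here to keep the sketch short:

    from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin

    class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin):
        # interface methods omitted; this class is not usable as-is
        ...

    # Link the scalar-based operator implementations to the dunder names.
    MyExtensionArray._add_arithmetic_ops()
    MyExtensionArray._add_comparison_ops()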
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/boolean.py b/contrib/python/pandas/py3/pandas/core/arrays/boolean.py
deleted file mode 100644
index 2dba557eda1..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/boolean.py
+++ /dev/null
@@ -1,394 +0,0 @@
-from __future__ import annotations
-
-import numbers
-from typing import (
- TYPE_CHECKING,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- missing as libmissing,
-)
-from pandas._typing import (
- Dtype,
- DtypeObj,
- type_t,
-)
-
-from pandas.core.dtypes.common import (
- is_list_like,
- is_numeric_dtype,
-)
-from pandas.core.dtypes.dtypes import register_extension_dtype
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import ops
-from pandas.core.array_algos import masked_accumulations
-from pandas.core.arrays.masked import (
- BaseMaskedArray,
- BaseMaskedDtype,
-)
-
-if TYPE_CHECKING:
- import pyarrow
-
- from pandas._typing import npt
-
-
-@register_extension_dtype
-class BooleanDtype(BaseMaskedDtype):
- """
- Extension dtype for boolean data.
-
- .. warning::
-
- BooleanDtype is considered experimental. The implementation and
- parts of the API may change without warning.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- Examples
- --------
- >>> pd.BooleanDtype()
- BooleanDtype
- """
-
- name = "boolean"
-
- # https://github.com/python/mypy/issues/4125
- # error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
- @property
- def type(self) -> type: # type: ignore[override]
- return np.bool_
-
- @property
- def kind(self) -> str:
- return "b"
-
- @property
- def numpy_dtype(self) -> np.dtype:
- return np.dtype("bool")
-
- @classmethod
- def construct_array_type(cls) -> type_t[BooleanArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- return BooleanArray
-
- def __repr__(self) -> str:
- return "BooleanDtype"
-
- @property
- def _is_boolean(self) -> bool:
- return True
-
- @property
- def _is_numeric(self) -> bool:
- return True
-
- def __from_arrow__(
- self, array: pyarrow.Array | pyarrow.ChunkedArray
- ) -> BooleanArray:
- """
- Construct BooleanArray from pyarrow Array/ChunkedArray.
- """
- import pyarrow
-
- if array.type != pyarrow.bool_():
- raise TypeError(f"Expected array of boolean type, got {array.type} instead")
-
- if isinstance(array, pyarrow.Array):
- chunks = [array]
- else:
- # pyarrow.ChunkedArray
- chunks = array.chunks
-
- results = []
- for arr in chunks:
- buflist = arr.buffers()
- data = pyarrow.BooleanArray.from_buffers(
- arr.type, len(arr), [None, buflist[1]], offset=arr.offset
- ).to_numpy(zero_copy_only=False)
- if arr.null_count != 0:
- mask = pyarrow.BooleanArray.from_buffers(
- arr.type, len(arr), [None, buflist[0]], offset=arr.offset
- ).to_numpy(zero_copy_only=False)
- mask = ~mask
- else:
- mask = np.zeros(len(arr), dtype=bool)
-
- bool_arr = BooleanArray(data, mask)
- results.append(bool_arr)
-
- if not results:
- return BooleanArray(
- np.array([], dtype=np.bool_), np.array([], dtype=np.bool_)
- )
- else:
- return BooleanArray._concat_same_type(results)
-
-
-def coerce_to_array(
- values, mask=None, copy: bool = False
-) -> tuple[np.ndarray, np.ndarray]:
- """
- Coerce the input values array to numpy arrays with a mask.
-
- Parameters
- ----------
- values : 1D list-like
- mask : bool 1D array, optional
- copy : bool, default False
-        If True, copy the input.
-
- Returns
- -------
- tuple of (values, mask)
- """
- if isinstance(values, BooleanArray):
- if mask is not None:
- raise ValueError("cannot pass mask for BooleanArray input")
- values, mask = values._data, values._mask
- if copy:
- values = values.copy()
- mask = mask.copy()
- return values, mask
-
- mask_values = None
- if isinstance(values, np.ndarray) and values.dtype == np.bool_:
- if copy:
- values = values.copy()
- elif isinstance(values, np.ndarray) and is_numeric_dtype(values.dtype):
- mask_values = isna(values)
-
- values_bool = np.zeros(len(values), dtype=bool)
- values_bool[~mask_values] = values[~mask_values].astype(bool)
-
- if not np.all(
- values_bool[~mask_values].astype(values.dtype) == values[~mask_values]
- ):
- raise TypeError("Need to pass bool-like values")
-
- values = values_bool
- else:
- values_object = np.asarray(values, dtype=object)
-
- inferred_dtype = lib.infer_dtype(values_object, skipna=True)
- integer_like = ("floating", "integer", "mixed-integer-float")
- if inferred_dtype not in ("boolean", "empty") + integer_like:
- raise TypeError("Need to pass bool-like values")
-
- # mypy does not narrow the type of mask_values to npt.NDArray[np.bool_]
- # within this branch, it assumes it can also be None
- mask_values = cast("npt.NDArray[np.bool_]", isna(values_object))
- values = np.zeros(len(values), dtype=bool)
- values[~mask_values] = values_object[~mask_values].astype(bool)
-
- # if the values were integer-like, validate it were actually 0/1's
- if (inferred_dtype in integer_like) and not (
- np.all(
- values[~mask_values].astype(float)
- == values_object[~mask_values].astype(float)
- )
- ):
- raise TypeError("Need to pass bool-like values")
-
- if mask is None and mask_values is None:
- mask = np.zeros(values.shape, dtype=bool)
- elif mask is None:
- mask = mask_values
- else:
- if isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
- if mask_values is not None:
- mask = mask | mask_values
- else:
- if copy:
- mask = mask.copy()
- else:
- mask = np.array(mask, dtype=bool)
- if mask_values is not None:
- mask = mask | mask_values
-
- if values.shape != mask.shape:
- raise ValueError("values.shape and mask.shape must match")
-
- return values, mask
-
-
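The coercion rules above are what make integer-like 0/1 input acceptable for the nullable boolean dtype; a small sketch through the public constructor:

    import pandas as pd

    # integer-like values must be strictly 0/1; missing entries go into the mask
    print(pd.array([1, 0, None], dtype="boolean"))
    # <BooleanArray> [True, False, <NA>], Length: 3, dtype: boolean

    # anything else is rejected: pd.array([2, 0], dtype="boolean") raises TypeError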
-class BooleanArray(BaseMaskedArray):
- """
- Array of boolean (True/False) data with missing values.
-
- This is a pandas Extension array for boolean data, under the hood
- represented by 2 numpy arrays: a boolean array with the data and
- a boolean array with the mask (True indicating missing).
-
- BooleanArray implements Kleene logic (sometimes called three-value
- logic) for logical operations. See :ref:`boolean.kleene` for more.
-
-    To construct a BooleanArray from generic array-like input, use
- :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
- below).
-
- .. warning::
-
- BooleanArray is considered experimental. The implementation and
- parts of the API may change without warning.
-
- Parameters
- ----------
- values : numpy.ndarray
- A 1-d boolean-dtype array with the data.
- mask : numpy.ndarray
- A 1-d boolean-dtype array indicating missing values (True
- indicates missing).
- copy : bool, default False
- Whether to copy the `values` and `mask` arrays.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- Returns
- -------
- BooleanArray
-
- Examples
- --------
-    Create a BooleanArray with :func:`pandas.array`:
-
- >>> pd.array([True, False, None], dtype="boolean")
- <BooleanArray>
- [True, False, <NA>]
- Length: 3, dtype: boolean
- """
-
- # The value used to fill '_data' to avoid upcasting
- _internal_fill_value = False
- # Fill values used for any/all
- # Incompatible types in assignment (expression has type "bool", base class
- # "BaseMaskedArray" defined the type as "<typing special form>")
- _truthy_value = True # type: ignore[assignment]
- _falsey_value = False # type: ignore[assignment]
- _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
- _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}
-
- def __init__(
- self, values: np.ndarray, mask: np.ndarray, copy: bool = False
- ) -> None:
- if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
- raise TypeError(
- "values should be boolean numpy array. Use "
- "the 'pd.array' function instead"
- )
- self._dtype = BooleanDtype()
- super().__init__(values, mask, copy=copy)
-
- @property
- def dtype(self) -> BooleanDtype:
- return self._dtype
-
- @classmethod
- def _from_sequence_of_strings(
- cls,
- strings: list[str],
- *,
- dtype: Dtype | None = None,
- copy: bool = False,
- true_values: list[str] | None = None,
- false_values: list[str] | None = None,
- ) -> BooleanArray:
- true_values_union = cls._TRUE_VALUES.union(true_values or [])
- false_values_union = cls._FALSE_VALUES.union(false_values or [])
-
- def map_string(s) -> bool:
- if s in true_values_union:
- return True
- elif s in false_values_union:
- return False
- else:
- raise ValueError(f"{s} cannot be cast to bool")
-
- scalars = np.array(strings, dtype=object)
- mask = isna(scalars)
- scalars[~mask] = list(map(map_string, scalars[~mask]))
- return cls._from_sequence(scalars, dtype=dtype, copy=copy)
-
- _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
-
- @classmethod
- def _coerce_to_array(
- cls, value, *, dtype: DtypeObj, copy: bool = False
- ) -> tuple[np.ndarray, np.ndarray]:
- if dtype:
- assert dtype == "boolean"
- return coerce_to_array(value, copy=copy)
-
- def _logical_method(self, other, op):
- assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
- other_is_scalar = lib.is_scalar(other)
- mask = None
-
- if isinstance(other, BooleanArray):
- other, mask = other._data, other._mask
- elif is_list_like(other):
- other = np.asarray(other, dtype="bool")
- if other.ndim > 1:
- raise NotImplementedError("can only perform ops with 1-d structures")
- other, mask = coerce_to_array(other, copy=False)
- elif isinstance(other, np.bool_):
- other = other.item()
-
- if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
- raise TypeError(
- "'other' should be pandas.NA or a bool. "
- f"Got {type(other).__name__} instead."
- )
-
- if not other_is_scalar and len(self) != len(other):
- raise ValueError("Lengths must match")
-
- if op.__name__ in {"or_", "ror_"}:
- result, mask = ops.kleene_or(self._data, other, self._mask, mask)
- elif op.__name__ in {"and_", "rand_"}:
- result, mask = ops.kleene_and(self._data, other, self._mask, mask)
- else:
- # i.e. xor, rxor
- result, mask = ops.kleene_xor(self._data, other, self._mask, mask)
-
- # i.e. BooleanArray
- return self._maybe_mask_result(result, mask)
-
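A short sketch of the Kleene semantics implemented above, where pd.NA propagates only when the result cannot be decided:

    import pandas as pd

    a = pd.array([True, False, None], dtype="boolean")
    print(a | True)   # [True, True, True]   -- True | NA is True
    print(a & pd.NA)  # [<NA>, False, <NA>]  -- False & NA is still False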
- def _accumulate(
- self, name: str, *, skipna: bool = True, **kwargs
- ) -> BaseMaskedArray:
- data = self._data
- mask = self._mask
- if name in ("cummin", "cummax"):
- op = getattr(masked_accumulations, name)
- data, mask = op(data, mask, skipna=skipna, **kwargs)
- return type(self)(data, mask, copy=False)
- else:
- from pandas.core.arrays import IntegerArray
-
- return IntegerArray(data.astype(int), mask)._accumulate(
- name, skipna=skipna, **kwargs
- )
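Observable effect of this dispatch (assuming a pandas version that ships the masked accumulations imported above): cummin/cummax stay boolean, while cumsum/cumprod go through IntegerArray:

    import pandas as pd

    s = pd.Series([True, True, None, True], dtype="boolean")
    print(s.cumsum())  # Int64 values 1, 2, <NA>, 3
    print(s.cummax())  # boolean values True, True, <NA>, True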
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/categorical.py b/contrib/python/pandas/py3/pandas/core/arrays/categorical.py
deleted file mode 100644
index ef76a5301cb..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/categorical.py
+++ /dev/null
@@ -1,2604 +0,0 @@
-from __future__ import annotations
-
-from csv import QUOTE_NONNUMERIC
-from functools import partial
-import operator
-from shutil import get_terminal_size
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Iterator,
- Literal,
- Sequence,
- TypeVar,
- cast,
- overload,
-)
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import (
- NaT,
- algos as libalgos,
- lib,
-)
-from pandas._libs.arrays import NDArrayBacked
-from pandas._typing import (
- ArrayLike,
- AstypeArg,
- AxisInt,
- Dtype,
- NpDtype,
- Ordered,
- Shape,
- SortKind,
- npt,
- type_t,
-)
-from pandas.compat.numpy import function as nv
-from pandas.util._validators import validate_bool_kwarg
-
-from pandas.core.dtypes.cast import (
- coerce_indexer_dtype,
- find_common_type,
-)
-from pandas.core.dtypes.common import (
- ensure_int64,
- ensure_platform_int,
- is_any_real_numeric_dtype,
- is_bool_dtype,
- is_categorical_dtype,
- is_datetime64_dtype,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_hashable,
- is_integer_dtype,
- is_list_like,
- is_scalar,
- is_timedelta64_dtype,
- needs_i8_conversion,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- ExtensionDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
-)
-
-from pandas.core import (
- algorithms,
- arraylike,
- ops,
-)
-from pandas.core.accessor import (
- PandasDelegate,
- delegate_names,
-)
-from pandas.core.algorithms import (
- factorize,
- take_nd,
-)
-from pandas.core.arrays._mixins import (
- NDArrayBackedExtensionArray,
- ravel_compat,
-)
-from pandas.core.base import (
- ExtensionArray,
- NoNewAttributesMixin,
- PandasObject,
-)
-import pandas.core.common as com
-from pandas.core.construction import (
- extract_array,
- sanitize_array,
-)
-from pandas.core.ops.common import unpack_zerodim_and_defer
-from pandas.core.sorting import nargsort
-from pandas.core.strings.object_array import ObjectStringArrayMixin
-
-from pandas.io.formats import console
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
-
-
-CategoricalT = TypeVar("CategoricalT", bound="Categorical")
-
-
-def _cat_compare_op(op):
- opname = f"__{op.__name__}__"
- fill_value = op is operator.ne
-
- @unpack_zerodim_and_defer(opname)
- def func(self, other):
- hashable = is_hashable(other)
- if is_list_like(other) and len(other) != len(self) and not hashable:
- # in hashable case we may have a tuple that is itself a category
- raise ValueError("Lengths must match.")
-
- if not self.ordered:
- if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
- raise TypeError(
- "Unordered Categoricals can only compare equality or not"
- )
- if isinstance(other, Categorical):
- # Two Categoricals can only be compared if the categories are
- # the same (maybe up to ordering, depending on ordered)
-
- msg = "Categoricals can only be compared if 'categories' are the same."
- if not self._categories_match_up_to_permutation(other):
- raise TypeError(msg)
-
- if not self.ordered and not self.categories.equals(other.categories):
- # both unordered and different order
- other_codes = recode_for_categories(
- other.codes, other.categories, self.categories, copy=False
- )
- else:
- other_codes = other._codes
-
- ret = op(self._codes, other_codes)
- mask = (self._codes == -1) | (other_codes == -1)
- if mask.any():
- ret[mask] = fill_value
- return ret
-
- if hashable:
- if other in self.categories:
- i = self._unbox_scalar(other)
- ret = op(self._codes, i)
-
- if opname not in {"__eq__", "__ge__", "__gt__"}:
- # GH#29820 performance trick; get_loc will always give i>=0,
- # so in the cases (__ne__, __le__, __lt__) the setting
- # here is a no-op, so can be skipped.
- mask = self._codes == -1
- ret[mask] = fill_value
- return ret
- else:
- return ops.invalid_comparison(self, other, op)
- else:
- # allow categorical vs object dtype array comparisons for equality
- # these are only positional comparisons
- if opname not in ["__eq__", "__ne__"]:
- raise TypeError(
- f"Cannot compare a Categorical for op {opname} with "
- f"type {type(other)}.\nIf you want to compare values, "
- "use 'np.asarray(cat) <op> other'."
- )
-
- if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
- # We would return NotImplemented here, but that messes up
- # ExtensionIndex's wrapped methods
- return op(other, self)
- return getattr(np.array(self), opname)(np.array(other))
-
- func.__name__ = opname
-
- return func
-
-
-def contains(cat, key, container) -> bool:
- """
- Helper for membership check for ``key`` in ``cat``.
-
-    This is a helper method for :meth:`Categorical.__contains__`
-    and :meth:`CategoricalIndex.__contains__`.
-
- Returns True if ``key`` is in ``cat.categories`` and the
- location of ``key`` in ``categories`` is in ``container``.
-
- Parameters
- ----------
-    cat : :class:`Categorical` or :class:`CategoricalIndex`
- key : a hashable object
- The key to check membership for.
- container : Container (e.g. list-like or mapping)
- The container to check for membership in.
-
- Returns
- -------
- is_in : bool
- True if ``key`` is in ``self.categories`` and location of
- ``key`` in ``categories`` is in ``container``, else False.
-
- Notes
- -----
- This method does not check for NaN values. Do that separately
- before calling this method.
- """
- hash(key)
-
- # get location of key in categories.
- # If a KeyError, the key isn't in categories, so logically
- # can't be in container either.
- try:
- loc = cat.categories.get_loc(key)
- except (KeyError, TypeError):
- return False
-
- # loc is the location of key in categories, but also the *value*
- # for key in container. So, `key` may be in categories,
- # but still not in `container`. Example ('b' in categories,
- # but not in values):
- # 'b' in Categorical(['a'], categories=['a', 'b']) # False
- if is_scalar(loc):
- return loc in container
- else:
- # if categories is an IntervalIndex, loc is an array.
- return any(loc_ in container for loc_ in loc)
-
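The comment above is the whole story for membership tests: __contains__ checks the values, not the category set. A quick sketch:

    import pandas as pd

    c = pd.Categorical(['a'], categories=['a', 'b'])
    print('a' in c)  # True
    print('b' in c)  # False -- 'b' is a category but never occurs in the values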
-
-class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):
- """
- Represent a categorical variable in classic R / S-plus fashion.
-
- `Categoricals` can only take on a limited, and usually fixed, number
- of possible values (`categories`). In contrast to statistical categorical
- variables, a `Categorical` might have an order, but numerical operations
- (additions, divisions, ...) are not possible.
-
- All values of the `Categorical` are either in `categories` or `np.nan`.
- Assigning values outside of `categories` will raise a `ValueError`. Order
- is defined by the order of the `categories`, not lexical order of the
- values.
-
- Parameters
- ----------
- values : list-like
- The values of the categorical. If categories are given, values not in
- categories will be replaced with NaN.
- categories : Index-like (unique), optional
- The unique categories for this categorical. If not given, the
- categories are assumed to be the unique values of `values` (sorted, if
- possible, otherwise in the order in which they appear).
- ordered : bool, default False
-        Whether or not this categorical is treated as an ordered categorical.
- If True, the resulting categorical will be ordered.
- An ordered categorical respects, when sorted, the order of its
- `categories` attribute (which in turn is the `categories` argument, if
- provided).
- dtype : CategoricalDtype
- An instance of ``CategoricalDtype`` to use for this categorical.
-
- Attributes
- ----------
- categories : Index
- The categories of this categorical
- codes : ndarray
- The codes (integer positions, which point to the categories) of this
- categorical, read only.
- ordered : bool
- Whether or not this Categorical is ordered.
- dtype : CategoricalDtype
- The instance of ``CategoricalDtype`` storing the ``categories``
- and ``ordered``.
-
- Methods
- -------
- from_codes
- __array__
-
- Raises
- ------
- ValueError
- If the categories do not validate.
- TypeError
- If an explicit ``ordered=True`` is given but no `categories` and the
- `values` are not sortable.
-
- See Also
- --------
- CategoricalDtype : Type for categorical data.
- CategoricalIndex : An Index with an underlying ``Categorical``.
-
- Notes
- -----
- See the `user guide
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__
- for more.
-
- Examples
- --------
- >>> pd.Categorical([1, 2, 3, 1, 2, 3])
- [1, 2, 3, 1, 2, 3]
- Categories (3, int64): [1, 2, 3]
-
- >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
- ['a', 'b', 'c', 'a', 'b', 'c']
- Categories (3, object): ['a', 'b', 'c']
-
- Missing values are not included as a category.
-
- >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])
- >>> c
- [1, 2, 3, 1, 2, 3, NaN]
- Categories (3, int64): [1, 2, 3]
-
- However, their presence is indicated in the `codes` attribute
- by code `-1`.
-
- >>> c.codes
- array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)
-
- Ordered `Categoricals` can be sorted according to the custom order
- of the categories and can have a min and max value.
-
- >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
- ... categories=['c', 'b', 'a'])
- >>> c
- ['a', 'b', 'c', 'a', 'b', 'c']
- Categories (3, object): ['c' < 'b' < 'a']
- >>> c.min()
- 'c'
- """
-
- # For comparisons, so that numpy uses our implementation if the compare
-    # For comparisons, so that numpy uses our implementation of the compare
-    # ops, which raise
- # tolist is not actually deprecated, just suppressed in the __dir__
- _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
- _typ = "categorical"
-
- _dtype: CategoricalDtype
-
- def __init__(
- self,
- values,
- categories=None,
- ordered=None,
- dtype: Dtype | None = None,
- fastpath: bool = False,
- copy: bool = True,
- ) -> None:
- dtype = CategoricalDtype._from_values_or_dtype(
- values, categories, ordered, dtype
- )
- # At this point, dtype is always a CategoricalDtype, but
- # we may have dtype.categories be None, and we need to
- # infer categories in a factorization step further below
-
- if fastpath:
- codes = coerce_indexer_dtype(values, dtype.categories)
- dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
- super().__init__(codes, dtype)
- return
-
- if not is_list_like(values):
- # GH#38433
- raise TypeError("Categorical input must be list-like")
-
- # null_mask indicates missing values we want to exclude from inference.
- # This means: only missing values in list-likes (not arrays/ndframes).
- null_mask = np.array(False)
-
- # sanitize input
- if is_categorical_dtype(values):
- if dtype.categories is None:
- dtype = CategoricalDtype(values.categories, dtype.ordered)
- elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
- values = com.convert_to_list_like(values)
- if isinstance(values, list) and len(values) == 0:
- # By convention, empty lists result in object dtype:
- values = np.array([], dtype=object)
- elif isinstance(values, np.ndarray):
- if values.ndim > 1:
- # preempt sanitize_array from raising ValueError
- raise NotImplementedError(
- "> 1 ndim Categorical are not supported at this time"
- )
- values = sanitize_array(values, None)
- else:
- # i.e. must be a list
- arr = sanitize_array(values, None)
- null_mask = isna(arr)
- if null_mask.any():
- # We remove null values here, then below will re-insert
- # them, grep "full_codes"
- arr_list = [values[idx] for idx in np.where(~null_mask)[0]]
-
- # GH#44900 Do not cast to float if we have only missing values
- if arr_list or arr.dtype == "object":
- sanitize_dtype = None
- else:
- sanitize_dtype = arr.dtype
-
- arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
- values = arr
-
- if dtype.categories is None:
- try:
- codes, categories = factorize(values, sort=True)
- except TypeError as err:
- codes, categories = factorize(values, sort=False)
- if dtype.ordered:
- # raise, as we don't have a sortable data structure and so
- # the user should give us one by specifying categories
- raise TypeError(
- "'values' is not ordered, please "
- "explicitly specify the categories order "
- "by passing in a categories argument."
- ) from err
-
- # we're inferring from values
- dtype = CategoricalDtype(categories, dtype.ordered)
-
- elif is_categorical_dtype(values.dtype):
- old_codes = extract_array(values)._codes
- codes = recode_for_categories(
- old_codes, values.dtype.categories, dtype.categories, copy=copy
- )
-
- else:
- codes = _get_codes_for_values(values, dtype.categories)
-
- if null_mask.any():
- # Reinsert -1 placeholders for previously removed missing values
- full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
- full_codes[~null_mask] = codes
- codes = full_codes
-
- dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
- arr = coerce_indexer_dtype(codes, dtype.categories)
- super().__init__(arr, dtype)
-
- @property
- def dtype(self) -> CategoricalDtype:
- """
- The :class:`~pandas.api.types.CategoricalDtype` for this instance.
- """
- return self._dtype
-
- @property
- def _internal_fill_value(self) -> int:
- # using the specific numpy integer instead of python int to get
- # the correct dtype back from _quantile in the all-NA case
- dtype = self._ndarray.dtype
- return dtype.type(-1)
-
- @classmethod
- def _from_sequence(
- cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
- ) -> Categorical:
- return Categorical(scalars, dtype=dtype, copy=copy)
-
- @overload
- def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
- ...
-
- @overload
- def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
- ...
-
- @overload
- def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
- ...
-
- def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
- """
- Coerce this type to another dtype
-
- Parameters
- ----------
- dtype : numpy dtype or pandas type
- copy : bool, default True
- By default, astype always returns a newly allocated object.
- If copy is set to False and dtype is categorical, the original
- object is returned.
- """
- dtype = pandas_dtype(dtype)
- if self.dtype is dtype:
- result = self.copy() if copy else self
-
- elif is_categorical_dtype(dtype):
- dtype = cast(CategoricalDtype, dtype)
-
- # GH 10696/18593/18630
- dtype = self.dtype.update_dtype(dtype)
- self = self.copy() if copy else self
- result = self._set_dtype(dtype)
-
- elif isinstance(dtype, ExtensionDtype):
- return super().astype(dtype, copy=copy)
-
- elif is_integer_dtype(dtype) and self.isna().any():
- raise ValueError("Cannot convert float NaN to integer")
-
- elif len(self.codes) == 0 or len(self.categories) == 0:
- result = np.array(
- self,
- dtype=dtype,
- copy=copy,
- )
-
- else:
- # GH8628 (PERF): astype category codes instead of astyping array
- new_cats = self.categories._values
-
- try:
- new_cats = new_cats.astype(dtype=dtype, copy=copy)
- fill_value = self.categories._na_value
- if not is_valid_na_for_dtype(fill_value, dtype):
- fill_value = lib.item_from_zerodim(
- np.array(self.categories._na_value).astype(dtype)
- )
- except (
- TypeError, # downstream error msg for CategoricalIndex is misleading
- ValueError,
- ):
- msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
- raise ValueError(msg)
-
- result = take_nd(
- new_cats, ensure_platform_int(self._codes), fill_value=fill_value
- )
-
- return result
-
- def to_list(self):
- """
- Alias for tolist.
- """
- return self.tolist()
-
- @classmethod
- def _from_inferred_categories(
- cls, inferred_categories, inferred_codes, dtype, true_values=None
- ):
- """
- Construct a Categorical from inferred values.
-
- For inferred categories (`dtype` is None) the categories are sorted.
- For explicit `dtype`, the `inferred_categories` are cast to the
- appropriate type.
-
- Parameters
- ----------
- inferred_categories : Index
- inferred_codes : Index
- dtype : CategoricalDtype or 'category'
- true_values : list, optional
- If none are provided, the default ones are
- "True", "TRUE", and "true."
-
- Returns
- -------
- Categorical
- """
- from pandas import (
- Index,
- to_datetime,
- to_numeric,
- to_timedelta,
- )
-
- cats = Index(inferred_categories)
- known_categories = (
- isinstance(dtype, CategoricalDtype) and dtype.categories is not None
- )
-
- if known_categories:
- # Convert to a specialized type with `dtype` if specified.
- if is_any_real_numeric_dtype(dtype.categories):
- cats = to_numeric(inferred_categories, errors="coerce")
- elif is_datetime64_dtype(dtype.categories):
- cats = to_datetime(inferred_categories, errors="coerce")
- elif is_timedelta64_dtype(dtype.categories):
- cats = to_timedelta(inferred_categories, errors="coerce")
- elif is_bool_dtype(dtype.categories):
- if true_values is None:
- true_values = ["True", "TRUE", "true"]
-
- # error: Incompatible types in assignment (expression has type
- # "ndarray", variable has type "Index")
- cats = cats.isin(true_values) # type: ignore[assignment]
-
- if known_categories:
- # Recode from observation order to dtype.categories order.
- categories = dtype.categories
- codes = recode_for_categories(inferred_codes, cats, categories)
- elif not cats.is_monotonic_increasing:
- # Sort categories and recode for unknown categories.
- unsorted = cats.copy()
- categories = cats.sort_values()
-
- codes = recode_for_categories(inferred_codes, unsorted, categories)
- dtype = CategoricalDtype(categories, ordered=False)
- else:
- dtype = CategoricalDtype(cats, ordered=False)
- codes = inferred_codes
-
- return cls(codes, dtype=dtype, fastpath=True)
-
- @classmethod
- def from_codes(
- cls, codes, categories=None, ordered=None, dtype: Dtype | None = None
- ) -> Categorical:
- """
- Make a Categorical type from codes and categories or dtype.
-
- This constructor is useful if you already have codes and
-        categories/dtype and so do not need the (computationally intensive)
-        factorization step, which is usually done in the constructor.
-
- If your data does not follow this convention, please use the normal
- constructor.
-
- Parameters
- ----------
- codes : array-like of int
- An integer array, where each integer points to a category in
- categories or dtype.categories, or else is -1 for NaN.
- categories : index-like, optional
- The categories for the categorical. Items need to be unique.
- If the categories are not given here, then they must be provided
- in `dtype`.
- ordered : bool, optional
- Whether or not this categorical is treated as an ordered
- categorical. If not given here or in `dtype`, the resulting
- categorical will be unordered.
- dtype : CategoricalDtype or "category", optional
- If :class:`CategoricalDtype`, cannot be used together with
- `categories` or `ordered`.
-
- Returns
- -------
- Categorical
-
- Examples
- --------
- >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
- >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
- ['a', 'b', 'a', 'b']
- Categories (2, object): ['a' < 'b']
- """
- dtype = CategoricalDtype._from_values_or_dtype(
- categories=categories, ordered=ordered, dtype=dtype
- )
- if dtype.categories is None:
- msg = (
- "The categories must be provided in 'categories' or "
- "'dtype'. Both were None."
- )
- raise ValueError(msg)
-
- if is_extension_array_dtype(codes) and is_integer_dtype(codes):
- # Avoid the implicit conversion of Int to object
- if isna(codes).any():
- raise ValueError("codes cannot contain NA values")
- codes = codes.to_numpy(dtype=np.int64)
- else:
- codes = np.asarray(codes)
- if len(codes) and not is_integer_dtype(codes):
- raise ValueError("codes need to be array-like integers")
-
- if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
- raise ValueError("codes need to be between -1 and len(categories)-1")
-
- return cls(codes, dtype=dtype, fastpath=True)
-
- # ------------------------------------------------------------------
- # Categories/Codes/Ordered
-
- @property
- def categories(self) -> Index:
- """
- The categories of this categorical.
-
- Setting assigns new values to each category (effectively a rename of
- each individual category).
-
- The assigned value has to be a list-like object. All items must be
- unique and the number of items in the new categories must be the same
- as the number of items in the old categories.
-
- Raises
- ------
- ValueError
- If the new categories do not validate as categories or if the
-            number of new categories is not equal to the number of old categories
-
- See Also
- --------
- rename_categories : Rename categories.
- reorder_categories : Reorder categories.
- add_categories : Add new categories.
- remove_categories : Remove the specified categories.
- remove_unused_categories : Remove categories which are not used.
- set_categories : Set the categories to the specified ones.
- """
- return self.dtype.categories
-
- @property
- def ordered(self) -> Ordered:
- """
- Whether the categories have an ordered relationship.
- """
- return self.dtype.ordered
-
- @property
- def codes(self) -> np.ndarray:
- """
- The category codes of this categorical.
-
- Codes are an array of integers which are the positions of the actual
- values in the categories array.
-
-        There is no setter; use the other categorical methods and the normal item
- setter to change values in the categorical.
-
- Returns
- -------
- ndarray[int]
- A non-writable view of the `codes` array.
- """
- v = self._codes.view()
- v.flags.writeable = False
- return v
-
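A small sketch of the codes layout, including the -1 sentinel for missing values and the read-only view:

    import numpy as np
    import pandas as pd

    c = pd.Categorical(['a', 'b', 'a', np.nan])
    print(c.codes)    # array([ 0,  1,  0, -1], dtype=int8)
    # c.codes[0] = 1  # raises ValueError: the returned view is read-only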
- def _set_categories(self, categories, fastpath: bool = False) -> None:
- """
-        Set new categories in place.
-
- Parameters
- ----------
- fastpath : bool, default False
- Don't perform validation of the categories for uniqueness or nulls
-
- Examples
- --------
- >>> c = pd.Categorical(['a', 'b'])
- >>> c
- ['a', 'b']
- Categories (2, object): ['a', 'b']
-
- >>> c._set_categories(pd.Index(['a', 'c']))
- >>> c
- ['a', 'c']
- Categories (2, object): ['a', 'c']
- """
- if fastpath:
- new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
- else:
- new_dtype = CategoricalDtype(categories, ordered=self.ordered)
- if (
- not fastpath
- and self.dtype.categories is not None
- and len(new_dtype.categories) != len(self.dtype.categories)
- ):
- raise ValueError(
- "new categories need to have the same number of "
- "items as the old categories!"
- )
-
- super().__init__(self._ndarray, new_dtype)
-
- def _set_dtype(self, dtype: CategoricalDtype) -> Categorical:
- """
- Internal method for directly updating the CategoricalDtype
-
- Parameters
- ----------
- dtype : CategoricalDtype
-
- Notes
- -----
- We don't do any validation here. It's assumed that the dtype is
- a (valid) instance of `CategoricalDtype`.
- """
- codes = recode_for_categories(self.codes, self.categories, dtype.categories)
- return type(self)(codes, dtype=dtype, fastpath=True)
-
- def set_ordered(self, value: bool) -> Categorical:
- """
- Set the ordered attribute to the boolean value.
-
- Parameters
- ----------
- value : bool
- Set whether this categorical is ordered (True) or not (False).
- """
- new_dtype = CategoricalDtype(self.categories, ordered=value)
- cat = self.copy()
- NDArrayBacked.__init__(cat, cat._ndarray, new_dtype)
- return cat
-
- def as_ordered(self) -> Categorical:
- """
- Set the Categorical to be ordered.
-
- Returns
- -------
- Categorical
- Ordered Categorical.
- """
- return self.set_ordered(True)
-
- def as_unordered(self) -> Categorical:
- """
- Set the Categorical to be unordered.
-
- Returns
- -------
- Categorical
- Unordered Categorical.
- """
- return self.set_ordered(False)
-
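These ordering toggles return new (shallow-copied) Categoricals rather than mutating in place; a quick sketch:

    import pandas as pd

    c = pd.Categorical(['a', 'b'])
    print(c.ordered)               # False
    print(c.as_ordered().ordered)  # True; the original c is unchanged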
- def set_categories(self, new_categories, ordered=None, rename: bool = False):
- """
- Set the categories to the specified new_categories.
-
- `new_categories` can include new categories (which will result in
- unused categories) or remove old categories (which results in values
-        set to NaN). If `rename==True`, the categories will simply be renamed
-        (fewer or more items than in the old categories will result in values set
-        to NaN or in unused categories, respectively).
-
- This method can be used to perform more than one action of adding,
- removing, and reordering simultaneously and is therefore faster than
- performing the individual steps via the more specialised methods.
-
-        On the other hand, this method does not run checks (e.g., whether the
-        old categories are included in the new categories on a reorder), which
-        can result in surprising changes, for example when using special string
-        dtypes, which do not consider an S1 string equal to a single-character
-        Python string.
-
- Parameters
- ----------
- new_categories : Index-like
- The categories in new order.
-        ordered : bool, optional
-            Whether or not the categorical is treated as an ordered categorical.
- If not given, do not change the ordered information.
- rename : bool, default False
- Whether or not the new_categories should be considered as a rename
- of the old categories or as reordered categories.
-
- Returns
- -------
- Categorical with reordered categories.
-
- Raises
- ------
- ValueError
- If new_categories does not validate as categories
-
- See Also
- --------
- rename_categories : Rename categories.
- reorder_categories : Reorder categories.
- add_categories : Add new categories.
- remove_categories : Remove the specified categories.
- remove_unused_categories : Remove categories which are not used.
- """
-
- if ordered is None:
- ordered = self.dtype.ordered
- new_dtype = CategoricalDtype(new_categories, ordered=ordered)
-
- cat = self.copy()
- if rename:
- if cat.dtype.categories is not None and len(new_dtype.categories) < len(
- cat.dtype.categories
- ):
- # remove all _codes which are larger and set to -1/NaN
- cat._codes[cat._codes >= len(new_dtype.categories)] = -1
- codes = cat._codes
- else:
- codes = recode_for_categories(
- cat.codes, cat.categories, new_dtype.categories
- )
- NDArrayBacked.__init__(cat, codes, new_dtype)
- return cat
-
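A sketch of the two behaviours described above: dropping a category nulls out its values, while rename=True relabels positions without recoding:

    import pandas as pd

    c = pd.Categorical(['a', 'b', 'c'])
    print(c.set_categories(['a', 'b']))
    # ['a', 'b', NaN], Categories (2, object): ['a', 'b']
    print(c.set_categories(['x', 'y', 'z'], rename=True))
    # ['x', 'y', 'z'], Categories (3, object): ['x', 'y', 'z']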
- def rename_categories(self, new_categories) -> Categorical:
- """
- Rename categories.
-
- Parameters
- ----------
- new_categories : list-like, dict-like or callable
-
- New categories which will replace old categories.
-
- * list-like: all items must be unique and the number of items in
- the new categories must match the existing number of categories.
-
- * dict-like: specifies a mapping from
- old categories to new. Categories not contained in the mapping
- are passed through and extra categories in the mapping are
- ignored.
-
- * callable : a callable that is called on all items in the old
- categories and whose return values comprise the new categories.
-
- Returns
- -------
- Categorical
- Categorical with renamed categories.
-
- Raises
- ------
- ValueError
- If new categories are list-like and do not have the same number of
-            items as the current categories or do not validate as categories
-
- See Also
- --------
- reorder_categories : Reorder categories.
- add_categories : Add new categories.
- remove_categories : Remove the specified categories.
- remove_unused_categories : Remove categories which are not used.
- set_categories : Set the categories to the specified ones.
-
- Examples
- --------
- >>> c = pd.Categorical(['a', 'a', 'b'])
- >>> c.rename_categories([0, 1])
- [0, 0, 1]
- Categories (2, int64): [0, 1]
-
- For dict-like ``new_categories``, extra keys are ignored and
- categories not in the dictionary are passed through
-
- >>> c.rename_categories({'a': 'A', 'c': 'C'})
- ['A', 'A', 'b']
- Categories (2, object): ['A', 'b']
-
- You may also provide a callable to create the new categories
-
- >>> c.rename_categories(lambda x: x.upper())
- ['A', 'A', 'B']
- Categories (2, object): ['A', 'B']
- """
-
- if is_dict_like(new_categories):
- new_categories = [
- new_categories.get(item, item) for item in self.categories
- ]
- elif callable(new_categories):
- new_categories = [new_categories(item) for item in self.categories]
-
- cat = self.copy()
- cat._set_categories(new_categories)
- return cat
-
- def reorder_categories(self, new_categories, ordered=None):
- """
- Reorder categories as specified in new_categories.
-
- `new_categories` need to include all old categories and no new category
- items.
-
- Parameters
- ----------
- new_categories : Index-like
- The categories in new order.
- ordered : bool, optional
-            Whether or not the categorical is treated as an ordered categorical.
- If not given, do not change the ordered information.
-
- Returns
- -------
- Categorical
- Categorical with reordered categories.
-
- Raises
- ------
- ValueError
- If the new categories do not contain all old category items or any
- new ones
-
- See Also
- --------
- rename_categories : Rename categories.
- add_categories : Add new categories.
- remove_categories : Remove the specified categories.
- remove_unused_categories : Remove categories which are not used.
- set_categories : Set the categories to the specified ones.
- """
- if (
- len(self.categories) != len(new_categories)
- or not self.categories.difference(new_categories).empty
- ):
- raise ValueError(
- "items in new_categories are not the same as in old categories"
- )
- return self.set_categories(new_categories, ordered=ordered)
-
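Reordering keeps the values intact and only changes the category order (and, optionally, the orderedness); a quick sketch:

    import pandas as pd

    c = pd.Categorical(['a', 'b', 'a'])
    print(c.reorder_categories(['b', 'a'], ordered=True))
    # ['a', 'b', 'a'], Categories (2, object): ['b' < 'a']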
- def add_categories(self, new_categories) -> Categorical:
- """
- Add new categories.
-
- `new_categories` will be included at the last/highest place in the
- categories and will be unused directly after this call.
-
- Parameters
- ----------
- new_categories : category or list-like of category
- The new categories to be included.
-
- Returns
- -------
- Categorical
- Categorical with new categories added.
-
- Raises
- ------
- ValueError
- If the new categories include old categories or do not validate as
- categories
-
- See Also
- --------
- rename_categories : Rename categories.
- reorder_categories : Reorder categories.
- remove_categories : Remove the specified categories.
- remove_unused_categories : Remove categories which are not used.
- set_categories : Set the categories to the specified ones.
-
- Examples
- --------
- >>> c = pd.Categorical(['c', 'b', 'c'])
- >>> c
- ['c', 'b', 'c']
- Categories (2, object): ['b', 'c']
-
- >>> c.add_categories(['d', 'a'])
- ['c', 'b', 'c']
- Categories (4, object): ['b', 'c', 'd', 'a']
- """
-
- if not is_list_like(new_categories):
- new_categories = [new_categories]
- already_included = set(new_categories) & set(self.dtype.categories)
- if len(already_included) != 0:
- raise ValueError(
- f"new categories must not include old categories: {already_included}"
- )
-
- if hasattr(new_categories, "dtype"):
- from pandas import Series
-
- dtype = find_common_type(
- [self.dtype.categories.dtype, new_categories.dtype]
- )
- new_categories = Series(
- list(self.dtype.categories) + list(new_categories), dtype=dtype
- )
- else:
- new_categories = list(self.dtype.categories) + list(new_categories)
-
- new_dtype = CategoricalDtype(new_categories, self.ordered)
- cat = self.copy()
- codes = coerce_indexer_dtype(cat._ndarray, new_dtype.categories)
- NDArrayBacked.__init__(cat, codes, new_dtype)
- return cat
-
- def remove_categories(self, removals):
- """
- Remove the specified categories.
-
- `removals` must be included in the old categories. Values which were in
- the removed categories will be set to NaN
-
- Parameters
- ----------
- removals : category or list of categories
- The categories which should be removed.
-
- Returns
- -------
- Categorical
- Categorical with removed categories.
-
- Raises
- ------
- ValueError
- If the removals are not contained in the categories
-
- See Also
- --------
- rename_categories : Rename categories.
- reorder_categories : Reorder categories.
- add_categories : Add new categories.
- remove_unused_categories : Remove categories which are not used.
- set_categories : Set the categories to the specified ones.
-
- Examples
- --------
- >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
- >>> c
- ['a', 'c', 'b', 'c', 'd']
- Categories (4, object): ['a', 'b', 'c', 'd']
-
- >>> c.remove_categories(['d', 'a'])
- [NaN, 'c', 'b', 'c', NaN]
- Categories (2, object): ['b', 'c']
- """
- from pandas import Index
-
- if not is_list_like(removals):
- removals = [removals]
-
- removals = Index(removals).unique().dropna()
- new_categories = self.dtype.categories.difference(removals)
- not_included = removals.difference(self.dtype.categories)
-
- if len(not_included) != 0:
- not_included = set(not_included)
- raise ValueError(f"removals must all be in old categories: {not_included}")
-
- return self.set_categories(new_categories, ordered=self.ordered, rename=False)
-
- def remove_unused_categories(self) -> Categorical:
- """
- Remove categories which are not used.
-
- Returns
- -------
- Categorical
- Categorical with unused categories dropped.
-
- See Also
- --------
- rename_categories : Rename categories.
- reorder_categories : Reorder categories.
- add_categories : Add new categories.
- remove_categories : Remove the specified categories.
- set_categories : Set the categories to the specified ones.
-
- Examples
- --------
- >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
- >>> c
- ['a', 'c', 'b', 'c', 'd']
- Categories (4, object): ['a', 'b', 'c', 'd']
-
- >>> c[2] = 'a'
- >>> c[4] = 'c'
- >>> c
- ['a', 'c', 'a', 'c', 'c']
- Categories (4, object): ['a', 'b', 'c', 'd']
-
- >>> c.remove_unused_categories()
- ['a', 'c', 'a', 'c', 'c']
- Categories (2, object): ['a', 'c']
- """
- idx, inv = np.unique(self._codes, return_inverse=True)
-
- if idx.size != 0 and idx[0] == -1: # na sentinel
- idx, inv = idx[1:], inv - 1
-
- new_categories = self.dtype.categories.take(idx)
- new_dtype = CategoricalDtype._from_fastpath(
- new_categories, ordered=self.ordered
- )
- new_codes = coerce_indexer_dtype(inv, new_dtype.categories)
-
- cat = self.copy()
- NDArrayBacked.__init__(cat, new_codes, new_dtype)
- return cat
-
- # ------------------------------------------------------------------
-
- def map(self, mapper):
- """
- Map categories using an input mapping or function.
-
- Maps the categories to new categories. If the mapping correspondence is
- one-to-one the result is a :class:`~pandas.Categorical` which has the
- same order property as the original, otherwise a :class:`~pandas.Index`
- is returned. NaN values are unaffected.
-
- If a `dict` or :class:`~pandas.Series` is used any unmapped category is
- mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
- will be returned.
-
- Parameters
- ----------
- mapper : function, dict, or Series
- Mapping correspondence.
-
- Returns
- -------
- pandas.Categorical or pandas.Index
- Mapped categorical.
-
- See Also
- --------
- CategoricalIndex.map : Apply a mapping correspondence on a
- :class:`~pandas.CategoricalIndex`.
- Index.map : Apply a mapping correspondence on an
- :class:`~pandas.Index`.
- Series.map : Apply a mapping correspondence on a
- :class:`~pandas.Series`.
- Series.apply : Apply more complex functions on a
- :class:`~pandas.Series`.
-
- Examples
- --------
- >>> cat = pd.Categorical(['a', 'b', 'c'])
- >>> cat
- ['a', 'b', 'c']
- Categories (3, object): ['a', 'b', 'c']
- >>> cat.map(lambda x: x.upper())
- ['A', 'B', 'C']
- Categories (3, object): ['A', 'B', 'C']
- >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
- ['first', 'second', 'third']
- Categories (3, object): ['first', 'second', 'third']
-
- If the mapping is one-to-one the ordering of the categories is
- preserved:
-
- >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
- >>> cat
- ['a', 'b', 'c']
- Categories (3, object): ['a' < 'b' < 'c']
- >>> cat.map({'a': 3, 'b': 2, 'c': 1})
- [3, 2, 1]
- Categories (3, int64): [3 < 2 < 1]
-
- If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
-
- >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
- Index(['first', 'second', 'first'], dtype='object')
-
- If a `dict` is used, all unmapped categories are mapped to `NaN` and
- the result is an :class:`~pandas.Index`:
-
- >>> cat.map({'a': 'first', 'b': 'second'})
- Index(['first', 'second', nan], dtype='object')
- """
- new_categories = self.categories.map(mapper)
- try:
- return self.from_codes(
- self._codes.copy(), categories=new_categories, ordered=self.ordered
- )
- except ValueError:
- # NA values are represented in self._codes with -1
- # np.take causes NA values to take final element in new_categories
- if np.any(self._codes == -1):
- new_categories = new_categories.insert(len(new_categories), np.nan)
- return np.take(new_categories, self._codes)
-
- __eq__ = _cat_compare_op(operator.eq)
- __ne__ = _cat_compare_op(operator.ne)
- __lt__ = _cat_compare_op(operator.lt)
- __gt__ = _cat_compare_op(operator.gt)
- __le__ = _cat_compare_op(operator.le)
- __ge__ = _cat_compare_op(operator.ge)
-
- # -------------------------------------------------------------
- # Validators; ideally these can be de-duplicated
-
- def _validate_setitem_value(self, value):
- if not is_hashable(value):
- # wrap scalars and hashable-listlikes in list
- return self._validate_listlike(value)
- else:
- return self._validate_scalar(value)
-
- def _validate_scalar(self, fill_value):
- """
- Convert a user-facing fill_value to a representation to use with our
- underlying ndarray, raising TypeError if this is not possible.
-
- Parameters
- ----------
- fill_value : object
-
- Returns
- -------
- fill_value : int
-
- Raises
- ------
- TypeError
- """
-
- if is_valid_na_for_dtype(fill_value, self.categories.dtype):
- fill_value = -1
- elif fill_value in self.categories:
- fill_value = self._unbox_scalar(fill_value)
- else:
- raise TypeError(
- "Cannot setitem on a Categorical with a new "
- f"category ({fill_value}), set the categories first"
- ) from None
- return fill_value
-
- # -------------------------------------------------------------
-
- @ravel_compat
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- """
- The numpy array interface.
-
- Returns
- -------
- numpy.array
- A numpy array of either the specified dtype or,
- if dtype==None (default), the same dtype as
- categorical.categories.dtype.
- """
- ret = take_nd(self.categories._values, self._codes)
- if dtype and not is_dtype_equal(dtype, self.categories.dtype):
- return np.asarray(ret, dtype)
- # When we're a Categorical[ExtensionArray], like Interval,
- # we need to ensure __array__ gets all the way to an
- # ndarray.
- return np.asarray(ret)
-
- def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- # for binary ops, use our custom dunder methods
- result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- if "out" in kwargs:
- # e.g. test_numpy_ufuncs_out
- return arraylike.dispatch_ufunc_with_out(
- self, ufunc, method, *inputs, **kwargs
- )
-
- if method == "reduce":
- # e.g. TestCategoricalAnalytics::test_min_max_ordered
- result = arraylike.dispatch_reduction_ufunc(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- # for all other cases, raise for now (similarly as what happens in
- # Series.__array_prepare__)
- raise TypeError(
- f"Object with dtype {self.dtype} cannot perform "
- f"the numpy op {ufunc.__name__}"
- )
-
- def __setstate__(self, state) -> None:
- """Necessary for making this object picklable"""
- if not isinstance(state, dict):
- return super().__setstate__(state)
-
- if "_dtype" not in state:
- state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])
-
- if "_codes" in state and "_ndarray" not in state:
- # backward compat, changed what is property vs attribute
- state["_ndarray"] = state.pop("_codes")
-
- super().__setstate__(state)
-
- @property
- def nbytes(self) -> int:
- return self._codes.nbytes + self.dtype.categories.values.nbytes
-
- def memory_usage(self, deep: bool = False) -> int:
- """
- Memory usage of my values
-
- Parameters
- ----------
- deep : bool
- Introspect the data deeply, interrogate
- `object` dtypes for system-level memory consumption
-
- Returns
- -------
- bytes used
-
- Notes
- -----
- Memory usage does not include memory consumed by elements that
- are not components of the array if deep=False
-
- See Also
- --------
- numpy.ndarray.nbytes
- """
- return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep)
-
- def isna(self) -> np.ndarray:
- """
- Detect missing values
-
- Missing values (-1 in .codes) are detected.
-
- Returns
- -------
- np.ndarray[bool] of whether my values are null
-
- See Also
- --------
- isna : Top-level isna.
- isnull : Alias of isna.
- Categorical.notna : Boolean inverse of Categorical.isna.
-
- """
- return self._codes == -1
-
- isnull = isna
-
- def notna(self) -> np.ndarray:
- """
- Inverse of isna
-
- Both missing values (-1 in .codes) and NA as a category are detected as
- null.
-
- Returns
- -------
- np.ndarray[bool] of whether my values are not null
-
- See Also
- --------
- notna : Top-level notna.
- notnull : Alias of notna.
- Categorical.isna : Boolean inverse of Categorical.notna.
-
- """
- return ~self.isna()
-
- notnull = notna
-
- def value_counts(self, dropna: bool = True) -> Series:
- """
- Return a Series containing counts of each category.
-
- Every category will have an entry, even those with a count of 0.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include counts of NaN.
-
- Returns
- -------
- counts : Series
-
- See Also
- --------
- Series.value_counts
- """
- from pandas import (
- CategoricalIndex,
- Series,
- )
-
- code, cat = self._codes, self.categories
- ncat, mask = (len(cat), code >= 0)
- ix, clean = np.arange(ncat), mask.all()
-
- if dropna or clean:
- obs = code if clean else code[mask]
- count = np.bincount(obs, minlength=ncat or 0)
- else:
- count = np.bincount(np.where(mask, code, ncat))
- ix = np.append(ix, -1)
-
- ix = coerce_indexer_dtype(ix, self.dtype.categories)
- ix = self._from_backing_data(ix)
-
- return Series(
- count, index=CategoricalIndex(ix), dtype="int64", name="count", copy=False
- )
-
- # error: Argument 2 of "_empty" is incompatible with supertype
- # "NDArrayBackedExtensionArray"; supertype defines the argument type as
- # "ExtensionDtype"
- @classmethod
- def _empty( # type: ignore[override]
- cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
- ) -> Categorical:
- """
- Analogous to np.empty(shape, dtype=dtype)
-
- Parameters
- ----------
- shape : tuple[int]
- dtype : CategoricalDtype
- """
- arr = cls._from_sequence([], dtype=dtype)
-
- # We have to use np.zeros instead of np.empty otherwise the resulting
- # ndarray may contain codes not supported by this dtype, in which
- # case repr(result) could segfault.
- backing = np.zeros(shape, dtype=arr._ndarray.dtype)
-
- return arr._from_backing_data(backing)
-
- def _internal_get_values(self):
- """
- Return the values.
-
- For internal compatibility with pandas formatting.
-
- Returns
- -------
- np.ndarray or Index
- A numpy array of the same dtype as categorical.categories.dtype or
- Index if datetime / periods.
- """
- # if we are a datetime and period index, return Index to keep metadata
- if needs_i8_conversion(self.categories.dtype):
- return self.categories.take(self._codes, fill_value=NaT)
- elif is_integer_dtype(self.categories) and -1 in self._codes:
- return self.categories.astype("object").take(self._codes, fill_value=np.nan)
- return np.array(self)
-
- def check_for_ordered(self, op) -> None:
- """assert that we are ordered"""
- if not self.ordered:
- raise TypeError(
- f"Categorical is not ordered for operation {op}\n"
- "you can use .as_ordered() to change the "
- "Categorical to an ordered one\n"
- )
-
- def argsort(
- self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
- ):
- """
- Return the indices that would sort the Categorical.
-
- Missing values are sorted at the end.
-
- Parameters
- ----------
- ascending : bool, default True
- Whether the indices should result in an ascending
- or descending sort.
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
- Sorting algorithm.
- **kwargs:
- passed through to :func:`numpy.argsort`.
-
- Returns
- -------
- np.ndarray[np.intp]
-
- See Also
- --------
- numpy.ndarray.argsort
-
- Notes
- -----
- While an ordering is applied to the category values, arg-sorting
- in this context refers more to organizing and grouping together
- based on matching category values. Thus, this function can be
- called on an unordered Categorical instance unlike the functions
- 'Categorical.min' and 'Categorical.max'.
-
- Examples
- --------
- >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
- array([2, 0, 1, 3])
-
- >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
- ... categories=['c', 'b', 'a'],
- ... ordered=True)
- >>> cat.argsort()
- array([3, 0, 1, 2])
-
- Missing values are placed at the end
-
- >>> cat = pd.Categorical([2, None, 1])
- >>> cat.argsort()
- array([2, 0, 1])
- """
- return super().argsort(ascending=ascending, kind=kind, **kwargs)
-
- @overload
- def sort_values(
- self,
- *,
- inplace: Literal[False] = ...,
- ascending: bool = ...,
- na_position: str = ...,
- ) -> Categorical:
- ...
-
- @overload
- def sort_values(
- self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
- ) -> None:
- ...
-
- def sort_values(
- self,
- *,
- inplace: bool = False,
- ascending: bool = True,
- na_position: str = "last",
- ) -> Categorical | None:
- """
- Sort the Categorical by category value returning a new
- Categorical by default.
-
- While an ordering is applied to the category values, sorting in this
- context refers more to organizing and grouping together based on
- matching category values. Thus, this function can be called on an
- unordered Categorical instance unlike the functions 'Categorical.min'
- and 'Categorical.max'.
-
- Parameters
- ----------
- inplace : bool, default False
- Do operation in place.
- ascending : bool, default True
- Order ascending. Passing False orders descending. The
- ordering parameter provides the method by which the
- category values are organized.
- na_position : {'first', 'last'} (optional, default='last')
- 'first' puts NaNs at the beginning
- 'last' puts NaNs at the end
-
- Returns
- -------
- Categorical or None
-
- See Also
- --------
- Categorical.sort
- Series.sort_values
-
- Examples
- --------
- >>> c = pd.Categorical([1, 2, 2, 1, 5])
- >>> c
- [1, 2, 2, 1, 5]
- Categories (3, int64): [1, 2, 5]
- >>> c.sort_values()
- [1, 1, 2, 2, 5]
- Categories (3, int64): [1, 2, 5]
- >>> c.sort_values(ascending=False)
- [5, 2, 2, 1, 1]
- Categories (3, int64): [1, 2, 5]
-
- >>> c = pd.Categorical([1, 2, 2, 1, 5])
-
- 'sort_values' behaviour with NaNs. Note that 'na_position'
- is independent of the 'ascending' parameter:
-
- >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
- >>> c
- [NaN, 2, 2, NaN, 5]
- Categories (2, int64): [2, 5]
- >>> c.sort_values()
- [2, 2, 5, NaN, NaN]
- Categories (2, int64): [2, 5]
- >>> c.sort_values(ascending=False)
- [5, 2, 2, NaN, NaN]
- Categories (2, int64): [2, 5]
- >>> c.sort_values(na_position='first')
- [NaN, NaN, 2, 2, 5]
- Categories (2, int64): [2, 5]
- >>> c.sort_values(ascending=False, na_position='first')
- [NaN, NaN, 5, 2, 2]
- Categories (2, int64): [2, 5]
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if na_position not in ["last", "first"]:
- raise ValueError(f"invalid na_position: {repr(na_position)}")
-
- sorted_idx = nargsort(self, ascending=ascending, na_position=na_position)
-
- if not inplace:
- codes = self._codes[sorted_idx]
- return self._from_backing_data(codes)
- self._codes[:] = self._codes[sorted_idx]
- return None
-
- def _rank(
- self,
- *,
- axis: AxisInt = 0,
- method: str = "average",
- na_option: str = "keep",
- ascending: bool = True,
- pct: bool = False,
- ):
- """
- See Series.rank.__doc__.
- """
- if axis != 0:
- raise NotImplementedError
- vff = self._values_for_rank()
- return algorithms.rank(
- vff,
- axis=axis,
- method=method,
- na_option=na_option,
- ascending=ascending,
- pct=pct,
- )
-
- def _values_for_rank(self):
- """
- For correctly ranking ordered categorical data. See GH#15420
-
- Ordered categorical data should be ranked on the basis of
- codes with -1 translated to NaN.
-
- Returns
- -------
- numpy.array
-
- """
- from pandas import Series
-
- if self.ordered:
- values = self.codes
- mask = values == -1
- if mask.any():
- values = values.astype("float64")
- values[mask] = np.nan
- elif is_any_real_numeric_dtype(self.categories):
- values = np.array(self)
- else:
- # reorder the categories (so rank can use the float codes)
- # instead of passing an object array to rank
- values = np.array(
- self.rename_categories(
- Series(self.categories, copy=False).rank().values
- )
- )
- return values
-
- # ------------------------------------------------------------------
- # NDArrayBackedExtensionArray compat
-
- @property
- def _codes(self) -> np.ndarray:
- return self._ndarray
-
- def _box_func(self, i: int):
- if i == -1:
- return np.NaN
- return self.categories[i]
-
- def _unbox_scalar(self, key) -> int:
- # searchsorted is very performance sensitive. By converting codes
- # to same dtype as self.codes, we get much faster performance.
- code = self.categories.get_loc(key)
- code = self._ndarray.dtype.type(code)
- return code
-
- # ------------------------------------------------------------------
-
- def __iter__(self) -> Iterator:
- """
- Returns an Iterator over the values of this Categorical.
- """
- if self.ndim == 1:
- return iter(self._internal_get_values().tolist())
- else:
- return (self[n] for n in range(len(self)))
-
- def __contains__(self, key) -> bool:
- """
- Returns True if `key` is in this Categorical.
- """
- # if key is a NaN, check if any NaN is in self.
- if is_valid_na_for_dtype(key, self.categories.dtype):
- return bool(self.isna().any())
-
- return contains(self, key, container=self._codes)
-
- # ------------------------------------------------------------------
- # Rendering Methods
-
- def _formatter(self, boxed: bool = False):
- # Defer to CategoricalFormatter's formatter.
- return None
-
- def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str:
- """
- a short repr displaying only max_vals and an optional (but default)
- footer
- """
- num = max_vals // 2
- head = self[:num]._get_repr(length=False, footer=False)
- tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)
-
- result = f"{head[:-1]}, ..., {tail[1:]}"
- if footer:
- result = f"{result}\n{self._repr_footer()}"
-
- return str(result)
-
- def _repr_categories(self) -> list[str]:
- """
- return the base repr for the categories
- """
- max_categories = (
- 10
- if get_option("display.max_categories") == 0
- else get_option("display.max_categories")
- )
- from pandas.io.formats import format as fmt
-
- format_array = partial(
- fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
- )
- if len(self.categories) > max_categories:
- num = max_categories // 2
- head = format_array(self.categories[:num])
- tail = format_array(self.categories[-num:])
- category_strs = head + ["..."] + tail
- else:
- category_strs = format_array(self.categories)
-
- # Strip all leading spaces, which format_array adds for columns...
- category_strs = [x.strip() for x in category_strs]
- return category_strs
-
- def _repr_categories_info(self) -> str:
- """
- Returns a string representation of the footer.
- """
- category_strs = self._repr_categories()
- dtype = str(self.categories.dtype)
- levheader = f"Categories ({len(self.categories)}, {dtype}): "
- width, height = get_terminal_size()
- max_width = get_option("display.width") or width
- if console.in_ipython_frontend():
- # 0 = no breaks
- max_width = 0
- levstring = ""
- start = True
- cur_col_len = len(levheader) # header
- sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
- linesep = f"{sep.rstrip()}\n" # remove whitespace
- for val in category_strs:
- if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
- levstring += linesep + (" " * (len(levheader) + 1))
- cur_col_len = len(levheader) + 1 # header + a whitespace
- elif not start:
- levstring += sep
- cur_col_len += len(val)
- levstring += val
- start = False
- # replace ' < ... < ' with ' ... ' to save space
- return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]"
-
- def _repr_footer(self) -> str:
- info = self._repr_categories_info()
- return f"Length: {len(self)}\n{info}"
-
- def _get_repr(
- self, length: bool = True, na_rep: str = "NaN", footer: bool = True
- ) -> str:
- from pandas.io.formats import format as fmt
-
- formatter = fmt.CategoricalFormatter(
- self, length=length, na_rep=na_rep, footer=footer
- )
- result = formatter.to_string()
- return str(result)
-
- def __repr__(self) -> str:
- """
- String representation.
- """
- _maxlen = 10
- if len(self._codes) > _maxlen:
- result = self._tidy_repr(_maxlen)
- elif len(self._codes) > 0:
- result = self._get_repr(length=len(self) > _maxlen)
- else:
- msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
- result = f"[], {msg}"
-
- return result
-
- # ------------------------------------------------------------------
-
- def _validate_listlike(self, value):
- # NB: here we assume scalar-like tuples have already been excluded
- value = extract_array(value, extract_numpy=True)
-
- # require identical categories set
- if isinstance(value, Categorical):
- if not is_dtype_equal(self.dtype, value.dtype):
- raise TypeError(
- "Cannot set a Categorical with another, "
- "without identical categories"
- )
- # is_dtype_equal implies categories_match_up_to_permutation
- value = self._encode_with_my_categories(value)
- return value._codes
-
- from pandas import Index
-
- # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
- to_add = Index._with_infer(value, tupleize_cols=False).difference(
- self.categories
- )
-
- # no assignments of values not in categories, but it's always ok to set
- # something to np.nan
- if len(to_add) and not isna(to_add).all():
- raise TypeError(
- "Cannot setitem on a Categorical with a new "
- "category, set the categories first"
- )
-
- codes = self.categories.get_indexer(value)
- return codes.astype(self._ndarray.dtype, copy=False)
-
- def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
- """
- Compute the inverse of a categorical, returning
- a dict of categories -> indexers.
-
- *This is an internal function*
-
- Returns
- -------
- Dict[Hashable, np.ndarray[np.intp]]
- dict of categories -> indexers
-
- Examples
- --------
- >>> c = pd.Categorical(list('aabca'))
- >>> c
- ['a', 'a', 'b', 'c', 'a']
- Categories (3, object): ['a', 'b', 'c']
- >>> c.categories
- Index(['a', 'b', 'c'], dtype='object')
- >>> c.codes
- array([0, 0, 1, 2, 0], dtype=int8)
- >>> c._reverse_indexer()
- {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
-
- """
- categories = self.categories
- r, counts = libalgos.groupsort_indexer(
- ensure_platform_int(self.codes), categories.size
- )
- counts = ensure_int64(counts).cumsum()
- _result = (r[start:end] for start, end in zip(counts, counts[1:]))
- return dict(zip(categories, _result))
-
- # ------------------------------------------------------------------
- # Reductions
-
- def min(self, *, skipna: bool = True, **kwargs):
- """
- The minimum value of the object.
-
- Only ordered `Categoricals` have a minimum!
-
- Raises
- ------
- TypeError
- If the `Categorical` is not `ordered`.
-
- Returns
- -------
- min : the minimum of this `Categorical`, NA value if empty
- """
- nv.validate_minmax_axis(kwargs.get("axis", 0))
- nv.validate_min((), kwargs)
- self.check_for_ordered("min")
-
- if not len(self._codes):
- return self.dtype.na_value
-
- good = self._codes != -1
- if not good.all():
- if skipna and good.any():
- pointer = self._codes[good].min()
- else:
- return np.nan
- else:
- pointer = self._codes.min()
- return self._wrap_reduction_result(None, pointer)
-
- def max(self, *, skipna: bool = True, **kwargs):
- """
- The maximum value of the object.
-
- Only ordered `Categoricals` have a maximum!
-
- Raises
- ------
- TypeError
- If the `Categorical` is not `ordered`.
-
- Returns
- -------
- max : the maximum of this `Categorical`, NA if array is empty
- """
- nv.validate_minmax_axis(kwargs.get("axis", 0))
- nv.validate_max((), kwargs)
- self.check_for_ordered("max")
-
- if not len(self._codes):
- return self.dtype.na_value
-
- good = self._codes != -1
- if not good.all():
- if skipna and good.any():
- pointer = self._codes[good].max()
- else:
- return np.nan
- else:
- pointer = self._codes.max()
- return self._wrap_reduction_result(None, pointer)
-
- def _mode(self, dropna: bool = True) -> Categorical:
- codes = self._codes
- mask = None
- if dropna:
- mask = self.isna()
-
- res_codes = algorithms.mode(codes, mask=mask)
- res_codes = cast(np.ndarray, res_codes)
- assert res_codes.dtype == codes.dtype
- res = self._from_backing_data(res_codes)
- return res
-
- # ------------------------------------------------------------------
- # ExtensionArray Interface
-
- def unique(self):
- """
- Return the ``Categorical`` whose ``categories`` and ``codes`` are
- unique.
-
- .. versionchanged:: 1.3.0
-
- Previously, unused categories were dropped from the new categories.
-
- Returns
- -------
- Categorical
-
- See Also
- --------
- pandas.unique
- CategoricalIndex.unique
- Series.unique : Return unique values of Series object.
-
- Examples
- --------
- >>> pd.Categorical(list("baabc")).unique()
- ['b', 'a', 'c']
- Categories (3, object): ['a', 'b', 'c']
- >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
- ['b', 'a']
- Categories (3, object): ['a' < 'b' < 'c']
- """
- # pylint: disable=useless-parent-delegation
- return super().unique()
-
- def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
- # make sure we have correct itemsize for resulting codes
- assert res_values.dtype == self._ndarray.dtype
- return res_values
-
- def equals(self, other: object) -> bool:
- """
- Returns True if categorical arrays are equal.
-
- Parameters
- ----------
- other : `Categorical`
-
- Returns
- -------
- bool
- """
- if not isinstance(other, Categorical):
- return False
- elif self._categories_match_up_to_permutation(other):
- other = self._encode_with_my_categories(other)
- return np.array_equal(self._codes, other._codes)
- return False
-
- @classmethod
- def _concat_same_type(
- cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: AxisInt = 0
- ) -> CategoricalT:
- from pandas.core.dtypes.concat import union_categoricals
-
- first = to_concat[0]
- if axis >= first.ndim:
- raise ValueError(
- f"axis {axis} is out of bounds for array of dimension {first.ndim}"
- )
-
- if axis == 1:
- # Flatten, concatenate then reshape
- if not all(x.ndim == 2 for x in to_concat):
- raise ValueError
-
- # pass correctly-shaped to union_categoricals
- tc_flat = []
- for obj in to_concat:
- tc_flat.extend([obj[:, i] for i in range(obj.shape[1])])
-
- res_flat = cls._concat_same_type(tc_flat, axis=0)
-
- result = res_flat.reshape(len(first), -1, order="F")
- return result
-
- result = union_categoricals(to_concat)
- return result
-
- # ------------------------------------------------------------------
-
- def _encode_with_my_categories(self, other: Categorical) -> Categorical:
- """
- Re-encode another categorical using this Categorical's categories.
-
- Notes
- -----
- This assumes we have already checked
- self._categories_match_up_to_permutation(other).
- """
- # Indexing on codes is more efficient if categories are the same,
- # so we can apply some optimizations based on the degree of
- # dtype-matching.
- codes = recode_for_categories(
- other.codes, other.categories, self.categories, copy=False
- )
- return self._from_backing_data(codes)
-
- def _categories_match_up_to_permutation(self, other: Categorical) -> bool:
- """
- Returns True if the categoricals have the same dtype,
- the same categories, and the same ordered attribute.
-
- Parameters
- ----------
- other : Categorical
-
- Returns
- -------
- bool
- """
- return hash(self.dtype) == hash(other.dtype)
-
- def describe(self) -> DataFrame:
- """
- Describes this Categorical
-
- Returns
- -------
- description: `DataFrame`
- A dataframe with frequency and counts by category.
- """
- counts = self.value_counts(dropna=False)
- freqs = counts / counts.sum()
-
- from pandas import Index
- from pandas.core.reshape.concat import concat
-
- result = concat([counts, freqs], axis=1)
- result.columns = Index(["counts", "freqs"])
- result.index.name = "categories"
-
- return result
-
- def isin(self, values) -> npt.NDArray[np.bool_]:
- """
- Check whether `values` are contained in Categorical.
-
- Return a boolean NumPy Array showing whether each element in
- the Categorical matches an element in the passed sequence of
- `values` exactly.
-
- Parameters
- ----------
- values : set or list-like
- The sequence of values to test. Passing in a single string will
- raise a ``TypeError``. Instead, turn a single string into a
- list of one element.
-
- Returns
- -------
- np.ndarray[bool]
-
- Raises
- ------
- TypeError
- * If `values` is not a set or list-like
-
- See Also
- --------
- pandas.Series.isin : Equivalent method on Series.
-
- Examples
- --------
- >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
- ... 'hippo'])
- >>> s.isin(['cow', 'lama'])
- array([ True, True, True, False, True, False])
-
- Passing a single string as ``s.isin('lama')`` will raise an error. Use
- a list of one element instead:
-
- >>> s.isin(['lama'])
- array([ True, False, True, False, True, False])
- """
- if not is_list_like(values):
- values_type = type(values).__name__
- raise TypeError(
- "only list-like objects are allowed to be passed "
- f"to isin(), you passed a [{values_type}]"
- )
- values = sanitize_array(values, None, None)
- null_mask = np.asarray(isna(values))
- code_values = self.categories.get_indexer(values)
- code_values = code_values[null_mask | (code_values >= 0)]
- return algorithms.isin(self.codes, code_values)
-
- def _replace(self, *, to_replace, value, inplace: bool = False):
- from pandas import Index
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- cat = self if inplace else self.copy()
-
- mask = isna(np.asarray(value))
- if mask.any():
- removals = np.asarray(to_replace)[mask]
- removals = cat.categories[cat.categories.isin(removals)]
- new_cat = cat.remove_categories(removals)
- NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)
-
- ser = cat.categories.to_series()
- ser = ser.replace(to_replace=to_replace, value=value)
-
- all_values = Index(ser)
-
- # GH51016: maintain order of existing categories
- idxr = cat.categories.get_indexer_for(all_values)
- locs = np.arange(len(ser))
- locs = np.where(idxr == -1, locs, idxr)
- locs = locs.argsort()
-
- new_categories = ser.take(locs)
- new_categories = new_categories.drop_duplicates(keep="first")
- new_categories = Index(new_categories)
- new_codes = recode_for_categories(
- cat._codes, all_values, new_categories, copy=False
- )
- new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
- NDArrayBacked.__init__(cat, new_codes, new_dtype)
-
- if not inplace:
- return cat
-
- # ------------------------------------------------------------------------
- # String methods interface
- def _str_map(
- self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
- ):
- # Optimization to apply the callable `f` to the categories once
- # and rebuild the result by `take`ing from the result with the codes.
- # Returns the same type as the object-dtype implementation though.
- from pandas.core.arrays import PandasArray
-
- categories = self.categories
- codes = self.codes
- result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype)
- return take_nd(result, codes, fill_value=na_value)
-
- def _str_get_dummies(self, sep: str = "|"):
- # sep may not be in categories. Just bail on this.
- from pandas.core.arrays import PandasArray
-
- return PandasArray(self.astype(str))._str_get_dummies(sep)
-
-
-# The Series.cat accessor
-
-
-@delegate_names(
- delegate=Categorical, accessors=["categories", "ordered"], typ="property"
-)
-@delegate_names(
- delegate=Categorical,
- accessors=[
- "rename_categories",
- "reorder_categories",
- "add_categories",
- "remove_categories",
- "remove_unused_categories",
- "set_categories",
- "as_ordered",
- "as_unordered",
- ],
- typ="method",
-)
-class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
- """
- Accessor object for categorical properties of the Series values.
-
- Parameters
- ----------
- data : Series or CategoricalIndex
-
- Examples
- --------
- >>> s = pd.Series(list("abbccc")).astype("category")
- >>> s
- 0 a
- 1 b
- 2 b
- 3 c
- 4 c
- 5 c
- dtype: category
- Categories (3, object): ['a', 'b', 'c']
-
- >>> s.cat.categories
- Index(['a', 'b', 'c'], dtype='object')
-
- >>> s.cat.rename_categories(list("cba"))
- 0 c
- 1 b
- 2 b
- 3 a
- 4 a
- 5 a
- dtype: category
- Categories (3, object): ['c', 'b', 'a']
-
- >>> s.cat.reorder_categories(list("cba"))
- 0 a
- 1 b
- 2 b
- 3 c
- 4 c
- 5 c
- dtype: category
- Categories (3, object): ['c', 'b', 'a']
-
- >>> s.cat.add_categories(["d", "e"])
- 0 a
- 1 b
- 2 b
- 3 c
- 4 c
- 5 c
- dtype: category
- Categories (5, object): ['a', 'b', 'c', 'd', 'e']
-
- >>> s.cat.remove_categories(["a", "c"])
- 0 NaN
- 1 b
- 2 b
- 3 NaN
- 4 NaN
- 5 NaN
- dtype: category
- Categories (1, object): ['b']
-
- >>> s1 = s.cat.add_categories(["d", "e"])
- >>> s1.cat.remove_unused_categories()
- 0 a
- 1 b
- 2 b
- 3 c
- 4 c
- 5 c
- dtype: category
- Categories (3, object): ['a', 'b', 'c']
-
- >>> s.cat.set_categories(list("abcde"))
- 0 a
- 1 b
- 2 b
- 3 c
- 4 c
- 5 c
- dtype: category
- Categories (5, object): ['a', 'b', 'c', 'd', 'e']
-
- >>> s.cat.as_ordered()
- 0 a
- 1 b
- 2 b
- 3 c
- 4 c
- 5 c
- dtype: category
- Categories (3, object): ['a' < 'b' < 'c']
-
- >>> s.cat.as_unordered()
- 0 a
- 1 b
- 2 b
- 3 c
- 4 c
- 5 c
- dtype: category
- Categories (3, object): ['a', 'b', 'c']
- """
-
- def __init__(self, data) -> None:
- self._validate(data)
- self._parent = data.values
- self._index = data.index
- self._name = data.name
- self._freeze()
-
- @staticmethod
- def _validate(data):
- if not is_categorical_dtype(data.dtype):
- raise AttributeError("Can only use .cat accessor with a 'category' dtype")
-
- def _delegate_property_get(self, name):
- return getattr(self._parent, name)
-
- def _delegate_property_set(self, name, new_values):
- return setattr(self._parent, name, new_values)
-
- @property
- def codes(self) -> Series:
- """
- Return Series of codes as well as the index.
- """
- from pandas import Series
-
- return Series(self._parent.codes, index=self._index)
-
- def _delegate_method(self, name, *args, **kwargs):
- from pandas import Series
-
- method = getattr(self._parent, name)
- res = method(*args, **kwargs)
- if res is not None:
- return Series(res, index=self._index, name=self._name)
-
-
-# utility routines
-
-
-def _get_codes_for_values(values, categories: Index) -> np.ndarray:
- """
- utility routine to turn values into codes given the specified categories
-
- If `values` is known to be a Categorical, use recode_for_categories instead.
- """
- if values.ndim > 1:
- flat = values.ravel()
- codes = _get_codes_for_values(flat, categories)
- return codes.reshape(values.shape)
-
- codes = categories.get_indexer_for(values)
- return coerce_indexer_dtype(codes, categories)
-
-
-def recode_for_categories(
- codes: np.ndarray, old_categories, new_categories, copy: bool = True
-) -> np.ndarray:
- """
- Convert a set of codes to a new set of categories.
-
- Parameters
- ----------
- codes : np.ndarray
- old_categories, new_categories : Index
- copy: bool, default True
- Whether to copy if the codes are unchanged.
-
- Returns
- -------
- new_codes : np.ndarray[np.int64]
-
- Examples
- --------
- >>> old_cat = pd.Index(['b', 'a', 'c'])
- >>> new_cat = pd.Index(['a', 'b'])
- >>> codes = np.array([0, 1, 1, 2])
- >>> recode_for_categories(codes, old_cat, new_cat)
- array([ 1, 0, 0, -1], dtype=int8)
- """
- if len(old_categories) == 0:
- # All null anyway, so just retain the nulls
- if copy:
- return codes.copy()
- return codes
- elif new_categories.equals(old_categories):
- # Same categories, so no need to actually recode
- if copy:
- return codes.copy()
- return codes
-
- indexer = coerce_indexer_dtype(
- new_categories.get_indexer(old_categories), new_categories
- )
- new_codes = take_nd(indexer, codes, fill_value=-1)
- return new_codes
-
-
-def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
- """
- Factorize an input `values` into `categories` and `codes`. Preserves
- categorical dtype in `categories`.
-
- Parameters
- ----------
- values : list-like
-
- Returns
- -------
- codes : ndarray
- categories : Index
- If `values` has a categorical dtype, then `categories` is
- a CategoricalIndex keeping the categories and order of `values`.
- """
- from pandas import CategoricalIndex
-
- if not is_list_like(values):
- raise TypeError("Input must be list-like")
-
- categories: Index
- if is_categorical_dtype(values):
- values = extract_array(values)
- # The Categorical we want to build has the same categories
- # as values, but its codes are by definition [0, ..., n_categories - 1]
- cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype)
- cat = Categorical.from_codes(cat_codes, dtype=values.dtype)
-
- categories = CategoricalIndex(cat)
- codes = values.codes
- else:
- # The value of ordered is irrelevant since we don't use cat as such,
- # but only the resulting categories, the order of which is independent
- # from ordered. Set ordered to False as default. See GH #15457
- cat = Categorical(values, ordered=False)
- categories = cat.categories
- codes = cat.codes
- return codes, categories
-
-
-def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
- """
- A higher-level wrapper over `factorize_from_iterable`.
-
- Parameters
- ----------
- iterables : list-like of list-likes
-
- Returns
- -------
- codes : list of ndarrays
- categories : list of Indexes
-
- Notes
- -----
- See `factorize_from_iterable` for more info.
- """
- if len(iterables) == 0:
- # For consistency, it should return two empty lists.
- return [], []
-
- codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
- return list(codes), list(categories)
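The two module-level helpers above, recode_for_categories and factorize_from_iterable, are what this module uses to express codes against a different category set and to factorize arbitrary list-likes. A minimal sketch of their documented behaviour, assuming a pandas build that still ships this module (pandas.core.arrays.categorical is an internal path, not a stable API):

import numpy as np
import pandas as pd
from pandas.core.arrays.categorical import (
    factorize_from_iterable,
    recode_for_categories,
)

# Recode codes defined against ['b', 'a', 'c'] onto ['a', 'b']; 'c' has no
# slot in the new categories, so its code becomes the -1 NA sentinel.
old_cat = pd.Index(["b", "a", "c"])
new_cat = pd.Index(["a", "b"])
codes = np.array([0, 1, 1, 2])
print(recode_for_categories(codes, old_cat, new_cat))  # [ 1  0  0 -1]

# Factorize a plain list-like: categories come back sorted, codes index into them.
codes, categories = factorize_from_iterable(["b", "a", "b"])
print(codes)       # [1 0 1]
print(categories)  # Index(['a', 'b'], dtype='object')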
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/datetimelike.py b/contrib/python/pandas/py3/pandas/core/arrays/datetimelike.py
deleted file mode 100644
index 8545cd1499b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/datetimelike.py
+++ /dev/null
@@ -1,2267 +0,0 @@
-from __future__ import annotations
-
-from datetime import (
- datetime,
- timedelta,
-)
-from functools import wraps
-import operator
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Iterator,
- Literal,
- Sequence,
- TypeVar,
- Union,
- cast,
- final,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- algos,
- lib,
-)
-from pandas._libs.arrays import NDArrayBacked
-from pandas._libs.tslibs import (
- BaseOffset,
- IncompatibleFrequency,
- NaT,
- NaTType,
- Period,
- Resolution,
- Tick,
- Timedelta,
- Timestamp,
- astype_overflowsafe,
- delta_to_nanoseconds,
- get_unit_from_dtype,
- iNaT,
- ints_to_pydatetime,
- ints_to_pytimedelta,
- to_offset,
-)
-from pandas._libs.tslibs.fields import (
- RoundTo,
- round_nsint64,
-)
-from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions
-from pandas._libs.tslibs.timestamps import integer_op_not_supported
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- DatetimeLikeScalar,
- Dtype,
- DtypeObj,
- F,
- NpDtype,
- PositionalIndexer2D,
- PositionalIndexerTuple,
- ScalarIndexer,
- SequenceIndexer,
- TimeAmbiguous,
- TimeNonexistent,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import (
- AbstractMethodError,
- InvalidComparison,
- PerformanceWarning,
-)
-from pandas.util._decorators import (
- Appender,
- Substitution,
- cache_readonly,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- is_all_strings,
- is_categorical_dtype,
- is_datetime64_any_dtype,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_datetime_or_timedelta_dtype,
- is_dtype_equal,
- is_float_dtype,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_period_dtype,
- is_string_dtype,
- is_timedelta64_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- DatetimeTZDtype,
- ExtensionDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCCategorical,
- ABCMultiIndex,
-)
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
-)
-
-from pandas.core import (
- algorithms,
- nanops,
- ops,
-)
-from pandas.core.algorithms import (
- checked_add_with_arr,
- isin,
- unique1d,
-)
-from pandas.core.array_algos import datetimelike_accumulations
-from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays._mixins import (
- NDArrayBackedExtensionArray,
- ravel_compat,
-)
-from pandas.core.arrays.arrow.array import ArrowExtensionArray
-from pandas.core.arrays.base import ExtensionArray
-from pandas.core.arrays.integer import IntegerArray
-import pandas.core.common as com
-from pandas.core.construction import (
- array as pd_array,
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
-from pandas.core.indexers import (
- check_array_indexer,
- check_setitem_lengths,
-)
-from pandas.core.ops.common import unpack_zerodim_and_defer
-from pandas.core.ops.invalid import (
- invalid_comparison,
- make_invalid_op,
-)
-
-from pandas.tseries import frequencies
-
-if TYPE_CHECKING:
- from pandas.core.arrays import (
- DatetimeArray,
- PeriodArray,
- TimedeltaArray,
- )
-
-DTScalarOrNaT = Union[DatetimeLikeScalar, NaTType]
-DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin")
-
-
-def _period_dispatch(meth: F) -> F:
- """
- For PeriodArray methods, dispatch to DatetimeArray and re-wrap the results
- in PeriodArray. We cannot use ._ndarray directly for the affected
- methods because the i8 data has different semantics on NaT values.
- """
-
- @wraps(meth)
- def new_meth(self, *args, **kwargs):
- if not is_period_dtype(self.dtype):
- return meth(self, *args, **kwargs)
-
- arr = self.view("M8[ns]")
- result = meth(arr, *args, **kwargs)
- if result is NaT:
- return NaT
- elif isinstance(result, Timestamp):
- return self._box_func(result._value)
-
- res_i8 = result.view("i8")
- return self._from_backing_data(res_i8)
-
- return cast(F, new_meth)
-
-
-class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray):
- """
- Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray
-
- Assumes that __new__/__init__ defines:
- _ndarray
-
- and that inheriting subclass implements:
- freq
- """
-
- # _infer_matches -> which infer_dtype strings are close enough to our own
- _infer_matches: tuple[str, ...]
- _is_recognized_dtype: Callable[[DtypeObj], bool]
- _recognized_scalars: tuple[type, ...]
- _ndarray: np.ndarray
- freq: BaseOffset | None
-
- @cache_readonly
- def _can_hold_na(self) -> bool:
- return True
-
- def __init__(
- self, data, dtype: Dtype | None = None, freq=None, copy: bool = False
- ) -> None:
- raise AbstractMethodError(self)
-
- @property
- def _scalar_type(self) -> type[DatetimeLikeScalar]:
- """
- The scalar associated with this datelike
-
- * PeriodArray : Period
- * DatetimeArray : Timestamp
- * TimedeltaArray : Timedelta
- """
- raise AbstractMethodError(self)
-
- def _scalar_from_string(self, value: str) -> DTScalarOrNaT:
- """
- Construct a scalar type from a string.
-
- Parameters
- ----------
- value : str
-
- Returns
- -------
- Period, Timestamp, or Timedelta, or NaT
- Whatever the type of ``self._scalar_type`` is.
-
- Notes
- -----
- This should call ``self._check_compatible_with`` before
- unboxing the result.
- """
- raise AbstractMethodError(self)
-
- def _unbox_scalar(
- self, value: DTScalarOrNaT
- ) -> np.int64 | np.datetime64 | np.timedelta64:
- """
- Unbox the integer value of a scalar `value`.
-
- Parameters
- ----------
- value : Period, Timestamp, Timedelta, or NaT
- Depending on subclass.
-
- Returns
- -------
- int
-
- Examples
- --------
- >>> self._unbox_scalar(Timedelta("10s")) # doctest: +SKIP
- 10000000000
- """
- raise AbstractMethodError(self)
-
- def _check_compatible_with(self, other: DTScalarOrNaT) -> None:
- """
- Verify that `self` and `other` are compatible.
-
- * DatetimeArray verifies that the timezones (if any) match
- * PeriodArray verifies that the freq matches
- * Timedelta has no verification
-
- In each case, NaT is considered compatible.
-
- Parameters
- ----------
- other
-
- Raises
- ------
- Exception
- """
- raise AbstractMethodError(self)
-
- # ------------------------------------------------------------------
-
- def _box_func(self, x):
- """
- box function to get object from internal representation
- """
- raise AbstractMethodError(self)
-
- def _box_values(self, values) -> np.ndarray:
- """
- apply box func to passed values
- """
- return lib.map_infer(values, self._box_func, convert=False)
-
- def __iter__(self) -> Iterator:
- if self.ndim > 1:
- return (self[n] for n in range(len(self)))
- else:
- return (self._box_func(v) for v in self.asi8)
-
- @property
- def asi8(self) -> npt.NDArray[np.int64]:
- """
- Integer representation of the values.
-
- Returns
- -------
- ndarray
- An ndarray with int64 dtype.
- """
- # do not cache or you'll create a memory leak
- return self._ndarray.view("i8")
-
- # ----------------------------------------------------------------
- # Rendering Methods
-
- def _format_native_types(
- self, *, na_rep: str | float = "NaT", date_format=None
- ) -> npt.NDArray[np.object_]:
- """
- Helper method for astype when converting to strings.
-
- Returns
- -------
- ndarray[str]
- """
- raise AbstractMethodError(self)
-
- def _formatter(self, boxed: bool = False):
- # TODO: Remove Datetime & DatetimeTZ formatters.
- return "'{}'".format
-
- # ----------------------------------------------------------------
- # Array-Like / EA-Interface Methods
-
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- # used for Timedelta/DatetimeArray, overwritten by PeriodArray
- if is_object_dtype(dtype):
- return np.array(list(self), dtype=object)
- return self._ndarray
-
- @overload
- def __getitem__(self, item: ScalarIndexer) -> DTScalarOrNaT:
- ...
-
- @overload
- def __getitem__(
- self: DatetimeLikeArrayT,
- item: SequenceIndexer | PositionalIndexerTuple,
- ) -> DatetimeLikeArrayT:
- ...
-
- def __getitem__(
- self: DatetimeLikeArrayT, key: PositionalIndexer2D
- ) -> DatetimeLikeArrayT | DTScalarOrNaT:
- """
- This getitem defers to the underlying array, which by definition can
- only handle list-likes, slices, and integer scalars.
- """
- # Use cast as we know we will get back a DatetimeLikeArray or DTScalar,
- # but skip evaluating the Union at runtime for performance
- # (see https://github.com/pandas-dev/pandas/pull/44624)
- result = cast(
- "Union[DatetimeLikeArrayT, DTScalarOrNaT]", super().__getitem__(key)
- )
- if lib.is_scalar(result):
- return result
- else:
- # At this point we know the result is an array.
- result = cast(DatetimeLikeArrayT, result)
- result._freq = self._get_getitem_freq(key)
- return result
-
- def _get_getitem_freq(self, key) -> BaseOffset | None:
- """
- Find the `freq` attribute to assign to the result of a __getitem__ lookup.
- """
- is_period = is_period_dtype(self.dtype)
- if is_period:
- freq = self.freq
- elif self.ndim != 1:
- freq = None
- else:
- key = check_array_indexer(self, key) # maybe ndarray[bool] -> slice
- freq = None
- if isinstance(key, slice):
- if self.freq is not None and key.step is not None:
- freq = key.step * self.freq
- else:
- freq = self.freq
- elif key is Ellipsis:
- # GH#21282 indexing with Ellipsis is similar to a full slice,
- # should preserve `freq` attribute
- freq = self.freq
- elif com.is_bool_indexer(key):
- new_key = lib.maybe_booleans_to_slice(key.view(np.uint8))
- if isinstance(new_key, slice):
- return self._get_getitem_freq(new_key)
- return freq
-
- # error: Argument 1 of "__setitem__" is incompatible with supertype
- # "ExtensionArray"; supertype defines the argument type as "Union[int,
- # ndarray]"
- def __setitem__(
- self,
- key: int | Sequence[int] | Sequence[bool] | slice,
- value: NaTType | Any | Sequence[Any],
- ) -> None:
- # I'm fudging the types a bit here. "Any" above really depends
- # on type(self). For PeriodArray, it's Period (or stuff coercible
- # to a period in from_sequence). For DatetimeArray, it's Timestamp...
- # I don't know if mypy can do that, possibly with Generics.
- # https://mypy.readthedocs.io/en/latest/generics.html
-
- no_op = check_setitem_lengths(key, value, self)
-
- # Calling super() before the no_op short-circuit means that we raise
- # on invalid 'value' even if this is a no-op, e.g. wrong-dtype empty array.
- super().__setitem__(key, value)
-
- if no_op:
- return
-
- self._maybe_clear_freq()
-
- def _maybe_clear_freq(self) -> None:
- # inplace operations like __setitem__ may invalidate the freq of
- # DatetimeArray and TimedeltaArray
- pass
-
- def astype(self, dtype, copy: bool = True):
- # Some notes on cases we don't have to handle here in the base class:
- # 1. PeriodArray.astype handles period -> period
- # 2. DatetimeArray.astype handles conversion between tz.
- # 3. DatetimeArray.astype handles datetime -> period
- dtype = pandas_dtype(dtype)
-
- if is_object_dtype(dtype):
- if self.dtype.kind == "M":
- self = cast("DatetimeArray", self)
- # *much* faster than self._box_values
- # for e.g. test_get_loc_tuple_monotonic_above_size_cutoff
- i8data = self.asi8
- converted = ints_to_pydatetime(
- i8data,
- tz=self.tz,
- box="timestamp",
- reso=self._creso,
- )
- return converted
-
- elif self.dtype.kind == "m":
- return ints_to_pytimedelta(self._ndarray, box=True)
-
- return self._box_values(self.asi8.ravel()).reshape(self.shape)
-
- elif isinstance(dtype, ExtensionDtype):
- return super().astype(dtype, copy=copy)
- elif is_string_dtype(dtype):
- return self._format_native_types()
- elif is_integer_dtype(dtype):
- # we deliberately ignore int32 vs. int64 here.
- # See https://github.com/pandas-dev/pandas/issues/24381 for more.
- values = self.asi8
- if dtype != np.int64:
- raise TypeError(
- f"Converting from {self.dtype} to {dtype} is not supported. "
- "Do obj.astype('int64').astype(dtype) instead"
- )
-
- if copy:
- values = values.copy()
- return values
- elif (
- is_datetime_or_timedelta_dtype(dtype)
- and not is_dtype_equal(self.dtype, dtype)
- ) or is_float_dtype(dtype):
- # disallow conversion between datetime/timedelta,
- # and conversions for any datetimelike to float
- msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
- raise TypeError(msg)
- else:
- return np.asarray(self, dtype=dtype)
-
- @overload
- def view(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT:
- ...
-
- @overload
- def view(self, dtype: Literal["M8[ns]"]) -> DatetimeArray:
- ...
-
- @overload
- def view(self, dtype: Literal["m8[ns]"]) -> TimedeltaArray:
- ...
-
- @overload
- def view(self, dtype: Dtype | None = ...) -> ArrayLike:
- ...
-
- # pylint: disable-next=useless-parent-delegation
- def view(self, dtype: Dtype | None = None) -> ArrayLike:
- # we need to explicitly call super() method as long as the `@overload`s
- # are present in this file.
- return super().view(dtype)
-
- # ------------------------------------------------------------------
- # ExtensionArray Interface
-
- @classmethod
- def _concat_same_type(
- cls: type[DatetimeLikeArrayT],
- to_concat: Sequence[DatetimeLikeArrayT],
- axis: AxisInt = 0,
- ) -> DatetimeLikeArrayT:
- new_obj = super()._concat_same_type(to_concat, axis)
-
- obj = to_concat[0]
- dtype = obj.dtype
-
- new_freq = None
- if is_period_dtype(dtype):
- new_freq = obj.freq
- elif axis == 0:
- # GH 3232: If the concat result is evenly spaced, we can retain the
- # original frequency
- to_concat = [x for x in to_concat if len(x)]
-
- if obj.freq is not None and all(x.freq == obj.freq for x in to_concat):
- pairs = zip(to_concat[:-1], to_concat[1:])
- if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs):
- new_freq = obj.freq
-
- new_obj._freq = new_freq
- return new_obj
-
- def copy(self: DatetimeLikeArrayT, order: str = "C") -> DatetimeLikeArrayT:
- # error: Unexpected keyword argument "order" for "copy"
- new_obj = super().copy(order=order) # type: ignore[call-arg]
- new_obj._freq = self.freq
- return new_obj
-
- # ------------------------------------------------------------------
- # Validation Methods
- # TODO: try to de-duplicate these, ensure identical behavior
-
- def _validate_comparison_value(self, other):
- if isinstance(other, str):
- try:
- # GH#18435 strings get a pass from tzawareness compat
- other = self._scalar_from_string(other)
- except (ValueError, IncompatibleFrequency):
- # failed to parse as Timestamp/Timedelta/Period
- raise InvalidComparison(other)
-
- if isinstance(other, self._recognized_scalars) or other is NaT:
- other = self._scalar_type(other)
- try:
- self._check_compatible_with(other)
- except (TypeError, IncompatibleFrequency) as err:
- # e.g. tzawareness mismatch
- raise InvalidComparison(other) from err
-
- elif not is_list_like(other):
- raise InvalidComparison(other)
-
- elif len(other) != len(self):
- raise ValueError("Lengths must match")
-
- else:
- try:
- other = self._validate_listlike(other, allow_object=True)
- self._check_compatible_with(other)
- except (TypeError, IncompatibleFrequency) as err:
- if is_object_dtype(getattr(other, "dtype", None)):
- # We will have to operate element-wise
- pass
- else:
- raise InvalidComparison(other) from err
-
- return other
-
- def _validate_scalar(
- self,
- value,
- *,
- allow_listlike: bool = False,
- unbox: bool = True,
- ):
- """
- Validate that the input value can be cast to our scalar_type.
-
- Parameters
- ----------
- value : object
- allow_listlike: bool, default False
- When raising an exception, whether the message should say
- listlike inputs are allowed.
- unbox : bool, default True
- Whether to unbox the result before returning. Note: unbox=False
- skips the setitem compatibility check.
-
- Returns
- -------
- self._scalar_type or NaT
- """
- if isinstance(value, self._scalar_type):
- pass
-
- elif isinstance(value, str):
- # NB: Careful about tzawareness
- try:
- value = self._scalar_from_string(value)
- except ValueError as err:
- msg = self._validation_error_message(value, allow_listlike)
- raise TypeError(msg) from err
-
- elif is_valid_na_for_dtype(value, self.dtype):
- # GH#18295
- value = NaT
-
- elif isna(value):
- # if we are dt64tz and value is dt64("NaT"), don't cast to NaT,
- # or else we'll fail to raise in _unbox_scalar
- msg = self._validation_error_message(value, allow_listlike)
- raise TypeError(msg)
-
- elif isinstance(value, self._recognized_scalars):
- value = self._scalar_type(value)
-
- else:
- msg = self._validation_error_message(value, allow_listlike)
- raise TypeError(msg)
-
- if not unbox:
- # NB: In general NDArrayBackedExtensionArray will unbox here;
- # this option exists to prevent a performance hit in
- # TimedeltaIndex.get_loc
- return value
- return self._unbox_scalar(value)
-
- def _validation_error_message(self, value, allow_listlike: bool = False) -> str:
- """
- Construct an exception message on validation error.
-
- Some methods allow only scalar inputs, while others allow either scalar
- or listlike.
-
- Parameters
- ----------
- allow_listlike: bool, default False
-
- Returns
- -------
- str
- """
- if allow_listlike:
- msg = (
- f"value should be a '{self._scalar_type.__name__}', 'NaT', "
- f"or array of those. Got '{type(value).__name__}' instead."
- )
- else:
- msg = (
- f"value should be a '{self._scalar_type.__name__}' or 'NaT'. "
- f"Got '{type(value).__name__}' instead."
- )
- return msg
-
- def _validate_listlike(self, value, allow_object: bool = False):
- if isinstance(value, type(self)):
- return value
-
- if isinstance(value, list) and len(value) == 0:
- # We treat empty list as our own dtype.
- return type(self)._from_sequence([], dtype=self.dtype)
-
- if hasattr(value, "dtype") and value.dtype == object:
- # `array` below won't do inference if value is an Index or Series.
- # so do so here. In the Index case, inferred_type may be cached.
- if lib.infer_dtype(value) in self._infer_matches:
- try:
- value = type(self)._from_sequence(value)
- except (ValueError, TypeError):
- if allow_object:
- return value
- msg = self._validation_error_message(value, True)
- raise TypeError(msg)
-
- # Do type inference if necessary up front (after unpacking PandasArray)
- # e.g. we passed PeriodIndex.values and got an ndarray of Periods
- value = extract_array(value, extract_numpy=True)
- value = pd_array(value)
- value = extract_array(value, extract_numpy=True)
-
- if is_all_strings(value):
- # We got a StringArray
- try:
- # TODO: Could use from_sequence_of_strings if implemented
- # Note: passing dtype is necessary for PeriodArray tests
- value = type(self)._from_sequence(value, dtype=self.dtype)
- except ValueError:
- pass
-
- if is_categorical_dtype(value.dtype):
- # e.g. we have a Categorical holding self.dtype
- if is_dtype_equal(value.categories.dtype, self.dtype):
- # TODO: do we need equal dtype or just comparable?
- value = value._internal_get_values()
- value = extract_array(value, extract_numpy=True)
-
- if allow_object and is_object_dtype(value.dtype):
- pass
-
- elif not type(self)._is_recognized_dtype(value.dtype):
- msg = self._validation_error_message(value, True)
- raise TypeError(msg)
-
- return value
-
- def _validate_setitem_value(self, value):
- if is_list_like(value):
- value = self._validate_listlike(value)
- else:
- return self._validate_scalar(value, allow_listlike=True)
-
- return self._unbox(value)
-
- @final
- def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarray:
- """
- Unbox either a scalar with _unbox_scalar or an instance of our own type.
- """
- if lib.is_scalar(other):
- other = self._unbox_scalar(other)
- else:
- # same type as self
- self._check_compatible_with(other)
- other = other._ndarray
- return other
-
- # ------------------------------------------------------------------
- # Additional array methods
- # These are not part of the EA API, but we implement them because
- # pandas assumes they're there.
-
- @ravel_compat
- def map(self, mapper):
- # TODO(GH-23179): Add ExtensionArray.map
- # Need to figure out if we want ExtensionArray.map first.
- # If so, then we can refactor IndexOpsMixin._map_values to
- # a standalone function and call from here..
- # Else, just rewrite _map_infer_values to do the right thing.
- from pandas import Index
-
- return Index(self).map(mapper).array
-
- def isin(self, values) -> npt.NDArray[np.bool_]:
- """
- Compute a boolean array indicating whether each value is found in the
- passed set of values.
-
- Parameters
- ----------
- values : set or sequence of values
-
- Returns
- -------
- ndarray[bool]
- """
- if not hasattr(values, "dtype"):
- values = np.asarray(values)
-
- if values.dtype.kind in ["f", "i", "u", "c"]:
- # TODO: de-duplicate with equals, validate_comparison_value
- return np.zeros(self.shape, dtype=bool)
-
- if not isinstance(values, type(self)):
- inferable = [
- "timedelta",
- "timedelta64",
- "datetime",
- "datetime64",
- "date",
- "period",
- ]
- if values.dtype == object:
- inferred = lib.infer_dtype(values, skipna=False)
- if inferred not in inferable:
- if inferred == "string":
- pass
-
- elif "mixed" in inferred:
- return isin(self.astype(object), values)
- else:
- return np.zeros(self.shape, dtype=bool)
-
- try:
- values = type(self)._from_sequence(values)
- except ValueError:
- return isin(self.astype(object), values)
-
- if self.dtype.kind in ["m", "M"]:
- self = cast("DatetimeArray | TimedeltaArray", self)
- values = values.as_unit(self.unit)
-
- try:
- self._check_compatible_with(values)
- except (TypeError, ValueError):
- # Includes tzawareness mismatch and IncompatibleFrequencyError
- return np.zeros(self.shape, dtype=bool)
-
- return isin(self.asi8, values.asi8)
-
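A brief usage sketch with tz-naive values; note that the numeric kind check above short-circuits to all-False without attempting any conversion.

    >>> import numpy as np
    >>> import pandas as pd
    >>> dti = pd.date_range("2020-01-01", periods=3)
    >>> dti.isin([pd.Timestamp("2020-01-02")])
    array([False,  True, False])
    >>> dti.isin(np.array([1, 2, 3]))   # integer/float values never match
    array([False, False, False])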
- # ------------------------------------------------------------------
- # Null Handling
-
- def isna(self) -> npt.NDArray[np.bool_]:
- return self._isnan
-
- @property # NB: override with cache_readonly in immutable subclasses
- def _isnan(self) -> npt.NDArray[np.bool_]:
- """
- Return a boolean mask indicating which values are NaT.
- """
- return self.asi8 == iNaT
-
- @property # NB: override with cache_readonly in immutable subclasses
- def _hasna(self) -> bool:
- """
- Return whether any values are NaT; enables various perf speedups.
- """
- return bool(self._isnan.any())
-
- def _maybe_mask_results(
- self, result: np.ndarray, fill_value=iNaT, convert=None
- ) -> np.ndarray:
- """
- Parameters
- ----------
- result : np.ndarray
- fill_value : object, default iNaT
- convert : str, dtype or None
-
- Returns
- -------
- result : ndarray with values replaced by the fill_value
-
- Mask the result if needed and convert to the provided dtype if it is
- not None.
-
- This is an internal routine.
- """
- if self._hasna:
- if convert:
- result = result.astype(convert)
- if fill_value is None:
- fill_value = np.nan
- np.putmask(result, self._isnan, fill_value)
- return result
-
- # ------------------------------------------------------------------
- # Frequency Properties/Methods
-
- @property
- def freqstr(self) -> str | None:
- """
- Return the frequency object as a string if it is set, otherwise None.
- """
- if self.freq is None:
- return None
- return self.freq.freqstr
-
- @property # NB: override with cache_readonly in immutable subclasses
- def inferred_freq(self) -> str | None:
- """
- Tries to return a string representing a frequency generated by infer_freq.
-
- Returns None if it can't autodetect the frequency.
- """
- if self.ndim != 1:
- return None
- try:
- return frequencies.infer_freq(self)
- except ValueError:
- return None
-
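For example, via DatetimeIndex, which exposes both of these properties:

    >>> import pandas as pd
    >>> pd.date_range("2020-01-01", periods=3, freq="D").freqstr
    'D'
    >>> dti = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])
    >>> dti.freqstr is None          # no freq was set explicitly
    True
    >>> dti.inferred_freq            # but one can be inferred from the values
    'D'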
- @property # NB: override with cache_readonly in immutable subclasses
- def _resolution_obj(self) -> Resolution | None:
- freqstr = self.freqstr
- if freqstr is None:
- return None
- try:
- return Resolution.get_reso_from_freqstr(freqstr)
- except KeyError:
- return None
-
- @property # NB: override with cache_readonly in immutable subclasses
- def resolution(self) -> str:
- """
- Returns day, hour, minute, second, millisecond or microsecond
- """
- # error: Item "None" of "Optional[Any]" has no attribute "attrname"
- return self._resolution_obj.attrname # type: ignore[union-attr]
-
- # monotonicity/uniqueness properties are called via frequencies.infer_freq,
- # see GH#23789
-
- @property
- def _is_monotonic_increasing(self) -> bool:
- return algos.is_monotonic(self.asi8, timelike=True)[0]
-
- @property
- def _is_monotonic_decreasing(self) -> bool:
- return algos.is_monotonic(self.asi8, timelike=True)[1]
-
- @property
- def _is_unique(self) -> bool:
- return len(unique1d(self.asi8.ravel("K"))) == self.size
-
- # ------------------------------------------------------------------
- # Arithmetic Methods
-
- def _cmp_method(self, other, op):
- if self.ndim > 1 and getattr(other, "shape", None) == self.shape:
- # TODO: handle 2D-like listlikes
- return op(self.ravel(), other.ravel()).reshape(self.shape)
-
- try:
- other = self._validate_comparison_value(other)
- except InvalidComparison:
- return invalid_comparison(self, other, op)
-
- dtype = getattr(other, "dtype", None)
- if is_object_dtype(dtype):
- # We have to use comp_method_OBJECT_ARRAY instead of numpy
- # comparison otherwise it would fail to raise when
- # comparing tz-aware and tz-naive
- with np.errstate(all="ignore"):
- result = ops.comp_method_OBJECT_ARRAY(
- op, np.asarray(self.astype(object)), other
- )
- return result
-
- if other is NaT:
- if op is operator.ne:
- result = np.ones(self.shape, dtype=bool)
- else:
- result = np.zeros(self.shape, dtype=bool)
- return result
-
- if not is_period_dtype(self.dtype):
- self = cast(TimelikeOps, self)
- if self._creso != other._creso:
- if not isinstance(other, type(self)):
- # i.e. Timedelta/Timestamp, cast to ndarray and let
- # compare_mismatched_resolutions handle broadcasting
- try:
- # GH#52080 see if we can losslessly cast to shared unit
- other = other.as_unit(self.unit, round_ok=False)
- except ValueError:
- other_arr = np.array(other.asm8)
- return compare_mismatched_resolutions(
- self._ndarray, other_arr, op
- )
- else:
- other_arr = other._ndarray
- return compare_mismatched_resolutions(self._ndarray, other_arr, op)
-
- other_vals = self._unbox(other)
- # GH#37462 comparison on i8 values is almost 2x faster than M8/m8
- result = op(self._ndarray.view("i8"), other_vals.view("i8"))
-
- o_mask = isna(other)
- mask = self._isnan | o_mask
- if mask.any():
- nat_result = op is operator.ne
- np.putmask(result, mask, nat_result)
-
- return result
-
- # pow is invalid for all three subclasses; TimedeltaArray will override
- # the multiplication and division ops
- __pow__ = make_invalid_op("__pow__")
- __rpow__ = make_invalid_op("__rpow__")
- __mul__ = make_invalid_op("__mul__")
- __rmul__ = make_invalid_op("__rmul__")
- __truediv__ = make_invalid_op("__truediv__")
- __rtruediv__ = make_invalid_op("__rtruediv__")
- __floordiv__ = make_invalid_op("__floordiv__")
- __rfloordiv__ = make_invalid_op("__rfloordiv__")
- __mod__ = make_invalid_op("__mod__")
- __rmod__ = make_invalid_op("__rmod__")
- __divmod__ = make_invalid_op("__divmod__")
- __rdivmod__ = make_invalid_op("__rdivmod__")
-
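These stubs surface as TypeError through the public API; a sketch with a DatetimeIndex, since TimedeltaArray overrides the multiplication/division slots as noted above:

    >>> import pandas as pd
    >>> dti = pd.date_range("2020-01-01", periods=2)
    >>> try:
    ...     dti * 2                  # multiplication is not defined for datetimes
    ... except TypeError as err:
    ...     print(type(err).__name__)
    TypeError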
- @final
- def _get_i8_values_and_mask(
- self, other
- ) -> tuple[int | npt.NDArray[np.int64], None | npt.NDArray[np.bool_]]:
- """
- Get the int64 values and b_mask to pass to checked_add_with_arr.
- """
- if isinstance(other, Period):
- i8values = other.ordinal
- mask = None
- elif isinstance(other, (Timestamp, Timedelta)):
- i8values = other._value
- mask = None
- else:
- # PeriodArray, DatetimeArray, TimedeltaArray
- mask = other._isnan
- i8values = other.asi8
- return i8values, mask
-
- @final
- def _get_arithmetic_result_freq(self, other) -> BaseOffset | None:
- """
- Check if we can preserve self.freq in addition or subtraction.
- """
- # Adding or subtracting a Timedelta/Timestamp scalar is freq-preserving
- # whenever self.freq is a Tick
- if is_period_dtype(self.dtype):
- return self.freq
- elif not lib.is_scalar(other):
- return None
- elif isinstance(self.freq, Tick):
- # i.e. adding/subtracting a scalar to a Tick-spaced array preserves freq
- return self.freq
- return None
-
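A sketch of the two scalar branches: a Tick-spaced range keeps its freq when a scalar Timedelta is added, while an array-like other drops it.

    >>> import pandas as pd
    >>> dti = pd.date_range("2020-01-01", periods=3, freq="H")
    >>> (dti + pd.Timedelta("30min")).freq
    <Hour>
    >>> (dti + pd.to_timedelta(["1H", "2H", "3H"])).freq is None
    True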
- @final
- def _add_datetimelike_scalar(self, other) -> DatetimeArray:
- if not is_timedelta64_dtype(self.dtype):
- raise TypeError(
- f"cannot add {type(self).__name__} and {type(other).__name__}"
- )
-
- self = cast("TimedeltaArray", self)
-
- from pandas.core.arrays import DatetimeArray
- from pandas.core.arrays.datetimes import tz_to_dtype
-
- assert other is not NaT
- if isna(other):
- # i.e. np.datetime64("NaT")
- # In this case we specifically interpret NaT as a datetime, not
- # the timedelta interpretation we would get by returning self + NaT
- result = self._ndarray + NaT.to_datetime64().astype(f"M8[{self.unit}]")
- # Preserve our resolution
- return DatetimeArray._simple_new(result, dtype=result.dtype)
-
- other = Timestamp(other)
- self, other = self._ensure_matching_resos(other)
- self = cast("TimedeltaArray", self)
-
- other_i8, o_mask = self._get_i8_values_and_mask(other)
- result = checked_add_with_arr(
- self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask
- )
- res_values = result.view(f"M8[{self.unit}]")
-
- dtype = tz_to_dtype(tz=other.tz, unit=self.unit)
- new_freq = self._get_arithmetic_result_freq(other)
- return DatetimeArray._simple_new(res_values, dtype=dtype, freq=new_freq)
-
- @final
- def _add_datetime_arraylike(self, other: DatetimeArray) -> DatetimeArray:
- if not is_timedelta64_dtype(self.dtype):
- raise TypeError(
- f"cannot add {type(self).__name__} and {type(other).__name__}"
- )
-
- # defer to DatetimeArray.__add__
- return other + self
-
- @final
- def _sub_datetimelike_scalar(self, other: datetime | np.datetime64):
- if self.dtype.kind != "M":
- raise TypeError(f"cannot subtract a datelike from a {type(self).__name__}")
-
- self = cast("DatetimeArray", self)
- # subtract a datetime from myself, yielding a ndarray[timedelta64[ns]]
-
- if isna(other):
- # i.e. np.datetime64("NaT")
- return self - NaT
-
- ts = Timestamp(other)
-
- self, ts = self._ensure_matching_resos(ts)
- return self._sub_datetimelike(ts)
-
- @final
- def _sub_datetime_arraylike(self, other: DatetimeArray):
- if self.dtype.kind != "M":
- raise TypeError(f"cannot subtract a datelike from a {type(self).__name__}")
-
- if len(self) != len(other):
- raise ValueError("cannot add indices of unequal length")
-
- self = cast("DatetimeArray", self)
-
- self, other = self._ensure_matching_resos(other)
- return self._sub_datetimelike(other)
-
- @final
- def _sub_datetimelike(self, other: Timestamp | DatetimeArray) -> TimedeltaArray:
- self = cast("DatetimeArray", self)
-
- from pandas.core.arrays import TimedeltaArray
-
- try:
- self._assert_tzawareness_compat(other)
- except TypeError as err:
- new_message = str(err).replace("compare", "subtract")
- raise type(err)(new_message) from err
-
- other_i8, o_mask = self._get_i8_values_and_mask(other)
- res_values = checked_add_with_arr(
- self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask
- )
- res_m8 = res_values.view(f"timedelta64[{self.unit}]")
-
- new_freq = self._get_arithmetic_result_freq(other)
- return TimedeltaArray._simple_new(res_m8, dtype=res_m8.dtype, freq=new_freq)
-
- @final
- def _add_period(self, other: Period) -> PeriodArray:
- if not is_timedelta64_dtype(self.dtype):
- raise TypeError(f"cannot add Period to a {type(self).__name__}")
-
- # We will wrap in a PeriodArray and defer to the reversed operation
- from pandas.core.arrays.period import PeriodArray
-
- i8vals = np.broadcast_to(other.ordinal, self.shape)
- parr = PeriodArray(i8vals, freq=other.freq)
- return parr + self
-
- def _add_offset(self, offset):
- raise AbstractMethodError(self)
-
- def _add_timedeltalike_scalar(self, other):
- """
- Add a timedelta-like scalar to self.
-
- Returns
- -------
- Same type as self
- """
- if isna(other):
- # i.e np.timedelta64("NaT")
- new_values = np.empty(self.shape, dtype="i8").view(self._ndarray.dtype)
- new_values.fill(iNaT)
- return type(self)._simple_new(new_values, dtype=self.dtype)
-
- # PeriodArray overrides, so we only get here with DTA/TDA
- self = cast("DatetimeArray | TimedeltaArray", self)
- other = Timedelta(other)
- self, other = self._ensure_matching_resos(other)
- return self._add_timedeltalike(other)
-
- def _add_timedelta_arraylike(self, other: TimedeltaArray):
- """
- Add a timedelta-like array (e.g. TimedeltaIndex) to self.
-
- Returns
- -------
- Same type as self
- """
- # overridden by PeriodArray
-
- if len(self) != len(other):
- raise ValueError("cannot add indices of unequal length")
-
- self = cast("DatetimeArray | TimedeltaArray", self)
-
- self, other = self._ensure_matching_resos(other)
- return self._add_timedeltalike(other)
-
- @final
- def _add_timedeltalike(self, other: Timedelta | TimedeltaArray):
- self = cast("DatetimeArray | TimedeltaArray", self)
-
- other_i8, o_mask = self._get_i8_values_and_mask(other)
- new_values = checked_add_with_arr(
- self.asi8, other_i8, arr_mask=self._isnan, b_mask=o_mask
- )
- res_values = new_values.view(self._ndarray.dtype)
-
- new_freq = self._get_arithmetic_result_freq(other)
-
- return type(self)._simple_new(res_values, dtype=self.dtype, freq=new_freq)
-
- @final
- def _add_nat(self):
- """
- Add pd.NaT to self
- """
- if is_period_dtype(self.dtype):
- raise TypeError(
- f"Cannot add {type(self).__name__} and {type(NaT).__name__}"
- )
- self = cast("TimedeltaArray | DatetimeArray", self)
-
- # GH#19124 pd.NaT is treated like a timedelta for both timedelta
- # and datetime dtypes
- result = np.empty(self.shape, dtype=np.int64)
- result.fill(iNaT)
- result = result.view(self._ndarray.dtype) # preserve reso
- return type(self)._simple_new(result, dtype=self.dtype, freq=None)
-
- @final
- def _sub_nat(self):
- """
- Subtract pd.NaT from self
- """
- # GH#19124 Timedelta - datetime is not in general well-defined.
- # We make an exception for pd.NaT, which in this case quacks
- # like a timedelta.
- # For datetime64 dtypes by convention we treat NaT as a datetime, so
- # this subtraction returns a timedelta64 dtype.
- # For period dtype, timedelta64 is a close-enough return dtype.
- result = np.empty(self.shape, dtype=np.int64)
- result.fill(iNaT)
- if self.dtype.kind in ["m", "M"]:
- # We can retain unit in dtype
- self = cast("DatetimeArray| TimedeltaArray", self)
- return result.view(f"timedelta64[{self.unit}]")
- else:
- return result.view("timedelta64[ns]")
-
- @final
- def _sub_periodlike(self, other: Period | PeriodArray) -> npt.NDArray[np.object_]:
- # If the operation is well-defined, we return an object-dtype ndarray
- # of DateOffsets. Null entries are filled with pd.NaT
- if not is_period_dtype(self.dtype):
- raise TypeError(
- f"cannot subtract {type(other).__name__} from {type(self).__name__}"
- )
-
- self = cast("PeriodArray", self)
- self._check_compatible_with(other)
-
- other_i8, o_mask = self._get_i8_values_and_mask(other)
- new_i8_data = checked_add_with_arr(
- self.asi8, -other_i8, arr_mask=self._isnan, b_mask=o_mask
- )
- new_data = np.array([self.freq.base * x for x in new_i8_data])
-
- if o_mask is None:
- # i.e. Period scalar
- mask = self._isnan
- else:
- # i.e. PeriodArray
- mask = self._isnan | o_mask
- new_data[mask] = NaT
- return new_data
-
- @final
- def _addsub_object_array(self, other: npt.NDArray[np.object_], op):
- """
- Add or subtract array-like of DateOffset objects
-
- Parameters
- ----------
- other : np.ndarray[object]
- op : {operator.add, operator.sub}
-
- Returns
- -------
- np.ndarray[object]
- Except in the fastpath case with length 1, where we operate on the
- contained scalar.
- """
- assert op in [operator.add, operator.sub]
- if len(other) == 1 and self.ndim == 1:
- # Note: without this special case, we could annotate return type
- # as ndarray[object]
- # If both 1D then broadcasting is unambiguous
- return op(self, other[0])
-
- warnings.warn(
- "Adding/subtracting object-dtype array to "
- f"{type(self).__name__} not vectorized.",
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
-
- # Caller is responsible for broadcasting if necessary
- assert self.shape == other.shape, (self.shape, other.shape)
-
- res_values = op(self.astype("O"), np.asarray(other))
- return res_values
-
- def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
- if name not in {"cummin", "cummax"}:
- raise TypeError(f"Accumulation {name} not supported for {type(self)}")
-
- op = getattr(datetimelike_accumulations, name)
- result = op(self.copy(), skipna=skipna, **kwargs)
-
- return type(self)._simple_new(
- result, freq=None, dtype=self.dtype # type: ignore[call-arg]
- )
-
- @unpack_zerodim_and_defer("__add__")
- def __add__(self, other):
- other_dtype = getattr(other, "dtype", None)
- other = ensure_wrapped_if_datetimelike(other)
-
- # scalar others
- if other is NaT:
- result = self._add_nat()
- elif isinstance(other, (Tick, timedelta, np.timedelta64)):
- result = self._add_timedeltalike_scalar(other)
- elif isinstance(other, BaseOffset):
- # specifically _not_ a Tick
- result = self._add_offset(other)
- elif isinstance(other, (datetime, np.datetime64)):
- result = self._add_datetimelike_scalar(other)
- elif isinstance(other, Period) and is_timedelta64_dtype(self.dtype):
- result = self._add_period(other)
- elif lib.is_integer(other):
- # This check must come after the check for np.timedelta64
- # as is_integer returns True for these
- if not is_period_dtype(self.dtype):
- raise integer_op_not_supported(self)
- obj = cast("PeriodArray", self)
- result = obj._addsub_int_array_or_scalar(other * obj.freq.n, operator.add)
-
- # array-like others
- elif is_timedelta64_dtype(other_dtype):
- # TimedeltaIndex, ndarray[timedelta64]
- result = self._add_timedelta_arraylike(other)
- elif is_object_dtype(other_dtype):
- # e.g. Array/Index of DateOffset objects
- result = self._addsub_object_array(other, operator.add)
- elif is_datetime64_dtype(other_dtype) or is_datetime64tz_dtype(other_dtype):
- # DatetimeIndex, ndarray[datetime64]
- return self._add_datetime_arraylike(other)
- elif is_integer_dtype(other_dtype):
- if not is_period_dtype(self.dtype):
- raise integer_op_not_supported(self)
- obj = cast("PeriodArray", self)
- result = obj._addsub_int_array_or_scalar(other * obj.freq.n, operator.add)
- else:
- # Includes Categorical, other ExtensionArrays
- # For PeriodDtype, if self is a TimedeltaArray and other is a
- # PeriodArray with a timedelta-like (i.e. Tick) freq, this
- # operation is valid. Defer to the PeriodArray implementation.
- # In remaining cases, this will end up raising TypeError.
- return NotImplemented
-
- if isinstance(result, np.ndarray) and is_timedelta64_dtype(result.dtype):
- from pandas.core.arrays import TimedeltaArray
-
- return TimedeltaArray(result)
- return result
-
- def __radd__(self, other):
- # alias for __add__
- return self.__add__(other)
-
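A quick sketch of the dispatch above, assuming tz-naive data: datetime plus timedelta stays datetime-like, while datetime minus datetime becomes timedelta-like.

    >>> import pandas as pd
    >>> dti = pd.date_range("2020-01-01", periods=2, freq="D")
    >>> dti + pd.Timedelta("12h")
    DatetimeIndex(['2020-01-01 12:00:00', '2020-01-02 12:00:00'], dtype='datetime64[ns]', freq='D')
    >>> dti - dti
    TimedeltaIndex(['0 days', '0 days'], dtype='timedelta64[ns]', freq=None)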
- @unpack_zerodim_and_defer("__sub__")
- def __sub__(self, other):
- other_dtype = getattr(other, "dtype", None)
- other = ensure_wrapped_if_datetimelike(other)
-
- # scalar others
- if other is NaT:
- result = self._sub_nat()
- elif isinstance(other, (Tick, timedelta, np.timedelta64)):
- result = self._add_timedeltalike_scalar(-other)
- elif isinstance(other, BaseOffset):
- # specifically _not_ a Tick
- result = self._add_offset(-other)
- elif isinstance(other, (datetime, np.datetime64)):
- result = self._sub_datetimelike_scalar(other)
- elif lib.is_integer(other):
- # This check must come after the check for np.timedelta64
- # as is_integer returns True for these
- if not is_period_dtype(self.dtype):
- raise integer_op_not_supported(self)
- obj = cast("PeriodArray", self)
- result = obj._addsub_int_array_or_scalar(other * obj.freq.n, operator.sub)
-
- elif isinstance(other, Period):
- result = self._sub_periodlike(other)
-
- # array-like others
- elif is_timedelta64_dtype(other_dtype):
- # TimedeltaIndex, ndarray[timedelta64]
- result = self._add_timedelta_arraylike(-other)
- elif is_object_dtype(other_dtype):
- # e.g. Array/Index of DateOffset objects
- result = self._addsub_object_array(other, operator.sub)
- elif is_datetime64_dtype(other_dtype) or is_datetime64tz_dtype(other_dtype):
- # DatetimeIndex, ndarray[datetime64]
- result = self._sub_datetime_arraylike(other)
- elif is_period_dtype(other_dtype):
- # PeriodIndex
- result = self._sub_periodlike(other)
- elif is_integer_dtype(other_dtype):
- if not is_period_dtype(self.dtype):
- raise integer_op_not_supported(self)
- obj = cast("PeriodArray", self)
- result = obj._addsub_int_array_or_scalar(other * obj.freq.n, operator.sub)
- else:
- # Includes ExtensionArrays, float_dtype
- return NotImplemented
-
- if isinstance(result, np.ndarray) and is_timedelta64_dtype(result.dtype):
- from pandas.core.arrays import TimedeltaArray
-
- return TimedeltaArray(result)
- return result
-
- def __rsub__(self, other):
- other_dtype = getattr(other, "dtype", None)
-
- if is_datetime64_any_dtype(other_dtype) and is_timedelta64_dtype(self.dtype):
- # ndarray[datetime64] cannot be subtracted from self, so
- # we need to wrap in DatetimeArray/Index and flip the operation
- if lib.is_scalar(other):
- # i.e. np.datetime64 object
- return Timestamp(other) - self
- if not isinstance(other, DatetimeLikeArrayMixin):
- # Avoid down-casting DatetimeIndex
- from pandas.core.arrays import DatetimeArray
-
- other = DatetimeArray(other)
- return other - self
- elif (
- is_datetime64_any_dtype(self.dtype)
- and hasattr(other, "dtype")
- and not is_datetime64_any_dtype(other.dtype)
- ):
- # GH#19959 datetime - datetime is well-defined as timedelta,
- # but any other type - datetime is not well-defined.
- raise TypeError(
- f"cannot subtract {type(self).__name__} from {type(other).__name__}"
- )
- elif is_period_dtype(self.dtype) and is_timedelta64_dtype(other_dtype):
- # TODO: Can we simplify/generalize these cases at all?
- raise TypeError(f"cannot subtract {type(self).__name__} from {other.dtype}")
- elif is_timedelta64_dtype(self.dtype):
- self = cast("TimedeltaArray", self)
- return (-self) + other
-
- # We get here with e.g. datetime objects
- return -(self - other)
-
- def __iadd__(self: DatetimeLikeArrayT, other) -> DatetimeLikeArrayT:
- result = self + other
- self[:] = result[:]
-
- if not is_period_dtype(self.dtype):
- # restore freq, which is invalidated by setitem
- self._freq = result.freq
- return self
-
- def __isub__(self: DatetimeLikeArrayT, other) -> DatetimeLikeArrayT:
- result = self - other
- self[:] = result[:]
-
- if not is_period_dtype(self.dtype):
- # restore freq, which is invalidated by setitem
- self._freq = result.freq
- return self
-
- # --------------------------------------------------------------
- # Reductions
-
- @_period_dispatch
- def _quantile(
- self: DatetimeLikeArrayT,
- qs: npt.NDArray[np.float64],
- interpolation: str,
- ) -> DatetimeLikeArrayT:
- return super()._quantile(qs=qs, interpolation=interpolation)
-
- @_period_dispatch
- def min(self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs):
- """
- Return the minimum value of the Array or minimum along
- an axis.
-
- See Also
- --------
- numpy.ndarray.min
- Index.min : Return the minimum value in an Index.
- Series.min : Return the minimum value in a Series.
- """
- nv.validate_min((), kwargs)
- nv.validate_minmax_axis(axis, self.ndim)
-
- result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- @_period_dispatch
- def max(self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs):
- """
- Return the maximum value of the Array or maximum along
- an axis.
-
- See Also
- --------
- numpy.ndarray.max
- Index.max : Return the maximum value in an Index.
- Series.max : Return the maximum value in a Series.
- """
- nv.validate_max((), kwargs)
- nv.validate_minmax_axis(axis, self.ndim)
-
- result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0):
- """
- Return the mean value of the Array.
-
- Parameters
- ----------
- skipna : bool, default True
- Whether to ignore any NaT elements.
- axis : int, optional, default 0
-
- Returns
- -------
- scalar
- Timestamp or Timedelta.
-
- See Also
- --------
- numpy.ndarray.mean : Returns the average of array elements along a given axis.
- Series.mean : Return the mean value in a Series.
-
- Notes
- -----
- mean is only defined for Datetime and Timedelta dtypes, not for Period.
- """
- if is_period_dtype(self.dtype):
- # See discussion in GH#24757
- raise TypeError(
- f"mean is not implemented for {type(self).__name__} since the "
- "meaning is ambiguous. An alternative is "
- "obj.to_timestamp(how='start').mean()"
- )
-
- result = nanops.nanmean(
- self._ndarray, axis=axis, skipna=skipna, mask=self.isna()
- )
- return self._wrap_reduction_result(axis, result)
-
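For instance: datetime and timedelta reductions return a Timestamp/Timedelta scalar, while the Period branch above raises.

    >>> import pandas as pd
    >>> pd.DatetimeIndex(["2020-01-01", "2020-01-03"]).mean()
    Timestamp('2020-01-02 00:00:00')
    >>> pd.to_timedelta(["1 days", "3 days"]).mean()
    Timedelta('2 days 00:00:00')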
- @_period_dispatch
- def median(self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs):
- nv.validate_median((), kwargs)
-
- if axis is not None and abs(axis) >= self.ndim:
- raise ValueError("abs(axis) must be less than ndim")
-
- result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- def _mode(self, dropna: bool = True):
- mask = None
- if dropna:
- mask = self.isna()
-
- i8modes = algorithms.mode(self.view("i8"), mask=mask)
- npmodes = i8modes.view(self._ndarray.dtype)
- npmodes = cast(np.ndarray, npmodes)
- return self._from_backing_data(npmodes)
-
-
-class DatelikeOps(DatetimeLikeArrayMixin):
- """
- Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex.
- """
-
- @Substitution(
- URL="https://docs.python.org/3/library/datetime.html"
- "#strftime-and-strptime-behavior"
- )
- def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
- """
- Convert to Index using specified date_format.
-
- Return an Index of formatted strings specified by date_format, which
- supports the same string format as the python standard library. Details
- of the string format can be found in `python string format
- doc <%(URL)s>`__.
-
- Formats supported by the C `strftime` API but not by the python string format
- doc (such as `"%%R"`, `"%%r"`) are not officially supported and should
- preferably be replaced with their supported equivalents (such as `"%%H:%%M"`,
- `"%%I:%%M:%%S %%p"`).
-
- Note that `PeriodIndex` supports additional directives, detailed in
- `Period.strftime`.
-
- Parameters
- ----------
- date_format : str
- Date format string (e.g. "%%Y-%%m-%%d").
-
- Returns
- -------
- ndarray[object]
- NumPy ndarray of formatted strings.
-
- See Also
- --------
- to_datetime : Convert the given argument to datetime.
- DatetimeIndex.normalize : Return DatetimeIndex with times to midnight.
- DatetimeIndex.round : Round the DatetimeIndex to the specified freq.
- DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq.
- Timestamp.strftime : Format a single Timestamp.
- Period.strftime : Format a single Period.
-
- Examples
- --------
- >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"),
- ... periods=3, freq='s')
- >>> rng.strftime('%%B %%d, %%Y, %%r')
- Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM',
- 'March 10, 2018, 09:00:02 AM'],
- dtype='object')
- """
- result = self._format_native_types(date_format=date_format, na_rep=np.nan)
- return result.astype(object, copy=False)
-
-
-_round_doc = """
- Perform {op} operation on the data to the specified `freq`.
-
- Parameters
- ----------
- freq : str or Offset
- The frequency level to {op} the index to. Must be a fixed
- frequency like 'S' (second) not 'ME' (month end). See
- :ref:`frequency aliases <timeseries.offset_aliases>` for
- a list of possible `freq` values.
- ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
- Only relevant for DatetimeIndex:
-
- - 'infer' will attempt to infer fall dst-transition hours based on
- order
- - bool-ndarray where True signifies a DST time, False designates
- a non-DST time (note that this flag is only applicable for
- ambiguous times)
- - 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
- times.
-
- nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- - 'shift_forward' will shift the nonexistent time forward to the
- closest existing time
- - 'shift_backward' will shift the nonexistent time backward to the
- closest existing time
- - 'NaT' will return NaT where there are nonexistent times
- - timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Returns
- -------
- DatetimeIndex, TimedeltaIndex, or Series
- Index of the same type for a DatetimeIndex or TimedeltaIndex,
- or a Series with the same index for a Series.
-
- Raises
- ------
- ValueError if the `freq` cannot be converted.
-
- Notes
- -----
- If the timestamps have a timezone, {op}ing will take place relative to the
- local ("wall") time and re-localized to the same timezone. When {op}ing
- near daylight savings time, use ``nonexistent`` and ``ambiguous`` to
- control the re-localization behavior.
-
- Examples
- --------
- **DatetimeIndex**
-
- >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min')
- >>> rng
- DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00',
- '2018-01-01 12:01:00'],
- dtype='datetime64[ns]', freq='T')
- """
-
-_round_example = """>>> rng.round('H')
- DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
- '2018-01-01 12:00:00'],
- dtype='datetime64[ns]', freq=None)
-
- **Series**
-
- >>> pd.Series(rng).dt.round("H")
- 0 2018-01-01 12:00:00
- 1 2018-01-01 12:00:00
- 2 2018-01-01 12:00:00
- dtype: datetime64[ns]
-
- When rounding near a daylight savings time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam")
-
- >>> rng_tz.floor("2H", ambiguous=False)
- DatetimeIndex(['2021-10-31 02:00:00+01:00'],
- dtype='datetime64[ns, Europe/Amsterdam]', freq=None)
-
- >>> rng_tz.floor("2H", ambiguous=True)
- DatetimeIndex(['2021-10-31 02:00:00+02:00'],
- dtype='datetime64[ns, Europe/Amsterdam]', freq=None)
- """
-
-_floor_example = """>>> rng.floor('H')
- DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00',
- '2018-01-01 12:00:00'],
- dtype='datetime64[ns]', freq=None)
-
- **Series**
-
- >>> pd.Series(rng).dt.floor("H")
- 0 2018-01-01 11:00:00
- 1 2018-01-01 12:00:00
- 2 2018-01-01 12:00:00
- dtype: datetime64[ns]
-
- When rounding near a daylight savings time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam")
-
- >>> rng_tz.floor("2H", ambiguous=False)
- DatetimeIndex(['2021-10-31 02:00:00+01:00'],
- dtype='datetime64[ns, Europe/Amsterdam]', freq=None)
-
- >>> rng_tz.floor("2H", ambiguous=True)
- DatetimeIndex(['2021-10-31 02:00:00+02:00'],
- dtype='datetime64[ns, Europe/Amsterdam]', freq=None)
- """
-
-_ceil_example = """>>> rng.ceil('H')
- DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
- '2018-01-01 13:00:00'],
- dtype='datetime64[ns]', freq=None)
-
- **Series**
-
- >>> pd.Series(rng).dt.ceil("H")
- 0 2018-01-01 12:00:00
- 1 2018-01-01 12:00:00
- 2 2018-01-01 13:00:00
- dtype: datetime64[ns]
-
- When rounding near a daylight savings time transition, use ``ambiguous`` or
- ``nonexistent`` to control how the timestamp should be re-localized.
-
- >>> rng_tz = pd.DatetimeIndex(["2021-10-31 01:30:00"], tz="Europe/Amsterdam")
-
- >>> rng_tz.ceil("H", ambiguous=False)
- DatetimeIndex(['2021-10-31 02:00:00+01:00'],
- dtype='datetime64[ns, Europe/Amsterdam]', freq=None)
-
- >>> rng_tz.ceil("H", ambiguous=True)
- DatetimeIndex(['2021-10-31 02:00:00+02:00'],
- dtype='datetime64[ns, Europe/Amsterdam]', freq=None)
- """
-
-
-TimelikeOpsT = TypeVar("TimelikeOpsT", bound="TimelikeOps")
-
-
-class TimelikeOps(DatetimeLikeArrayMixin):
- """
- Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex.
- """
-
- _default_dtype: np.dtype
-
- def __init__(
- self, values, dtype=None, freq=lib.no_default, copy: bool = False
- ) -> None:
- values = extract_array(values, extract_numpy=True)
- if isinstance(values, IntegerArray):
- values = values.to_numpy("int64", na_value=iNaT)
-
- inferred_freq = getattr(values, "_freq", None)
- explicit_none = freq is None
- freq = freq if freq is not lib.no_default else None
-
- if isinstance(values, type(self)):
- if explicit_none:
- # don't inherit from values
- pass
- elif freq is None:
- freq = values.freq
- elif freq and values.freq:
- freq = to_offset(freq)
- freq, _ = validate_inferred_freq(freq, values.freq, False)
-
- if dtype is not None:
- dtype = pandas_dtype(dtype)
- if not is_dtype_equal(dtype, values.dtype):
- # TODO: we only have tests for this for DTA, not TDA (2022-07-01)
- raise TypeError(
- f"dtype={dtype} does not match data dtype {values.dtype}"
- )
-
- dtype = values.dtype
- values = values._ndarray
-
- elif dtype is None:
- if isinstance(values, np.ndarray) and values.dtype.kind in "Mm":
- dtype = values.dtype
- else:
- dtype = self._default_dtype
-
- if not isinstance(values, np.ndarray):
- raise ValueError(
- f"Unexpected type '{type(values).__name__}'. 'values' must be a "
- f"{type(self).__name__}, ndarray, or Series or Index "
- "containing one of those."
- )
- if values.ndim not in [1, 2]:
- raise ValueError("Only 1-dimensional input arrays are supported.")
-
- if values.dtype == "i8":
- # for compat with datetime/timedelta/period shared methods,
- # we can sometimes get here with int64 values. These represent
- # nanosecond UTC (or tz-naive) unix timestamps
- values = values.view(self._default_dtype)
-
- dtype = self._validate_dtype(values, dtype)
-
- if freq == "infer":
- raise ValueError(
- f"Frequency inference not allowed in {type(self).__name__}.__init__. "
- "Use 'pd.array()' instead."
- )
-
- if copy:
- values = values.copy()
- if freq:
- freq = to_offset(freq)
-
- NDArrayBacked.__init__(self, values=values, dtype=dtype)
- self._freq = freq
-
- if inferred_freq is None and freq is not None:
- type(self)._validate_frequency(self, freq)
-
- @classmethod
- def _validate_dtype(cls, values, dtype):
- raise AbstractMethodError(cls)
-
- @property
- def freq(self):
- """
- Return the frequency object if it is set, otherwise None.
- """
- return self._freq
-
- @freq.setter
- def freq(self, value) -> None:
- if value is not None:
- value = to_offset(value)
- self._validate_frequency(self, value)
-
- if self.ndim > 1:
- raise ValueError("Cannot set freq with ndim > 1")
-
- self._freq = value
-
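For example, assigning a freq after construction runs the validation below against the stored values (a sketch via DatetimeIndex, which delegates to this setter):

    >>> import pandas as pd
    >>> dti = pd.DatetimeIndex(["2020-01-01", "2020-01-02", "2020-01-03"])
    >>> dti.freq is None
    True
    >>> dti.freq = "D"      # an incompatible freq such as "W" would raise ValueError
    >>> dti.freqstr
    'D'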
- @classmethod
- def _validate_frequency(cls, index, freq, **kwargs):
- """
- Validate that a frequency is compatible with the values of a given
- Datetime Array/Index or Timedelta Array/Index
-
- Parameters
- ----------
- index : DatetimeIndex or TimedeltaIndex
- The index on which to determine if the given frequency is valid
- freq : DateOffset
- The frequency to validate
- """
- inferred = index.inferred_freq
- if index.size == 0 or inferred == freq.freqstr:
- return None
-
- try:
- on_freq = cls._generate_range(
- start=index[0],
- end=None,
- periods=len(index),
- freq=freq,
- unit=index.unit,
- **kwargs,
- )
- if not np.array_equal(index.asi8, on_freq.asi8):
- raise ValueError
- except ValueError as err:
- if "non-fixed" in str(err):
- # non-fixed frequencies are not meaningful for timedelta64;
- # we retain that error message
- raise err
- # GH#11587 the main way this is reached is if the `np.array_equal`
- # check above is False. This can also be reached if index[0]
- # is `NaT`, in which case the call to `cls._generate_range` will
- # raise a ValueError, which we re-raise with a more targeted
- # message.
- raise ValueError(
- f"Inferred frequency {inferred} from passed values "
- f"does not conform to passed frequency {freq.freqstr}"
- ) from err
-
- @classmethod
- def _generate_range(
- cls: type[DatetimeLikeArrayT], start, end, periods, freq, *args, **kwargs
- ) -> DatetimeLikeArrayT:
- raise AbstractMethodError(cls)
-
- # --------------------------------------------------------------
-
- @cache_readonly
- def _creso(self) -> int:
- return get_unit_from_dtype(self._ndarray.dtype)
-
- @cache_readonly
- def unit(self) -> str:
- # e.g. "ns", "us", "ms"
- # error: Argument 1 to "dtype_to_unit" has incompatible type
- # "ExtensionDtype"; expected "Union[DatetimeTZDtype, dtype[Any]]"
- return dtype_to_unit(self.dtype) # type: ignore[arg-type]
-
- def as_unit(self: TimelikeOpsT, unit: str) -> TimelikeOpsT:
- if unit not in ["s", "ms", "us", "ns"]:
- raise ValueError("Supported units are 's', 'ms', 'us', 'ns'")
-
- dtype = np.dtype(f"{self.dtype.kind}8[{unit}]")
- new_values = astype_overflowsafe(self._ndarray, dtype, round_ok=True)
-
- if isinstance(self.dtype, np.dtype):
- new_dtype = new_values.dtype
- else:
- tz = cast("DatetimeArray", self).tz
- new_dtype = DatetimeTZDtype(tz=tz, unit=unit)
-
- # error: Unexpected keyword argument "freq" for "_simple_new" of
- # "NDArrayBacked" [call-arg]
- return type(self)._simple_new(
- new_values, dtype=new_dtype, freq=self.freq # type: ignore[call-arg]
- )
-
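A short sketch of resolution conversion through the public as_unit (pandas 2.x); round_ok=True above means lossy casts are permitted rather than rejected.

    >>> import pandas as pd
    >>> dti = pd.to_datetime(["2020-01-01 00:00:00.123456789"])
    >>> dti.dtype
    dtype('<M8[ns]')
    >>> dti.as_unit("ms").dtype      # sub-millisecond precision is dropped
    dtype('<M8[ms]')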
- # TODO: annotate other as DatetimeArray | TimedeltaArray | Timestamp | Timedelta
- # with the return type matching input type. TypeVar?
- def _ensure_matching_resos(self, other):
- if self._creso != other._creso:
- # Just as with Timestamp/Timedelta, we cast to the higher resolution
- if self._creso < other._creso:
- self = self.as_unit(other.unit)
- else:
- other = other.as_unit(self.unit)
- return self, other
-
- # --------------------------------------------------------------
-
- def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- if (
- ufunc in [np.isnan, np.isinf, np.isfinite]
- and len(inputs) == 1
- and inputs[0] is self
- ):
- # numpy 1.18 changed isinf and isnan to not raise on dt64/td64
- return getattr(ufunc, method)(self._ndarray, **kwargs)
-
- return super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
-
- def _round(self, freq, mode, ambiguous, nonexistent):
- # round the local times
- if is_datetime64tz_dtype(self.dtype):
- # operate on naive timestamps, then convert back to aware
- self = cast("DatetimeArray", self)
- naive = self.tz_localize(None)
- result = naive._round(freq, mode, ambiguous, nonexistent)
- return result.tz_localize(
- self.tz, ambiguous=ambiguous, nonexistent=nonexistent
- )
-
- values = self.view("i8")
- values = cast(np.ndarray, values)
- offset = to_offset(freq)
- offset.nanos # raises on non-fixed frequencies
- nanos = delta_to_nanoseconds(offset, self._creso)
- if nanos == 0:
- # GH 52761
- return self.copy()
- result_i8 = round_nsint64(values, mode, nanos)
- result = self._maybe_mask_results(result_i8, fill_value=iNaT)
- result = result.view(self._ndarray.dtype)
- return self._simple_new(result, dtype=self.dtype)
-
- @Appender((_round_doc + _round_example).format(op="round"))
- def round(
- self,
- freq,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent)
-
- @Appender((_round_doc + _floor_example).format(op="floor"))
- def floor(
- self,
- freq,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent)
-
- @Appender((_round_doc + _ceil_example).format(op="ceil"))
- def ceil(
- self,
- freq,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ):
- return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent)
-
- # --------------------------------------------------------------
- # Reductions
-
- def any(self, *, axis: AxisInt | None = None, skipna: bool = True) -> bool:
- # GH#34479 the nanops call will issue a FutureWarning for non-td64 dtype
- return nanops.nanany(self._ndarray, axis=axis, skipna=skipna, mask=self.isna())
-
- def all(self, *, axis: AxisInt | None = None, skipna: bool = True) -> bool:
- # GH#34479 the nanops call will issue a FutureWarning for non-td64 dtype
-
- return nanops.nanall(self._ndarray, axis=axis, skipna=skipna, mask=self.isna())
-
- # --------------------------------------------------------------
- # Frequency Methods
-
- def _maybe_clear_freq(self) -> None:
- self._freq = None
-
- def _with_freq(self, freq):
- """
- Helper to get a view on the same data, with a new freq.
-
- Parameters
- ----------
- freq : DateOffset, None, or "infer"
-
- Returns
- -------
- Same type as self
- """
- # GH#29843
- if freq is None:
- # Always valid
- pass
- elif len(self) == 0 and isinstance(freq, BaseOffset):
- # Always valid. In the TimedeltaArray case, we assume this
- # is a Tick offset.
- pass
- else:
- # As an internal method, we can ensure this assertion always holds
- assert freq == "infer"
- freq = to_offset(self.inferred_freq)
-
- arr = self.view()
- arr._freq = freq
- return arr
-
- # --------------------------------------------------------------
-
- def factorize(
- self,
- use_na_sentinel: bool = True,
- sort: bool = False,
- ):
- if self.freq is not None:
- # We must be unique, so can short-circuit (and retain freq)
- codes = np.arange(len(self), dtype=np.intp)
- uniques = self.copy() # TODO: copy or view?
- if sort and self.freq.n < 0:
- codes = codes[::-1]
- uniques = uniques[::-1]
- return codes, uniques
- # FIXME: shouldn't get here; we are ignoring sort
- return super().factorize(use_na_sentinel=use_na_sentinel)
-
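For example, a regular date_range takes the short-circuit above, so the codes are simply an arange and the uniques keep their freq.

    >>> import pandas as pd
    >>> codes, uniques = pd.date_range("2020-01-01", periods=3, freq="D").factorize()
    >>> codes
    array([0, 1, 2])
    >>> uniques.freq
    <Day>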
-
-# -------------------------------------------------------------------
-# Shared Constructor Helpers
-
-
-def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str):
- if not hasattr(data, "dtype"):
- # e.g. list, tuple
- if not isinstance(data, (list, tuple)) and np.ndim(data) == 0:
- # i.e. generator
- data = list(data)
- data = np.asarray(data)
- copy = False
- elif isinstance(data, ABCMultiIndex):
- raise TypeError(f"Cannot create a {cls_name} from a MultiIndex.")
- else:
- data = extract_array(data, extract_numpy=True)
-
- if isinstance(data, IntegerArray) or (
- isinstance(data, ArrowExtensionArray) and data.dtype.kind in "iu"
- ):
- data = data.to_numpy("int64", na_value=iNaT)
- copy = False
- elif not isinstance(data, (np.ndarray, ExtensionArray)) or isinstance(
- data, ArrowExtensionArray
- ):
- # GH#24539 e.g. xarray, dask object
- data = np.asarray(data)
-
- elif isinstance(data, ABCCategorical):
- # GH#18664 preserve tz in going DTI->Categorical->DTI
- # TODO: cases where we need to do another pass through maybe_convert_dtype,
- # e.g. the categories are timedelta64s
- data = data.categories.take(data.codes, fill_value=NaT)._values
- copy = False
-
- return data, copy
-
-
-@overload
-def validate_periods(periods: None) -> None:
- ...
-
-
-@overload
-def validate_periods(periods: int | float) -> int:
- ...
-
-
-def validate_periods(periods: int | float | None) -> int | None:
- """
- If a `periods` argument is passed to the Datetime/Timedelta Array/Index
- constructor, cast it to an integer.
-
- Parameters
- ----------
- periods : None, float, int
-
- Returns
- -------
- periods : None or int
-
- Raises
- ------
- TypeError
- If periods is not None and is neither an int nor a float.
- """
- if periods is not None:
- if lib.is_float(periods):
- periods = int(periods)
- elif not lib.is_integer(periods):
- raise TypeError(f"periods must be a number, got {periods}")
- periods = cast(int, periods)
- return periods
-
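At the public constructor level this means a float `periods` is silently cast, while anything else raises; the values below are just illustrative.

    >>> import pandas as pd
    >>> pd.date_range("2020-01-01", periods=3.0, freq="D")
    DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq='D')
    >>> pd.date_range("2020-01-01", periods="3", freq="D")
    Traceback (most recent call last):
    ...
    TypeError: periods must be a number, got 3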
-
-def validate_inferred_freq(
- freq, inferred_freq, freq_infer
-) -> tuple[BaseOffset | None, bool]:
- """
- If the user passes a freq and another freq is inferred from passed data,
- require that they match.
-
- Parameters
- ----------
- freq : DateOffset or None
- inferred_freq : DateOffset or None
- freq_infer : bool
-
- Returns
- -------
- freq : DateOffset or None
- freq_infer : bool
-
- Notes
- -----
- We assume at this point that `maybe_infer_freq` has been called, so
- `freq` is either a DateOffset object or None.
- """
- if inferred_freq is not None:
- if freq is not None and freq != inferred_freq:
- raise ValueError(
- f"Inferred frequency {inferred_freq} from passed "
- "values does not conform to passed frequency "
- f"{freq.freqstr}"
- )
- if freq is None:
- freq = inferred_freq
- freq_infer = False
-
- return freq, freq_infer
-
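The same conformity message is reachable from the public constructors, for example when the passed freq disagrees with the spacing of the values:

    >>> import pandas as pd
    >>> pd.DatetimeIndex(["2020-01-01", "2020-01-02"], freq="2D")
    Traceback (most recent call last):
    ...
    ValueError: Inferred frequency D from passed values does not conform to passed frequency 2D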
-
-def maybe_infer_freq(freq):
- """
- Comparing a DateOffset to the string "infer" raises, so we need to
- be careful about comparisons. Make a dummy variable `freq_infer` to
- signify the case where the given freq is "infer" and set freq to None
- to avoid comparison trouble later on.
-
- Parameters
- ----------
- freq : {DateOffset, None, str}
-
- Returns
- -------
- freq : {DateOffset, None}
- freq_infer : bool
- Whether we should inherit the freq of passed data.
- """
- freq_infer = False
- if not isinstance(freq, BaseOffset):
- # if a passed freq is None, don't infer automatically
- if freq != "infer":
- freq = to_offset(freq)
- else:
- freq_infer = True
- freq = None
- return freq, freq_infer
-
-
-def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str:
- """
- Return the unit str corresponding to the dtype's resolution.
-
- Parameters
- ----------
- dtype : DatetimeTZDtype or np.dtype
- If np.dtype, we assume it is a datetime64 dtype.
-
- Returns
- -------
- str
- """
- if isinstance(dtype, DatetimeTZDtype):
- return dtype.unit
- return np.datetime_data(dtype)[0]
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/datetimes.py b/contrib/python/pandas/py3/pandas/core/arrays/datetimes.py
deleted file mode 100644
index 1624870705b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/datetimes.py
+++ /dev/null
@@ -1,2595 +0,0 @@
-from __future__ import annotations
-
-from datetime import (
- datetime,
- time,
- timedelta,
- tzinfo,
-)
-from typing import (
- TYPE_CHECKING,
- Iterator,
- cast,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- tslib,
-)
-from pandas._libs.tslibs import (
- BaseOffset,
- NaT,
- NaTType,
- Resolution,
- Timestamp,
- astype_overflowsafe,
- fields,
- get_resolution,
- get_supported_reso,
- get_unit_from_dtype,
- ints_to_pydatetime,
- is_date_array_normalized,
- is_supported_unit,
- is_unitless,
- normalize_i8_timestamps,
- npy_unit_to_abbrev,
- timezones,
- to_offset,
- tz_convert_from_utc,
- tzconversion,
-)
-from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
-from pandas._typing import (
- DateTimeErrorChoices,
- IntervalClosedType,
- TimeAmbiguous,
- TimeNonexistent,
- npt,
-)
-from pandas.errors import PerformanceWarning
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import validate_inclusive
-
-from pandas.core.dtypes.common import (
- DT64NS_DTYPE,
- INT64_DTYPE,
- is_bool_dtype,
- is_datetime64_any_dtype,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float_dtype,
- is_object_dtype,
- is_period_dtype,
- is_sparse,
- is_string_dtype,
- is_timedelta64_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- DatetimeTZDtype,
- ExtensionDtype,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core.arrays import datetimelike as dtl
-from pandas.core.arrays._ranges import generate_regular_range
-import pandas.core.common as com
-
-from pandas.tseries.frequencies import get_period_alias
-from pandas.tseries.offsets import (
- Day,
- Tick,
-)
-
-if TYPE_CHECKING:
- from pandas import DataFrame
- from pandas.core.arrays import PeriodArray
-
-_midnight = time(0, 0)
-
-
-def tz_to_dtype(tz: tzinfo | None, unit: str = "ns"):
- """
- Return a datetime64[ns] dtype appropriate for the given timezone.
-
- Parameters
- ----------
- tz : tzinfo or None
- unit : str, default "ns"
-
- Returns
- -------
- np.dtype or DatetimeTZDtype
- """
- if tz is None:
- return np.dtype(f"M8[{unit}]")
- else:
- return DatetimeTZDtype(tz=tz, unit=unit)
-
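A quick in-module sketch of the two branches; datetime.timezone.utc is used here only as an arbitrary tzinfo.

    >>> from datetime import timezone
    >>> tz_to_dtype(None)
    dtype('<M8[ns]')
    >>> tz_to_dtype(timezone.utc)
    datetime64[ns, UTC]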
-
-def _field_accessor(name: str, field: str, docstring=None):
- def f(self):
- values = self._local_timestamps()
-
- if field in self._bool_ops:
- result: np.ndarray
-
- if field.endswith(("start", "end")):
- freq = self.freq
- month_kw = 12
- if freq:
- kwds = freq.kwds
- month_kw = kwds.get("startingMonth", kwds.get("month", 12))
-
- result = fields.get_start_end_field(
- values, field, self.freqstr, month_kw, reso=self._creso
- )
- else:
- result = fields.get_date_field(values, field, reso=self._creso)
-
- # these return a boolean by-definition
- return result
-
- if field in self._object_ops:
- result = fields.get_date_name_field(values, field, reso=self._creso)
- result = self._maybe_mask_results(result, fill_value=None)
-
- else:
- result = fields.get_date_field(values, field, reso=self._creso)
- result = self._maybe_mask_results(
- result, fill_value=None, convert="float64"
- )
-
- return result
-
- f.__name__ = name
- f.__doc__ = docstring
- return property(f)
-
-
-class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
- """
- Pandas ExtensionArray for tz-naive or tz-aware datetime data.
-
- .. warning::
-
- DatetimeArray is currently experimental, and its API may change
- without warning. In particular, :attr:`DatetimeArray.dtype` is
- expected to change to always be an instance of an ``ExtensionDtype``
- subclass.
-
- Parameters
- ----------
- values : Series, Index, DatetimeArray, ndarray
- The datetime data.
-
- For DatetimeArray `values` (or a Series or Index boxing one),
- `dtype` and `freq` will be extracted from `values`.
-
- dtype : numpy.dtype or DatetimeTZDtype
- Note that the only NumPy dtype allowed is 'datetime64[ns]'.
- freq : str or Offset, optional
- The frequency.
- copy : bool, default False
- Whether to copy the underlying array of values.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
- """
-
- _typ = "datetimearray"
- _internal_fill_value = np.datetime64("NaT", "ns")
- _recognized_scalars = (datetime, np.datetime64)
- _is_recognized_dtype = is_datetime64_any_dtype
- _infer_matches = ("datetime", "datetime64", "date")
-
- @property
- def _scalar_type(self) -> type[Timestamp]:
- return Timestamp
-
- # define my properties & methods for delegation
- _bool_ops: list[str] = [
- "is_month_start",
- "is_month_end",
- "is_quarter_start",
- "is_quarter_end",
- "is_year_start",
- "is_year_end",
- "is_leap_year",
- ]
- _object_ops: list[str] = ["freq", "tz"]
- _field_ops: list[str] = [
- "year",
- "month",
- "day",
- "hour",
- "minute",
- "second",
- "weekday",
- "dayofweek",
- "day_of_week",
- "dayofyear",
- "day_of_year",
- "quarter",
- "days_in_month",
- "daysinmonth",
- "microsecond",
- "nanosecond",
- ]
- _other_ops: list[str] = ["date", "time", "timetz"]
- _datetimelike_ops: list[str] = (
- _field_ops + _object_ops + _bool_ops + _other_ops + ["unit"]
- )
- _datetimelike_methods: list[str] = [
- "to_period",
- "tz_localize",
- "tz_convert",
- "normalize",
- "strftime",
- "round",
- "floor",
- "ceil",
- "month_name",
- "day_name",
- "as_unit",
- ]
-
- # ndim is inherited from ExtensionArray, must exist to ensure
- # Timestamp.__richcmp__(DatetimeArray) operates pointwise
-
- # ensure that operations with numpy arrays defer to our implementation
- __array_priority__ = 1000
-
- # -----------------------------------------------------------------
- # Constructors
-
- _dtype: np.dtype | DatetimeTZDtype
- _freq: BaseOffset | None = None
- _default_dtype = DT64NS_DTYPE # used in TimeLikeOps.__init__
-
- @classmethod
- def _validate_dtype(cls, values, dtype):
- # used in TimeLikeOps.__init__
- _validate_dt64_dtype(values.dtype)
- dtype = _validate_dt64_dtype(dtype)
- return dtype
-
- # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked"
- @classmethod
- def _simple_new( # type: ignore[override]
- cls,
- values: np.ndarray,
- freq: BaseOffset | None = None,
- dtype=DT64NS_DTYPE,
- ) -> DatetimeArray:
- assert isinstance(values, np.ndarray)
- assert dtype.kind == "M"
- if isinstance(dtype, np.dtype):
- assert dtype == values.dtype
- assert not is_unitless(dtype)
- else:
- # DatetimeTZDtype. If we have e.g. DatetimeTZDtype[us, UTC],
- # then values.dtype should be M8[us].
- assert dtype._creso == get_unit_from_dtype(values.dtype)
-
- result = super()._simple_new(values, dtype)
- result._freq = freq
- return result
-
- @classmethod
- def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
- return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)
-
- @classmethod
- def _from_sequence_not_strict(
- cls,
- data,
- *,
- dtype=None,
- copy: bool = False,
- tz=lib.no_default,
- freq: str | BaseOffset | lib.NoDefault | None = lib.no_default,
- dayfirst: bool = False,
- yearfirst: bool = False,
- ambiguous: TimeAmbiguous = "raise",
- ):
- """
- A non-strict version of _from_sequence, called from DatetimeIndex.__new__.
- """
- explicit_none = freq is None
- freq = freq if freq is not lib.no_default else None
- freq, freq_infer = dtl.maybe_infer_freq(freq)
-
- # if the user either explicitly passes tz=None or a tz-naive dtype, we
- # disallow inferring a tz.
- explicit_tz_none = tz is None
- if tz is lib.no_default:
- tz = None
- else:
- tz = timezones.maybe_get_tz(tz)
-
- dtype = _validate_dt64_dtype(dtype)
- # if dtype has an embedded tz, capture it
- tz = _validate_tz_from_dtype(dtype, tz, explicit_tz_none)
-
- unit = None
- if dtype is not None:
- if isinstance(dtype, np.dtype):
- unit = np.datetime_data(dtype)[0]
- else:
- # DatetimeTZDtype
- unit = dtype.unit
-
- subarr, tz, inferred_freq = _sequence_to_dt64ns(
- data,
- copy=copy,
- tz=tz,
- dayfirst=dayfirst,
- yearfirst=yearfirst,
- ambiguous=ambiguous,
- out_unit=unit,
- )
- # We have to call this again after possibly inferring a tz above
- _validate_tz_from_dtype(dtype, tz, explicit_tz_none)
- if tz is not None and explicit_tz_none:
- raise ValueError(
- "Passed data is timezone-aware, incompatible with 'tz=None'. "
- "Use obj.tz_localize(None) instead."
- )
-
- freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer)
- if explicit_none:
- freq = None
-
- data_unit = np.datetime_data(subarr.dtype)[0]
- data_dtype = tz_to_dtype(tz, data_unit)
- result = cls._simple_new(subarr, freq=freq, dtype=data_dtype)
- if unit is not None and unit != result.unit:
- # If unit was specified in user-passed dtype, cast to it here
- result = result.as_unit(unit)
-
- if inferred_freq is None and freq is not None:
- # this condition precludes `freq_infer`
- cls._validate_frequency(result, freq, ambiguous=ambiguous)
-
- elif freq_infer:
- # Set _freq directly to bypass duplicative _validate_frequency
- # check.
- result._freq = to_offset(result.inferred_freq)
-
- return result
-
- # error: Signature of "_generate_range" incompatible with supertype
- # "DatetimeLikeArrayMixin"
- @classmethod
- def _generate_range( # type: ignore[override]
- cls,
- start,
- end,
- periods,
- freq,
- tz=None,
- normalize: bool = False,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- inclusive: IntervalClosedType = "both",
- *,
- unit: str | None = None,
- ) -> DatetimeArray:
- periods = dtl.validate_periods(periods)
- if freq is None and any(x is None for x in [periods, start, end]):
- raise ValueError("Must provide freq argument if no data is supplied")
-
- if com.count_not_none(start, end, periods, freq) != 3:
- raise ValueError(
- "Of the four parameters: start, end, periods, "
- "and freq, exactly three must be specified"
- )
- freq = to_offset(freq)
-
- if start is not None:
- start = Timestamp(start)
-
- if end is not None:
- end = Timestamp(end)
-
- if start is NaT or end is NaT:
- raise ValueError("Neither `start` nor `end` can be NaT")
-
- if unit is not None:
- if unit not in ["s", "ms", "us", "ns"]:
- raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'")
- else:
- unit = "ns"
-
- if start is not None and unit is not None:
- start = start.as_unit(unit, round_ok=False)
- if end is not None and unit is not None:
- end = end.as_unit(unit, round_ok=False)
-
- left_inclusive, right_inclusive = validate_inclusive(inclusive)
- start, end = _maybe_normalize_endpoints(start, end, normalize)
- tz = _infer_tz_from_endpoints(start, end, tz)
-
- if tz is not None:
- # Localize the start and end arguments
- start_tz = None if start is None else start.tz
- end_tz = None if end is None else end.tz
- start = _maybe_localize_point(
- start, start_tz, start, freq, tz, ambiguous, nonexistent
- )
- end = _maybe_localize_point(
- end, end_tz, end, freq, tz, ambiguous, nonexistent
- )
-
- if freq is not None:
- # We break Day arithmetic (fixed 24 hour) here and opt for
- # Day to mean calendar day (23/24/25 hour). Therefore, strip
- # tz info from start and end to avoid DST arithmetic
- if isinstance(freq, Day):
- if start is not None:
- start = start.tz_localize(None)
- if end is not None:
- end = end.tz_localize(None)
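- # Illustrative note (not part of the original source), assuming pandas is
- # available as pd: because Day is treated as a calendar day here, a tz-aware
- # range such as
- #
- #     pd.date_range("2021-03-27", periods=3, freq="D", tz="Europe/Berlin")
- #
- # yields consecutive local midnights, and the step crossing the DST
- # transition (Mar 28 -> Mar 29) spans only 23 wall-clock hours.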
-
- if isinstance(freq, Tick):
- i8values = generate_regular_range(start, end, periods, freq, unit=unit)
- else:
- xdr = _generate_range(
- start=start, end=end, periods=periods, offset=freq, unit=unit
- )
- i8values = np.array([x._value for x in xdr], dtype=np.int64)
-
- endpoint_tz = start.tz if start is not None else end.tz
-
- if tz is not None and endpoint_tz is None:
- if not timezones.is_utc(tz):
- # short-circuit tz_localize_to_utc which would make
- # an unnecessary copy with UTC but be a no-op.
- creso = abbrev_to_npy_unit(unit)
- i8values = tzconversion.tz_localize_to_utc(
- i8values,
- tz,
- ambiguous=ambiguous,
- nonexistent=nonexistent,
- creso=creso,
- )
-
- # i8values is localized datetime64 array -> have to convert
- # start/end as well to compare
- if start is not None:
- start = start.tz_localize(tz, ambiguous, nonexistent)
- if end is not None:
- end = end.tz_localize(tz, ambiguous, nonexistent)
- else:
- # Create a linearly spaced date_range in local time
- # Nanosecond-granularity timestamps aren't always correctly
- # representable with doubles, so we limit the range that we
- # pass to np.linspace as much as possible
- i8values = (
- np.linspace(0, end._value - start._value, periods, dtype="int64")
- + start._value
- )
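- # Illustrative sketch (not part of the original source), with start_i8/end_i8
- # standing in for the endpoints' integer nanosecond values: anchoring the
- # interpolation at 0, i.e.
- #
- #     np.linspace(0, end_i8 - start_i8, periods) + start_i8
- #
- # keeps the interpolated magnitudes small, so float64 rounding errors are
- # smaller than with np.linspace(start_i8, end_i8, periods) on large epoch
- # values.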
- if i8values.dtype != "i8":
- # 2022-01-09 I (brock) am not sure if it is possible for this
- # to overflow and cast to e.g. f8, but if it does we need to cast
- i8values = i8values.astype("i8")
-
- if start == end:
- if not left_inclusive and not right_inclusive:
- i8values = i8values[1:-1]
- else:
- start_i8 = Timestamp(start)._value
- end_i8 = Timestamp(end)._value
- if not left_inclusive or not right_inclusive:
- if not left_inclusive and len(i8values) and i8values[0] == start_i8:
- i8values = i8values[1:]
- if not right_inclusive and len(i8values) and i8values[-1] == end_i8:
- i8values = i8values[:-1]
-
- dt64_values = i8values.view(f"datetime64[{unit}]")
- dtype = tz_to_dtype(tz, unit=unit)
- return cls._simple_new(dt64_values, freq=freq, dtype=dtype)
-
- # -----------------------------------------------------------------
- # DatetimeLike Interface
-
- def _unbox_scalar(self, value) -> np.datetime64:
- if not isinstance(value, self._scalar_type) and value is not NaT:
- raise ValueError("'value' should be a Timestamp.")
- self._check_compatible_with(value)
- if value is NaT:
- return np.datetime64(value._value, self.unit)
- else:
- return value.as_unit(self.unit).asm8
-
- def _scalar_from_string(self, value) -> Timestamp | NaTType:
- return Timestamp(value, tz=self.tz)
-
- def _check_compatible_with(self, other) -> None:
- if other is NaT:
- return
- self._assert_tzawareness_compat(other)
-
- # -----------------------------------------------------------------
- # Descriptive Properties
-
- def _box_func(self, x: np.datetime64) -> Timestamp | NaTType:
- # GH#42228
- value = x.view("i8")
- ts = Timestamp._from_value_and_reso(value, reso=self._creso, tz=self.tz)
- return ts
-
- @property
- # error: Return type "Union[dtype, DatetimeTZDtype]" of "dtype"
- # incompatible with return type "ExtensionDtype" in supertype
- # "ExtensionArray"
- def dtype(self) -> np.dtype | DatetimeTZDtype: # type: ignore[override]
- """
- The dtype for the DatetimeArray.
-
- .. warning::
-
- A future version of pandas will change dtype to never be a
- ``numpy.dtype``. Instead, :attr:`DatetimeArray.dtype` will
- always be an instance of an ``ExtensionDtype`` subclass.
-
- Returns
- -------
- numpy.dtype or DatetimeTZDtype
- If the values are tz-naive, then ``np.dtype('datetime64[ns]')``
- is returned.
-
- If the values are tz-aware, then the ``DatetimeTZDtype``
- is returned.
- """
- return self._dtype
-
- @property
- def tz(self) -> tzinfo | None:
- """
- Return the timezone.
-
- Returns
- -------
- datetime.tzinfo, pytz.tzinfo.BaseTZInfo, dateutil.tz.tz.tzfile, or None
- Returns None when the array is tz-naive.
- """
- # GH 18595
- return getattr(self.dtype, "tz", None)
-
- @tz.setter
- def tz(self, value):
- # GH 3746: Prevent localizing or converting the index by setting tz
- raise AttributeError(
- "Cannot directly set timezone. Use tz_localize() "
- "or tz_convert() as appropriate"
- )
-
- @property
- def tzinfo(self) -> tzinfo | None:
- """
- Alias for tz attribute
- """
- return self.tz
-
- @property # NB: override with cache_readonly in immutable subclasses
- def is_normalized(self) -> bool:
- """
- Returns True if all of the dates are at midnight ("no time")
- """
- return is_date_array_normalized(self.asi8, self.tz, reso=self._creso)
-
- @property # NB: override with cache_readonly in immutable subclasses
- def _resolution_obj(self) -> Resolution:
- return get_resolution(self.asi8, self.tz, reso=self._creso)
-
- # ----------------------------------------------------------------
- # Array-Like / EA-Interface Methods
-
- def __array__(self, dtype=None) -> np.ndarray:
- if dtype is None and self.tz:
- # The default for tz-aware is object, to preserve tz info
- dtype = object
-
- return super().__array__(dtype=dtype)
-
- def __iter__(self) -> Iterator:
- """
- Return an iterator over the boxed values
-
- Yields
- ------
- tstamp : Timestamp
- """
- if self.ndim > 1:
- for i in range(len(self)):
- yield self[i]
- else:
- # convert in chunks of 10k for efficiency
- data = self.asi8
- length = len(self)
- chunksize = 10000
- chunks = (length // chunksize) + 1
-
- for i in range(chunks):
- start_i = i * chunksize
- end_i = min((i + 1) * chunksize, length)
- converted = ints_to_pydatetime(
- data[start_i:end_i],
- tz=self.tz,
- box="timestamp",
- reso=self._creso,
- )
- yield from converted
-
- def astype(self, dtype, copy: bool = True):
- # We handle
- # --> datetime
- # --> period
- # DatetimeLikeArrayMixin Super handles the rest.
- dtype = pandas_dtype(dtype)
-
- if is_dtype_equal(dtype, self.dtype):
- if copy:
- return self.copy()
- return self
-
- elif isinstance(dtype, ExtensionDtype):
- if not isinstance(dtype, DatetimeTZDtype):
- # e.g. Sparse[datetime64[ns]]
- return super().astype(dtype, copy=copy)
- elif self.tz is None:
- # pre-2.0 this did self.tz_localize(dtype.tz), which did not match
- # the Series behavior which did
- # values.tz_localize("UTC").tz_convert(dtype.tz)
- raise TypeError(
- "Cannot use .astype to convert from timezone-naive dtype to "
- "timezone-aware dtype. Use obj.tz_localize instead or "
- "series.dt.tz_localize instead"
- )
- else:
- # tzaware unit conversion e.g. datetime64[s, UTC]
- np_dtype = np.dtype(dtype.str)
- res_values = astype_overflowsafe(self._ndarray, np_dtype, copy=copy)
- return type(self)._simple_new(res_values, dtype=dtype, freq=self.freq)
-
- elif (
- self.tz is None
- and is_datetime64_dtype(dtype)
- and not is_unitless(dtype)
- and is_supported_unit(get_unit_from_dtype(dtype))
- ):
- # unit conversion e.g. datetime64[s]
- res_values = astype_overflowsafe(self._ndarray, dtype, copy=True)
- return type(self)._simple_new(res_values, dtype=res_values.dtype)
- # TODO: preserve freq?
-
- elif self.tz is not None and is_datetime64_dtype(dtype):
- # pre-2.0 behavior for DTA/DTI was
- # values.tz_convert("UTC").tz_localize(None), which did not match
- # the Series behavior
- raise TypeError(
- "Cannot use .astype to convert from timezone-aware dtype to "
- "timezone-naive dtype. Use obj.tz_localize(None) or "
- "obj.tz_convert('UTC').tz_localize(None) instead."
- )
-
- elif (
- self.tz is None
- and is_datetime64_dtype(dtype)
- and dtype != self.dtype
- and is_unitless(dtype)
- ):
- raise TypeError(
- "Casting to unit-less dtype 'datetime64' is not supported. "
- "Pass e.g. 'datetime64[ns]' instead."
- )
-
- elif is_period_dtype(dtype):
- return self.to_period(freq=dtype.freq)
- return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy)
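- # Illustrative summary (not part of the original source) of the casting rules
- # above, assuming `arr` is a tz-naive DatetimeArray with dtype datetime64[ns]:
- #
- #     arr.astype("datetime64[s]")        # unit conversion, allowed
- #     arr.astype("datetime64")           # TypeError: unit-less dtype rejected
- #     arr.astype("datetime64[ns, UTC]")  # TypeError: use tz_localize instead
- #     arr.astype("period[D]")            # delegates to arr.to_period("D")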
-
- # -----------------------------------------------------------------
- # Rendering Methods
-
- def _format_native_types(
- self, *, na_rep: str | float = "NaT", date_format=None, **kwargs
- ) -> npt.NDArray[np.object_]:
- from pandas.io.formats.format import get_format_datetime64_from_values
-
- fmt = get_format_datetime64_from_values(self, date_format)
-
- return tslib.format_array_from_datetime(
- self.asi8, tz=self.tz, format=fmt, na_rep=na_rep, reso=self._creso
- )
-
- # -----------------------------------------------------------------
- # Comparison Methods
-
- def _has_same_tz(self, other) -> bool:
- # a non-datetime-like `other` has no tzinfo attribute and cannot share a tz
- if isinstance(other, np.datetime64):
- # convert to Timestamp as np.datetime64 doesn't have tz attr
- other = Timestamp(other)
-
- if not hasattr(other, "tzinfo"):
- return False
- other_tz = other.tzinfo
- return timezones.tz_compare(self.tzinfo, other_tz)
-
- def _assert_tzawareness_compat(self, other) -> None:
- # adapted from _Timestamp._assert_tzawareness_compat
- other_tz = getattr(other, "tzinfo", None)
- other_dtype = getattr(other, "dtype", None)
-
- if is_datetime64tz_dtype(other_dtype):
- # Get tzinfo from Series dtype
- other_tz = other.dtype.tz
- if other is NaT:
- # pd.NaT quacks both aware and naive
- pass
- elif self.tz is None:
- if other_tz is not None:
- raise TypeError(
- "Cannot compare tz-naive and tz-aware datetime-like objects."
- )
- elif other_tz is None:
- raise TypeError(
- "Cannot compare tz-naive and tz-aware datetime-like objects"
- )
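- # Illustrative behaviour (not part of the original source): this guard is what
- # rejects mixed tz-awareness, e.g. (hypothetically calling it directly)
- #
- #     pd.array(pd.date_range("2000-01-01", periods=2))._assert_tzawareness_compat(
- #         pd.Timestamp("2000-01-01", tz="UTC"))
- #
- # raises TypeError, while passing pd.NaT is accepted because NaT is treated
- # as compatible with both naive and aware values.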
-
- # -----------------------------------------------------------------
- # Arithmetic Methods
-
- def _add_offset(self, offset) -> DatetimeArray:
- assert not isinstance(offset, Tick)
-
- if self.tz is not None:
- values = self.tz_localize(None)
- else:
- values = self
-
- try:
- result = offset._apply_array(values).view(values.dtype)
- except NotImplementedError:
- warnings.warn(
- "Non-vectorized DateOffset being applied to Series or DatetimeIndex.",
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
- result = self.astype("O") + offset
- result = type(self)._from_sequence(result).as_unit(self.unit)
- if not len(self):
- # GH#30336 _from_sequence won't be able to infer self.tz
- return result.tz_localize(self.tz)
-
- else:
- result = DatetimeArray._simple_new(result, dtype=result.dtype)
- if self.tz is not None:
- result = result.tz_localize(self.tz)
-
- return result
-
- # -----------------------------------------------------------------
- # Timezone Conversion and Localization Methods
-
- def _local_timestamps(self) -> npt.NDArray[np.int64]:
- """
- Convert to an i8 (unix-like nanosecond timestamp) representation
- while keeping the local timezone and not using UTC.
- This is used to calculate time-of-day information as if the timestamps
- were timezone-naive.
- """
- if self.tz is None or timezones.is_utc(self.tz):
- # Avoid the copy that would be made in tzconversion
- return self.asi8
- return tz_convert_from_utc(self.asi8, self.tz, reso=self._creso)
-
- def tz_convert(self, tz) -> DatetimeArray:
- """
- Convert tz-aware Datetime Array/Index from one time zone to another.
-
- Parameters
- ----------
- tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None
- Time zone to convert the timestamps to. Timestamps of the Datetime
- Array/Index will be converted to this time zone. A `tz` of None will
- convert to UTC and remove the timezone information.
-
- Returns
- -------
- Array or Index
-
- Raises
- ------
- TypeError
- If Datetime Array/Index is tz-naive.
-
- See Also
- --------
- DatetimeIndex.tz : Return the timezone of the Datetime Array/Index, or None when tz-naive.
- DatetimeIndex.tz_localize : Localize tz-naive DatetimeIndex to a
- given time zone, or remove timezone from a tz-aware DatetimeIndex.
-
- Examples
- --------
- With the `tz` parameter, we can change the DatetimeIndex
- to other time zones:
-
- >>> dti = pd.date_range(start='2014-08-01 09:00',
- ... freq='H', periods=3, tz='Europe/Berlin')
-
- >>> dti
- DatetimeIndex(['2014-08-01 09:00:00+02:00',
- '2014-08-01 10:00:00+02:00',
- '2014-08-01 11:00:00+02:00'],
- dtype='datetime64[ns, Europe/Berlin]', freq='H')
-
- >>> dti.tz_convert('US/Central')
- DatetimeIndex(['2014-08-01 02:00:00-05:00',
- '2014-08-01 03:00:00-05:00',
- '2014-08-01 04:00:00-05:00'],
- dtype='datetime64[ns, US/Central]', freq='H')
-
- With ``tz=None``, we can remove the timezone (after converting
- to UTC if necessary):
-
- >>> dti = pd.date_range(start='2014-08-01 09:00', freq='H',
- ... periods=3, tz='Europe/Berlin')
-
- >>> dti
- DatetimeIndex(['2014-08-01 09:00:00+02:00',
- '2014-08-01 10:00:00+02:00',
- '2014-08-01 11:00:00+02:00'],
- dtype='datetime64[ns, Europe/Berlin]', freq='H')
-
- >>> dti.tz_convert(None)
- DatetimeIndex(['2014-08-01 07:00:00',
- '2014-08-01 08:00:00',
- '2014-08-01 09:00:00'],
- dtype='datetime64[ns]', freq='H')
- """
- tz = timezones.maybe_get_tz(tz)
-
- if self.tz is None:
- # tz naive, use tz_localize
- raise TypeError(
- "Cannot convert tz-naive timestamps, use tz_localize to localize"
- )
-
- # No conversion since timestamps are all UTC to begin with
- dtype = tz_to_dtype(tz, unit=self.unit)
- return self._simple_new(self._ndarray, dtype=dtype, freq=self.freq)
-
- @dtl.ravel_compat
- def tz_localize(
- self,
- tz,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ) -> DatetimeArray:
- """
- Localize tz-naive Datetime Array/Index to tz-aware Datetime Array/Index.
-
- This method takes a time zone (tz) naive Datetime Array/Index object
- and makes this time zone aware. It does not move the time to another
- time zone.
-
- This method can also be used to do the inverse -- to create a time
- zone unaware object from an aware object. To that end, pass `tz=None`.
-
- Parameters
- ----------
- tz : str, pytz.timezone, dateutil.tz.tzfile, datetime.tzinfo or None
- Time zone to convert timestamps to. Passing ``None`` will
- remove the time zone information preserving local time.
- ambiguous : 'infer', 'NaT', bool array, default 'raise'
- When clocks moved backward due to DST, ambiguous times may arise.
- For example in Central European Time (UTC+01), when going from
- 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
- 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
- `ambiguous` parameter dictates how ambiguous times should be
- handled.
-
- - 'infer' will attempt to infer fall dst-transition hours based on
- order
- - bool-ndarray where True signifies a DST time, False signifies a
- non-DST time (note that this flag is only applicable for
- ambiguous times)
- - 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
- times.
-
- nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \
-default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST.
-
- - 'shift_forward' will shift the nonexistent time forward to the
- closest existing time
- - 'shift_backward' will shift the nonexistent time backward to the
- closest existing time
- - 'NaT' will return NaT where there are nonexistent times
- - timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Returns
- -------
- Same type as self
- Array/Index converted to the specified time zone.
-
- Raises
- ------
- TypeError
- If the Datetime Array/Index is tz-aware and tz is not None.
-
- See Also
- --------
- DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from
- one time zone to another.
-
- Examples
- --------
- >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3)
- >>> tz_naive
- DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
- '2018-03-03 09:00:00'],
- dtype='datetime64[ns]', freq='D')
-
- Localize DatetimeIndex in US/Eastern time zone:
-
- >>> tz_aware = tz_naive.tz_localize(tz='US/Eastern')
- >>> tz_aware
- DatetimeIndex(['2018-03-01 09:00:00-05:00',
- '2018-03-02 09:00:00-05:00',
- '2018-03-03 09:00:00-05:00'],
- dtype='datetime64[ns, US/Eastern]', freq=None)
-
- With ``tz=None``, we can remove the time zone information
- while keeping the local time (not converted to UTC):
-
- >>> tz_aware.tz_localize(None)
- DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',
- '2018-03-03 09:00:00'],
- dtype='datetime64[ns]', freq=None)
-
- Be careful with DST changes. When there is sequential data, pandas can
- infer the DST time:
-
- >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00',
- ... '2018-10-28 02:00:00',
- ... '2018-10-28 02:30:00',
- ... '2018-10-28 02:00:00',
- ... '2018-10-28 02:30:00',
- ... '2018-10-28 03:00:00',
- ... '2018-10-28 03:30:00']))
- >>> s.dt.tz_localize('CET', ambiguous='infer')
- 0 2018-10-28 01:30:00+02:00
- 1 2018-10-28 02:00:00+02:00
- 2 2018-10-28 02:30:00+02:00
- 3 2018-10-28 02:00:00+01:00
- 4 2018-10-28 02:30:00+01:00
- 5 2018-10-28 03:00:00+01:00
- 6 2018-10-28 03:30:00+01:00
- dtype: datetime64[ns, CET]
-
- In some cases, inferring the DST is impossible. In such cases, you can
- pass an ndarray to the ambiguous parameter to set the DST explicitly
-
- >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:20:00',
- ... '2018-10-28 02:36:00',
- ... '2018-10-28 03:46:00']))
- >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False]))
- 0 2018-10-28 01:20:00+02:00
- 1 2018-10-28 02:36:00+02:00
- 2 2018-10-28 03:46:00+01:00
- dtype: datetime64[ns, CET]
-
- If the DST transition causes nonexistent times, you can shift these
- dates forward or backward with a timedelta object or `'shift_forward'`
- or `'shift_backward'`.
-
- >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00',
- ... '2015-03-29 03:30:00']))
- >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
- 0 2015-03-29 03:00:00+02:00
- 1 2015-03-29 03:30:00+02:00
- dtype: datetime64[ns, Europe/Warsaw]
-
- >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
- 0 2015-03-29 01:59:59.999999999+01:00
- 1 2015-03-29 03:30:00+02:00
- dtype: datetime64[ns, Europe/Warsaw]
-
- >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
- 0 2015-03-29 03:30:00+02:00
- 1 2015-03-29 03:30:00+02:00
- dtype: datetime64[ns, Europe/Warsaw]
- """
- nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
- if nonexistent not in nonexistent_options and not isinstance(
- nonexistent, timedelta
- ):
- raise ValueError(
- "The nonexistent argument must be one of 'raise', "
- "'NaT', 'shift_forward', 'shift_backward' or "
- "a timedelta object"
- )
-
- if self.tz is not None:
- if tz is None:
- new_dates = tz_convert_from_utc(self.asi8, self.tz, reso=self._creso)
- else:
- raise TypeError("Already tz-aware, use tz_convert to convert.")
- else:
- tz = timezones.maybe_get_tz(tz)
- # Convert to UTC
-
- new_dates = tzconversion.tz_localize_to_utc(
- self.asi8,
- tz,
- ambiguous=ambiguous,
- nonexistent=nonexistent,
- creso=self._creso,
- )
- new_dates = new_dates.view(f"M8[{self.unit}]")
- dtype = tz_to_dtype(tz, unit=self.unit)
-
- freq = None
- if timezones.is_utc(tz) or (len(self) == 1 and not isna(new_dates[0])):
- # we can preserve freq
- # TODO: Also for fixed-offsets
- freq = self.freq
- elif tz is None and self.tz is None:
- # no-op
- freq = self.freq
- return self._simple_new(new_dates, dtype=dtype, freq=freq)
-
- # ----------------------------------------------------------------
- # Conversion Methods - Vectorized analogues of Timestamp methods
-
- def to_pydatetime(self) -> npt.NDArray[np.object_]:
- """
- Return an ndarray of datetime.datetime objects.
-
- Returns
- -------
- numpy.ndarray
- """
- return ints_to_pydatetime(self.asi8, tz=self.tz, reso=self._creso)
-
- def normalize(self) -> DatetimeArray:
- """
- Convert times to midnight.
-
- The time component of the date-time is converted to midnight i.e.
- 00:00:00. This is useful in cases when the time does not matter.
- Length is unaltered. The timezones are unaffected.
-
- This method is available on Series with datetime values under
- the ``.dt`` accessor, and directly on Datetime Array/Index.
-
- Returns
- -------
- DatetimeArray, DatetimeIndex or Series
- The same type as the original data. Series will have the same
- name and index. DatetimeIndex will have the same name.
-
- See Also
- --------
- floor : Floor the datetimes to the specified freq.
- ceil : Ceil the datetimes to the specified freq.
- round : Round the datetimes to the specified freq.
-
- Examples
- --------
- >>> idx = pd.date_range(start='2014-08-01 10:00', freq='H',
- ... periods=3, tz='Asia/Calcutta')
- >>> idx
- DatetimeIndex(['2014-08-01 10:00:00+05:30',
- '2014-08-01 11:00:00+05:30',
- '2014-08-01 12:00:00+05:30'],
- dtype='datetime64[ns, Asia/Calcutta]', freq='H')
- >>> idx.normalize()
- DatetimeIndex(['2014-08-01 00:00:00+05:30',
- '2014-08-01 00:00:00+05:30',
- '2014-08-01 00:00:00+05:30'],
- dtype='datetime64[ns, Asia/Calcutta]', freq=None)
- """
- new_values = normalize_i8_timestamps(self.asi8, self.tz, reso=self._creso)
- dt64_values = new_values.view(self._ndarray.dtype)
-
- dta = type(self)._simple_new(dt64_values, dtype=dt64_values.dtype)
- dta = dta._with_freq("infer")
- if self.tz is not None:
- dta = dta.tz_localize(self.tz)
- return dta
-
- def to_period(self, freq=None) -> PeriodArray:
- """
- Cast to PeriodArray/Index at a particular frequency.
-
- Converts DatetimeArray/Index to PeriodArray/Index.
-
- Parameters
- ----------
- freq : str or Offset, optional
- One of pandas' :ref:`offset strings <timeseries.offset_aliases>`
- or an Offset object. Will be inferred by default.
-
- Returns
- -------
- PeriodArray/Index
-
- Raises
- ------
- ValueError
- When converting a DatetimeArray/Index with non-regular values,
- so that a frequency cannot be inferred.
-
- See Also
- --------
- PeriodIndex: Immutable ndarray holding ordinal values.
- DatetimeIndex.to_pydatetime: Return DatetimeIndex as object.
-
- Examples
- --------
- >>> df = pd.DataFrame({"y": [1, 2, 3]},
- ... index=pd.to_datetime(["2000-03-31 00:00:00",
- ... "2000-05-31 00:00:00",
- ... "2000-08-31 00:00:00"]))
- >>> df.index.to_period("M")
- PeriodIndex(['2000-03', '2000-05', '2000-08'],
- dtype='period[M]')
-
- Infer the daily frequency
-
- >>> idx = pd.date_range("2017-01-01", periods=2)
- >>> idx.to_period()
- PeriodIndex(['2017-01-01', '2017-01-02'],
- dtype='period[D]')
- """
- from pandas.core.arrays import PeriodArray
-
- if self.tz is not None:
- warnings.warn(
- "Converting to PeriodArray/Index representation "
- "will drop timezone information.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
-
- if freq is None:
- freq = self.freqstr or self.inferred_freq
-
- if freq is None:
- raise ValueError(
- "You must pass a freq argument as current index has none."
- )
-
- res = get_period_alias(freq)
-
- # https://github.com/pandas-dev/pandas/issues/33358
- if res is None:
- res = freq
-
- freq = res
-
- return PeriodArray._from_datetime64(self._ndarray, freq, tz=self.tz)
-
- # -----------------------------------------------------------------
- # Properties - Vectorized Timestamp Properties/Methods
-
- def month_name(self, locale=None) -> npt.NDArray[np.object_]:
- """
- Return the month names with specified locale.
-
- Parameters
- ----------
- locale : str, optional
- Locale determining the language in which to return the month name.
- Default is English locale (``'en_US.utf8'``). Use the command
- ``locale -a`` on your terminal on Unix systems to find your locale
- language code.
-
- Returns
- -------
- Series or Index
- Series or Index of month names.
-
- Examples
- --------
- >>> s = pd.Series(pd.date_range(start='2018-01', freq='M', periods=3))
- >>> s
- 0 2018-01-31
- 1 2018-02-28
- 2 2018-03-31
- dtype: datetime64[ns]
- >>> s.dt.month_name()
- 0 January
- 1 February
- 2 March
- dtype: object
-
- >>> idx = pd.date_range(start='2018-01', freq='M', periods=3)
- >>> idx
- DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'],
- dtype='datetime64[ns]', freq='M')
- >>> idx.month_name()
- Index(['January', 'February', 'March'], dtype='object')
-
- Using the ``locale`` parameter you can set a different locale language,
- for example: ``idx.month_name(locale='pt_BR.utf8')`` will return month
- names in Brazilian Portuguese language.
-
- >>> idx = pd.date_range(start='2018-01', freq='M', periods=3)
- >>> idx
- DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31'],
- dtype='datetime64[ns]', freq='M')
- >>> idx.month_name(locale='pt_BR.utf8') # doctest: +SKIP
- Index(['Janeiro', 'Fevereiro', 'Março'], dtype='object')
- """
- values = self._local_timestamps()
-
- result = fields.get_date_name_field(
- values, "month_name", locale=locale, reso=self._creso
- )
- result = self._maybe_mask_results(result, fill_value=None)
- return result
-
- def day_name(self, locale=None) -> npt.NDArray[np.object_]:
- """
- Return the day names with specified locale.
-
- Parameters
- ----------
- locale : str, optional
- Locale determining the language in which to return the day name.
- Default is English locale (``'en_US.utf8'``). Use the command
- ``locale -a`` on your terminal on Unix systems to find your locale
- language code.
-
- Returns
- -------
- Series or Index
- Series or Index of day names.
-
- Examples
- --------
- >>> s = pd.Series(pd.date_range(start='2018-01-01', freq='D', periods=3))
- >>> s
- 0 2018-01-01
- 1 2018-01-02
- 2 2018-01-03
- dtype: datetime64[ns]
- >>> s.dt.day_name()
- 0 Monday
- 1 Tuesday
- 2 Wednesday
- dtype: object
-
- >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3)
- >>> idx
- DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'],
- dtype='datetime64[ns]', freq='D')
- >>> idx.day_name()
- Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object')
-
- Using the ``locale`` parameter you can set a different locale language,
- for example: ``idx.day_name(locale='pt_BR.utf8')`` will return day
- names in Brazilian Portuguese language.
-
- >>> idx = pd.date_range(start='2018-01-01', freq='D', periods=3)
- >>> idx
- DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'],
- dtype='datetime64[ns]', freq='D')
- >>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP
- Index(['Segunda', 'Terça', 'Quarta'], dtype='object')
- """
- values = self._local_timestamps()
-
- result = fields.get_date_name_field(
- values, "day_name", locale=locale, reso=self._creso
- )
- result = self._maybe_mask_results(result, fill_value=None)
- return result
-
- @property
- def time(self) -> npt.NDArray[np.object_]:
- """
- Returns numpy array of :class:`datetime.time` objects.
-
- The time part of the Timestamps.
- """
- # If the Timestamps have a timezone that is not UTC,
- # convert them into their i8 representation while
- # keeping their timezone and not using UTC
- timestamps = self._local_timestamps()
-
- return ints_to_pydatetime(timestamps, box="time", reso=self._creso)
-
- @property
- def timetz(self) -> npt.NDArray[np.object_]:
- """
- Returns numpy array of :class:`datetime.time` objects with timezones.
-
- The time part of the Timestamps.
- """
- return ints_to_pydatetime(self.asi8, self.tz, box="time", reso=self._creso)
-
- @property
- def date(self) -> npt.NDArray[np.object_]:
- """
- Returns numpy array of python :class:`datetime.date` objects.
-
- Namely, the date part of Timestamps without time and
- timezone information.
- """
- # If the Timestamps have a timezone that is not UTC,
- # convert them into their i8 representation while
- # keeping their timezone and not using UTC
- timestamps = self._local_timestamps()
-
- return ints_to_pydatetime(timestamps, box="date", reso=self._creso)
-
- def isocalendar(self) -> DataFrame:
- """
- Calculate year, week, and day according to the ISO 8601 standard.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- DataFrame
- With columns year, week and day.
-
- See Also
- --------
- Timestamp.isocalendar : Function return a 3-tuple containing ISO year,
- week number, and weekday for the given Timestamp object.
- datetime.date.isocalendar : Return a named tuple object with
- three components: year, week and weekday.
-
- Examples
- --------
- >>> idx = pd.date_range(start='2019-12-29', freq='D', periods=4)
- >>> idx.isocalendar()
- year week day
- 2019-12-29 2019 52 7
- 2019-12-30 2020 1 1
- 2019-12-31 2020 1 2
- 2020-01-01 2020 1 3
- >>> idx.isocalendar().week
- 2019-12-29 52
- 2019-12-30 1
- 2019-12-31 1
- 2020-01-01 1
- Freq: D, Name: week, dtype: UInt32
- """
- from pandas import DataFrame
-
- values = self._local_timestamps()
- sarray = fields.build_isocalendar_sarray(values, reso=self._creso)
- iso_calendar_df = DataFrame(
- sarray, columns=["year", "week", "day"], dtype="UInt32"
- )
- if self._hasna:
- iso_calendar_df.iloc[self._isnan] = None
- return iso_calendar_df
-
- year = _field_accessor(
- "year",
- "Y",
- """
- The year of the datetime.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="Y")
- ... )
- >>> datetime_series
- 0 2000-12-31
- 1 2001-12-31
- 2 2002-12-31
- dtype: datetime64[ns]
- >>> datetime_series.dt.year
- 0 2000
- 1 2001
- 2 2002
- dtype: int32
- """,
- )
- month = _field_accessor(
- "month",
- "M",
- """
- The month as January=1, December=12.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="M")
- ... )
- >>> datetime_series
- 0 2000-01-31
- 1 2000-02-29
- 2 2000-03-31
- dtype: datetime64[ns]
- >>> datetime_series.dt.month
- 0 1
- 1 2
- 2 3
- dtype: int32
- """,
- )
- day = _field_accessor(
- "day",
- "D",
- """
- The day of the datetime.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="D")
- ... )
- >>> datetime_series
- 0 2000-01-01
- 1 2000-01-02
- 2 2000-01-03
- dtype: datetime64[ns]
- >>> datetime_series.dt.day
- 0 1
- 1 2
- 2 3
- dtype: int32
- """,
- )
- hour = _field_accessor(
- "hour",
- "h",
- """
- The hours of the datetime.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="h")
- ... )
- >>> datetime_series
- 0 2000-01-01 00:00:00
- 1 2000-01-01 01:00:00
- 2 2000-01-01 02:00:00
- dtype: datetime64[ns]
- >>> datetime_series.dt.hour
- 0 0
- 1 1
- 2 2
- dtype: int32
- """,
- )
- minute = _field_accessor(
- "minute",
- "m",
- """
- The minutes of the datetime.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="T")
- ... )
- >>> datetime_series
- 0 2000-01-01 00:00:00
- 1 2000-01-01 00:01:00
- 2 2000-01-01 00:02:00
- dtype: datetime64[ns]
- >>> datetime_series.dt.minute
- 0 0
- 1 1
- 2 2
- dtype: int32
- """,
- )
- second = _field_accessor(
- "second",
- "s",
- """
- The seconds of the datetime.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="s")
- ... )
- >>> datetime_series
- 0 2000-01-01 00:00:00
- 1 2000-01-01 00:00:01
- 2 2000-01-01 00:00:02
- dtype: datetime64[ns]
- >>> datetime_series.dt.second
- 0 0
- 1 1
- 2 2
- dtype: int32
- """,
- )
- microsecond = _field_accessor(
- "microsecond",
- "us",
- """
- The microseconds of the datetime.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="us")
- ... )
- >>> datetime_series
- 0 2000-01-01 00:00:00.000000
- 1 2000-01-01 00:00:00.000001
- 2 2000-01-01 00:00:00.000002
- dtype: datetime64[ns]
- >>> datetime_series.dt.microsecond
- 0 0
- 1 1
- 2 2
- dtype: int32
- """,
- )
- nanosecond = _field_accessor(
- "nanosecond",
- "ns",
- """
- The nanoseconds of the datetime.
-
- Examples
- --------
- >>> datetime_series = pd.Series(
- ... pd.date_range("2000-01-01", periods=3, freq="ns")
- ... )
- >>> datetime_series
- 0 2000-01-01 00:00:00.000000000
- 1 2000-01-01 00:00:00.000000001
- 2 2000-01-01 00:00:00.000000002
- dtype: datetime64[ns]
- >>> datetime_series.dt.nanosecond
- 0 0
- 1 1
- 2 2
- dtype: int32
- """,
- )
- _dayofweek_doc = """
- The day of the week with Monday=0, Sunday=6.
-
- Return the day of the week. It is assumed the week starts on
- Monday, which is denoted by 0 and ends on Sunday which is denoted
- by 6. This method is available on both Series with datetime
- values (using the `dt` accessor) and DatetimeIndex.
-
- Returns
- -------
- Series or Index
- Containing integers indicating the day number.
-
- See Also
- --------
- Series.dt.dayofweek : Alias.
- Series.dt.weekday : Alias.
- Series.dt.day_name : Returns the name of the day of the week.
-
- Examples
- --------
- >>> s = pd.date_range('2016-12-31', '2017-01-08', freq='D').to_series()
- >>> s.dt.dayofweek
- 2016-12-31 5
- 2017-01-01 6
- 2017-01-02 0
- 2017-01-03 1
- 2017-01-04 2
- 2017-01-05 3
- 2017-01-06 4
- 2017-01-07 5
- 2017-01-08 6
- Freq: D, dtype: int32
- """
- day_of_week = _field_accessor("day_of_week", "dow", _dayofweek_doc)
- dayofweek = day_of_week
- weekday = day_of_week
-
- day_of_year = _field_accessor(
- "dayofyear",
- "doy",
- """
- The ordinal day of the year.
- """,
- )
- dayofyear = day_of_year
- quarter = _field_accessor(
- "quarter",
- "q",
- """
- The quarter of the date.
- """,
- )
- days_in_month = _field_accessor(
- "days_in_month",
- "dim",
- """
- The number of days in the month.
- """,
- )
- daysinmonth = days_in_month
- _is_month_doc = """
- Indicates whether the date is the {first_or_last} day of the month.
-
- Returns
- -------
- Series or array
- For Series, returns a Series with boolean values.
- For DatetimeIndex, returns a boolean array.
-
- See Also
- --------
- is_month_start : Return a boolean indicating whether the date
- is the first day of the month.
- is_month_end : Return a boolean indicating whether the date
- is the last day of the month.
-
- Examples
- --------
- This method is available on Series with datetime values under
- the ``.dt`` accessor, and directly on DatetimeIndex.
-
- >>> s = pd.Series(pd.date_range("2018-02-27", periods=3))
- >>> s
- 0 2018-02-27
- 1 2018-02-28
- 2 2018-03-01
- dtype: datetime64[ns]
- >>> s.dt.is_month_start
- 0 False
- 1 False
- 2 True
- dtype: bool
- >>> s.dt.is_month_end
- 0 False
- 1 True
- 2 False
- dtype: bool
-
- >>> idx = pd.date_range("2018-02-27", periods=3)
- >>> idx.is_month_start
- array([False, False, True])
- >>> idx.is_month_end
- array([False, True, False])
- """
- is_month_start = _field_accessor(
- "is_month_start", "is_month_start", _is_month_doc.format(first_or_last="first")
- )
-
- is_month_end = _field_accessor(
- "is_month_end", "is_month_end", _is_month_doc.format(first_or_last="last")
- )
-
- is_quarter_start = _field_accessor(
- "is_quarter_start",
- "is_quarter_start",
- """
- Indicator for whether the date is the first day of a quarter.
-
- Returns
- -------
- is_quarter_start : Series or DatetimeIndex
- The same type as the original data with boolean values. Series will
- have the same name and index. DatetimeIndex will have the same
- name.
-
- See Also
- --------
- quarter : Return the quarter of the date.
- is_quarter_end : Similar property for indicating the quarter end.
-
- Examples
- --------
- This method is available on Series with datetime values under
- the ``.dt`` accessor, and directly on DatetimeIndex.
-
- >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30",
- ... periods=4)})
- >>> df.assign(quarter=df.dates.dt.quarter,
- ... is_quarter_start=df.dates.dt.is_quarter_start)
- dates quarter is_quarter_start
- 0 2017-03-30 1 False
- 1 2017-03-31 1 False
- 2 2017-04-01 2 True
- 3 2017-04-02 2 False
-
- >>> idx = pd.date_range('2017-03-30', periods=4)
- >>> idx
- DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'],
- dtype='datetime64[ns]', freq='D')
-
- >>> idx.is_quarter_start
- array([False, False, True, False])
- """,
- )
- is_quarter_end = _field_accessor(
- "is_quarter_end",
- "is_quarter_end",
- """
- Indicator for whether the date is the last day of a quarter.
-
- Returns
- -------
- is_quarter_end : Series or DatetimeIndex
- The same type as the original data with boolean values. Series will
- have the same name and index. DatetimeIndex will have the same
- name.
-
- See Also
- --------
- quarter : Return the quarter of the date.
- is_quarter_start : Similar property indicating the quarter start.
-
- Examples
- --------
- This method is available on Series with datetime values under
- the ``.dt`` accessor, and directly on DatetimeIndex.
-
- >>> df = pd.DataFrame({'dates': pd.date_range("2017-03-30",
- ... periods=4)})
- >>> df.assign(quarter=df.dates.dt.quarter,
- ... is_quarter_end=df.dates.dt.is_quarter_end)
- dates quarter is_quarter_end
- 0 2017-03-30 1 False
- 1 2017-03-31 1 True
- 2 2017-04-01 2 False
- 3 2017-04-02 2 False
-
- >>> idx = pd.date_range('2017-03-30', periods=4)
- >>> idx
- DatetimeIndex(['2017-03-30', '2017-03-31', '2017-04-01', '2017-04-02'],
- dtype='datetime64[ns]', freq='D')
-
- >>> idx.is_quarter_end
- array([False, True, False, False])
- """,
- )
- is_year_start = _field_accessor(
- "is_year_start",
- "is_year_start",
- """
- Indicate whether the date is the first day of a year.
-
- Returns
- -------
- Series or DatetimeIndex
- The same type as the original data with boolean values. Series will
- have the same name and index. DatetimeIndex will have the same
- name.
-
- See Also
- --------
- is_year_end : Similar property indicating the last day of the year.
-
- Examples
- --------
- This method is available on Series with datetime values under
- the ``.dt`` accessor, and directly on DatetimeIndex.
-
- >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3))
- >>> dates
- 0 2017-12-30
- 1 2017-12-31
- 2 2018-01-01
- dtype: datetime64[ns]
-
- >>> dates.dt.is_year_start
- 0 False
- 1 False
- 2 True
- dtype: bool
-
- >>> idx = pd.date_range("2017-12-30", periods=3)
- >>> idx
- DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'],
- dtype='datetime64[ns]', freq='D')
-
- >>> idx.is_year_start
- array([False, False, True])
- """,
- )
- is_year_end = _field_accessor(
- "is_year_end",
- "is_year_end",
- """
- Indicate whether the date is the last day of the year.
-
- Returns
- -------
- Series or DatetimeIndex
- The same type as the original data with boolean values. Series will
- have the same name and index. DatetimeIndex will have the same
- name.
-
- See Also
- --------
- is_year_start : Similar property indicating the start of the year.
-
- Examples
- --------
- This method is available on Series with datetime values under
- the ``.dt`` accessor, and directly on DatetimeIndex.
-
- >>> dates = pd.Series(pd.date_range("2017-12-30", periods=3))
- >>> dates
- 0 2017-12-30
- 1 2017-12-31
- 2 2018-01-01
- dtype: datetime64[ns]
-
- >>> dates.dt.is_year_end
- 0 False
- 1 True
- 2 False
- dtype: bool
-
- >>> idx = pd.date_range("2017-12-30", periods=3)
- >>> idx
- DatetimeIndex(['2017-12-30', '2017-12-31', '2018-01-01'],
- dtype='datetime64[ns]', freq='D')
-
- >>> idx.is_year_end
- array([False, True, False])
- """,
- )
- is_leap_year = _field_accessor(
- "is_leap_year",
- "is_leap_year",
- """
- Boolean indicator if the date belongs to a leap year.
-
- A leap year is a year that has 366 days (instead of 365), including
- February 29th as an intercalary day.
- Leap years are years which are multiples of four with the exception
- of years divisible by 100 but not by 400.
-
- Returns
- -------
- Series or ndarray
- Booleans indicating if dates belong to a leap year.
-
- Examples
- --------
- This method is available on Series with datetime values under
- the ``.dt`` accessor, and directly on DatetimeIndex.
-
- >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="Y")
- >>> idx
- DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'],
- dtype='datetime64[ns]', freq='A-DEC')
- >>> idx.is_leap_year
- array([ True, False, False])
-
- >>> dates_series = pd.Series(idx)
- >>> dates_series
- 0 2012-12-31
- 1 2013-12-31
- 2 2014-12-31
- dtype: datetime64[ns]
- >>> dates_series.dt.is_leap_year
- 0 True
- 1 False
- 2 False
- dtype: bool
- """,
- )
-
- def to_julian_date(self) -> npt.NDArray[np.float64]:
- """
- Convert Datetime Array to float64 ndarray of Julian Dates.
- Julian date 0 is noon on January 1, 4713 BC.
- https://en.wikipedia.org/wiki/Julian_day
- """
-
- # http://mysite.verizon.net/aesir_research/date/jdalg2.htm
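- # Worked check (not part of the original source): for 2000-01-01 00:00, the
- # adjustment below gives day=1, month=13, year=1999, so the expression is
- #     1 + fix((153*13 - 457)/5) + 365*1999 + 499 - 19 + 4 + 1721118.5
- #   = 1 + 306 + 729635 + 499 - 19 + 4 + 1721118.5 = 2451544.5,
- # which matches the Julian date of midnight UTC on 2000-01-01.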
- year = np.asarray(self.year)
- month = np.asarray(self.month)
- day = np.asarray(self.day)
- testarr = month < 3
- year[testarr] -= 1
- month[testarr] += 12
- return (
- day
- + np.fix((153 * month - 457) / 5)
- + 365 * year
- + np.floor(year / 4)
- - np.floor(year / 100)
- + np.floor(year / 400)
- + 1_721_118.5
- + (
- self.hour
- + self.minute / 60
- + self.second / 3600
- + self.microsecond / 3600 / 10**6
- + self.nanosecond / 3600 / 10**9
- )
- / 24
- )
-
- # -----------------------------------------------------------------
- # Reductions
-
- def std(
- self,
- axis=None,
- dtype=None,
- out=None,
- ddof: int = 1,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- """
- Return sample standard deviation over requested axis.
-
- Normalized by N-1 by default. This can be changed using the ddof argument.
-
- Parameters
- ----------
- axis : int, optional, default None
- Axis for the function to be applied on.
- For `Series` this parameter is unused and defaults to `None`.
- ddof : int, default 1
- Degrees of Freedom. The divisor used in calculations is N - ddof,
- where N represents the number of elements.
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result will be
- NA.
-
- Returns
- -------
- Timedelta
- """
- # Because std is translation-invariant, we can get self.std
- # by calculating (self - Timestamp(0)).std, and we can do it
- # without creating a copy by using a view on self._ndarray
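- # Illustrative sketch (not part of the original source): std() is unchanged
- # by shifting every element, so the standard deviation of timestamps
- # [t0, t1, t2] equals that of the timedeltas [t0 - epoch, t1 - epoch,
- # t2 - epoch]; for three consecutive daily timestamps the result is
- # Timedelta("1 days") with the default ddof=1.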
- from pandas.core.arrays import TimedeltaArray
-
- # Find the td64 dtype with the same resolution as our dt64 dtype
- dtype_str = self._ndarray.dtype.name.replace("datetime64", "timedelta64")
- dtype = np.dtype(dtype_str)
-
- tda = TimedeltaArray._simple_new(self._ndarray.view(dtype), dtype=dtype)
-
- return tda.std(axis=axis, out=out, ddof=ddof, keepdims=keepdims, skipna=skipna)
-
-
-# -------------------------------------------------------------------
-# Constructor Helpers
-
-
-def _sequence_to_dt64ns(
- data,
- *,
- copy: bool = False,
- tz: tzinfo | None = None,
- dayfirst: bool = False,
- yearfirst: bool = False,
- ambiguous: TimeAmbiguous = "raise",
- out_unit: str | None = None,
-):
- """
- Parameters
- ----------
- data : list-like
- copy : bool, default False
- tz : tzinfo or None, default None
- dayfirst : bool, default False
- yearfirst : bool, default False
- ambiguous : str, bool, or arraylike, default 'raise'
- See pandas._libs.tslibs.tzconversion.tz_localize_to_utc.
- out_unit : str or None, default None
- Desired output resolution.
-
- Returns
- -------
- result : numpy.ndarray
- The sequence converted to a numpy array with dtype ``datetime64[ns]``.
- tz : tzinfo or None
- Either the user-provided tzinfo or one inferred from the data.
- inferred_freq : Tick or None
- The inferred frequency of the sequence.
-
- Raises
- ------
- TypeError : PeriodDtype data is passed
- """
- inferred_freq = None
-
- data, copy = dtl.ensure_arraylike_for_datetimelike(
- data, copy, cls_name="DatetimeArray"
- )
-
- if isinstance(data, DatetimeArray):
- inferred_freq = data.freq
-
- # By this point we are assured to have either a numpy array or Index
- data, copy = maybe_convert_dtype(data, copy, tz=tz)
- data_dtype = getattr(data, "dtype", None)
-
- out_dtype = DT64NS_DTYPE
- if out_unit is not None:
- out_dtype = np.dtype(f"M8[{out_unit}]")
-
- if (
- is_object_dtype(data_dtype)
- or is_string_dtype(data_dtype)
- or is_sparse(data_dtype)
- ):
- # TODO: We do not have tests specific to string-dtypes,
- # also complex or categorical or other extension
- copy = False
- if lib.infer_dtype(data, skipna=False) == "integer":
- data = data.astype(np.int64)
- elif tz is not None and ambiguous == "raise":
- # TODO: yearfirst/dayfirst/etc?
- obj_data = np.asarray(data, dtype=object)
- i8data = tslib.array_to_datetime_with_tz(obj_data, tz)
- return i8data.view(DT64NS_DTYPE), tz, None
- else:
- # data comes back here as either i8 to denote UTC timestamps
- # or M8[ns] to denote wall times
- data, inferred_tz = objects_to_datetime64ns(
- data,
- dayfirst=dayfirst,
- yearfirst=yearfirst,
- allow_object=False,
- )
- if tz and inferred_tz:
- # two timezones: convert to intended from base UTC repr
- assert data.dtype == "i8"
- # GH#42505
- # by convention, these are _already_ UTC
- return data.view(DT64NS_DTYPE), tz, None
-
- elif inferred_tz:
- tz = inferred_tz
-
- data_dtype = data.dtype
-
- # `data` may have originally been a Categorical[datetime64[ns, tz]],
- # so we need to handle these types.
- if is_datetime64tz_dtype(data_dtype):
- # DatetimeArray -> ndarray
- tz = _maybe_infer_tz(tz, data.tz)
- result = data._ndarray
-
- elif is_datetime64_dtype(data_dtype):
- # tz-naive DatetimeArray or ndarray[datetime64]
- data = getattr(data, "_ndarray", data)
- new_dtype = data.dtype
- data_unit = get_unit_from_dtype(new_dtype)
- if not is_supported_unit(data_unit):
- # Cast to the nearest supported unit, generally "s"
- new_reso = get_supported_reso(data_unit)
- new_unit = npy_unit_to_abbrev(new_reso)
- new_dtype = np.dtype(f"M8[{new_unit}]")
- data = astype_overflowsafe(data, dtype=new_dtype, copy=False)
- data_unit = get_unit_from_dtype(new_dtype)
- copy = False
-
- if data.dtype.byteorder == ">":
- # TODO: better way to handle this? non-copying alternative?
- # without this, test_constructor_datetime64_bigendian fails
- data = data.astype(data.dtype.newbyteorder("<"))
- new_dtype = data.dtype
- copy = False
-
- if tz is not None:
- # Convert tz-naive to UTC
- # TODO: if tz is UTC, are there situations where we *don't* want a
- # copy? tz_localize_to_utc always makes one.
- shape = data.shape
- if data.ndim > 1:
- data = data.ravel()
-
- data = tzconversion.tz_localize_to_utc(
- data.view("i8"), tz, ambiguous=ambiguous, creso=data_unit
- )
- data = data.view(new_dtype)
- data = data.reshape(shape)
-
- assert data.dtype == new_dtype, data.dtype
- result = data
-
- else:
- # must be integer dtype otherwise
- # assume these are epoch timestamps
- if data.dtype != INT64_DTYPE:
- data = data.astype(np.int64, copy=False)
- result = data.view(out_dtype)
-
- if copy:
- result = result.copy()
-
- assert isinstance(result, np.ndarray), type(result)
- assert result.dtype.kind == "M"
- assert result.dtype != "M8"
- assert is_supported_unit(get_unit_from_dtype(result.dtype))
- return result, tz, inferred_freq
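- # Illustrative usage (not part of the original source), assuming a plain list
- # of ISO date strings and no tz:
- #
- #     result, tz, inferred_freq = _sequence_to_dt64ns(["2000-01-01", "2000-01-02"])
- #
- # returns an M8[ns] ndarray with tz=None and inferred_freq=None; inferred_freq
- # is only populated when the input is itself a DatetimeArray (taken from .freq).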
-
-
-def objects_to_datetime64ns(
- data: np.ndarray,
- dayfirst,
- yearfirst,
- utc: bool = False,
- errors: DateTimeErrorChoices = "raise",
- allow_object: bool = False,
-):
- """
- Convert data to array of timestamps.
-
- Parameters
- ----------
- data : np.ndarray[object]
- dayfirst : bool
- yearfirst : bool
- utc : bool, default False
- Whether to convert/localize timestamps to UTC.
- errors : {'raise', 'ignore', 'coerce'}
- allow_object : bool
- Whether to return an object-dtype ndarray instead of raising if the
- data contains more than one timezone.
-
- Returns
- -------
- result : ndarray
- np.int64 dtype if returned values represent UTC timestamps
- np.datetime64[ns] if returned values represent wall times
- object if mixed timezones
- inferred_tz : tzinfo or None
-
- Raises
- ------
- ValueError : if data cannot be converted to datetimes
- """
- assert errors in ["raise", "ignore", "coerce"]
-
- # if str-dtype, convert
- data = np.array(data, copy=False, dtype=np.object_)
-
- result, tz_parsed = tslib.array_to_datetime(
- data,
- errors=errors,
- utc=utc,
- dayfirst=dayfirst,
- yearfirst=yearfirst,
- )
-
- if tz_parsed is not None:
- # We can take a shortcut since the datetime64 numpy array
- # is in UTC
- # Return i8 values to denote unix timestamps
- return result.view("i8"), tz_parsed
- elif is_datetime64_dtype(result):
- # returning M8[ns] denotes wall-times; since tz is None
- # the distinction is a thin one
- return result, tz_parsed
- elif is_object_dtype(result):
- # GH#23675 when called via `pd.to_datetime`, returning an object-dtype
- # array is allowed. When called via `pd.DatetimeIndex`, we can
- # only accept datetime64 dtype, so raise TypeError if object-dtype
- # is returned, as that indicates the values can be recognized as
- # datetimes but they have conflicting timezones/awareness
- if allow_object:
- return result, tz_parsed
- raise TypeError(result)
- else: # pragma: no cover
- # GH#23675 this TypeError should never be hit, whereas the TypeError
- # in the object-dtype branch above is reachable.
- raise TypeError(result)
-
-
-def maybe_convert_dtype(data, copy: bool, tz: tzinfo | None = None):
- """
- Convert data based on dtype conventions, issuing
- errors where appropriate.
-
- Parameters
- ----------
- data : np.ndarray or pd.Index
- copy : bool
- tz : tzinfo or None, default None
-
- Returns
- -------
- data : np.ndarray or pd.Index
- copy : bool
-
- Raises
- ------
- TypeError : PeriodDtype data is passed
- """
- if not hasattr(data, "dtype"):
- # e.g. collections.deque
- return data, copy
-
- if is_float_dtype(data.dtype):
- # pre-2.0 we treated these as wall-times, inconsistent with ints
- # GH#23675, GH#45573 deprecated to treat symmetrically with integer dtypes.
- # Note: data.astype(np.int64) fails ARM tests, see
- # https://github.com/pandas-dev/pandas/issues/49468.
- data = data.astype(DT64NS_DTYPE).view("i8")
- copy = False
-
- elif is_timedelta64_dtype(data.dtype) or is_bool_dtype(data.dtype):
- # GH#29794 enforcing deprecation introduced in GH#23539
- raise TypeError(f"dtype {data.dtype} cannot be converted to datetime64[ns]")
- elif is_period_dtype(data.dtype):
- # Note: without explicitly raising here, PeriodIndex
- # test_setops.test_join_does_not_recur fails
- raise TypeError(
- "Passing PeriodDtype data is invalid. Use `data.to_timestamp()` instead"
- )
-
- elif is_extension_array_dtype(data.dtype) and not is_datetime64tz_dtype(data.dtype):
- # TODO: We have no tests for these
- data = np.array(data, dtype=np.object_)
- copy = False
-
- return data, copy
-
-
-# -------------------------------------------------------------------
-# Validation and Inference
-
-
-def _maybe_infer_tz(tz: tzinfo | None, inferred_tz: tzinfo | None) -> tzinfo | None:
- """
- If a timezone is inferred from data, check that it is compatible with
- the user-provided timezone, if any.
-
- Parameters
- ----------
- tz : tzinfo or None
- inferred_tz : tzinfo or None
-
- Returns
- -------
- tz : tzinfo or None
-
- Raises
- ------
- TypeError : if both timezones are present but do not match
- """
- if tz is None:
- tz = inferred_tz
- elif inferred_tz is None:
- pass
- elif not timezones.tz_compare(tz, inferred_tz):
- raise TypeError(
- f"data is already tz-aware {inferred_tz}, unable to "
- f"set specified tz: {tz}"
- )
- return tz
-
-
-def _validate_dt64_dtype(dtype):
- """
- Check that a dtype, if passed, represents either a numpy datetime64[ns]
- dtype or a pandas DatetimeTZDtype.
-
- Parameters
- ----------
- dtype : object
-
- Returns
- -------
- dtype : None, numpy.dtype, or DatetimeTZDtype
-
- Raises
- ------
- ValueError : invalid dtype
-
- Notes
- -----
- Unlike _validate_tz_from_dtype, this does _not_ allow non-existent
- tz errors to go through
- """
- if dtype is not None:
- dtype = pandas_dtype(dtype)
- if is_dtype_equal(dtype, np.dtype("M8")):
- # no precision, disallowed GH#24806
- msg = (
- "Passing in 'datetime64' dtype with no precision is not allowed. "
- "Please pass in 'datetime64[ns]' instead."
- )
- raise ValueError(msg)
-
- if (
- isinstance(dtype, np.dtype)
- and (dtype.kind != "M" or not is_supported_unit(get_unit_from_dtype(dtype)))
- ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)):
- raise ValueError(
- f"Unexpected value for 'dtype': '{dtype}'. "
- "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', "
- "'datetime64[ns]' or DatetimeTZDtype'."
- )
-
- if getattr(dtype, "tz", None):
- # https://github.com/pandas-dev/pandas/issues/18595
- # Ensure that we have a standard timezone for pytz objects.
- # Without this, things like adding an array of timedeltas and
- # a tz-aware Timestamp (with a tz specific to its datetime) will
- # be incorrect(ish?) for the array as a whole
- dtype = cast(DatetimeTZDtype, dtype)
- dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))
-
- return dtype
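- # Illustrative behaviour (not part of the original source) of the validator
- # above:
- #
- #     _validate_dt64_dtype("datetime64[ns]")       # -> np.dtype("<M8[ns]")
- #     _validate_dt64_dtype("datetime64[ns, UTC]")  # -> DatetimeTZDtype(unit="ns", tz="UTC")
- #     _validate_dt64_dtype("datetime64")           # -> ValueError (no precision)
- #     _validate_dt64_dtype(None)                   # -> None (dtype is optional)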
-
-
-def _validate_tz_from_dtype(
- dtype, tz: tzinfo | None, explicit_tz_none: bool = False
-) -> tzinfo | None:
- """
- If the given dtype is a DatetimeTZDtype, extract the implied
- tzinfo object from it and check that it does not conflict with the given
- tz.
-
- Parameters
- ----------
- dtype : dtype, str
- tz : None, tzinfo
- explicit_tz_none : bool, default False
- Whether tz=None was passed explicitly, as opposed to lib.no_default.
-
- Returns
- -------
- tz : consensus tzinfo
-
- Raises
- ------
- ValueError : on tzinfo mismatch
- """
- if dtype is not None:
- if isinstance(dtype, str):
- try:
- dtype = DatetimeTZDtype.construct_from_string(dtype)
- except TypeError:
- # Things like `datetime64[ns]`, which is OK for the
- # constructors, or nonsense strings, which should be validated
- # elsewhere, not by us. We *do* allow non-existent tz errors to
- # go through
- pass
- dtz = getattr(dtype, "tz", None)
- if dtz is not None:
- if tz is not None and not timezones.tz_compare(tz, dtz):
- raise ValueError("cannot supply both a tz and a dtype with a tz")
- if explicit_tz_none:
- raise ValueError("Cannot pass both a timezone-aware dtype and tz=None")
- tz = dtz
-
- if tz is not None and is_datetime64_dtype(dtype):
- # We also need to check for the case where the user passed a
- # tz-naive dtype (i.e. datetime64[ns])
- if tz is not None and not timezones.tz_compare(tz, dtz):
- raise ValueError(
- "cannot supply both a tz and a "
- "timezone-naive dtype (i.e. datetime64[ns])"
- )
-
- return tz
-
-
-def _infer_tz_from_endpoints(
- start: Timestamp, end: Timestamp, tz: tzinfo | None
-) -> tzinfo | None:
- """
- If a timezone is not explicitly given via `tz`, see if one can
- be inferred from the `start` and `end` endpoints. If more than one
- of these inputs provides a timezone, require that they all agree.
-
- Parameters
- ----------
- start : Timestamp
- end : Timestamp
- tz : tzinfo or None
-
- Returns
- -------
- tz : tzinfo or None
-
- Raises
- ------
- TypeError : if start and end timezones do not agree
- """
- try:
- inferred_tz = timezones.infer_tzinfo(start, end)
- except AssertionError as err:
- # infer_tzinfo raises AssertionError if passed mismatched timezones
- raise TypeError(
- "Start and end cannot both be tz-aware with different timezones"
- ) from err
-
- inferred_tz = timezones.maybe_get_tz(inferred_tz)
- tz = timezones.maybe_get_tz(tz)
-
- if tz is not None and inferred_tz is not None:
- if not timezones.tz_compare(inferred_tz, tz):
- raise AssertionError("Inferred time zone not equal to passed time zone")
-
- elif inferred_tz is not None:
- tz = inferred_tz
-
- return tz
-
-
-def _maybe_normalize_endpoints(
- start: Timestamp | None, end: Timestamp | None, normalize: bool
-):
- if normalize:
- if start is not None:
- start = start.normalize()
-
- if end is not None:
- end = end.normalize()
-
- return start, end
-
-
-def _maybe_localize_point(ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent):
- """
- Localize a start or end Timestamp to the timezone of the corresponding
- start or end Timestamp
-
- Parameters
- ----------
- ts : start or end Timestamp to potentially localize
- is_none : argument that should be None
- is_not_none : argument that should not be None
- freq : Tick, DateOffset, or None
- tz : str, timezone object or None
- ambiguous : str, localization behavior for ambiguous times
- nonexistent : str, localization behavior for nonexistent times
-
- Returns
- -------
- ts : Timestamp
- """
- # Make sure start and end are timezone localized if:
- # 1) freq = a Timedelta-like frequency (Tick)
- # 2) freq = None i.e. generating a linspaced range
- if is_none is None and is_not_none is not None:
- # Note: We can't use ambiguous='infer' for a singular ambiguous time; however,
- # we have historically defaulted ambiguous=False
- ambiguous = ambiguous if ambiguous != "infer" else False
- localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None}
- if isinstance(freq, Tick) or freq is None:
- localize_args["tz"] = tz
- ts = ts.tz_localize(**localize_args)
- return ts
-
-
-def _generate_range(
- start: Timestamp | None,
- end: Timestamp | None,
- periods: int | None,
- offset: BaseOffset,
- *,
- unit: str,
-):
- """
- Generates a sequence of dates corresponding to the specified time
- offset. Similar to dateutil.rrule except uses pandas DateOffset
- objects to represent time increments.
-
- Parameters
- ----------
- start : Timestamp or None
- end : Timestamp or None
- periods : int or None
- offset : DateOffset
- unit : str
-
- Notes
- -----
- * This method is faster for generating weekdays than dateutil.rrule
- * At least two of (start, end, periods) must be specified.
- * If both start and end are specified, the returned dates will
- satisfy start <= date <= end.
-
- Returns
- -------
- dates : generator object
- """
- offset = to_offset(offset)
-
- # Argument 1 to "Timestamp" has incompatible type "Optional[Timestamp]";
- # expected "Union[integer[Any], float, str, date, datetime64]"
- start = Timestamp(start) # type: ignore[arg-type]
- if start is not NaT:
- start = start.as_unit(unit)
- else:
- start = None
-
- # Argument 1 to "Timestamp" has incompatible type "Optional[Timestamp]";
- # expected "Union[integer[Any], float, str, date, datetime64]"
- end = Timestamp(end) # type: ignore[arg-type]
- if end is not NaT:
- end = end.as_unit(unit)
- else:
- end = None
-
- if start and not offset.is_on_offset(start):
- # Incompatible types in assignment (expression has type "datetime",
- # variable has type "Optional[Timestamp]")
- start = offset.rollforward(start) # type: ignore[assignment]
-
- elif end and not offset.is_on_offset(end):
- # Incompatible types in assignment (expression has type "datetime",
- # variable has type "Optional[Timestamp]")
- end = offset.rollback(end) # type: ignore[assignment]
-
- # Unsupported operand types for < ("Timestamp" and "None")
- if periods is None and end < start and offset.n >= 0: # type: ignore[operator]
- end = None
- periods = 0
-
- if end is None:
- # error: No overload variant of "__radd__" of "BaseOffset" matches
- # argument type "None"
- end = start + (periods - 1) * offset # type: ignore[operator]
-
- if start is None:
- # error: No overload variant of "__radd__" of "BaseOffset" matches
- # argument type "None"
- start = end - (periods - 1) * offset # type: ignore[operator]
-
- start = cast(Timestamp, start)
- end = cast(Timestamp, end)
-
- cur = start
- if offset.n >= 0:
- while cur <= end:
- yield cur
-
- if cur == end:
- # GH#24252 avoid overflows by not performing the addition
- # in offset.apply unless we have to
- break
-
- # faster than cur + offset
- next_date = offset._apply(cur).as_unit(unit)
- if next_date <= cur:
- raise ValueError(f"Offset {offset} did not increment date")
- cur = next_date
- else:
- while cur >= end:
- yield cur
-
- if cur == end:
- # GH#24252 avoid overflows by not performing the addition
- # in offset.apply unless we have to
- break
-
- # faster than cur + offset
- next_date = offset._apply(cur).as_unit(unit)
- if next_date >= cur:
- raise ValueError(f"Offset {offset} did not decrement date")
- cur = next_date
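
The private helpers deleted above are the machinery behind ``pd.date_range``: timezone inference from the endpoints, endpoint normalization, and the generator that rolls the start onto the offset and walks forward. A minimal sketch of the observable behaviour, assuming a stock pandas install rather than this vendored copy:

import pandas as pd

# The start (a Saturday) is rolled forward onto the W-MON offset, then dates
# are yielded while they stay <= end, mirroring _generate_range above.
idx = pd.date_range(start="2024-01-06", end="2024-01-20", freq="W-MON")
print(list(idx.strftime("%Y-%m-%d")))  # ['2024-01-08', '2024-01-15']

# Timezone inference from the endpoints (_infer_tz_from_endpoints): a tz-aware
# start propagates its timezone to the whole range.
rng = pd.date_range(start=pd.Timestamp("2024-01-01", tz="UTC"), periods=2, freq="D")
print(rng.tz)  # UTC
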
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/floating.py b/contrib/python/pandas/py3/pandas/core/arrays/floating.py
deleted file mode 100644
index e08e99f7eab..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/floating.py
+++ /dev/null
@@ -1,159 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas.core.dtypes.base import register_extension_dtype
-from pandas.core.dtypes.common import is_float_dtype
-
-from pandas.core.arrays.numeric import (
- NumericArray,
- NumericDtype,
-)
-
-
-class FloatingDtype(NumericDtype):
- """
- An ExtensionDtype to hold a single size of floating dtype.
-
- These specific implementations are subclasses of the non-public
- FloatingDtype. For example we have Float32Dtype to represent float32.
-
- The attributes name & type are set when these subclasses are created.
- """
-
- _default_np_dtype = np.dtype(np.float64)
- _checker = is_float_dtype
-
- @classmethod
- def construct_array_type(cls) -> type[FloatingArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- return FloatingArray
-
- @classmethod
- def _str_to_dtype_mapping(cls):
- return FLOAT_STR_TO_DTYPE
-
- @classmethod
- def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
- """
- Safely cast the values to the given dtype.
-
- "safe" in this context means the casting is lossless.
- """
- # This is really only here for compatibility with IntegerDtype
- # This is really only here for compatibility with IntegerDtype
-
-
-class FloatingArray(NumericArray):
- """
- Array of floating (optional missing) values.
-
- .. versionadded:: 1.2.0
-
- .. warning::
-
- FloatingArray is currently experimental, and its API or internal
- implementation may change without warning. Especially the behaviour
- regarding NaN (distinct from NA missing values) is subject to change.
-
- We represent a FloatingArray with 2 numpy arrays:
-
- - data: contains a numpy float array of the appropriate dtype
- - mask: a boolean array holding a mask on the data, True is missing
-
- To construct a FloatingArray from generic array-like input, use
- :func:`pandas.array` with one of the float dtypes (see examples).
-
- See :ref:`integer_na` for more.
-
- Parameters
- ----------
- values : numpy.ndarray
- A 1-d float-dtype array.
- mask : numpy.ndarray
- A 1-d boolean-dtype array indicating missing values.
- copy : bool, default False
- Whether to copy the `values` and `mask`.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- Returns
- -------
- FloatingArray
-
- Examples
- --------
- Create a FloatingArray with :func:`pandas.array`:
-
- >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype())
- <FloatingArray>
- [0.1, <NA>, 0.3]
- Length: 3, dtype: Float32
-
- String aliases for the dtypes are also available. They are capitalized.
-
- >>> pd.array([0.1, None, 0.3], dtype="Float32")
- <FloatingArray>
- [0.1, <NA>, 0.3]
- Length: 3, dtype: Float32
- """
-
- _dtype_cls = FloatingDtype
-
- # The value used to fill '_data' to avoid upcasting
- _internal_fill_value = np.nan
- # Fill values used for any/all
- # Incompatible types in assignment (expression has type "float", base class
- # "BaseMaskedArray" defined the type as "<typing special form>")
- _truthy_value = 1.0 # type: ignore[assignment]
- _falsey_value = 0.0 # type: ignore[assignment]
-
-
-_dtype_docstring = """
-An ExtensionDtype for {dtype} data.
-
-This dtype uses ``pd.NA`` as missing value indicator.
-
-Attributes
-----------
-None
-
-Methods
--------
-None
-"""
-
-# create the Dtype
-
-
-@register_extension_dtype
-class Float32Dtype(FloatingDtype):
- type = np.float32
- name = "Float32"
- __doc__ = _dtype_docstring.format(dtype="float32")
-
-
-@register_extension_dtype
-class Float64Dtype(FloatingDtype):
- type = np.float64
- name = "Float64"
- __doc__ = _dtype_docstring.format(dtype="float64")
-
-
-FLOAT_STR_TO_DTYPE = {
- "float32": Float32Dtype(),
- "float64": Float64Dtype(),
-}
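
For reference, a minimal usage sketch of the masked float dtypes removed above, again assuming a stock pandas install:

import numpy as np
import pandas as pd

arr = pd.array([0.1, None, 0.3], dtype="Float64")      # FloatingArray backed by data + mask
print(arr.isna())                                       # [False  True False]
print(arr.to_numpy(dtype="float64", na_value=np.nan))   # [0.1 nan 0.3]
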
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/integer.py b/contrib/python/pandas/py3/pandas/core/arrays/integer.py
deleted file mode 100644
index 3b35a2b4e47..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/integer.py
+++ /dev/null
@@ -1,220 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas.core.dtypes.base import register_extension_dtype
-from pandas.core.dtypes.common import is_integer_dtype
-
-from pandas.core.arrays.numeric import (
- NumericArray,
- NumericDtype,
-)
-
-
-class IntegerDtype(NumericDtype):
- """
- An ExtensionDtype to hold a single size & kind of integer dtype.
-
- These specific implementations are subclasses of the non-public
- IntegerDtype. For example, we have Int8Dtype to represent signed 8-bit integers.
-
- The attributes name & type are set when these subclasses are created.
- """
-
- _default_np_dtype = np.dtype(np.int64)
- _checker = is_integer_dtype
-
- @classmethod
- def construct_array_type(cls) -> type[IntegerArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- return IntegerArray
-
- @classmethod
- def _str_to_dtype_mapping(cls):
- return INT_STR_TO_DTYPE
-
- @classmethod
- def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
- """
- Safely cast the values to the given dtype.
-
- "safe" in this context means the casting is lossless. e.g. if 'values'
- has a floating dtype, each value must be an integer.
- """
- try:
- return values.astype(dtype, casting="safe", copy=copy)
- except TypeError as err:
- casted = values.astype(dtype, copy=copy)
- if (casted == values).all():
- return casted
-
- raise TypeError(
- f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
- ) from err
-
-
-class IntegerArray(NumericArray):
- """
- Array of integer (optional missing) values.
-
- Uses :attr:`pandas.NA` as the missing value.
-
- .. warning::
-
- IntegerArray is currently experimental, and its API or internal
- implementation may change without warning.
-
- We represent an IntegerArray with 2 numpy arrays:
-
- - data: contains a numpy integer array of the appropriate dtype
- - mask: a boolean array holding a mask on the data, True is missing
-
- To construct an IntegerArray from generic array-like input, use
- :func:`pandas.array` with one of the integer dtypes (see examples).
-
- See :ref:`integer_na` for more.
-
- Parameters
- ----------
- values : numpy.ndarray
- A 1-d integer-dtype array.
- mask : numpy.ndarray
- A 1-d boolean-dtype array indicating missing values.
- copy : bool, default False
- Whether to copy the `values` and `mask`.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- Returns
- -------
- IntegerArray
-
- Examples
- --------
- Create an IntegerArray with :func:`pandas.array`.
-
- >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
- >>> int_array
- <IntegerArray>
- [1, <NA>, 3]
- Length: 3, dtype: Int32
-
- String aliases for the dtypes are also available. They are capitalized.
-
- >>> pd.array([1, None, 3], dtype='Int32')
- <IntegerArray>
- [1, <NA>, 3]
- Length: 3, dtype: Int32
-
- >>> pd.array([1, None, 3], dtype='UInt16')
- <IntegerArray>
- [1, <NA>, 3]
- Length: 3, dtype: UInt16
- """
-
- _dtype_cls = IntegerDtype
-
- # The value used to fill '_data' to avoid upcasting
- _internal_fill_value = 1
- # Fill values used for any/all
- # Incompatible types in assignment (expression has type "int", base class
- # "BaseMaskedArray" defined the type as "<typing special form>")
- _truthy_value = 1 # type: ignore[assignment]
- _falsey_value = 0 # type: ignore[assignment]
-
-
-_dtype_docstring = """
-An ExtensionDtype for {dtype} integer data.
-
-Uses :attr:`pandas.NA` as its missing value, rather than :attr:`numpy.nan`.
-
-Attributes
-----------
-None
-
-Methods
--------
-None
-"""
-
-# create the Dtype
-
-
-@register_extension_dtype
-class Int8Dtype(IntegerDtype):
- type = np.int8
- name = "Int8"
- __doc__ = _dtype_docstring.format(dtype="int8")
-
-
-@register_extension_dtype
-class Int16Dtype(IntegerDtype):
- type = np.int16
- name = "Int16"
- __doc__ = _dtype_docstring.format(dtype="int16")
-
-
-@register_extension_dtype
-class Int32Dtype(IntegerDtype):
- type = np.int32
- name = "Int32"
- __doc__ = _dtype_docstring.format(dtype="int32")
-
-
-@register_extension_dtype
-class Int64Dtype(IntegerDtype):
- type = np.int64
- name = "Int64"
- __doc__ = _dtype_docstring.format(dtype="int64")
-
-
-@register_extension_dtype
-class UInt8Dtype(IntegerDtype):
- type = np.uint8
- name = "UInt8"
- __doc__ = _dtype_docstring.format(dtype="uint8")
-
-
-@register_extension_dtype
-class UInt16Dtype(IntegerDtype):
- type = np.uint16
- name = "UInt16"
- __doc__ = _dtype_docstring.format(dtype="uint16")
-
-
-@register_extension_dtype
-class UInt32Dtype(IntegerDtype):
- type = np.uint32
- name = "UInt32"
- __doc__ = _dtype_docstring.format(dtype="uint32")
-
-
-@register_extension_dtype
-class UInt64Dtype(IntegerDtype):
- type = np.uint64
- name = "UInt64"
- __doc__ = _dtype_docstring.format(dtype="uint64")
-
-
-INT_STR_TO_DTYPE: dict[str, IntegerDtype] = {
- "int8": Int8Dtype(),
- "int16": Int16Dtype(),
- "int32": Int32Dtype(),
- "int64": Int64Dtype(),
- "uint8": UInt8Dtype(),
- "uint16": UInt16Dtype(),
- "uint32": UInt32Dtype(),
- "uint64": UInt64Dtype(),
-}
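
The ``_safe_cast`` hook above is what lets ``pd.array`` accept lossless float input for the nullable integer dtypes while rejecting lossy values; a short sketch, stock pandas assumed:

import pandas as pd

print(pd.array([1.0, 2.0, None], dtype="Int64"))  # [1, 2, <NA>] -- lossless cast accepted
try:
    pd.array([1.5, 2.0], dtype="Int64")
except (TypeError, ValueError) as err:            # "cannot safely cast non-equivalent ..."
    print(err)
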
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/interval.py b/contrib/python/pandas/py3/pandas/core/arrays/interval.py
deleted file mode 100644
index 2d63af5dc86..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/interval.py
+++ /dev/null
@@ -1,1796 +0,0 @@
-from __future__ import annotations
-
-import operator
-from operator import (
- le,
- lt,
-)
-import textwrap
-from typing import (
- TYPE_CHECKING,
- Iterator,
- Literal,
- Sequence,
- TypeVar,
- Union,
- cast,
- overload,
-)
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import lib
-from pandas._libs.interval import (
- VALID_CLOSED,
- Interval,
- IntervalMixin,
- intervals_to_interval_bounds,
-)
-from pandas._libs.missing import NA
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- Dtype,
- IntervalClosedType,
- NpDtype,
- PositionalIndexer,
- ScalarIndexer,
- SequenceIndexer,
- SortKind,
- TimeArrayLike,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import IntCastingNaNError
-from pandas.util._decorators import Appender
-
-from pandas.core.dtypes.cast import (
- LossySetitemError,
- maybe_upcast_numeric_to_64bit,
-)
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_dtype_equal,
- is_float_dtype,
- is_integer_dtype,
- is_interval_dtype,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- needs_i8_conversion,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import IntervalDtype
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCDatetimeIndex,
- ABCIntervalIndex,
- ABCPeriodIndex,
-)
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
- notna,
-)
-
-from pandas.core.algorithms import (
- isin,
- take,
- unique,
- value_counts,
-)
-from pandas.core.arrays.base import (
- ExtensionArray,
- _extension_array_shared_docs,
-)
-from pandas.core.arrays.datetimes import DatetimeArray
-from pandas.core.arrays.timedeltas import TimedeltaArray
-import pandas.core.common as com
-from pandas.core.construction import (
- array as pd_array,
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
-from pandas.core.indexers import check_array_indexer
-from pandas.core.ops import (
- invalid_comparison,
- unpack_zerodim_and_defer,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- Index,
- Series,
- )
-
-
-IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray")
-IntervalSideT = Union[TimeArrayLike, np.ndarray]
-IntervalOrNA = Union[Interval, float]
-
-_interval_shared_docs: dict[str, str] = {}
-
-_shared_docs_kwargs = {
- "klass": "IntervalArray",
- "qualname": "arrays.IntervalArray",
- "name": "",
-}
-
-
-_interval_shared_docs[
- "class"
-] = """
-%(summary)s
-
-.. versionadded:: %(versionadded)s
-
-Parameters
-----------
-data : array-like (1-dimensional)
- Array-like (ndarray, :class:`DatetimeArray`, :class:`TimedeltaArray`) containing
- Interval objects from which to build the %(klass)s.
-closed : {'left', 'right', 'both', 'neither'}, default 'right'
- Whether the intervals are closed on the left-side, right-side, both or
- neither.
-dtype : dtype or None, default None
- If None, dtype will be inferred.
-copy : bool, default False
- Copy the input data.
-%(name)s\
-verify_integrity : bool, default True
- Verify that the %(klass)s is valid.
-
-Attributes
-----------
-left
-right
-closed
-mid
-length
-is_empty
-is_non_overlapping_monotonic
-%(extra_attributes)s\
-
-Methods
--------
-from_arrays
-from_tuples
-from_breaks
-contains
-overlaps
-set_closed
-to_tuples
-%(extra_methods)s\
-
-See Also
---------
-Index : The base pandas Index type.
-Interval : A bounded slice-like interval; the elements of an %(klass)s.
-interval_range : Function to create a fixed frequency IntervalIndex.
-cut : Bin values into discrete Intervals.
-qcut : Bin values into equal-sized Intervals based on rank or sample quantiles.
-
-Notes
------
-See the `user guide
-<https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#intervalindex>`__
-for more.
-
-%(examples)s\
-"""
-
-
-@Appender(
- _interval_shared_docs["class"]
- % {
- "klass": "IntervalArray",
- "summary": "Pandas array for interval data that are closed on the same side.",
- "versionadded": "0.24.0",
- "name": "",
- "extra_attributes": "",
- "extra_methods": "",
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- A new ``IntervalArray`` can be constructed directly from an array-like of
- ``Interval`` objects:
-
- >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])
- <IntervalArray>
- [(0, 1], (1, 5]]
- Length: 2, dtype: interval[int64, right]
-
- It may also be constructed using one of the constructor
- methods: :meth:`IntervalArray.from_arrays`,
- :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`.
- """
- ),
- }
-)
-class IntervalArray(IntervalMixin, ExtensionArray):
- can_hold_na = True
- _na_value = _fill_value = np.nan
-
- @property
- def ndim(self) -> Literal[1]:
- return 1
-
- # To make mypy recognize the fields
- _left: IntervalSideT
- _right: IntervalSideT
- _dtype: IntervalDtype
-
- # ---------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls: type[IntervalArrayT],
- data,
- closed=None,
- dtype: Dtype | None = None,
- copy: bool = False,
- verify_integrity: bool = True,
- ):
- data = extract_array(data, extract_numpy=True)
-
- if isinstance(data, cls):
- left: IntervalSideT = data._left
- right: IntervalSideT = data._right
- closed = closed or data.closed
- dtype = IntervalDtype(left.dtype, closed=closed)
- else:
- # don't allow scalars
- if is_scalar(data):
- msg = (
- f"{cls.__name__}(...) must be called with a collection "
- f"of some kind, {data} was passed"
- )
- raise TypeError(msg)
-
- # might need to convert empty or purely na data
- data = _maybe_convert_platform_interval(data)
- left, right, infer_closed = intervals_to_interval_bounds(
- data, validate_closed=closed is None
- )
- if left.dtype == object:
- left = lib.maybe_convert_objects(left)
- right = lib.maybe_convert_objects(right)
- closed = closed or infer_closed
-
- left, right, dtype = cls._ensure_simple_new_inputs(
- left,
- right,
- closed=closed,
- copy=copy,
- dtype=dtype,
- )
-
- if verify_integrity:
- cls._validate(left, right, dtype=dtype)
-
- return cls._simple_new(
- left,
- right,
- dtype=dtype,
- )
-
- @classmethod
- def _simple_new(
- cls: type[IntervalArrayT],
- left: IntervalSideT,
- right: IntervalSideT,
- dtype: IntervalDtype,
- ) -> IntervalArrayT:
- result = IntervalMixin.__new__(cls)
- result._left = left
- result._right = right
- result._dtype = dtype
-
- return result
-
- @classmethod
- def _ensure_simple_new_inputs(
- cls,
- left,
- right,
- closed: IntervalClosedType | None = None,
- copy: bool = False,
- dtype: Dtype | None = None,
- ) -> tuple[IntervalSideT, IntervalSideT, IntervalDtype]:
- """Ensure correctness of input parameters for cls._simple_new."""
- from pandas.core.indexes.base import ensure_index
-
- left = ensure_index(left, copy=copy)
- left = maybe_upcast_numeric_to_64bit(left)
-
- right = ensure_index(right, copy=copy)
- right = maybe_upcast_numeric_to_64bit(right)
-
- if closed is None and isinstance(dtype, IntervalDtype):
- closed = dtype.closed
-
- closed = closed or "right"
-
- if dtype is not None:
- # GH 19262: dtype must be an IntervalDtype to override inferred
- dtype = pandas_dtype(dtype)
- if is_interval_dtype(dtype):
- dtype = cast(IntervalDtype, dtype)
- if dtype.subtype is not None:
- left = left.astype(dtype.subtype)
- right = right.astype(dtype.subtype)
- else:
- msg = f"dtype must be an IntervalDtype, got {dtype}"
- raise TypeError(msg)
-
- if dtype.closed is None:
- # possibly loading an old pickle
- dtype = IntervalDtype(dtype.subtype, closed)
- elif closed != dtype.closed:
- raise ValueError("closed keyword does not match dtype.closed")
-
- # coerce dtypes to match if needed
- if is_float_dtype(left) and is_integer_dtype(right):
- right = right.astype(left.dtype)
- elif is_float_dtype(right) and is_integer_dtype(left):
- left = left.astype(right.dtype)
-
- if type(left) != type(right):
- msg = (
- f"must not have differing left [{type(left).__name__}] and "
- f"right [{type(right).__name__}] types"
- )
- raise ValueError(msg)
- if is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
- # GH 19016
- msg = (
- "category, object, and string subtypes are not supported "
- "for IntervalArray"
- )
- raise TypeError(msg)
- if isinstance(left, ABCPeriodIndex):
- msg = "Period dtypes are not supported, use a PeriodIndex instead"
- raise ValueError(msg)
- if isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz):
- msg = (
- "left and right must have the same time zone, got "
- f"'{left.tz}' and '{right.tz}'"
- )
- raise ValueError(msg)
-
- # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray
- left = ensure_wrapped_if_datetimelike(left)
- left = extract_array(left, extract_numpy=True)
- right = ensure_wrapped_if_datetimelike(right)
- right = extract_array(right, extract_numpy=True)
-
- lbase = getattr(left, "_ndarray", left).base
- rbase = getattr(right, "_ndarray", right).base
- if lbase is not None and lbase is rbase:
- # If these share data, then setitem could corrupt our IA
- right = right.copy()
-
- dtype = IntervalDtype(left.dtype, closed=closed)
-
- return left, right, dtype
-
- @classmethod
- def _from_sequence(
- cls: type[IntervalArrayT],
- scalars,
- *,
- dtype: Dtype | None = None,
- copy: bool = False,
- ) -> IntervalArrayT:
- return cls(scalars, dtype=dtype, copy=copy)
-
- @classmethod
- def _from_factorized(
- cls: type[IntervalArrayT], values: np.ndarray, original: IntervalArrayT
- ) -> IntervalArrayT:
- if len(values) == 0:
- # An empty array returns object-dtype here. We can't create
- # a new IA from an (empty) object-dtype array, so turn it into the
- # correct dtype.
- values = values.astype(original.dtype.subtype)
- return cls(values, closed=original.closed)
-
- _interval_shared_docs["from_breaks"] = textwrap.dedent(
- """
- Construct an %(klass)s from an array of splits.
-
- Parameters
- ----------
- breaks : array-like (1-dimensional)
- Left and right bounds for each interval.
- closed : {'left', 'right', 'both', 'neither'}, default 'right'
- Whether the intervals are closed on the left-side, right-side, both
- or neither.\
- %(name)s
- copy : bool, default False
- Copy the data.
- dtype : dtype or None, default None
- If None, dtype will be inferred.
-
- Returns
- -------
- %(klass)s
-
- See Also
- --------
- interval_range : Function to create a fixed frequency IntervalIndex.
- %(klass)s.from_arrays : Construct from a left and right array.
- %(klass)s.from_tuples : Construct from a sequence of tuples.
-
- %(examples)s\
- """
- )
-
- @classmethod
- @Appender(
- _interval_shared_docs["from_breaks"]
- % {
- "klass": "IntervalArray",
- "name": "",
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3])
- <IntervalArray>
- [(0, 1], (1, 2], (2, 3]]
- Length: 3, dtype: interval[int64, right]
- """
- ),
- }
- )
- def from_breaks(
- cls: type[IntervalArrayT],
- breaks,
- closed: IntervalClosedType | None = "right",
- copy: bool = False,
- dtype: Dtype | None = None,
- ) -> IntervalArrayT:
- breaks = _maybe_convert_platform_interval(breaks)
-
- return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype)
-
- _interval_shared_docs["from_arrays"] = textwrap.dedent(
- """
- Construct from two arrays defining the left and right bounds.
-
- Parameters
- ----------
- left : array-like (1-dimensional)
- Left bounds for each interval.
- right : array-like (1-dimensional)
- Right bounds for each interval.
- closed : {'left', 'right', 'both', 'neither'}, default 'right'
- Whether the intervals are closed on the left-side, right-side, both
- or neither.\
- %(name)s
- copy : bool, default False
- Copy the data.
- dtype : dtype, optional
- If None, dtype will be inferred.
-
- Returns
- -------
- %(klass)s
-
- Raises
- ------
- ValueError
- When a value is missing in only one of `left` or `right`.
- When a value in `left` is greater than the corresponding value
- in `right`.
-
- See Also
- --------
- interval_range : Function to create a fixed frequency IntervalIndex.
- %(klass)s.from_breaks : Construct an %(klass)s from an array of
- splits.
- %(klass)s.from_tuples : Construct an %(klass)s from an
- array-like of tuples.
-
- Notes
- -----
- Each element of `left` must be less than or equal to the `right`
- element at the same position. If an element is missing, it must be
- missing in both `left` and `right`. A TypeError is raised when
- using an unsupported type for `left` or `right`. At the moment,
- 'category', 'object', and 'string' subtypes are not supported.
-
- %(examples)s\
- """
- )
-
- @classmethod
- @Appender(
- _interval_shared_docs["from_arrays"]
- % {
- "klass": "IntervalArray",
- "name": "",
- "examples": textwrap.dedent(
- """\
- >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3])
- <IntervalArray>
- [(0, 1], (1, 2], (2, 3]]
- Length: 3, dtype: interval[int64, right]
- """
- ),
- }
- )
- def from_arrays(
- cls: type[IntervalArrayT],
- left,
- right,
- closed: IntervalClosedType | None = "right",
- copy: bool = False,
- dtype: Dtype | None = None,
- ) -> IntervalArrayT:
- left = _maybe_convert_platform_interval(left)
- right = _maybe_convert_platform_interval(right)
-
- left, right, dtype = cls._ensure_simple_new_inputs(
- left,
- right,
- closed=closed,
- copy=copy,
- dtype=dtype,
- )
- cls._validate(left, right, dtype=dtype)
-
- return cls._simple_new(left, right, dtype=dtype)
-
- _interval_shared_docs["from_tuples"] = textwrap.dedent(
- """
- Construct an %(klass)s from an array-like of tuples.
-
- Parameters
- ----------
- data : array-like (1-dimensional)
- Array of tuples.
- closed : {'left', 'right', 'both', 'neither'}, default 'right'
- Whether the intervals are closed on the left-side, right-side, both
- or neither.\
- %(name)s
- copy : bool, default False
- By default copy the data; this keyword exists only for compatibility and is ignored.
- dtype : dtype or None, default None
- If None, dtype will be inferred.
-
- Returns
- -------
- %(klass)s
-
- See Also
- --------
- interval_range : Function to create a fixed frequency IntervalIndex.
- %(klass)s.from_arrays : Construct an %(klass)s from a left and
- right array.
- %(klass)s.from_breaks : Construct an %(klass)s from an array of
- splits.
-
- %(examples)s\
- """
- )
-
- @classmethod
- @Appender(
- _interval_shared_docs["from_tuples"]
- % {
- "klass": "IntervalArray",
- "name": "",
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)])
- <IntervalArray>
- [(0, 1], (1, 2]]
- Length: 2, dtype: interval[int64, right]
- """
- ),
- }
- )
- def from_tuples(
- cls: type[IntervalArrayT],
- data,
- closed: IntervalClosedType | None = "right",
- copy: bool = False,
- dtype: Dtype | None = None,
- ) -> IntervalArrayT:
- if len(data):
- left, right = [], []
- else:
- # ensure that empty data keeps input dtype
- left = right = data
-
- for d in data:
- if not isinstance(d, tuple) and isna(d):
- lhs = rhs = np.nan
- else:
- name = cls.__name__
- try:
- # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...]
- lhs, rhs = d
- except ValueError as err:
- msg = f"{name}.from_tuples requires tuples of length 2, got {d}"
- raise ValueError(msg) from err
- except TypeError as err:
- msg = f"{name}.from_tuples received an invalid item, {d}"
- raise TypeError(msg) from err
- left.append(lhs)
- right.append(rhs)
-
- return cls.from_arrays(left, right, closed, copy=False, dtype=dtype)
-
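
The three constructors above all funnel into ``from_arrays``: ``from_breaks`` passes ``breaks[:-1]``/``breaks[1:]`` and ``from_tuples`` unzips the pairs first. A small equivalence sketch, stock pandas assumed:

import pandas as pd

breaks = [0, 1, 2, 3]
a = pd.arrays.IntervalArray.from_breaks(breaks)
b = pd.arrays.IntervalArray.from_arrays(breaks[:-1], breaks[1:])
c = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (2, 3)])
print(a.equals(b) and b.equals(c))  # True
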
- @classmethod
- def _validate(cls, left, right, dtype: IntervalDtype) -> None:
- """
- Verify that the IntervalArray is valid.
-
- Checks that
-
- * dtype is correct
- * left and right match lengths
- * left and right have the same missing values
- * left is always below right
- """
- if not isinstance(dtype, IntervalDtype):
- msg = f"invalid dtype: {dtype}"
- raise ValueError(msg)
- if len(left) != len(right):
- msg = "left and right must have the same length"
- raise ValueError(msg)
- left_mask = notna(left)
- right_mask = notna(right)
- if not (left_mask == right_mask).all():
- msg = (
- "missing values must be missing in the same "
- "location both left and right sides"
- )
- raise ValueError(msg)
- if not (left[left_mask] <= right[left_mask]).all():
- msg = "left side of interval must be <= right side"
- raise ValueError(msg)
-
- def _shallow_copy(self: IntervalArrayT, left, right) -> IntervalArrayT:
- """
- Return a new IntervalArray with the replacement attributes
-
- Parameters
- ----------
- left : Index
- Values to be used for the left-side of the intervals.
- right : Index
- Values to be used for the right-side of the intervals.
- """
- dtype = IntervalDtype(left.dtype, closed=self.closed)
- left, right, dtype = self._ensure_simple_new_inputs(left, right, dtype=dtype)
-
- return self._simple_new(left, right, dtype=dtype)
-
- # ---------------------------------------------------------------------
- # Descriptive
-
- @property
- def dtype(self) -> IntervalDtype:
- return self._dtype
-
- @property
- def nbytes(self) -> int:
- return self.left.nbytes + self.right.nbytes
-
- @property
- def size(self) -> int:
- # Avoid materializing self.values
- return self.left.size
-
- # ---------------------------------------------------------------------
- # EA Interface
-
- def __iter__(self) -> Iterator:
- return iter(np.asarray(self))
-
- def __len__(self) -> int:
- return len(self._left)
-
- @overload
- def __getitem__(self, key: ScalarIndexer) -> IntervalOrNA:
- ...
-
- @overload
- def __getitem__(self: IntervalArrayT, key: SequenceIndexer) -> IntervalArrayT:
- ...
-
- def __getitem__(
- self: IntervalArrayT, key: PositionalIndexer
- ) -> IntervalArrayT | IntervalOrNA:
- key = check_array_indexer(self, key)
- left = self._left[key]
- right = self._right[key]
-
- if not isinstance(left, (np.ndarray, ExtensionArray)):
- # scalar
- if is_scalar(left) and isna(left):
- return self._fill_value
- return Interval(left, right, self.closed)
- if np.ndim(left) > 1:
- # GH#30588 multi-dimensional indexer disallowed
- raise ValueError("multi-dimensional indexing not allowed")
- # Argument 2 to "_simple_new" of "IntervalArray" has incompatible type
- # "Union[Period, Timestamp, Timedelta, NaTType, DatetimeArray, TimedeltaArray,
- # ndarray[Any, Any]]"; expected "Union[Union[DatetimeArray, TimedeltaArray],
- # ndarray[Any, Any]]"
- return self._simple_new(left, right, dtype=self.dtype) # type: ignore[arg-type]
-
- def __setitem__(self, key, value) -> None:
- value_left, value_right = self._validate_setitem_value(value)
- key = check_array_indexer(self, key)
-
- self._left[key] = value_left
- self._right[key] = value_right
-
- def _cmp_method(self, other, op):
- # ensure pandas array for list-like and eliminate non-interval scalars
- if is_list_like(other):
- if len(self) != len(other):
- raise ValueError("Lengths must match to compare")
- other = pd_array(other)
- elif not isinstance(other, Interval):
- # non-interval scalar -> no matches
- if other is NA:
- # GH#31882
- from pandas.core.arrays import BooleanArray
-
- arr = np.empty(self.shape, dtype=bool)
- mask = np.ones(self.shape, dtype=bool)
- return BooleanArray(arr, mask)
- return invalid_comparison(self, other, op)
-
- # determine the dtype of the elements we want to compare
- if isinstance(other, Interval):
- other_dtype = pandas_dtype("interval")
- elif not is_categorical_dtype(other.dtype):
- other_dtype = other.dtype
- else:
- # for categorical defer to categories for dtype
- other_dtype = other.categories.dtype
-
- # extract intervals if we have interval categories with matching closed
- if is_interval_dtype(other_dtype):
- if self.closed != other.categories.closed:
- return invalid_comparison(self, other, op)
-
- other = other.categories.take(
- other.codes, allow_fill=True, fill_value=other.categories._na_value
- )
-
- # interval-like -> need same closed and matching endpoints
- if is_interval_dtype(other_dtype):
- if self.closed != other.closed:
- return invalid_comparison(self, other, op)
- elif not isinstance(other, Interval):
- other = type(self)(other)
-
- if op is operator.eq:
- return (self._left == other.left) & (self._right == other.right)
- elif op is operator.ne:
- return (self._left != other.left) | (self._right != other.right)
- elif op is operator.gt:
- return (self._left > other.left) | (
- (self._left == other.left) & (self._right > other.right)
- )
- elif op is operator.ge:
- return (self == other) | (self > other)
- elif op is operator.lt:
- return (self._left < other.left) | (
- (self._left == other.left) & (self._right < other.right)
- )
- else:
- # operator.le
- return (self == other) | (self < other)
-
- # non-interval/non-object dtype -> no matches
- if not is_object_dtype(other_dtype):
- return invalid_comparison(self, other, op)
-
- # object dtype -> iteratively check for intervals
- result = np.zeros(len(self), dtype=bool)
- for i, obj in enumerate(other):
- try:
- result[i] = op(self[i], obj)
- except TypeError:
- if obj is NA:
- # comparison with np.nan returns NA
- # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092
- result = result.astype(object)
- result[i] = NA
- else:
- raise
- return result
-
- @unpack_zerodim_and_defer("__eq__")
- def __eq__(self, other):
- return self._cmp_method(other, operator.eq)
-
- @unpack_zerodim_and_defer("__ne__")
- def __ne__(self, other):
- return self._cmp_method(other, operator.ne)
-
- @unpack_zerodim_and_defer("__gt__")
- def __gt__(self, other):
- return self._cmp_method(other, operator.gt)
-
- @unpack_zerodim_and_defer("__ge__")
- def __ge__(self, other):
- return self._cmp_method(other, operator.ge)
-
- @unpack_zerodim_and_defer("__lt__")
- def __lt__(self, other):
- return self._cmp_method(other, operator.lt)
-
- @unpack_zerodim_and_defer("__le__")
- def __le__(self, other):
- return self._cmp_method(other, operator.le)
-
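
``_cmp_method`` above gives intervals a lexicographic ordering on (left, right) once the closed sides match; a short behavioural sketch, stock pandas assumed:

import pandas as pd

arr = pd.arrays.IntervalArray.from_tuples([(0, 1), (0, 2), (1, 2)])
print(arr == pd.Interval(0, 2))  # [False  True False]
print(arr < pd.Interval(1, 2))   # [ True  True False] -- compare left first, then right
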
- def argsort(
- self,
- *,
- ascending: bool = True,
- kind: SortKind = "quicksort",
- na_position: str = "last",
- **kwargs,
- ) -> np.ndarray:
- ascending = nv.validate_argsort_with_ascending(ascending, (), kwargs)
-
- if ascending and kind == "quicksort" and na_position == "last":
- # TODO: in an IntervalIndex we can re-use the cached
- # IntervalTree.left_sorter
- return np.lexsort((self.right, self.left))
-
- # TODO: other cases we can use lexsort for? much more performant.
- return super().argsort(
- ascending=ascending, kind=kind, na_position=na_position, **kwargs
- )
-
- def min(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOrNA:
- nv.validate_minmax_axis(axis, self.ndim)
-
- if not len(self):
- return self._na_value
-
- mask = self.isna()
- if mask.any():
- if not skipna:
- return self._na_value
- obj = self[~mask]
- else:
- obj = self
-
- indexer = obj.argsort()[0]
- return obj[indexer]
-
- def max(self, *, axis: AxisInt | None = None, skipna: bool = True) -> IntervalOrNA:
- nv.validate_minmax_axis(axis, self.ndim)
-
- if not len(self):
- return self._na_value
-
- mask = self.isna()
- if mask.any():
- if not skipna:
- return self._na_value
- obj = self[~mask]
- else:
- obj = self
-
- indexer = obj.argsort()[-1]
- return obj[indexer]
-
- def fillna(
- self: IntervalArrayT, value=None, method=None, limit=None
- ) -> IntervalArrayT:
- """
- Fill NA/NaN values using the specified method.
-
- Parameters
- ----------
- value : scalar, dict, Series
- If a scalar value is passed it is used to fill all missing values.
- Alternatively, a Series or dict can be used to fill in different
- values for each index. The value should not be a list. The
- value(s) passed should be either Interval objects or NA/NaN.
- method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
- (Not implemented yet for IntervalArray)
- Method to use for filling holes in reindexed Series
- limit : int, default None
- (Not implemented yet for IntervalArray)
- If method is specified, this is the maximum number of consecutive
- NaN values to forward/backward fill. In other words, if there is
- a gap with more than this number of consecutive NaNs, it will only
- be partially filled. If method is not specified, this is the
- maximum number of entries along the entire axis where NaNs will be
- filled.
-
- Returns
- -------
- filled : IntervalArray with NA/NaN filled
- """
- if method is not None:
- raise TypeError("Filling by method is not supported for IntervalArray.")
- if limit is not None:
- raise TypeError("limit is not supported for IntervalArray.")
-
- value_left, value_right = self._validate_scalar(value)
-
- left = self.left.fillna(value=value_left)
- right = self.right.fillna(value=value_right)
- return self._shallow_copy(left, right)
-
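
``fillna`` above only accepts a scalar Interval (or NA) and fills the left/right sides independently; for illustration, with the filler interval chosen arbitrarily and a stock pandas install assumed:

import numpy as np
import pandas as pd

arr = pd.arrays.IntervalArray.from_tuples([(0, 1), np.nan, (2, 3)])
print(arr.fillna(pd.Interval(9, 10)))  # the missing slot becomes (9.0, 10.0]
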
- def astype(self, dtype, copy: bool = True):
- """
- Cast to an ExtensionArray or NumPy array with dtype 'dtype'.
-
- Parameters
- ----------
- dtype : str or dtype
- Typecode or data-type to which the array is cast.
-
- copy : bool, default True
- Whether to copy the data, even if not necessary. If False,
- a copy is made only if the old dtype does not match the
- new dtype.
-
- Returns
- -------
- array : ExtensionArray or ndarray
- ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
- """
- from pandas import Index
-
- if dtype is not None:
- dtype = pandas_dtype(dtype)
-
- if is_interval_dtype(dtype):
- if dtype == self.dtype:
- return self.copy() if copy else self
-
- if is_float_dtype(self.dtype.subtype) and needs_i8_conversion(
- dtype.subtype
- ):
- # This is allowed on the Index.astype but we disallow it here
- msg = (
- f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible"
- )
- raise TypeError(msg)
-
- # need to cast to different subtype
- try:
- # We need to use Index rules for astype to prevent casting
- # np.nan entries to int subtypes
- new_left = Index(self._left, copy=False).astype(dtype.subtype)
- new_right = Index(self._right, copy=False).astype(dtype.subtype)
- except IntCastingNaNError:
- # e.g test_subtype_integer
- raise
- except (TypeError, ValueError) as err:
- # e.g. test_subtype_integer_errors f8->u8 can be lossy
- # and raises ValueError
- msg = (
- f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible"
- )
- raise TypeError(msg) from err
- return self._shallow_copy(new_left, new_right)
- else:
- try:
- return super().astype(dtype, copy=copy)
- except (TypeError, ValueError) as err:
- msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
- raise TypeError(msg) from err
-
- def equals(self, other) -> bool:
- if type(self) != type(other):
- return False
-
- return bool(
- self.closed == other.closed
- and self.left.equals(other.left)
- and self.right.equals(other.right)
- )
-
- @classmethod
- def _concat_same_type(
- cls: type[IntervalArrayT], to_concat: Sequence[IntervalArrayT]
- ) -> IntervalArrayT:
- """
- Concatenate multiple IntervalArray
-
- Parameters
- ----------
- to_concat : sequence of IntervalArray
-
- Returns
- -------
- IntervalArray
- """
- closed_set = {interval.closed for interval in to_concat}
- if len(closed_set) != 1:
- raise ValueError("Intervals must all be closed on the same side.")
- closed = closed_set.pop()
-
- left = np.concatenate([interval.left for interval in to_concat])
- right = np.concatenate([interval.right for interval in to_concat])
-
- left, right, dtype = cls._ensure_simple_new_inputs(left, right, closed=closed)
-
- return cls._simple_new(left, right, dtype=dtype)
-
- def copy(self: IntervalArrayT) -> IntervalArrayT:
- """
- Return a copy of the array.
-
- Returns
- -------
- IntervalArray
- """
- left = self._left.copy()
- right = self._right.copy()
- dtype = self.dtype
- return self._simple_new(left, right, dtype=dtype)
-
- def isna(self) -> np.ndarray:
- return isna(self._left)
-
- def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray:
- if not len(self) or periods == 0:
- return self.copy()
-
- self._validate_scalar(fill_value)
-
- # ExtensionArray.shift doesn't work for two reasons
- # 1. IntervalArray.dtype.na_value may not be correct for the dtype.
- # 2. IntervalArray._from_sequence only accepts NaN for missing values,
- # not other values like NaT
-
- empty_len = min(abs(periods), len(self))
- if isna(fill_value):
- from pandas import Index
-
- fill_value = Index(self._left, copy=False)._na_value
- empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1))
- else:
- empty = self._from_sequence([fill_value] * empty_len)
-
- if periods > 0:
- a = empty
- b = self[:-periods]
- else:
- a = self[abs(periods) :]
- b = empty
- return self._concat_same_type([a, b])
-
- def take(
- self: IntervalArrayT,
- indices,
- *,
- allow_fill: bool = False,
- fill_value=None,
- axis=None,
- **kwargs,
- ) -> IntervalArrayT:
- """
- Take elements from the IntervalArray.
-
- Parameters
- ----------
- indices : sequence of integers
- Indices to be taken.
-
- allow_fill : bool, default False
- How to handle negative values in `indices`.
-
- * False: negative values in `indices` indicate positional indices
- from the right (the default). This is similar to
- :func:`numpy.take`.
-
- * True: negative values in `indices` indicate
- missing values. These values are set to `fill_value`. Any other
- negative values raise a ``ValueError``.
-
- fill_value : Interval or NA, optional
- Fill value to use for NA-indices when `allow_fill` is True.
- This may be ``None``, in which case the default NA value for
- the type, ``self.dtype.na_value``, is used.
-
- For many ExtensionArrays, there will be two representations of
- `fill_value`: a user-facing "boxed" scalar, and a low-level
- physical NA value. `fill_value` should be the user-facing version,
- and the implementation should handle translating that to the
- physical version for processing the take if necessary.
-
- axis : any, default None
- Present for compat with IntervalIndex; does nothing.
-
- Returns
- -------
- IntervalArray
-
- Raises
- ------
- IndexError
- When the indices are out of bounds for the array.
- ValueError
- When `indices` contains negative values other than ``-1``
- and `allow_fill` is True.
- """
- nv.validate_take((), kwargs)
-
- fill_left = fill_right = fill_value
- if allow_fill:
- fill_left, fill_right = self._validate_scalar(fill_value)
-
- left_take = take(
- self._left, indices, allow_fill=allow_fill, fill_value=fill_left
- )
- right_take = take(
- self._right, indices, allow_fill=allow_fill, fill_value=fill_right
- )
-
- return self._shallow_copy(left_take, right_take)
-
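
``take`` with ``allow_fill=True`` treats ``-1`` as a missing marker and routes the fill value through ``_validate_scalar``; a small sketch, stock pandas assumed:

import pandas as pd

arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3])
print(arr.take([0, 2]))                    # [(0, 1], (2, 3]]
print(arr.take([0, -1], allow_fill=True))  # second position filled with NaN
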
- def _validate_listlike(self, value):
- # list-like of intervals
- try:
- array = IntervalArray(value)
- self._check_closed_matches(array, name="value")
- value_left, value_right = array.left, array.right
- except TypeError as err:
- # wrong type: not interval or NA
- msg = f"'value' should be an interval type, got {type(value)} instead."
- raise TypeError(msg) from err
-
- try:
- self.left._validate_fill_value(value_left)
- except (LossySetitemError, TypeError) as err:
- msg = (
- "'value' should be a compatible interval type, "
- f"got {type(value)} instead."
- )
- raise TypeError(msg) from err
-
- return value_left, value_right
-
- def _validate_scalar(self, value):
- if isinstance(value, Interval):
- self._check_closed_matches(value, name="value")
- left, right = value.left, value.right
- # TODO: check subdtype match like _validate_setitem_value?
- elif is_valid_na_for_dtype(value, self.left.dtype):
- # GH#18295
- left = right = self.left._na_value
- else:
- raise TypeError(
- "can only insert Interval objects and NA into an IntervalArray"
- )
- return left, right
-
- def _validate_setitem_value(self, value):
- if is_valid_na_for_dtype(value, self.left.dtype):
- # na value: need special casing to set directly on numpy arrays
- value = self.left._na_value
- if is_integer_dtype(self.dtype.subtype):
- # can't set NaN on a numpy integer array
- # GH#45484 TypeError, not ValueError, matches what we get with
- # non-NA un-holdable value.
- raise TypeError("Cannot set float NaN to integer-backed IntervalArray")
- value_left, value_right = value, value
-
- elif isinstance(value, Interval):
- # scalar interval
- self._check_closed_matches(value, name="value")
- value_left, value_right = value.left, value.right
- self.left._validate_fill_value(value_left)
- self.left._validate_fill_value(value_right)
-
- else:
- return self._validate_listlike(value)
-
- return value_left, value_right
-
- def value_counts(self, dropna: bool = True) -> Series:
- """
- Returns a Series containing counts of each interval.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include counts of NaN.
-
- Returns
- -------
- counts : Series
-
- See Also
- --------
- Series.value_counts
- """
- # TODO: implement this in a non-naive way!
- return value_counts(np.asarray(self), dropna=dropna)
-
- # ---------------------------------------------------------------------
- # Rendering Methods
-
- def _format_data(self) -> str:
- # TODO: integrate with categorical and make generic
- # name argument is unused here; just for compat with base / categorical
- n = len(self)
- max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10)
-
- formatter = str
-
- if n == 0:
- summary = "[]"
- elif n == 1:
- first = formatter(self[0])
- summary = f"[{first}]"
- elif n == 2:
- first = formatter(self[0])
- last = formatter(self[-1])
- summary = f"[{first}, {last}]"
- else:
- if n > max_seq_items:
- n = min(max_seq_items // 2, 10)
- head = [formatter(x) for x in self[:n]]
- tail = [formatter(x) for x in self[-n:]]
- head_str = ", ".join(head)
- tail_str = ", ".join(tail)
- summary = f"[{head_str} ... {tail_str}]"
- else:
- tail = [formatter(x) for x in self]
- tail_str = ", ".join(tail)
- summary = f"[{tail_str}]"
-
- return summary
-
- def __repr__(self) -> str:
- # the short repr has no trailing newline, while the truncated
- # repr does. So we include a newline in our template, and strip
- # any trailing newlines from format_object_summary
- data = self._format_data()
- class_name = f"<{type(self).__name__}>\n"
-
- template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
- return template
-
- def _format_space(self) -> str:
- space = " " * (len(type(self).__name__) + 1)
- return f"\n{space}"
-
- # ---------------------------------------------------------------------
- # Vectorized Interval Properties/Attributes
-
- @property
- def left(self):
- """
- Return the left endpoints of each Interval in the IntervalArray as an Index.
- """
- from pandas import Index
-
- return Index(self._left, copy=False)
-
- @property
- def right(self):
- """
- Return the right endpoints of each Interval in the IntervalArray as an Index.
- """
- from pandas import Index
-
- return Index(self._right, copy=False)
-
- @property
- def length(self) -> Index:
- """
- Return an Index with entries denoting the length of each Interval.
- """
- return self.right - self.left
-
- @property
- def mid(self) -> Index:
- """
- Return the midpoint of each Interval in the IntervalArray as an Index.
- """
- try:
- return 0.5 * (self.left + self.right)
- except TypeError:
- # datetime safe version
- return self.left + 0.5 * self.length
-
- _interval_shared_docs["overlaps"] = textwrap.dedent(
- """
- Check elementwise if an Interval overlaps the values in the %(klass)s.
-
- Two intervals overlap if they share a common point, including closed
- endpoints. Intervals that only have an open endpoint in common do not
- overlap.
-
- Parameters
- ----------
- other : %(klass)s
- Interval to check against for an overlap.
-
- Returns
- -------
- ndarray
- Boolean array positionally indicating where an overlap occurs.
-
- See Also
- --------
- Interval.overlaps : Check whether two Interval objects overlap.
-
- Examples
- --------
- %(examples)s
- >>> intervals.overlaps(pd.Interval(0.5, 1.5))
- array([ True, True, False])
-
- Intervals that share closed endpoints overlap:
-
- >>> intervals.overlaps(pd.Interval(1, 3, closed='left'))
- array([ True, True, True])
-
- Intervals that only have an open endpoint in common do not overlap:
-
- >>> intervals.overlaps(pd.Interval(1, 2, closed='right'))
- array([False, True, False])
- """
- )
-
- @Appender(
- _interval_shared_docs["overlaps"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
- """\
- >>> data = [(0, 1), (1, 3), (2, 4)]
- >>> intervals = pd.arrays.IntervalArray.from_tuples(data)
- >>> intervals
- <IntervalArray>
- [(0, 1], (1, 3], (2, 4]]
- Length: 3, dtype: interval[int64, right]
- """
- ),
- }
- )
- def overlaps(self, other):
- if isinstance(other, (IntervalArray, ABCIntervalIndex)):
- raise NotImplementedError
- if not isinstance(other, Interval):
- msg = f"`other` must be Interval-like, got {type(other).__name__}"
- raise TypeError(msg)
-
- # equality is okay if both endpoints are closed (overlap at a point)
- op1 = le if (self.closed_left and other.closed_right) else lt
- op2 = le if (other.closed_left and self.closed_right) else lt
-
- # overlaps is equivalent to the negation of the two intervals being disjoint:
- # disjoint = (A.left > B.right) or (B.left > A.right)
- # (simplifying the negation allows this to be done in fewer operations)
- return op1(self.left, other.right) & op2(other.left, self.right)
-
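
The docstring examples above already pin down ``overlaps``; repeating them as runnable code (stock pandas assumed) shows the disjointness negation in action:

import pandas as pd

intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)])
print(intervals.overlaps(pd.Interval(0.5, 1.5)))              # [ True  True False]
print(intervals.overlaps(pd.Interval(1, 2, closed="right")))  # [False  True False]
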
- # ---------------------------------------------------------------------
-
- @property
- def closed(self) -> IntervalClosedType:
- """
- String describing the inclusive side of the intervals.
-
- Either ``left``, ``right``, ``both`` or ``neither``.
- """
- return self.dtype.closed
-
- _interval_shared_docs["set_closed"] = textwrap.dedent(
- """
- Return an identical %(klass)s closed on the specified side.
-
- Parameters
- ----------
- closed : {'left', 'right', 'both', 'neither'}
- Whether the intervals are closed on the left-side, right-side, both
- or neither.
-
- Returns
- -------
- %(klass)s
-
- %(examples)s\
- """
- )
-
- @Appender(
- _interval_shared_docs["set_closed"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- >>> index = pd.arrays.IntervalArray.from_breaks(range(4))
- >>> index
- <IntervalArray>
- [(0, 1], (1, 2], (2, 3]]
- Length: 3, dtype: interval[int64, right]
- >>> index.set_closed('both')
- <IntervalArray>
- [[0, 1], [1, 2], [2, 3]]
- Length: 3, dtype: interval[int64, both]
- """
- ),
- }
- )
- def set_closed(self: IntervalArrayT, closed: IntervalClosedType) -> IntervalArrayT:
- if closed not in VALID_CLOSED:
- msg = f"invalid option for 'closed': {closed}"
- raise ValueError(msg)
-
- left, right = self._left, self._right
- dtype = IntervalDtype(left.dtype, closed=closed)
- return self._simple_new(left, right, dtype=dtype)
-
- _interval_shared_docs[
- "is_non_overlapping_monotonic"
- ] = """
- Return a boolean indicating whether the %(klass)s is non-overlapping and monotonic.
-
- Non-overlapping means that no Intervals share points, and monotonic means
- either monotonically increasing or monotonically decreasing.
- """
-
- @property
- @Appender(
- _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs
- )
- def is_non_overlapping_monotonic(self) -> bool:
- # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... )
- # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...)
- # we already require left <= right
-
- # strict inequality for closed == 'both'; equality implies overlapping
- # at a point when both sides of intervals are included
- if self.closed == "both":
- return bool(
- (self._right[:-1] < self._left[1:]).all()
- or (self._left[:-1] > self._right[1:]).all()
- )
-
- # non-strict inequality when closed != 'both'; at least one side is
- # not included in the intervals, so equality does not imply overlapping
- return bool(
- (self._right[:-1] <= self._left[1:]).all()
- or (self._left[:-1] >= self._right[1:]).all()
- )
-
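
A quick check of the property above: touching-but-not-overlapping intervals from ``from_breaks`` pass, genuinely overlapping ones do not (stock pandas assumed):

import pandas as pd

print(pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]).is_non_overlapping_monotonic)      # True
print(pd.arrays.IntervalArray.from_tuples([(0, 2), (1, 3)]).is_non_overlapping_monotonic)  # False
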
- # ---------------------------------------------------------------------
- # Conversion
-
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- """
- Return the IntervalArray's data as a numpy array of Interval
- objects (with dtype='object')
- """
- left = self._left
- right = self._right
- mask = self.isna()
- closed = self.closed
-
- result = np.empty(len(left), dtype=object)
- for i, left_value in enumerate(left):
- if mask[i]:
- result[i] = np.nan
- else:
- result[i] = Interval(left_value, right[i], closed)
- return result
-
- def __arrow_array__(self, type=None):
- """
- Convert myself into a pyarrow Array.
- """
- import pyarrow
-
- from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
-
- try:
- subtype = pyarrow.from_numpy_dtype(self.dtype.subtype)
- except TypeError as err:
- raise TypeError(
- f"Conversion to arrow with subtype '{self.dtype.subtype}' "
- "is not supported"
- ) from err
- interval_type = ArrowIntervalType(subtype, self.closed)
- storage_array = pyarrow.StructArray.from_arrays(
- [
- pyarrow.array(self._left, type=subtype, from_pandas=True),
- pyarrow.array(self._right, type=subtype, from_pandas=True),
- ],
- names=["left", "right"],
- )
- mask = self.isna()
- if mask.any():
- # if there are missing values, set validity bitmap also on the array level
- null_bitmap = pyarrow.array(~mask).buffers()[1]
- storage_array = pyarrow.StructArray.from_buffers(
- storage_array.type,
- len(storage_array),
- [null_bitmap],
- children=[storage_array.field(0), storage_array.field(1)],
- )
-
- if type is not None:
- if type.equals(interval_type.storage_type):
- return storage_array
- elif isinstance(type, ArrowIntervalType):
- # ensure we have the same subtype and closed attributes
- if not type.equals(interval_type):
- raise TypeError(
- "Not supported to convert IntervalArray to type with "
- f"different 'subtype' ({self.dtype.subtype} vs {type.subtype}) "
- f"and 'closed' ({self.closed} vs {type.closed}) attributes"
- )
- else:
- raise TypeError(
- f"Not supported to convert IntervalArray to '{type}' type"
- )
-
- return pyarrow.ExtensionArray.from_storage(interval_type, storage_array)
-
- _interval_shared_docs[
- "to_tuples"
- ] = """
- Return an %(return_type)s of tuples of the form (left, right).
-
- Parameters
- ----------
- na_tuple : bool, default True
- If True, return NA as the tuple ``(nan, nan)``; if False, return
- NA as the value itself, ``nan``.
-
- Returns
- -------
- tuples: %(return_type)s
- %(examples)s\
- """
-
- @Appender(
- _interval_shared_docs["to_tuples"] % {"return_type": "ndarray", "examples": ""}
- )
- def to_tuples(self, na_tuple: bool = True) -> np.ndarray:
- tuples = com.asarray_tuplesafe(zip(self._left, self._right))
- if not na_tuple:
- # GH 18756
- tuples = np.where(~self.isna(), tuples, np.nan)
- return tuples
-
- # ---------------------------------------------------------------------
-
- def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
- value_left, value_right = self._validate_setitem_value(value)
-
- if isinstance(self._left, np.ndarray):
- np.putmask(self._left, mask, value_left)
- assert isinstance(self._right, np.ndarray)
- np.putmask(self._right, mask, value_right)
- else:
- self._left._putmask(mask, value_left)
- assert not isinstance(self._right, np.ndarray)
- self._right._putmask(mask, value_right)
-
- def insert(self: IntervalArrayT, loc: int, item: Interval) -> IntervalArrayT:
- """
- Return a new IntervalArray with the new item inserted at the given
- location. Follows numpy.insert semantics for negative values. Only
- Interval objects and NA can be inserted into an IntervalArray.
-
- Parameters
- ----------
- loc : int
- item : Interval
-
- Returns
- -------
- IntervalArray
- """
- left_insert, right_insert = self._validate_scalar(item)
-
- new_left = self.left.insert(loc, left_insert)
- new_right = self.right.insert(loc, right_insert)
-
- return self._shallow_copy(new_left, new_right)
-
- def delete(self: IntervalArrayT, loc) -> IntervalArrayT:
- if isinstance(self._left, np.ndarray):
- new_left = np.delete(self._left, loc)
- assert isinstance(self._right, np.ndarray)
- new_right = np.delete(self._right, loc)
- else:
- new_left = self._left.delete(loc)
- assert not isinstance(self._right, np.ndarray)
- new_right = self._right.delete(loc)
- return self._shallow_copy(left=new_left, right=new_right)
-
- @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs)
- def repeat(
- self: IntervalArrayT,
- repeats: int | Sequence[int],
- axis: AxisInt | None = None,
- ) -> IntervalArrayT:
- nv.validate_repeat((), {"axis": axis})
- left_repeat = self.left.repeat(repeats)
- right_repeat = self.right.repeat(repeats)
- return self._shallow_copy(left=left_repeat, right=right_repeat)
-
- _interval_shared_docs["contains"] = textwrap.dedent(
- """
- Check elementwise if the Intervals contain the value.
-
- Return a boolean mask whether the value is contained in the Intervals
- of the %(klass)s.
-
- Parameters
- ----------
- other : scalar
- The value to check whether it is contained in the Intervals.
-
- Returns
- -------
- boolean array
-
- See Also
- --------
- Interval.contains : Check whether Interval object contains value.
- %(klass)s.overlaps : Check if an Interval overlaps the values in the
- %(klass)s.
-
- Examples
- --------
- %(examples)s
- >>> intervals.contains(0.5)
- array([ True, False, False])
- """
- )
-
- @Appender(
- _interval_shared_docs["contains"]
- % {
- "klass": "IntervalArray",
- "examples": textwrap.dedent(
- """\
- >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)])
- >>> intervals
- <IntervalArray>
- [(0, 1], (1, 3], (2, 4]]
- Length: 3, dtype: interval[int64, right]
- """
- ),
- }
- )
- def contains(self, other):
- if isinstance(other, Interval):
- raise NotImplementedError("contains not implemented for two intervals")
-
- return (self._left < other if self.open_left else self._left <= other) & (
- other < self._right if self.open_right else other <= self._right
- )
-
- def isin(self, values) -> npt.NDArray[np.bool_]:
- if not hasattr(values, "dtype"):
- values = np.array(values)
- values = extract_array(values, extract_numpy=True)
-
- if is_interval_dtype(values.dtype):
- if self.closed != values.closed:
- # not comparable -> no overlap
- return np.zeros(self.shape, dtype=bool)
-
- if is_dtype_equal(self.dtype, values.dtype):
- # GH#38353 instead of casting to object, operating on a
- # complex128 ndarray is much more performant.
- left = self._combined.view("complex128")
- right = values._combined.view("complex128")
- # error: Argument 1 to "in1d" has incompatible type
- # "Union[ExtensionArray, ndarray[Any, Any],
- # ndarray[Any, dtype[Any]]]"; expected
- # "Union[_SupportsArray[dtype[Any]],
- # _NestedSequence[_SupportsArray[dtype[Any]]], bool,
- # int, float, complex, str, bytes, _NestedSequence[
- # Union[bool, int, float, complex, str, bytes]]]"
- return np.in1d(left, right) # type: ignore[arg-type]
-
- elif needs_i8_conversion(self.left.dtype) ^ needs_i8_conversion(
- values.left.dtype
- ):
- # not comparable -> no overlap
- return np.zeros(self.shape, dtype=bool)
-
- return isin(self.astype(object), values.astype(object))
-
- @property
- def _combined(self) -> IntervalSideT:
- left = self.left._values.reshape(-1, 1)
- right = self.right._values.reshape(-1, 1)
- if needs_i8_conversion(left.dtype):
- comb = left._concat_same_type([left, right], axis=1)
- else:
- comb = np.concatenate([left, right], axis=1)
- return comb
-
- def _from_combined(self, combined: np.ndarray) -> IntervalArray:
- """
- Create a new IntervalArray with our dtype from a 1D complex128 ndarray.
- """
- nc = combined.view("i8").reshape(-1, 2)
-
- dtype = self._left.dtype
- if needs_i8_conversion(dtype):
- assert isinstance(self._left, (DatetimeArray, TimedeltaArray))
- new_left = type(self._left)._from_sequence(nc[:, 0], dtype=dtype)
- assert isinstance(self._right, (DatetimeArray, TimedeltaArray))
- new_right = type(self._right)._from_sequence(nc[:, 1], dtype=dtype)
- else:
- assert isinstance(dtype, np.dtype)
- new_left = nc[:, 0].view(dtype)
- new_right = nc[:, 1].view(dtype)
- return self._shallow_copy(left=new_left, right=new_right)
-
- def unique(self) -> IntervalArray:
- # No overload variant of "__getitem__" of "ExtensionArray" matches argument
- # type "Tuple[slice, int]"
- nc = unique(
- self._combined.view("complex128")[:, 0] # type: ignore[call-overload]
- )
- nc = nc[:, None]
- return self._from_combined(nc)
-
-
-def _maybe_convert_platform_interval(values) -> ArrayLike:
- """
- Try to do platform conversion, with special casing for IntervalArray.
- Wrapper around maybe_convert_platform that alters the default return
- dtype in certain cases to be compatible with IntervalArray. For example,
- empty lists return with integer dtype instead of object dtype, which is
- prohibited for IntervalArray.
-
- Parameters
- ----------
- values : array-like
-
- Returns
- -------
- array
- """
- if isinstance(values, (list, tuple)) and len(values) == 0:
- # GH 19016
- # empty lists/tuples get object dtype by default, but this is
- # prohibited for IntervalArray, so coerce to integer instead
- return np.array([], dtype=np.int64)
- elif not is_list_like(values) or isinstance(values, ABCDataFrame):
- # This will raise later, but we avoid passing to maybe_convert_platform
- return values
- elif is_categorical_dtype(values):
- values = np.asarray(values)
- elif not hasattr(values, "dtype") and not isinstance(values, (list, tuple, range)):
- # TODO: should we just cast these to list?
- return values
- else:
- values = extract_array(values, extract_numpy=True)
-
- if not hasattr(values, "dtype"):
- values = np.asarray(values)
- if is_integer_dtype(values) and values.dtype != np.int64:
- values = values.astype(np.int64)
- return values
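For reference, a minimal sketch of the public behaviour implemented by the deleted interval.py above, using only the stable pd.arrays.IntervalArray API (the variable names here are illustrative, not part of the removed file):

    import pandas as pd

    intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)])
    mask = intervals.contains(0.5)        # array([ True, False, False]); intervals are left-open, right-closed
    both = intervals.set_closed("both")   # same endpoints, dtype becomes interval[int64, both]
    tups = intervals.to_tuples()          # object ndarray of (left, right) tuples; NA -> (nan, nan)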
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/masked.py b/contrib/python/pandas/py3/pandas/core/arrays/masked.py
deleted file mode 100644
index 6a88b48031d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/masked.py
+++ /dev/null
@@ -1,1391 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Any,
- Iterator,
- Literal,
- Sequence,
- TypeVar,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- missing as libmissing,
-)
-from pandas._libs.tslibs import (
- get_unit_from_dtype,
- is_supported_unit,
-)
-from pandas._typing import (
- ArrayLike,
- AstypeArg,
- AxisInt,
- DtypeObj,
- NpDtype,
- PositionalIndexer,
- Scalar,
- ScalarIndexer,
- SequenceIndexer,
- Shape,
- npt,
-)
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import doc
-from pandas.util._validators import validate_fillna_kwargs
-
-from pandas.core.dtypes.base import ExtensionDtype
-from pandas.core.dtypes.common import (
- is_bool,
- is_bool_dtype,
- is_datetime64_dtype,
- is_dtype_equal,
- is_float_dtype,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import BaseMaskedDtype
-from pandas.core.dtypes.inference import is_array_like
-from pandas.core.dtypes.missing import (
- array_equivalent,
- is_valid_na_for_dtype,
- isna,
- notna,
-)
-
-from pandas.core import (
- algorithms as algos,
- arraylike,
- missing,
- nanops,
- ops,
-)
-from pandas.core.algorithms import (
- factorize_array,
- isin,
- take,
-)
-from pandas.core.array_algos import (
- masked_accumulations,
- masked_reductions,
-)
-from pandas.core.array_algos.quantile import quantile_with_mask
-from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays import ExtensionArray
-from pandas.core.construction import ensure_wrapped_if_datetimelike
-from pandas.core.indexers import check_array_indexer
-from pandas.core.ops import invalid_comparison
-
-if TYPE_CHECKING:
- from pandas import Series
- from pandas.core.arrays import BooleanArray
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
-
-from pandas.compat.numpy import function as nv
-
-BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
-
-
-class BaseMaskedArray(OpsMixin, ExtensionArray):
- """
- Base class for masked arrays (which use _data and _mask to store the data).
-
- numpy based
- """
-
- # The value used to fill '_data' to avoid upcasting
- _internal_fill_value: Scalar
- # our underlying data and mask are each ndarrays
- _data: np.ndarray
- _mask: npt.NDArray[np.bool_]
-
- # Fill values used for any/all
- _truthy_value = Scalar # bool(_truthy_value) = True
- _falsey_value = Scalar # bool(_falsey_value) = False
-
- def __init__(
- self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
- ) -> None:
- # values is supposed to already be validated in the subclass
- if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_):
- raise TypeError(
- "mask should be boolean numpy array. Use "
- "the 'pd.array' function instead"
- )
- if values.shape != mask.shape:
- raise ValueError("values.shape must match mask.shape")
-
- if copy:
- values = values.copy()
- mask = mask.copy()
-
- self._data = values
- self._mask = mask
-
- @classmethod
- def _from_sequence(
- cls: type[BaseMaskedArrayT], scalars, *, dtype=None, copy: bool = False
- ) -> BaseMaskedArrayT:
- values, mask = cls._coerce_to_array(scalars, dtype=dtype, copy=copy)
- return cls(values, mask)
-
- @property
- def dtype(self) -> BaseMaskedDtype:
- raise AbstractMethodError(self)
-
- @overload
- def __getitem__(self, item: ScalarIndexer) -> Any:
- ...
-
- @overload
- def __getitem__(self: BaseMaskedArrayT, item: SequenceIndexer) -> BaseMaskedArrayT:
- ...
-
- def __getitem__(
- self: BaseMaskedArrayT, item: PositionalIndexer
- ) -> BaseMaskedArrayT | Any:
- item = check_array_indexer(self, item)
-
- newmask = self._mask[item]
- if is_bool(newmask):
- # This is a scalar indexing
- if newmask:
- return self.dtype.na_value
- return self._data[item]
-
- return type(self)(self._data[item], newmask)
-
- @doc(ExtensionArray.fillna)
- def fillna(
- self: BaseMaskedArrayT, value=None, method=None, limit=None
- ) -> BaseMaskedArrayT:
- value, method = validate_fillna_kwargs(value, method)
-
- mask = self._mask
-
- if is_array_like(value):
- if len(value) != len(self):
- raise ValueError(
- f"Length of 'value' does not match. Got ({len(value)}) "
- f"expected {len(self)}"
- )
- value = value[mask]
-
- if mask.any():
- if method is not None:
- func = missing.get_fill_func(method, ndim=self.ndim)
- npvalues = self._data.copy().T
- new_mask = mask.copy().T
- func(npvalues, limit=limit, mask=new_mask)
- return type(self)(npvalues.T, new_mask.T)
- else:
- # fill with value
- new_values = self.copy()
- new_values[mask] = value
- else:
- new_values = self.copy()
- return new_values
-
- @classmethod
- def _coerce_to_array(
- cls, values, *, dtype: DtypeObj, copy: bool = False
- ) -> tuple[np.ndarray, np.ndarray]:
- raise AbstractMethodError(cls)
-
- def _validate_setitem_value(self, value):
- """
- Check if we have a scalar that we can cast losslessly.
-
- Raises
- ------
- TypeError
- """
- kind = self.dtype.kind
- # TODO: get this all from np_can_hold_element?
- if kind == "b":
- if lib.is_bool(value):
- return value
-
- elif kind == "f":
- if lib.is_integer(value) or lib.is_float(value):
- return value
-
- else:
- if lib.is_integer(value) or (lib.is_float(value) and value.is_integer()):
- return value
- # TODO: unsigned checks
-
- # Note: without the "str" here, the f-string rendering raises in
- # py38 builds.
- raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}")
-
- def __setitem__(self, key, value) -> None:
- key = check_array_indexer(self, key)
-
- if is_scalar(value):
- if is_valid_na_for_dtype(value, self.dtype):
- self._mask[key] = True
- else:
- value = self._validate_setitem_value(value)
- self._data[key] = value
- self._mask[key] = False
- return
-
- value, mask = self._coerce_to_array(value, dtype=self.dtype)
-
- self._data[key] = value
- self._mask[key] = mask
-
- def __iter__(self) -> Iterator:
- if self.ndim == 1:
- if not self._hasna:
- for val in self._data:
- yield val
- else:
- na_value = self.dtype.na_value
- for isna_, val in zip(self._mask, self._data):
- if isna_:
- yield na_value
- else:
- yield val
- else:
- for i in range(len(self)):
- yield self[i]
-
- def __len__(self) -> int:
- return len(self._data)
-
- @property
- def shape(self) -> Shape:
- return self._data.shape
-
- @property
- def ndim(self) -> int:
- return self._data.ndim
-
- def swapaxes(self: BaseMaskedArrayT, axis1, axis2) -> BaseMaskedArrayT:
- data = self._data.swapaxes(axis1, axis2)
- mask = self._mask.swapaxes(axis1, axis2)
- return type(self)(data, mask)
-
- def delete(self: BaseMaskedArrayT, loc, axis: AxisInt = 0) -> BaseMaskedArrayT:
- data = np.delete(self._data, loc, axis=axis)
- mask = np.delete(self._mask, loc, axis=axis)
- return type(self)(data, mask)
-
- def reshape(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
- data = self._data.reshape(*args, **kwargs)
- mask = self._mask.reshape(*args, **kwargs)
- return type(self)(data, mask)
-
- def ravel(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
- # TODO: need to make sure we have the same order for data/mask
- data = self._data.ravel(*args, **kwargs)
- mask = self._mask.ravel(*args, **kwargs)
- return type(self)(data, mask)
-
- @property
- def T(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return type(self)(self._data.T, self._mask.T)
-
- def round(self, decimals: int = 0, *args, **kwargs):
- """
- Round each value in the array to the given number of decimals.
-
- Parameters
- ----------
- decimals : int, default 0
- Number of decimal places to round to. If decimals is negative,
- it specifies the number of positions to the left of the decimal point.
- *args, **kwargs
- Additional arguments and keywords have no effect but might be
- accepted for compatibility with NumPy.
-
- Returns
- -------
- NumericArray
- Rounded values of the NumericArray.
-
- See Also
- --------
- numpy.around : Round values of an np.array.
- DataFrame.round : Round values of a DataFrame.
- Series.round : Round values of a Series.
- """
- nv.validate_round(args, kwargs)
- values = np.round(self._data, decimals=decimals, **kwargs)
-
- # Usually we'll get same type as self, but ndarray[bool] casts to float
- return self._maybe_mask_result(values, self._mask.copy())
-
- # ------------------------------------------------------------------
- # Unary Methods
-
- def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return type(self)(~self._data, self._mask.copy())
-
- def __neg__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return type(self)(-self._data, self._mask.copy())
-
- def __pos__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return self.copy()
-
- def __abs__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return type(self)(abs(self._data), self._mask.copy())
-
- # ------------------------------------------------------------------
-
- def to_numpy(
- self,
- dtype: npt.DTypeLike | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- """
- Convert to a NumPy Array.
-
- By default converts to an object-dtype NumPy array. Specify the `dtype` and
- `na_value` keywords to customize the conversion.
-
- Parameters
- ----------
- dtype : dtype, default object
- The numpy dtype to convert to.
- copy : bool, default False
- Whether to ensure that the returned value is not a view on
- the array. Note that ``copy=False`` does not *ensure* that
- ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
- a copy is made, even if not strictly necessary. This is typically
- only possible when no missing values are present and `dtype`
- is the equivalent numpy dtype.
- na_value : scalar, optional
- Scalar missing value indicator to use in numpy array. Defaults
- to the native missing value indicator of this array (pd.NA).
-
- Returns
- -------
- numpy.ndarray
-
- Examples
- --------
- An object-dtype is the default result
-
- >>> a = pd.array([True, False, pd.NA], dtype="boolean")
- >>> a.to_numpy()
- array([True, False, <NA>], dtype=object)
-
- When no missing values are present, an equivalent dtype can be used.
-
- >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
- array([ True, False])
- >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
- array([1, 2])
-
- However, requesting such a dtype will raise a ValueError if
- missing values are present and the default missing value :attr:`NA`
- is used.
-
- >>> a = pd.array([True, False, pd.NA], dtype="boolean")
- >>> a
- <BooleanArray>
- [True, False, <NA>]
- Length: 3, dtype: boolean
-
- >>> a.to_numpy(dtype="bool")
- Traceback (most recent call last):
- ...
- ValueError: cannot convert to bool numpy array in presence of missing values
-
- Specify a valid `na_value` instead
-
- >>> a.to_numpy(dtype="bool", na_value=False)
- array([ True, False, False])
- """
- if na_value is lib.no_default:
- na_value = libmissing.NA
- if dtype is None:
- dtype = object
- if self._hasna:
- if (
- not is_object_dtype(dtype)
- and not is_string_dtype(dtype)
- and na_value is libmissing.NA
- ):
- raise ValueError(
- f"cannot convert to '{dtype}'-dtype NumPy array "
- "with missing values. Specify an appropriate 'na_value' "
- "for this dtype."
- )
- # don't pass copy to astype -> always need a copy since we are mutating
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=RuntimeWarning)
- data = self._data.astype(dtype)
- data[self._mask] = na_value
- else:
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=RuntimeWarning)
- data = self._data.astype(dtype, copy=copy)
- return data
-
- @doc(ExtensionArray.tolist)
- def tolist(self):
- if self.ndim > 1:
- return [x.tolist() for x in self]
- dtype = None if self._hasna else self._data.dtype
- return self.to_numpy(dtype=dtype).tolist()
-
- @overload
- def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
- ...
-
- @overload
- def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
- ...
-
- @overload
- def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
- ...
-
- def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
- dtype = pandas_dtype(dtype)
-
- if is_dtype_equal(dtype, self.dtype):
- if copy:
- return self.copy()
- return self
-
- # if we are astyping to another nullable masked dtype, we can fastpath
- if isinstance(dtype, BaseMaskedDtype):
- # TODO deal with NaNs for FloatingArray case
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=RuntimeWarning)
- # TODO: Is rounding what we want long term?
- data = self._data.astype(dtype.numpy_dtype, copy=copy)
- # mask is copied depending on whether the data was copied, and
- # not directly depending on the `copy` keyword
- mask = self._mask if data is self._data else self._mask.copy()
- cls = dtype.construct_array_type()
- return cls(data, mask, copy=False)
-
- if isinstance(dtype, ExtensionDtype):
- eacls = dtype.construct_array_type()
- return eacls._from_sequence(self, dtype=dtype, copy=copy)
-
- na_value: float | np.datetime64 | lib.NoDefault
-
- # coerce
- if is_float_dtype(dtype):
- # In astype, we consider dtype=float to also mean na_value=np.nan
- na_value = np.nan
- elif is_datetime64_dtype(dtype):
- na_value = np.datetime64("NaT")
- else:
- na_value = lib.no_default
-
- # to_numpy will also raise, but we get somewhat nicer exception messages here
- if is_integer_dtype(dtype) and self._hasna:
- raise ValueError("cannot convert NA to integer")
- if is_bool_dtype(dtype) and self._hasna:
- # careful: astype_nansafe converts np.nan to True
- raise ValueError("cannot convert float NaN to bool")
-
- data = self.to_numpy(dtype=dtype, na_value=na_value, copy=copy)
- return data
-
- __array_priority__ = 1000 # higher than ndarray so ops dispatch to us
-
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- """
- The array interface; return my values.
- We return an object array here to preserve our scalar values.
- """
- return self.to_numpy(dtype=dtype)
-
- _HANDLED_TYPES: tuple[type, ...]
-
- def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- # For MaskedArray inputs, we apply the ufunc to ._data
- # and mask the result.
-
- out = kwargs.get("out", ())
-
- for x in inputs + out:
- if not isinstance(x, self._HANDLED_TYPES + (BaseMaskedArray,)):
- return NotImplemented
-
- # for binary ops, use our custom dunder methods
- result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- if "out" in kwargs:
- # e.g. test_ufunc_with_out
- return arraylike.dispatch_ufunc_with_out(
- self, ufunc, method, *inputs, **kwargs
- )
-
- if method == "reduce":
- result = arraylike.dispatch_reduction_ufunc(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- mask = np.zeros(len(self), dtype=bool)
- inputs2 = []
- for x in inputs:
- if isinstance(x, BaseMaskedArray):
- mask |= x._mask
- inputs2.append(x._data)
- else:
- inputs2.append(x)
-
- def reconstruct(x):
- # we don't worry about scalar `x` here, since we
- # raise for reduce up above.
- from pandas.core.arrays import (
- BooleanArray,
- FloatingArray,
- IntegerArray,
- )
-
- if is_bool_dtype(x.dtype):
- m = mask.copy()
- return BooleanArray(x, m)
- elif is_integer_dtype(x.dtype):
- m = mask.copy()
- return IntegerArray(x, m)
- elif is_float_dtype(x.dtype):
- m = mask.copy()
- if x.dtype == np.float16:
- # reached in e.g. np.sqrt on BooleanArray
- # we don't support float16
- x = x.astype(np.float32)
- return FloatingArray(x, m)
- else:
- x[mask] = np.nan
- return x
-
- result = getattr(ufunc, method)(*inputs2, **kwargs)
- if ufunc.nout > 1:
- # e.g. np.divmod
- return tuple(reconstruct(x) for x in result)
- elif method == "reduce":
- # e.g. np.add.reduce; test_ufunc_reduce_raises
- if self._mask.any():
- return self._na_value
- return result
- else:
- return reconstruct(result)
-
- def __arrow_array__(self, type=None):
- """
- Convert myself into a pyarrow Array.
- """
- import pyarrow as pa
-
- return pa.array(self._data, mask=self._mask, type=type)
-
- @property
- def _hasna(self) -> bool:
- # Note: this is expensive right now! The hope is that we can
- # make this faster by having an optional mask, but not have to change
- # source code using it.
-
- # error: Incompatible return value type (got "bool_", expected "bool")
- return self._mask.any() # type: ignore[return-value]
-
- def _propagate_mask(
- self, mask: npt.NDArray[np.bool_] | None, other
- ) -> npt.NDArray[np.bool_]:
- if mask is None:
- mask = self._mask.copy() # TODO: need test for BooleanArray needing a copy
- if other is libmissing.NA:
- # GH#45421 don't alter inplace
- mask = mask | True
- elif is_list_like(other) and len(other) == len(mask):
- mask = mask | isna(other)
- else:
- mask = self._mask | mask
- # Incompatible return value type (got "Optional[ndarray[Any, dtype[bool_]]]",
- # expected "ndarray[Any, dtype[bool_]]")
- return mask # type: ignore[return-value]
-
- def _arith_method(self, other, op):
- op_name = op.__name__
- omask = None
-
- if (
- not hasattr(other, "dtype")
- and is_list_like(other)
- and len(other) == len(self)
- ):
- # Try inferring masked dtype instead of casting to object
- inferred_dtype = lib.infer_dtype(other, skipna=True)
- if inferred_dtype == "integer":
- from pandas.core.arrays import IntegerArray
-
- other = IntegerArray._from_sequence(other)
- elif inferred_dtype in ["floating", "mixed-integer-float"]:
- from pandas.core.arrays import FloatingArray
-
- other = FloatingArray._from_sequence(other)
-
- elif inferred_dtype in ["boolean"]:
- from pandas.core.arrays import BooleanArray
-
- other = BooleanArray._from_sequence(other)
-
- if isinstance(other, BaseMaskedArray):
- other, omask = other._data, other._mask
-
- elif is_list_like(other):
- if not isinstance(other, ExtensionArray):
- other = np.asarray(other)
- if other.ndim > 1:
- raise NotImplementedError("can only perform ops with 1-d structures")
-
- # We wrap the non-masked arithmetic logic used for numpy dtypes
- # in Series/Index arithmetic ops.
- other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
- pd_op = ops.get_array_op(op)
- other = ensure_wrapped_if_datetimelike(other)
-
- if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
- # Avoid DeprecationWarning: In future, it will be an error
- # for 'np.bool_' scalars to be interpreted as an index
- # e.g. test_array_scalar_like_equivalence
- other = bool(other)
-
- mask = self._propagate_mask(omask, other)
-
- if other is libmissing.NA:
- result = np.ones_like(self._data)
- if self.dtype.kind == "b":
- if op_name in {
- "floordiv",
- "rfloordiv",
- "pow",
- "rpow",
- "truediv",
- "rtruediv",
- }:
- # GH#41165 Try to match non-masked Series behavior
- # This is still imperfect GH#46043
- raise NotImplementedError(
- f"operator '{op_name}' not implemented for bool dtypes"
- )
- if op_name in {"mod", "rmod"}:
- dtype = "int8"
- else:
- dtype = "bool"
- result = result.astype(dtype)
- elif "truediv" in op_name and self.dtype.kind != "f":
- # The actual data here doesn't matter since the mask
- # will be all-True, but since this is division, we want
- # to end up with floating dtype.
- result = result.astype(np.float64)
- else:
- # Make sure we do this before the "pow" mask checks
- # to get an expected exception message on shape mismatch.
- if self.dtype.kind in ["i", "u"] and op_name in ["floordiv", "mod"]:
- # TODO(GH#30188) ATM we don't match the behavior of non-masked
- # types with respect to floordiv-by-zero
- pd_op = op
-
- with np.errstate(all="ignore"):
- result = pd_op(self._data, other)
-
- if op_name == "pow":
- # 1 ** x is 1.
- mask = np.where((self._data == 1) & ~self._mask, False, mask)
- # x ** 0 is 1.
- if omask is not None:
- mask = np.where((other == 0) & ~omask, False, mask)
- elif other is not libmissing.NA:
- mask = np.where(other == 0, False, mask)
-
- elif op_name == "rpow":
- # 1 ** x is 1.
- if omask is not None:
- mask = np.where((other == 1) & ~omask, False, mask)
- elif other is not libmissing.NA:
- mask = np.where(other == 1, False, mask)
- # x ** 0 is 1.
- mask = np.where((self._data == 0) & ~self._mask, False, mask)
-
- return self._maybe_mask_result(result, mask)
-
- _logical_method = _arith_method
-
- def _cmp_method(self, other, op) -> BooleanArray:
- from pandas.core.arrays import BooleanArray
-
- mask = None
-
- if isinstance(other, BaseMaskedArray):
- other, mask = other._data, other._mask
-
- elif is_list_like(other):
- other = np.asarray(other)
- if other.ndim > 1:
- raise NotImplementedError("can only perform ops with 1-d structures")
- if len(self) != len(other):
- raise ValueError("Lengths must match to compare")
-
- if other is libmissing.NA:
- # numpy does not handle pd.NA well as "other" scalar (it returns
- # a scalar False instead of an array)
- # This may be fixed by NA.__array_ufunc__. Revisit this check
- # once that's implemented.
- result = np.zeros(self._data.shape, dtype="bool")
- mask = np.ones(self._data.shape, dtype="bool")
- else:
- with warnings.catch_warnings():
- # numpy may show a FutureWarning or DeprecationWarning:
- # elementwise comparison failed; returning scalar instead,
- # but in the future will perform elementwise comparison
- # before returning NotImplemented. We fall back to the correct
- # behavior today, so that should be fine to ignore.
- warnings.filterwarnings("ignore", "elementwise", FutureWarning)
- warnings.filterwarnings("ignore", "elementwise", DeprecationWarning)
- with np.errstate(all="ignore"):
- method = getattr(self._data, f"__{op.__name__}__")
- result = method(other)
-
- if result is NotImplemented:
- result = invalid_comparison(self._data, other, op)
-
- mask = self._propagate_mask(mask, other)
- return BooleanArray(result, mask, copy=False)
-
- def _maybe_mask_result(self, result, mask):
- """
- Parameters
- ----------
- result : array-like or tuple[array-like]
- mask : array-like bool
- """
- if isinstance(result, tuple):
- # i.e. divmod
- div, mod = result
- return (
- self._maybe_mask_result(div, mask),
- self._maybe_mask_result(mod, mask),
- )
-
- if is_float_dtype(result.dtype):
- from pandas.core.arrays import FloatingArray
-
- return FloatingArray(result, mask, copy=False)
-
- elif is_bool_dtype(result.dtype):
- from pandas.core.arrays import BooleanArray
-
- return BooleanArray(result, mask, copy=False)
-
- elif (
- isinstance(result.dtype, np.dtype)
- and result.dtype.kind == "m"
- and is_supported_unit(get_unit_from_dtype(result.dtype))
- ):
- # e.g. test_numeric_arr_mul_tdscalar_numexpr_path
- from pandas.core.arrays import TimedeltaArray
-
- if not isinstance(result, TimedeltaArray):
- result = TimedeltaArray._simple_new(result, dtype=result.dtype)
-
- result[mask] = result.dtype.type("NaT")
- return result
-
- elif is_integer_dtype(result.dtype):
- from pandas.core.arrays import IntegerArray
-
- return IntegerArray(result, mask, copy=False)
-
- else:
- result[mask] = np.nan
- return result
-
- def isna(self) -> np.ndarray:
- return self._mask.copy()
-
- @property
- def _na_value(self):
- return self.dtype.na_value
-
- @property
- def nbytes(self) -> int:
- return self._data.nbytes + self._mask.nbytes
-
- @classmethod
- def _concat_same_type(
- cls: type[BaseMaskedArrayT],
- to_concat: Sequence[BaseMaskedArrayT],
- axis: AxisInt = 0,
- ) -> BaseMaskedArrayT:
- data = np.concatenate([x._data for x in to_concat], axis=axis)
- mask = np.concatenate([x._mask for x in to_concat], axis=axis)
- return cls(data, mask)
-
- def take(
- self: BaseMaskedArrayT,
- indexer,
- *,
- allow_fill: bool = False,
- fill_value: Scalar | None = None,
- axis: AxisInt = 0,
- ) -> BaseMaskedArrayT:
- # we always fill with 1 internally
- # to avoid upcasting
- data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
- result = take(
- self._data,
- indexer,
- fill_value=data_fill_value,
- allow_fill=allow_fill,
- axis=axis,
- )
-
- mask = take(
- self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis
- )
-
- # if we are filling
- # we only fill where the indexer is null
- # not existing missing values
- # TODO(jreback) what if we have a non-na float as a fill value?
- if allow_fill and notna(fill_value):
- fill_mask = np.asarray(indexer) == -1
- result[fill_mask] = fill_value
- mask = mask ^ fill_mask
-
- return type(self)(result, mask, copy=False)
-
- # error: Return type "BooleanArray" of "isin" incompatible with return type
- # "ndarray" in supertype "ExtensionArray"
- def isin(self, values) -> BooleanArray: # type: ignore[override]
- from pandas.core.arrays import BooleanArray
-
- # algorithms.isin will eventually convert values to an ndarray, so no extra
- # cost to doing it here first
- values_arr = np.asarray(values)
- result = isin(self._data, values_arr)
-
- if self._hasna:
- values_have_NA = is_object_dtype(values_arr.dtype) and any(
- val is self.dtype.na_value for val in values_arr
- )
-
- # For now, NA does not propagate so set result according to presence of NA,
- # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
- result[self._mask] = values_have_NA
-
- mask = np.zeros(self._data.shape, dtype=bool)
- return BooleanArray(result, mask, copy=False)
-
- def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- data, mask = self._data, self._mask
- data = data.copy()
- mask = mask.copy()
- return type(self)(data, mask, copy=False)
-
- def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- """
- Compute the BaseMaskedArray of unique values.
-
- Returns
- -------
- uniques : BaseMaskedArray
- """
- uniques, mask = algos.unique_with_mask(self._data, self._mask)
- return type(self)(uniques, mask, copy=False)
-
- @doc(ExtensionArray.searchsorted)
- def searchsorted(
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- if self._hasna:
- raise ValueError(
- "searchsorted requires array to be sorted, which is impossible "
- "with NAs present."
- )
- if isinstance(value, ExtensionArray):
- value = value.astype(object)
- # Base class searchsorted would cast to object, which is *much* slower.
- return self._data.searchsorted(value, side=side, sorter=sorter)
-
- @doc(ExtensionArray.factorize)
- def factorize(
- self,
- use_na_sentinel: bool = True,
- ) -> tuple[np.ndarray, ExtensionArray]:
- arr = self._data
- mask = self._mask
-
- # Use a sentinel for na; recode and add NA to uniques if necessary below
- codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask)
-
- # check that factorize_array correctly preserves dtype.
- assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)
-
- has_na = mask.any()
- if use_na_sentinel or not has_na:
- size = len(uniques)
- else:
- # Make room for an NA value
- size = len(uniques) + 1
- uniques_mask = np.zeros(size, dtype=bool)
- if not use_na_sentinel and has_na:
- na_index = mask.argmax()
- # Insert na with the proper code
- if na_index == 0:
- na_code = np.intp(0)
- else:
- # mypy error: Slice index must be an integer or None
- # https://github.com/python/mypy/issues/2410
- na_code = codes[:na_index].max() + 1 # type: ignore[misc]
- codes[codes >= na_code] += 1
- codes[codes == -1] = na_code
- # dummy value for uniques; not used since uniques_mask will be True
- uniques = np.insert(uniques, na_code, 0)
- uniques_mask[na_code] = True
- uniques_ea = type(self)(uniques, uniques_mask)
-
- return codes, uniques_ea
-
- @doc(ExtensionArray._values_for_argsort)
- def _values_for_argsort(self) -> np.ndarray:
- return self._data
-
- def value_counts(self, dropna: bool = True) -> Series:
- """
- Returns a Series containing counts of each unique value.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include counts of missing values.
-
- Returns
- -------
- counts : Series
-
- See Also
- --------
- Series.value_counts
- """
- from pandas import (
- Index,
- Series,
- )
- from pandas.arrays import IntegerArray
-
- keys, value_counts = algos.value_counts_arraylike(
- self._data, dropna=True, mask=self._mask
- )
-
- if dropna:
- res = Series(value_counts, index=keys, name="count", copy=False)
- res.index = res.index.astype(self.dtype)
- res = res.astype("Int64")
- return res
-
- # if we want nans, count the mask
- counts = np.empty(len(value_counts) + 1, dtype="int64")
- counts[:-1] = value_counts
- counts[-1] = self._mask.sum()
-
- index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
- index = index.astype(self.dtype)
-
- mask = np.zeros(len(counts), dtype="bool")
- counts_array = IntegerArray(counts, mask)
-
- return Series(counts_array, index=index, name="count", copy=False)
-
- @doc(ExtensionArray.equals)
- def equals(self, other) -> bool:
- if type(self) != type(other):
- return False
- if other.dtype != self.dtype:
- return False
-
- # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT
- # equal.
- if not np.array_equal(self._mask, other._mask):
- return False
-
- left = self._data[~self._mask]
- right = other._data[~other._mask]
- return array_equivalent(left, right, dtype_equal=True)
-
- def _quantile(
- self, qs: npt.NDArray[np.float64], interpolation: str
- ) -> BaseMaskedArray:
- """
- Dispatch to quantile_with_mask, needed because we do not have
- _from_factorized.
-
- Notes
- -----
- We assume that all impacted cases are 1D-only.
- """
- res = quantile_with_mask(
- self._data,
- mask=self._mask,
- # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
- # instead of np.nan
- fill_value=np.nan,
- qs=qs,
- interpolation=interpolation,
- )
-
- if self._hasna:
- # Our result mask is all-False unless we are all-NA, in which
- # case it is all-True.
- if self.ndim == 2:
- # I think this should be out_mask=self.isna().all(axis=1)
- # but am holding off until we have tests
- raise NotImplementedError
- if self.isna().all():
- out_mask = np.ones(res.shape, dtype=bool)
-
- if is_integer_dtype(self.dtype):
- # We try to maintain int dtype if possible for not all-na case
- # as well
- res = np.zeros(res.shape, dtype=self.dtype.numpy_dtype)
- else:
- out_mask = np.zeros(res.shape, dtype=bool)
- else:
- out_mask = np.zeros(res.shape, dtype=bool)
- return self._maybe_mask_result(res, mask=out_mask)
-
- # ------------------------------------------------------------------
- # Reductions
-
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
- if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}:
- return getattr(self, name)(skipna=skipna, **kwargs)
-
- data = self._data
- mask = self._mask
-
- # median, skew, kurt, sem
- op = getattr(nanops, f"nan{name}")
- result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
-
- if np.isnan(result):
- return libmissing.NA
-
- return result
-
- def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
- if isinstance(result, np.ndarray):
- axis = kwargs["axis"]
- if skipna:
- # we only retain mask for all-NA rows/columns
- mask = self._mask.all(axis=axis)
- else:
- mask = self._mask.any(axis=axis)
-
- return self._maybe_mask_result(result, mask)
- return result
-
- def sum(
- self,
- *,
- skipna: bool = True,
- min_count: int = 0,
- axis: AxisInt | None = 0,
- **kwargs,
- ):
- nv.validate_sum((), kwargs)
-
- # TODO: do this in validate_sum?
- if "out" in kwargs:
- # np.sum; test_floating_array_numpy_sum
- if kwargs["out"] is not None:
- raise NotImplementedError
- kwargs.pop("out")
-
- result = masked_reductions.sum(
- self._data,
- self._mask,
- skipna=skipna,
- min_count=min_count,
- axis=axis,
- )
- return self._wrap_reduction_result(
- "sum", result, skipna=skipna, axis=axis, **kwargs
- )
-
- def prod(
- self,
- *,
- skipna: bool = True,
- min_count: int = 0,
- axis: AxisInt | None = 0,
- **kwargs,
- ):
- nv.validate_prod((), kwargs)
- result = masked_reductions.prod(
- self._data,
- self._mask,
- skipna=skipna,
- min_count=min_count,
- axis=axis,
- )
- return self._wrap_reduction_result(
- "prod", result, skipna=skipna, axis=axis, **kwargs
- )
-
- def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
- nv.validate_mean((), kwargs)
- result = masked_reductions.mean(
- self._data,
- self._mask,
- skipna=skipna,
- axis=axis,
- )
- return self._wrap_reduction_result(
- "mean", result, skipna=skipna, axis=axis, **kwargs
- )
-
- def var(
- self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs
- ):
- nv.validate_stat_ddof_func((), kwargs, fname="var")
- result = masked_reductions.var(
- self._data,
- self._mask,
- skipna=skipna,
- axis=axis,
- ddof=ddof,
- )
- return self._wrap_reduction_result(
- "var", result, skipna=skipna, axis=axis, **kwargs
- )
-
- def std(
- self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs
- ):
- nv.validate_stat_ddof_func((), kwargs, fname="std")
- result = masked_reductions.std(
- self._data,
- self._mask,
- skipna=skipna,
- axis=axis,
- ddof=ddof,
- )
- return self._wrap_reduction_result(
- "std", result, skipna=skipna, axis=axis, **kwargs
- )
-
- def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
- nv.validate_min((), kwargs)
- return masked_reductions.min(
- self._data,
- self._mask,
- skipna=skipna,
- axis=axis,
- )
-
- def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
- nv.validate_max((), kwargs)
- return masked_reductions.max(
- self._data,
- self._mask,
- skipna=skipna,
- axis=axis,
- )
-
- def any(self, *, skipna: bool = True, **kwargs):
- """
- Return whether any element is truthy.
-
- Returns False unless there is at least one element that is truthy.
- By default, NAs are skipped. If ``skipna=False`` is specified and
- missing values are present, the same :ref:`Kleene logic <boolean.kleene>`
- used for logical operations is applied.
-
- .. versionchanged:: 1.4.0
-
- Parameters
- ----------
- skipna : bool, default True
- Exclude NA values. If the entire array is NA and `skipna` is
- True, then the result will be False, as for an empty array.
- If `skipna` is False, the result will still be True if there is
- at least one element that is truthy, otherwise NA will be returned
- if there are NAs present.
- **kwargs : any, default None
- Additional keywords have no effect but might be accepted for
- compatibility with NumPy.
-
- Returns
- -------
- bool or :attr:`pandas.NA`
-
- See Also
- --------
- numpy.any : Numpy version of this method.
- BaseMaskedArray.all : Return whether all elements are truthy.
-
- Examples
- --------
- The result indicates whether any element is truthy (and by default
- skips NAs):
-
- >>> pd.array([True, False, True]).any()
- True
- >>> pd.array([True, False, pd.NA]).any()
- True
- >>> pd.array([False, False, pd.NA]).any()
- False
- >>> pd.array([], dtype="boolean").any()
- False
- >>> pd.array([pd.NA], dtype="boolean").any()
- False
- >>> pd.array([pd.NA], dtype="Float64").any()
- False
-
- With ``skipna=False``, the result can be NA if this is logically
- required (whether ``pd.NA`` is True or False influences the result):
-
- >>> pd.array([True, False, pd.NA]).any(skipna=False)
- True
- >>> pd.array([1, 0, pd.NA]).any(skipna=False)
- True
- >>> pd.array([False, False, pd.NA]).any(skipna=False)
- <NA>
- >>> pd.array([0, 0, pd.NA]).any(skipna=False)
- <NA>
- """
- kwargs.pop("axis", None)
- nv.validate_any((), kwargs)
-
- values = self._data.copy()
- # error: Argument 3 to "putmask" has incompatible type "object";
- # expected "Union[_SupportsArray[dtype[Any]],
- # _NestedSequence[_SupportsArray[dtype[Any]]],
- # bool, int, float, complex, str, bytes,
- # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
- np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type]
- result = values.any()
- if skipna:
- return result
- else:
- if result or len(self) == 0 or not self._mask.any():
- return result
- else:
- return self.dtype.na_value
-
- def all(self, *, skipna: bool = True, **kwargs):
- """
- Return whether all elements are truthy.
-
- Returns True unless there is at least one element that is falsey.
- By default, NAs are skipped. If ``skipna=False`` is specified and
- missing values are present, the same :ref:`Kleene logic <boolean.kleene>`
- used for logical operations is applied.
-
- .. versionchanged:: 1.4.0
-
- Parameters
- ----------
- skipna : bool, default True
- Exclude NA values. If the entire array is NA and `skipna` is
- True, then the result will be True, as for an empty array.
- If `skipna` is False, the result will still be False if there is
- at least one element that is falsey, otherwise NA will be returned
- if there are NAs present.
- **kwargs : any, default None
- Additional keywords have no effect but might be accepted for
- compatibility with NumPy.
-
- Returns
- -------
- bool or :attr:`pandas.NA`
-
- See Also
- --------
- numpy.all : Numpy version of this method.
- BooleanArray.any : Return whether any element is truthy.
-
- Examples
- --------
- The result indicates whether all elements are truthy (and by default
- skips NAs):
-
- >>> pd.array([True, True, pd.NA]).all()
- True
- >>> pd.array([1, 1, pd.NA]).all()
- True
- >>> pd.array([True, False, pd.NA]).all()
- False
- >>> pd.array([], dtype="boolean").all()
- True
- >>> pd.array([pd.NA], dtype="boolean").all()
- True
- >>> pd.array([pd.NA], dtype="Float64").all()
- True
-
- With ``skipna=False``, the result can be NA if this is logically
- required (whether ``pd.NA`` is True or False influences the result):
-
- >>> pd.array([True, True, pd.NA]).all(skipna=False)
- <NA>
- >>> pd.array([1, 1, pd.NA]).all(skipna=False)
- <NA>
- >>> pd.array([True, False, pd.NA]).all(skipna=False)
- False
- >>> pd.array([1, 0, pd.NA]).all(skipna=False)
- False
- """
- kwargs.pop("axis", None)
- nv.validate_all((), kwargs)
-
- values = self._data.copy()
- # error: Argument 3 to "putmask" has incompatible type "object";
- # expected "Union[_SupportsArray[dtype[Any]],
- # _NestedSequence[_SupportsArray[dtype[Any]]],
- # bool, int, float, complex, str, bytes,
- # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
- np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type]
- result = values.all()
-
- if skipna:
- return result
- else:
- if not result or len(self) == 0 or not self._mask.any():
- return result
- else:
- return self.dtype.na_value
-
- def _accumulate(
- self, name: str, *, skipna: bool = True, **kwargs
- ) -> BaseMaskedArray:
- data = self._data
- mask = self._mask
-
- op = getattr(masked_accumulations, name)
- data, mask = op(data, mask, skipna=skipna, **kwargs)
-
- return type(self)(data, mask, copy=False)
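Likewise, a short sketch of the nullable-array semantics that the deleted masked.py above provided through BaseMaskedArray, exercised via the public pd.array constructor (names are illustrative; the expected results follow the docstrings in the removed file):

    import numpy as np
    import pandas as pd

    a = pd.array([1, 2, pd.NA], dtype="Int64")        # data ndarray plus a boolean mask
    a.isna()                                          # array([False, False,  True])
    a.to_numpy(dtype="float64", na_value=np.nan)      # array([ 1.,  2., nan])
    pd.array([True, False, pd.NA]).any()              # True  (NAs skipped by default)
    pd.array([True, True, pd.NA]).all(skipna=False)   # <NA>  (Kleene logic when skipna=False)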
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/numeric.py b/contrib/python/pandas/py3/pandas/core/arrays/numeric.py
deleted file mode 100644
index 95802b0175f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/numeric.py
+++ /dev/null
@@ -1,291 +0,0 @@
-from __future__ import annotations
-
-import numbers
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Mapping,
- TypeVar,
-)
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- missing as libmissing,
-)
-from pandas._typing import (
- Dtype,
- DtypeObj,
- npt,
-)
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_object_dtype,
- is_string_dtype,
- pandas_dtype,
-)
-
-from pandas.core.arrays.masked import (
- BaseMaskedArray,
- BaseMaskedDtype,
-)
-
-if TYPE_CHECKING:
- import pyarrow
-
-
-T = TypeVar("T", bound="NumericArray")
-
-
-class NumericDtype(BaseMaskedDtype):
- _default_np_dtype: np.dtype
- _checker: Callable[[Any], bool] # is_foo_dtype
-
- def __repr__(self) -> str:
- return f"{self.name}Dtype()"
-
- @cache_readonly
- def is_signed_integer(self) -> bool:
- return self.kind == "i"
-
- @cache_readonly
- def is_unsigned_integer(self) -> bool:
- return self.kind == "u"
-
- @property
- def _is_numeric(self) -> bool:
- return True
-
- def __from_arrow__(
- self, array: pyarrow.Array | pyarrow.ChunkedArray
- ) -> BaseMaskedArray:
- """
- Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
- """
- import pyarrow
-
- from pandas.core.arrays.arrow._arrow_utils import (
- pyarrow_array_to_numpy_and_mask,
- )
-
- array_class = self.construct_array_type()
-
- pyarrow_type = pyarrow.from_numpy_dtype(self.type)
- if not array.type.equals(pyarrow_type):
- # test_from_arrow_type_error raise for string, but allow
- # through itemsize conversion GH#31896
- rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
- if rt_dtype.kind not in ["i", "u", "f"]:
- # Could allow "c" or potentially disallow float<->int conversion,
- # but at the moment we specifically test that uint<->int works
- raise TypeError(
- f"Expected array of {self} type, got {array.type} instead"
- )
-
- array = array.cast(pyarrow_type)
-
- if isinstance(array, pyarrow.Array):
- chunks = [array]
- else:
- # pyarrow.ChunkedArray
- chunks = array.chunks
-
- results = []
- for arr in chunks:
- data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.numpy_dtype)
- num_arr = array_class(data.copy(), ~mask, copy=False)
- results.append(num_arr)
-
- if not results:
- return array_class(
- np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
- )
- elif len(results) == 1:
- # avoid additional copy in _concat_same_type
- return results[0]
- else:
- return array_class._concat_same_type(results)
-
- @classmethod
- def _str_to_dtype_mapping(cls) -> Mapping[str, NumericDtype]:
- raise AbstractMethodError(cls)
-
- @classmethod
- def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
- """
- Convert a string representation or a numpy dtype to NumericDtype.
- """
- if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
- # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
- # https://github.com/numpy/numpy/pull/7476
- dtype = dtype.lower()
-
- if not isinstance(dtype, NumericDtype):
- mapping = cls._str_to_dtype_mapping()
- try:
- dtype = mapping[str(np.dtype(dtype))]
- except KeyError as err:
- raise ValueError(f"invalid dtype specified {dtype}") from err
- return dtype
-
- @classmethod
- def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
- """
- Safely cast the values to the given dtype.
-
- "safe" in this context means the casting is lossless.
- """
- raise AbstractMethodError(cls)
-
-
-def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
- checker = dtype_cls._checker
-
- inferred_type = None
-
- if dtype is None and hasattr(values, "dtype"):
- if checker(values.dtype):
- dtype = values.dtype
-
- if dtype is not None:
- dtype = dtype_cls._standardize_dtype(dtype)
-
- cls = dtype_cls.construct_array_type()
- if isinstance(values, cls):
- values, mask = values._data, values._mask
- if dtype is not None:
- values = values.astype(dtype.numpy_dtype, copy=False)
-
- if copy:
- values = values.copy()
- mask = mask.copy()
- return values, mask, dtype, inferred_type
-
- original = values
- values = np.array(values, copy=copy)
- inferred_type = None
- if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
- inferred_type = lib.infer_dtype(values, skipna=True)
- if inferred_type == "boolean" and dtype is None:
- name = dtype_cls.__name__.strip("_")
- raise TypeError(f"{values.dtype} cannot be converted to {name}")
-
- elif is_bool_dtype(values) and checker(dtype):
- values = np.array(values, dtype=default_dtype, copy=copy)
-
- elif not (is_integer_dtype(values) or is_float_dtype(values)):
- name = dtype_cls.__name__.strip("_")
- raise TypeError(f"{values.dtype} cannot be converted to {name}")
-
- if values.ndim != 1:
- raise TypeError("values must be a 1D list-like")
-
- if mask is None:
- if is_integer_dtype(values):
- # fastpath
- mask = np.zeros(len(values), dtype=np.bool_)
- else:
- mask = libmissing.is_numeric_na(values)
- else:
- assert len(mask) == len(values)
-
- if mask.ndim != 1:
- raise TypeError("mask must be a 1D list-like")
-
- # infer dtype if needed
- if dtype is None:
- dtype = default_dtype
- else:
- dtype = dtype.type
-
- if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
- if mask.all():
- values = np.ones(values.shape, dtype=dtype)
- else:
- idx = np.nanargmax(values)
- if int(values[idx]) != original[idx]:
- # We have ints that lost precision during the cast.
- inferred_type = lib.infer_dtype(original, skipna=True)
- if (
- inferred_type not in ["floating", "mixed-integer-float"]
- and not mask.any()
- ):
- values = np.array(original, dtype=dtype, copy=False)
- else:
- values = np.array(original, dtype="object", copy=False)
-
- # we copy as need to coerce here
- if mask.any():
- values = values.copy()
- values[mask] = cls._internal_fill_value
- if inferred_type in ("string", "unicode"):
- # casts from str are always safe since they raise
- # a ValueError if the str cannot be parsed into a float
- values = values.astype(dtype, copy=copy)
- else:
- values = dtype_cls._safe_cast(values, dtype, copy=False)
-
- return values, mask, dtype, inferred_type
-
-
-class NumericArray(BaseMaskedArray):
- """
- Base class for IntegerArray and FloatingArray.
- """
-
- _dtype_cls: type[NumericDtype]
-
- def __init__(
- self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
- ) -> None:
- checker = self._dtype_cls._checker
- if not (isinstance(values, np.ndarray) and checker(values.dtype)):
- descr = (
- "floating"
- if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap]
- else "integer"
- )
- raise TypeError(
- f"values should be {descr} numpy array. Use "
- "the 'pd.array' function instead"
- )
- if values.dtype == np.float16:
- # If we don't raise here, then accessing self.dtype would raise
- raise TypeError("FloatingArray does not support np.float16 dtype.")
-
- super().__init__(values, mask, copy=copy)
-
- @cache_readonly
- def dtype(self) -> NumericDtype:
- mapping = self._dtype_cls._str_to_dtype_mapping()
- return mapping[str(self._data.dtype)]
-
- @classmethod
- def _coerce_to_array(
- cls, value, *, dtype: DtypeObj, copy: bool = False
- ) -> tuple[np.ndarray, np.ndarray]:
- dtype_cls = cls._dtype_cls
- default_dtype = dtype_cls._default_np_dtype
- mask = None
- values, mask, _, _ = _coerce_to_data_and_mask(
- value, mask, dtype, copy, dtype_cls, default_dtype
- )
- return values, mask
-
- @classmethod
- def _from_sequence_of_strings(
- cls: type[T], strings, *, dtype: Dtype | None = None, copy: bool = False
- ) -> T:
- from pandas.core.tools.numeric import to_numeric
-
- scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
- return cls._from_sequence(scalars, dtype=dtype, copy=copy)
-
- _HANDLED_TYPES = (np.ndarray, numbers.Number)
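Similarly, a brief sketch of the behaviour the deleted numeric.py above backed (IntegerArray/FloatingArray construction and NA-aware arithmetic), again via the public API with illustrative names:

    import pandas as pd

    a = pd.array([1, 2, None], dtype="Int64")      # IntegerArray, a NumericArray subclass
    b = pd.array([0.5, 1.0, 1.5], dtype="Float64")
    a.dtype                                        # Int64Dtype()
    a + 1                                          # <IntegerArray> [2, 3, <NA>]; NA propagates via the mask
    a + b                                          # Float64 result: a common numeric dtype is chosen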
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/numpy_.py b/contrib/python/pandas/py3/pandas/core/arrays/numpy_.py
deleted file mode 100644
index 216dbede39a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/numpy_.py
+++ /dev/null
@@ -1,476 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.tslibs import (
- get_unit_from_dtype,
- is_supported_unit,
-)
-from pandas._typing import (
- AxisInt,
- Dtype,
- NpDtype,
- Scalar,
- npt,
-)
-from pandas.compat.numpy import function as nv
-
-from pandas.core.dtypes.astype import astype_array
-from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
-from pandas.core.dtypes.common import (
- is_dtype_equal,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import PandasDtype
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import (
- arraylike,
- nanops,
- ops,
-)
-from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-from pandas.core.construction import ensure_wrapped_if_datetimelike
-from pandas.core.strings.object_array import ObjectStringArrayMixin
-
-
-class PandasArray(
- OpsMixin,
- NDArrayBackedExtensionArray,
- ObjectStringArrayMixin,
-):
- """
- A pandas ExtensionArray for NumPy data.
-
- This is mostly for internal compatibility, and is not especially
- useful on its own.
-
- Parameters
- ----------
- values : ndarray
- The NumPy ndarray to wrap. Must be 1-dimensional.
- copy : bool, default False
- Whether to copy `values`.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
- """
-
- # If you're wondering why pd.Series(cls) doesn't put the array in an
- # ExtensionBlock, search for `ABCPandasArray`. We check for
- # that _typ to ensure that users don't unnecessarily use EAs inside
- # pandas internals, which turns off things like block consolidation.
- _typ = "npy_extension"
- __array_priority__ = 1000
- _ndarray: np.ndarray
- _dtype: PandasDtype
- _internal_fill_value = np.nan
-
- # ------------------------------------------------------------------------
- # Constructors
-
- def __init__(self, values: np.ndarray | PandasArray, copy: bool = False) -> None:
- if isinstance(values, type(self)):
- values = values._ndarray
- if not isinstance(values, np.ndarray):
- raise ValueError(
- f"'values' must be a NumPy array, not {type(values).__name__}"
- )
-
- if values.ndim == 0:
- # Technically we support 2, but do not advertise that fact.
- raise ValueError("PandasArray must be 1-dimensional.")
-
- if copy:
- values = values.copy()
-
- dtype = PandasDtype(values.dtype)
- super().__init__(values, dtype)
-
- @classmethod
- def _from_sequence(
- cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
- ) -> PandasArray:
- if isinstance(dtype, PandasDtype):
- dtype = dtype._dtype
-
- # error: Argument "dtype" to "asarray" has incompatible type
- # "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
- # None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
- # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
- # _DTypeDict, Tuple[Any, Any]]]"
- result = np.asarray(scalars, dtype=dtype) # type: ignore[arg-type]
- if (
- result.ndim > 1
- and not hasattr(scalars, "dtype")
- and (dtype is None or dtype == object)
- ):
- # e.g. list-of-tuples
- result = construct_1d_object_array_from_listlike(scalars)
-
- if copy and result is scalars:
- result = result.copy()
- return cls(result)
-
- def _from_backing_data(self, arr: np.ndarray) -> PandasArray:
- return type(self)(arr)
-
- # ------------------------------------------------------------------------
- # Data
-
- @property
- def dtype(self) -> PandasDtype:
- return self._dtype
-
- # ------------------------------------------------------------------------
- # NumPy Array Interface
-
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- return np.asarray(self._ndarray, dtype=dtype)
-
- def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- # Lightly modified version of
- # https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
- # The primary modification is not boxing scalar return values
- # in PandasArray, since pandas' ExtensionArrays are 1-d.
- out = kwargs.get("out", ())
-
- result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- if "out" in kwargs:
- # e.g. test_ufunc_unary
- return arraylike.dispatch_ufunc_with_out(
- self, ufunc, method, *inputs, **kwargs
- )
-
- if method == "reduce":
- result = arraylike.dispatch_reduction_ufunc(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- # e.g. tests.series.test_ufunc.TestNumpyReductions
- return result
-
- # Defer to the implementation of the ufunc on unwrapped values.
- inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs)
- if out:
- kwargs["out"] = tuple(
- x._ndarray if isinstance(x, PandasArray) else x for x in out
- )
- result = getattr(ufunc, method)(*inputs, **kwargs)
-
- if ufunc.nout > 1:
- # multiple return values; re-box array-like results
- return tuple(type(self)(x) for x in result)
- elif method == "at":
- # no return value
- return None
- elif method == "reduce":
- if isinstance(result, np.ndarray):
- # e.g. test_np_reduce_2d
- return type(self)(result)
-
- # e.g. test_np_max_nested_tuples
- return result
- else:
- # one return value; re-box array-like results
- return type(self)(result)
-
- # ------------------------------------------------------------------------
- # Pandas ExtensionArray Interface
-
- def astype(self, dtype, copy: bool = True):
- dtype = pandas_dtype(dtype)
-
- if is_dtype_equal(dtype, self.dtype):
- if copy:
- return self.copy()
- return self
-
- result = astype_array(self._ndarray, dtype=dtype, copy=copy)
- return result
-
- def isna(self) -> np.ndarray:
- return isna(self._ndarray)
-
- def _validate_scalar(self, fill_value):
- if fill_value is None:
- # Primarily for subclasses
- fill_value = self.dtype.na_value
- return fill_value
-
- def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
- if self.dtype.kind in ["i", "u", "b"]:
- fv = None
- else:
- fv = np.nan
- return self._ndarray, fv
-
- # ------------------------------------------------------------------------
- # Reductions
-
- def any(
- self,
- *,
- axis: AxisInt | None = None,
- out=None,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_any((), {"out": out, "keepdims": keepdims})
- result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- def all(
- self,
- *,
- axis: AxisInt | None = None,
- out=None,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_all((), {"out": out, "keepdims": keepdims})
- result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- def min(
- self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
- ) -> Scalar:
- nv.validate_min((), kwargs)
- result = nanops.nanmin(
- values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
- )
- return self._wrap_reduction_result(axis, result)
-
- def max(
- self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
- ) -> Scalar:
- nv.validate_max((), kwargs)
- result = nanops.nanmax(
- values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
- )
- return self._wrap_reduction_result(axis, result)
-
- def sum(
- self,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- min_count: int = 0,
- **kwargs,
- ) -> Scalar:
- nv.validate_sum((), kwargs)
- result = nanops.nansum(
- self._ndarray, axis=axis, skipna=skipna, min_count=min_count
- )
- return self._wrap_reduction_result(axis, result)
-
- def prod(
- self,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- min_count: int = 0,
- **kwargs,
- ) -> Scalar:
- nv.validate_prod((), kwargs)
- result = nanops.nanprod(
- self._ndarray, axis=axis, skipna=skipna, min_count=min_count
- )
- return self._wrap_reduction_result(axis, result)
-
- def mean(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
- result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- def median(
- self,
- *,
- axis: AxisInt | None = None,
- out=None,
- overwrite_input: bool = False,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_median(
- (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
- )
- result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- def std(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- ddof: int = 1,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
- )
- result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
- return self._wrap_reduction_result(axis, result)
-
- def var(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- ddof: int = 1,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
- )
- result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
- return self._wrap_reduction_result(axis, result)
-
- def sem(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- ddof: int = 1,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
- )
- result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
- return self._wrap_reduction_result(axis, result)
-
- def kurt(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
- )
- result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- def skew(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
- )
- result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
- return self._wrap_reduction_result(axis, result)
-
- # ------------------------------------------------------------------------
- # Additional Methods
-
- def to_numpy(
- self,
- dtype: npt.DTypeLike | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- mask = self.isna()
- if na_value is not lib.no_default and mask.any():
- result = self._ndarray.copy()
- result[mask] = na_value
- else:
- result = self._ndarray
-
- result = np.asarray(result, dtype=dtype)
-
- if copy and result is self._ndarray:
- result = result.copy()
-
- return result
-
- # ------------------------------------------------------------------------
- # Ops
-
- def __invert__(self) -> PandasArray:
- return type(self)(~self._ndarray)
-
- def __neg__(self) -> PandasArray:
- return type(self)(-self._ndarray)
-
- def __pos__(self) -> PandasArray:
- return type(self)(+self._ndarray)
-
- def __abs__(self) -> PandasArray:
- return type(self)(abs(self._ndarray))
-
- def _cmp_method(self, other, op):
- if isinstance(other, PandasArray):
- other = other._ndarray
-
- other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
- pd_op = ops.get_array_op(op)
- other = ensure_wrapped_if_datetimelike(other)
- with np.errstate(all="ignore"):
- result = pd_op(self._ndarray, other)
-
- if op is divmod or op is ops.rdivmod:
- a, b = result
- if isinstance(a, np.ndarray):
- # for e.g. op vs TimedeltaArray, we may already
- # have an ExtensionArray, in which case we do not wrap
- return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
- return a, b
-
- if isinstance(result, np.ndarray):
- # for e.g. multiplication vs TimedeltaArray, we may already
- # have an ExtensionArray, in which case we do not wrap
- return self._wrap_ndarray_result(result)
- return result
-
- _arith_method = _cmp_method
-
- def _wrap_ndarray_result(self, result: np.ndarray):
- # If we have timedelta64[ns] result, return a TimedeltaArray instead
- # of a PandasArray
- if result.dtype.kind == "m" and is_supported_unit(
- get_unit_from_dtype(result.dtype)
- ):
- from pandas.core.arrays import TimedeltaArray
-
- return TimedeltaArray._simple_new(result, dtype=result.dtype)
- return type(self)(result)
-
- # ------------------------------------------------------------------------
- # String methods interface
- _str_na_value = np.nan
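Illustrative sketch only (not part of the removed file): PandasArray wraps a plain ndarray and routes reductions through pandas' nan-aware ops, which is why `skipna` and `na_value` work even though the backing data is raw NumPy.

import numpy as np
import pandas as pd

pa = pd.arrays.PandasArray(np.array([1.0, np.nan, 3.0]))
print(pa.sum(skipna=True))        # 4.0 -- routed through nanops.nansum
print(pa.to_numpy(na_value=0.0))  # [1. 0. 3.] -- NaN replaced before conversion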
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/period.py b/contrib/python/pandas/py3/pandas/core/arrays/period.py
deleted file mode 100644
index f9404fbf573..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/period.py
+++ /dev/null
@@ -1,1148 +0,0 @@
-from __future__ import annotations
-
-from datetime import timedelta
-import operator
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Literal,
- Sequence,
- TypeVar,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import (
- algos as libalgos,
- lib,
-)
-from pandas._libs.arrays import NDArrayBacked
-from pandas._libs.tslibs import (
- BaseOffset,
- NaT,
- NaTType,
- Timedelta,
- astype_overflowsafe,
- dt64arr_to_periodarr as c_dt64arr_to_periodarr,
- get_unit_from_dtype,
- iNaT,
- parsing,
- period as libperiod,
- to_offset,
-)
-from pandas._libs.tslibs.dtypes import FreqGroup
-from pandas._libs.tslibs.fields import isleapyear_arr
-from pandas._libs.tslibs.offsets import (
- Tick,
- delta_to_tick,
-)
-from pandas._libs.tslibs.period import (
- DIFFERENT_FREQ,
- IncompatibleFrequency,
- Period,
- get_period_field_arr,
- period_asfreq_arr,
-)
-from pandas._typing import (
- AnyArrayLike,
- Dtype,
- NpDtype,
- npt,
-)
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.common import (
- ensure_object,
- is_datetime64_any_dtype,
- is_datetime64_dtype,
- is_dtype_equal,
- is_float_dtype,
- is_integer_dtype,
- is_period_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import PeriodDtype
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCPeriodIndex,
- ABCSeries,
- ABCTimedeltaArray,
-)
-from pandas.core.dtypes.missing import isna
-
-import pandas.core.algorithms as algos
-from pandas.core.arrays import datetimelike as dtl
-import pandas.core.common as com
-
-if TYPE_CHECKING:
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
-
- from pandas.core.arrays import (
- DatetimeArray,
- TimedeltaArray,
- )
- from pandas.core.arrays.base import ExtensionArray
-
-
-BaseOffsetT = TypeVar("BaseOffsetT", bound=BaseOffset)
-
-
-_shared_doc_kwargs = {
- "klass": "PeriodArray",
-}
-
-
-def _field_accessor(name: str, docstring=None):
- def f(self):
- base = self.freq._period_dtype_code
- result = get_period_field_arr(name, self.asi8, base)
- return result
-
- f.__name__ = name
- f.__doc__ = docstring
- return property(f)
-
-
-class PeriodArray(dtl.DatelikeOps, libperiod.PeriodMixin):
- """
- Pandas ExtensionArray for storing Period data.
-
- Users should use :func:`~pandas.period_array` to create new instances.
- Alternatively, :func:`~pandas.array` can be used to create new instances
- from a sequence of Period scalars.
-
- Parameters
- ----------
- values : Union[PeriodArray, Series[period], ndarray[int], PeriodIndex]
- The data to store. These should be arrays that can be directly
- converted to ordinals without inference or copy (PeriodArray,
- ndarray[int64]), or a box around such an array (Series[period],
- PeriodIndex).
- dtype : PeriodDtype, optional
- A PeriodDtype instance from which to extract a `freq`. If both
- `freq` and `dtype` are specified, then the frequencies must match.
- freq : str or DateOffset
- The `freq` to use for the array. Mostly applicable when `values`
- is an ndarray of integers, when `freq` is required. When `values`
-        is a PeriodArray (or a box around one), it's checked that ``values.freq``
- matches `freq`.
- copy : bool, default False
- Whether to copy the ordinals before storing.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- See Also
- --------
- Period: Represents a period of time.
- PeriodIndex : Immutable Index for period data.
- period_range: Create a fixed-frequency PeriodArray.
- array: Construct a pandas array.
-
- Notes
- -----
- There are two components to a PeriodArray
-
- - ordinals : integer ndarray
- - freq : pd.tseries.offsets.Offset
-
- The values are physically stored as a 1-D ndarray of integers. These are
- called "ordinals" and represent some kind of offset from a base.
-
- The `freq` indicates the span covered by each element of the array.
- All elements in the PeriodArray have the same `freq`.
- """
-
- # array priority higher than numpy scalars
- __array_priority__ = 1000
- _typ = "periodarray" # ABCPeriodArray
- _internal_fill_value = np.int64(iNaT)
- _recognized_scalars = (Period,)
- _is_recognized_dtype = is_period_dtype # check_compatible_with checks freq match
- _infer_matches = ("period",)
-
- @property
- def _scalar_type(self) -> type[Period]:
- return Period
-
- # Names others delegate to us
- _other_ops: list[str] = []
- _bool_ops: list[str] = ["is_leap_year"]
- _object_ops: list[str] = ["start_time", "end_time", "freq"]
- _field_ops: list[str] = [
- "year",
- "month",
- "day",
- "hour",
- "minute",
- "second",
- "weekofyear",
- "weekday",
- "week",
- "dayofweek",
- "day_of_week",
- "dayofyear",
- "day_of_year",
- "quarter",
- "qyear",
- "days_in_month",
- "daysinmonth",
- ]
- _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops
- _datetimelike_methods: list[str] = ["strftime", "to_timestamp", "asfreq"]
-
- _dtype: PeriodDtype
-
- # --------------------------------------------------------------------
- # Constructors
-
- def __init__(
- self, values, dtype: Dtype | None = None, freq=None, copy: bool = False
- ) -> None:
- freq = validate_dtype_freq(dtype, freq)
-
- if freq is not None:
- freq = Period._maybe_convert_freq(freq)
-
- if isinstance(values, ABCSeries):
- values = values._values
- if not isinstance(values, type(self)):
- raise TypeError("Incorrect dtype")
-
- elif isinstance(values, ABCPeriodIndex):
- values = values._values
-
- if isinstance(values, type(self)):
- if freq is not None and freq != values.freq:
- raise raise_on_incompatible(values, freq)
- values, freq = values._ndarray, values.freq
-
- values = np.array(values, dtype="int64", copy=copy)
- if freq is None:
- raise ValueError("freq is not specified and cannot be inferred")
- NDArrayBacked.__init__(self, values, PeriodDtype(freq))
-
- # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked"
- @classmethod
- def _simple_new( # type: ignore[override]
- cls,
- values: np.ndarray,
- freq: BaseOffset | None = None,
- dtype: Dtype | None = None,
- ) -> PeriodArray:
- # alias for PeriodArray.__init__
- assertion_msg = "Should be numpy array of type i8"
- assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg
- return cls(values, freq=freq, dtype=dtype)
-
- @classmethod
- def _from_sequence(
- cls: type[PeriodArray],
- scalars: Sequence[Period | None] | AnyArrayLike,
- *,
- dtype: Dtype | None = None,
- copy: bool = False,
- ) -> PeriodArray:
- if dtype and isinstance(dtype, PeriodDtype):
- freq = dtype.freq
- else:
- freq = None
-
- if isinstance(scalars, cls):
- validate_dtype_freq(scalars.dtype, freq)
- if copy:
- scalars = scalars.copy()
- return scalars
-
- periods = np.asarray(scalars, dtype=object)
-
- freq = freq or libperiod.extract_freq(periods)
- ordinals = libperiod.extract_ordinals(periods, freq)
- return cls(ordinals, freq=freq)
-
- @classmethod
- def _from_sequence_of_strings(
- cls, strings, *, dtype: Dtype | None = None, copy: bool = False
- ) -> PeriodArray:
- return cls._from_sequence(strings, dtype=dtype, copy=copy)
-
- @classmethod
- def _from_datetime64(cls, data, freq, tz=None) -> PeriodArray:
- """
- Construct a PeriodArray from a datetime64 array
-
- Parameters
- ----------
- data : ndarray[datetime64[ns], datetime64[ns, tz]]
- freq : str or Tick
- tz : tzinfo, optional
-
- Returns
- -------
- PeriodArray[freq]
- """
- data, freq = dt64arr_to_periodarr(data, freq, tz)
- return cls(data, freq=freq)
-
- @classmethod
- def _generate_range(cls, start, end, periods, freq, fields):
- periods = dtl.validate_periods(periods)
-
- if freq is not None:
- freq = Period._maybe_convert_freq(freq)
-
- field_count = len(fields)
- if start is not None or end is not None:
- if field_count > 0:
- raise ValueError(
- "Can either instantiate from fields or endpoints, but not both"
- )
- subarr, freq = _get_ordinal_range(start, end, periods, freq)
- elif field_count > 0:
- subarr, freq = _range_from_fields(freq=freq, **fields)
- else:
- raise ValueError("Not enough parameters to construct Period range")
-
- return subarr, freq
-
- # -----------------------------------------------------------------
- # DatetimeLike Interface
-
- # error: Argument 1 of "_unbox_scalar" is incompatible with supertype
- # "DatetimeLikeArrayMixin"; supertype defines the argument type as
- # "Union[Union[Period, Any, Timedelta], NaTType]"
- def _unbox_scalar( # type: ignore[override]
- self,
- value: Period | NaTType,
- ) -> np.int64:
- if value is NaT:
- # error: Item "Period" of "Union[Period, NaTType]" has no attribute "value"
- return np.int64(value._value) # type: ignore[union-attr]
- elif isinstance(value, self._scalar_type):
- self._check_compatible_with(value)
- return np.int64(value.ordinal)
- else:
- raise ValueError(f"'value' should be a Period. Got '{value}' instead.")
-
- def _scalar_from_string(self, value: str) -> Period:
- return Period(value, freq=self.freq)
-
- def _check_compatible_with(self, other) -> None:
- if other is NaT:
- return
- self._require_matching_freq(other)
-
- # --------------------------------------------------------------------
- # Data / Attributes
-
- @cache_readonly
- def dtype(self) -> PeriodDtype:
- return self._dtype
-
- # error: Cannot override writeable attribute with read-only property
- @property # type: ignore[override]
- def freq(self) -> BaseOffset:
- """
- Return the frequency object for this PeriodArray.
- """
- return self.dtype.freq
-
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- if dtype == "i8":
- return self.asi8
- elif dtype == bool:
- return ~self._isnan
-
- # This will raise TypeError for non-object dtypes
- return np.array(list(self), dtype=object)
-
- def __arrow_array__(self, type=None):
- """
- Convert myself into a pyarrow Array.
- """
- import pyarrow
-
- from pandas.core.arrays.arrow.extension_types import ArrowPeriodType
-
- if type is not None:
- if pyarrow.types.is_integer(type):
- return pyarrow.array(self._ndarray, mask=self.isna(), type=type)
- elif isinstance(type, ArrowPeriodType):
- # ensure we have the same freq
- if self.freqstr != type.freq:
- raise TypeError(
- "Not supported to convert PeriodArray to array with different "
- f"'freq' ({self.freqstr} vs {type.freq})"
- )
- else:
- raise TypeError(
- f"Not supported to convert PeriodArray to '{type}' type"
- )
-
- period_type = ArrowPeriodType(self.freqstr)
- storage_array = pyarrow.array(self._ndarray, mask=self.isna(), type="int64")
- return pyarrow.ExtensionArray.from_storage(period_type, storage_array)
-
- # --------------------------------------------------------------------
- # Vectorized analogues of Period properties
-
- year = _field_accessor(
- "year",
- """
- The year of the period.
- """,
- )
- month = _field_accessor(
- "month",
- """
- The month as January=1, December=12.
- """,
- )
- day = _field_accessor(
- "day",
- """
-        The day of the period.
- """,
- )
- hour = _field_accessor(
- "hour",
- """
- The hour of the period.
- """,
- )
- minute = _field_accessor(
- "minute",
- """
- The minute of the period.
- """,
- )
- second = _field_accessor(
- "second",
- """
- The second of the period.
- """,
- )
- weekofyear = _field_accessor(
- "week",
- """
- The week ordinal of the year.
- """,
- )
- week = weekofyear
- day_of_week = _field_accessor(
- "day_of_week",
- """
- The day of the week with Monday=0, Sunday=6.
- """,
- )
- dayofweek = day_of_week
- weekday = dayofweek
- dayofyear = day_of_year = _field_accessor(
- "day_of_year",
- """
- The ordinal day of the year.
- """,
- )
- quarter = _field_accessor(
- "quarter",
- """
- The quarter of the date.
- """,
- )
- qyear = _field_accessor("qyear")
- days_in_month = _field_accessor(
- "days_in_month",
- """
- The number of days in the month.
- """,
- )
- daysinmonth = days_in_month
-
- @property
- def is_leap_year(self) -> np.ndarray:
- """
- Logical indicating if the date belongs to a leap year.
- """
- return isleapyear_arr(np.asarray(self.year))
-
- def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray:
- """
- Cast to DatetimeArray/Index.
-
- Parameters
- ----------
- freq : str or DateOffset, optional
- Target frequency. The default is 'D' for week or longer,
- 'S' otherwise.
- how : {'s', 'e', 'start', 'end'}
- Whether to use the start or end of the time period being converted.
-
- Returns
- -------
- DatetimeArray/Index
- """
- from pandas.core.arrays import DatetimeArray
-
- how = libperiod.validate_end_alias(how)
-
- end = how == "E"
- if end:
- if freq == "B" or self.freq == "B":
- # roll forward to ensure we land on B date
- adjust = Timedelta(1, "D") - Timedelta(1, "ns")
- return self.to_timestamp(how="start") + adjust
- else:
- adjust = Timedelta(1, "ns")
- return (self + self.freq).to_timestamp(how="start") - adjust
-
- if freq is None:
- freq = self._dtype._get_to_timestamp_base()
- base = freq
- else:
- freq = Period._maybe_convert_freq(freq)
- base = freq._period_dtype_code
-
- new_parr = self.asfreq(freq, how=how)
-
- new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base)
- dta = DatetimeArray(new_data)
-
- if self.freq.name == "B":
- # See if we can retain BDay instead of Day in cases where
- # len(self) is too small for infer_freq to distinguish between them
- diffs = libalgos.unique_deltas(self.asi8)
- if len(diffs) == 1:
- diff = diffs[0]
- if diff == self.freq.n:
- dta._freq = self.freq
- elif diff == 1:
- dta._freq = self.freq.base
- # TODO: other cases?
- return dta
- else:
- return dta._with_freq("infer")
-
- # --------------------------------------------------------------------
-
- def _box_func(self, x) -> Period | NaTType:
- return Period._from_ordinal(ordinal=x, freq=self.freq)
-
- @doc(**_shared_doc_kwargs, other="PeriodIndex", other_name="PeriodIndex")
- def asfreq(self, freq=None, how: str = "E") -> PeriodArray:
- """
- Convert the {klass} to the specified frequency `freq`.
-
- Equivalent to applying :meth:`pandas.Period.asfreq` with the given arguments
- to each :class:`~pandas.Period` in this {klass}.
-
- Parameters
- ----------
- freq : str
- A frequency.
- how : str {{'E', 'S'}}, default 'E'
- Whether the elements should be aligned to the end
-            or start within the period.
-
- * 'E', 'END', or 'FINISH' for end,
- * 'S', 'START', or 'BEGIN' for start.
-
- January 31st ('END') vs. January 1st ('START') for example.
-
- Returns
- -------
- {klass}
- The transformed {klass} with the new frequency.
-
- See Also
- --------
- {other}.asfreq: Convert each Period in a {other_name} to the given frequency.
- Period.asfreq : Convert a :class:`~pandas.Period` object to the given frequency.
-
- Examples
- --------
- >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A')
- >>> pidx
- PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'],
- dtype='period[A-DEC]')
-
- >>> pidx.asfreq('M')
- PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12',
- '2015-12'], dtype='period[M]')
-
- >>> pidx.asfreq('M', how='S')
- PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01',
- '2015-01'], dtype='period[M]')
- """
- how = libperiod.validate_end_alias(how)
-
- freq = Period._maybe_convert_freq(freq)
-
- base1 = self._dtype._dtype_code
- base2 = freq._period_dtype_code
-
- asi8 = self.asi8
- # self.freq.n can't be negative or 0
- end = how == "E"
- if end:
- ordinal = asi8 + self.freq.n - 1
- else:
- ordinal = asi8
-
- new_data = period_asfreq_arr(ordinal, base1, base2, end)
-
- if self._hasna:
- new_data[self._isnan] = iNaT
-
- return type(self)(new_data, freq=freq)
-
- # ------------------------------------------------------------------
- # Rendering Methods
-
- def _formatter(self, boxed: bool = False):
- if boxed:
- return str
- return "'{}'".format
-
- @dtl.ravel_compat
- def _format_native_types(
- self, *, na_rep: str | float = "NaT", date_format=None, **kwargs
- ) -> npt.NDArray[np.object_]:
- """
- actually format my specific types
- """
- values = self.astype(object)
-
- # Create the formatter function
- if date_format:
- formatter = lambda per: per.strftime(date_format)
- else:
- # Uses `_Period.str` which in turn uses `format_period`
- formatter = lambda per: str(per)
-
- # Apply the formatter to all values in the array, possibly with a mask
- if self._hasna:
- mask = self._isnan
- values[mask] = na_rep
- imask = ~mask
- values[imask] = np.array([formatter(per) for per in values[imask]])
- else:
- values = np.array([formatter(per) for per in values])
- return values
-
- # ------------------------------------------------------------------
-
- def astype(self, dtype, copy: bool = True):
- # We handle Period[T] -> Period[U]
- # Our parent handles everything else.
- dtype = pandas_dtype(dtype)
- if is_dtype_equal(dtype, self._dtype):
- if not copy:
- return self
- else:
- return self.copy()
- if is_period_dtype(dtype):
- return self.asfreq(dtype.freq)
-
- if is_datetime64_any_dtype(dtype):
- # GH#45038 match PeriodIndex behavior.
- tz = getattr(dtype, "tz", None)
- return self.to_timestamp().tz_localize(tz)
-
- return super().astype(dtype, copy=copy)
-
- def searchsorted(
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- npvalue = self._validate_setitem_value(value).view("M8[ns]")
-
- # Cast to M8 to get datetime-like NaT placement,
- # similar to dtl._period_dispatch
- m8arr = self._ndarray.view("M8[ns]")
- return m8arr.searchsorted(npvalue, side=side, sorter=sorter)
-
- def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
- if method is not None:
- # view as dt64 so we get treated as timelike in core.missing,
- # similar to dtl._period_dispatch
- dta = self.view("M8[ns]")
- result = dta.fillna(value=value, method=method, limit=limit)
- # error: Incompatible return value type (got "Union[ExtensionArray,
- # ndarray[Any, Any]]", expected "PeriodArray")
- return result.view(self.dtype) # type: ignore[return-value]
- return super().fillna(value=value, method=method, limit=limit)
-
- # ------------------------------------------------------------------
- # Arithmetic Methods
-
- def _addsub_int_array_or_scalar(
- self, other: np.ndarray | int, op: Callable[[Any, Any], Any]
- ) -> PeriodArray:
- """
- Add or subtract array of integers.
-
- Parameters
- ----------
- other : np.ndarray[int64] or int
- op : {operator.add, operator.sub}
-
- Returns
- -------
- result : PeriodArray
- """
- assert op in [operator.add, operator.sub]
- if op is operator.sub:
- other = -other
- res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan)
- return type(self)(res_values, freq=self.freq)
-
- def _add_offset(self, other: BaseOffset):
- assert not isinstance(other, Tick)
-
- self._require_matching_freq(other, base=True)
- return self._addsub_int_array_or_scalar(other.n, operator.add)
-
- # TODO: can we de-duplicate with Period._add_timedeltalike_scalar?
- def _add_timedeltalike_scalar(self, other):
- """
- Parameters
- ----------
- other : timedelta, Tick, np.timedelta64
-
- Returns
- -------
- PeriodArray
- """
- if not isinstance(self.freq, Tick):
- # We cannot add timedelta-like to non-tick PeriodArray
- raise raise_on_incompatible(self, other)
-
- if isna(other):
- # i.e. np.timedelta64("NaT")
- return super()._add_timedeltalike_scalar(other)
-
- td = np.asarray(Timedelta(other).asm8)
- return self._add_timedelta_arraylike(td)
-
- def _add_timedelta_arraylike(
- self, other: TimedeltaArray | npt.NDArray[np.timedelta64]
- ) -> PeriodArray:
- """
- Parameters
- ----------
- other : TimedeltaArray or ndarray[timedelta64]
-
- Returns
- -------
- PeriodArray
- """
- freq = self.freq
- if not isinstance(freq, Tick):
- # We cannot add timedelta-like to non-tick PeriodArray
- raise TypeError(
- f"Cannot add or subtract timedelta64[ns] dtype from {self.dtype}"
- )
-
- dtype = np.dtype(f"m8[{freq._td64_unit}]")
-
- try:
- delta = astype_overflowsafe(
- np.asarray(other), dtype=dtype, copy=False, round_ok=False
- )
- except ValueError as err:
- # e.g. if we have minutes freq and try to add 30s
- # "Cannot losslessly convert units"
- raise IncompatibleFrequency(
- "Cannot add/subtract timedelta-like from PeriodArray that is "
- "not an integer multiple of the PeriodArray's freq."
- ) from err
-
- b_mask = np.isnat(delta)
-
- res_values = algos.checked_add_with_arr(
- self.asi8, delta.view("i8"), arr_mask=self._isnan, b_mask=b_mask
- )
- np.putmask(res_values, self._isnan | b_mask, iNaT)
- return type(self)(res_values, freq=self.freq)
-
- def _check_timedeltalike_freq_compat(self, other):
- """
- Arithmetic operations with timedelta-like scalars or array `other`
- are only valid if `other` is an integer multiple of `self.freq`.
- If the operation is valid, find that integer multiple. Otherwise,
- raise because the operation is invalid.
-
- Parameters
- ----------
- other : timedelta, np.timedelta64, Tick,
- ndarray[timedelta64], TimedeltaArray, TimedeltaIndex
-
- Returns
- -------
- multiple : int or ndarray[int64]
-
- Raises
- ------
- IncompatibleFrequency
- """
- assert isinstance(self.freq, Tick) # checked by calling function
-
- dtype = np.dtype(f"m8[{self.freq._td64_unit}]")
-
- if isinstance(other, (timedelta, np.timedelta64, Tick)):
- td = np.asarray(Timedelta(other).asm8)
- else:
- td = np.asarray(other)
-
- try:
- delta = astype_overflowsafe(td, dtype=dtype, copy=False, round_ok=False)
- except ValueError as err:
- raise raise_on_incompatible(self, other) from err
-
- delta = delta.view("i8")
- return lib.item_from_zerodim(delta)
-
-
-def raise_on_incompatible(left, right):
- """
- Helper function to render a consistent error message when raising
- IncompatibleFrequency.
-
- Parameters
- ----------
- left : PeriodArray
- right : None, DateOffset, Period, ndarray, or timedelta-like
-
- Returns
- -------
- IncompatibleFrequency
- Exception to be raised by the caller.
- """
- # GH#24283 error message format depends on whether right is scalar
- if isinstance(right, (np.ndarray, ABCTimedeltaArray)) or right is None:
- other_freq = None
- elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, BaseOffset)):
- other_freq = right.freqstr
- else:
- other_freq = delta_to_tick(Timedelta(right)).freqstr
-
- msg = DIFFERENT_FREQ.format(
- cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq
- )
- return IncompatibleFrequency(msg)
-
-
-# -------------------------------------------------------------------
-# Constructor Helpers
-
-
-def period_array(
- data: Sequence[Period | str | None] | AnyArrayLike,
- freq: str | Tick | None = None,
- copy: bool = False,
-) -> PeriodArray:
- """
- Construct a new PeriodArray from a sequence of Period scalars.
-
- Parameters
- ----------
- data : Sequence of Period objects
- A sequence of Period objects. These are required to all have
-        the same ``freq``. Missing values can be indicated by ``None``
- or ``pandas.NaT``.
- freq : str, Tick, or Offset
- The frequency of every element of the array. This can be specified
- to avoid inferring the `freq` from `data`.
- copy : bool, default False
- Whether to ensure a copy of the data is made.
-
- Returns
- -------
- PeriodArray
-
- See Also
- --------
- PeriodArray
- pandas.PeriodIndex
-
- Examples
- --------
- >>> period_array([pd.Period('2017', freq='A'),
- ... pd.Period('2018', freq='A')])
- <PeriodArray>
- ['2017', '2018']
- Length: 2, dtype: period[A-DEC]
-
- >>> period_array([pd.Period('2017', freq='A'),
- ... pd.Period('2018', freq='A'),
- ... pd.NaT])
- <PeriodArray>
- ['2017', '2018', 'NaT']
- Length: 3, dtype: period[A-DEC]
-
- Integers that look like years are handled
-
- >>> period_array([2000, 2001, 2002], freq='D')
- <PeriodArray>
- ['2000-01-01', '2001-01-01', '2002-01-01']
- Length: 3, dtype: period[D]
-
- Datetime-like strings may also be passed
-
- >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q')
- <PeriodArray>
- ['2000Q1', '2000Q2', '2000Q3', '2000Q4']
- Length: 4, dtype: period[Q-DEC]
- """
- data_dtype = getattr(data, "dtype", None)
-
- if is_datetime64_dtype(data_dtype):
- return PeriodArray._from_datetime64(data, freq)
- if is_period_dtype(data_dtype):
- return PeriodArray(data, freq=freq)
-
- # other iterable of some kind
- if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)):
- data = list(data)
-
- arrdata = np.asarray(data)
-
- dtype: PeriodDtype | None
- if freq:
- dtype = PeriodDtype(freq)
- else:
- dtype = None
-
- if is_float_dtype(arrdata) and len(arrdata) > 0:
- raise TypeError("PeriodIndex does not allow floating point in construction")
-
- if is_integer_dtype(arrdata.dtype):
- arr = arrdata.astype(np.int64, copy=False)
- # error: Argument 2 to "from_ordinals" has incompatible type "Union[str,
- # Tick, None]"; expected "Union[timedelta, BaseOffset, str]"
- ordinals = libperiod.from_ordinals(arr, freq) # type: ignore[arg-type]
- return PeriodArray(ordinals, dtype=dtype)
-
- data = ensure_object(arrdata)
-
- return PeriodArray._from_sequence(data, dtype=dtype)
-
-
-@overload
-def validate_dtype_freq(dtype, freq: BaseOffsetT) -> BaseOffsetT:
- ...
-
-
-@overload
-def validate_dtype_freq(dtype, freq: timedelta | str | None) -> BaseOffset:
- ...
-
-
-def validate_dtype_freq(
- dtype, freq: BaseOffsetT | timedelta | str | None
-) -> BaseOffsetT:
- """
- If both a dtype and a freq are available, ensure they match. If only
- dtype is available, extract the implied freq.
-
- Parameters
- ----------
- dtype : dtype
- freq : DateOffset or None
-
- Returns
- -------
- freq : DateOffset
-
- Raises
- ------
- ValueError : non-period dtype
- IncompatibleFrequency : mismatch between dtype and freq
- """
- if freq is not None:
- # error: Incompatible types in assignment (expression has type
- # "BaseOffset", variable has type "Union[BaseOffsetT, timedelta,
- # str, None]")
- freq = to_offset(freq) # type: ignore[assignment]
-
- if dtype is not None:
- dtype = pandas_dtype(dtype)
- if not is_period_dtype(dtype):
- raise ValueError("dtype must be PeriodDtype")
- if freq is None:
- freq = dtype.freq
- elif freq != dtype.freq:
- raise IncompatibleFrequency("specified freq and dtype are different")
- # error: Incompatible return value type (got "Union[BaseOffset, Any, None]",
- # expected "BaseOffset")
- return freq # type: ignore[return-value]
-
-
-def dt64arr_to_periodarr(
- data, freq, tz=None
-) -> tuple[npt.NDArray[np.int64], BaseOffset]:
- """
-    Convert a datetime-like array to Period ordinals.
-
- Parameters
- ----------
- data : Union[Series[datetime64[ns]], DatetimeIndex, ndarray[datetime64ns]]
- freq : Optional[Union[str, Tick]]
- Must match the `freq` on the `data` if `data` is a DatetimeIndex
- or Series.
- tz : Optional[tzinfo]
-
- Returns
- -------
- ordinals : ndarray[int64]
- freq : Tick
- The frequency extracted from the Series or DatetimeIndex if that's
- used.
-
- """
- if not isinstance(data.dtype, np.dtype) or data.dtype.kind != "M":
- raise ValueError(f"Wrong dtype: {data.dtype}")
-
- if freq is None:
- if isinstance(data, ABCIndex):
- data, freq = data._values, data.freq
- elif isinstance(data, ABCSeries):
- data, freq = data._values, data.dt.freq
-
- elif isinstance(data, (ABCIndex, ABCSeries)):
- data = data._values
-
- reso = get_unit_from_dtype(data.dtype)
- freq = Period._maybe_convert_freq(freq)
- base = freq._period_dtype_code
- return c_dt64arr_to_periodarr(data.view("i8"), base, tz, reso=reso), freq
-
-
-def _get_ordinal_range(start, end, periods, freq, mult: int = 1):
- if com.count_not_none(start, end, periods) != 2:
- raise ValueError(
- "Of the three parameters: start, end, and periods, "
- "exactly two must be specified"
- )
-
- if freq is not None:
- freq = to_offset(freq)
- mult = freq.n
-
- if start is not None:
- start = Period(start, freq)
- if end is not None:
- end = Period(end, freq)
-
- is_start_per = isinstance(start, Period)
- is_end_per = isinstance(end, Period)
-
- if is_start_per and is_end_per and start.freq != end.freq:
- raise ValueError("start and end must have same freq")
- if start is NaT or end is NaT:
- raise ValueError("start and end must not be NaT")
-
- if freq is None:
- if is_start_per:
- freq = start.freq
- elif is_end_per:
- freq = end.freq
- else: # pragma: no cover
- raise ValueError("Could not infer freq from start/end")
-
- if periods is not None:
- periods = periods * mult
- if start is None:
- data = np.arange(
- end.ordinal - periods + mult, end.ordinal + 1, mult, dtype=np.int64
- )
- else:
- data = np.arange(
- start.ordinal, start.ordinal + periods, mult, dtype=np.int64
- )
- else:
- data = np.arange(start.ordinal, end.ordinal + 1, mult, dtype=np.int64)
-
- return data, freq
-
-
-def _range_from_fields(
- year=None,
- month=None,
- quarter=None,
- day=None,
- hour=None,
- minute=None,
- second=None,
- freq=None,
-) -> tuple[np.ndarray, BaseOffset]:
- if hour is None:
- hour = 0
- if minute is None:
- minute = 0
- if second is None:
- second = 0
- if day is None:
- day = 1
-
- ordinals = []
-
- if quarter is not None:
- if freq is None:
- freq = to_offset("Q")
- base = FreqGroup.FR_QTR.value
- else:
- freq = to_offset(freq)
- base = libperiod.freq_to_dtype_code(freq)
- if base != FreqGroup.FR_QTR.value:
- raise AssertionError("base must equal FR_QTR")
-
- freqstr = freq.freqstr
- year, quarter = _make_field_arrays(year, quarter)
- for y, q in zip(year, quarter):
- y, m = parsing.quarter_to_myear(y, q, freqstr)
- val = libperiod.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base)
- ordinals.append(val)
- else:
- freq = to_offset(freq)
- base = libperiod.freq_to_dtype_code(freq)
- arrays = _make_field_arrays(year, month, day, hour, minute, second)
- for y, mth, d, h, mn, s in zip(*arrays):
- ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base))
-
- return np.array(ordinals, dtype=np.int64), freq
-
-
-def _make_field_arrays(*fields) -> list[np.ndarray]:
- length = None
- for x in fields:
- if isinstance(x, (list, np.ndarray, ABCSeries)):
- if length is not None and len(x) != length:
- raise ValueError("Mismatched Period array lengths")
- if length is None:
- length = len(x)
-
- # error: Argument 2 to "repeat" has incompatible type "Optional[int]"; expected
- # "Union[Union[int, integer[Any]], Union[bool, bool_], ndarray, Sequence[Union[int,
- # integer[Any]]], Sequence[Union[bool, bool_]], Sequence[Sequence[Any]]]"
- return [
- np.asarray(x)
- if isinstance(x, (np.ndarray, list, ABCSeries))
- else np.repeat(x, length) # type: ignore[arg-type]
- for x in fields
- ]
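Usage sketch for the constructor helpers above (illustrative, not part of the diff): `pd.period_range` builds the ordinals through `_get_ordinal_range`, and `asfreq`/`to_timestamp` convert between frequencies, matching the doctests in the removed file.

import pandas as pd

pidx = pd.period_range("2020-01", periods=3, freq="M")  # PeriodIndex backed by PeriodArray
print(pidx.asfreq("D", how="S"))       # 2020-01-01, 2020-02-01, 2020-03-01 (period[D])
print(pidx.to_timestamp(how="start"))  # DatetimeIndex at the start of each period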
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/sparse/__init__.py b/contrib/python/pandas/py3/pandas/core/arrays/sparse/__init__.py
deleted file mode 100644
index 56dbc6df54f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/sparse/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from pandas.core.arrays.sparse.accessor import (
- SparseAccessor,
- SparseFrameAccessor,
-)
-from pandas.core.arrays.sparse.array import (
- BlockIndex,
- IntIndex,
- SparseArray,
- make_sparse_index,
-)
-from pandas.core.arrays.sparse.dtype import SparseDtype
-
-__all__ = [
- "BlockIndex",
- "IntIndex",
- "make_sparse_index",
- "SparseAccessor",
- "SparseArray",
- "SparseDtype",
- "SparseFrameAccessor",
-]
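The names re-exported above are reachable publicly as `pd.arrays.SparseArray` and `pd.SparseDtype`; a quick sketch (editor's illustration, not part of the diff):

import pandas as pd

sa = pd.arrays.SparseArray([0, 0, 1, 0])  # fill_value defaults to 0 for integer data
print(sa.sp_values)   # [1] -- only non-fill values are stored
print(sa.density)     # 0.25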
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/sparse/accessor.py b/contrib/python/pandas/py3/pandas/core/arrays/sparse/accessor.py
deleted file mode 100644
index b3eb5db6329..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/sparse/accessor.py
+++ /dev/null
@@ -1,386 +0,0 @@
-"""Sparse accessor"""
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import numpy as np
-
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.dtypes.cast import find_common_type
-
-from pandas.core.accessor import (
- PandasDelegate,
- delegate_names,
-)
-from pandas.core.arrays.sparse.array import SparseArray
-from pandas.core.arrays.sparse.dtype import SparseDtype
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-
-class BaseAccessor:
- _validation_msg = "Can only use the '.sparse' accessor with Sparse data."
-
- def __init__(self, data=None) -> None:
- self._parent = data
- self._validate(data)
-
- def _validate(self, data):
- raise NotImplementedError
-
-
-@delegate_names(
- SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
-)
-class SparseAccessor(BaseAccessor, PandasDelegate):
- """
-    Accessor for sparse data, with conversion from other sparse matrix data types.
- """
-
- def _validate(self, data):
- if not isinstance(data.dtype, SparseDtype):
- raise AttributeError(self._validation_msg)
-
- def _delegate_property_get(self, name, *args, **kwargs):
- return getattr(self._parent.array, name)
-
- def _delegate_method(self, name, *args, **kwargs):
- if name == "from_coo":
- return self.from_coo(*args, **kwargs)
- elif name == "to_coo":
- return self.to_coo(*args, **kwargs)
- else:
- raise ValueError
-
- @classmethod
- def from_coo(cls, A, dense_index: bool = False) -> Series:
- """
- Create a Series with sparse values from a scipy.sparse.coo_matrix.
-
- Parameters
- ----------
- A : scipy.sparse.coo_matrix
- dense_index : bool, default False
- If False (default), the index consists of only the
- coords of the non-null entries of the original coo_matrix.
- If True, the index consists of the full sorted
- (row, col) coordinates of the coo_matrix.
-
- Returns
- -------
- s : Series
- A Series with sparse values.
-
- Examples
- --------
- >>> from scipy import sparse
-
- >>> A = sparse.coo_matrix(
- ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
- ... )
- >>> A
- <3x4 sparse matrix of type '<class 'numpy.float64'>'
- with 3 stored elements in COOrdinate format>
-
- >>> A.todense()
- matrix([[0., 0., 1., 2.],
- [3., 0., 0., 0.],
- [0., 0., 0., 0.]])
-
- >>> ss = pd.Series.sparse.from_coo(A)
- >>> ss
- 0 2 1.0
- 3 2.0
- 1 0 3.0
- dtype: Sparse[float64, nan]
- """
- from pandas import Series
- from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series
-
- result = coo_to_sparse_series(A, dense_index=dense_index)
- result = Series(result.array, index=result.index, copy=False)
-
- return result
-
- def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
- """
- Create a scipy.sparse.coo_matrix from a Series with MultiIndex.
-
- Use row_levels and column_levels to determine the row and column
- coordinates respectively. row_levels and column_levels are the names
- (labels) or numbers of the levels. {row_levels, column_levels} must be
- a partition of the MultiIndex level names (or numbers).
-
- Parameters
- ----------
- row_levels : tuple/list
- column_levels : tuple/list
- sort_labels : bool, default False
- Sort the row and column labels before forming the sparse matrix.
- When `row_levels` and/or `column_levels` refer to a single level,
- set to `True` for a faster execution.
-
- Returns
- -------
- y : scipy.sparse.coo_matrix
- rows : list (row labels)
- columns : list (column labels)
-
- Examples
- --------
- >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
- >>> s.index = pd.MultiIndex.from_tuples(
- ... [
- ... (1, 2, "a", 0),
- ... (1, 2, "a", 1),
- ... (1, 1, "b", 0),
- ... (1, 1, "b", 1),
- ... (2, 1, "b", 0),
- ... (2, 1, "b", 1)
- ... ],
- ... names=["A", "B", "C", "D"],
- ... )
- >>> s
- A B C D
- 1 2 a 0 3.0
- 1 NaN
- 1 b 0 1.0
- 1 3.0
- 2 1 b 0 NaN
- 1 NaN
- dtype: float64
-
- >>> ss = s.astype("Sparse")
- >>> ss
- A B C D
- 1 2 a 0 3.0
- 1 NaN
- 1 b 0 1.0
- 1 3.0
- 2 1 b 0 NaN
- 1 NaN
- dtype: Sparse[float64, nan]
-
- >>> A, rows, columns = ss.sparse.to_coo(
- ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
- ... )
- >>> A
- <3x4 sparse matrix of type '<class 'numpy.float64'>'
- with 3 stored elements in COOrdinate format>
- >>> A.todense()
- matrix([[0., 0., 1., 3.],
- [3., 0., 0., 0.],
- [0., 0., 0., 0.]])
-
- >>> rows
- [(1, 1), (1, 2), (2, 1)]
- >>> columns
- [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
- """
- from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo
-
- A, rows, columns = sparse_series_to_coo(
- self._parent, row_levels, column_levels, sort_labels=sort_labels
- )
- return A, rows, columns
-
- def to_dense(self) -> Series:
- """
- Convert a Series from sparse values to dense.
-
- Returns
- -------
- Series:
- A Series with the same values, stored as a dense array.
-
- Examples
- --------
- >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
- >>> series
- 0 0
- 1 1
- 2 0
- dtype: Sparse[int64, 0]
-
- >>> series.sparse.to_dense()
- 0 0
- 1 1
- 2 0
- dtype: int64
- """
- from pandas import Series
-
- return Series(
- self._parent.array.to_dense(),
- index=self._parent.index,
- name=self._parent.name,
- copy=False,
- )
-
-
-class SparseFrameAccessor(BaseAccessor, PandasDelegate):
- """
- DataFrame accessor for sparse data.
- """
-
- def _validate(self, data):
- dtypes = data.dtypes
- if not all(isinstance(t, SparseDtype) for t in dtypes):
- raise AttributeError(self._validation_msg)
-
- @classmethod
- def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
- """
- Create a new DataFrame from a scipy sparse matrix.
-
- Parameters
- ----------
- data : scipy.sparse.spmatrix
- Must be convertible to csc format.
- index, columns : Index, optional
- Row and column labels to use for the resulting DataFrame.
- Defaults to a RangeIndex.
-
- Returns
- -------
- DataFrame
- Each column of the DataFrame is stored as a
- :class:`arrays.SparseArray`.
-
- Examples
- --------
- >>> import scipy.sparse
- >>> mat = scipy.sparse.eye(3)
- >>> pd.DataFrame.sparse.from_spmatrix(mat)
- 0 1 2
- 0 1.0 0.0 0.0
- 1 0.0 1.0 0.0
- 2 0.0 0.0 1.0
- """
- from pandas._libs.sparse import IntIndex
-
- from pandas import DataFrame
-
- data = data.tocsc()
- index, columns = cls._prep_index(data, index, columns)
- n_rows, n_columns = data.shape
- # We need to make sure indices are sorted, as we create
- # IntIndex with no input validation (i.e. check_integrity=False ).
- # Indices may already be sorted in scipy in which case this adds
- # a small overhead.
- data.sort_indices()
- indices = data.indices
- indptr = data.indptr
- array_data = data.data
- dtype = SparseDtype(array_data.dtype, 0)
- arrays = []
- for i in range(n_columns):
- sl = slice(indptr[i], indptr[i + 1])
- idx = IntIndex(n_rows, indices[sl], check_integrity=False)
- arr = SparseArray._simple_new(array_data[sl], idx, dtype)
- arrays.append(arr)
- return DataFrame._from_arrays(
- arrays, columns=columns, index=index, verify_integrity=False
- )
-
- def to_dense(self) -> DataFrame:
- """
- Convert a DataFrame with sparse values to dense.
-
- Returns
- -------
- DataFrame
- A DataFrame with the same values stored as dense arrays.
-
- Examples
- --------
- >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
- >>> df.sparse.to_dense()
- A
- 0 0
- 1 1
- 2 0
- """
- from pandas import DataFrame
-
- data = {k: v.array.to_dense() for k, v in self._parent.items()}
- return DataFrame(data, index=self._parent.index, columns=self._parent.columns)
-
- def to_coo(self):
- """
- Return the contents of the frame as a sparse SciPy COO matrix.
-
- Returns
- -------
- scipy.sparse.spmatrix
- If the caller is heterogeneous and contains booleans or objects,
- the result will be of dtype=object. See Notes.
-
- Notes
- -----
- The dtype will be the lowest-common-denominator type (implicit
- upcasting); that is to say if the dtypes (even of numeric types)
- are mixed, the one that accommodates all will be chosen.
-
- e.g. If the dtypes are float16 and float32, dtype will be upcast to
- float32. By numpy.find_common_type convention, mixing int64 and
-        uint64 will result in a float64 dtype.
- """
- import_optional_dependency("scipy")
- from scipy.sparse import coo_matrix
-
- dtype = find_common_type(self._parent.dtypes.to_list())
- if isinstance(dtype, SparseDtype):
- dtype = dtype.subtype
-
- cols, rows, data = [], [], []
- for col, (_, ser) in enumerate(self._parent.items()):
- sp_arr = ser.array
- if sp_arr.fill_value != 0:
- raise ValueError("fill value must be 0 when converting to COO matrix")
-
- row = sp_arr.sp_index.indices
- cols.append(np.repeat(col, len(row)))
- rows.append(row)
- data.append(sp_arr.sp_values.astype(dtype, copy=False))
-
- cols = np.concatenate(cols)
- rows = np.concatenate(rows)
- data = np.concatenate(data)
- return coo_matrix((data, (rows, cols)), shape=self._parent.shape)
-
- @property
- def density(self) -> float:
- """
- Ratio of non-sparse points to total (dense) data points.
- """
- tmp = np.mean([column.array.density for _, column in self._parent.items()])
- return tmp
-
- @staticmethod
- def _prep_index(data, index, columns):
- from pandas.core.indexes.api import (
- default_index,
- ensure_index,
- )
-
- N, K = data.shape
- if index is None:
- index = default_index(N)
- else:
- index = ensure_index(index)
- if columns is None:
- columns = default_index(K)
- else:
- columns = ensure_index(columns)
-
- if len(columns) != K:
- raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
- if len(index) != N:
- raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
- return index, columns
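A short round-trip sketch of the accessor above (illustrative only, requires scipy): `DataFrame.sparse.from_spmatrix` wraps each CSC column in a SparseArray, and `.sparse.to_dense()` converts back.

import pandas as pd
from scipy import sparse

mat = sparse.eye(3, format="csc")
df = pd.DataFrame.sparse.from_spmatrix(mat)  # each column becomes a Sparse[float64, 0]
print(df.sparse.density)                     # ~0.333 (3 stored values out of 9)
print(df.sparse.to_dense())                  # back to a dense float64 DataFrame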
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/sparse/array.py b/contrib/python/pandas/py3/pandas/core/arrays/sparse/array.py
deleted file mode 100644
index 9b55638b21e..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/sparse/array.py
+++ /dev/null
@@ -1,1892 +0,0 @@
-"""
-SparseArray data structure
-"""
-from __future__ import annotations
-
-from collections import abc
-import numbers
-import operator
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Literal,
- Sequence,
- TypeVar,
- cast,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-import pandas._libs.sparse as splib
-from pandas._libs.sparse import (
- BlockIndex,
- IntIndex,
- SparseIndex,
-)
-from pandas._libs.tslibs import NaT
-from pandas._typing import (
- ArrayLike,
- AstypeArg,
- Axis,
- AxisInt,
- Dtype,
- NpDtype,
- PositionalIndexer,
- Scalar,
- ScalarIndexer,
- SequenceIndexer,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import PerformanceWarning
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import (
- validate_bool_kwarg,
- validate_insert_loc,
-)
-
-from pandas.core.dtypes.astype import astype_array
-from pandas.core.dtypes.cast import (
- construct_1d_arraylike_from_scalar,
- find_common_type,
- maybe_box_datetimelike,
-)
-from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_datetime64_any_dtype,
- is_datetime64tz_dtype,
- is_dtype_equal,
- is_integer,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
- notna,
-)
-
-from pandas.core import (
- arraylike,
- ops,
-)
-import pandas.core.algorithms as algos
-from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays import ExtensionArray
-from pandas.core.arrays.sparse.dtype import SparseDtype
-from pandas.core.base import PandasObject
-import pandas.core.common as com
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
- sanitize_array,
-)
-from pandas.core.indexers import (
- check_array_indexer,
- unpack_tuple_and_ellipses,
-)
-from pandas.core.missing import interpolate_2d
-from pandas.core.nanops import check_below_min_count
-
-from pandas.io.formats import printing
-
-# See https://github.com/python/typing/issues/684
-if TYPE_CHECKING:
- from enum import Enum
-
- class ellipsis(Enum):
- Ellipsis = "..."
-
- Ellipsis = ellipsis.Ellipsis
-
- from scipy.sparse import spmatrix
-
- from pandas._typing import (
- FillnaOptions,
- NumpySorter,
- )
-
- SparseIndexKind = Literal["integer", "block"]
-
- from pandas import Series
-
-else:
- ellipsis = type(Ellipsis)
-
-
-# ----------------------------------------------------------------------------
-# Array
-
-SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")
-
-_sparray_doc_kwargs = {"klass": "SparseArray"}
-
-
-def _get_fill(arr: SparseArray) -> np.ndarray:
- """
- Create a 0-dim ndarray containing the fill value
-
- Parameters
- ----------
- arr : SparseArray
-
- Returns
- -------
- fill_value : ndarray
- 0-dim ndarray with just the fill value.
-
- Notes
- -----
- coerce fill_value to arr dtype if possible
- int64 SparseArray can have NaN as fill_value if there is no missing
- """
- try:
- return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
- except ValueError:
- return np.asarray(arr.fill_value)
-
-
-def _sparse_array_op(
- left: SparseArray, right: SparseArray, op: Callable, name: str
-) -> SparseArray:
- """
- Perform a binary operation between two arrays.
-
- Parameters
- ----------
- left : Union[SparseArray, ndarray]
- right : Union[SparseArray, ndarray]
- op : Callable
- The binary operation to perform
-    name : str
- Name of the callable.
-
- Returns
- -------
- SparseArray
- """
- if name.startswith("__"):
- # For lookups in _libs.sparse we need non-dunder op name
- name = name[2:-2]
-
- # dtype used to find corresponding sparse method
- ltype = left.dtype.subtype
- rtype = right.dtype.subtype
-
- if not is_dtype_equal(ltype, rtype):
- subtype = find_common_type([ltype, rtype])
- ltype = SparseDtype(subtype, left.fill_value)
- rtype = SparseDtype(subtype, right.fill_value)
-
- left = left.astype(ltype, copy=False)
- right = right.astype(rtype, copy=False)
- dtype = ltype.subtype
- else:
- dtype = ltype
-
- # dtype the result must have
- result_dtype = None
-
- if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
- with np.errstate(all="ignore"):
- result = op(left.to_dense(), right.to_dense())
- fill = op(_get_fill(left), _get_fill(right))
-
- if left.sp_index.ngaps == 0:
- index = left.sp_index
- else:
- index = right.sp_index
- elif left.sp_index.equals(right.sp_index):
- with np.errstate(all="ignore"):
- result = op(left.sp_values, right.sp_values)
- fill = op(_get_fill(left), _get_fill(right))
- index = left.sp_index
- else:
- if name[0] == "r":
- left, right = right, left
- name = name[1:]
-
- if name in ("and", "or", "xor") and dtype == "bool":
- opname = f"sparse_{name}_uint8"
- # to make template simple, cast here
- left_sp_values = left.sp_values.view(np.uint8)
- right_sp_values = right.sp_values.view(np.uint8)
- result_dtype = bool
- else:
- opname = f"sparse_{name}_{dtype}"
- left_sp_values = left.sp_values
- right_sp_values = right.sp_values
-
- if (
- name in ["floordiv", "mod"]
- and (right == 0).any()
- and left.dtype.kind in ["i", "u"]
- ):
- # Match the non-Sparse Series behavior
- opname = f"sparse_{name}_float64"
- left_sp_values = left_sp_values.astype("float64")
- right_sp_values = right_sp_values.astype("float64")
-
- sparse_op = getattr(splib, opname)
-
- with np.errstate(all="ignore"):
- result, index, fill = sparse_op(
- left_sp_values,
- left.sp_index,
- left.fill_value,
- right_sp_values,
- right.sp_index,
- right.fill_value,
- )
-
- if name == "divmod":
- # result is a 2-tuple
- # error: Incompatible return value type (got "Tuple[SparseArray,
- # SparseArray]", expected "SparseArray")
- return ( # type: ignore[return-value]
- _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
- _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
- )
-
- if result_dtype is None:
- result_dtype = result.dtype
-
- return _wrap_result(name, result, index, fill, dtype=result_dtype)
-
-
-def _wrap_result(
- name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
-) -> SparseArray:
- """
- wrap op result to have correct dtype
- """
- if name.startswith("__"):
- # e.g. __eq__ --> eq
- name = name[2:-2]
-
- if name in ("eq", "ne", "lt", "gt", "le", "ge"):
- dtype = bool
-
- fill_value = lib.item_from_zerodim(fill_value)
-
- if is_bool_dtype(dtype):
- # fill_value may be np.bool_
- fill_value = bool(fill_value)
- return SparseArray(
- data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
- )
-
-
-class SparseArray(OpsMixin, PandasObject, ExtensionArray):
- """
- An ExtensionArray for storing sparse data.
-
- Parameters
- ----------
- data : array-like or scalar
- A dense array of values to store in the SparseArray. This may contain
- `fill_value`.
- sparse_index : SparseIndex, optional
- fill_value : scalar, optional
- Elements in data that are ``fill_value`` are not stored in the
- SparseArray. For memory savings, this should be the most common value
- in `data`. By default, `fill_value` depends on the dtype of `data`:
-
- =========== ==========
- data.dtype na_value
- =========== ==========
- float ``np.nan``
- int ``0``
- bool ``False``
- datetime64 ``pd.NaT``
- timedelta64 ``pd.NaT``
- =========== ==========
-
- The fill value is potentially specified in three ways. In order of
- precedence, these are
-
- 1. The `fill_value` argument
- 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
- a ``SparseDtype``
- 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
- is not a ``SparseDtype`` and `data` is a ``SparseArray``.
-
- kind : str
- Can be 'integer' or 'block', default is 'integer'.
- The type of storage for sparse locations.
-
- * 'block': Stores a `block` and `block_length` for each
- contiguous *span* of sparse values. This is best when
- sparse data tends to be clumped together, with large
- regions of ``fill_value`` values between sparse values.
- * 'integer': Uses an integer to store the location of
- each sparse value.
-
- dtype : np.dtype or SparseDtype, optional
- The dtype to use for the SparseArray. For numpy dtypes, this
- determines the dtype of ``self.sp_values``. For SparseDtype,
- this determines ``self.sp_values`` and ``self.fill_value``.
- copy : bool, default False
- Whether to explicitly copy the incoming `data` array.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> arr = SparseArray([0, 0, 1, 2])
- >>> arr
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- """
-
- _subtyp = "sparse_array" # register ABCSparseArray
- _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
- _sparse_index: SparseIndex
- _sparse_values: np.ndarray
- _dtype: SparseDtype
-
- def __init__(
- self,
- data,
- sparse_index=None,
- fill_value=None,
- kind: SparseIndexKind = "integer",
- dtype: Dtype | None = None,
- copy: bool = False,
- ) -> None:
- if fill_value is None and isinstance(dtype, SparseDtype):
- fill_value = dtype.fill_value
-
- if isinstance(data, type(self)):
- # disable normal inference on dtype, sparse_index, & fill_value
- if sparse_index is None:
- sparse_index = data.sp_index
- if fill_value is None:
- fill_value = data.fill_value
- if dtype is None:
- dtype = data.dtype
- # TODO: make kind=None, and use data.kind?
- data = data.sp_values
-
- # Handle user-provided dtype
- if isinstance(dtype, str):
- # Two options: dtype='int', regular numpy dtype
- # or dtype='Sparse[int]', a sparse dtype
- try:
- dtype = SparseDtype.construct_from_string(dtype)
- except TypeError:
- dtype = pandas_dtype(dtype)
-
- if isinstance(dtype, SparseDtype):
- if fill_value is None:
- fill_value = dtype.fill_value
- dtype = dtype.subtype
-
- if is_scalar(data):
- if sparse_index is None:
- npoints = 1
- else:
- npoints = sparse_index.length
-
- data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
- dtype = data.dtype
-
- if dtype is not None:
- dtype = pandas_dtype(dtype)
-
- # TODO: disentangle the fill_value dtype inference from
- # dtype inference
- if data is None:
- # TODO: What should the empty dtype be? Object or float?
-
- # error: Argument "dtype" to "array" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
- # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
- # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
- data = np.array([], dtype=dtype) # type: ignore[arg-type]
-
- if not is_array_like(data):
- try:
- # probably shared code in sanitize_series
-
- data = sanitize_array(data, index=None)
- except ValueError:
- # NumPy may raise a ValueError on data like [1, []]
- # we retry with object dtype here.
- if dtype is None:
- dtype = np.dtype(object)
- data = np.atleast_1d(np.asarray(data, dtype=dtype))
- else:
- raise
-
- if copy:
- # TODO: avoid double copy when dtype forces cast.
- data = data.copy()
-
- if fill_value is None:
- fill_value_dtype = data.dtype if dtype is None else dtype
- if fill_value_dtype is None:
- fill_value = np.nan
- else:
- fill_value = na_value_for_dtype(fill_value_dtype)
-
- if isinstance(data, type(self)) and sparse_index is None:
- sparse_index = data._sparse_index
- # error: Argument "dtype" to "asarray" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
- sparse_values = np.asarray(
- data.sp_values, dtype=dtype # type: ignore[arg-type]
- )
- elif sparse_index is None:
- data = extract_array(data, extract_numpy=True)
- if not isinstance(data, np.ndarray):
- # EA
- if is_datetime64tz_dtype(data.dtype):
- warnings.warn(
- f"Creating SparseArray from {data.dtype} data "
- "loses timezone information. Cast to object before "
- "sparse to retain timezone information.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- data = np.asarray(data, dtype="datetime64[ns]")
- if fill_value is NaT:
- fill_value = np.datetime64("NaT", "ns")
- data = np.asarray(data)
- sparse_values, sparse_index, fill_value = _make_sparse(
- # error: Argument "dtype" to "_make_sparse" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected
- # "Optional[dtype[Any]]"
- data,
- kind=kind,
- fill_value=fill_value,
- dtype=dtype, # type: ignore[arg-type]
- )
- else:
- # error: Argument "dtype" to "asarray" has incompatible type
- # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
- sparse_values = np.asarray(data, dtype=dtype) # type: ignore[arg-type]
- if len(sparse_values) != sparse_index.npoints:
- raise AssertionError(
- f"Non array-like type {type(sparse_values)} must "
- "have the same length as the index"
- )
- self._sparse_index = sparse_index
- self._sparse_values = sparse_values
- self._dtype = SparseDtype(sparse_values.dtype, fill_value)
-
- @classmethod
- def _simple_new(
- cls: type[SparseArrayT],
- sparse_array: np.ndarray,
- sparse_index: SparseIndex,
- dtype: SparseDtype,
- ) -> SparseArrayT:
- new = object.__new__(cls)
- new._sparse_index = sparse_index
- new._sparse_values = sparse_array
- new._dtype = dtype
- return new
-
- @classmethod
- def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
- """
- Create a SparseArray from a scipy.sparse matrix.
-
- Parameters
- ----------
- data : scipy.sparse.spmatrix
- This should be a SciPy sparse matrix where the size
- of the second dimension is 1. In other words, a
- sparse matrix with a single column.
-
- Returns
- -------
- SparseArray
-
- Examples
- --------
- >>> import scipy.sparse
- >>> mat = scipy.sparse.coo_matrix((4, 1))
- >>> pd.arrays.SparseArray.from_spmatrix(mat)
- [0.0, 0.0, 0.0, 0.0]
- Fill: 0.0
- IntIndex
- Indices: array([], dtype=int32)
- """
- length, ncol = data.shape
-
- if ncol != 1:
- raise ValueError(f"'data' must have a single column, not '{ncol}'")
-
- # Our sparse index classes require that the positions be strictly
- # increasing, so we sort the indices and the values accordingly.
- data = data.tocsc()
- data.sort_indices()
- arr = data.data
- idx = data.indices
-
- zero = np.array(0, dtype=arr.dtype).item()
- dtype = SparseDtype(arr.dtype, zero)
- index = IntIndex(length, idx)
-
- return cls._simple_new(arr, index, dtype)
-
- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
- fill_value = self.fill_value
-
- if self.sp_index.ngaps == 0:
- # Compat for na dtype and int values.
- return self.sp_values
- if dtype is None:
- # Can NumPy represent this type?
- # If not, `np.result_type` will raise. We catch that
- # and return object.
- if is_datetime64_any_dtype(self.sp_values.dtype):
- # However, we *do* special-case the common case of
- # a datetime64 with pandas NaT.
- if fill_value is NaT:
- # Can't put pd.NaT in a datetime64[ns]
- fill_value = np.datetime64("NaT")
- try:
- dtype = np.result_type(self.sp_values.dtype, type(fill_value))
- except TypeError:
- dtype = object
-
- out = np.full(self.shape, fill_value, dtype=dtype)
- out[self.sp_index.indices] = self.sp_values
- return out
-
- def __setitem__(self, key, value):
- # I suppose we could allow setting of non-fill_value elements.
- # TODO(SparseArray.__setitem__): remove special cases in
- # ExtensionBlock.where
- msg = "SparseArray does not support item assignment via setitem"
- raise TypeError(msg)
-
- @classmethod
- def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
- return cls(scalars, dtype=dtype)
-
- @classmethod
- def _from_factorized(cls, values, original):
- return cls(values, dtype=original.dtype)
-
- # ------------------------------------------------------------------------
- # Data
- # ------------------------------------------------------------------------
- @property
- def sp_index(self) -> SparseIndex:
- """
- The SparseIndex containing the location of non- ``fill_value`` points.
- """
- return self._sparse_index
-
- @property
- def sp_values(self) -> np.ndarray:
- """
- An ndarray containing the non- ``fill_value`` values.
-
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
- >>> s.sp_values
- array([1, 2])
- """
- return self._sparse_values
-
- @property
- def dtype(self) -> SparseDtype:
- return self._dtype
-
- @property
- def fill_value(self):
- """
- Elements in `data` that are `fill_value` are not stored.
-
- For memory savings, this should be the most common value in the array.
- """
- return self.dtype.fill_value
-
- @fill_value.setter
- def fill_value(self, value) -> None:
- self._dtype = SparseDtype(self.dtype.subtype, value)
-
- @property
- def kind(self) -> SparseIndexKind:
- """
- The kind of sparse index for this array. One of {'integer', 'block'}.
- """
- if isinstance(self.sp_index, IntIndex):
- return "integer"
- else:
- return "block"
-
- @property
- def _valid_sp_values(self) -> np.ndarray:
- sp_vals = self.sp_values
- mask = notna(sp_vals)
- return sp_vals[mask]
-
- def __len__(self) -> int:
- return self.sp_index.length
-
- @property
- def _null_fill_value(self) -> bool:
- return self._dtype._is_na_fill_value
-
- def _fill_value_matches(self, fill_value) -> bool:
- if self._null_fill_value:
- return isna(fill_value)
- else:
- return self.fill_value == fill_value
-
- @property
- def nbytes(self) -> int:
- return self.sp_values.nbytes + self.sp_index.nbytes
-
- @property
- def density(self) -> float:
- """
- The fraction of non-``fill_value`` points, expressed as a decimal.
-
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
- >>> s.density
- 0.6
- """
- return self.sp_index.npoints / self.sp_index.length
-
- @property
- def npoints(self) -> int:
- """
- The number of non- ``fill_value`` points.
-
- Examples
- --------
- >>> from pandas.arrays import SparseArray
- >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
- >>> s.npoints
- 3
- """
- return self.sp_index.npoints
-
- def isna(self):
- # If null fill value, we want SparseDtype[bool, true]
- # to preserve the same memory usage.
- dtype = SparseDtype(bool, self._null_fill_value)
- if self._null_fill_value:
- return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
- mask = np.full(len(self), False, dtype=np.bool_)
- mask[self.sp_index.indices] = isna(self.sp_values)
- return type(self)(mask, fill_value=False, dtype=dtype)
-
- def fillna(
- self: SparseArrayT,
- value=None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- ) -> SparseArrayT:
- """
- Fill missing values with `value`.
-
- Parameters
- ----------
- value : scalar, optional
- method : str, optional
-
- .. warning::
-
- Using 'method' will result in high memory use,
- as the SparseArray is converted to a dense
- in-memory ndarray before filling.
-
- limit : int, optional
-
- Returns
- -------
- SparseArray
-
- Notes
- -----
- When `value` is specified, the result's ``fill_value`` depends on
- ``self.fill_value``. The goal is to maintain low-memory use.
-
- If ``self.fill_value`` is NA, the result dtype will be
- ``SparseDtype(self.dtype, fill_value=value)``. This will preserve the
- amount of memory used before and after filling.
-
- When ``self.fill_value`` is not NA, the result dtype will be
- ``self.dtype``. Again, this preserves the amount of memory used.
- """
- if (method is None and value is None) or (
- method is not None and value is not None
- ):
- raise ValueError("Must specify one of 'method' or 'value'.")
-
- if method is not None:
- msg = "fillna with 'method' requires high memory usage."
- warnings.warn(
- msg,
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
- new_values = np.asarray(self)
- # interpolate_2d modifies new_values inplace
- interpolate_2d(new_values, method=method, limit=limit)
- return type(self)(new_values, fill_value=self.fill_value)
-
- else:
- new_values = np.where(isna(self.sp_values), value, self.sp_values)
-
- if self._null_fill_value:
- # This is essentially just updating the dtype.
- new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
- else:
- new_dtype = self.dtype
-
- return self._simple_new(new_values, self._sparse_index, new_dtype)
-
- def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
- if not len(self) or periods == 0:
- return self.copy()
-
- if isna(fill_value):
- fill_value = self.dtype.na_value
-
- subtype = np.result_type(fill_value, self.dtype.subtype)
-
- if subtype != self.dtype.subtype:
- # just coerce up front
- arr = self.astype(SparseDtype(subtype, self.fill_value))
- else:
- arr = self
-
- empty = self._from_sequence(
- [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
- )
-
- if periods > 0:
- a = empty
- b = arr[:-periods]
- else:
- a = arr[abs(periods) :]
- b = empty
- return arr._concat_same_type([a, b])
-
- def _first_fill_value_loc(self):
- """
- Get the location of the first fill value.
-
- Returns
- -------
- int
- """
- if len(self) == 0 or self.sp_index.npoints == len(self):
- return -1
-
- indices = self.sp_index.indices
- if not len(indices) or indices[0] > 0:
- return 0
-
- # append a number larger than 1 at the end so that the case
- # where the fill value only appears in the tail of the array
- # is also handled
- diff = np.r_[np.diff(indices), 2]
- return indices[(diff > 1).argmax()] + 1
-
- def unique(self: SparseArrayT) -> SparseArrayT:
- uniques = algos.unique(self.sp_values)
- if len(self.sp_values) != len(self):
- fill_loc = self._first_fill_value_loc()
- # In order to match the behavior of pd.unique or
- # pd.Series.unique, we keep the original order and
- # use unique again here to find the insertion place.
- # Since the length of sp_values is not large, the
- # minor performance cost is worth the correctness.
- insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
- uniques = np.insert(uniques, insert_loc, self.fill_value)
- return type(self)._from_sequence(uniques, dtype=self.dtype)
-
- def _values_for_factorize(self):
- # Still override this for hash_pandas_object
- return np.asarray(self), self.fill_value
-
- def factorize(
- self,
- use_na_sentinel: bool = True,
- ) -> tuple[np.ndarray, SparseArray]:
- # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
- # The sparsity on this is backwards from what Sparse would want. Want
- # ExtensionArray.factorize -> Tuple[EA, EA]
- # Given that we have to return a dense array of codes, why bother
- # implementing an efficient factorize?
- codes, uniques = algos.factorize(
- np.asarray(self), use_na_sentinel=use_na_sentinel
- )
- uniques_sp = SparseArray(uniques, dtype=self.dtype)
- return codes, uniques_sp
-
- def value_counts(self, dropna: bool = True) -> Series:
- """
- Returns a Series containing counts of unique values.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include counts of NaN, even if NaN is in sp_values.
-
- Returns
- -------
- counts : Series
- """
- from pandas import (
- Index,
- Series,
- )
-
- keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
- fcounts = self.sp_index.ngaps
- if fcounts > 0 and (not self._null_fill_value or not dropna):
- mask = isna(keys) if self._null_fill_value else keys == self.fill_value
- if mask.any():
- counts[mask] += fcounts
- else:
- # error: Argument 1 to "insert" has incompatible type "Union[
- # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
- # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
- # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
- # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
- # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
- keys = np.insert(keys, 0, self.fill_value) # type: ignore[arg-type]
- counts = np.insert(counts, 0, fcounts)
-
- if not isinstance(keys, ABCIndex):
- index = Index(keys)
- else:
- index = keys
- return Series(counts, index=index, copy=False)
-
- # --------
- # Indexing
- # --------
- @overload
- def __getitem__(self, key: ScalarIndexer) -> Any:
- ...
-
- @overload
- def __getitem__(
- self: SparseArrayT,
- key: SequenceIndexer | tuple[int | ellipsis, ...],
- ) -> SparseArrayT:
- ...
-
- def __getitem__(
- self: SparseArrayT,
- key: PositionalIndexer | tuple[int | ellipsis, ...],
- ) -> SparseArrayT | Any:
- if isinstance(key, tuple):
- key = unpack_tuple_and_ellipses(key)
- if key is Ellipsis:
- raise ValueError("Cannot slice with Ellipsis")
-
- if is_integer(key):
- return self._get_val_at(key)
- elif isinstance(key, tuple):
- # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
- # for "ndarray[Any, Any]"; expected type
- # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
- # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
- # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
- # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
- # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
- # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
- # _NestedSequence[Union[bool, int]]], ...]]"
- data_slice = self.to_dense()[key] # type: ignore[index]
- elif isinstance(key, slice):
- # Avoid densifying when handling contiguous slices
- if key.step is None or key.step == 1:
- start = 0 if key.start is None else key.start
- if start < 0:
- start += len(self)
-
- end = len(self) if key.stop is None else key.stop
- if end < 0:
- end += len(self)
-
- indices = self.sp_index.indices
- keep_inds = np.flatnonzero((indices >= start) & (indices < end))
- sp_vals = self.sp_values[keep_inds]
-
- sp_index = indices[keep_inds].copy()
-
- # If we've sliced to not include the start of the array, all our indices
- # should be shifted. NB: here we are careful to also not shift by a
- # negative value for a case like [0, 1][-100:] where the start index
- # should be treated like 0
- if start > 0:
- sp_index -= start
-
- # Length of our result should match applying this slice to a range
- # of the length of our original array
- new_len = len(range(len(self))[key])
- new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
- return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
- else:
- indices = np.arange(len(self), dtype=np.int32)[key]
- return self.take(indices)
-
- elif not is_list_like(key):
- # e.g. "foo" or 2.5
- # exception message copied from numpy
- raise IndexError(
- r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
- r"(`None`) and integer or boolean arrays are valid indices"
- )
-
- else:
- if isinstance(key, SparseArray):
- # NOTE: if we guarantee that SparseDtype(bool)
- # only has True, False or nan as its fill_value
- # (see GH PR 44955)
- # we can apply the mask very quickly:
- if is_bool_dtype(key):
- if isna(key.fill_value):
- return self.take(key.sp_index.indices[key.sp_values])
- if not key.fill_value:
- return self.take(key.sp_index.indices)
- n = len(self)
- mask = np.full(n, True, dtype=np.bool_)
- mask[key.sp_index.indices] = False
- return self.take(np.arange(n)[mask])
- else:
- key = np.asarray(key)
-
- key = check_array_indexer(self, key)
-
- if com.is_bool_indexer(key):
- # mypy doesn't know we have an array here
- key = cast(np.ndarray, key)
- return self.take(np.arange(len(key), dtype=np.int32)[key])
- elif hasattr(key, "__len__"):
- return self.take(key)
- else:
- raise ValueError(f"Cannot slice with '{key}'")
-
- return type(self)(data_slice, kind=self.kind)
-
- def _get_val_at(self, loc):
- loc = validate_insert_loc(loc, len(self))
-
- sp_loc = self.sp_index.lookup(loc)
- if sp_loc == -1:
- return self.fill_value
- else:
- val = self.sp_values[sp_loc]
- val = maybe_box_datetimelike(val, self.sp_values.dtype)
- return val
-
- def take(
- self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
- ) -> SparseArrayT:
- if is_scalar(indices):
- raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
- indices = np.asarray(indices, dtype=np.int32)
-
- dtype = None
- if indices.size == 0:
- result = np.array([], dtype="object")
- dtype = self.dtype
- elif allow_fill:
- result = self._take_with_fill(indices, fill_value=fill_value)
- else:
- return self._take_without_fill(indices)
-
- return type(self)(
- result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
- )
-
- def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
- if fill_value is None:
- fill_value = self.dtype.na_value
-
- if indices.min() < -1:
- raise ValueError(
- "Invalid value in 'indices'. Must be between -1 "
- "and the length of the array."
- )
-
- if indices.max() >= len(self):
- raise IndexError("out of bounds value in 'indices'.")
-
- if len(self) == 0:
- # Empty array: only allow take when all indices are -1 (fill)
- if (indices == -1).all():
- dtype = np.result_type(self.sp_values, type(fill_value))
- taken = np.empty_like(indices, dtype=dtype)
- taken.fill(fill_value)
- return taken
- else:
- raise IndexError("cannot do a non-empty take from an empty axes.")
-
- # sp_indexer may be -1 for two reasons
- # 1.) the requested index was -1 (a new fill position)
- # 2.) the value taken was self.fill_value (an old fill position)
- sp_indexer = self.sp_index.lookup_array(indices)
- new_fill_indices = indices == -1
- old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
-
- if self.sp_index.npoints == 0 and old_fill_indices.all():
- # We've looked up all valid points on an all-sparse array.
- taken = np.full(
- sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
- )
-
- elif self.sp_index.npoints == 0:
- # Avoid taking from the empty self.sp_values
- _dtype = np.result_type(self.dtype.subtype, type(fill_value))
- taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
- else:
- taken = self.sp_values.take(sp_indexer)
-
- # Fill in two steps.
- # Old fill values
- # New fill values
- # potentially coercing to a new dtype at each stage.
-
- m0 = sp_indexer[old_fill_indices] < 0
- m1 = sp_indexer[new_fill_indices] < 0
-
- result_type = taken.dtype
-
- if m0.any():
- result_type = np.result_type(result_type, type(self.fill_value))
- taken = taken.astype(result_type)
- taken[old_fill_indices] = self.fill_value
-
- if m1.any():
- result_type = np.result_type(result_type, type(fill_value))
- taken = taken.astype(result_type)
- taken[new_fill_indices] = fill_value
-
- return taken
-
- def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
- to_shift = indices < 0
-
- n = len(self)
-
- if (indices.max() >= n) or (indices.min() < -n):
- if n == 0:
- raise IndexError("cannot do a non-empty take from an empty axes.")
- raise IndexError("out of bounds value in 'indices'.")
-
- if to_shift.any():
- indices = indices.copy()
- indices[to_shift] += n
-
- sp_indexer = self.sp_index.lookup_array(indices)
- value_mask = sp_indexer != -1
- new_sp_values = self.sp_values[sp_indexer[value_mask]]
-
- value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)
-
- new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
- return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
-
- def searchsorted(
- self,
- v: ArrayLike | object,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- msg = "searchsorted requires high memory usage."
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
- if not is_scalar(v):
- v = np.asarray(v)
- v = np.asarray(v)
- return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
-
- def copy(self: SparseArrayT) -> SparseArrayT:
- values = self.sp_values.copy()
- return self._simple_new(values, self.sp_index, self.dtype)
-
- @classmethod
- def _concat_same_type(
- cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
- ) -> SparseArrayT:
- fill_value = to_concat[0].fill_value
-
- values = []
- length = 0
-
- if to_concat:
- sp_kind = to_concat[0].kind
- else:
- sp_kind = "integer"
-
- sp_index: SparseIndex
- if sp_kind == "integer":
- indices = []
-
- for arr in to_concat:
- int_idx = arr.sp_index.indices.copy()
- int_idx += length # TODO: wraparound
- length += arr.sp_index.length
-
- values.append(arr.sp_values)
- indices.append(int_idx)
-
- data = np.concatenate(values)
- indices_arr = np.concatenate(indices)
- # error: Argument 2 to "IntIndex" has incompatible type
- # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
- # expected "Sequence[int]"
- sp_index = IntIndex(length, indices_arr) # type: ignore[arg-type]
-
- else:
- # when concatenating block indices, we don't claim that you'll
- # get an identical index as concatenating the values and then
- # creating a new index. We don't want to spend the time trying
- # to merge blocks across arrays in `to_concat`, so the resulting
- # BlockIndex may have more blocks.
- blengths = []
- blocs = []
-
- for arr in to_concat:
- block_idx = arr.sp_index.to_block_index()
-
- values.append(arr.sp_values)
- blocs.append(block_idx.blocs.copy() + length)
- blengths.append(block_idx.blengths)
- length += arr.sp_index.length
-
- data = np.concatenate(values)
- blocs_arr = np.concatenate(blocs)
- blengths_arr = np.concatenate(blengths)
-
- sp_index = BlockIndex(length, blocs_arr, blengths_arr)
-
- return cls(data, sparse_index=sp_index, fill_value=fill_value)
-
- def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
- """
- Change the dtype of a SparseArray.
-
- The output will always be a SparseArray. To convert to a dense
- ndarray with a certain dtype, use :meth:`numpy.asarray`.
-
- Parameters
- ----------
- dtype : np.dtype or ExtensionDtype
- For SparseDtype, this changes the dtype of
- ``self.sp_values`` and the ``self.fill_value``.
-
- For other dtypes, this only changes the dtype of
- ``self.sp_values``.
-
- copy : bool, default True
- Whether to ensure a copy is made, even if not necessary.
-
- Returns
- -------
- SparseArray
-
- Examples
- --------
- >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
- >>> arr
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
-
- >>> arr.astype(SparseDtype(np.dtype('int32')))
- [0, 0, 1, 2]
- Fill: 0
- IntIndex
- Indices: array([2, 3], dtype=int32)
-
- Using a NumPy dtype with a different kind (e.g. float) will coerce
- just ``self.sp_values``.
-
- >>> arr.astype(SparseDtype(np.dtype('float64')))
- ... # doctest: +NORMALIZE_WHITESPACE
- [nan, nan, 1.0, 2.0]
- Fill: nan
- IntIndex
- Indices: array([2, 3], dtype=int32)
-
- Using a SparseDtype, you can also change the fill value.
-
- >>> arr.astype(SparseDtype("float64", fill_value=0.0))
- ... # doctest: +NORMALIZE_WHITESPACE
- [0.0, 0.0, 1.0, 2.0]
- Fill: 0.0
- IntIndex
- Indices: array([2, 3], dtype=int32)
- """
- if is_dtype_equal(dtype, self._dtype):
- if not copy:
- return self
- else:
- return self.copy()
-
- future_dtype = pandas_dtype(dtype)
- if not isinstance(future_dtype, SparseDtype):
- # GH#34457
- values = np.asarray(self)
- values = ensure_wrapped_if_datetimelike(values)
- return astype_array(values, dtype=future_dtype, copy=False)
-
- dtype = self.dtype.update_dtype(dtype)
- subtype = pandas_dtype(dtype._subtype_with_str)
- subtype = cast(np.dtype, subtype) # ensured by update_dtype
- values = ensure_wrapped_if_datetimelike(self.sp_values)
- sp_values = astype_array(values, subtype, copy=copy)
- sp_values = np.asarray(sp_values)
-
- return self._simple_new(sp_values, self.sp_index, dtype)
-
- def map(self: SparseArrayT, mapper) -> SparseArrayT:
- """
- Map categories using an input mapping or function.
-
- Parameters
- ----------
- mapper : dict, Series, callable
- The correspondence from old values to new.
-
- Returns
- -------
- SparseArray
- The output array will have the same density as the input.
- The output fill value will be the result of applying the
- mapping to ``self.fill_value``.
-
- Examples
- --------
- >>> arr = pd.arrays.SparseArray([0, 1, 2])
- >>> arr.map(lambda x: x + 10)
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
-
- >>> arr.map({0: 10, 1: 11, 2: 12})
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
-
- >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
- [10, 11, 12]
- Fill: 10
- IntIndex
- Indices: array([1, 2], dtype=int32)
- """
- # this is used in apply.
- # We get hit since we're an "is_extension_array_dtype" but regular extension
- # types are not hit. This may be worth adding to the interface.
- if isinstance(mapper, ABCSeries):
- mapper = mapper.to_dict()
-
- if isinstance(mapper, abc.Mapping):
- fill_value = mapper.get(self.fill_value, self.fill_value)
- sp_values = [mapper.get(x, None) for x in self.sp_values]
- else:
- fill_value = mapper(self.fill_value)
- sp_values = [mapper(x) for x in self.sp_values]
-
- return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)
-
- def to_dense(self) -> np.ndarray:
- """
- Convert SparseArray to a NumPy array.
-
- Returns
- -------
- arr : NumPy array
- """
- return np.asarray(self, dtype=self.sp_values.dtype)
-
- def _where(self, mask, value):
- # NB: may not preserve dtype, e.g. result may be Sparse[float64]
- # while self is Sparse[int64]
- naive_implementation = np.where(mask, self, value)
- dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
- result = type(self)._from_sequence(naive_implementation, dtype=dtype)
- return result
-
- # ------------------------------------------------------------------------
- # IO
- # ------------------------------------------------------------------------
- def __setstate__(self, state) -> None:
- """Necessary for making this object picklable"""
- if isinstance(state, tuple):
- # Compat for pandas < 0.24.0
- nd_state, (fill_value, sp_index) = state
- sparse_values = np.array([])
- sparse_values.__setstate__(nd_state)
-
- self._sparse_values = sparse_values
- self._sparse_index = sp_index
- self._dtype = SparseDtype(sparse_values.dtype, fill_value)
- else:
- self.__dict__.update(state)
-
- def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
- if self.fill_value == 0:
- return (self.sp_index.indices,)
- else:
- return (self.sp_index.indices[self.sp_values != 0],)
-
- # ------------------------------------------------------------------------
- # Reductions
- # ------------------------------------------------------------------------
-
- def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
- method = getattr(self, name, None)
-
- if method is None:
- raise TypeError(f"cannot perform {name} with type {self.dtype}")
-
- if skipna:
- arr = self
- else:
- arr = self.dropna()
-
- return getattr(arr, name)(**kwargs)
-
- def all(self, axis=None, *args, **kwargs):
- """
- Tests whether all elements evaluate to True.
-
- Returns
- -------
- all : bool
-
- See Also
- --------
- numpy.all
- """
- nv.validate_all(args, kwargs)
-
- values = self.sp_values
-
- if len(values) != len(self) and not np.all(self.fill_value):
- return False
-
- return values.all()
-
- def any(self, axis: AxisInt = 0, *args, **kwargs):
- """
- Tests whether at least one element evaluates to True.
-
- Returns
- -------
- any : bool
-
- See Also
- --------
- numpy.any
- """
- nv.validate_any(args, kwargs)
-
- values = self.sp_values
-
- if len(values) != len(self) and np.any(self.fill_value):
- return True
-
- return values.any().item()
-
- def sum(
- self,
- axis: AxisInt = 0,
- min_count: int = 0,
- skipna: bool = True,
- *args,
- **kwargs,
- ) -> Scalar:
- """
- Sum of non-NA/null values
-
- Parameters
- ----------
- axis : int, default 0
- Not Used. NumPy compatibility.
- min_count : int, default 0
- The required number of valid values to perform the summation. If fewer
- than ``min_count`` valid values are present, the result will be the missing
- value indicator for the subarray type.
- *args, **kwargs
- Not Used. NumPy compatibility.
-
- Returns
- -------
- scalar
- """
- nv.validate_sum(args, kwargs)
- valid_vals = self._valid_sp_values
- sp_sum = valid_vals.sum()
- has_na = self.sp_index.ngaps > 0 and not self._null_fill_value
-
- if has_na and not skipna:
- return na_value_for_dtype(self.dtype.subtype, compat=False)
-
- if self._null_fill_value:
- if check_below_min_count(valid_vals.shape, None, min_count):
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- return sp_sum
- else:
- nsparse = self.sp_index.ngaps
- if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- return sp_sum + self.fill_value * nsparse
-
- def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
- """
- Cumulative sum of non-NA/null values.
-
- When performing the cumulative summation, any NA/null values will
- be skipped. The resulting SparseArray will preserve the locations of
- NaN values, but the fill value will be `np.nan` regardless.
-
- Parameters
- ----------
- axis : int or None
- Axis over which to perform the cumulative summation. If None,
- perform cumulative summation over flattened array.
-
- Returns
- -------
- cumsum : SparseArray
- """
- nv.validate_cumsum(args, kwargs)
-
- if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
- raise ValueError(f"axis(={axis}) out of bounds")
-
- if not self._null_fill_value:
- return SparseArray(self.to_dense()).cumsum()
-
- return SparseArray(
- self.sp_values.cumsum(),
- sparse_index=self.sp_index,
- fill_value=self.fill_value,
- )
-
- def mean(self, axis: Axis = 0, *args, **kwargs):
- """
- Mean of non-NA/null values
-
- Returns
- -------
- mean : float
- """
- nv.validate_mean(args, kwargs)
- valid_vals = self._valid_sp_values
- sp_sum = valid_vals.sum()
- ct = len(valid_vals)
-
- if self._null_fill_value:
- return sp_sum / ct
- else:
- nsparse = self.sp_index.ngaps
- return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
-
- def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
- """
- Max of array values, ignoring NA values if specified.
-
- Parameters
- ----------
- axis : int, default 0
- Not Used. NumPy compatibility.
- skipna : bool, default True
- Whether to ignore NA values.
-
- Returns
- -------
- scalar
- """
- nv.validate_minmax_axis(axis, self.ndim)
- return self._min_max("max", skipna=skipna)
-
- def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
- """
- Min of array values, ignoring NA values if specified.
-
- Parameters
- ----------
- axis : int, default 0
- Not Used. NumPy compatibility.
- skipna : bool, default True
- Whether to ignore NA values.
-
- Returns
- -------
- scalar
- """
- nv.validate_minmax_axis(axis, self.ndim)
- return self._min_max("min", skipna=skipna)
-
- def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
- """
- Min/max of non-NA/null values
-
- Parameters
- ----------
- kind : {"min", "max"}
- skipna : bool
-
- Returns
- -------
- scalar
- """
- valid_vals = self._valid_sp_values
- has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
-
- if len(valid_vals) > 0:
- sp_min_max = getattr(valid_vals, kind)()
-
- # If a non-null fill value is currently present, it might be the min/max
- if has_nonnull_fill_vals:
- func = max if kind == "max" else min
- return func(sp_min_max, self.fill_value)
- elif skipna:
- return sp_min_max
- elif self.sp_index.ngaps == 0:
- # No NAs present
- return sp_min_max
- else:
- return na_value_for_dtype(self.dtype.subtype, compat=False)
- elif has_nonnull_fill_vals:
- return self.fill_value
- else:
- return na_value_for_dtype(self.dtype.subtype, compat=False)
-
- def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
- values = self._sparse_values
- index = self._sparse_index.indices
- mask = np.asarray(isna(values))
- func = np.argmax if kind == "argmax" else np.argmin
-
- idx = np.arange(values.shape[0])
- non_nans = values[~mask]
- non_nan_idx = idx[~mask]
-
- _candidate = non_nan_idx[func(non_nans)]
- candidate = index[_candidate]
-
- if isna(self.fill_value):
- return candidate
- if kind == "argmin" and self[candidate] < self.fill_value:
- return candidate
- if kind == "argmax" and self[candidate] > self.fill_value:
- return candidate
- _loc = self._first_fill_value_loc()
- if _loc == -1:
- # fill_value doesn't exist
- return candidate
- else:
- return _loc
-
- def argmax(self, skipna: bool = True) -> int:
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return self._argmin_argmax("argmax")
-
- def argmin(self, skipna: bool = True) -> int:
- validate_bool_kwarg(skipna, "skipna")
- if not skipna and self._hasna:
- raise NotImplementedError
- return self._argmin_argmax("argmin")
-
- # ------------------------------------------------------------------------
- # Ufuncs
- # ------------------------------------------------------------------------
-
- _HANDLED_TYPES = (np.ndarray, numbers.Number)
-
- def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
- out = kwargs.get("out", ())
-
- for x in inputs + out:
- if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
- return NotImplemented
-
- # for binary ops, use our custom dunder methods
- result = ops.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- if "out" in kwargs:
- # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
- res = arraylike.dispatch_ufunc_with_out(
- self, ufunc, method, *inputs, **kwargs
- )
- return res
-
- if method == "reduce":
- result = arraylike.dispatch_reduction_ufunc(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- # e.g. tests.series.test_ufunc.TestNumpyReductions
- return result
-
- if len(inputs) == 1:
- # No alignment necessary.
- sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
- fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
-
- if ufunc.nout > 1:
- # multiple outputs. e.g. modf
- arrays = tuple(
- self._simple_new(
- sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
- )
- for sp_value, fv in zip(sp_values, fill_value)
- )
- return arrays
- elif method == "reduce":
- # e.g. reductions
- return sp_values
-
- return self._simple_new(
- sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
- )
-
- new_inputs = tuple(np.asarray(x) for x in inputs)
- result = getattr(ufunc, method)(*new_inputs, **kwargs)
- if out:
- if len(out) == 1:
- out = out[0]
- return out
-
- if ufunc.nout > 1:
- return tuple(type(self)(x) for x in result)
- elif method == "at":
- # no return value
- return None
- else:
- return type(self)(result)
-
- # ------------------------------------------------------------------------
- # Ops
- # ------------------------------------------------------------------------
-
- def _arith_method(self, other, op):
- op_name = op.__name__
-
- if isinstance(other, SparseArray):
- return _sparse_array_op(self, other, op, op_name)
-
- elif is_scalar(other):
- with np.errstate(all="ignore"):
- fill = op(_get_fill(self), np.asarray(other))
- result = op(self.sp_values, other)
-
- if op_name == "divmod":
- left, right = result
- lfill, rfill = fill
- return (
- _wrap_result(op_name, left, self.sp_index, lfill),
- _wrap_result(op_name, right, self.sp_index, rfill),
- )
-
- return _wrap_result(op_name, result, self.sp_index, fill)
-
- else:
- other = np.asarray(other)
- with np.errstate(all="ignore"):
- if len(self) != len(other):
- raise AssertionError(
- f"length mismatch: {len(self)} vs. {len(other)}"
- )
- if not isinstance(other, SparseArray):
- dtype = getattr(other, "dtype", None)
- other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
- return _sparse_array_op(self, other, op, op_name)
-
- def _cmp_method(self, other, op) -> SparseArray:
- if not is_scalar(other) and not isinstance(other, type(self)):
- # convert list-like to ndarray
- other = np.asarray(other)
-
- if isinstance(other, np.ndarray):
- # TODO: make this more flexible than just ndarray...
- other = SparseArray(other, fill_value=self.fill_value)
-
- if isinstance(other, SparseArray):
- if len(self) != len(other):
- raise ValueError(
- f"operands have mismatched length {len(self)} and {len(other)}"
- )
-
- op_name = op.__name__.strip("_")
- return _sparse_array_op(self, other, op, op_name)
- else:
- # scalar
- with np.errstate(all="ignore"):
- fill_value = op(self.fill_value, other)
- result = np.full(len(self), fill_value, dtype=np.bool_)
- result[self.sp_index.indices] = op(self.sp_values, other)
-
- return type(self)(
- result,
- fill_value=fill_value,
- dtype=np.bool_,
- )
-
- _logical_method = _cmp_method
-
- def _unary_method(self, op) -> SparseArray:
- fill_value = op(np.array(self.fill_value)).item()
- dtype = SparseDtype(self.dtype.subtype, fill_value)
- # NOTE: if fill_value doesn't change
- # we just have to apply op to sp_values
- if isna(self.fill_value) or fill_value == self.fill_value:
- values = op(self.sp_values)
- return type(self)._simple_new(values, self.sp_index, self.dtype)
- # In the other case we have to recalc indexes
- return type(self)(op(self.to_dense()), dtype=dtype)
-
- def __pos__(self) -> SparseArray:
- return self._unary_method(operator.pos)
-
- def __neg__(self) -> SparseArray:
- return self._unary_method(operator.neg)
-
- def __invert__(self) -> SparseArray:
- return self._unary_method(operator.invert)
-
- def __abs__(self) -> SparseArray:
- return self._unary_method(operator.abs)
-
- # ----------
- # Formatting
- # -----------
- def __repr__(self) -> str:
- pp_str = printing.pprint_thing(self)
- pp_fill = printing.pprint_thing(self.fill_value)
- pp_index = printing.pprint_thing(self.sp_index)
- return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
-
- def _formatter(self, boxed: bool = False):
- # Defer to the formatter from the GenericArrayFormatter calling us.
- # This will infer the correct formatter from the dtype of the values.
- return None
-
-
-def _make_sparse(
- arr: np.ndarray,
- kind: SparseIndexKind = "block",
- fill_value=None,
- dtype: np.dtype | None = None,
-):
- """
- Convert ndarray to sparse format
-
- Parameters
- ----------
- arr : ndarray
- kind : {'block', 'integer'}
- fill_value : NaN or another value
- dtype : np.dtype, optional
-
- Returns
- -------
- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
- """
- assert isinstance(arr, np.ndarray)
-
- if arr.ndim > 1:
- raise TypeError("expected dimension <= 1 data")
-
- if fill_value is None:
- fill_value = na_value_for_dtype(arr.dtype)
-
- if isna(fill_value):
- mask = notna(arr)
- else:
- # cast to object comparison to be safe
- if is_string_dtype(arr.dtype):
- arr = arr.astype(object)
-
- if is_object_dtype(arr.dtype):
- # numpy's element-wise equality check does not distinguish
- # element types, e.g. 0, 0.0 and False are treated as the
- # same, so we have to check both the type and the value.
- mask = splib.make_mask_object_ndarray(arr, fill_value)
- else:
- mask = arr != fill_value
-
- length = len(arr)
- if length != len(mask):
- # the arr is a SparseArray
- indices = mask.sp_index.indices
- else:
- indices = mask.nonzero()[0].astype(np.int32)
-
- index = make_sparse_index(length, indices, kind)
- sparsified_values = arr[mask]
- if dtype is not None:
- sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
- sparsified_values = astype_array(sparsified_values, dtype=dtype)
- sparsified_values = np.asarray(sparsified_values)
-
- # TODO: copy
- return sparsified_values, index, fill_value
-
-
-@overload
-def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
- ...
-
-
-@overload
-def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
- ...
-
-
-def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
- index: SparseIndex
- if kind == "block":
- locs, lens = splib.get_blocks(indices)
- index = BlockIndex(length, locs, lens)
- elif kind == "integer":
- index = IntIndex(length, indices)
- else: # pragma: no cover
- raise ValueError("must be block or integer type")
- return index
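
For context on what is being dropped here: the file above implemented pandas' public SparseArray. A minimal sketch of that public behaviour, assuming pandas (which still ships pandas.arrays.SparseArray) is consumed as an ordinary PyPI dependency instead of this vendored copy:

import numpy as np
import pandas as pd

# Values equal to fill_value (0 here) are not stored.
arr = pd.arrays.SparseArray([0, 0, 1, 2], fill_value=0)
print(arr.sp_values)    # [1 2]  (only the non-fill values are stored)
print(arr.density)      # 0.5    (fraction of stored points)
print(arr.npoints)      # 2

# Binary ops are routed through _sparse_array_op and stay sparse.
other = pd.arrays.SparseArray([0, 1, 0, 2], fill_value=0)
print(arr + other)      # elementwise sum, still a SparseArray with fill value 0

# Densify explicitly when a plain ndarray is needed.
print(np.asarray(arr))  # [0 0 1 2]
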
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/sparse/dtype.py b/contrib/python/pandas/py3/pandas/core/arrays/sparse/dtype.py
deleted file mode 100644
index 185ed2911c1..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/sparse/dtype.py
+++ /dev/null
@@ -1,426 +0,0 @@
-"""Sparse Dtype"""
-from __future__ import annotations
-
-import re
-from typing import (
- TYPE_CHECKING,
- Any,
-)
-import warnings
-
-import numpy as np
-
-from pandas._typing import (
- Dtype,
- DtypeObj,
- type_t,
-)
-from pandas.errors import PerformanceWarning
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.astype import astype_array
-from pandas.core.dtypes.base import (
- ExtensionDtype,
- register_extension_dtype,
-)
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
-)
-
-if TYPE_CHECKING:
- from pandas.core.arrays.sparse.array import SparseArray
-
-
-@register_extension_dtype
-class SparseDtype(ExtensionDtype):
- """
- Dtype for data stored in :class:`SparseArray`.
-
- This dtype implements the pandas ExtensionDtype interface.
-
- Parameters
- ----------
- dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
- The dtype of the underlying array storing the non-``fill_value`` values.
- fill_value : scalar, optional
- The scalar value not stored in the SparseArray. By default, this
- depends on `dtype`.
-
- =========== ==========
- dtype na_value
- =========== ==========
- float ``np.nan``
- int ``0``
- bool ``False``
- datetime64 ``pd.NaT``
- timedelta64 ``pd.NaT``
- =========== ==========
-
- The default value may be overridden by specifying a `fill_value`.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
- """
-
- # We include `_is_na_fill_value` in the metadata to avoid hash collisions
- # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
- # Without is_na_fill_value in the comparison, those would be equal since
- # hash(nan) is (sometimes?) 0.
- _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")
-
- def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
- if isinstance(dtype, type(self)):
- if fill_value is None:
- fill_value = dtype.fill_value
- dtype = dtype.subtype
-
- dtype = pandas_dtype(dtype)
- if is_string_dtype(dtype):
- dtype = np.dtype("object")
-
- if fill_value is None:
- fill_value = na_value_for_dtype(dtype)
-
- self._dtype = dtype
- self._fill_value = fill_value
- self._check_fill_value()
-
- def __hash__(self) -> int:
- # Python3 doesn't inherit __hash__ when a base class overrides
- # __eq__, so we explicitly do it here.
- return super().__hash__()
-
- def __eq__(self, other: Any) -> bool:
- # We have to override __eq__ to handle NA values in _metadata.
- # The base class does simple == checks, which fail for NA.
- if isinstance(other, str):
- try:
- other = self.construct_from_string(other)
- except TypeError:
- return False
-
- if isinstance(other, type(self)):
- subtype = self.subtype == other.subtype
- if self._is_na_fill_value:
- # this case is complicated by two things:
- # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
- # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)
- # i.e. we want to treat any floating-point NaN as equal, but
- # not a floating-point NaN and a datetime NaT.
- fill_value = (
- other._is_na_fill_value
- and isinstance(self.fill_value, type(other.fill_value))
- or isinstance(other.fill_value, type(self.fill_value))
- )
- else:
- with warnings.catch_warnings():
- # Ignore spurious numpy warning
- warnings.filterwarnings(
- "ignore",
- "elementwise comparison failed",
- category=DeprecationWarning,
- )
-
- fill_value = self.fill_value == other.fill_value
-
- return subtype and fill_value
- return False
-
- @property
- def fill_value(self):
- """
- The fill value of the array.
-
- Converting the SparseArray to a dense ndarray will fill the
- array with this value.
-
- .. warning::
-
- It's possible to end up with a SparseArray that has ``fill_value``
- values in ``sp_values``. This can occur, for example, when setting
- ``SparseArray.fill_value`` directly.
- """
- return self._fill_value
-
- def _check_fill_value(self):
- if not is_scalar(self._fill_value):
- raise ValueError(
- f"fill_value must be a scalar. Got {self._fill_value} instead"
- )
- # TODO: Right now we can use Sparse boolean array
- # with any fill_value. Here was an attempt
- # to allow only 3 values: True, False or nan,
- # but plenty of tests failed.
- # see pull 44955
- # if self._is_boolean and not (
- # is_bool(self._fill_value) or isna(self._fill_value)
- # ):
- # raise ValueError(
- # "fill_value must be True, False or nan "
- # f"for boolean type. Got {self._fill_value} instead"
- # )
-
- @property
- def _is_na_fill_value(self) -> bool:
- return isna(self.fill_value)
-
- @property
- def _is_numeric(self) -> bool:
- return not is_object_dtype(self.subtype)
-
- @property
- def _is_boolean(self) -> bool:
- return is_bool_dtype(self.subtype)
-
- @property
- def kind(self) -> str:
- """
- The kind character of the underlying subtype, e.g. ``'i'`` or ``'f'``.
- """
- return self.subtype.kind
-
- @property
- def type(self):
- return self.subtype.type
-
- @property
- def subtype(self):
- return self._dtype
-
- @property
- def name(self) -> str:
- return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"
-
- def __repr__(self) -> str:
- return self.name
-
- @classmethod
- def construct_array_type(cls) -> type_t[SparseArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas.core.arrays.sparse.array import SparseArray
-
- return SparseArray
-
- @classmethod
- def construct_from_string(cls, string: str) -> SparseDtype:
- """
- Construct a SparseDtype from a string form.
-
- Parameters
- ----------
- string : str
- Can take the following forms.
-
- string dtype
- ================ ============================
- 'int' SparseDtype[np.int64, 0]
- 'Sparse' SparseDtype[np.float64, nan]
- 'Sparse[int]' SparseDtype[np.int64, 0]
- 'Sparse[int, 0]' SparseDtype[np.int64, 0]
- ================ ============================
-
- It is not possible to specify non-default fill values
- with a string. An argument like ``'Sparse[int, 1]'``
- will raise a ``TypeError`` because the default fill value
- for integers is 0.
-
- Returns
- -------
- SparseDtype
- """
- if not isinstance(string, str):
- raise TypeError(
- f"'construct_from_string' expects a string, got {type(string)}"
- )
- msg = f"Cannot construct a 'SparseDtype' from '{string}'"
- if string.startswith("Sparse"):
- try:
- sub_type, has_fill_value = cls._parse_subtype(string)
- except ValueError as err:
- raise TypeError(msg) from err
- else:
- result = SparseDtype(sub_type)
- msg = (
- f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
- "looks like the fill_value in the string is not "
- "the default for the dtype. Non-default fill_values "
- "are not supported. Use the 'SparseDtype()' "
- "constructor instead."
- )
- if has_fill_value and str(result) != string:
- raise TypeError(msg)
- return result
- else:
- raise TypeError(msg)
-
- @staticmethod
- def _parse_subtype(dtype: str) -> tuple[str, bool]:
- """
- Parse a string to get the subtype
-
- Parameters
- ----------
- dtype : str
- A string like
-
- * Sparse[subtype]
- * Sparse[subtype, fill_value]
-
- Returns
- -------
- subtype : str
-
- Raises
- ------
- ValueError
- When the subtype cannot be extracted.
- """
- xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
- m = xpr.match(dtype)
- has_fill_value = False
- if m:
- subtype = m.groupdict()["subtype"]
- has_fill_value = bool(m.groupdict()["fill_value"])
- elif dtype == "Sparse":
- subtype = "float64"
- else:
- raise ValueError(f"Cannot parse {dtype}")
- return subtype, has_fill_value
-
- @classmethod
- def is_dtype(cls, dtype: object) -> bool:
- dtype = getattr(dtype, "dtype", dtype)
- if isinstance(dtype, str) and dtype.startswith("Sparse"):
- sub_type, _ = cls._parse_subtype(dtype)
- dtype = np.dtype(sub_type)
- elif isinstance(dtype, cls):
- return True
- return isinstance(dtype, np.dtype) or dtype == "Sparse"
-
- def update_dtype(self, dtype) -> SparseDtype:
- """
- Convert the SparseDtype to a new dtype.
-
- This takes care of converting the ``fill_value``.
-
- Parameters
- ----------
- dtype : Union[str, numpy.dtype, SparseDtype]
- The new dtype to use.
-
- * For a SparseDtype, it is simply returned
- * For a NumPy dtype (or str), the current fill value
- is converted to the new dtype, and a SparseDtype
- with `dtype` and the new fill value is returned.
-
- Returns
- -------
- SparseDtype
- A new SparseDtype with the correct `dtype` and fill value
- for that `dtype`.
-
- Raises
- ------
- ValueError
- When the current fill value cannot be converted to the
- new `dtype` (e.g. trying to convert ``np.nan`` to an
- integer dtype).
-
-
- Examples
- --------
- >>> SparseDtype(int, 0).update_dtype(float)
- Sparse[float64, 0.0]
-
- >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
- Sparse[float64, nan]
- """
- cls = type(self)
- dtype = pandas_dtype(dtype)
-
- if not isinstance(dtype, cls):
- if not isinstance(dtype, np.dtype):
- raise TypeError("sparse arrays of extension dtypes not supported")
-
- fv_asarray = np.atleast_1d(np.array(self.fill_value))
- fvarr = astype_array(fv_asarray, dtype)
- # NB: not fv_0d.item(), as that casts dt64->int
- fill_value = fvarr[0]
- dtype = cls(dtype, fill_value=fill_value)
-
- return dtype
-
- @property
- def _subtype_with_str(self):
- """
- The subtype to use for conversions, which is ``str`` when the fill value is a string.
-
- Typically, pandas will store string data in an object-dtype array.
- When converting values to a dtype, e.g. in ``.astype``, we need to
- be more specific: we need the actual underlying type.
-
- Examples
- --------
- >>> SparseDtype(int, 1)._subtype_with_str
- dtype('int64')
-
- >>> SparseDtype(object, 1)._subtype_with_str
- dtype('O')
-
- >>> dtype = SparseDtype(str, '')
- >>> dtype.subtype
- dtype('O')
-
- >>> dtype._subtype_with_str
- <class 'str'>
- """
- if isinstance(self.fill_value, str):
- return type(self.fill_value)
- return self.subtype
-
- def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
- # TODO for now only handle SparseDtypes and numpy dtypes => extend
- # with other compatible extension dtypes
- from pandas.core.dtypes.cast import np_find_common_type
-
- if any(
- isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
- for x in dtypes
- ):
- return None
-
- fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
- fill_value = fill_values[0]
-
- # np.nan isn't a singleton, so we may end up with multiple
- # NaNs here, so we ignore the all NA case too.
- if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
- warnings.warn(
- "Concatenating sparse arrays with multiple fill "
- f"values: '{fill_values}'. Picking the first and "
- "converting the rest.",
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
-
- np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
- return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value)
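-
- # Editor's note (illustrative sketch, not part of the original module): this
- # common-dtype hook is the kind of path pd.concat relies on for sparse inputs, e.g.
- #
- # pd.concat([pd.Series([0, 1], dtype="Sparse[int]"),
- # pd.Series([0.5], dtype="Sparse[float]")]).dtype
- #
- # is expected to be a Sparse[float64, ...] dtype; mixing different fill values
- # also emits the PerformanceWarning coded above.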
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/sparse/scipy_sparse.py b/contrib/python/pandas/py3/pandas/core/arrays/sparse/scipy_sparse.py
deleted file mode 100644
index 6ac1931f1dd..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/sparse/scipy_sparse.py
+++ /dev/null
@@ -1,208 +0,0 @@
-"""
-Interaction with scipy.sparse matrices.
-
-Currently only includes to_coo helpers.
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Iterable,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- IndexLabel,
- npt,
-)
-
-from pandas.core.dtypes.missing import notna
-
-from pandas.core.algorithms import factorize
-from pandas.core.indexes.api import MultiIndex
-from pandas.core.series import Series
-
-if TYPE_CHECKING:
- import scipy.sparse
-
-
-def _check_is_partition(parts: Iterable, whole: Iterable):
- whole = set(whole)
- parts = [set(x) for x in parts]
- if set.intersection(*parts) != set():
- raise ValueError("Is not a partition because intersection is not null.")
- if set.union(*parts) != whole:
- raise ValueError("Is not a partition because union is not the whole.")
-
-
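- # Editor's note (illustrative sketch, not part of the original module):
- # _check_is_partition([(0,), (1,)], range(2)) # passes: parts are disjoint and cover the whole
- # _check_is_partition([(0,), (0, 1)], range(2)) # ValueError: parts overlap
-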
-def _levels_to_axis(
- ss,
- levels: tuple[int] | list[int],
- valid_ilocs: npt.NDArray[np.intp],
- sort_labels: bool = False,
-) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]:
- """
- For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`,
- where `ax_coords` are the coordinates along one of the two axes of the
- destination sparse matrix, and `ax_labels` are the labels from `ss`' Index
- which correspond to these coordinates.
-
- Parameters
- ----------
- ss : Series
- levels : tuple/list
- valid_ilocs : numpy.ndarray
- Array of integer positions of valid values for the sparse matrix in ss.
- sort_labels : bool, default False
- Sort the axis labels before forming the sparse matrix. When `levels`
- refers to a single level, set to True for faster execution.
-
- Returns
- -------
- ax_coords : numpy.ndarray (axis coordinates)
- ax_labels : list (axis labels)
- """
- # Since the labels are sorted in `Index.levels`, when we wish to sort and
- # there is only one level of the MultiIndex for this axis, the desired
- # output can be obtained in the following simpler, more efficient way.
- if sort_labels and len(levels) == 1:
- ax_coords = ss.index.codes[levels[0]][valid_ilocs]
- ax_labels = ss.index.levels[levels[0]]
-
- else:
- levels_values = lib.fast_zip(
- [ss.index.get_level_values(lvl).to_numpy() for lvl in levels]
- )
- codes, ax_labels = factorize(levels_values, sort=sort_labels)
- ax_coords = codes[valid_ilocs]
-
- ax_labels = ax_labels.tolist()
- return ax_coords, ax_labels
-
-
-def _to_ijv(
- ss,
- row_levels: tuple[int] | list[int] = (0,),
- column_levels: tuple[int] | list[int] = (1,),
- sort_labels: bool = False,
-) -> tuple[
- np.ndarray,
- npt.NDArray[np.intp],
- npt.NDArray[np.intp],
- list[IndexLabel],
- list[IndexLabel],
-]:
- """
- For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels,
- jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo
- constructor, and ilabels and jlabels are the row and column labels
- respectively.
-
- Parameters
- ----------
- ss : Series
- row_levels : tuple/list
- column_levels : tuple/list
- sort_labels : bool, default False
- Sort the row and column labels before forming the sparse matrix.
- When `row_levels` and/or `column_levels` refer to a single level,
- set to `True` for faster execution.
-
- Returns
- -------
- values : numpy.ndarray
- Valid values to populate a sparse matrix, extracted from
- ss.
- i_coords : numpy.ndarray (row coordinates of the values)
- j_coords : numpy.ndarray (column coordinates of the values)
- i_labels : list (row labels)
- j_labels : list (column labels)
- """
- # index and column levels must be a partition of the index
- _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
- # From the sparse Series, get the integer indices and data for valid sparse
- # entries.
- sp_vals = ss.array.sp_values
- na_mask = notna(sp_vals)
- values = sp_vals[na_mask]
- valid_ilocs = ss.array.sp_index.indices[na_mask]
-
- i_coords, i_labels = _levels_to_axis(
- ss, row_levels, valid_ilocs, sort_labels=sort_labels
- )
-
- j_coords, j_labels = _levels_to_axis(
- ss, column_levels, valid_ilocs, sort_labels=sort_labels
- )
-
- return values, i_coords, j_coords, i_labels, j_labels
-
-
-def sparse_series_to_coo(
- ss: Series,
- row_levels: Iterable[int] = (0,),
- column_levels: Iterable[int] = (1,),
- sort_labels: bool = False,
-) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]:
- """
- Convert a sparse Series to a scipy.sparse.coo_matrix using index
- levels row_levels, column_levels as the row and column
- labels respectively. Returns the sparse_matrix, row and column labels.
- """
- import scipy.sparse
-
- if ss.index.nlevels < 2:
- raise ValueError("to_coo requires MultiIndex with nlevels >= 2.")
- if not ss.index.is_unique:
- raise ValueError(
- "Duplicate index entries are not allowed in to_coo transformation."
- )
-
- # to keep things simple, only rely on integer indexing (not labels)
- row_levels = [ss.index._get_level_number(x) for x in row_levels]
- column_levels = [ss.index._get_level_number(x) for x in column_levels]
-
- v, i, j, rows, columns = _to_ijv(
- ss, row_levels=row_levels, column_levels=column_levels, sort_labels=sort_labels
- )
- sparse_matrix = scipy.sparse.coo_matrix(
- (v, (i, j)), shape=(len(rows), len(columns))
- )
- return sparse_matrix, rows, columns
-
-
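- # Editor's note (illustrative sketch, not part of the original module): a minimal
- # call of sparse_series_to_coo, assuming scipy is installed and `ss` is a sparse
- # Series with a unique two-level MultiIndex:
- #
- # A, rows, cols = sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,))
- # # A is a scipy.sparse.coo_matrix of shape (len(rows), len(cols))
-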
-def coo_to_sparse_series(
- A: scipy.sparse.coo_matrix, dense_index: bool = False
-) -> Series:
- """
- Convert a scipy.sparse.coo_matrix to a Series with type sparse.
-
- Parameters
- ----------
- A : scipy.sparse.coo_matrix
- dense_index : bool, default False
-
- Returns
- -------
- Series
-
- Raises
- ------
- TypeError
- If A is not a coo_matrix.
- """
- from pandas import SparseDtype
-
- try:
- ser = Series(A.data, MultiIndex.from_arrays((A.row, A.col)), copy=False)
- except AttributeError as err:
- raise TypeError(
- f"Expected coo_matrix. Got {type(A).__name__} instead."
- ) from err
- ser = ser.sort_index()
- ser = ser.astype(SparseDtype(ser.dtype))
- if dense_index:
- ind = MultiIndex.from_product([A.row, A.col])
- ser = ser.reindex(ind)
- return ser
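-
- # Editor's note (illustrative sketch, not part of the original module): the inverse
- # direction, assuming scipy is installed:
- #
- # import scipy.sparse
- # A = scipy.sparse.coo_matrix(([3.0, 1.0], ([0, 1], [0, 2])), shape=(2, 3))
- # ser = coo_to_sparse_series(A) # Series with a (row, col) MultiIndex and sparse dtype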
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/string_.py b/contrib/python/pandas/py3/pandas/core/arrays/string_.py
deleted file mode 100644
index 130f61b6891..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/string_.py
+++ /dev/null
@@ -1,608 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Literal,
-)
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import (
- lib,
- missing as libmissing,
-)
-from pandas._libs.arrays import NDArrayBacked
-from pandas._typing import (
- AxisInt,
- Dtype,
- Scalar,
- npt,
- type_t,
-)
-from pandas.compat import pa_version_under7p0
-from pandas.compat.numpy import function as nv
-from pandas.util._decorators import doc
-
-from pandas.core.dtypes.base import (
- ExtensionDtype,
- StorageExtensionDtype,
- register_extension_dtype,
-)
-from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_dtype_equal,
- is_integer_dtype,
- is_object_dtype,
- is_string_dtype,
- pandas_dtype,
-)
-
-from pandas.core import ops
-from pandas.core.array_algos import masked_reductions
-from pandas.core.arrays import (
- ExtensionArray,
- FloatingArray,
- IntegerArray,
-)
-from pandas.core.arrays.floating import FloatingDtype
-from pandas.core.arrays.integer import IntegerDtype
-from pandas.core.arrays.numpy_ import PandasArray
-from pandas.core.construction import extract_array
-from pandas.core.indexers import check_array_indexer
-from pandas.core.missing import isna
-
-if TYPE_CHECKING:
- import pyarrow
-
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
-
- from pandas import Series
-
-
-@register_extension_dtype
-class StringDtype(StorageExtensionDtype):
- """
- Extension dtype for string data.
-
- .. warning::
-
- StringDtype is considered experimental. The implementation and
- parts of the API may change without warning.
-
- Parameters
- ----------
- storage : {"python", "pyarrow"}, optional
- If not given, the value of ``pd.options.mode.string_storage``.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- Examples
- --------
- >>> pd.StringDtype()
- string[python]
-
- >>> pd.StringDtype(storage="pyarrow")
- string[pyarrow]
- """
-
- name = "string"
-
- #: StringDtype().na_value uses pandas.NA
- @property
- def na_value(self) -> libmissing.NAType:
- return libmissing.NA
-
- _metadata = ("storage",)
-
- def __init__(self, storage=None) -> None:
- if storage is None:
- storage = get_option("mode.string_storage")
- if storage not in {"python", "pyarrow"}:
- raise ValueError(
- f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
- )
- if storage == "pyarrow" and pa_version_under7p0:
- raise ImportError(
- "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
- )
- self.storage = storage
-
- @property
- def type(self) -> type[str]:
- return str
-
- @classmethod
- def construct_from_string(cls, string):
- """
- Construct a StringDtype from a string.
-
- Parameters
- ----------
- string : str
- The name of the type. The storage type will be taken from `string`.
- Valid options and their storage types are
-
- ========================== ==============================================
- string result storage
- ========================== ==============================================
- ``'string'`` pd.options.mode.string_storage, default python
- ``'string[python]'`` python
- ``'string[pyarrow]'`` pyarrow
- ========================== ==============================================
-
- Returns
- -------
- StringDtype
-
- Raises
- ------
- TypeError
- If the string is not a valid option.
- """
- if not isinstance(string, str):
- raise TypeError(
- f"'construct_from_string' expects a string, got {type(string)}"
- )
- if string == "string":
- return cls()
- elif string == "string[python]":
- return cls(storage="python")
- elif string == "string[pyarrow]":
- return cls(storage="pyarrow")
- else:
- raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
-
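- # Editor's note (illustrative sketch, not part of the original module), assuming
- # pyarrow>=7 is available for the second call:
- #
- # StringDtype.construct_from_string("string") # -> StringDtype per mode.string_storage
- # StringDtype.construct_from_string("string[pyarrow]") # -> string[pyarrow]
- # StringDtype.construct_from_string("str") # -> TypeError
-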
- # https://github.com/pandas-dev/pandas/issues/36126
- # error: Signature of "construct_array_type" incompatible with supertype
- # "ExtensionDtype"
- def construct_array_type( # type: ignore[override]
- self,
- ) -> type_t[BaseStringArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas.core.arrays.string_arrow import ArrowStringArray
-
- if self.storage == "python":
- return StringArray
- else:
- return ArrowStringArray
-
- def __from_arrow__(
- self, array: pyarrow.Array | pyarrow.ChunkedArray
- ) -> BaseStringArray:
- """
- Construct StringArray from pyarrow Array/ChunkedArray.
- """
- if self.storage == "pyarrow":
- from pandas.core.arrays.string_arrow import ArrowStringArray
-
- return ArrowStringArray(array)
- else:
- import pyarrow
-
- if isinstance(array, pyarrow.Array):
- chunks = [array]
- else:
- # pyarrow.ChunkedArray
- chunks = array.chunks
-
- results = []
- for arr in chunks:
- # using _from_sequence to ensure None is converted to NA
- str_arr = StringArray._from_sequence(np.array(arr))
- results.append(str_arr)
-
- if results:
- return StringArray._concat_same_type(results)
- else:
- return StringArray(np.array([], dtype="object"))
-
-
-class BaseStringArray(ExtensionArray):
- """
- Mixin class for StringArray, ArrowStringArray.
- """
-
- @doc(ExtensionArray.tolist)
- def tolist(self):
- if self.ndim > 1:
- return [x.tolist() for x in self]
- return list(self.to_numpy())
-
-
-class StringArray(BaseStringArray, PandasArray):
- """
- Extension array for string data.
-
- .. warning::
-
- StringArray is considered experimental. The implementation and
- parts of the API may change without warning.
-
- Parameters
- ----------
- values : array-like
- The array of data.
-
- .. warning::
-
- Currently, this expects an object-dtype ndarray
- where the elements are Python strings
- or nan-likes (``None``, ``np.nan``, ``NA``).
- This may change without warning in the future. Use
- :meth:`pandas.array` with ``dtype="string"`` for a stable way of
- creating a `StringArray` from any sequence.
-
- .. versionchanged:: 1.5.0
-
- StringArray now accepts array-likes containing
- nan-likes (``None``, ``np.nan``) for the ``values`` parameter
- in addition to strings and :attr:`pandas.NA`.
-
- copy : bool, default False
- Whether to copy the array of data.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- See Also
- --------
- :func:`pandas.array`
- The recommended function for creating a StringArray.
- Series.str
- The string methods are available on Series backed by
- a StringArray.
-
- Notes
- -----
- StringArray returns a BooleanArray for comparison methods.
-
- Examples
- --------
- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
- <StringArray>
- ['This is', 'some text', <NA>, 'data.']
- Length: 4, dtype: string
-
- Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
- will convert the values to strings.
-
- >>> pd.array(['1', 1], dtype="object")
- <PandasArray>
- ['1', 1]
- Length: 2, dtype: object
- >>> pd.array(['1', 1], dtype="string")
- <StringArray>
- ['1', '1']
- Length: 2, dtype: string
-
- However, instantiating StringArrays directly with non-strings will raise an error.
-
- For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
-
- >>> pd.array(["a", None, "c"], dtype="string") == "a"
- <BooleanArray>
- [True, <NA>, False]
- Length: 3, dtype: boolean
- """
-
- # undo the PandasArray hack
- _typ = "extension"
-
- def __init__(self, values, copy: bool = False) -> None:
- values = extract_array(values)
-
- super().__init__(values, copy=copy)
- if not isinstance(values, type(self)):
- self._validate()
- NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
-
- def _validate(self):
- """Validate that we only store NA or strings."""
- if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
- raise ValueError("StringArray requires a sequence of strings or pandas.NA")
- if self._ndarray.dtype != "object":
- raise ValueError(
- "StringArray requires a sequence of strings or pandas.NA. Got "
- f"'{self._ndarray.dtype}' dtype instead."
- )
- # Check to see if we need to convert NA values to pd.NA
- if self._ndarray.ndim > 2:
- # Ravel if ndims > 2 b/c no cythonized version available
- lib.convert_nans_to_NA(self._ndarray.ravel("K"))
- else:
- lib.convert_nans_to_NA(self._ndarray)
-
- @classmethod
- def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
- if dtype and not (isinstance(dtype, str) and dtype == "string"):
- dtype = pandas_dtype(dtype)
- assert isinstance(dtype, StringDtype) and dtype.storage == "python"
-
- from pandas.core.arrays.masked import BaseMaskedArray
-
- if isinstance(scalars, BaseMaskedArray):
- # avoid costly conversion to object dtype
- na_values = scalars._mask
- result = scalars._data
- result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
- result[na_values] = libmissing.NA
-
- else:
- if hasattr(scalars, "type"):
- # pyarrow array
- scalars = np.array(scalars)
- # convert non-na-likes to str, and nan-likes to StringDtype().na_value
- result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
-
- # Manually creating the new array avoids the validation step in __init__,
- # so it is faster. TODO: refactor to avoid the need for validation?
- new_string_array = cls.__new__(cls)
- NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))
-
- return new_string_array
-
- @classmethod
- def _from_sequence_of_strings(
- cls, strings, *, dtype: Dtype | None = None, copy: bool = False
- ):
- return cls._from_sequence(strings, dtype=dtype, copy=copy)
-
- @classmethod
- def _empty(cls, shape, dtype) -> StringArray:
- values = np.empty(shape, dtype=object)
- values[:] = libmissing.NA
- return cls(values).astype(dtype, copy=False)
-
- def __arrow_array__(self, type=None):
- """
- Convert myself into a pyarrow Array.
- """
- import pyarrow as pa
-
- if type is None:
- type = pa.string()
-
- values = self._ndarray.copy()
- values[self.isna()] = None
- return pa.array(values, type=type, from_pandas=True)
-
- def _values_for_factorize(self):
- arr = self._ndarray.copy()
- mask = self.isna()
- arr[mask] = None
- return arr, None
-
- def __setitem__(self, key, value):
- value = extract_array(value, extract_numpy=True)
- if isinstance(value, type(self)):
- # extract_array doesn't extract PandasArray subclasses
- value = value._ndarray
-
- key = check_array_indexer(self, key)
- scalar_key = lib.is_scalar(key)
- scalar_value = lib.is_scalar(value)
- if scalar_key and not scalar_value:
- raise ValueError("setting an array element with a sequence.")
-
- # validate new items
- if scalar_value:
- if isna(value):
- value = libmissing.NA
- elif not isinstance(value, str):
- raise TypeError(
- f"Cannot set non-string value '{value}' into a StringArray."
- )
- else:
- if not is_array_like(value):
- value = np.asarray(value, dtype=object)
- if len(value) and not lib.is_string_array(value, skipna=True):
- raise TypeError("Must provide strings.")
-
- mask = isna(value)
- if mask.any():
- value = value.copy()
- value[isna(value)] = libmissing.NA
-
- super().__setitem__(key, value)
-
- def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
- # the super() method NDArrayBackedExtensionArray._putmask uses
- # np.putmask which doesn't properly handle None/pd.NA, so using the
- # base class implementation that uses __setitem__
- ExtensionArray._putmask(self, mask, value)
-
- def astype(self, dtype, copy: bool = True):
- dtype = pandas_dtype(dtype)
-
- if is_dtype_equal(dtype, self.dtype):
- if copy:
- return self.copy()
- return self
-
- elif isinstance(dtype, IntegerDtype):
- arr = self._ndarray.copy()
- mask = self.isna()
- arr[mask] = 0
- values = arr.astype(dtype.numpy_dtype)
- return IntegerArray(values, mask, copy=False)
- elif isinstance(dtype, FloatingDtype):
- arr = self.copy()
- mask = self.isna()
- arr[mask] = "0"
- values = arr.astype(dtype.numpy_dtype)
- return FloatingArray(values, mask, copy=False)
- elif isinstance(dtype, ExtensionDtype):
- # Skip the PandasArray.astype method
- return ExtensionArray.astype(self, dtype, copy)
- elif np.issubdtype(dtype, np.floating):
- arr = self._ndarray.copy()
- mask = self.isna()
- arr[mask] = 0
- values = arr.astype(dtype)
- values[mask] = np.nan
- return values
-
- return super().astype(dtype, copy)
-
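- # Editor's note (illustrative sketch, not part of the original module): the
- # IntegerDtype branch above is what a conversion such as
- #
- # pd.array(["1", pd.NA, "3"], dtype="string").astype("Int64")
- #
- # is expected to take, returning a nullable IntegerArray and preserving NA.
-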
- def _reduce(
- self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
- ):
- if name in ["min", "max"]:
- return getattr(self, name)(skipna=skipna, axis=axis)
-
- raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
-
- def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
- nv.validate_min((), kwargs)
- result = masked_reductions.min(
- values=self.to_numpy(), mask=self.isna(), skipna=skipna
- )
- return self._wrap_reduction_result(axis, result)
-
- def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
- nv.validate_max((), kwargs)
- result = masked_reductions.max(
- values=self.to_numpy(), mask=self.isna(), skipna=skipna
- )
- return self._wrap_reduction_result(axis, result)
-
- def value_counts(self, dropna: bool = True) -> Series:
- from pandas import value_counts
-
- result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
- result.index = result.index.astype(self.dtype)
- return result
-
- def memory_usage(self, deep: bool = False) -> int:
- result = self._ndarray.nbytes
- if deep:
- return result + lib.memory_usage_of_objects(self._ndarray)
- return result
-
- @doc(ExtensionArray.searchsorted)
- def searchsorted(
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- if self._hasna:
- raise ValueError(
- "searchsorted requires array to be sorted, which is impossible "
- "with NAs present."
- )
- return super().searchsorted(value=value, side=side, sorter=sorter)
-
- def _cmp_method(self, other, op):
- from pandas.arrays import BooleanArray
-
- if isinstance(other, StringArray):
- other = other._ndarray
-
- mask = isna(self) | isna(other)
- valid = ~mask
-
- if not lib.is_scalar(other):
- if len(other) != len(self):
- # prevent improper broadcasting when other is 2D
- raise ValueError(
- f"Lengths of operands do not match: {len(self)} != {len(other)}"
- )
-
- other = np.asarray(other)
- other = other[valid]
-
- if op.__name__ in ops.ARITHMETIC_BINOPS:
- result = np.empty_like(self._ndarray, dtype="object")
- result[mask] = libmissing.NA
- result[valid] = op(self._ndarray[valid], other)
- return StringArray(result)
- else:
- # logical
- result = np.zeros(len(self._ndarray), dtype="bool")
- result[valid] = op(self._ndarray[valid], other)
- return BooleanArray(result, mask)
-
- _arith_method = _cmp_method
-
- # ------------------------------------------------------------------------
- # String methods interface
- # error: Incompatible types in assignment (expression has type "NAType",
- # base class "PandasArray" defined the type as "float")
- _str_na_value = libmissing.NA # type: ignore[assignment]
-
- def _str_map(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
- from pandas.arrays import BooleanArray
-
- if dtype is None:
- dtype = StringDtype(storage="python")
- if na_value is None:
- na_value = self.dtype.na_value
-
- mask = isna(self)
- arr = np.asarray(self)
-
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- constructor: type[IntegerArray] | type[BooleanArray]
- if is_integer_dtype(dtype):
- constructor = IntegerArray
- else:
- constructor = BooleanArray
-
- na_value_is_na = isna(na_value)
- if na_value_is_na:
- na_value = 1
- result = lib.map_infer_mask(
- arr,
- f,
- mask.view("uint8"),
- convert=False,
- na_value=na_value,
- # error: Argument 1 to "dtype" has incompatible type
- # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
- # "Type[object]"
- dtype=np.dtype(dtype), # type: ignore[arg-type]
- )
-
- if not na_value_is_na:
- mask[:] = False
-
- return constructor(result, mask)
-
- elif is_string_dtype(dtype) and not is_object_dtype(dtype):
- # i.e. StringDtype
- result = lib.map_infer_mask(
- arr, f, mask.view("uint8"), convert=False, na_value=na_value
- )
- return StringArray(result)
- else:
- # This is when the result type is object. We reach this when
- # -> We know the result type is truly object (e.g. .encode returns bytes
- # or .findall returns a list).
- # -> We don't know the result type. E.g. `.get` can return anything.
- return lib.map_infer_mask(arr, f, mask.view("uint8"))
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/string_arrow.py b/contrib/python/pandas/py3/pandas/core/arrays/string_arrow.py
deleted file mode 100644
index 1eea95d02b6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/string_arrow.py
+++ /dev/null
@@ -1,412 +0,0 @@
-from __future__ import annotations
-
-import re
-from typing import (
- Callable,
- Union,
-)
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- missing as libmissing,
-)
-from pandas._typing import (
- Dtype,
- Scalar,
- npt,
-)
-from pandas.compat import pa_version_under7p0
-
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_dtype_equal,
- is_integer_dtype,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core.arrays.arrow import ArrowExtensionArray
-from pandas.core.arrays.boolean import BooleanDtype
-from pandas.core.arrays.integer import Int64Dtype
-from pandas.core.arrays.numeric import NumericDtype
-from pandas.core.arrays.string_ import (
- BaseStringArray,
- StringDtype,
-)
-from pandas.core.strings.object_array import ObjectStringArrayMixin
-
-if not pa_version_under7p0:
- import pyarrow as pa
- import pyarrow.compute as pc
-
- from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
-
-ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
-
-
-def _chk_pyarrow_available() -> None:
- if pa_version_under7p0:
- msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray."
- raise ImportError(msg)
-
-
-# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
-# ObjectStringArrayMixin because we want to have the object-dtype based methods as
-# fallback for the ones that pyarrow doesn't yet support
-
-
-class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):
- """
- Extension array for string data in a ``pyarrow.ChunkedArray``.
-
- .. versionadded:: 1.2.0
-
- .. warning::
-
- ArrowStringArray is considered experimental. The implementation and
- parts of the API may change without warning.
-
- Parameters
- ----------
- values : pyarrow.Array or pyarrow.ChunkedArray
- The array of data.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
-
- See Also
- --------
- :func:`pandas.array`
- The recommended function for creating an ArrowStringArray.
- Series.str
- The string methods are available on Series backed by
- an ArrowStringArray.
-
- Notes
- -----
- ArrowStringArray returns a BooleanArray for comparison methods.
-
- Examples
- --------
- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
- <ArrowStringArray>
- ['This is', 'some text', <NA>, 'data.']
- Length: 4, dtype: string
- """
-
- # error: Incompatible types in assignment (expression has type "StringDtype",
- # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
- _dtype: StringDtype # type: ignore[assignment]
-
- def __init__(self, values) -> None:
- super().__init__(values)
- self._dtype = StringDtype(storage="pyarrow")
-
- if not pa.types.is_string(self._data.type):
- raise ValueError(
- "ArrowStringArray requires a PyArrow (chunked) array of string type"
- )
-
- def __len__(self) -> int:
- """
- Length of this array.
-
- Returns
- -------
- length : int
- """
- return len(self._data)
-
- @classmethod
- def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
- from pandas.core.arrays.masked import BaseMaskedArray
-
- _chk_pyarrow_available()
-
- if dtype and not (isinstance(dtype, str) and dtype == "string"):
- dtype = pandas_dtype(dtype)
- assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
-
- if isinstance(scalars, BaseMaskedArray):
- # avoid costly conversion to object dtype in ensure_string_array and
- # numerical issues with Float32Dtype
- na_values = scalars._mask
- result = scalars._data
- result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
- return cls(pa.array(result, mask=na_values, type=pa.string()))
- elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
- return cls(pc.cast(scalars, pa.string()))
-
- # convert non-na-likes to str
- result = lib.ensure_string_array(scalars, copy=copy)
- return cls(pa.array(result, type=pa.string(), from_pandas=True))
-
- @classmethod
- def _from_sequence_of_strings(
- cls, strings, dtype: Dtype | None = None, copy: bool = False
- ):
- return cls._from_sequence(strings, dtype=dtype, copy=copy)
-
- @property
- def dtype(self) -> StringDtype: # type: ignore[override]
- """
- An instance of 'string[pyarrow]'.
- """
- return self._dtype
-
- def insert(self, loc: int, item) -> ArrowStringArray:
- if not isinstance(item, str) and item is not libmissing.NA:
- raise TypeError("Scalar must be NA or str")
- return super().insert(loc, item)
-
- def _maybe_convert_setitem_value(self, value):
- """Maybe convert value to be pyarrow compatible."""
- if is_scalar(value):
- if isna(value):
- value = None
- elif not isinstance(value, str):
- raise TypeError("Scalar must be NA or str")
- else:
- value = np.array(value, dtype=object, copy=True)
- value[isna(value)] = None
- for v in value:
- if not (v is None or isinstance(v, str)):
- raise TypeError("Scalar must be NA or str")
- return super()._maybe_convert_setitem_value(value)
-
- def isin(self, values) -> npt.NDArray[np.bool_]:
- value_set = [
- pa_scalar.as_py()
- for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
- if pa_scalar.type in (pa.string(), pa.null())
- ]
-
- # short-circuit to return all False array.
- if not len(value_set):
- return np.zeros(len(self), dtype=bool)
-
- result = pc.is_in(self._data, value_set=pa.array(value_set))
- # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
- # to False
- return np.array(result, dtype=np.bool_)
-
- def astype(self, dtype, copy: bool = True):
- dtype = pandas_dtype(dtype)
-
- if is_dtype_equal(dtype, self.dtype):
- if copy:
- return self.copy()
- return self
- elif isinstance(dtype, NumericDtype):
- data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
- return dtype.__from_arrow__(data)
- elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
- return self.to_numpy(dtype=dtype, na_value=np.nan)
-
- return super().astype(dtype, copy=copy)
-
- # ------------------------------------------------------------------------
- # String methods interface
-
- # error: Incompatible types in assignment (expression has type "NAType",
- # base class "ObjectStringArrayMixin" defined the type as "float")
- _str_na_value = libmissing.NA # type: ignore[assignment]
-
- def _str_map(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
- # TODO: de-duplicate with the StringArray method. This method is more or less
- # copy-and-paste.
-
- from pandas.arrays import (
- BooleanArray,
- IntegerArray,
- )
-
- if dtype is None:
- dtype = self.dtype
- if na_value is None:
- na_value = self.dtype.na_value
-
- mask = isna(self)
- arr = np.asarray(self)
-
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- constructor: type[IntegerArray] | type[BooleanArray]
- if is_integer_dtype(dtype):
- constructor = IntegerArray
- else:
- constructor = BooleanArray
-
- na_value_is_na = isna(na_value)
- if na_value_is_na:
- na_value = 1
- result = lib.map_infer_mask(
- arr,
- f,
- mask.view("uint8"),
- convert=False,
- na_value=na_value,
- # error: Argument 1 to "dtype" has incompatible type
- # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
- # "Type[object]"
- dtype=np.dtype(dtype), # type: ignore[arg-type]
- )
-
- if not na_value_is_na:
- mask[:] = False
-
- return constructor(result, mask)
-
- elif is_string_dtype(dtype) and not is_object_dtype(dtype):
- # i.e. StringDtype
- result = lib.map_infer_mask(
- arr, f, mask.view("uint8"), convert=False, na_value=na_value
- )
- result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
- return type(self)(result)
- else:
- # This is when the result type is object. We reach this when
- # -> We know the result type is truly object (e.g. .encode returns bytes
- # or .findall returns a list).
- # -> We don't know the result type. E.g. `.get` can return anything.
- return lib.map_infer_mask(arr, f, mask.view("uint8"))
-
- def _str_contains(
- self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
- ):
- if flags:
- fallback_performancewarning()
- return super()._str_contains(pat, case, flags, na, regex)
-
- if regex:
- if case is False:
- fallback_performancewarning()
- return super()._str_contains(pat, case, flags, na, regex)
- else:
- result = pc.match_substring_regex(self._data, pat)
- else:
- if case:
- result = pc.match_substring(self._data, pat)
- else:
- result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
- result = BooleanDtype().__from_arrow__(result)
- if not isna(na):
- result[isna(result)] = bool(na)
- return result
-
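- # Editor's note (illustrative sketch, not part of the original module): these
- # _str_* methods back the usual Series.str accessor for the pyarrow storage, e.g.
- #
- # pd.Series(["apple", None], dtype="string[pyarrow]").str.contains("app")
- #
- # is expected to return a nullable boolean Series with <NA> for the missing value.
-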
- def _str_startswith(self, pat: str, na=None):
- pat = f"^{re.escape(pat)}"
- return self._str_contains(pat, na=na, regex=True)
-
- def _str_endswith(self, pat: str, na=None):
- pat = f"{re.escape(pat)}$"
- return self._str_contains(pat, na=na, regex=True)
-
- def _str_replace(
- self,
- pat: str | re.Pattern,
- repl: str | Callable,
- n: int = -1,
- case: bool = True,
- flags: int = 0,
- regex: bool = True,
- ):
- if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
- fallback_performancewarning()
- return super()._str_replace(pat, repl, n, case, flags, regex)
-
- func = pc.replace_substring_regex if regex else pc.replace_substring
- result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
- return type(self)(result)
-
- def _str_match(
- self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.startswith("^"):
- pat = f"^{pat}"
- return self._str_contains(pat, case, flags, na, regex=True)
-
- def _str_fullmatch(
- self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.endswith("$") or pat.endswith("//$"):
- pat = f"{pat}$"
- return self._str_match(pat, case, flags, na)
-
- def _str_isalnum(self):
- result = pc.utf8_is_alnum(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_isalpha(self):
- result = pc.utf8_is_alpha(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_isdecimal(self):
- result = pc.utf8_is_decimal(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_isdigit(self):
- result = pc.utf8_is_digit(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_islower(self):
- result = pc.utf8_is_lower(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_isnumeric(self):
- result = pc.utf8_is_numeric(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_isspace(self):
- result = pc.utf8_is_space(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_istitle(self):
- result = pc.utf8_is_title(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_isupper(self):
- result = pc.utf8_is_upper(self._data)
- return BooleanDtype().__from_arrow__(result)
-
- def _str_len(self):
- result = pc.utf8_length(self._data)
- return Int64Dtype().__from_arrow__(result)
-
- def _str_lower(self):
- return type(self)(pc.utf8_lower(self._data))
-
- def _str_upper(self):
- return type(self)(pc.utf8_upper(self._data))
-
- def _str_strip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_trim_whitespace(self._data)
- else:
- result = pc.utf8_trim(self._data, characters=to_strip)
- return type(self)(result)
-
- def _str_lstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_ltrim_whitespace(self._data)
- else:
- result = pc.utf8_ltrim(self._data, characters=to_strip)
- return type(self)(result)
-
- def _str_rstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_rtrim_whitespace(self._data)
- else:
- result = pc.utf8_rtrim(self._data, characters=to_strip)
- return type(self)(result)
diff --git a/contrib/python/pandas/py3/pandas/core/arrays/timedeltas.py b/contrib/python/pandas/py3/pandas/core/arrays/timedeltas.py
deleted file mode 100644
index 753b4940b4d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/arrays/timedeltas.py
+++ /dev/null
@@ -1,1062 +0,0 @@
-from __future__ import annotations
-
-from datetime import timedelta
-import operator
-from typing import (
- TYPE_CHECKING,
- Iterator,
- cast,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- tslibs,
-)
-from pandas._libs.tslibs import (
- BaseOffset,
- NaT,
- NaTType,
- Tick,
- Timedelta,
- astype_overflowsafe,
- get_supported_reso,
- get_unit_from_dtype,
- iNaT,
- is_supported_unit,
- npy_unit_to_abbrev,
- periods_per_second,
- to_offset,
-)
-from pandas._libs.tslibs.conversion import precision_from_unit
-from pandas._libs.tslibs.fields import (
- get_timedelta_days,
- get_timedelta_field,
-)
-from pandas._libs.tslibs.timedeltas import (
- array_to_timedelta64,
- floordiv_object_array,
- ints_to_pytimedelta,
- parse_timedelta_unit,
- truediv_object_array,
-)
-from pandas._typing import (
- AxisInt,
- DateTimeErrorChoices,
- DtypeObj,
- NpDtype,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.util._validators import validate_endpoints
-
-from pandas.core.dtypes.common import (
- TD64NS_DTYPE,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- is_timedelta64_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import nanops
-from pandas.core.array_algos import datetimelike_accumulations
-from pandas.core.arrays import datetimelike as dtl
-from pandas.core.arrays._ranges import generate_regular_range
-import pandas.core.common as com
-from pandas.core.ops import roperator
-from pandas.core.ops.common import unpack_zerodim_and_defer
-
-if TYPE_CHECKING:
- from pandas import DataFrame
-
-
-def _field_accessor(name: str, alias: str, docstring: str):
- def f(self) -> np.ndarray:
- values = self.asi8
- if alias == "days":
- result = get_timedelta_days(values, reso=self._creso)
- else:
- # error: Incompatible types in assignment (
- # expression has type "ndarray[Any, dtype[signedinteger[_32Bit]]]",
- # variable has type "ndarray[Any, dtype[signedinteger[_64Bit]]]
- result = get_timedelta_field(values, alias, reso=self._creso) # type: ignore[assignment] # noqa: E501
- if self._hasna:
- result = self._maybe_mask_results(
- result, fill_value=None, convert="float64"
- )
-
- return result
-
- f.__name__ = name
- f.__doc__ = f"\n{docstring}\n"
- return property(f)
-
-
-class TimedeltaArray(dtl.TimelikeOps):
- """
- Pandas ExtensionArray for timedelta data.
-
- .. warning::
-
- TimedeltaArray is currently experimental, and its API may change
- without warning. In particular, :attr:`TimedeltaArray.dtype` is
- expected to change to be an instance of an ``ExtensionDtype``
- subclass.
-
- Parameters
- ----------
- values : array-like
- The timedelta data.
-
- dtype : numpy.dtype
- Currently, only ``numpy.dtype("timedelta64[ns]")`` is accepted.
- freq : Offset, optional
- copy : bool, default False
- Whether to copy the underlying array of data.
-
- Attributes
- ----------
- None
-
- Methods
- -------
- None
- """
-
- _typ = "timedeltaarray"
- _internal_fill_value = np.timedelta64("NaT", "ns")
- _recognized_scalars = (timedelta, np.timedelta64, Tick)
- _is_recognized_dtype = is_timedelta64_dtype
- _infer_matches = ("timedelta", "timedelta64")
-
- @property
- def _scalar_type(self) -> type[Timedelta]:
- return Timedelta
-
- __array_priority__ = 1000
- # define my properties & methods for delegation
- _other_ops: list[str] = []
- _bool_ops: list[str] = []
- _object_ops: list[str] = ["freq"]
- _field_ops: list[str] = ["days", "seconds", "microseconds", "nanoseconds"]
- _datetimelike_ops: list[str] = _field_ops + _object_ops + _bool_ops + ["unit"]
- _datetimelike_methods: list[str] = [
- "to_pytimedelta",
- "total_seconds",
- "round",
- "floor",
- "ceil",
- "as_unit",
- ]
-
- # Note: ndim must be defined to ensure NaT.__richcmp__(TimedeltaArray)
- # operates pointwise.
-
- def _box_func(self, x: np.timedelta64) -> Timedelta | NaTType:
- y = x.view("i8")
- if y == NaT._value:
- return NaT
- return Timedelta._from_value_and_reso(y, reso=self._creso)
-
- @property
- # error: Return type "dtype" of "dtype" incompatible with return type
- # "ExtensionDtype" in supertype "ExtensionArray"
- def dtype(self) -> np.dtype: # type: ignore[override]
- """
- The dtype for the TimedeltaArray.
-
- .. warning::
-
- A future version of pandas will change dtype to be an instance
- of a :class:`pandas.api.extensions.ExtensionDtype` subclass,
- not a ``numpy.dtype``.
-
- Returns
- -------
- numpy.dtype
- """
- return self._ndarray.dtype
-
- # ----------------------------------------------------------------
- # Constructors
-
- _freq = None
- _default_dtype = TD64NS_DTYPE # used in TimeLikeOps.__init__
-
- @classmethod
- def _validate_dtype(cls, values, dtype):
- # used in TimeLikeOps.__init__
- _validate_td64_dtype(values.dtype)
- dtype = _validate_td64_dtype(dtype)
- return dtype
-
- # error: Signature of "_simple_new" incompatible with supertype "NDArrayBacked"
- @classmethod
- def _simple_new( # type: ignore[override]
- cls, values: np.ndarray, freq: BaseOffset | None = None, dtype=TD64NS_DTYPE
- ) -> TimedeltaArray:
- # Require td64 dtype, not unit-less, matching values.dtype
- assert isinstance(dtype, np.dtype) and dtype.kind == "m"
- assert not tslibs.is_unitless(dtype)
- assert isinstance(values, np.ndarray), type(values)
- assert dtype == values.dtype
-
- result = super()._simple_new(values=values, dtype=dtype)
- result._freq = freq
- return result
-
- @classmethod
- def _from_sequence(cls, data, *, dtype=None, copy: bool = False) -> TimedeltaArray:
- if dtype:
- dtype = _validate_td64_dtype(dtype)
-
- data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None)
- freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False)
-
- if dtype is not None:
- data = astype_overflowsafe(data, dtype=dtype, copy=False)
-
- return cls._simple_new(data, dtype=data.dtype, freq=freq)
-
- @classmethod
- def _from_sequence_not_strict(
- cls,
- data,
- *,
- dtype=None,
- copy: bool = False,
- freq=lib.no_default,
- unit=None,
- ) -> TimedeltaArray:
- """
- A non-strict version of _from_sequence, called from TimedeltaIndex.__new__.
- """
- if dtype:
- dtype = _validate_td64_dtype(dtype)
-
- assert unit not in ["Y", "y", "M"] # caller is responsible for checking
-
- explicit_none = freq is None
- freq = freq if freq is not lib.no_default else None
-
- freq, freq_infer = dtl.maybe_infer_freq(freq)
-
- data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit)
- freq, freq_infer = dtl.validate_inferred_freq(freq, inferred_freq, freq_infer)
- if explicit_none:
- freq = None
-
- if dtype is not None:
- data = astype_overflowsafe(data, dtype=dtype, copy=False)
-
- result = cls._simple_new(data, dtype=data.dtype, freq=freq)
-
- if inferred_freq is None and freq is not None:
- # this condition precludes `freq_infer`
- cls._validate_frequency(result, freq)
-
- elif freq_infer:
- # Set _freq directly to bypass duplicative _validate_frequency
- # check.
- result._freq = to_offset(result.inferred_freq)
-
- return result
-
- # Signature of "_generate_range" incompatible with supertype
- # "DatetimeLikeArrayMixin"
- @classmethod
- def _generate_range( # type: ignore[override]
- cls, start, end, periods, freq, closed=None, *, unit: str | None = None
- ):
- periods = dtl.validate_periods(periods)
- if freq is None and any(x is None for x in [periods, start, end]):
- raise ValueError("Must provide freq argument if no data is supplied")
-
- if com.count_not_none(start, end, periods, freq) != 3:
- raise ValueError(
- "Of the four parameters: start, end, periods, "
- "and freq, exactly three must be specified"
- )
-
- if start is not None:
- start = Timedelta(start).as_unit("ns")
-
- if end is not None:
- end = Timedelta(end).as_unit("ns")
-
- if unit is not None:
- if unit not in ["s", "ms", "us", "ns"]:
- raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'")
- else:
- unit = "ns"
-
- if start is not None and unit is not None:
- start = start.as_unit(unit, round_ok=False)
- if end is not None and unit is not None:
- end = end.as_unit(unit, round_ok=False)
-
- left_closed, right_closed = validate_endpoints(closed)
-
- if freq is not None:
- index = generate_regular_range(start, end, periods, freq, unit=unit)
- else:
- index = np.linspace(start._value, end._value, periods).astype("i8")
-
- if not left_closed:
- index = index[1:]
- if not right_closed:
- index = index[:-1]
-
- td64values = index.view(f"m8[{unit}]")
- return cls._simple_new(td64values, dtype=td64values.dtype, freq=freq)
-
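- # Editor's note (illustrative sketch, not part of the original module):
- # _generate_range is the machinery behind pd.timedelta_range, where exactly
- # three of start/end/periods/freq must be given, e.g.
- #
- # pd.timedelta_range(start="1 day", periods=3, freq="12H")
- # # TimedeltaIndex(['1 days 00:00:00', '1 days 12:00:00', '2 days 00:00:00'], ...)
-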
- # ----------------------------------------------------------------
- # DatetimeLike Interface
-
- def _unbox_scalar(self, value) -> np.timedelta64:
- if not isinstance(value, self._scalar_type) and value is not NaT:
- raise ValueError("'value' should be a Timedelta.")
- self._check_compatible_with(value)
- if value is NaT:
- return np.timedelta64(value._value, self.unit)
- else:
- return value.as_unit(self.unit).asm8
-
- def _scalar_from_string(self, value) -> Timedelta | NaTType:
- return Timedelta(value)
-
- def _check_compatible_with(self, other) -> None:
- # we don't have anything to validate.
- pass
-
- # ----------------------------------------------------------------
- # Array-Like / EA-Interface Methods
-
- def astype(self, dtype, copy: bool = True):
- # We handle
- # --> timedelta64[ns]
- # --> timedelta64
- # DatetimeLikeArrayMixin super call handles other cases
- dtype = pandas_dtype(dtype)
-
- if isinstance(dtype, np.dtype) and dtype.kind == "m":
- if dtype == self.dtype:
- if copy:
- return self.copy()
- return self
-
- if is_supported_unit(get_unit_from_dtype(dtype)):
- # unit conversion e.g. timedelta64[s]
- res_values = astype_overflowsafe(self._ndarray, dtype, copy=False)
- return type(self)._simple_new(
- res_values, dtype=res_values.dtype, freq=self.freq
- )
- else:
- raise ValueError(
- f"Cannot convert from {self.dtype} to {dtype}. "
- "Supported resolutions are 's', 'ms', 'us', 'ns'"
- )
-
- return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)
-
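- # Editor's note (illustrative sketch, not part of the original module): the m8
- # branch above performs unit conversion, e.g.
- #
- # pd.to_timedelta(["1 day"]).astype("timedelta64[s]") # second resolution, expected to work
- # pd.to_timedelta(["1 day"]).astype("timedelta64[D]") # expected ValueError: unsupported resolution
-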
- def __iter__(self) -> Iterator:
- if self.ndim > 1:
- for i in range(len(self)):
- yield self[i]
- else:
- # convert in chunks of 10k for efficiency
- data = self._ndarray
- length = len(self)
- chunksize = 10000
- chunks = (length // chunksize) + 1
- for i in range(chunks):
- start_i = i * chunksize
- end_i = min((i + 1) * chunksize, length)
- converted = ints_to_pytimedelta(data[start_i:end_i], box=True)
- yield from converted
-
- # ----------------------------------------------------------------
- # Reductions
-
- def sum(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- keepdims: bool = False,
- initial=None,
- skipna: bool = True,
- min_count: int = 0,
- ):
- nv.validate_sum(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims, "initial": initial}
- )
-
- result = nanops.nansum(
- self._ndarray, axis=axis, skipna=skipna, min_count=min_count
- )
- return self._wrap_reduction_result(axis, result)
-
- def std(
- self,
- *,
- axis: AxisInt | None = None,
- dtype: NpDtype | None = None,
- out=None,
- ddof: int = 1,
- keepdims: bool = False,
- skipna: bool = True,
- ):
- nv.validate_stat_ddof_func(
- (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
- )
-
- result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
- if axis is None or self.ndim == 1:
- return self._box_func(result)
- return self._from_backing_data(result)
-
- # ----------------------------------------------------------------
- # Accumulations
-
- def _accumulate(self, name: str, *, skipna: bool = True, **kwargs):
- if name == "cumsum":
- op = getattr(datetimelike_accumulations, name)
- result = op(self._ndarray.copy(), skipna=skipna, **kwargs)
-
- return type(self)._simple_new(result, freq=None, dtype=self.dtype)
- elif name == "cumprod":
- raise TypeError("cumprod not supported for Timedelta.")
-
- else:
- return super()._accumulate(name, skipna=skipna, **kwargs)
-
- # ----------------------------------------------------------------
- # Rendering Methods
-
- def _formatter(self, boxed: bool = False):
- from pandas.io.formats.format import get_format_timedelta64
-
- return get_format_timedelta64(self, box=True)
-
- def _format_native_types(
- self, *, na_rep: str | float = "NaT", date_format=None, **kwargs
- ) -> npt.NDArray[np.object_]:
- from pandas.io.formats.format import get_format_timedelta64
-
- # Relies on Timedelta._repr_base
- formatter = get_format_timedelta64(self._ndarray, na_rep)
- # equiv: np.array([formatter(x) for x in self._ndarray])
- # but independent of dimension
- return np.frompyfunc(formatter, 1, 1)(self._ndarray)
-
- # ----------------------------------------------------------------
- # Arithmetic Methods
-
- def _add_offset(self, other):
- assert not isinstance(other, Tick)
- raise TypeError(
- f"cannot add the type {type(other).__name__} to a {type(self).__name__}"
- )
-
- @unpack_zerodim_and_defer("__mul__")
- def __mul__(self, other) -> TimedeltaArray:
- if is_scalar(other):
- # numpy will accept float and int, raise TypeError for others
- result = self._ndarray * other
- freq = None
- if self.freq is not None and not isna(other):
- freq = self.freq * other
- return type(self)._simple_new(result, dtype=result.dtype, freq=freq)
-
- if not hasattr(other, "dtype"):
- # list, tuple
- other = np.array(other)
- if len(other) != len(self) and not is_timedelta64_dtype(other.dtype):
- # Exclude timedelta64 here so we correctly raise TypeError
- # for that instead of ValueError
- raise ValueError("Cannot multiply with unequal lengths")
-
- if is_object_dtype(other.dtype):
- # this multiplication will succeed only if all elements of other
- # are int or float scalars, so we will end up with
- # timedelta64[ns]-dtyped result
- arr = self._ndarray
- result = [arr[n] * other[n] for n in range(len(self))]
- result = np.array(result)
- return type(self)._simple_new(result, dtype=result.dtype)
-
- # numpy will accept float or int dtype, raise TypeError for others
- result = self._ndarray * other
- return type(self)._simple_new(result, dtype=result.dtype)
-
- __rmul__ = __mul__
-
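- # Editor's note (illustrative sketch, not part of the original module):
- # pd.to_timedelta(["1 day", "2 days"]) * 2 # -> TimedeltaIndex(['2 days', '4 days'], ...)
-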
- def _scalar_divlike_op(self, other, op):
- """
- Shared logic for __truediv__, __rtruediv__, __floordiv__, __rfloordiv__
- with scalar 'other'.
- """
- if isinstance(other, self._recognized_scalars):
- other = Timedelta(other)
- # mypy assumes that __new__ returns an instance of the class
- # github.com/python/mypy/issues/1020
- if cast("Timedelta | NaTType", other) is NaT:
- # specifically timedelta64-NaT
- result = np.empty(self.shape, dtype=np.float64)
- result.fill(np.nan)
- return result
-
- # otherwise, dispatch to Timedelta implementation
- return op(self._ndarray, other)
-
- else:
- # caller is responsible for checking lib.is_scalar(other)
- # assume other is numeric, otherwise numpy will raise
-
- if op in [roperator.rtruediv, roperator.rfloordiv]:
- raise TypeError(
- f"Cannot divide {type(other).__name__} by {type(self).__name__}"
- )
-
- result = op(self._ndarray, other)
- freq = None
-
- if self.freq is not None:
- # Note: freq gets division, not floor-division, even if op
- # is floordiv.
- freq = self.freq / other
-
- # TODO: 2022-12-24 test_ufunc_coercions, test_tdi_ops_attributes
- # get here for truediv, no tests for floordiv
-
- if op is operator.floordiv:
- if freq.nanos == 0 and self.freq.nanos != 0:
- # e.g. if self.freq is Nano(1) then dividing by 2
- # rounds down to zero
- # TODO: 2022-12-24 should implement the same check
- # for truediv case
- freq = None
-
- return type(self)._simple_new(result, dtype=result.dtype, freq=freq)
-
- def _cast_divlike_op(self, other):
- if not hasattr(other, "dtype"):
- # e.g. list, tuple
- other = np.array(other)
-
- if len(other) != len(self):
- raise ValueError("Cannot divide vectors with unequal lengths")
- return other
-
- def _vector_divlike_op(self, other, op) -> np.ndarray | TimedeltaArray:
- """
- Shared logic for __truediv__, __floordiv__, and their reversed versions
- with timedelta64-dtype ndarray other.
- """
- # Let numpy handle it
- result = op(self._ndarray, np.asarray(other))
-
- if (is_integer_dtype(other.dtype) or is_float_dtype(other.dtype)) and op in [
- operator.truediv,
- operator.floordiv,
- ]:
- return type(self)._simple_new(result, dtype=result.dtype)
-
- if op in [operator.floordiv, roperator.rfloordiv]:
- mask = self.isna() | isna(other)
- if mask.any():
- result = result.astype(np.float64)
- np.putmask(result, mask, np.nan)
-
- return result
-
- @unpack_zerodim_and_defer("__truediv__")
- def __truediv__(self, other):
- # timedelta / X is well-defined for timedelta-like or numeric X
- op = operator.truediv
- if is_scalar(other):
- return self._scalar_divlike_op(other, op)
-
- other = self._cast_divlike_op(other)
- if (
- is_timedelta64_dtype(other.dtype)
- or is_integer_dtype(other.dtype)
- or is_float_dtype(other.dtype)
- ):
- return self._vector_divlike_op(other, op)
-
- if is_object_dtype(other.dtype):
- other = np.asarray(other)
- if self.ndim > 1:
- res_cols = [left / right for left, right in zip(self, other)]
- res_cols2 = [x.reshape(1, -1) for x in res_cols]
- result = np.concatenate(res_cols2, axis=0)
- else:
- result = truediv_object_array(self._ndarray, other)
-
- return result
-
- else:
- return NotImplemented
-
- @unpack_zerodim_and_defer("__rtruediv__")
- def __rtruediv__(self, other):
- # X / timedelta is defined only for timedelta-like X
- op = roperator.rtruediv
- if is_scalar(other):
- return self._scalar_divlike_op(other, op)
-
- other = self._cast_divlike_op(other)
- if is_timedelta64_dtype(other.dtype):
- return self._vector_divlike_op(other, op)
-
- elif is_object_dtype(other.dtype):
- # Note: unlike in __truediv__, we do not _need_ to do type
- # inference on the result. It does not raise, a numeric array
- # is returned. GH#23829
- result_list = [other[n] / self[n] for n in range(len(self))]
- return np.array(result_list)
-
- else:
- return NotImplemented
-
- @unpack_zerodim_and_defer("__floordiv__")
- def __floordiv__(self, other):
- op = operator.floordiv
- if is_scalar(other):
- return self._scalar_divlike_op(other, op)
-
- other = self._cast_divlike_op(other)
- if (
- is_timedelta64_dtype(other.dtype)
- or is_integer_dtype(other.dtype)
- or is_float_dtype(other.dtype)
- ):
- return self._vector_divlike_op(other, op)
-
- elif is_object_dtype(other.dtype):
- other = np.asarray(other)
- if self.ndim > 1:
- res_cols = [left // right for left, right in zip(self, other)]
- res_cols2 = [x.reshape(1, -1) for x in res_cols]
- result = np.concatenate(res_cols2, axis=0)
- else:
- result = floordiv_object_array(self._ndarray, other)
-
- assert result.dtype == object
- return result
-
- else:
- return NotImplemented
-
- @unpack_zerodim_and_defer("__rfloordiv__")
- def __rfloordiv__(self, other):
- op = roperator.rfloordiv
- if is_scalar(other):
- return self._scalar_divlike_op(other, op)
-
- other = self._cast_divlike_op(other)
- if is_timedelta64_dtype(other.dtype):
- return self._vector_divlike_op(other, op)
-
- elif is_object_dtype(other.dtype):
- result_list = [other[n] // self[n] for n in range(len(self))]
- result = np.array(result_list)
- return result
-
- else:
- return NotImplemented
-
- @unpack_zerodim_and_defer("__mod__")
- def __mod__(self, other):
- # Note: This is a naive implementation, can likely be optimized
- if isinstance(other, self._recognized_scalars):
- other = Timedelta(other)
- return self - (self // other) * other
-
- @unpack_zerodim_and_defer("__rmod__")
- def __rmod__(self, other):
- # Note: This is a naive implementation, can likely be optimized
- if isinstance(other, self._recognized_scalars):
- other = Timedelta(other)
- return other - (other // self) * self
-
- @unpack_zerodim_and_defer("__divmod__")
- def __divmod__(self, other):
- # Note: This is a naive implementation, can likely be optimized
- if isinstance(other, self._recognized_scalars):
- other = Timedelta(other)
-
- res1 = self // other
- res2 = self - res1 * other
- return res1, res2
-
- @unpack_zerodim_and_defer("__rdivmod__")
- def __rdivmod__(self, other):
- # Note: This is a naive implementation, can likely be optimized
- if isinstance(other, self._recognized_scalars):
- other = Timedelta(other)
-
- res1 = other // self
- res2 = other - res1 * self
- return res1, res2
-
- def __neg__(self) -> TimedeltaArray:
- freq = None
- if self.freq is not None:
- freq = -self.freq
- return type(self)._simple_new(-self._ndarray, dtype=self.dtype, freq=freq)
-
- def __pos__(self) -> TimedeltaArray:
- return type(self)(self._ndarray.copy(), freq=self.freq)
-
- def __abs__(self) -> TimedeltaArray:
- # Note: freq is not preserved
- return type(self)(np.abs(self._ndarray))
-
- # ----------------------------------------------------------------
- # Conversion Methods - Vectorized analogues of Timedelta methods
-
- def total_seconds(self) -> npt.NDArray[np.float64]:
- """
- Return total duration of each element expressed in seconds.
-
- This method is available directly on TimedeltaArray, TimedeltaIndex
- and on Series containing timedelta values under the ``.dt`` namespace.
-
- Returns
- -------
- ndarray, Index or Series
- When the calling object is a TimedeltaArray, the return type
- is ndarray. When the calling object is a TimedeltaIndex,
- the return type is an Index with a float64 dtype. When the calling object
-        is a Series, the return type is a Series of dtype `float64` whose
- index is the same as the original.
-
- See Also
- --------
- datetime.timedelta.total_seconds : Standard library version
- of this method.
- TimedeltaIndex.components : Return a DataFrame with components of
- each Timedelta.
-
- Examples
- --------
- **Series**
-
- >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d'))
- >>> s
- 0 0 days
- 1 1 days
- 2 2 days
- 3 3 days
- 4 4 days
- dtype: timedelta64[ns]
-
- >>> s.dt.total_seconds()
- 0 0.0
- 1 86400.0
- 2 172800.0
- 3 259200.0
- 4 345600.0
- dtype: float64
-
- **TimedeltaIndex**
-
- >>> idx = pd.to_timedelta(np.arange(5), unit='d')
- >>> idx
- TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
- dtype='timedelta64[ns]', freq=None)
-
- >>> idx.total_seconds()
- Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64')
- """
- pps = periods_per_second(self._creso)
- return self._maybe_mask_results(self.asi8 / pps, fill_value=None)
-
- def to_pytimedelta(self) -> npt.NDArray[np.object_]:
- """
- Return an ndarray of datetime.timedelta objects.
-
- Returns
- -------
- numpy.ndarray
- """
- return ints_to_pytimedelta(self._ndarray)
-
- days = _field_accessor("days", "days", "Number of days for each element.")
- seconds = _field_accessor(
- "seconds",
- "seconds",
- "Number of seconds (>= 0 and less than 1 day) for each element.",
- )
- microseconds = _field_accessor(
- "microseconds",
- "microseconds",
- "Number of microseconds (>= 0 and less than 1 second) for each element.",
- )
- nanoseconds = _field_accessor(
- "nanoseconds",
- "nanoseconds",
- "Number of nanoseconds (>= 0 and less than 1 microsecond) for each element.",
- )
-
- @property
- def components(self) -> DataFrame:
- """
- Return a DataFrame of the individual resolution components of the Timedeltas.
-
-        The components (days, hours, minutes, seconds, milliseconds, microseconds,
- nanoseconds) are returned as columns in a DataFrame.
-
- Returns
- -------
- DataFrame
- """
- from pandas import DataFrame
-
- columns = [
- "days",
- "hours",
- "minutes",
- "seconds",
- "milliseconds",
- "microseconds",
- "nanoseconds",
- ]
- hasnans = self._hasna
- if hasnans:
-
- def f(x):
- if isna(x):
- return [np.nan] * len(columns)
- return x.components
-
- else:
-
- def f(x):
- return x.components
-
- result = DataFrame([f(x) for x in self], columns=columns)
- if not hasnans:
- result = result.astype("int64")
- return result
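As a quick illustration of the table returned by ``components`` (a sketch assuming a standard pandas install; the printed alignment may differ slightly):

    import pandas as pd

    ser = pd.Series(pd.to_timedelta(["1 days 02:03:04.000005", "1 hour"]))
    ser.dt.components
    #    days  hours  minutes  seconds  milliseconds  microseconds  nanoseconds
    # 0     1      2        3        4             0             5            0
    # 1     0      1        0        0             0             0            0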
-
-
-# ---------------------------------------------------------------------
-# Constructor Helpers
-
-
-def sequence_to_td64ns(
- data,
- copy: bool = False,
- unit=None,
- errors: DateTimeErrorChoices = "raise",
-) -> tuple[np.ndarray, Tick | None]:
- """
- Parameters
- ----------
- data : list-like
- copy : bool, default False
- unit : str, optional
- The timedelta unit to treat integers as multiples of. For numeric
- data this defaults to ``'ns'``.
-        Must not be specified if the data contains a str and ``errors=="raise"``.
- errors : {"raise", "coerce", "ignore"}, default "raise"
- How to handle elements that cannot be converted to timedelta64[ns].
- See ``pandas.to_timedelta`` for details.
-
- Returns
- -------
- converted : numpy.ndarray
- The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
- inferred_freq : Tick or None
- The inferred frequency of the sequence.
-
- Raises
- ------
- ValueError : Data cannot be converted to timedelta64[ns].
-
- Notes
- -----
-    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` here does not cause
- errors to be ignored; they are caught and subsequently ignored at a
- higher level.
- """
- assert unit not in ["Y", "y", "M"] # caller is responsible for checking
-
- inferred_freq = None
- if unit is not None:
- unit = parse_timedelta_unit(unit)
-
- data, copy = dtl.ensure_arraylike_for_datetimelike(
- data, copy, cls_name="TimedeltaArray"
- )
-
- if isinstance(data, TimedeltaArray):
- inferred_freq = data.freq
-
- # Convert whatever we have into timedelta64[ns] dtype
- if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
- # no need to make a copy, need to convert if string-dtyped
- data = _objects_to_td64ns(data, unit=unit, errors=errors)
- copy = False
-
- elif is_integer_dtype(data.dtype):
- # treat as multiples of the given unit
- data, copy_made = _ints_to_td64ns(data, unit=unit)
- copy = copy and not copy_made
-
- elif is_float_dtype(data.dtype):
- # cast the unit, multiply base/frac separately
- # to avoid precision issues from float -> int
- if is_extension_array_dtype(data):
- mask = data._mask
- data = data._data
- else:
- mask = np.isnan(data)
- # The next few lines are effectively a vectorized 'cast_from_unit'
- m, p = precision_from_unit(unit or "ns")
- with warnings.catch_warnings():
- # Suppress RuntimeWarning about All-NaN slice
- warnings.filterwarnings(
- "ignore", "invalid value encountered in cast", RuntimeWarning
- )
- base = data.astype(np.int64)
- frac = data - base
- if p:
- frac = np.round(frac, p)
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore", "invalid value encountered in cast", RuntimeWarning
- )
- data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
- data[mask] = iNaT
- copy = False
-
- elif is_timedelta64_dtype(data.dtype):
- data_unit = get_unit_from_dtype(data.dtype)
- if not is_supported_unit(data_unit):
- # cast to closest supported unit, i.e. s or ns
- new_reso = get_supported_reso(data_unit)
- new_unit = npy_unit_to_abbrev(new_reso)
- new_dtype = np.dtype(f"m8[{new_unit}]")
- data = astype_overflowsafe(data, dtype=new_dtype, copy=False)
- copy = False
-
- else:
- # This includes datetime64-dtype, see GH#23539, GH#29794
- raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")
-
- data = np.array(data, copy=copy)
-
- assert data.dtype.kind == "m"
- assert data.dtype != "m8" # i.e. not unit-less
-
- return data, inferred_freq
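The float branch above splits each value into a whole-unit part and a fractional part before scaling to nanoseconds, so large magnitudes are not pushed through a single lossy float multiplication. A simplified numpy-only sketch of that idea (``floats_to_td64ns_sketch`` is a made-up name; unit parsing and NaN masking are omitted):

    import numpy as np

    def floats_to_td64ns_sketch(values: np.ndarray, nanos_per_unit: int) -> np.ndarray:
        # e.g. nanos_per_unit = 86_400_000_000_000 for a unit of days
        base = values.astype(np.int64)        # whole units
        frac = values - base                  # fractional remainder
        ints = base * nanos_per_unit + (frac * nanos_per_unit).astype(np.int64)
        return ints.view("timedelta64[ns]")

    floats_to_td64ns_sketch(np.array([1.5, 2.25]), 86_400_000_000_000)
    # 1.5 days -> 129600000000000 ns, 2.25 days -> 194400000000000 ns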
-
-
-def _ints_to_td64ns(data, unit: str = "ns"):
- """
- Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating
- the integers as multiples of the given timedelta unit.
-
- Parameters
- ----------
- data : numpy.ndarray with integer-dtype
- unit : str, default "ns"
- The timedelta unit to treat integers as multiples of.
-
- Returns
- -------
- numpy.ndarray : timedelta64[ns] array converted from data
- bool : whether a copy was made
- """
- copy_made = False
- unit = unit if unit is not None else "ns"
-
- if data.dtype != np.int64:
- # converting to int64 makes a copy, so we can avoid
- # re-copying later
- data = data.astype(np.int64)
- copy_made = True
-
- if unit != "ns":
- dtype_str = f"timedelta64[{unit}]"
- data = data.view(dtype_str)
-
- data = astype_overflowsafe(data, dtype=TD64NS_DTYPE)
-
- # the astype conversion makes a copy, so we can avoid re-copying later
- copy_made = True
-
- else:
- data = data.view("timedelta64[ns]")
-
- return data, copy_made
-
-
-def _objects_to_td64ns(data, unit=None, errors: DateTimeErrorChoices = "raise"):
- """
-    Convert an object-dtyped or string-dtyped array into a
-    timedelta64[ns]-dtyped array.
-
- Parameters
- ----------
- data : ndarray or Index
- unit : str, default "ns"
- The timedelta unit to treat integers as multiples of.
- Must not be specified if the data contains a str.
- errors : {"raise", "coerce", "ignore"}, default "raise"
- How to handle elements that cannot be converted to timedelta64[ns].
- See ``pandas.to_timedelta`` for details.
-
- Returns
- -------
- numpy.ndarray : timedelta64[ns] array converted from data
-
- Raises
- ------
- ValueError : Data cannot be converted to timedelta64[ns].
-
- Notes
- -----
-    Unlike `pandas.to_timedelta`, setting `errors=ignore` here does not cause
- errors to be ignored; they are caught and subsequently ignored at a
- higher level.
- """
- # coerce Index to np.ndarray, converting string-dtype if necessary
- values = np.array(data, dtype=np.object_, copy=False)
-
- result = array_to_timedelta64(values, unit=unit, errors=errors)
- return result.view("timedelta64[ns]")
-
-
-def _validate_td64_dtype(dtype) -> DtypeObj:
- dtype = pandas_dtype(dtype)
- if is_dtype_equal(dtype, np.dtype("timedelta64")):
- # no precision disallowed GH#24806
- msg = (
- "Passing in 'timedelta' dtype with no precision is not allowed. "
- "Please pass in 'timedelta64[ns]' instead."
- )
- raise ValueError(msg)
-
- if (
- not isinstance(dtype, np.dtype)
- or dtype.kind != "m"
- or not is_supported_unit(get_unit_from_dtype(dtype))
- ):
- raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]")
-
- return dtype
diff --git a/contrib/python/pandas/py3/pandas/core/base.py b/contrib/python/pandas/py3/pandas/core/base.py
deleted file mode 100644
index a454bfce279..00000000000
--- a/contrib/python/pandas/py3/pandas/core/base.py
+++ /dev/null
@@ -1,1357 +0,0 @@
-"""
-Base and utility classes for pandas objects.
-"""
-
-from __future__ import annotations
-
-import textwrap
-from typing import (
- TYPE_CHECKING,
- Any,
- Generic,
- Hashable,
- Iterator,
- Literal,
- TypeVar,
- cast,
- final,
- overload,
-)
-
-import numpy as np
-
-from pandas._config import using_copy_on_write
-
-from pandas._libs import lib
-from pandas._typing import (
- Axis,
- AxisInt,
- DtypeObj,
- IndexLabel,
- NDFrameT,
- Shape,
- npt,
-)
-from pandas.compat import PYPY
-from pandas.compat.numpy import function as nv
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.cast import can_hold_element
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_dict_like,
- is_extension_array_dtype,
- is_object_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- isna,
- remove_na_arraylike,
-)
-
-from pandas.core import (
- algorithms,
- nanops,
- ops,
-)
-from pandas.core.accessor import DirNamesMixin
-from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays import ExtensionArray
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
-
-if TYPE_CHECKING:
- from pandas._typing import (
- DropKeep,
- NumpySorter,
- NumpyValueArrayLike,
- ScalarLike_co,
- )
-
- from pandas import (
- Categorical,
- Index,
- Series,
- )
-
-
-_shared_docs: dict[str, str] = {}
-_indexops_doc_kwargs = {
- "klass": "IndexOpsMixin",
- "inplace": "",
- "unique": "IndexOpsMixin",
- "duplicated": "IndexOpsMixin",
-}
-
-_T = TypeVar("_T", bound="IndexOpsMixin")
-
-
-class PandasObject(DirNamesMixin):
- """
- Baseclass for various pandas objects.
- """
-
- # results from calls to methods decorated with cache_readonly get added to _cache
- _cache: dict[str, Any]
-
- @property
- def _constructor(self):
- """
-        Class constructor (for this class it's just `__class__`).
- """
- return type(self)
-
- def __repr__(self) -> str:
- """
- Return a string representation for a particular object.
- """
- # Should be overwritten by base classes
- return object.__repr__(self)
-
- def _reset_cache(self, key: str | None = None) -> None:
- """
- Reset cached properties. If ``key`` is passed, only clears that key.
- """
- if not hasattr(self, "_cache"):
- return
- if key is None:
- self._cache.clear()
- else:
- self._cache.pop(key, None)
-
- def __sizeof__(self) -> int:
- """
-        Generate the total memory usage for an object that returns
-        either a value or a Series of values.
- """
- memory_usage = getattr(self, "memory_usage", None)
- if memory_usage:
- mem = memory_usage(deep=True) # pylint: disable=not-callable
- return int(mem if is_scalar(mem) else mem.sum())
-
- # no memory_usage attribute, so fall back to object's 'sizeof'
- return super().__sizeof__()
-
-
-class NoNewAttributesMixin:
- """
- Mixin which prevents adding new attributes.
-
- Prevents additional attributes via xxx.attribute = "something" after a
-    call to `self._freeze()`. Mainly used to prevent the user from using
- wrong attributes on an accessor (`Series.cat/.str/.dt`).
-
- If you really want to add a new attribute at a later time, you need to use
- `object.__setattr__(self, key, value)`.
- """
-
- def _freeze(self) -> None:
- """
- Prevents setting additional attributes.
- """
- object.__setattr__(self, "__frozen", True)
-
- # prevent adding any attribute via s.xxx.new_attribute = ...
- def __setattr__(self, key: str, value) -> None:
- # _cache is used by a decorator
- # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
- # because
- # 1.) getattr is false for attributes that raise errors
- # 2.) cls.__dict__ doesn't traverse into base classes
- if getattr(self, "__frozen", False) and not (
- key == "_cache"
- or key in type(self).__dict__
- or getattr(self, key, None) is not None
- ):
- raise AttributeError(f"You cannot add any new attribute '{key}'")
- object.__setattr__(self, key, value)
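A small sketch of the freeze behaviour (``Frozen`` is an illustrative class; the import targets pandas internals and assumes a regular pandas installation):

    from pandas.core.base import NoNewAttributesMixin

    class Frozen(NoNewAttributesMixin):
        def __init__(self) -> None:
            self.allowed = 1      # set before freezing
            self._freeze()

    obj = Frozen()
    obj.allowed = 2               # fine: the attribute already exists
    try:
        obj.brand_new = 3         # blocked by __setattr__
    except AttributeError as err:
        print(err)                # "You cannot add any new attribute 'brand_new'"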
-
-
-class SelectionMixin(Generic[NDFrameT]):
- """
- mixin implementing the selection & aggregation interface on a group-like
-    object; sub-classes need to define: obj, exclusions
- """
-
- obj: NDFrameT
- _selection: IndexLabel | None = None
- exclusions: frozenset[Hashable]
- _internal_names = ["_cache", "__setstate__"]
- _internal_names_set = set(_internal_names)
-
- @final
- @property
- def _selection_list(self):
- if not isinstance(
- self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray)
- ):
- return [self._selection]
- return self._selection
-
- @cache_readonly
- def _selected_obj(self):
- if self._selection is None or isinstance(self.obj, ABCSeries):
- return self.obj
- else:
- return self.obj[self._selection]
-
- @final
- @cache_readonly
- def ndim(self) -> int:
- return self._selected_obj.ndim
-
- @final
- @cache_readonly
- def _obj_with_exclusions(self):
- if isinstance(self.obj, ABCSeries):
- return self.obj
-
- if self._selection is not None:
- return self.obj._getitem_nocopy(self._selection_list)
-
- if len(self.exclusions) > 0:
-            # equivalent to `self.obj.drop(self.exclusions, axis=1)`
- # but this avoids consolidating and making a copy
- # TODO: following GH#45287 can we now use .drop directly without
- # making a copy?
- return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
- else:
- return self.obj
-
- def __getitem__(self, key):
- if self._selection is not None:
- raise IndexError(f"Column(s) {self._selection} already selected")
-
- if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)):
- if len(self.obj.columns.intersection(key)) != len(set(key)):
- bad_keys = list(set(key).difference(self.obj.columns))
- raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
- return self._gotitem(list(key), ndim=2)
-
- else:
- if key not in self.obj:
- raise KeyError(f"Column not found: {key}")
- ndim = self.obj[key].ndim
- return self._gotitem(key, ndim=ndim)
-
- def _gotitem(self, key, ndim: int, subset=None):
- """
- sub-classes to define
- return a sliced object
-
- Parameters
- ----------
- key : str / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- raise AbstractMethodError(self)
-
- def aggregate(self, func, *args, **kwargs):
- raise AbstractMethodError(self)
-
- agg = aggregate
-
-
-class IndexOpsMixin(OpsMixin):
- """
- Common ops mixin to support a unified interface / docs for Series / Index
- """
-
- # ndarray compatibility
- __array_priority__ = 1000
- _hidden_attrs: frozenset[str] = frozenset(
- ["tolist"] # tolist is not deprecated, just suppressed in the __dir__
- )
-
- @property
- def dtype(self) -> DtypeObj:
- # must be defined here as a property for mypy
- raise AbstractMethodError(self)
-
- @property
- def _values(self) -> ExtensionArray | np.ndarray:
- # must be defined here as a property for mypy
- raise AbstractMethodError(self)
-
- @final
- def transpose(self: _T, *args, **kwargs) -> _T:
- """
- Return the transpose, which is by definition self.
-
- Returns
- -------
- %(klass)s
- """
- nv.validate_transpose(args, kwargs)
- return self
-
- T = property(
- transpose,
- doc="""
- Return the transpose, which is by definition self.
- """,
- )
-
- @property
- def shape(self) -> Shape:
- """
- Return a tuple of the shape of the underlying data.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3])
- >>> s.shape
- (3,)
- """
- return self._values.shape
-
- def __len__(self) -> int:
- # We need this defined here for mypy
- raise AbstractMethodError(self)
-
- @property
- def ndim(self) -> Literal[1]:
- """
- Number of dimensions of the underlying data, by definition 1.
- """
- return 1
-
- @final
- def item(self):
- """
- Return the first element of the underlying data as a Python scalar.
-
- Returns
- -------
- scalar
- The first element of %(klass)s.
-
- Raises
- ------
- ValueError
- If the data is not length-1.
- """
- if len(self) == 1:
- return next(iter(self))
- raise ValueError("can only convert an array of size 1 to a Python scalar")
-
- @property
- def nbytes(self) -> int:
- """
- Return the number of bytes in the underlying data.
- """
- return self._values.nbytes
-
- @property
- def size(self) -> int:
- """
- Return the number of elements in the underlying data.
- """
- return len(self._values)
-
- @property
- def array(self) -> ExtensionArray:
- """
- The ExtensionArray of the data backing this Series or Index.
-
- Returns
- -------
- ExtensionArray
- An ExtensionArray of the values stored within. For extension
- types, this is the actual array. For NumPy native types, this
- is a thin (no copy) wrapper around :class:`numpy.ndarray`.
-
-            ``.array`` differs from ``.values``, which may require converting
-            the data to a different form.
-
- See Also
- --------
- Index.to_numpy : Similar method that always returns a NumPy array.
- Series.to_numpy : Similar method that always returns a NumPy array.
-
- Notes
- -----
- This table lays out the different array types for each extension
- dtype within pandas.
-
- ================== =============================
- dtype array type
- ================== =============================
- category Categorical
- period PeriodArray
- interval IntervalArray
- IntegerNA IntegerArray
- string StringArray
- boolean BooleanArray
- datetime64[ns, tz] DatetimeArray
- ================== =============================
-
- For any 3rd-party extension types, the array type will be an
- ExtensionArray.
-
- For all remaining dtypes ``.array`` will be a
- :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
- stored within. If you absolutely need a NumPy array (possibly with
- copying / coercing data), then use :meth:`Series.to_numpy` instead.
-
- Examples
- --------
-        For regular NumPy types like int and float, a PandasArray
- is returned.
-
- >>> pd.Series([1, 2, 3]).array
- <PandasArray>
- [1, 2, 3]
- Length: 3, dtype: int64
-
- For extension types, like Categorical, the actual ExtensionArray
- is returned
-
- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
- >>> ser.array
- ['a', 'b', 'a']
- Categories (2, object): ['a', 'b']
- """
- raise AbstractMethodError(self)
-
- @final
- def to_numpy(
- self,
- dtype: npt.DTypeLike | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- **kwargs,
- ) -> np.ndarray:
- """
- A NumPy ndarray representing the values in this Series or Index.
-
- Parameters
- ----------
- dtype : str or numpy.dtype, optional
- The dtype to pass to :meth:`numpy.asarray`.
- copy : bool, default False
- Whether to ensure that the returned value is not a view on
- another array. Note that ``copy=False`` does not *ensure* that
-            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
- a copy is made, even if not strictly necessary.
- na_value : Any, optional
- The value to use for missing values. The default value depends
- on `dtype` and the type of the array.
- **kwargs
- Additional keywords passed through to the ``to_numpy`` method
- of the underlying array (for extension arrays).
-
- Returns
- -------
- numpy.ndarray
-
- See Also
- --------
- Series.array : Get the actual data stored within.
- Index.array : Get the actual data stored within.
- DataFrame.to_numpy : Similar method for DataFrame.
-
- Notes
- -----
- The returned array will be the same up to equality (values equal
- in `self` will be equal in the returned array; likewise for values
- that are not equal). When `self` contains an ExtensionArray, the
- dtype may be different. For example, for a category-dtype Series,
- ``to_numpy()`` will return a NumPy array and the categorical dtype
- will be lost.
-
- For NumPy dtypes, this will be a reference to the actual data stored
- in this Series or Index (assuming ``copy=False``). Modifying the result
- in place will modify the data stored in the Series or Index (not that
- we recommend doing that).
-
- For extension types, ``to_numpy()`` *may* require copying data and
- coercing the result to a NumPy type (possibly object), which may be
- expensive. When you need a no-copy reference to the underlying data,
- :attr:`Series.array` should be used instead.
-
- This table lays out the different dtypes and default return types of
- ``to_numpy()`` for various dtypes within pandas.
-
- ================== ================================
- dtype array type
- ================== ================================
- category[T] ndarray[T] (same dtype as input)
- period ndarray[object] (Periods)
- interval ndarray[object] (Intervals)
- IntegerNA ndarray[object]
- datetime64[ns] datetime64[ns]
- datetime64[ns, tz] ndarray[object] (Timestamps)
- ================== ================================
-
- Examples
- --------
- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
- >>> ser.to_numpy()
- array(['a', 'b', 'a'], dtype=object)
-
- Specify the `dtype` to control how datetime-aware data is represented.
- Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
- objects, each with the correct ``tz``.
-
- >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
- >>> ser.to_numpy(dtype=object)
- array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
- Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
- dtype=object)
-
- Or ``dtype='datetime64[ns]'`` to return an ndarray of native
- datetime64 values. The values are converted to UTC and the timezone
- info is dropped.
-
- >>> ser.to_numpy(dtype="datetime64[ns]")
- ... # doctest: +ELLIPSIS
- array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
- dtype='datetime64[ns]')
- """
- if is_extension_array_dtype(self.dtype):
- return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
- elif kwargs:
- bad_keys = list(kwargs.keys())[0]
- raise TypeError(
- f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
- )
-
- if na_value is not lib.no_default:
- values = self._values
- if not can_hold_element(values, na_value):
- # if we can't hold the na_value asarray either makes a copy or we
- # error before modifying values. The asarray later on thus won't make
- # another copy
- values = np.asarray(values, dtype=dtype)
- else:
- values = values.copy()
-
- values[np.asanyarray(self.isna())] = na_value
- else:
- values = self._values
-
- result = np.asarray(values, dtype=dtype)
-
- if (copy and na_value is lib.no_default) or (
- not copy and using_copy_on_write()
- ):
- if np.shares_memory(self._values[:2], result[:2]):
- # Take slices to improve performance of check
- if using_copy_on_write() and not copy:
- result = result.view()
- result.flags.writeable = False
- else:
- result = result.copy()
-
- return result
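For example, ``na_value`` chooses the missing-value representation when converting a nullable extension array (a short sketch; the output is shown as a comment):

    import numpy as np
    import pandas as pd

    ser = pd.Series([1, 2, None], dtype="Int64")
    ser.to_numpy(dtype="float64", na_value=np.nan)
    # array([ 1.,  2., nan])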
-
- @final
- @property
- def empty(self) -> bool:
- return not self.size
-
- def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
- """
- Return the maximum value of the Index.
-
- Parameters
- ----------
- axis : int, optional
- For compatibility with NumPy. Only 0 or None are allowed.
- skipna : bool, default True
- Exclude NA/null values when showing the result.
- *args, **kwargs
- Additional arguments and keywords for compatibility with NumPy.
-
- Returns
- -------
- scalar
- Maximum value.
-
- See Also
- --------
- Index.min : Return the minimum value in an Index.
- Series.max : Return the maximum value in a Series.
- DataFrame.max : Return the maximum values in a DataFrame.
-
- Examples
- --------
- >>> idx = pd.Index([3, 2, 1])
- >>> idx.max()
- 3
-
- >>> idx = pd.Index(['c', 'b', 'a'])
- >>> idx.max()
- 'c'
-
- For a MultiIndex, the maximum is determined lexicographically.
-
- >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
- >>> idx.max()
- ('b', 2)
- """
- nv.validate_minmax_axis(axis)
- nv.validate_max(args, kwargs)
- return nanops.nanmax(self._values, skipna=skipna)
-
- @doc(op="max", oppose="min", value="largest")
- def argmax(
- self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
- ) -> int:
- """
- Return int position of the {value} value in the Series.
-
- If the {op}imum is achieved in multiple locations,
- the first row position is returned.
-
- Parameters
- ----------
- axis : {{None}}
- Unused. Parameter needed for compatibility with DataFrame.
- skipna : bool, default True
- Exclude NA/null values when showing the result.
- *args, **kwargs
- Additional arguments and keywords for compatibility with NumPy.
-
- Returns
- -------
- int
- Row position of the {op}imum value.
-
- See Also
- --------
- Series.arg{op} : Return position of the {op}imum value.
- Series.arg{oppose} : Return position of the {oppose}imum value.
- numpy.ndarray.arg{op} : Equivalent method for numpy arrays.
- Series.idxmax : Return index label of the maximum values.
- Series.idxmin : Return index label of the minimum values.
-
- Examples
- --------
- Consider dataset containing cereal calories
-
- >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0,
- ... 'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}})
- >>> s
- Corn Flakes 100.0
- Almond Delight 110.0
- Cinnamon Toast Crunch 120.0
- Cocoa Puff 110.0
- dtype: float64
-
- >>> s.argmax()
- 2
- >>> s.argmin()
- 0
-
- The maximum cereal calories is the third element and
- the minimum cereal calories is the first element,
-        since the Series is zero-indexed.
- """
- delegate = self._values
- nv.validate_minmax_axis(axis)
- skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)
-
- if isinstance(delegate, ExtensionArray):
- if not skipna and delegate.isna().any():
- return -1
- else:
- return delegate.argmax()
- else:
- # error: Incompatible return value type (got "Union[int, ndarray]", expected
- # "int")
- return nanops.nanargmax( # type: ignore[return-value]
- delegate, skipna=skipna
- )
-
- def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
- """
- Return the minimum value of the Index.
-
- Parameters
- ----------
- axis : {None}
- Dummy argument for consistency with Series.
- skipna : bool, default True
- Exclude NA/null values when showing the result.
- *args, **kwargs
- Additional arguments and keywords for compatibility with NumPy.
-
- Returns
- -------
- scalar
- Minimum value.
-
- See Also
- --------
- Index.max : Return the maximum value of the object.
- Series.min : Return the minimum value in a Series.
- DataFrame.min : Return the minimum values in a DataFrame.
-
- Examples
- --------
- >>> idx = pd.Index([3, 2, 1])
- >>> idx.min()
- 1
-
- >>> idx = pd.Index(['c', 'b', 'a'])
- >>> idx.min()
- 'a'
-
- For a MultiIndex, the minimum is determined lexicographically.
-
- >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
- >>> idx.min()
- ('a', 1)
- """
- nv.validate_minmax_axis(axis)
- nv.validate_min(args, kwargs)
- return nanops.nanmin(self._values, skipna=skipna)
-
- @doc(argmax, op="min", oppose="max", value="smallest")
- def argmin(
- self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
- ) -> int:
- delegate = self._values
- nv.validate_minmax_axis(axis)
- skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)
-
- if isinstance(delegate, ExtensionArray):
- if not skipna and delegate.isna().any():
- return -1
- else:
- return delegate.argmin()
- else:
- # error: Incompatible return value type (got "Union[int, ndarray]", expected
- # "int")
- return nanops.nanargmin( # type: ignore[return-value]
- delegate, skipna=skipna
- )
-
- def tolist(self):
- """
- Return a list of the values.
-
- These are each a scalar type, which is a Python scalar
- (for str, int, float) or a pandas scalar
- (for Timestamp/Timedelta/Interval/Period)
-
- Returns
- -------
- list
-
- See Also
- --------
- numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
- nested list of Python scalars.
- """
- return self._values.tolist()
-
- to_list = tolist
-
- def __iter__(self) -> Iterator:
- """
- Return an iterator of the values.
-
- These are each a scalar type, which is a Python scalar
- (for str, int, float) or a pandas scalar
- (for Timestamp/Timedelta/Interval/Period)
-
- Returns
- -------
- iterator
- """
- # We are explicitly making element iterators.
- if not isinstance(self._values, np.ndarray):
- # Check type instead of dtype to catch DTA/TDA
- return iter(self._values)
- else:
- return map(self._values.item, range(self._values.size))
-
- @cache_readonly
- def hasnans(self) -> bool:
- """
- Return True if there are any NaNs.
-
- Enables various performance speedups.
-
- Returns
- -------
- bool
- """
- # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]"
- # has no attribute "any"
- return bool(isna(self).any()) # type: ignore[union-attr]
-
- def isna(self) -> npt.NDArray[np.bool_]:
- return isna(self._values)
-
- def _reduce(
- self,
- op,
- name: str,
- *,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only=None,
- filter_type=None,
- **kwds,
- ):
- """
- Perform the reduction type operation if we can.
- """
- func = getattr(self, name, None)
- if func is None:
- raise TypeError(
- f"{type(self).__name__} cannot perform the operation {name}"
- )
- return func(skipna=skipna, **kwds)
-
- @final
- def _map_values(self, mapper, na_action=None):
- """
- An internal function that maps values using the input
- correspondence (which can be a dict, Series, or function).
-
- Parameters
- ----------
- mapper : function, dict, or Series
- The input correspondence object
- na_action : {None, 'ignore'}
- If 'ignore', propagate NA values, without passing them to the
- mapping function
-
- Returns
- -------
- Union[Index, MultiIndex], inferred
- The output of the mapping function applied to the index.
- If the function returns a tuple with more than one element
- a MultiIndex will be returned.
- """
- # we can fastpath dict/Series to an efficient map
- # as we know that we are not going to have to yield
- # python types
- if is_dict_like(mapper):
- if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
- # If a dictionary subclass defines a default value method,
- # convert mapper to a lookup function (GH #15999).
- dict_with_default = mapper
- mapper = lambda x: dict_with_default[
- np.nan if isinstance(x, float) and np.isnan(x) else x
- ]
- else:
- # Dictionary does not have a default. Thus it's safe to
-                # convert to a Series for efficiency.
- # we specify the keys here to handle the
- # possibility that they are tuples
-
- # The return value of mapping with an empty mapper is
- # expected to be pd.Series(np.nan, ...). As np.nan is
- # of dtype float64 the return value of this method should
- # be float64 as well
- from pandas import Series
-
- if len(mapper) == 0:
- mapper = Series(mapper, dtype=np.float64)
- else:
- mapper = Series(mapper)
-
- if isinstance(mapper, ABCSeries):
- if na_action not in (None, "ignore"):
- msg = (
- "na_action must either be 'ignore' or None, "
- f"{na_action} was passed"
- )
- raise ValueError(msg)
-
- if na_action == "ignore":
- mapper = mapper[mapper.index.notna()]
-
- # Since values were input this means we came from either
- # a dict or a series and mapper should be an index
- if is_categorical_dtype(self.dtype):
- # use the built in categorical series mapper which saves
- # time by mapping the categories instead of all values
-
- cat = cast("Categorical", self._values)
- return cat.map(mapper)
-
- values = self._values
-
- indexer = mapper.index.get_indexer(values)
- new_values = algorithms.take_nd(mapper._values, indexer)
-
- return new_values
-
- # we must convert to python types
- if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
- # GH#23179 some EAs do not have `map`
- values = self._values
- if na_action is not None:
- raise NotImplementedError
- map_f = lambda values, f: values.map(f)
- else:
- values = self._values.astype(object)
- if na_action == "ignore":
- map_f = lambda values, f: lib.map_infer_mask(
- values, f, isna(values).view(np.uint8)
- )
- elif na_action is None:
- map_f = lib.map_infer
- else:
- msg = (
- "na_action must either be 'ignore' or None, "
- f"{na_action} was passed"
- )
- raise ValueError(msg)
-
- # mapper is a function
- new_values = map_f(values, mapper)
-
- return new_values
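``_map_values`` is the engine behind the public ``Series.map`` / ``Index.map``. A brief sketch of the two paths it distinguishes, dict-like lookup versus applying a function element-wise, with ``na_action='ignore'`` skipping missing values:

    import numpy as np
    import pandas as pd

    ser = pd.Series(["cat", "dog", np.nan])

    ser.map({"cat": "kitten", "dog": "puppy"})   # dict-like: unmatched keys become NaN
    ser.map(len, na_action="ignore")             # function: NaN is propagated, not passed to len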
-
- @final
- def value_counts(
- self,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- bins=None,
- dropna: bool = True,
- ) -> Series:
- """
- Return a Series containing counts of unique values.
-
- The resulting object will be in descending order so that the
- first element is the most frequently-occurring element.
- Excludes NA values by default.
-
- Parameters
- ----------
- normalize : bool, default False
- If True then the object returned will contain the relative
- frequencies of the unique values.
- sort : bool, default True
- Sort by frequencies.
- ascending : bool, default False
- Sort in ascending order.
- bins : int, optional
- Rather than count values, group them into half-open bins,
- a convenience for ``pd.cut``, only works with numeric data.
- dropna : bool, default True
- Don't include counts of NaN.
-
- Returns
- -------
- Series
-
- See Also
- --------
- Series.count: Number of non-NA elements in a Series.
- DataFrame.count: Number of non-NA elements in a DataFrame.
- DataFrame.value_counts: Equivalent method on DataFrames.
-
- Examples
- --------
- >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
- >>> index.value_counts()
- 3.0 2
- 1.0 1
- 2.0 1
- 4.0 1
- Name: count, dtype: int64
-
- With `normalize` set to `True`, returns the relative frequency by
- dividing all values by the sum of values.
-
- >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
- >>> s.value_counts(normalize=True)
- 3.0 0.4
- 1.0 0.2
- 2.0 0.2
- 4.0 0.2
- Name: proportion, dtype: float64
-
- **bins**
-
- Bins can be useful for going from a continuous variable to a
- categorical variable; instead of counting unique
-        appearances of values, divide the index into the specified
- number of half-open bins.
-
- >>> s.value_counts(bins=3)
- (0.996, 2.0] 2
- (2.0, 3.0] 2
- (3.0, 4.0] 1
- Name: count, dtype: int64
-
- **dropna**
-
- With `dropna` set to `False` we can also see NaN index values.
-
- >>> s.value_counts(dropna=False)
- 3.0 2
- 1.0 1
- 2.0 1
- 4.0 1
- NaN 1
- Name: count, dtype: int64
- """
- return algorithms.value_counts(
- self,
- sort=sort,
- ascending=ascending,
- normalize=normalize,
- bins=bins,
- dropna=dropna,
- )
-
- def unique(self):
- values = self._values
- if not isinstance(values, np.ndarray):
- # i.e. ExtensionArray
- result = values.unique()
- else:
- result = algorithms.unique1d(values)
- return result
-
- @final
- def nunique(self, dropna: bool = True) -> int:
- """
- Return number of unique elements in the object.
-
- Excludes NA values by default.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include NaN in the count.
-
- Returns
- -------
- int
-
- See Also
- --------
- DataFrame.nunique: Method nunique for DataFrame.
- Series.count: Count non-NA/null observations in the Series.
-
- Examples
- --------
- >>> s = pd.Series([1, 3, 5, 7, 7])
- >>> s
- 0 1
- 1 3
- 2 5
- 3 7
- 4 7
- dtype: int64
-
- >>> s.nunique()
- 4
- """
- uniqs = self.unique()
- if dropna:
- uniqs = remove_na_arraylike(uniqs)
- return len(uniqs)
-
- @property
- def is_unique(self) -> bool:
- """
- Return boolean if values in the object are unique.
-
- Returns
- -------
- bool
- """
- return self.nunique(dropna=False) == len(self)
-
- @property
- def is_monotonic_increasing(self) -> bool:
- """
- Return boolean if values in the object are monotonically increasing.
-
- Returns
- -------
- bool
- """
- from pandas import Index
-
- return Index(self).is_monotonic_increasing
-
- @property
- def is_monotonic_decreasing(self) -> bool:
- """
- Return boolean if values in the object are monotonically decreasing.
-
- Returns
- -------
- bool
- """
- from pandas import Index
-
- return Index(self).is_monotonic_decreasing
-
- @final
- def _memory_usage(self, deep: bool = False) -> int:
- """
- Memory usage of the values.
-
- Parameters
- ----------
- deep : bool, default False
- Introspect the data deeply, interrogate
- `object` dtypes for system-level memory consumption.
-
- Returns
- -------
- bytes used
-
- See Also
- --------
- numpy.ndarray.nbytes : Total bytes consumed by the elements of the
- array.
-
- Notes
- -----
- Memory usage does not include memory consumed by elements that
- are not components of the array if deep=False or if used on PyPy
- """
- if hasattr(self.array, "memory_usage"):
- return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues]
- deep=deep,
- )
-
- v = self.array.nbytes
- if deep and is_object_dtype(self) and not PYPY:
- values = cast(np.ndarray, self._values)
- v += lib.memory_usage_of_objects(values)
- return v
-
- @doc(
- algorithms.factorize,
- values="",
- order="",
- size_hint="",
- sort=textwrap.dedent(
- """\
- sort : bool, default False
- Sort `uniques` and shuffle `codes` to maintain the
- relationship.
- """
- ),
- )
- def factorize(
- self,
- sort: bool = False,
- use_na_sentinel: bool = True,
- ) -> tuple[npt.NDArray[np.intp], Index]:
- codes, uniques = algorithms.factorize(
- self._values, sort=sort, use_na_sentinel=use_na_sentinel
- )
- if uniques.dtype == np.float16:
- uniques = uniques.astype(np.float32)
-
- if isinstance(self, ABCIndex):
- # preserve e.g. MultiIndex
- uniques = self._constructor(uniques)
- else:
- from pandas import Index
-
- uniques = Index(uniques)
- return codes, uniques
-
- _shared_docs[
- "searchsorted"
- ] = """
- Find indices where elements should be inserted to maintain order.
-
- Find the indices into a sorted {klass} `self` such that, if the
- corresponding elements in `value` were inserted before the indices,
- the order of `self` would be preserved.
-
- .. note::
-
- The {klass} *must* be monotonically sorted, otherwise
- wrong locations will likely be returned. Pandas does *not*
- check this for you.
-
- Parameters
- ----------
- value : array-like or scalar
- Values to insert into `self`.
- side : {{'left', 'right'}}, optional
- If 'left', the index of the first suitable location found is given.
- If 'right', return the last such index. If there is no suitable
- index, return either 0 or N (where N is the length of `self`).
- sorter : 1-D array-like, optional
- Optional array of integer indices that sort `self` into ascending
- order. They are typically the result of ``np.argsort``.
-
- Returns
- -------
- int or array of int
- A scalar or array of insertion points with the
- same shape as `value`.
-
- See Also
- --------
- sort_values : Sort by the values along either axis.
- numpy.searchsorted : Similar method from NumPy.
-
- Notes
- -----
- Binary search is used to find the required insertion points.
-
- Examples
- --------
- >>> ser = pd.Series([1, 2, 3])
- >>> ser
- 0 1
- 1 2
- 2 3
- dtype: int64
-
- >>> ser.searchsorted(4)
- 3
-
- >>> ser.searchsorted([0, 4])
- array([0, 3])
-
- >>> ser.searchsorted([1, 3], side='left')
- array([0, 2])
-
- >>> ser.searchsorted([1, 3], side='right')
- array([1, 3])
-
- >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']))
- >>> ser
- 0 2000-03-11
- 1 2000-03-12
- 2 2000-03-13
- dtype: datetime64[ns]
-
- >>> ser.searchsorted('3/14/2000')
- 3
-
- >>> ser = pd.Categorical(
- ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
- ... )
- >>> ser
- ['apple', 'bread', 'bread', 'cheese', 'milk']
- Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']
-
- >>> ser.searchsorted('bread')
- 1
-
- >>> ser.searchsorted(['bread'], side='right')
- array([3])
-
- If the values are not monotonically sorted, wrong locations
- may be returned:
-
- >>> ser = pd.Series([2, 1, 3])
- >>> ser
- 0 2
- 1 1
- 2 3
- dtype: int64
-
- >>> ser.searchsorted(1) # doctest: +SKIP
- 0 # wrong result, correct would be 1
- """
-
- # This overload is needed so that the call to searchsorted in
- # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result
-
- @overload
- # The following ignore is also present in numpy/__init__.pyi
- # Possibly a mypy bug??
- # error: Overloaded function signatures 1 and 2 overlap with incompatible
- # return types [misc]
- def searchsorted( # type: ignore[misc]
- self,
- value: ScalarLike_co,
- side: Literal["left", "right"] = ...,
- sorter: NumpySorter = ...,
- ) -> np.intp:
- ...
-
- @overload
- def searchsorted(
- self,
- value: npt.ArrayLike | ExtensionArray,
- side: Literal["left", "right"] = ...,
- sorter: NumpySorter = ...,
- ) -> npt.NDArray[np.intp]:
- ...
-
- @doc(_shared_docs["searchsorted"], klass="Index")
- def searchsorted(
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- if isinstance(value, ABCDataFrame):
- msg = (
- "Value must be 1-D array-like or scalar, "
- f"{type(value).__name__} is not supported"
- )
- raise ValueError(msg)
-
- values = self._values
- if not isinstance(values, np.ndarray):
- # Going through EA.searchsorted directly improves performance GH#38083
- return values.searchsorted(value, side=side, sorter=sorter)
-
- return algorithms.searchsorted(
- values,
- value,
- side=side,
- sorter=sorter,
- )
-
- def drop_duplicates(self, *, keep: DropKeep = "first"):
- duplicated = self._duplicated(keep=keep)
- # error: Value of type "IndexOpsMixin" is not indexable
- return self[~duplicated] # type: ignore[index]
-
- @final
- def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
- return algorithms.duplicated(self._values, keep=keep)
-
- def _arith_method(self, other, op):
- res_name = ops.get_op_result_name(self, other)
-
- lvalues = self._values
- rvalues = extract_array(other, extract_numpy=True, extract_range=True)
- rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape)
- rvalues = ensure_wrapped_if_datetimelike(rvalues)
-
- with np.errstate(all="ignore"):
- result = ops.arithmetic_op(lvalues, rvalues, op)
-
- return self._construct_result(result, name=res_name)
-
- def _construct_result(self, result, name):
- """
- Construct an appropriately-wrapped result from the ArrayLike result
- of an arithmetic-like operation.
- """
- raise AbstractMethodError(self)
diff --git a/contrib/python/pandas/py3/pandas/core/common.py b/contrib/python/pandas/py3/pandas/core/common.py
deleted file mode 100644
index 073af11b719..00000000000
--- a/contrib/python/pandas/py3/pandas/core/common.py
+++ /dev/null
@@ -1,653 +0,0 @@
-"""
-Misc tools for implementing data structures
-
-Note: pandas.core.common is *not* part of the public API.
-"""
-from __future__ import annotations
-
-import builtins
-from collections import (
- abc,
- defaultdict,
-)
-import contextlib
-from functools import partial
-import inspect
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Collection,
- Generator,
- Hashable,
- Iterable,
- Sequence,
- cast,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- NpDtype,
- RandomState,
- T,
-)
-
-from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
-from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_extension_array_dtype,
- is_integer,
-)
-from pandas.core.dtypes.generic import (
- ABCExtensionArray,
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.inference import iterable_not_string
-from pandas.core.dtypes.missing import isna
-
-if TYPE_CHECKING:
- from pandas import Index
-
-
-def flatten(line):
- """
- Flatten an arbitrarily nested sequence.
-
- Parameters
- ----------
- line : sequence
-        The non-string sequence to flatten.
-
- Notes
- -----
-    This doesn't consider strings as sequences.
-
- Returns
- -------
- flattened : generator
- """
- for element in line:
- if iterable_not_string(element):
- yield from flatten(element)
- else:
- yield element
-
-
-def consensus_name_attr(objs):
- name = objs[0].name
- for obj in objs[1:]:
- try:
- if obj.name != name:
- name = None
- except ValueError:
- name = None
- return name
-
-
-def is_bool_indexer(key: Any) -> bool:
- """
- Check whether `key` is a valid boolean indexer.
-
- Parameters
- ----------
- key : Any
- Only list-likes may be considered boolean indexers.
- All other types are not considered a boolean indexer.
- For array-like input, boolean ndarrays or ExtensionArrays
- with ``_is_boolean`` set are considered boolean indexers.
-
- Returns
- -------
- bool
- Whether `key` is a valid boolean indexer.
-
- Raises
- ------
- ValueError
- When the array is an object-dtype ndarray or ExtensionArray
- and contains missing values.
-
- See Also
- --------
- check_array_indexer : Check that `key` is a valid array to index,
- and convert to an ndarray.
- """
- if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
- is_array_like(key) and is_extension_array_dtype(key.dtype)
- ):
- if key.dtype == np.object_:
- key_array = np.asarray(key)
-
- if not lib.is_bool_array(key_array):
- na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
- if lib.infer_dtype(key_array) == "boolean" and isna(key_array).any():
- # Don't raise on e.g. ["A", "B", np.nan], see
- # test_loc_getitem_list_of_labels_categoricalindex_with_na
- raise ValueError(na_msg)
- return False
- return True
- elif is_bool_dtype(key.dtype):
- return True
- elif isinstance(key, list):
- # check if np.array(key).dtype would be bool
- if len(key) > 0:
- if type(key) is not list:
- # GH#42461 cython will raise TypeError if we pass a subclass
- key = list(key)
- return lib.is_bool_list(key)
-
- return False
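A few concrete inputs for the checks above (a sketch that imports the private helper, so treat it as an illustration of the logic rather than public API):

    import numpy as np
    from pandas.core.common import is_bool_indexer

    print(is_bool_indexer(np.array([True, False])))             # True: boolean ndarray
    print(is_bool_indexer([True, False]))                       # True: plain list of bools
    print(is_bool_indexer(np.array(["a", "b"], dtype=object)))  # False: object dtype, not boolean
    try:
        is_bool_indexer(np.array([True, np.nan], dtype=object))
    except ValueError as err:
        print(err)   # boolean array containing NA cannot be used as a mask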
-
-
-def cast_scalar_indexer(val):
- """
- Disallow indexing with a float key, even if that key is a round number.
-
- Parameters
- ----------
- val : scalar
-
- Returns
- -------
- outval : scalar
- """
- # assumes lib.is_scalar(val)
- if lib.is_float(val) and val.is_integer():
- raise IndexError(
- # GH#34193
- "Indexing with a float is no longer supported. Manually convert "
- "to an integer key instead."
- )
- return val
-
-
-def not_none(*args):
- """
- Returns a generator consisting of the arguments that are not None.
- """
- return (arg for arg in args if arg is not None)
-
-
-def any_none(*args) -> bool:
- """
- Returns a boolean indicating if any argument is None.
- """
- return any(arg is None for arg in args)
-
-
-def all_none(*args) -> bool:
- """
- Returns a boolean indicating if all arguments are None.
- """
- return all(arg is None for arg in args)
-
-
-def any_not_none(*args) -> bool:
- """
- Returns a boolean indicating if any argument is not None.
- """
- return any(arg is not None for arg in args)
-
-
-def all_not_none(*args) -> bool:
- """
- Returns a boolean indicating if all arguments are not None.
- """
- return all(arg is not None for arg in args)
-
-
-def count_not_none(*args) -> int:
- """
- Returns the count of arguments that are not None.
- """
- return sum(x is not None for x in args)
-
-
-@overload
-def asarray_tuplesafe(
- values: ArrayLike | list | tuple | zip, dtype: NpDtype | None = ...
-) -> np.ndarray:
- # ExtensionArray can only be returned when values is an Index, all other iterables
- # will return np.ndarray. Unfortunately "all other" cannot be encoded in a type
- # signature, so instead we special-case some common types.
- ...
-
-
-@overload
-def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = ...) -> ArrayLike:
- ...
-
-
-def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLike:
- if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")):
- values = list(values)
- elif isinstance(values, ABCIndex):
- return values._values
-
- if isinstance(values, list) and dtype in [np.object_, object]:
- return construct_1d_object_array_from_listlike(values)
-
- try:
- with warnings.catch_warnings():
- # Can remove warning filter once NumPy 1.24 is min version
- warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
- result = np.asarray(values, dtype=dtype)
- except ValueError:
- # Using try/except since it's more performant than checking is_list_like
- # over each element
- # error: Argument 1 to "construct_1d_object_array_from_listlike"
- # has incompatible type "Iterable[Any]"; expected "Sized"
- return construct_1d_object_array_from_listlike(values) # type: ignore[arg-type]
-
- if issubclass(result.dtype.type, str):
- result = np.asarray(values, dtype=object)
-
- if result.ndim == 2:
- # Avoid building an array of arrays:
- values = [tuple(x) for x in values]
- result = construct_1d_object_array_from_listlike(values)
-
- return result
-
-
-def index_labels_to_array(
- labels: np.ndarray | Iterable, dtype: NpDtype | None = None
-) -> np.ndarray:
- """
- Transform label or iterable of labels to array, for use in Index.
-
- Parameters
- ----------
- dtype : dtype
- If specified, use as dtype of the resulting array, otherwise infer.
-
- Returns
- -------
- array
- """
- if isinstance(labels, (str, tuple)):
- labels = [labels]
-
- if not isinstance(labels, (list, np.ndarray)):
- try:
- labels = list(labels)
- except TypeError: # non-iterable
- labels = [labels]
-
- labels = asarray_tuplesafe(labels, dtype=dtype)
-
- return labels
-
-
-def maybe_make_list(obj):
- if obj is not None and not isinstance(obj, (tuple, list)):
- return [obj]
- return obj
-
-
-def maybe_iterable_to_list(obj: Iterable[T] | T) -> Collection[T] | T:
- """
- If obj is Iterable but not list-like, consume into list.
- """
- if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized):
- return list(obj)
- obj = cast(Collection, obj)
- return obj
-
-
-def is_null_slice(obj) -> bool:
- """
- We have a null slice.
- """
- return (
- isinstance(obj, slice)
- and obj.start is None
- and obj.stop is None
- and obj.step is None
- )
-
-
-def is_empty_slice(obj) -> bool:
- """
- We have an empty slice, e.g. no values are selected.
- """
- return (
- isinstance(obj, slice)
- and obj.start is not None
- and obj.stop is not None
- and obj.start == obj.stop
- )
-
-
-def is_true_slices(line) -> list[bool]:
- """
- Find non-trivial slices in "line": return a list of booleans with same length.
- """
- return [isinstance(k, slice) and not is_null_slice(k) for k in line]
-
-
-# TODO: used only once in indexing; belongs elsewhere?
-def is_full_slice(obj, line: int) -> bool:
- """
- We have a full length slice.
- """
- return (
- isinstance(obj, slice)
- and obj.start == 0
- and obj.stop == line
- and obj.step is None
- )
-
-
-def get_callable_name(obj):
- # typical case has name
- if hasattr(obj, "__name__"):
- return getattr(obj, "__name__")
- # some objects don't; could recurse
- if isinstance(obj, partial):
- return get_callable_name(obj.func)
- # fall back to class name
- if callable(obj):
- return type(obj).__name__
- # everything failed (probably because the argument
- # wasn't actually callable); we return None
- # instead of the empty string in this case to allow
- # distinguishing between no name and a name of ''
- return None
-
-
-def apply_if_callable(maybe_callable, obj, **kwargs):
- """
- Evaluate possibly callable input using obj and kwargs if it is callable,
- otherwise return as it is.
-
- Parameters
- ----------
- maybe_callable : possibly a callable
- obj : NDFrame
- **kwargs
- """
- if callable(maybe_callable):
- return maybe_callable(obj, **kwargs)
-
- return maybe_callable
-
-
-def standardize_mapping(into):
- """
- Helper function to standardize a supplied mapping.
-
- Parameters
- ----------
- into : instance or subclass of collections.abc.Mapping
- Must be a class, an initialized collections.defaultdict,
- or an instance of a collections.abc.Mapping subclass.
-
- Returns
- -------
- mapping : a collections.abc.Mapping subclass or other constructor
- a callable object that can accept an iterator to create
- the desired Mapping.
-
- See Also
- --------
- DataFrame.to_dict
- Series.to_dict
- """
- if not inspect.isclass(into):
- if isinstance(into, defaultdict):
- return partial(defaultdict, into.default_factory)
- into = type(into)
- if not issubclass(into, abc.Mapping):
- raise TypeError(f"unsupported type: {into}")
- if into == defaultdict:
- raise TypeError("to_dict() only accepts initialized defaultdicts")
- return into
-
-
-@overload
-def random_state(state: np.random.Generator) -> np.random.Generator:
- ...
-
-
-@overload
-def random_state(
- state: int | ArrayLike | np.random.BitGenerator | np.random.RandomState | None,
-) -> np.random.RandomState:
- ...
-
-
-def random_state(state: RandomState | None = None):
- """
- Helper function for processing random_state arguments.
-
- Parameters
- ----------
- state : int, array-like, BitGenerator, Generator, np.random.RandomState, None.
- If receives an int, array-like, or BitGenerator, passes to
- np.random.RandomState() as seed.
-        If receives an np.random.RandomState or Generator, just returns that unchanged.
- If receives `None`, returns np.random.
- If receives anything else, raises an informative ValueError.
-
- .. versionchanged:: 1.1.0
-
- array-like and BitGenerator object now passed to np.random.RandomState()
- as seed
-
- Default None.
-
- Returns
- -------
- np.random.RandomState or np.random.Generator. If state is None, returns np.random
-
- """
- if (
- is_integer(state)
- or is_array_like(state)
- or isinstance(state, np.random.BitGenerator)
- ):
- # error: Argument 1 to "RandomState" has incompatible type "Optional[Union[int,
- # Union[ExtensionArray, ndarray[Any, Any]], Generator, RandomState]]"; expected
- # "Union[None, Union[Union[_SupportsArray[dtype[Union[bool_, integer[Any]]]],
- # Sequence[_SupportsArray[dtype[Union[bool_, integer[Any]]]]],
- # Sequence[Sequence[_SupportsArray[dtype[Union[bool_, integer[Any]]]]]],
- # Sequence[Sequence[Sequence[_SupportsArray[dtype[Union[bool_,
- # integer[Any]]]]]]],
- # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[Union[bool_,
- # integer[Any]]]]]]]]], Union[bool, int, Sequence[Union[bool, int]],
- # Sequence[Sequence[Union[bool, int]]], Sequence[Sequence[Sequence[Union[bool,
- # int]]]], Sequence[Sequence[Sequence[Sequence[Union[bool, int]]]]]]],
- # BitGenerator]"
- return np.random.RandomState(state) # type: ignore[arg-type]
- elif isinstance(state, np.random.RandomState):
- return state
- elif isinstance(state, np.random.Generator):
- return state
- elif state is None:
- return np.random
- else:
- raise ValueError(
- "random_state must be an integer, array-like, a BitGenerator, Generator, "
- "a numpy RandomState, or None"
- )
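A sketch of the dispatch implemented above:

import numpy as np

random_state(42)                        # np.random.RandomState seeded with 42
random_state(np.random.RandomState(0))  # returned unchanged
random_state(np.random.default_rng(0))  # a Generator, returned unchanged
random_state(None)                      # the np.random module itself
random_state("seed")                    # ValueError: not an accepted type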
-
-
-def pipe(
- obj, func: Callable[..., T] | tuple[Callable[..., T], str], *args, **kwargs
-) -> T:
- """
-    Apply a function ``func`` to object ``obj``, either by passing ``obj`` as
-    the first argument to the function or, if ``func`` is a tuple, by
-    interpreting its first element as the function and passing ``obj`` to it
-    as the keyword argument named by the tuple's second element.
-
- Parameters
- ----------
- func : callable or tuple of (callable, str)
- Function to apply to this object or, alternatively, a
- ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
- string indicating the keyword of ``callable`` that expects the
- object.
- *args : iterable, optional
- Positional arguments passed into ``func``.
- **kwargs : dict, optional
- A dictionary of keyword arguments passed into ``func``.
-
- Returns
- -------
- object : the return type of ``func``.
- """
- if isinstance(func, tuple):
- func, target = func
- if target in kwargs:
- msg = f"{target} is both the pipe target and a keyword argument"
- raise ValueError(msg)
- kwargs[target] = obj
- return func(*args, **kwargs)
- else:
- return func(obj, *args, **kwargs)
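A sketch of both call forms; ``fee`` is an illustrative function, not a pandas API:

import pandas as pd

def fee(df, multiplier=1):
    return df * multiplier

ser = pd.Series([1, 2, 3])
pipe(ser, fee, multiplier=2)          # fee(ser, multiplier=2)
pipe(ser, (fee, "df"), multiplier=2)  # fee(multiplier=2, df=ser)
pipe(ser, (fee, "df"), df=ser)        # ValueError: "df" is both the pipe target and a keyword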
-
-
-def get_rename_function(mapper):
- """
-    Return a function that maps names/labels, depending on whether ``mapper``
-    is a dict, a Series, or just a function.
- """
-
- def f(x):
- if x in mapper:
- return mapper[x]
- else:
- return x
-
- return f if isinstance(mapper, (abc.Mapping, ABCSeries)) else mapper
-
-
-def convert_to_list_like(
- values: Hashable | Iterable | AnyArrayLike,
-) -> list | AnyArrayLike:
- """
- Convert list-like or scalar input to list-like. List, numpy and pandas array-like
- inputs are returned unmodified whereas others are converted to list.
- """
- if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)):
- return values
- elif isinstance(values, abc.Iterable) and not isinstance(values, str):
- return list(values)
-
- return [values]
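A sketch of the conversions described above:

import numpy as np

convert_to_list_like(np.array([1, 2]))  # array-like, returned unchanged
convert_to_list_like(range(3))          # generic iterable -> [0, 1, 2]
convert_to_list_like("abc")             # strings are treated as scalars -> ["abc"]
convert_to_list_like(5)                 # scalar -> [5]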
-
-
-@contextlib.contextmanager
-def temp_setattr(obj, attr: str, value) -> Generator[None, None, None]:
- """Temporarily set attribute on an object.
-
- Args:
- obj: Object whose attribute will be modified.
- attr: Attribute to modify.
- value: Value to temporarily set attribute to.
-
- Yields:
- obj with modified attribute.
- """
- old_value = getattr(obj, attr)
- setattr(obj, attr, value)
- try:
- yield obj
- finally:
- setattr(obj, attr, old_value)
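A sketch of the context manager; the attribute chosen here is illustrative:

import pandas as pd

ser = pd.Series([1, 2], name="original")
with temp_setattr(ser, "name", "temporary") as s:
    assert s.name == "temporary"
assert ser.name == "original"  # restored on exit, even if the block raises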
-
-
-def require_length_match(data, index: Index) -> None:
- """
- Check the length of data matches the length of the index.
- """
- if len(data) != len(index):
- raise ValueError(
- "Length of values "
- f"({len(data)}) "
- "does not match length of index "
- f"({len(index)})"
- )
-
-
-# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0,
-# whereas np.min and np.max (which directly call obj.min and obj.max)
-# default to axis=None.
-_builtin_table = {
- builtins.sum: np.sum,
- builtins.max: np.maximum.reduce,
- builtins.min: np.minimum.reduce,
-}
-
-_cython_table = {
- builtins.sum: "sum",
- builtins.max: "max",
- builtins.min: "min",
- np.all: "all",
- np.any: "any",
- np.sum: "sum",
- np.nansum: "sum",
- np.mean: "mean",
- np.nanmean: "mean",
- np.prod: "prod",
- np.nanprod: "prod",
- np.std: "std",
- np.nanstd: "std",
- np.var: "var",
- np.nanvar: "var",
- np.median: "median",
- np.nanmedian: "median",
- np.max: "max",
- np.nanmax: "max",
- np.min: "min",
- np.nanmin: "min",
- np.cumprod: "cumprod",
- np.nancumprod: "cumprod",
- np.cumsum: "cumsum",
- np.nancumsum: "cumsum",
-}
-
-
-def get_cython_func(arg: Callable) -> str | None:
- """
-    If we have an internal (Cython) implementation registered for this
-    function, return its name, otherwise return None.
- """
- return _cython_table.get(arg)
-
-
-def is_builtin_func(arg):
- """
-    If ``arg`` is a Python builtin with a registered numpy equivalent,
-    return that equivalent, otherwise return ``arg`` unchanged.
- """
- return _builtin_table.get(arg, arg)
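A sketch of how the two lookup tables above are consulted when translating user-supplied aggregation functions:

import builtins
import numpy as np

is_builtin_func(builtins.sum)  # np.sum (builtin swapped for its numpy equivalent)
is_builtin_func(len)           # len, returned as-is (no mapping registered)
get_cython_func(np.nanmean)    # "mean"
get_cython_func(sorted)        # None (no internal implementation)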
-
-
-def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
- """
-    If a name is missing then replace it by level_n, where n is its position in the list
-
- .. versionadded:: 1.4.0
-
- Parameters
- ----------
- names : list-like
- list of column names or None values.
-
- Returns
- -------
- list
- list of column names with the None values replaced.
- """
- return [f"level_{i}" if name is None else name for i, name in enumerate(names)]
diff --git a/contrib/python/pandas/py3/pandas/core/computation/__init__.py b/contrib/python/pandas/py3/pandas/core/computation/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/computation/align.py b/contrib/python/pandas/py3/pandas/core/computation/align.py
deleted file mode 100644
index fff605eb7cf..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/align.py
+++ /dev/null
@@ -1,213 +0,0 @@
-"""
-Core eval alignment algorithms.
-"""
-from __future__ import annotations
-
-from functools import (
- partial,
- wraps,
-)
-from typing import (
- TYPE_CHECKING,
- Callable,
- Sequence,
-)
-import warnings
-
-import numpy as np
-
-from pandas.errors import PerformanceWarning
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-from pandas.core.base import PandasObject
-import pandas.core.common as com
-from pandas.core.computation.common import result_type_many
-
-if TYPE_CHECKING:
- from pandas._typing import F
-
- from pandas.core.generic import NDFrame
- from pandas.core.indexes.api import Index
-
-
-def _align_core_single_unary_op(
- term,
-) -> tuple[partial | type[NDFrame], dict[str, Index] | None]:
- typ: partial | type[NDFrame]
- axes: dict[str, Index] | None = None
-
- if isinstance(term.value, np.ndarray):
- typ = partial(np.asanyarray, dtype=term.value.dtype)
- else:
- typ = type(term.value)
- if hasattr(term.value, "axes"):
- axes = _zip_axes_from_type(typ, term.value.axes)
-
- return typ, axes
-
-
-def _zip_axes_from_type(
- typ: type[NDFrame], new_axes: Sequence[Index]
-) -> dict[str, Index]:
- return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)}
-
-
-def _any_pandas_objects(terms) -> bool:
- """
- Check a sequence of terms for instances of PandasObject.
- """
- return any(isinstance(term.value, PandasObject) for term in terms)
-
-
-def _filter_special_cases(f) -> Callable[[F], F]:
- @wraps(f)
- def wrapper(terms):
- # single unary operand
- if len(terms) == 1:
- return _align_core_single_unary_op(terms[0])
-
- term_values = (term.value for term in terms)
-
- # we don't have any pandas objects
- if not _any_pandas_objects(terms):
- return result_type_many(*term_values), None
-
- return f(terms)
-
- return wrapper
-
-
-@_filter_special_cases
-def _align_core(terms):
- term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
- term_dims = [terms[i].value.ndim for i in term_index]
-
- from pandas import Series
-
- ndims = Series(dict(zip(term_index, term_dims)))
-
-    # initial axes are the axes of the term with the most dimensions
- biggest = terms[ndims.idxmax()].value
- typ = biggest._constructor
- axes = biggest.axes
- naxes = len(axes)
- gt_than_one_axis = naxes > 1
-
- for value in (terms[i].value for i in term_index):
- is_series = isinstance(value, ABCSeries)
- is_series_and_gt_one_axis = is_series and gt_than_one_axis
-
- for axis, items in enumerate(value.axes):
- if is_series_and_gt_one_axis:
- ax, itm = naxes - 1, value.index
- else:
- ax, itm = axis, items
-
- if not axes[ax].is_(itm):
- axes[ax] = axes[ax].join(itm, how="outer")
-
- for i, ndim in ndims.items():
- for axis, items in zip(range(ndim), axes):
- ti = terms[i].value
-
- if hasattr(ti, "reindex"):
- transpose = isinstance(ti, ABCSeries) and naxes > 1
- reindexer = axes[naxes - 1] if transpose else items
-
- term_axis_size = len(ti.axes[axis])
- reindexer_size = len(reindexer)
-
- ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
- if ordm >= 1 and reindexer_size >= 10000:
- w = (
- f"Alignment difference on axis {axis} is larger "
- f"than an order of magnitude on term {repr(terms[i].name)}, "
- f"by more than {ordm:.4g}; performance may suffer."
- )
- warnings.warn(
- w, category=PerformanceWarning, stacklevel=find_stack_level()
- )
-
- f = partial(ti.reindex, reindexer, axis=axis, copy=False)
-
- terms[i].update(f())
-
- terms[i].update(terms[i].value.values)
-
- return typ, _zip_axes_from_type(typ, axes)
-
-
-def align_terms(terms):
- """
- Align a set of terms.
- """
- try:
- # flatten the parse tree (a nested list, really)
- terms = list(com.flatten(terms))
- except TypeError:
- # can't iterate so it must just be a constant or single variable
- if isinstance(terms.value, (ABCSeries, ABCDataFrame)):
- typ = type(terms.value)
- return typ, _zip_axes_from_type(typ, terms.value.axes)
- return np.result_type(terms.type), None
-
- # if all resolved variables are numeric scalars
- if all(term.is_scalar for term in terms):
- return result_type_many(*(term.value for term in terms)).type, None
-
- # perform the main alignment
- typ, axes = _align_core(terms)
- return typ, axes
-
-
-def reconstruct_object(typ, obj, axes, dtype):
- """
- Reconstruct an object given its type, raw value, and possibly empty
- (None) axes.
-
- Parameters
- ----------
- typ : object
- A type
- obj : object
- The value to use in the type constructor
- axes : dict
- The axes to use to construct the resulting pandas object
-
- Returns
- -------
- ret : typ
- An object of type ``typ`` with the value `obj` and possible axes
- `axes`.
- """
- try:
- typ = typ.type
- except AttributeError:
- pass
-
- res_t = np.result_type(obj.dtype, dtype)
-
- if not isinstance(typ, partial) and issubclass(typ, PandasObject):
- return typ(obj, dtype=res_t, **axes)
-
- # special case for pathological things like ~True/~False
- if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
- ret_value = res_t.type(obj)
- else:
- ret_value = typ(obj).astype(res_t)
- # The condition is to distinguish 0-dim array (returned in case of
- # scalar) and 1 element array
- # e.g. np.array(0) and np.array([0])
- if (
- len(obj.shape) == 1
- and len(obj) == 1
- and not isinstance(ret_value, np.ndarray)
- ):
- ret_value = np.array([ret_value]).astype(res_t)
-
- return ret_value
diff --git a/contrib/python/pandas/py3/pandas/core/computation/api.py b/contrib/python/pandas/py3/pandas/core/computation/api.py
deleted file mode 100644
index bd3be5b3f8c..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/api.py
+++ /dev/null
@@ -1,2 +0,0 @@
-__all__ = ["eval"]
-from pandas.core.computation.eval import eval
diff --git a/contrib/python/pandas/py3/pandas/core/computation/check.py b/contrib/python/pandas/py3/pandas/core/computation/check.py
deleted file mode 100644
index 3221b158241..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/check.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from __future__ import annotations
-
-from pandas.compat._optional import import_optional_dependency
-
-ne = import_optional_dependency("numexpr", errors="warn")
-NUMEXPR_INSTALLED = ne is not None
-if NUMEXPR_INSTALLED:
- NUMEXPR_VERSION = ne.__version__
-else:
- NUMEXPR_VERSION = None
-
-__all__ = ["NUMEXPR_INSTALLED", "NUMEXPR_VERSION"]
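A sketch of how callers gate on these constants instead of importing numexpr directly:

from pandas.core.computation.check import NUMEXPR_INSTALLED, NUMEXPR_VERSION

default_engine = "numexpr" if NUMEXPR_INSTALLED else "python"
print(default_engine, NUMEXPR_VERSION)  # NUMEXPR_VERSION is None when numexpr is absent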
diff --git a/contrib/python/pandas/py3/pandas/core/computation/common.py b/contrib/python/pandas/py3/pandas/core/computation/common.py
deleted file mode 100644
index 115191829f0..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/common.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from __future__ import annotations
-
-from functools import reduce
-
-import numpy as np
-
-from pandas._config import get_option
-
-
-def ensure_decoded(s) -> str:
- """
- If we have bytes, decode them to unicode.
- """
- if isinstance(s, (np.bytes_, bytes)):
- s = s.decode(get_option("display.encoding"))
- return s
-
-
-def result_type_many(*arrays_and_dtypes):
- """
- Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
- argument limit.
- """
- try:
- return np.result_type(*arrays_and_dtypes)
- except ValueError:
- # we have > NPY_MAXARGS terms in our expression
- return reduce(np.result_type, arrays_and_dtypes)
- except TypeError:
- from pandas.core.dtypes.cast import find_common_type
- from pandas.core.dtypes.common import is_extension_array_dtype
-
- arr_and_dtypes = list(arrays_and_dtypes)
- ea_dtypes, non_ea_dtypes = [], []
- for arr_or_dtype in arr_and_dtypes:
- if is_extension_array_dtype(arr_or_dtype):
- ea_dtypes.append(arr_or_dtype)
- else:
- non_ea_dtypes.append(arr_or_dtype)
-
- if non_ea_dtypes:
- try:
- np_dtype = np.result_type(*non_ea_dtypes)
- except ValueError:
- np_dtype = reduce(np.result_type, arrays_and_dtypes)
- return find_common_type(ea_dtypes + [np_dtype])
-
- return find_common_type(ea_dtypes)
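A sketch of the NPY_MAXARGS workaround; on numpy builds where the limit is 32, passing more arguments than that to np.result_type raises, while the reduce-based fallback handles any number:

import numpy as np

dtypes = [np.dtype("int8")] * 20 + [np.dtype("float32")] * 20  # 40 arguments
result_type_many(*dtypes)  # float32, computed pairwise via functools.reduce
np.result_type(*dtypes)    # ValueError on builds limited to 32 arguments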
diff --git a/contrib/python/pandas/py3/pandas/core/computation/engines.py b/contrib/python/pandas/py3/pandas/core/computation/engines.py
deleted file mode 100644
index a3a05a9d75c..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/engines.py
+++ /dev/null
@@ -1,143 +0,0 @@
-"""
-Engine classes for :func:`~pandas.eval`
-"""
-from __future__ import annotations
-
-import abc
-from typing import TYPE_CHECKING
-
-from pandas.errors import NumExprClobberingError
-
-from pandas.core.computation.align import (
- align_terms,
- reconstruct_object,
-)
-from pandas.core.computation.ops import (
- MATHOPS,
- REDUCTIONS,
-)
-
-from pandas.io.formats import printing
-
-if TYPE_CHECKING:
- from pandas.core.computation.expr import Expr
-
-_ne_builtins = frozenset(MATHOPS + REDUCTIONS)
-
-
-def _check_ne_builtin_clash(expr: Expr) -> None:
- """
- Attempt to prevent foot-shooting in a helpful way.
-
- Parameters
- ----------
- expr : Expr
- Terms can contain
- """
- names = expr.names
- overlap = names & _ne_builtins
-
- if overlap:
- s = ", ".join([repr(x) for x in overlap])
- raise NumExprClobberingError(
- f'Variables in expression "{expr}" overlap with builtins: ({s})'
- )
-
-
-class AbstractEngine(metaclass=abc.ABCMeta):
- """Object serving as a base class for all engines."""
-
- has_neg_frac = False
-
- def __init__(self, expr) -> None:
- self.expr = expr
- self.aligned_axes = None
- self.result_type = None
-
- def convert(self) -> str:
- """
- Convert an expression for evaluation.
-
- Defaults to return the expression as a string.
- """
- return printing.pprint_thing(self.expr)
-
- def evaluate(self) -> object:
- """
- Run the engine on the expression.
-
- This method performs alignment which is necessary no matter what engine
- is being used, thus its implementation is in the base class.
-
- Returns
- -------
- object
- The result of the passed expression.
- """
- if not self._is_aligned:
- self.result_type, self.aligned_axes = align_terms(self.expr.terms)
-
- # make sure no names in resolvers and locals/globals clash
- res = self._evaluate()
- return reconstruct_object(
- self.result_type, res, self.aligned_axes, self.expr.terms.return_type
- )
-
- @property
- def _is_aligned(self) -> bool:
- return self.aligned_axes is not None and self.result_type is not None
-
- @abc.abstractmethod
- def _evaluate(self):
- """
- Return an evaluated expression.
-
- Parameters
- ----------
- env : Scope
- The local and global environment in which to evaluate an
- expression.
-
- Notes
- -----
- Must be implemented by subclasses.
- """
-
-
-class NumExprEngine(AbstractEngine):
- """NumExpr engine class"""
-
- has_neg_frac = True
-
- def _evaluate(self):
- import numexpr as ne
-
- # convert the expression to a valid numexpr expression
- s = self.convert()
-
- env = self.expr.env
- scope = env.full_scope
- _check_ne_builtin_clash(self.expr)
- return ne.evaluate(s, local_dict=scope)
-
-
-class PythonEngine(AbstractEngine):
- """
- Evaluate an expression in Python space.
-
- Mostly for testing purposes.
- """
-
- has_neg_frac = False
-
- def evaluate(self):
- return self.expr()
-
- def _evaluate(self) -> None:
- pass
-
-
-ENGINES: dict[str, type[AbstractEngine]] = {
- "numexpr": NumExprEngine,
- "python": PythonEngine,
-}
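A sketch of how this registry is reached from the public API: the ``engine`` keyword of ``pd.eval`` selects one of these classes:

import pandas as pd

pd.eval("2 + 3 * 4", engine="python")   # evaluated by PythonEngine -> 14
pd.eval("2 + 3 * 4", engine="numexpr")  # NumExprEngine; requires numexpr to be installed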
diff --git a/contrib/python/pandas/py3/pandas/core/computation/eval.py b/contrib/python/pandas/py3/pandas/core/computation/eval.py
deleted file mode 100644
index d19730a321b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/eval.py
+++ /dev/null
@@ -1,413 +0,0 @@
-"""
-Top level ``eval`` module.
-"""
-from __future__ import annotations
-
-import tokenize
-from typing import TYPE_CHECKING
-import warnings
-
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import validate_bool_kwarg
-
-from pandas.core.dtypes.common import is_extension_array_dtype
-
-from pandas.core.computation.engines import ENGINES
-from pandas.core.computation.expr import (
- PARSERS,
- Expr,
-)
-from pandas.core.computation.parsing import tokenize_string
-from pandas.core.computation.scope import ensure_scope
-from pandas.core.generic import NDFrame
-
-from pandas.io.formats.printing import pprint_thing
-
-if TYPE_CHECKING:
- from pandas.core.computation.ops import BinOp
-
-
-def _check_engine(engine: str | None) -> str:
- """
- Make sure a valid engine is passed.
-
- Parameters
- ----------
- engine : str
- String to validate.
-
- Raises
- ------
- KeyError
- * If an invalid engine is passed.
- ImportError
- * If numexpr was requested but doesn't exist.
-
- Returns
- -------
- str
- Engine name.
- """
- from pandas.core.computation.check import NUMEXPR_INSTALLED
- from pandas.core.computation.expressions import USE_NUMEXPR
-
- if engine is None:
- engine = "numexpr" if USE_NUMEXPR else "python"
-
- if engine not in ENGINES:
- valid_engines = list(ENGINES.keys())
- raise KeyError(
- f"Invalid engine '{engine}' passed, valid engines are {valid_engines}"
- )
-
- # TODO: validate this in a more general way (thinking of future engines
- # that won't necessarily be import-able)
- # Could potentially be done on engine instantiation
- if engine == "numexpr" and not NUMEXPR_INSTALLED:
- raise ImportError(
- "'numexpr' is not installed or an unsupported version. Cannot use "
- "engine='numexpr' for query/eval if 'numexpr' is not installed"
- )
-
- return engine
-
-
-def _check_parser(parser: str):
- """
- Make sure a valid parser is passed.
-
- Parameters
- ----------
- parser : str
-
- Raises
- ------
- KeyError
- * If an invalid parser is passed
- """
- if parser not in PARSERS:
- raise KeyError(
- f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}"
- )
-
-
-def _check_resolvers(resolvers):
- if resolvers is not None:
- for resolver in resolvers:
- if not hasattr(resolver, "__getitem__"):
- name = type(resolver).__name__
- raise TypeError(
- f"Resolver of type '{name}' does not "
- "implement the __getitem__ method"
- )
-
-
-def _check_expression(expr):
- """
- Make sure an expression is not an empty string
-
- Parameters
- ----------
- expr : object
- An object that can be converted to a string
-
- Raises
- ------
- ValueError
- * If expr is an empty string
- """
- if not expr:
- raise ValueError("expr cannot be an empty string")
-
-
-def _convert_expression(expr) -> str:
- """
- Convert an object to an expression.
-
- This function converts an object to an expression (a unicode string) and
- checks to make sure it isn't empty after conversion. This is used to
- convert operators to their string representation for recursive calls to
- :func:`~pandas.eval`.
-
- Parameters
- ----------
- expr : object
- The object to be converted to a string.
-
- Returns
- -------
- str
- The string representation of an object.
-
- Raises
- ------
- ValueError
- * If the expression is empty.
- """
- s = pprint_thing(expr)
- _check_expression(s)
- return s
-
-
-def _check_for_locals(expr: str, stack_level: int, parser: str):
- at_top_of_stack = stack_level == 0
- not_pandas_parser = parser != "pandas"
-
- if not_pandas_parser:
- msg = "The '@' prefix is only supported by the pandas parser"
- elif at_top_of_stack:
- msg = (
- "The '@' prefix is not allowed in top-level eval calls.\n"
- "please refer to your variables by name without the '@' prefix."
- )
-
- if at_top_of_stack or not_pandas_parser:
- for toknum, tokval in tokenize_string(expr):
- if toknum == tokenize.OP and tokval == "@":
- raise SyntaxError(msg)
-
-
-def eval(
- expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users
- parser: str = "pandas",
- engine: str | None = None,
- local_dict=None,
- global_dict=None,
- resolvers=(),
- level: int = 0,
- target=None,
- inplace: bool = False,
-):
- """
- Evaluate a Python expression as a string using various backends.
-
- The following arithmetic operations are supported: ``+``, ``-``, ``*``,
- ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
- boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
- Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
- :keyword:`or`, and :keyword:`not` with the same semantics as the
- corresponding bitwise operators. :class:`~pandas.Series` and
- :class:`~pandas.DataFrame` objects are supported and behave as they would
- with plain ol' Python evaluation.
-
- Parameters
- ----------
- expr : str
- The expression to evaluate. This string cannot contain any Python
- `statements
- <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
- only Python `expressions
- <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
- parser : {'pandas', 'python'}, default 'pandas'
- The parser to use to construct the syntax tree from the expression. The
-        default of ``'pandas'`` parses code slightly differently from standard
- Python. Alternatively, you can parse an expression using the
- ``'python'`` parser to retain strict Python semantics. See the
- :ref:`enhancing performance <enhancingperf.eval>` documentation for
- more details.
- engine : {'python', 'numexpr'}, default 'numexpr'
-
- The engine used to evaluate the expression. Supported engines are
-
- - None : tries to use ``numexpr``, falls back to ``python``
- - ``'numexpr'`` : This default engine evaluates pandas objects using
- numexpr for large speed ups in complex expressions with large frames.
- - ``'python'`` : Performs operations as if you had ``eval``'d in top
- level python. This engine is generally not that useful.
-
- More backends may be available in the future.
- local_dict : dict or None, optional
- A dictionary of local variables, taken from locals() by default.
- global_dict : dict or None, optional
- A dictionary of global variables, taken from globals() by default.
- resolvers : list of dict-like or None, optional
- A list of objects implementing the ``__getitem__`` special method that
- you can use to inject an additional collection of namespaces to use for
- variable lookup. For example, this is used in the
- :meth:`~DataFrame.query` method to inject the
- ``DataFrame.index`` and ``DataFrame.columns``
- variables that refer to their respective :class:`~pandas.DataFrame`
- instance attributes.
- level : int, optional
- The number of prior stack frames to traverse and add to the current
- scope. Most users will **not** need to change this parameter.
- target : object, optional, default None
- This is the target object for assignment. It is used when there is
- variable assignment in the expression. If so, then `target` must
- support item assignment with string keys, and if a copy is being
- returned, it must also support `.copy()`.
- inplace : bool, default False
- If `target` is provided, and the expression mutates `target`, whether
- to modify `target` inplace. Otherwise, return a copy of `target` with
- the mutation.
-
- Returns
- -------
- ndarray, numeric scalar, DataFrame, Series, or None
- The completion value of evaluating the given code or None if ``inplace=True``.
-
- Raises
- ------
- ValueError
- There are many instances where such an error can be raised:
-
- - `target=None`, but the expression is multiline.
-        - The expression is multiline, but not all of its lines have item assignment.
- An example of such an arrangement is this:
-
- a = b + 1
- a + 2
-
- Here, there are expressions on different lines, making it multiline,
- but the last line has no variable assigned to the output of `a + 2`.
- - `inplace=True`, but the expression is missing item assignment.
- - Item assignment is provided, but the `target` does not support
- string item assignment.
- - Item assignment is provided and `inplace=False`, but the `target`
- does not support the `.copy()` method
-
- See Also
- --------
- DataFrame.query : Evaluates a boolean expression to query the columns
- of a frame.
- DataFrame.eval : Evaluate a string describing operations on
- DataFrame columns.
-
- Notes
- -----
- The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
- recursively cast to ``float64``.
-
- See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
- more details.
-
- Examples
- --------
- >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
- >>> df
- animal age
- 0 dog 10
- 1 pig 20
-
- We can add a new column using ``pd.eval``:
-
- >>> pd.eval("double_age = df.age * 2", target=df)
- animal age double_age
- 0 dog 10 20
- 1 pig 20 40
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- exprs: list[str | BinOp]
- if isinstance(expr, str):
- _check_expression(expr)
- exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
- else:
- # ops.BinOp; for internal compat, not intended to be passed by users
- exprs = [expr]
- multi_line = len(exprs) > 1
-
- if multi_line and target is None:
- raise ValueError(
- "multi-line expressions are only valid in the "
- "context of data, use DataFrame.eval"
- )
- engine = _check_engine(engine)
- _check_parser(parser)
- _check_resolvers(resolvers)
-
- ret = None
- first_expr = True
- target_modified = False
-
- for expr in exprs:
- expr = _convert_expression(expr)
- _check_for_locals(expr, level, parser)
-
- # get our (possibly passed-in) scope
- env = ensure_scope(
- level + 1,
- global_dict=global_dict,
- local_dict=local_dict,
- resolvers=resolvers,
- target=target,
- )
-
- parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
-
- if engine == "numexpr" and (
- is_extension_array_dtype(parsed_expr.terms.return_type)
- or getattr(parsed_expr.terms, "operand_types", None) is not None
- and any(
- is_extension_array_dtype(elem)
- for elem in parsed_expr.terms.operand_types
- )
- ):
- warnings.warn(
- "Engine has switched to 'python' because numexpr does not support "
- "extension array dtypes. Please set your engine to python manually.",
- RuntimeWarning,
- stacklevel=find_stack_level(),
- )
- engine = "python"
-
- # construct the engine and evaluate the parsed expression
- eng = ENGINES[engine]
- eng_inst = eng(parsed_expr)
- ret = eng_inst.evaluate()
-
- if parsed_expr.assigner is None:
- if multi_line:
- raise ValueError(
- "Multi-line expressions are only valid "
- "if all expressions contain an assignment"
- )
- if inplace:
- raise ValueError("Cannot operate inplace if there is no assignment")
-
- # assign if needed
- assigner = parsed_expr.assigner
- if env.target is not None and assigner is not None:
- target_modified = True
-
- # if returning a copy, copy only on the first assignment
- if not inplace and first_expr:
- try:
- target = env.target.copy()
- except AttributeError as err:
- raise ValueError("Cannot return a copy of the target") from err
- else:
- target = env.target
-
- # TypeError is most commonly raised (e.g. int, list), but you
- # get IndexError if you try to do this assignment on np.ndarray.
- # we will ignore numpy warnings here; e.g. if trying
- # to use a non-numeric indexer
- try:
- with warnings.catch_warnings(record=True):
- # TODO: Filter the warnings we actually care about here.
- if inplace and isinstance(target, NDFrame):
- target.loc[:, assigner] = ret
- else:
- target[assigner] = ret
- except (TypeError, IndexError) as err:
- raise ValueError("Cannot assign expression output to target") from err
-
- if not resolvers:
- resolvers = ({assigner: ret},)
- else:
-                # an existing resolver needs to be updated to handle the
-                # case of mutating an existing column in a copy
- for resolver in resolvers:
- if assigner in resolver:
- resolver[assigner] = ret
- break
- else:
- resolvers += ({assigner: ret},)
-
- ret = None
- first_expr = False
-
- # We want to exclude `inplace=None` as being False.
- if inplace is False:
- return target if target_modified else ret
diff --git a/contrib/python/pandas/py3/pandas/core/computation/expr.py b/contrib/python/pandas/py3/pandas/core/computation/expr.py
deleted file mode 100644
index 75e8b30d2e1..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/expr.py
+++ /dev/null
@@ -1,840 +0,0 @@
-"""
-:func:`~pandas.eval` parsers.
-"""
-from __future__ import annotations
-
-import ast
-from functools import (
- partial,
- reduce,
-)
-from keyword import iskeyword
-import tokenize
-from typing import (
- Callable,
- TypeVar,
-)
-
-import numpy as np
-
-from pandas.compat import PY39
-from pandas.errors import UndefinedVariableError
-
-import pandas.core.common as com
-from pandas.core.computation.ops import (
- ARITH_OPS_SYMS,
- BOOL_OPS_SYMS,
- CMP_OPS_SYMS,
- LOCAL_TAG,
- MATHOPS,
- REDUCTIONS,
- UNARY_OPS_SYMS,
- BinOp,
- Constant,
- Div,
- FuncNode,
- Op,
- Term,
- UnaryOp,
- is_term,
-)
-from pandas.core.computation.parsing import (
- clean_backtick_quoted_toks,
- tokenize_string,
-)
-from pandas.core.computation.scope import Scope
-
-from pandas.io.formats import printing
-
-
-def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]:
- """
- Rewrite the assignment operator for PyTables expressions that use ``=``
- as a substitute for ``==``.
-
- Parameters
- ----------
- tok : tuple of int, str
- ints correspond to the all caps constants in the tokenize module
-
- Returns
- -------
- tuple of int, str
-        Either the input token or the replacement values
- """
- toknum, tokval = tok
- return toknum, "==" if tokval == "=" else tokval
-
-
-def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]:
- """
- Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
- precedence is changed to boolean precedence.
-
- Parameters
- ----------
- tok : tuple of int, str
- ints correspond to the all caps constants in the tokenize module
-
- Returns
- -------
- tuple of int, str
-        Either the input token or the replacement values
- """
- toknum, tokval = tok
- if toknum == tokenize.OP:
- if tokval == "&":
- return tokenize.NAME, "and"
- elif tokval == "|":
- return tokenize.NAME, "or"
- return toknum, tokval
- return toknum, tokval
-
-
-def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]:
- """
- Replace local variables with a syntactically valid name.
-
- Parameters
- ----------
- tok : tuple of int, str
- ints correspond to the all caps constants in the tokenize module
-
- Returns
- -------
- tuple of int, str
-        Either the input token or the replacement values
-
- Notes
- -----
- This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
- ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
- is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
- """
- toknum, tokval = tok
- if toknum == tokenize.OP and tokval == "@":
- return tokenize.OP, LOCAL_TAG
- return toknum, tokval
-
-
-def _compose2(f, g):
- """
- Compose 2 callables.
- """
- return lambda *args, **kwargs: f(g(*args, **kwargs))
-
-
-def _compose(*funcs):
- """
- Compose 2 or more callables.
- """
- assert len(funcs) > 1, "At least 2 callables must be passed to compose"
- return reduce(_compose2, funcs)
-
-
-def _preparse(
- source: str,
- f=_compose(
- _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
- ),
-) -> str:
- """
- Compose a collection of tokenization functions.
-
- Parameters
- ----------
- source : str
- A Python source code string
- f : callable
- This takes a tuple of (toknum, tokval) as its argument and returns a
- tuple with the same structure but possibly different elements. Defaults
- to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
- ``_replace_locals``.
-
- Returns
- -------
- str
- Valid Python source code
-
- Notes
- -----
- The `f` parameter can be any callable that takes *and* returns input of the
- form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
- the ``tokenize`` module and ``tokval`` is a string.
- """
- assert callable(f), "f must be callable"
- return tokenize.untokenize(f(x) for x in tokenize_string(source))
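A sketch of the token-level rewrites composed above; the exact whitespace produced by tokenize.untokenize may differ, so the final output is approximate:

import tokenize

_rewrite_assign((tokenize.OP, "="))    # -> (OP, "==")    PyTables-style equality
_replace_booleans((tokenize.OP, "&"))  # -> (NAME, "and") boolean precedence
_replace_booleans((tokenize.OP, "|"))  # -> (NAME, "or")
_replace_locals((tokenize.OP, "@"))    # -> (OP, "__pd_eval_local_")

_preparse("a & @b")  # roughly "a and __pd_eval_local_b"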
-
-
-def _is_type(t):
- """
- Factory for a type checking function of type ``t`` or tuple of types.
- """
- return lambda x: isinstance(x.value, t)
-
-
-_is_list = _is_type(list)
-_is_str = _is_type(str)
-
-
-# partition all AST nodes
-_all_nodes = frozenset(
- node
- for node in (getattr(ast, name) for name in dir(ast))
- if isinstance(node, type) and issubclass(node, ast.AST)
-)
-
-
-def _filter_nodes(superclass, all_nodes=_all_nodes):
- """
- Filter out AST nodes that are subclasses of ``superclass``.
- """
- node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
- return frozenset(node_names)
-
-
-_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
-_mod_nodes = _filter_nodes(ast.mod)
-_stmt_nodes = _filter_nodes(ast.stmt)
-_expr_nodes = _filter_nodes(ast.expr)
-_expr_context_nodes = _filter_nodes(ast.expr_context)
-_boolop_nodes = _filter_nodes(ast.boolop)
-_operator_nodes = _filter_nodes(ast.operator)
-_unary_op_nodes = _filter_nodes(ast.unaryop)
-_cmp_op_nodes = _filter_nodes(ast.cmpop)
-_comprehension_nodes = _filter_nodes(ast.comprehension)
-_handler_nodes = _filter_nodes(ast.excepthandler)
-_arguments_nodes = _filter_nodes(ast.arguments)
-_keyword_nodes = _filter_nodes(ast.keyword)
-_alias_nodes = _filter_nodes(ast.alias)
-
-if not PY39:
- _slice_nodes = _filter_nodes(ast.slice)
-
-
-# nodes that we don't support directly but are needed for parsing
-_hacked_nodes = frozenset(["Assign", "Module", "Expr"])
-
-
-_unsupported_expr_nodes = frozenset(
- [
- "Yield",
- "GeneratorExp",
- "IfExp",
- "DictComp",
- "SetComp",
- "Repr",
- "Lambda",
- "Set",
- "AST",
- "Is",
- "IsNot",
- ]
-)
-
-# these nodes are low priority or won't ever be supported (e.g., AST)
-_unsupported_nodes = (
- _stmt_nodes
- | _mod_nodes
- | _handler_nodes
- | _arguments_nodes
- | _keyword_nodes
- | _alias_nodes
- | _expr_context_nodes
- | _unsupported_expr_nodes
-) - _hacked_nodes
-
-# we're rewriting assignment in some cases to be an equality comparison, and
-# we don't want `stmt` and friends in there, so keep only the classes whose
-# names are capitalized
-_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
-intersection = _unsupported_nodes & _base_supported_nodes
-_msg = f"cannot both support and not support {intersection}"
-assert not intersection, _msg
-
-
-def _node_not_implemented(node_name: str) -> Callable[..., None]:
- """
- Return a function that raises a NotImplementedError with a passed node name.
- """
-
- def f(self, *args, **kwargs):
- raise NotImplementedError(f"'{node_name}' nodes are not implemented")
-
- return f
-
-
-# should be bound by BaseExprVisitor but that creates a circular dependency:
-# _T is used in disallow, but disallow is used to define BaseExprVisitor
-# https://github.com/microsoft/pyright/issues/2315
-_T = TypeVar("_T")
-
-
-def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]:
- """
- Decorator to disallow certain nodes from parsing. Raises a
- NotImplementedError instead.
-
- Returns
- -------
- callable
- """
-
- def disallowed(cls: type[_T]) -> type[_T]:
- # error: "Type[_T]" has no attribute "unsupported_nodes"
- cls.unsupported_nodes = () # type: ignore[attr-defined]
- for node in nodes:
- new_method = _node_not_implemented(node)
- name = f"visit_{node}"
- # error: "Type[_T]" has no attribute "unsupported_nodes"
- cls.unsupported_nodes += (name,) # type: ignore[attr-defined]
- setattr(cls, name, new_method)
- return cls
-
- return disallowed
-
-
-def _op_maker(op_class, op_symbol):
- """
- Return a function to create an op class with its symbol already passed.
-
- Returns
- -------
- callable
- """
-
- def f(self, node, *args, **kwargs):
- """
- Return a partial function with an Op subclass with an operator already passed.
-
- Returns
- -------
- callable
- """
- return partial(op_class, op_symbol, *args, **kwargs)
-
- return f
-
-
-_op_classes = {"binary": BinOp, "unary": UnaryOp}
-
-
-def add_ops(op_classes):
- """
- Decorator to add default implementation of ops.
- """
-
- def f(cls):
- for op_attr_name, op_class in op_classes.items():
- ops = getattr(cls, f"{op_attr_name}_ops")
- ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map")
- for op in ops:
- op_node = ops_map[op]
- if op_node is not None:
- made_op = _op_maker(op_class, op)
- setattr(cls, f"visit_{op_node}", made_op)
- return cls
-
- return f
-
-
-@disallow(_unsupported_nodes)
-@add_ops(_op_classes)
-class BaseExprVisitor(ast.NodeVisitor):
- """
- Custom ast walker. Parsers of other engines should subclass this class
- if necessary.
-
- Parameters
- ----------
- env : Scope
- engine : str
- parser : str
- preparser : callable
- """
-
- const_type: type[Term] = Constant
- term_type = Term
-
- binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS
- binary_op_nodes = (
- "Gt",
- "Lt",
- "GtE",
- "LtE",
- "Eq",
- "NotEq",
- "In",
- "NotIn",
- "BitAnd",
- "BitOr",
- "And",
- "Or",
- "Add",
- "Sub",
- "Mult",
- None,
- "Pow",
- "FloorDiv",
- "Mod",
- )
- binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
-
- unary_ops = UNARY_OPS_SYMS
- unary_op_nodes = "UAdd", "USub", "Invert", "Not"
- unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
-
- rewrite_map = {
- ast.Eq: ast.In,
- ast.NotEq: ast.NotIn,
- ast.In: ast.In,
- ast.NotIn: ast.NotIn,
- }
-
- unsupported_nodes: tuple[str, ...]
-
- def __init__(self, env, engine, parser, preparser=_preparse) -> None:
- self.env = env
- self.engine = engine
- self.parser = parser
- self.preparser = preparser
- self.assigner = None
-
- def visit(self, node, **kwargs):
- if isinstance(node, str):
- clean = self.preparser(node)
- try:
- node = ast.fix_missing_locations(ast.parse(clean))
- except SyntaxError as e:
- if any(iskeyword(x) for x in clean.split()):
- e.msg = "Python keyword not valid identifier in numexpr query"
- raise e
-
- method = f"visit_{type(node).__name__}"
- visitor = getattr(self, method)
- return visitor(node, **kwargs)
-
- def visit_Module(self, node, **kwargs):
- if len(node.body) != 1:
- raise SyntaxError("only a single expression is allowed")
- expr = node.body[0]
- return self.visit(expr, **kwargs)
-
- def visit_Expr(self, node, **kwargs):
- return self.visit(node.value, **kwargs)
-
- def _rewrite_membership_op(self, node, left, right):
- # the kind of the operator (is actually an instance)
- op_instance = node.op
- op_type = type(op_instance)
-
- # must be two terms and the comparison operator must be ==/!=/in/not in
- if is_term(left) and is_term(right) and op_type in self.rewrite_map:
- left_list, right_list = map(_is_list, (left, right))
- left_str, right_str = map(_is_str, (left, right))
-
- # if there are any strings or lists in the expression
- if left_list or right_list or left_str or right_str:
- op_instance = self.rewrite_map[op_type]()
-
- # pop the string variable out of locals and replace it with a list
- # of one string, kind of a hack
- if right_str:
- name = self.env.add_tmp([right.value])
- right = self.term_type(name, self.env)
-
- if left_str:
- name = self.env.add_tmp([left.value])
- left = self.term_type(name, self.env)
-
- op = self.visit(op_instance)
- return op, op_instance, left, right
-
- def _maybe_transform_eq_ne(self, node, left=None, right=None):
- if left is None:
- left = self.visit(node.left, side="left")
- if right is None:
- right = self.visit(node.right, side="right")
- op, op_class, left, right = self._rewrite_membership_op(node, left, right)
- return op, op_class, left, right
-
- def _maybe_downcast_constants(self, left, right):
- f32 = np.dtype(np.float32)
- if (
- left.is_scalar
- and hasattr(left, "value")
- and not right.is_scalar
- and right.return_type == f32
- ):
- # right is a float32 array, left is a scalar
- name = self.env.add_tmp(np.float32(left.value))
- left = self.term_type(name, self.env)
- if (
- right.is_scalar
- and hasattr(right, "value")
- and not left.is_scalar
- and left.return_type == f32
- ):
- # left is a float32 array, right is a scalar
- name = self.env.add_tmp(np.float32(right.value))
- right = self.term_type(name, self.env)
-
- return left, right
-
- def _maybe_eval(self, binop, eval_in_python):
- # eval `in` and `not in` (for now) in "partial" python space
- # things that can be evaluated in "eval" space will be turned into
- # temporary variables. for example,
- # [1,2] in a + 2 * b
- # in that case a + 2 * b will be evaluated using numexpr, and the "in"
- # call will be evaluated using isin (in python space)
- return binop.evaluate(
- self.env, self.engine, self.parser, self.term_type, eval_in_python
- )
-
- def _maybe_evaluate_binop(
- self,
- op,
- op_class,
- lhs,
- rhs,
- eval_in_python=("in", "not in"),
- maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
- ):
- res = op(lhs, rhs)
-
- if res.has_invalid_return_type:
- raise TypeError(
- f"unsupported operand type(s) for {res.op}: "
- f"'{lhs.type}' and '{rhs.type}'"
- )
-
- if self.engine != "pytables" and (
- res.op in CMP_OPS_SYMS
- and getattr(lhs, "is_datetime", False)
- or getattr(rhs, "is_datetime", False)
- ):
- # all date ops must be done in python bc numexpr doesn't work
- # well with NaT
- return self._maybe_eval(res, self.binary_ops)
-
- if res.op in eval_in_python:
- # "in"/"not in" ops are always evaluated in python
- return self._maybe_eval(res, eval_in_python)
- elif self.engine != "pytables":
- if (
- getattr(lhs, "return_type", None) == object
- or getattr(rhs, "return_type", None) == object
- ):
- # evaluate "==" and "!=" in python if either of our operands
- # has an object return type
- return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
- return res
-
- def visit_BinOp(self, node, **kwargs):
- op, op_class, left, right = self._maybe_transform_eq_ne(node)
- left, right = self._maybe_downcast_constants(left, right)
- return self._maybe_evaluate_binop(op, op_class, left, right)
-
- def visit_Div(self, node, **kwargs):
- return lambda lhs, rhs: Div(lhs, rhs)
-
- def visit_UnaryOp(self, node, **kwargs):
- op = self.visit(node.op)
- operand = self.visit(node.operand)
- return op(operand)
-
- def visit_Name(self, node, **kwargs):
- return self.term_type(node.id, self.env, **kwargs)
-
- def visit_NameConstant(self, node, **kwargs) -> Term:
- return self.const_type(node.value, self.env)
-
- def visit_Num(self, node, **kwargs) -> Term:
- return self.const_type(node.n, self.env)
-
- def visit_Constant(self, node, **kwargs) -> Term:
- return self.const_type(node.n, self.env)
-
- def visit_Str(self, node, **kwargs):
- name = self.env.add_tmp(node.s)
- return self.term_type(name, self.env)
-
- def visit_List(self, node, **kwargs):
- name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
- return self.term_type(name, self.env)
-
- visit_Tuple = visit_List
-
- def visit_Index(self, node, **kwargs):
- """df.index[4]"""
- return self.visit(node.value)
-
- def visit_Subscript(self, node, **kwargs):
- from pandas import eval as pd_eval
-
- value = self.visit(node.value)
- slobj = self.visit(node.slice)
- result = pd_eval(
- slobj, local_dict=self.env, engine=self.engine, parser=self.parser
- )
- try:
- # a Term instance
- v = value.value[result]
- except AttributeError:
- # an Op instance
- lhs = pd_eval(
- value, local_dict=self.env, engine=self.engine, parser=self.parser
- )
- v = lhs[result]
- name = self.env.add_tmp(v)
- return self.term_type(name, env=self.env)
-
- def visit_Slice(self, node, **kwargs):
- """df.index[slice(4,6)]"""
- lower = node.lower
- if lower is not None:
- lower = self.visit(lower).value
- upper = node.upper
- if upper is not None:
- upper = self.visit(upper).value
- step = node.step
- if step is not None:
- step = self.visit(step).value
-
- return slice(lower, upper, step)
-
- def visit_Assign(self, node, **kwargs):
- """
- support a single assignment node, like
-
- c = a + b
-
- set the assigner at the top level, must be a Name node which
- might or might not exist in the resolvers
-
- """
- if len(node.targets) != 1:
- raise SyntaxError("can only assign a single expression")
- if not isinstance(node.targets[0], ast.Name):
- raise SyntaxError("left hand side of an assignment must be a single name")
- if self.env.target is None:
- raise ValueError("cannot assign without a target object")
-
- try:
- assigner = self.visit(node.targets[0], **kwargs)
- except UndefinedVariableError:
- assigner = node.targets[0].id
-
- self.assigner = getattr(assigner, "name", assigner)
- if self.assigner is None:
- raise SyntaxError(
- "left hand side of an assignment must be a single resolvable name"
- )
-
- return self.visit(node.value, **kwargs)
-
- def visit_Attribute(self, node, **kwargs):
- attr = node.attr
- value = node.value
-
- ctx = node.ctx
- if isinstance(ctx, ast.Load):
- # resolve the value
- resolved = self.visit(value).value
- try:
- v = getattr(resolved, attr)
- name = self.env.add_tmp(v)
- return self.term_type(name, self.env)
- except AttributeError:
- # something like datetime.datetime where scope is overridden
- if isinstance(value, ast.Name) and value.id == attr:
- return resolved
- raise
-
- raise ValueError(f"Invalid Attribute context {type(ctx).__name__}")
-
- def visit_Call(self, node, side=None, **kwargs):
- if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__":
- res = self.visit_Attribute(node.func)
- elif not isinstance(node.func, ast.Name):
- raise TypeError("Only named functions are supported")
- else:
- try:
- res = self.visit(node.func)
- except UndefinedVariableError:
- # Check if this is a supported function name
- try:
- res = FuncNode(node.func.id)
- except ValueError:
- # Raise original error
- raise
-
- if res is None:
- # error: "expr" has no attribute "id"
- raise ValueError(
- f"Invalid function call {node.func.id}" # type: ignore[attr-defined]
- )
- if hasattr(res, "value"):
- res = res.value
-
- if isinstance(res, FuncNode):
- new_args = [self.visit(arg) for arg in node.args]
-
- if node.keywords:
- raise TypeError(
- f'Function "{res.name}" does not support keyword arguments'
- )
-
- return res(*new_args)
-
- else:
- new_args = [self.visit(arg)(self.env) for arg in node.args]
-
- for key in node.keywords:
- if not isinstance(key, ast.keyword):
- # error: "expr" has no attribute "id"
- raise ValueError(
- "keyword error in function call " # type: ignore[attr-defined]
- f"'{node.func.id}'"
- )
-
- if key.arg:
- kwargs[key.arg] = self.visit(key.value)(self.env)
-
- name = self.env.add_tmp(res(*new_args, **kwargs))
- return self.term_type(name=name, env=self.env)
-
- def translate_In(self, op):
- return op
-
- def visit_Compare(self, node, **kwargs):
- ops = node.ops
- comps = node.comparators
-
- # base case: we have something like a CMP b
- if len(comps) == 1:
- op = self.translate_In(ops[0])
- binop = ast.BinOp(op=op, left=node.left, right=comps[0])
- return self.visit(binop)
-
- # recursive case: we have a chained comparison, a CMP b CMP c, etc.
- left = node.left
- values = []
- for op, comp in zip(ops, comps):
- new_node = self.visit(
- ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
- )
- left = comp
- values.append(new_node)
- return self.visit(ast.BoolOp(op=ast.And(), values=values))
-
- def _try_visit_binop(self, bop):
- if isinstance(bop, (Op, Term)):
- return bop
- return self.visit(bop)
-
- def visit_BoolOp(self, node, **kwargs):
- def visitor(x, y):
- lhs = self._try_visit_binop(x)
- rhs = self._try_visit_binop(y)
-
- op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
- return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
-
- operands = node.values
- return reduce(visitor, operands)
-
-
-_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
-_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS)
-
-
-@disallow(
- (_unsupported_nodes | _python_not_supported)
- - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
-)
-class PandasExprVisitor(BaseExprVisitor):
- def __init__(
- self,
- env,
- engine,
- parser,
- preparser=partial(
- _preparse,
- f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
- ),
- ) -> None:
- super().__init__(env, engine, parser, preparser)
-
-
-@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
-class PythonExprVisitor(BaseExprVisitor):
- def __init__(
- self, env, engine, parser, preparser=lambda source, f=None: source
- ) -> None:
- super().__init__(env, engine, parser, preparser=preparser)
-
-
-class Expr:
- """
- Object encapsulating an expression.
-
- Parameters
- ----------
- expr : str
- engine : str, optional, default 'numexpr'
- parser : str, optional, default 'pandas'
- env : Scope, optional, default None
- level : int, optional, default 2
- """
-
- env: Scope
- engine: str
- parser: str
-
- def __init__(
- self,
- expr,
- engine: str = "numexpr",
- parser: str = "pandas",
- env: Scope | None = None,
- level: int = 0,
- ) -> None:
- self.expr = expr
- self.env = env or Scope(level=level + 1)
- self.engine = engine
- self.parser = parser
- self._visitor = PARSERS[parser](self.env, self.engine, self.parser)
- self.terms = self.parse()
-
- @property
- def assigner(self):
- return getattr(self._visitor, "assigner", None)
-
- def __call__(self):
- return self.terms(self.env)
-
- def __repr__(self) -> str:
- return printing.pprint_thing(self.terms)
-
- def __len__(self) -> int:
- return len(self.expr)
-
- def parse(self):
- """
- Parse an expression.
- """
- return self._visitor.visit(self.expr)
-
- @property
- def names(self):
- """
- Get the names in an expression.
- """
- if is_term(self.terms):
- return frozenset([self.terms.name])
- return frozenset(term.name for term in com.flatten(self.terms))
-
-
-PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}
diff --git a/contrib/python/pandas/py3/pandas/core/computation/expressions.py b/contrib/python/pandas/py3/pandas/core/computation/expressions.py
deleted file mode 100644
index 2b34258982a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/expressions.py
+++ /dev/null
@@ -1,283 +0,0 @@
-"""
-Expressions
------------
-
-Offer fast expression evaluation through numexpr
-
-"""
-from __future__ import annotations
-
-import operator
-import warnings
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._typing import FuncType
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.computation.check import NUMEXPR_INSTALLED
-from pandas.core.ops import roperator
-
-if NUMEXPR_INSTALLED:
- import numexpr as ne
-
-_TEST_MODE: bool | None = None
-_TEST_RESULT: list[bool] = []
-USE_NUMEXPR = NUMEXPR_INSTALLED
-_evaluate: FuncType | None = None
-_where: FuncType | None = None
-
-# the set of dtypes that we will allow to be passed to numexpr
-_ALLOWED_DTYPES = {
- "evaluate": {"int64", "int32", "float64", "float32", "bool"},
- "where": {"int64", "float64", "bool"},
-}
-
-# the minimum number of elements for which we will use numexpr
-_MIN_ELEMENTS = 1_000_000
-
-
-def set_use_numexpr(v: bool = True) -> None:
- # set/unset to use numexpr
- global USE_NUMEXPR
- if NUMEXPR_INSTALLED:
- USE_NUMEXPR = v
-
- # choose what we are going to do
- global _evaluate, _where
-
- _evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard
- _where = _where_numexpr if USE_NUMEXPR else _where_standard
-
-
-def set_numexpr_threads(n=None) -> None:
- # if we are using numexpr, set the threads to n
- # otherwise reset
- if NUMEXPR_INSTALLED and USE_NUMEXPR:
- if n is None:
- n = ne.detect_number_of_cores()
- ne.set_num_threads(n)
-
-
-def _evaluate_standard(op, op_str, a, b):
- """
- Standard evaluation.
- """
- if _TEST_MODE:
- _store_test_result(False)
- return op(a, b)
-
-
-def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
- """return a boolean if we WILL be using numexpr"""
- if op_str is not None:
- # required min elements (otherwise we are adding overhead)
- if a.size > _MIN_ELEMENTS:
- # check for dtype compatibility
- dtypes: set[str] = set()
- for o in [a, b]:
- # ndarray and Series Case
- if hasattr(o, "dtype"):
- dtypes |= {o.dtype.name}
-
- # allowed are a superset
- if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes:
- return True
-
- return False
-
-
-def _evaluate_numexpr(op, op_str, a, b):
- result = None
-
- if _can_use_numexpr(op, op_str, a, b, "evaluate"):
- is_reversed = op.__name__.strip("_").startswith("r")
- if is_reversed:
- # we were originally called by a reversed op method
- a, b = b, a
-
- a_value = a
- b_value = b
-
- try:
- result = ne.evaluate(
- f"a_value {op_str} b_value",
- local_dict={"a_value": a_value, "b_value": b_value},
- casting="safe",
- )
- except TypeError:
- # numexpr raises eg for array ** array with integers
- # (https://github.com/pydata/numexpr/issues/379)
- pass
- except NotImplementedError:
- if _bool_arith_fallback(op_str, a, b):
- pass
- else:
- raise
-
- if is_reversed:
- # reverse order to original for fallback
- a, b = b, a
-
- if _TEST_MODE:
- _store_test_result(result is not None)
-
- if result is None:
- result = _evaluate_standard(op, op_str, a, b)
-
- return result
-
-
-_op_str_mapping = {
- operator.add: "+",
- roperator.radd: "+",
- operator.mul: "*",
- roperator.rmul: "*",
- operator.sub: "-",
- roperator.rsub: "-",
- operator.truediv: "/",
- roperator.rtruediv: "/",
- # floordiv not supported by numexpr 2.x
- operator.floordiv: None,
- roperator.rfloordiv: None,
- # we require Python semantics for mod of negative for backwards compatibility
- # see https://github.com/pydata/numexpr/issues/365
- # so sticking with unaccelerated for now GH#36552
- operator.mod: None,
- roperator.rmod: None,
- operator.pow: "**",
- roperator.rpow: "**",
- operator.eq: "==",
- operator.ne: "!=",
- operator.le: "<=",
- operator.lt: "<",
- operator.ge: ">=",
- operator.gt: ">",
- operator.and_: "&",
- roperator.rand_: "&",
- operator.or_: "|",
- roperator.ror_: "|",
- operator.xor: "^",
- roperator.rxor: "^",
- divmod: None,
- roperator.rdivmod: None,
-}
-
-
-def _where_standard(cond, a, b):
- # Caller is responsible for extracting ndarray if necessary
- return np.where(cond, a, b)
-
-
-def _where_numexpr(cond, a, b):
- # Caller is responsible for extracting ndarray if necessary
- result = None
-
- if _can_use_numexpr(None, "where", a, b, "where"):
- result = ne.evaluate(
- "where(cond_value, a_value, b_value)",
- local_dict={"cond_value": cond, "a_value": a, "b_value": b},
- casting="safe",
- )
-
- if result is None:
- result = _where_standard(cond, a, b)
-
- return result
-
-
-# turn myself on
-set_use_numexpr(get_option("compute.use_numexpr"))
-
-
-def _has_bool_dtype(x):
- try:
- return x.dtype == bool
- except AttributeError:
- return isinstance(x, (bool, np.bool_))
-
-
-_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}
-
-
-def _bool_arith_fallback(op_str, a, b) -> bool:
- """
-    Check whether we should fall back to the python `_evaluate_standard`
-    because the operation is unsupported by numexpr, which is the case for
-    some boolean ops.
- """
- if _has_bool_dtype(a) and _has_bool_dtype(b):
- if op_str in _BOOL_OP_UNSUPPORTED:
- warnings.warn(
- f"evaluating in Python space because the {repr(op_str)} "
- "operator is not supported by numexpr for the bool dtype, "
- f"use {repr(_BOOL_OP_UNSUPPORTED[op_str])} instead.",
- stacklevel=find_stack_level(),
- )
- return True
- return False
-
-
-def evaluate(op, a, b, use_numexpr: bool = True):
- """
- Evaluate and return the expression of the op on a and b.
-
- Parameters
- ----------
- op : the actual operand
- a : left operand
- b : right operand
- use_numexpr : bool, default True
- Whether to try to use numexpr.
- """
-    op_str = _op_str_mapping[op]
-    if op_str is not None:
-        if use_numexpr:
-            # error: "None" not callable
-            return _evaluate(op, op_str, a, b)  # type: ignore[misc]
-    return _evaluate_standard(op, op_str, a, b)
-
-
-def where(cond, a, b, use_numexpr: bool = True):
- """
- Evaluate the where condition cond on a and b.
-
- Parameters
- ----------
- cond : np.ndarray[bool]
- a : return if cond is True
- b : return if cond is False
- use_numexpr : bool, default True
- Whether to try to use numexpr.
- """
- assert _where is not None
- return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)
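-
-
-# A minimal usage sketch (illustrative only; assumes NumPy is importable and
-# numexpr may or may not be installed). ``evaluate`` routes large,
-# dtype-compatible operands through numexpr and otherwise falls back to plain
-# ``op(a, b)``; ``where`` does the same for ``np.where``:
-#
-#   >>> import operator
-#   >>> import numpy as np
-#   >>> a = np.arange(1_000_000, dtype="float64")
-#   >>> b = np.ones_like(a)
-#   >>> summed = evaluate(operator.add, a, b)       # numexpr path when usable
-#   >>> plain = evaluate(operator.add, a, b, use_numexpr=False)
-#   >>> picked = where(a > 10.0, a, b)              # numexpr-backed np.where
-#
-# Operands with fewer than _MIN_ELEMENTS elements always take the plain NumPy
-# path because _can_use_numexpr rejects them to avoid numexpr call overhead.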
-
-
-def set_test_mode(v: bool = True) -> None:
- """
- Keeps track of whether numexpr was used.
-
- Stores an additional ``True`` for every successful use of evaluate with
- numexpr since the last ``get_test_result``.
- """
- global _TEST_MODE, _TEST_RESULT
- _TEST_MODE = v
- _TEST_RESULT = []
-
-
-def _store_test_result(used_numexpr: bool) -> None:
- if used_numexpr:
- _TEST_RESULT.append(used_numexpr)
-
-
-def get_test_result() -> list[bool]:
- """
- Get test result and reset test_results.
- """
- global _TEST_RESULT
- res = _TEST_RESULT
- _TEST_RESULT = []
- return res
diff --git a/contrib/python/pandas/py3/pandas/core/computation/ops.py b/contrib/python/pandas/py3/pandas/core/computation/ops.py
deleted file mode 100644
index 0538cc7b8d4..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/ops.py
+++ /dev/null
@@ -1,620 +0,0 @@
-"""
-Operator classes for eval.
-"""
-
-from __future__ import annotations
-
-from datetime import datetime
-from functools import partial
-import operator
-from typing import (
- Callable,
- Iterable,
- Iterator,
- Literal,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs import Timestamp
-
-from pandas.core.dtypes.common import (
- is_list_like,
- is_scalar,
-)
-
-import pandas.core.common as com
-from pandas.core.computation.common import (
- ensure_decoded,
- result_type_many,
-)
-from pandas.core.computation.scope import DEFAULT_GLOBALS
-
-from pandas.io.formats.printing import (
- pprint_thing,
- pprint_thing_encoded,
-)
-
-REDUCTIONS = ("sum", "prod", "min", "max")
-
-_unary_math_ops = (
- "sin",
- "cos",
- "exp",
- "log",
- "expm1",
- "log1p",
- "sqrt",
- "sinh",
- "cosh",
- "tanh",
- "arcsin",
- "arccos",
- "arctan",
- "arccosh",
- "arcsinh",
- "arctanh",
- "abs",
- "log10",
- "floor",
- "ceil",
-)
-_binary_math_ops = ("arctan2",)
-
-MATHOPS = _unary_math_ops + _binary_math_ops
-
-
-LOCAL_TAG = "__pd_eval_local_"
-
-
-class Term:
- def __new__(cls, name, env, side=None, encoding=None):
- klass = Constant if not isinstance(name, str) else cls
- # error: Argument 2 for "super" not an instance of argument 1
- supr_new = super(Term, klass).__new__ # type: ignore[misc]
- return supr_new(klass)
-
- is_local: bool
-
- def __init__(self, name, env, side=None, encoding=None) -> None:
- # name is a str for Term, but may be something else for subclasses
- self._name = name
- self.env = env
- self.side = side
- tname = str(name)
- self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS
- self._value = self._resolve_name()
- self.encoding = encoding
-
- @property
- def local_name(self) -> str:
- return self.name.replace(LOCAL_TAG, "")
-
- def __repr__(self) -> str:
- return pprint_thing(self.name)
-
- def __call__(self, *args, **kwargs):
- return self.value
-
- def evaluate(self, *args, **kwargs) -> Term:
- return self
-
- def _resolve_name(self):
- local_name = str(self.local_name)
- is_local = self.is_local
- if local_name in self.env.scope and isinstance(
- self.env.scope[local_name], type
- ):
- is_local = False
-
- res = self.env.resolve(local_name, is_local=is_local)
- self.update(res)
-
- if hasattr(res, "ndim") and res.ndim > 2:
- raise NotImplementedError(
- "N-dimensional objects, where N > 2, are not supported with eval"
- )
- return res
-
- def update(self, value) -> None:
- """
- search order for local (i.e., @variable) variables:
-
- scope, key_variable
- [('locals', 'local_name'),
- ('globals', 'local_name'),
- ('locals', 'key'),
- ('globals', 'key')]
- """
- key = self.name
-
- # if it's a variable name (otherwise a constant)
- if isinstance(key, str):
- self.env.swapkey(self.local_name, key, new_value=value)
-
- self.value = value
-
- @property
- def is_scalar(self) -> bool:
- return is_scalar(self._value)
-
- @property
- def type(self):
- try:
- # potentially very slow for large, mixed dtype frames
- return self._value.values.dtype
- except AttributeError:
- try:
- # ndarray
- return self._value.dtype
- except AttributeError:
- # scalar
- return type(self._value)
-
- return_type = type
-
- @property
- def raw(self) -> str:
- return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})"
-
- @property
- def is_datetime(self) -> bool:
- try:
- t = self.type.type
- except AttributeError:
- t = self.type
-
- return issubclass(t, (datetime, np.datetime64))
-
- @property
- def value(self):
- return self._value
-
- @value.setter
- def value(self, new_value) -> None:
- self._value = new_value
-
- @property
- def name(self):
- return self._name
-
- @property
- def ndim(self) -> int:
- return self._value.ndim
-
-
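-# How Term resolution is typically exercised (a hedged sketch through the
-# public DataFrame.query API, which is assumed here rather than defined in
-# this module): names written with "@" are rewritten with the LOCAL_TAG
-# prefix and resolved against the calling scope, while bare names resolve
-# against the frame's columns and resolvers:
-#
-#   >>> import pandas as pd
-#   >>> df = pd.DataFrame({"a": [1, 2, 3]})
-#   >>> threshold = 1
-#   >>> df.query("a > @threshold")   # "@threshold" becomes a local Term
-#      a
-#   1  2
-#   2  3
-
-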
-class Constant(Term):
- def __init__(self, value, env, side=None, encoding=None) -> None:
- super().__init__(value, env, side=side, encoding=encoding)
-
- def _resolve_name(self):
- return self._name
-
- @property
- def name(self):
- return self.value
-
- def __repr__(self) -> str:
- # in python 2 str() of float
- # can truncate shorter than repr()
- return repr(self.name)
-
-
-_bool_op_map = {"not": "~", "and": "&", "or": "|"}
-
-
-class Op:
- """
- Hold an operator of arbitrary arity.
- """
-
- op: str
-
- def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None:
- self.op = _bool_op_map.get(op, op)
- self.operands = operands
- self.encoding = encoding
-
- def __iter__(self) -> Iterator:
- return iter(self.operands)
-
- def __repr__(self) -> str:
- """
- Print a generic n-ary operator and its operands using infix notation.
- """
- # recurse over the operands
- parened = (f"({pprint_thing(opr)})" for opr in self.operands)
- return pprint_thing(f" {self.op} ".join(parened))
-
- @property
- def return_type(self):
- # clobber types to bool if the op is a boolean operator
- if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
- return np.bool_
- return result_type_many(*(term.type for term in com.flatten(self)))
-
- @property
- def has_invalid_return_type(self) -> bool:
- types = self.operand_types
- obj_dtype_set = frozenset([np.dtype("object")])
- return self.return_type == object and types - obj_dtype_set
-
- @property
- def operand_types(self):
- return frozenset(term.type for term in com.flatten(self))
-
- @property
- def is_scalar(self) -> bool:
- return all(operand.is_scalar for operand in self.operands)
-
- @property
- def is_datetime(self) -> bool:
- try:
- t = self.return_type.type
- except AttributeError:
- t = self.return_type
-
- return issubclass(t, (datetime, np.datetime64))
-
-
-def _in(x, y):
- """
- Compute the vectorized membership of ``x in y`` if possible, otherwise
- use Python.
- """
- try:
- return x.isin(y)
- except AttributeError:
- if is_list_like(x):
- try:
- return y.isin(x)
- except AttributeError:
- pass
- return x in y
-
-
-def _not_in(x, y):
- """
- Compute the vectorized membership of ``x not in y`` if possible,
- otherwise use Python.
- """
- try:
- return ~x.isin(y)
- except AttributeError:
- if is_list_like(x):
- try:
- return ~y.isin(x)
- except AttributeError:
- pass
- return x not in y
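-
-
-# Illustrative behaviour of the membership helpers above (assumes pandas is
-# importable for the vectorized branch; plain Python containers fall through
-# to the scalar ``in`` operator):
-#
-#   >>> import pandas as pd
-#   >>> _in(pd.Series([1, 2, 3]), [2, 3])
-#   0    False
-#   1     True
-#   2     True
-#   dtype: bool
-#   >>> _not_in(1, [1, 2])
-#   False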
-
-
-CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in")
-_cmp_ops_funcs = (
- operator.gt,
- operator.lt,
- operator.ge,
- operator.le,
- operator.eq,
- operator.ne,
- _in,
- _not_in,
-)
-_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs))
-
-BOOL_OPS_SYMS = ("&", "|", "and", "or")
-_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_)
-_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs))
-
-ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%")
-_arith_ops_funcs = (
- operator.add,
- operator.sub,
- operator.mul,
- operator.truediv,
- operator.pow,
- operator.floordiv,
- operator.mod,
-)
-_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs))
-
-SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%")
-_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod)
-_special_case_arith_ops_dict = dict(
- zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs)
-)
-
-_binary_ops_dict = {}
-
-for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
- _binary_ops_dict.update(d)
-
-
-def _cast_inplace(terms, acceptable_dtypes, dtype) -> None:
- """
- Cast an expression inplace.
-
- Parameters
- ----------
- terms : Op
-        The expression that should be cast.
-    acceptable_dtypes : list of acceptable numpy.dtype
-        Will not cast if the term's dtype is in this list.
- dtype : str or numpy.dtype
- The dtype to cast to.
- """
- dt = np.dtype(dtype)
- for term in terms:
- if term.type in acceptable_dtypes:
- continue
-
- try:
- new_value = term.value.astype(dt)
- except AttributeError:
- new_value = dt.type(term.value)
- term.update(new_value)
-
-
-def is_term(obj) -> bool:
- return isinstance(obj, Term)
-
-
-class BinOp(Op):
- """
- Hold a binary operator and its operands.
-
- Parameters
- ----------
- op : str
- lhs : Term or Op
- rhs : Term or Op
- """
-
- def __init__(self, op: str, lhs, rhs) -> None:
- super().__init__(op, (lhs, rhs))
- self.lhs = lhs
- self.rhs = rhs
-
- self._disallow_scalar_only_bool_ops()
-
- self.convert_values()
-
- try:
- self.func = _binary_ops_dict[op]
- except KeyError as err:
- # has to be made a list for python3
- keys = list(_binary_ops_dict.keys())
- raise ValueError(
- f"Invalid binary operator {repr(op)}, valid operators are {keys}"
- ) from err
-
- def __call__(self, env):
- """
- Recursively evaluate an expression in Python space.
-
- Parameters
- ----------
- env : Scope
-
- Returns
- -------
- object
- The result of an evaluated expression.
- """
- # recurse over the left/right nodes
- left = self.lhs(env)
- right = self.rhs(env)
-
- return self.func(left, right)
-
- def evaluate(self, env, engine: str, parser, term_type, eval_in_python):
- """
- Evaluate a binary operation *before* being passed to the engine.
-
- Parameters
- ----------
- env : Scope
- engine : str
- parser : str
- term_type : type
- eval_in_python : list
-
- Returns
- -------
- term_type
- The "pre-evaluated" expression as an instance of ``term_type``
- """
- if engine == "python":
- res = self(env)
- else:
- # recurse over the left/right nodes
-
- left = self.lhs.evaluate(
- env,
- engine=engine,
- parser=parser,
- term_type=term_type,
- eval_in_python=eval_in_python,
- )
-
- right = self.rhs.evaluate(
- env,
- engine=engine,
- parser=parser,
- term_type=term_type,
- eval_in_python=eval_in_python,
- )
-
- # base cases
- if self.op in eval_in_python:
- res = self.func(left.value, right.value)
- else:
- from pandas.core.computation.eval import eval
-
- res = eval(self, local_dict=env, engine=engine, parser=parser)
-
- name = env.add_tmp(res)
- return term_type(name, env=env)
-
- def convert_values(self) -> None:
- """
- Convert datetimes to a comparable value in an expression.
- """
-
- def stringify(value):
- encoder: Callable
- if self.encoding is not None:
- encoder = partial(pprint_thing_encoded, encoding=self.encoding)
- else:
- encoder = pprint_thing
- return encoder(value)
-
- lhs, rhs = self.lhs, self.rhs
-
- if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar:
- v = rhs.value
- if isinstance(v, (int, float)):
- v = stringify(v)
- v = Timestamp(ensure_decoded(v))
- if v.tz is not None:
- v = v.tz_convert("UTC")
- self.rhs.update(v)
-
- if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:
- v = lhs.value
- if isinstance(v, (int, float)):
- v = stringify(v)
- v = Timestamp(ensure_decoded(v))
- if v.tz is not None:
- v = v.tz_convert("UTC")
- self.lhs.update(v)
-
- def _disallow_scalar_only_bool_ops(self):
- rhs = self.rhs
- lhs = self.lhs
-
- # GH#24883 unwrap dtype if necessary to ensure we have a type object
- rhs_rt = rhs.return_type
- rhs_rt = getattr(rhs_rt, "type", rhs_rt)
- lhs_rt = lhs.return_type
- lhs_rt = getattr(lhs_rt, "type", lhs_rt)
- if (
- (lhs.is_scalar or rhs.is_scalar)
- and self.op in _bool_ops_dict
- and (
- not (
- issubclass(rhs_rt, (bool, np.bool_))
- and issubclass(lhs_rt, (bool, np.bool_))
- )
- )
- ):
- raise NotImplementedError("cannot evaluate scalar only bool ops")
-
-
-def isnumeric(dtype) -> bool:
- return issubclass(np.dtype(dtype).type, np.number)
-
-
-class Div(BinOp):
- """
- Div operator to special case casting.
-
- Parameters
- ----------
- lhs, rhs : Term or Op
- The Terms or Ops in the ``/`` expression.
- """
-
- def __init__(self, lhs, rhs) -> None:
- super().__init__("/", lhs, rhs)
-
- if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
- raise TypeError(
- f"unsupported operand type(s) for {self.op}: "
- f"'{lhs.return_type}' and '{rhs.return_type}'"
- )
-
-        # do not upcast float32s to float64 unnecessarily
- acceptable_dtypes = [np.float32, np.float_]
- _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_)
-
-
-UNARY_OPS_SYMS = ("+", "-", "~", "not")
-_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert)
-_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs))
-
-
-class UnaryOp(Op):
- """
- Hold a unary operator and its operands.
-
- Parameters
- ----------
- op : str
- The token used to represent the operator.
- operand : Term or Op
- The Term or Op operand to the operator.
-
- Raises
- ------
- ValueError
- * If no function associated with the passed operator token is found.
- """
-
- def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None:
- super().__init__(op, (operand,))
- self.operand = operand
-
- try:
- self.func = _unary_ops_dict[op]
- except KeyError as err:
- raise ValueError(
- f"Invalid unary operator {repr(op)}, "
- f"valid operators are {UNARY_OPS_SYMS}"
- ) from err
-
- def __call__(self, env) -> MathCall:
- operand = self.operand(env)
- # error: Cannot call function of unknown type
- return self.func(operand) # type: ignore[operator]
-
- def __repr__(self) -> str:
- return pprint_thing(f"{self.op}({self.operand})")
-
- @property
- def return_type(self) -> np.dtype:
- operand = self.operand
- if operand.return_type == np.dtype("bool"):
- return np.dtype("bool")
- if isinstance(operand, Op) and (
- operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict
- ):
- return np.dtype("bool")
- return np.dtype("int")
-
-
-class MathCall(Op):
- def __init__(self, func, args) -> None:
- super().__init__(func.name, args)
- self.func = func
-
- def __call__(self, env):
- # error: "Op" not callable
- operands = [op(env) for op in self.operands] # type: ignore[operator]
- with np.errstate(all="ignore"):
- return self.func.func(*operands)
-
- def __repr__(self) -> str:
- operands = map(str, self.operands)
- return pprint_thing(f"{self.op}({','.join(operands)})")
-
-
-class FuncNode:
- def __init__(self, name: str) -> None:
- if name not in MATHOPS:
- raise ValueError(f'"{name}" is not a supported function')
- self.name = name
- self.func = getattr(np, name)
-
- def __call__(self, *args):
- return MathCall(self, args)
diff --git a/contrib/python/pandas/py3/pandas/core/computation/parsing.py b/contrib/python/pandas/py3/pandas/core/computation/parsing.py
deleted file mode 100644
index 4020ec7b5e9..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/parsing.py
+++ /dev/null
@@ -1,195 +0,0 @@
-"""
-:func:`~pandas.eval` source string parsing functions
-"""
-from __future__ import annotations
-
-from io import StringIO
-from keyword import iskeyword
-import token
-import tokenize
-from typing import (
- Hashable,
- Iterator,
-)
-
-# A token value Python's tokenizer probably will never use.
-BACKTICK_QUOTED_STRING = 100
-
-
-def create_valid_python_identifier(name: str) -> str:
- """
- Create valid Python identifiers from any string.
-
- Check if name contains any special characters. If it contains any
- special characters, the special characters will be replaced by
- a special string and a prefix is added.
-
- Raises
- ------
- SyntaxError
-        If the returned name is not a valid Python identifier, raise an exception.
-        This can happen if there is a hashtag in the name, as the tokenizer will
-        then terminate and not find the backtick.
-        It also happens for characters that fall outside the range (U+0001..U+007F).
- """
- if name.isidentifier() and not iskeyword(name):
- return name
-
- # Create a dict with the special characters and their replacement string.
- # EXACT_TOKEN_TYPES contains these special characters
- # token.tok_name contains a readable description of the replacement string.
- special_characters_replacements = {
- char: f"_{token.tok_name[tokval]}_"
- for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items())
- }
- special_characters_replacements.update(
- {
- " ": "_",
- "?": "_QUESTIONMARK_",
- "!": "_EXCLAMATIONMARK_",
- "$": "_DOLLARSIGN_",
- "€": "_EUROSIGN_",
- "°": "_DEGREESIGN_",
- # Including quotes works, but there are exceptions.
- "'": "_SINGLEQUOTE_",
- '"': "_DOUBLEQUOTE_",
- # Currently not possible. Terminates parser and won't find backtick.
- # "#": "_HASH_",
- }
- )
-
- name = "".join([special_characters_replacements.get(char, char) for char in name])
- name = f"BACKTICK_QUOTED_STRING_{name}"
-
- if not name.isidentifier():
- raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
-
- return name
-
-
-def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
- """
- Clean up a column name if surrounded by backticks.
-
-    Backtick quoted strings are indicated by a certain tokval value. If a string
-    is a backtick quoted token it will be processed by
-    :func:`_create_valid_python_identifier` so that the parser can find this
-    string when the query is executed.
-    In this case the tok will get the NAME tokval.
-
- Parameters
- ----------
- tok : tuple of int, str
- ints correspond to the all caps constants in the tokenize module
-
- Returns
- -------
- tok : Tuple[int, str]
-        Either the input token or the replacement values
- """
- toknum, tokval = tok
- if toknum == BACKTICK_QUOTED_STRING:
- return tokenize.NAME, create_valid_python_identifier(tokval)
- return toknum, tokval
-
-
-def clean_column_name(name: Hashable) -> Hashable:
- """
- Function to emulate the cleaning of a backtick quoted name.
-
-    The purpose of this function is to see what happens to the name of an
-    identifier if it goes through the process of being parsed as Python code
-    inside a backtick quoted string and then being cleaned
-    (i.e., having any special characters replaced).
-
- Parameters
- ----------
- name : hashable
- Name to be cleaned.
-
- Returns
- -------
- name : hashable
- Returns the name after tokenizing and cleaning.
-
- Notes
- -----
- For some cases, a name cannot be converted to a valid Python identifier.
- In that case :func:`tokenize_string` raises a SyntaxError.
- In that case, we just return the name unmodified.
-
- If this name was used in the query string (this makes the query call impossible)
- an error will be raised by :func:`tokenize_backtick_quoted_string` instead,
- which is not caught and propagates to the user level.
- """
- try:
- tokenized = tokenize_string(f"`{name}`")
- tokval = next(tokenized)[1]
- return create_valid_python_identifier(tokval)
- except SyntaxError:
- return name
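-
-
-# A quick illustration (hedged; the exact replacement strings come from the
-# mapping in create_valid_python_identifier above):
-#
-#   >>> clean_column_name("total sales")
-#   'BACKTICK_QUOTED_STRING_total_sales'
-#   >>> clean_column_name("price!")
-#   'BACKTICK_QUOTED_STRING_price_EXCLAMATIONMARK_'
-#
-# Names that cannot be tokenized (e.g. ones containing "#") raise SyntaxError
-# inside tokenize_string and are returned unchanged.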
-
-
-def tokenize_backtick_quoted_string(
- token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
-) -> tuple[int, str]:
- """
- Creates a token from a backtick quoted string.
-
- Moves the token_generator forwards till right after the next backtick.
-
- Parameters
- ----------
- token_generator : Iterator[tokenize.TokenInfo]
- The generator that yields the tokens of the source string (Tuple[int, str]).
- The generator is at the first token after the backtick (`)
-
- source : str
- The Python source code string.
-
- string_start : int
-        This is the start of the backtick quoted string inside the source string.
-
- Returns
- -------
- tok: Tuple[int, str]
- The token that represents the backtick quoted string.
- The integer is equal to BACKTICK_QUOTED_STRING (100).
- """
- for _, tokval, start, _, _ in token_generator:
- if tokval == "`":
- string_end = start[1]
- break
-
- return BACKTICK_QUOTED_STRING, source[string_start:string_end]
-
-
-def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
- """
- Tokenize a Python source code string.
-
- Parameters
- ----------
- source : str
- The Python source code string.
-
- Returns
- -------
- tok_generator : Iterator[Tuple[int, str]]
-        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
- """
- line_reader = StringIO(source).readline
- token_generator = tokenize.generate_tokens(line_reader)
-
- # Loop over all tokens till a backtick (`) is found.
- # Then, take all tokens till the next backtick to form a backtick quoted string
- for toknum, tokval, start, _, _ in token_generator:
- if tokval == "`":
- try:
- yield tokenize_backtick_quoted_string(
- token_generator, source, string_start=start[1] + 1
- )
- except Exception as err:
- raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
- else:
- yield toknum, tokval
diff --git a/contrib/python/pandas/py3/pandas/core/computation/pytables.py b/contrib/python/pandas/py3/pandas/core/computation/pytables.py
deleted file mode 100644
index 5c8602c0291..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/pytables.py
+++ /dev/null
@@ -1,641 +0,0 @@
-""" manage PyTables query interface via Expressions """
-from __future__ import annotations
-
-import ast
-from functools import partial
-from typing import Any
-
-import numpy as np
-
-from pandas._libs.tslibs import (
- Timedelta,
- Timestamp,
-)
-from pandas._typing import npt
-from pandas.errors import UndefinedVariableError
-
-from pandas.core.dtypes.common import is_list_like
-
-import pandas.core.common as com
-from pandas.core.computation import (
- expr,
- ops,
- scope as _scope,
-)
-from pandas.core.computation.common import ensure_decoded
-from pandas.core.computation.expr import BaseExprVisitor
-from pandas.core.computation.ops import is_term
-from pandas.core.construction import extract_array
-from pandas.core.indexes.base import Index
-
-from pandas.io.formats.printing import (
- pprint_thing,
- pprint_thing_encoded,
-)
-
-
-class PyTablesScope(_scope.Scope):
- __slots__ = ("queryables",)
-
- queryables: dict[str, Any]
-
- def __init__(
- self,
- level: int,
- global_dict=None,
- local_dict=None,
- queryables: dict[str, Any] | None = None,
- ) -> None:
- super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict)
- self.queryables = queryables or {}
-
-
-class Term(ops.Term):
- env: PyTablesScope
-
- def __new__(cls, name, env, side=None, encoding=None):
- if isinstance(name, str):
- klass = cls
- else:
- klass = Constant
- return object.__new__(klass)
-
- def __init__(self, name, env: PyTablesScope, side=None, encoding=None) -> None:
- super().__init__(name, env, side=side, encoding=encoding)
-
- def _resolve_name(self):
- # must be a queryables
- if self.side == "left":
- # Note: The behavior of __new__ ensures that self.name is a str here
- if self.name not in self.env.queryables:
- raise NameError(f"name {repr(self.name)} is not defined")
- return self.name
-
- # resolve the rhs (and allow it to be None)
- try:
- return self.env.resolve(self.name, is_local=False)
- except UndefinedVariableError:
- return self.name
-
- # read-only property overwriting read/write property
- @property # type: ignore[misc]
- def value(self):
- return self._value
-
-
-class Constant(Term):
- def __init__(self, value, env: PyTablesScope, side=None, encoding=None) -> None:
- assert isinstance(env, PyTablesScope), type(env)
- super().__init__(value, env, side=side, encoding=encoding)
-
- def _resolve_name(self):
- return self._name
-
-
-class BinOp(ops.BinOp):
- _max_selectors = 31
-
- op: str
- queryables: dict[str, Any]
- condition: str | None
-
- def __init__(self, op: str, lhs, rhs, queryables: dict[str, Any], encoding) -> None:
- super().__init__(op, lhs, rhs)
- self.queryables = queryables
- self.encoding = encoding
- self.condition = None
-
- def _disallow_scalar_only_bool_ops(self) -> None:
- pass
-
- def prune(self, klass):
- def pr(left, right):
- """create and return a new specialized BinOp from myself"""
- if left is None:
- return right
- elif right is None:
- return left
-
- k = klass
- if isinstance(left, ConditionBinOp):
- if isinstance(right, ConditionBinOp):
- k = JointConditionBinOp
- elif isinstance(left, k):
- return left
- elif isinstance(right, k):
- return right
-
- elif isinstance(left, FilterBinOp):
- if isinstance(right, FilterBinOp):
- k = JointFilterBinOp
- elif isinstance(left, k):
- return left
- elif isinstance(right, k):
- return right
-
- return k(
- self.op, left, right, queryables=self.queryables, encoding=self.encoding
- ).evaluate()
-
- left, right = self.lhs, self.rhs
-
- if is_term(left) and is_term(right):
- res = pr(left.value, right.value)
- elif not is_term(left) and is_term(right):
- res = pr(left.prune(klass), right.value)
- elif is_term(left) and not is_term(right):
- res = pr(left.value, right.prune(klass))
- elif not (is_term(left) or is_term(right)):
- res = pr(left.prune(klass), right.prune(klass))
-
- return res
-
- def conform(self, rhs):
- """inplace conform rhs"""
- if not is_list_like(rhs):
- rhs = [rhs]
- if isinstance(rhs, np.ndarray):
- rhs = rhs.ravel()
- return rhs
-
- @property
- def is_valid(self) -> bool:
- """return True if this is a valid field"""
- return self.lhs in self.queryables
-
- @property
- def is_in_table(self) -> bool:
- """
- return True if this is a valid column name for generation (e.g. an
- actual column in the table)
- """
- return self.queryables.get(self.lhs) is not None
-
- @property
- def kind(self):
- """the kind of my field"""
- return getattr(self.queryables.get(self.lhs), "kind", None)
-
- @property
- def meta(self):
- """the meta of my field"""
- return getattr(self.queryables.get(self.lhs), "meta", None)
-
- @property
- def metadata(self):
- """the metadata of my field"""
- return getattr(self.queryables.get(self.lhs), "metadata", None)
-
- def generate(self, v) -> str:
- """create and return the op string for this TermValue"""
- val = v.tostring(self.encoding)
- return f"({self.lhs} {self.op} {val})"
-
- def convert_value(self, v) -> TermValue:
- """
- convert the expression that is in the term to something that is
- accepted by pytables
- """
-
- def stringify(value):
- if self.encoding is not None:
- return pprint_thing_encoded(value, encoding=self.encoding)
- return pprint_thing(value)
-
- kind = ensure_decoded(self.kind)
- meta = ensure_decoded(self.meta)
- if kind in ("datetime64", "datetime"):
- if isinstance(v, (int, float)):
- v = stringify(v)
- v = ensure_decoded(v)
- v = Timestamp(v).as_unit("ns")
- if v.tz is not None:
- v = v.tz_convert("UTC")
- return TermValue(v, v._value, kind)
- elif kind in ("timedelta64", "timedelta"):
- if isinstance(v, str):
- v = Timedelta(v)
- else:
- v = Timedelta(v, unit="s")
- v = v.as_unit("ns")._value
- return TermValue(int(v), v, kind)
- elif meta == "category":
- metadata = extract_array(self.metadata, extract_numpy=True)
- result: npt.NDArray[np.intp] | np.intp | int
- if v not in metadata:
- result = -1
- else:
- result = metadata.searchsorted(v, side="left")
- return TermValue(result, result, "integer")
- elif kind == "integer":
- v = int(float(v))
- return TermValue(v, v, kind)
- elif kind == "float":
- v = float(v)
- return TermValue(v, v, kind)
- elif kind == "bool":
- if isinstance(v, str):
- v = v.strip().lower() not in [
- "false",
- "f",
- "no",
- "n",
- "none",
- "0",
- "[]",
- "{}",
- "",
- ]
- else:
- v = bool(v)
- return TermValue(v, v, kind)
- elif isinstance(v, str):
- # string quoting
- return TermValue(v, stringify(v), "string")
- else:
- raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column")
-
- def convert_values(self) -> None:
- pass
-
-
-class FilterBinOp(BinOp):
- filter: tuple[Any, Any, Index] | None = None
-
- def __repr__(self) -> str:
- if self.filter is None:
- return "Filter: Not Initialized"
- return pprint_thing(f"[Filter : [{self.filter[0]}] -> [{self.filter[1]}]")
-
- def invert(self):
- """invert the filter"""
- if self.filter is not None:
- self.filter = (
- self.filter[0],
- self.generate_filter_op(invert=True),
- self.filter[2],
- )
- return self
-
- def format(self):
- """return the actual filter format"""
- return [self.filter]
-
- def evaluate(self):
- if not self.is_valid:
- raise ValueError(f"query term is not valid [{self}]")
-
- rhs = self.conform(self.rhs)
- values = list(rhs)
-
- if self.is_in_table:
- # if too many values to create the expression, use a filter instead
- if self.op in ["==", "!="] and len(values) > self._max_selectors:
- filter_op = self.generate_filter_op()
- self.filter = (self.lhs, filter_op, Index(values))
-
- return self
- return None
-
- # equality conditions
- if self.op in ["==", "!="]:
- filter_op = self.generate_filter_op()
- self.filter = (self.lhs, filter_op, Index(values))
-
- else:
- raise TypeError(
- f"passing a filterable condition to a non-table indexer [{self}]"
- )
-
- return self
-
- def generate_filter_op(self, invert: bool = False):
- if (self.op == "!=" and not invert) or (self.op == "==" and invert):
- return lambda axis, vals: ~axis.isin(vals)
- else:
- return lambda axis, vals: axis.isin(vals)
-
-
-class JointFilterBinOp(FilterBinOp):
- def format(self):
- raise NotImplementedError("unable to collapse Joint Filters")
-
- def evaluate(self):
- return self
-
-
-class ConditionBinOp(BinOp):
- def __repr__(self) -> str:
- return pprint_thing(f"[Condition : [{self.condition}]]")
-
- def invert(self):
- """invert the condition"""
- # if self.condition is not None:
- # self.condition = "~(%s)" % self.condition
- # return self
- raise NotImplementedError(
- "cannot use an invert condition when passing to numexpr"
- )
-
- def format(self):
- """return the actual ne format"""
- return self.condition
-
- def evaluate(self):
- if not self.is_valid:
- raise ValueError(f"query term is not valid [{self}]")
-
- # convert values if we are in the table
- if not self.is_in_table:
- return None
-
- rhs = self.conform(self.rhs)
- values = [self.convert_value(v) for v in rhs]
-
- # equality conditions
- if self.op in ["==", "!="]:
- # too many values to create the expression?
- if len(values) <= self._max_selectors:
- vs = [self.generate(v) for v in values]
- self.condition = f"({' | '.join(vs)})"
-
- # use a filter after reading
- else:
- return None
- else:
- self.condition = self.generate(values[0])
-
- return self
-
-
-class JointConditionBinOp(ConditionBinOp):
- def evaluate(self):
- self.condition = f"({self.lhs.condition} {self.op} {self.rhs.condition})"
- return self
-
-
-class UnaryOp(ops.UnaryOp):
- def prune(self, klass):
- if self.op != "~":
- raise NotImplementedError("UnaryOp only support invert type ops")
-
- operand = self.operand
- operand = operand.prune(klass)
-
- if operand is not None and (
- issubclass(klass, ConditionBinOp)
- and operand.condition is not None
- or not issubclass(klass, ConditionBinOp)
- and issubclass(klass, FilterBinOp)
- and operand.filter is not None
- ):
- return operand.invert()
- return None
-
-
-class PyTablesExprVisitor(BaseExprVisitor):
- const_type = Constant
- term_type = Term
-
- def __init__(self, env, engine, parser, **kwargs) -> None:
- super().__init__(env, engine, parser)
- for bin_op in self.binary_ops:
- bin_node = self.binary_op_nodes_map[bin_op]
- setattr(
- self,
- f"visit_{bin_node}",
- lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs),
- )
-
- def visit_UnaryOp(self, node, **kwargs):
- if isinstance(node.op, (ast.Not, ast.Invert)):
- return UnaryOp("~", self.visit(node.operand))
- elif isinstance(node.op, ast.USub):
- return self.const_type(-self.visit(node.operand).value, self.env)
- elif isinstance(node.op, ast.UAdd):
- raise NotImplementedError("Unary addition not supported")
-
- def visit_Index(self, node, **kwargs):
- return self.visit(node.value).value
-
- def visit_Assign(self, node, **kwargs):
- cmpr = ast.Compare(
- ops=[ast.Eq()], left=node.targets[0], comparators=[node.value]
- )
- return self.visit(cmpr)
-
- def visit_Subscript(self, node, **kwargs):
- # only allow simple subscripts
-
- value = self.visit(node.value)
- slobj = self.visit(node.slice)
- try:
- value = value.value
- except AttributeError:
- pass
-
- if isinstance(slobj, Term):
- # In py39 np.ndarray lookups with Term containing int raise
- slobj = slobj.value
-
- try:
- return self.const_type(value[slobj], self.env)
- except TypeError as err:
- raise ValueError(
- f"cannot subscript {repr(value)} with {repr(slobj)}"
- ) from err
-
- def visit_Attribute(self, node, **kwargs):
- attr = node.attr
- value = node.value
-
- ctx = type(node.ctx)
- if ctx == ast.Load:
- # resolve the value
- resolved = self.visit(value)
-
- # try to get the value to see if we are another expression
- try:
- resolved = resolved.value
- except AttributeError:
- pass
-
- try:
- return self.term_type(getattr(resolved, attr), self.env)
- except AttributeError:
- # something like datetime.datetime where scope is overridden
- if isinstance(value, ast.Name) and value.id == attr:
- return resolved
-
- raise ValueError(f"Invalid Attribute context {ctx.__name__}")
-
- def translate_In(self, op):
- return ast.Eq() if isinstance(op, ast.In) else op
-
- def _rewrite_membership_op(self, node, left, right):
- return self.visit(node.op), node.op, left, right
-
-
-def _validate_where(w):
- """
- Validate that the where statement is of the right type.
-
- The type may either be String, Expr, or list-like of Exprs.
-
- Parameters
- ----------
- w : String term expression, Expr, or list-like of Exprs.
-
- Returns
- -------
- where : The original where clause if the check was successful.
-
- Raises
- ------
- TypeError : An invalid data type was passed in for w (e.g. dict).
- """
- if not (isinstance(w, (PyTablesExpr, str)) or is_list_like(w)):
- raise TypeError(
- "where must be passed as a string, PyTablesExpr, "
- "or list-like of PyTablesExpr"
- )
-
- return w
-
-
-class PyTablesExpr(expr.Expr):
- """
- Hold a pytables-like expression, comprised of possibly multiple 'terms'.
-
- Parameters
- ----------
- where : string term expression, PyTablesExpr, or list-like of PyTablesExprs
- queryables : a "kinds" map (dict of column name -> kind), or None if column
- is non-indexable
- encoding : an encoding that will encode the query terms
-
- Returns
- -------
- a PyTablesExpr object
-
- Examples
- --------
- 'index>=date'
- "columns=['A', 'D']"
- 'columns=A'
- 'columns==A'
- "~(columns=['A','B'])"
- 'index>df.index[3] & string="bar"'
- '(index>df.index[3] & index<=df.index[6]) | string="bar"'
- "ts>=Timestamp('2012-02-01')"
- "major_axis>=20130101"
- """
-
- _visitor: PyTablesExprVisitor | None
- env: PyTablesScope
- expr: str
-
- def __init__(
- self,
- where,
- queryables: dict[str, Any] | None = None,
- encoding=None,
- scope_level: int = 0,
- ) -> None:
- where = _validate_where(where)
-
- self.encoding = encoding
- self.condition = None
- self.filter = None
- self.terms = None
- self._visitor = None
-
- # capture the environment if needed
- local_dict: _scope.DeepChainMap[Any, Any] | None = None
-
- if isinstance(where, PyTablesExpr):
- local_dict = where.env.scope
- _where = where.expr
-
- elif is_list_like(where):
- where = list(where)
- for idx, w in enumerate(where):
- if isinstance(w, PyTablesExpr):
- local_dict = w.env.scope
- else:
- w = _validate_where(w)
- where[idx] = w
- _where = " & ".join([f"({w})" for w in com.flatten(where)])
- else:
- # _validate_where ensures we otherwise have a string
- _where = where
-
- self.expr = _where
- self.env = PyTablesScope(scope_level + 1, local_dict=local_dict)
-
- if queryables is not None and isinstance(self.expr, str):
- self.env.queryables.update(queryables)
- self._visitor = PyTablesExprVisitor(
- self.env,
- queryables=queryables,
- parser="pytables",
- engine="pytables",
- encoding=encoding,
- )
- self.terms = self.parse()
-
- def __repr__(self) -> str:
- if self.terms is not None:
- return pprint_thing(self.terms)
- return pprint_thing(self.expr)
-
- def evaluate(self):
- """create and return the numexpr condition and filter"""
- try:
- self.condition = self.terms.prune(ConditionBinOp)
- except AttributeError as err:
- raise ValueError(
- f"cannot process expression [{self.expr}], [{self}] "
- "is not a valid condition"
- ) from err
- try:
- self.filter = self.terms.prune(FilterBinOp)
- except AttributeError as err:
- raise ValueError(
- f"cannot process expression [{self.expr}], [{self}] "
- "is not a valid filter"
- ) from err
-
- return self.condition, self.filter
-
-
-class TermValue:
- """hold a term value the we use to construct a condition/filter"""
-
- def __init__(self, value, converted, kind: str) -> None:
- assert isinstance(kind, str), kind
- self.value = value
- self.converted = converted
- self.kind = kind
-
- def tostring(self, encoding) -> str:
- """quote the string if not encoded else encode and return"""
- if self.kind == "string":
- if encoding is not None:
- return str(self.converted)
- return f'"{self.converted}"'
- elif self.kind == "float":
- # python 2 str(float) is not always
- # round-trippable so use repr()
- return repr(self.converted)
- return str(self.converted)
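-
-
-# How converted values are rendered into the generated query string
-# (illustrative):
-#
-#   >>> TermValue("bar", "bar", "string").tostring(encoding=None)
-#   '"bar"'
-#   >>> TermValue(2.5, 2.5, "float").tostring(encoding=None)
-#   '2.5'
-#   >>> TermValue(5, 5, "integer").tostring(encoding=None)
-#   '5'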
-
-
-def maybe_expression(s) -> bool:
- """loose checking if s is a pytables-acceptable expression"""
- if not isinstance(s, str):
- return False
- operations = PyTablesExprVisitor.binary_ops + PyTablesExprVisitor.unary_ops + ("=",)
-
- # make sure we have an op at least
- return any(op in s for op in operations)
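-
-
-# Rough behaviour of the check above (illustrative):
-#
-#   >>> maybe_expression("ts>=Timestamp('2012-02-01')")
-#   True
-#   >>> maybe_expression("foo")
-#   False
-#   >>> maybe_expression(42)   # not a string
-#   False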
diff --git a/contrib/python/pandas/py3/pandas/core/computation/scope.py b/contrib/python/pandas/py3/pandas/core/computation/scope.py
deleted file mode 100644
index 0b9ba84cae5..00000000000
--- a/contrib/python/pandas/py3/pandas/core/computation/scope.py
+++ /dev/null
@@ -1,357 +0,0 @@
-"""
-Module for scope operations
-"""
-from __future__ import annotations
-
-import datetime
-import inspect
-from io import StringIO
-import itertools
-import pprint
-import struct
-import sys
-from typing import (
- ChainMap,
- TypeVar,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs import Timestamp
-from pandas.errors import UndefinedVariableError
-
-_KT = TypeVar("_KT")
-_VT = TypeVar("_VT")
-
-
-# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes
-class DeepChainMap(ChainMap[_KT, _VT]):
- """
- Variant of ChainMap that allows direct updates to inner scopes.
-
- Only works when all passed mapping are mutable.
- """
-
- def __setitem__(self, key: _KT, value: _VT) -> None:
- for mapping in self.maps:
- if key in mapping:
- mapping[key] = value
- return
- self.maps[0][key] = value
-
- def __delitem__(self, key: _KT) -> None:
- """
- Raises
- ------
- KeyError
- If `key` doesn't exist.
- """
- for mapping in self.maps:
- if key in mapping:
- del mapping[key]
- return
- raise KeyError(key)
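-
-
-# Behaviour sketch (illustrative): writes go to the first mapping that already
-# holds the key, otherwise to ``self.maps[0]``; deletes follow the same search
-# order:
-#
-#   >>> inner, outer = {"a": 1}, {"a": 0, "b": 2}
-#   >>> dcm = DeepChainMap(inner, outer)
-#   >>> dcm["b"] = 3     # "b" only lives in ``outer``, so ``outer`` is updated
-#   >>> outer
-#   {'a': 0, 'b': 3}
-#   >>> dcm["c"] = 4     # new keys land in the first map
-#   >>> inner
-#   {'a': 1, 'c': 4}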
-
-
-def ensure_scope(
- level: int, global_dict=None, local_dict=None, resolvers=(), target=None
-) -> Scope:
- """Ensure that we are grabbing the correct scope."""
- return Scope(
- level + 1,
- global_dict=global_dict,
- local_dict=local_dict,
- resolvers=resolvers,
- target=target,
- )
-
-
-def _replacer(x) -> str:
- """
- Replace a number with its hexadecimal representation. Used to tag
- temporary variables with their calling scope's id.
- """
- # get the hex repr of the binary char and remove 0x and pad by pad_size
- # zeros
- try:
- hexin = ord(x)
- except TypeError:
- # bytes literals masquerade as ints when iterating in py3
- hexin = x
-
- return hex(hexin)
-
-
-def _raw_hex_id(obj) -> str:
- """Return the padded hexadecimal id of ``obj``."""
-    # interpret as a pointer since that's really what id returns
- packed = struct.pack("@P", id(obj))
- return "".join([_replacer(x) for x in packed])
-
-
-DEFAULT_GLOBALS = {
- "Timestamp": Timestamp,
- "datetime": datetime.datetime,
- "True": True,
- "False": False,
- "list": list,
- "tuple": tuple,
- "inf": np.inf,
- "Inf": np.inf,
-}
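-
-
-# These names are injected into every evaluation scope, so query/eval
-# expressions can reference them without importing anything. A small sketch
-# through the public DataFrame.query API (assumed here, not defined in this
-# module):
-#
-#   >>> import pandas as pd
-#   >>> df = pd.DataFrame({"flag": [True, False, True]})
-#   >>> df.query("flag == True")
-#      flag
-#   0  True
-#   2  True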
-
-
-def _get_pretty_string(obj) -> str:
- """
- Return a prettier version of obj.
-
- Parameters
- ----------
- obj : object
- Object to pretty print
-
- Returns
- -------
- str
- Pretty print object repr
- """
- sio = StringIO()
- pprint.pprint(obj, stream=sio)
- return sio.getvalue()
-
-
-class Scope:
- """
- Object to hold scope, with a few bells to deal with some custom syntax
- and contexts added by pandas.
-
- Parameters
- ----------
- level : int
- global_dict : dict or None, optional, default None
- local_dict : dict or Scope or None, optional, default None
- resolvers : list-like or None, optional, default None
- target : object
-
- Attributes
- ----------
- level : int
- scope : DeepChainMap
- target : object
- temps : dict
- """
-
- __slots__ = ["level", "scope", "target", "resolvers", "temps"]
- level: int
- scope: DeepChainMap
- resolvers: DeepChainMap
- temps: dict
-
- def __init__(
- self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
- ) -> None:
- self.level = level + 1
-
- # shallow copy because we don't want to keep filling this up with what
- # was there before if there are multiple calls to Scope/_ensure_scope
- self.scope = DeepChainMap(DEFAULT_GLOBALS.copy())
- self.target = target
-
- if isinstance(local_dict, Scope):
- self.scope.update(local_dict.scope)
- if local_dict.target is not None:
- self.target = local_dict.target
- self._update(local_dict.level)
-
- frame = sys._getframe(self.level)
-
- try:
- # shallow copy here because we don't want to replace what's in
- # scope when we align terms (alignment accesses the underlying
- # numpy array of pandas objects)
- scope_global = self.scope.new_child(
- (global_dict if global_dict is not None else frame.f_globals).copy()
- )
- self.scope = DeepChainMap(scope_global)
- if not isinstance(local_dict, Scope):
- scope_local = self.scope.new_child(
- (local_dict if local_dict is not None else frame.f_locals).copy()
- )
- self.scope = DeepChainMap(scope_local)
- finally:
- del frame
-
- # assumes that resolvers are going from outermost scope to inner
- if isinstance(local_dict, Scope):
- resolvers += tuple(local_dict.resolvers.maps)
- self.resolvers = DeepChainMap(*resolvers)
- self.temps = {}
-
- def __repr__(self) -> str:
- scope_keys = _get_pretty_string(list(self.scope.keys()))
- res_keys = _get_pretty_string(list(self.resolvers.keys()))
- return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"
-
- @property
- def has_resolvers(self) -> bool:
- """
- Return whether we have any extra scope.
-
-        For example, DataFrames pass their columns as resolvers during calls to
- ``DataFrame.eval()`` and ``DataFrame.query()``.
-
- Returns
- -------
- hr : bool
- """
- return bool(len(self.resolvers))
-
- def resolve(self, key: str, is_local: bool):
- """
- Resolve a variable name in a possibly local context.
-
- Parameters
- ----------
- key : str
- A variable name
- is_local : bool
- Flag indicating whether the variable is local or not (prefixed with
- the '@' symbol)
-
- Returns
- -------
- value : object
- The value of a particular variable
- """
- try:
- # only look for locals in outer scope
- if is_local:
- return self.scope[key]
-
- # not a local variable so check in resolvers if we have them
- if self.has_resolvers:
- return self.resolvers[key]
-
- # if we're here that means that we have no locals and we also have
- # no resolvers
- assert not is_local and not self.has_resolvers
- return self.scope[key]
- except KeyError:
- try:
- # last ditch effort we look in temporaries
- # these are created when parsing indexing expressions
- # e.g., df[df > 0]
- return self.temps[key]
- except KeyError as err:
- raise UndefinedVariableError(key, is_local) from err
-
- def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
- """
- Replace a variable name, with a potentially new value.
-
- Parameters
- ----------
- old_key : str
- Current variable name to replace
- new_key : str
- New variable name to replace `old_key` with
- new_value : object
- Value to be replaced along with the possible renaming
- """
- if self.has_resolvers:
- maps = self.resolvers.maps + self.scope.maps
- else:
- maps = self.scope.maps
-
- maps.append(self.temps)
-
- for mapping in maps:
- if old_key in mapping:
- mapping[new_key] = new_value
- return
-
- def _get_vars(self, stack, scopes: list[str]) -> None:
- """
- Get specifically scoped variables from a list of stack frames.
-
- Parameters
- ----------
- stack : list
- A list of stack frames as returned by ``inspect.stack()``
- scopes : sequence of strings
- A sequence containing valid stack frame attribute names that
- evaluate to a dictionary. For example, ('locals', 'globals')
- """
- variables = itertools.product(scopes, stack)
- for scope, (frame, _, _, _, _, _) in variables:
- try:
- d = getattr(frame, f"f_{scope}")
- self.scope = DeepChainMap(self.scope.new_child(d))
- finally:
- # won't remove it, but DECREF it
-                # in Py3 this probably isn't necessary since frame won't be in
-                # scope after the loop
- del frame
-
- def _update(self, level: int) -> None:
- """
- Update the current scope by going back `level` levels.
-
- Parameters
- ----------
- level : int
- """
- sl = level + 1
-
- # add sl frames to the scope starting with the
- # most distant and overwriting with more current
- # makes sure that we can capture variable scope
- stack = inspect.stack()
-
- try:
- self._get_vars(stack[:sl], scopes=["locals"])
- finally:
- del stack[:], stack
-
- def add_tmp(self, value) -> str:
- """
- Add a temporary variable to the scope.
-
- Parameters
- ----------
- value : object
- An arbitrary object to be assigned to a temporary variable.
-
- Returns
- -------
- str
- The name of the temporary variable created.
- """
- name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}"
-
- # add to inner most scope
- assert name not in self.temps
- self.temps[name] = value
- assert name in self.temps
-
- # only increment if the variable gets put in the scope
- return name
-
- @property
- def ntemps(self) -> int:
- """The number of temporary variables in this scope"""
- return len(self.temps)
-
- @property
- def full_scope(self) -> DeepChainMap:
- """
- Return the full scope for use with passing to engines transparently
- as a mapping.
-
- Returns
- -------
- vars : DeepChainMap
- All variables in this scope.
- """
- maps = [self.temps] + self.resolvers.maps + self.scope.maps
- return DeepChainMap(*maps)
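-
-
-# A small interactive sketch (hedged: Scope inspects real stack frames, so the
-# resolved values depend on where it is constructed):
-#
-#   >>> x = 10
-#   >>> scope = Scope(level=0)
-#   >>> scope.resolve("x", is_local=True)
-#   10
-#   >>> name = scope.add_tmp([1, 2, 3])     # temporaries are tried last
-#   >>> scope.resolve(name, is_local=False)
-#   [1, 2, 3]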
diff --git a/contrib/python/pandas/py3/pandas/core/config_init.py b/contrib/python/pandas/py3/pandas/core/config_init.py
deleted file mode 100644
index af844afa933..00000000000
--- a/contrib/python/pandas/py3/pandas/core/config_init.py
+++ /dev/null
@@ -1,883 +0,0 @@
-"""
-This module is imported from the pandas package __init__.py file
-in order to ensure that the core.config options registered here will
-be available as soon as the user loads the package. If register_option
-is invoked inside specific modules, they will not be registered until that
-module is imported, which may or may not be a problem.
-
-If you need to make sure options are available even before a certain
-module is imported, register them here rather than in the module.
-
-"""
-from __future__ import annotations
-
-import os
-from typing import Callable
-
-import pandas._config.config as cf
-from pandas._config.config import (
- is_bool,
- is_callable,
- is_instance_factory,
- is_int,
- is_nonnegative_int,
- is_one_of_factory,
- is_str,
- is_text,
-)
-
-# compute
-
-use_bottleneck_doc = """
-: bool
- Use the bottleneck library to accelerate if it is installed,
- the default is True
- Valid values: False,True
-"""
-
-
-def use_bottleneck_cb(key) -> None:
- from pandas.core import nanops
-
- nanops.set_use_bottleneck(cf.get_option(key))
-
-
-use_numexpr_doc = """
-: bool
- Use the numexpr library to accelerate computation if it is installed,
- the default is True
- Valid values: False,True
-"""
-
-
-def use_numexpr_cb(key) -> None:
- from pandas.core.computation import expressions
-
- expressions.set_use_numexpr(cf.get_option(key))
-
-
-use_numba_doc = """
-: bool
- Use the numba engine option for select operations if it is installed,
- the default is False
- Valid values: False,True
-"""
-
-
-def use_numba_cb(key) -> None:
- from pandas.core.util import numba_
-
- numba_.set_use_numba(cf.get_option(key))
-
-
-with cf.config_prefix("compute"):
- cf.register_option(
- "use_bottleneck",
- True,
- use_bottleneck_doc,
- validator=is_bool,
- cb=use_bottleneck_cb,
- )
- cf.register_option(
- "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
- )
- cf.register_option(
- "use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb
- )
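-
-# The options registered above are reachable through the public pandas option
-# API (a usage sketch; pd.get_option/pd.set_option/pd.option_context are
-# assumed from the top-level namespace):
-#
-#   >>> import pandas as pd
-#   >>> pd.get_option("compute.use_numexpr")
-#   True
-#   >>> pd.set_option("compute.use_numexpr", False)    # force plain NumPy paths
-#   >>> with pd.option_context("compute.use_bottleneck", False):
-#   ...     pass                                        # restored on exit
-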
-#
-# options from the "display" namespace
-
-pc_precision_doc = """
-: int
- Floating point output precision in terms of number of places after the
- decimal, for regular formatting as well as scientific notation. Similar
- to ``precision`` in :meth:`numpy.set_printoptions`.
-"""
-
-pc_colspace_doc = """
-: int
- Default space for DataFrame columns.
-"""
-
-pc_max_rows_doc = """
-: int
- If max_rows is exceeded, switch to truncate view. Depending on
- `large_repr`, objects are either centrally truncated or printed as
- a summary view. 'None' value means unlimited.
-
- In case python/IPython is running in a terminal and `large_repr`
- equals 'truncate' this can be set to 0 and pandas will auto-detect
- the height of the terminal and print a truncated object which fits
- the screen height. The IPython notebook, IPython qtconsole, or
- IDLE do not run in a terminal and hence it is not possible to do
- correct auto-detection.
-"""
-
-pc_min_rows_doc = """
-: int
- The numbers of rows to show in a truncated view (when `max_rows` is
- exceeded). Ignored when `max_rows` is set to None or 0. When set to
- None, follows the value of `max_rows`.
-"""
-
-pc_max_cols_doc = """
-: int
- If max_cols is exceeded, switch to truncate view. Depending on
- `large_repr`, objects are either centrally truncated or printed as
- a summary view. 'None' value means unlimited.
-
- In case python/IPython is running in a terminal and `large_repr`
- equals 'truncate' this can be set to 0 or None and pandas will auto-detect
- the width of the terminal and print a truncated object which fits
- the screen width. The IPython notebook, IPython qtconsole, or IDLE
- do not run in a terminal and hence it is not possible to do
-    correct auto-detection, in which case it defaults to 20.
-"""
-
-pc_max_categories_doc = """
-: int
- This sets the maximum number of categories pandas should output when
- printing out a `Categorical` or a Series of dtype "category".
-"""
-
-pc_max_info_cols_doc = """
-: int
- max_info_columns is used in DataFrame.info method to decide if
- per column information will be printed.
-"""
-
-pc_nb_repr_h_doc = """
-: boolean
- When True, IPython notebook will use html representation for
- pandas objects (if it is available).
-"""
-
-pc_pprint_nest_depth = """
-: int
- Controls the number of nested levels to process when pretty-printing
-"""
-
-pc_multi_sparse_doc = """
-: boolean
- "sparsify" MultiIndex display (don't display repeated
- elements in outer levels within groups)
-"""
-
-float_format_doc = """
-: callable
- The callable should accept a floating point number and return
- a string with the desired format of the number. This is used
- in some places like SeriesFormatter.
- See formats.format.EngFormatter for an example.
-"""
-
-max_colwidth_doc = """
-: int or None
- The maximum width in characters of a column in the repr of
- a pandas data structure. When the column overflows, a "..."
- placeholder is embedded in the output. A 'None' value means unlimited.
-"""
-
-colheader_justify_doc = """
-: 'left'/'right'
- Controls the justification of column headers. used by DataFrameFormatter.
-"""
-
-pc_expand_repr_doc = """
-: boolean
- Whether to print out the full DataFrame repr for wide DataFrames across
- multiple lines, `max_columns` is still respected, but the output will
- wrap-around across multiple "pages" if its width exceeds `display.width`.
-"""
-
-pc_show_dimensions_doc = """
-: boolean or 'truncate'
- Whether to print out dimensions at the end of DataFrame repr.
- If 'truncate' is specified, only print out the dimensions if the
- frame is truncated (e.g. not display all rows and/or columns)
-"""
-
-pc_east_asian_width_doc = """
-: boolean
- Whether to use the Unicode East Asian Width to calculate the display text
- width.
-    Enabling this may affect performance (default: False)
-"""
-
-pc_ambiguous_as_wide_doc = """
-: boolean
-    Whether to handle Unicode characters belonging to Ambiguous as Wide (width=2)
- (default: False)
-"""
-
-pc_table_schema_doc = """
-: boolean
- Whether to publish a Table Schema representation for frontends
- that support it.
- (default: False)
-"""
-
-pc_html_border_doc = """
-: int
- A ``border=value`` attribute is inserted in the ``<table>`` tag
- for the DataFrame HTML repr.
-"""
-
-pc_html_use_mathjax_doc = """\
-: boolean
- When True, Jupyter notebook will process table contents using MathJax,
- rendering mathematical expressions enclosed by the dollar symbol.
- (default: True)
-"""
-
-pc_max_dir_items = """\
-: int
- The number of items that will be added to `dir(...)`. 'None' value means
- unlimited. Because dir is cached, changing this option will not immediately
- affect already existing dataframes until a column is deleted or added.
-
- This is for instance used to suggest columns from a dataframe to tab
- completion.
-"""
-
-pc_width_doc = """
-: int
- Width of the display in characters. In case python/IPython is running in
- a terminal this can be set to None and pandas will correctly auto-detect
- the width.
- Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
- terminal and hence it is not possible to correctly detect the width.
-"""
-
-pc_chop_threshold_doc = """
-: float or None
- if set to a float value, all float values smaller than the given threshold
- will be displayed as exactly 0 by repr and friends.
-"""
-
-pc_max_seq_items = """
-: int or None
-    When pretty-printing a long sequence, no more than `max_seq_items`
- will be printed. If items are omitted, they will be denoted by the
- addition of "..." to the resulting string.
-
- If set to None, the number of items to be printed is unlimited.
-"""
-
-pc_max_info_rows_doc = """
-: int or None
- df.info() will usually show null-counts for each column.
- For large frames this can be quite slow. max_info_rows and max_info_cols
- limit this null check only to frames with smaller dimensions than
- specified.
-"""
-
-pc_large_repr_doc = """
-: 'truncate'/'info'
- For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
- show a truncated table (the default from 0.13), or switch to the view from
- df.info() (the behaviour in earlier versions of pandas).
-"""
-
-pc_memory_usage_doc = """
-: bool, string or None
- This specifies if the memory usage of a DataFrame should be displayed when
- df.info() is called. Valid values True,False,'deep'
-"""
-
-
-def table_schema_cb(key) -> None:
- from pandas.io.formats.printing import enable_data_resource_formatter
-
- enable_data_resource_formatter(cf.get_option(key))
-
-
-def is_terminal() -> bool:
- """
- Detect if Python is running in a terminal.
-
- Returns True if Python is running in a terminal or False if not.
- """
- try:
- # error: Name 'get_ipython' is not defined
- ip = get_ipython() # type: ignore[name-defined]
- except NameError: # assume standard Python interpreter in a terminal
- return True
- else:
- if hasattr(ip, "kernel"): # IPython as a Jupyter kernel
- return False
- else: # IPython in a terminal
- return True
-
-
-with cf.config_prefix("display"):
- cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int)
- cf.register_option(
- "float_format",
- None,
- float_format_doc,
- validator=is_one_of_factory([None, is_callable]),
- )
- cf.register_option(
- "max_info_rows",
- 1690785,
- pc_max_info_rows_doc,
- validator=is_instance_factory((int, type(None))),
- )
- cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int)
- cf.register_option(
- "min_rows",
- 10,
- pc_min_rows_doc,
- validator=is_instance_factory([type(None), int]),
- )
- cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int)
-
- cf.register_option(
- "max_colwidth",
- 50,
- max_colwidth_doc,
- validator=is_nonnegative_int,
- )
- if is_terminal():
- max_cols = 0 # automatically determine optimal number of columns
- else:
- max_cols = 20 # cannot determine optimal number of columns
- cf.register_option(
- "max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int
- )
- cf.register_option(
- "large_repr",
- "truncate",
- pc_large_repr_doc,
- validator=is_one_of_factory(["truncate", "info"]),
- )
- cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int)
- cf.register_option(
- "colheader_justify", "right", colheader_justify_doc, validator=is_text
- )
- cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool)
- cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int)
- cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool)
- cf.register_option("expand_frame_repr", True, pc_expand_repr_doc)
- cf.register_option(
- "show_dimensions",
- "truncate",
- pc_show_dimensions_doc,
- validator=is_one_of_factory([True, False, "truncate"]),
- )
- cf.register_option("chop_threshold", None, pc_chop_threshold_doc)
- cf.register_option("max_seq_items", 100, pc_max_seq_items)
- cf.register_option(
- "width", 80, pc_width_doc, validator=is_instance_factory([type(None), int])
- )
- cf.register_option(
- "memory_usage",
- True,
- pc_memory_usage_doc,
- validator=is_one_of_factory([None, True, False, "deep"]),
- )
- cf.register_option(
- "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool
- )
- cf.register_option(
- "unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool
- )
- cf.register_option(
- "html.table_schema",
- False,
- pc_table_schema_doc,
- validator=is_bool,
- cb=table_schema_cb,
- )
- cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int)
- cf.register_option(
- "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool
- )
- cf.register_option(
- "max_dir_items", 100, pc_max_dir_items, validator=is_nonnegative_int
- )
-
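The block above only registers the display options; users reach them through the public options API. A minimal usage sketch, assuming a pandas version that still ships this config module:

import pandas as pd

df = pd.DataFrame({"a": range(100)})

# Persistent change: the repr shows at most 10 rows from now on.
pd.set_option("display.max_rows", 10)

# Scoped change: previous values are restored when the context exits.
with pd.option_context("display.precision", 2, "display.max_columns", 5):
    print(df.describe())

# Introspection: current value and the registered doc string.
print(pd.get_option("display.max_rows"))
pd.describe_option("display.max_rows")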
-tc_sim_interactive_doc = """
-: boolean
- Whether to simulate interactive mode for purposes of testing
-"""
-
-with cf.config_prefix("mode"):
- cf.register_option("sim_interactive", False, tc_sim_interactive_doc)
-
-use_inf_as_na_doc = """
-: boolean
- True means treat None, NaN, INF, -INF as NA (old way),
- False means None and NaN are null, but INF, -INF are not NA
- (new way).
-"""
-
-# We don't want to start importing everything at the global context level
-# or we'll hit circular deps.
-
-
-def use_inf_as_na_cb(key) -> None:
- from pandas.core.dtypes.missing import _use_inf_as_na
-
- _use_inf_as_na(key)
-
-
-with cf.config_prefix("mode"):
- cf.register_option("use_inf_as_na", False, use_inf_as_na_doc, cb=use_inf_as_na_cb)
-
-
-data_manager_doc = """
-: string
- Internal data manager type; can be "block" or "array". Defaults to "block",
- unless overridden by the 'PANDAS_DATA_MANAGER' environment variable (needs
- to be set before pandas is imported).
-"""
-
-
-with cf.config_prefix("mode"):
- cf.register_option(
- "data_manager",
- # Get the default from an environment variable, if set, otherwise defaults
- # to "block". This environment variable can be set for testing.
- os.environ.get("PANDAS_DATA_MANAGER", "block"),
- data_manager_doc,
- validator=is_one_of_factory(["block", "array"]),
- )
-
-
-# TODO better name?
-copy_on_write_doc = """
-: bool
- Use new copy-view behaviour using Copy-on-Write. Defaults to False,
- unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable
- (if set to "1" for True, needs to be set before pandas is imported).
-"""
-
-
-with cf.config_prefix("mode"):
- cf.register_option(
- "copy_on_write",
- # Get the default from an environment variable, if set, otherwise defaults
- # to False. This environment variable can be set for testing.
- os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
- copy_on_write_doc,
- validator=is_bool,
- )
-
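A short sketch of how the copy_on_write switch is typically flipped and what users observe; the environment-variable route mentioned in the doc string is equivalent (pandas 2.x semantics assumed):

import pandas as pd

# Equivalent to exporting PANDAS_COPY_ON_WRITE=1 before importing pandas.
pd.set_option("mode.copy_on_write", True)

df = pd.DataFrame({"a": [1, 2, 3]})
subset = df["a"]
subset.iloc[0] = 100      # modifies only `subset`; the parent is copied lazily

print(df.loc[0, "a"])     # still 1 under Copy-on-Write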
-
-# user warnings
-chained_assignment = """
-: string
- Raise an exception, warn, or take no action when attempting to use chained
- assignment. The default is 'warn'.
-"""
-
-with cf.config_prefix("mode"):
- cf.register_option(
- "chained_assignment",
- "warn",
- chained_assignment,
- validator=is_one_of_factory([None, "warn", "raise"]),
- )
-
-
-string_storage_doc = """
-: string
- The default storage for StringDtype.
-"""
-
-with cf.config_prefix("mode"):
- cf.register_option(
- "string_storage",
- "python",
- string_storage_doc,
- validator=is_one_of_factory(["python", "pyarrow"]),
- )
-
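The string_storage option decides which ExtensionArray backs ``string`` dtype data when no storage is spelled out explicitly. A small sketch; the second branch assumes pyarrow is installed:

import pandas as pd

arr = pd.array(["a", None, "c"], dtype="string")
print(type(arr).__name__)                 # StringArray (python storage)

with pd.option_context("mode.string_storage", "pyarrow"):
    arr2 = pd.array(["a", None, "c"], dtype="string")
    print(type(arr2).__name__)            # ArrowStringArray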
-
-# Set up the io.excel specific reader configuration.
-reader_engine_doc = """
-: string
- The default Excel reader engine for '{ext}' files. Available options:
- auto, {others}.
-"""
-
-_xls_options = ["xlrd"]
-_xlsm_options = ["xlrd", "openpyxl"]
-_xlsx_options = ["xlrd", "openpyxl"]
-_ods_options = ["odf"]
-_xlsb_options = ["pyxlsb"]
-
-
-with cf.config_prefix("io.excel.xls"):
- cf.register_option(
- "reader",
- "auto",
- reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
- validator=is_one_of_factory(_xls_options + ["auto"]),
- )
-
-with cf.config_prefix("io.excel.xlsm"):
- cf.register_option(
- "reader",
- "auto",
- reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
- validator=is_one_of_factory(_xlsm_options + ["auto"]),
- )
-
-
-with cf.config_prefix("io.excel.xlsx"):
- cf.register_option(
- "reader",
- "auto",
- reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
- validator=is_one_of_factory(_xlsx_options + ["auto"]),
- )
-
-
-with cf.config_prefix("io.excel.ods"):
- cf.register_option(
- "reader",
- "auto",
- reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
- validator=is_one_of_factory(_ods_options + ["auto"]),
- )
-
-with cf.config_prefix("io.excel.xlsb"):
- cf.register_option(
- "reader",
- "auto",
- reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
- validator=is_one_of_factory(_xlsb_options + ["auto"]),
- )
-
-# Set up the io.excel specific writer configuration.
-writer_engine_doc = """
-: string
- The default Excel writer engine for '{ext}' files. Available options:
- auto, {others}.
-"""
-
-_xlsm_options = ["openpyxl"]
-_xlsx_options = ["openpyxl", "xlsxwriter"]
-_ods_options = ["odf"]
-
-
-with cf.config_prefix("io.excel.xlsm"):
- cf.register_option(
- "writer",
- "auto",
- writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
- validator=str,
- )
-
-
-with cf.config_prefix("io.excel.xlsx"):
- cf.register_option(
- "writer",
- "auto",
- writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
- validator=str,
- )
-
-
-with cf.config_prefix("io.excel.ods"):
- cf.register_option(
- "writer",
- "auto",
- writer_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
- validator=str,
- )
-
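These reader/writer options only supply the default engine used when ``engine=`` is not passed to ``read_excel``/``to_excel``. A rough sketch; the file path is made up and the named engines must be installed:

import pandas as pd

pd.set_option("io.excel.xlsx.reader", "openpyxl")
pd.set_option("io.excel.xlsx.writer", "xlsxwriter")

df = pd.DataFrame({"a": [1, 2]})
df.to_excel("/tmp/example.xlsx", index=False)     # picks xlsxwriter by default
roundtrip = pd.read_excel("/tmp/example.xlsx")    # picks openpyxl by default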
-
-# Set up the io.parquet specific configuration.
-parquet_engine_doc = """
-: string
- The default parquet reader/writer engine. Available options:
- 'auto', 'pyarrow', 'fastparquet', the default is 'auto'
-"""
-
-with cf.config_prefix("io.parquet"):
- cf.register_option(
- "engine",
- "auto",
- parquet_engine_doc,
- validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
- )
-
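Likewise for parquet: the option only sets the default, and an explicit ``engine=`` argument still wins. Sketch assuming pyarrow is installed and a throwaway path:

import pandas as pd

pd.set_option("io.parquet.engine", "pyarrow")   # "auto", "pyarrow" or "fastparquet"

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_parquet("/tmp/example.parquet")
back = pd.read_parquet("/tmp/example.parquet")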
-
-# Set up the io.sql specific configuration.
-sql_engine_doc = """
-: string
- The default sql reader/writer engine. Available options:
- 'auto', 'sqlalchemy', the default is 'auto'
-"""
-
-with cf.config_prefix("io.sql"):
- cf.register_option(
- "engine",
- "auto",
- sql_engine_doc,
- validator=is_one_of_factory(["auto", "sqlalchemy"]),
- )
-
-# --------
-# Plotting
-# ---------
-
-plotting_backend_doc = """
-: str
- The plotting backend to use. The default value is "matplotlib", the
- backend provided with pandas. Other backends can be specified by
- providing the name of the module that implements the backend.
-"""
-
-
-def register_plotting_backend_cb(key) -> None:
- if key == "matplotlib":
- # We defer matplotlib validation, since it's the default
- return
- from pandas.plotting._core import _get_plot_backend
-
- _get_plot_backend(key)
-
-
-with cf.config_prefix("plotting"):
- cf.register_option(
- "backend",
- defval="matplotlib",
- doc=plotting_backend_doc,
- validator=register_plotting_backend_cb,
- )
-
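Because the validator above imports the backend module, pointing the option at an uninstalled backend fails at set time rather than at plot time. Sketch; matplotlib is assumed installed and "plotly" is only an example of a third-party backend name:

import pandas as pd

pd.set_option("plotting.backend", "matplotlib")   # the built-in default

df = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
ax = df.plot(x="x", y="y")                        # dispatched to the active backend

# pd.set_option("plotting.backend", "plotly")     # would import plotly when set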
-
-register_converter_doc = """
-: bool or 'auto'.
- Whether to register converters with matplotlib's units registry for
- dates, times, datetimes, and Periods. Toggling to False will remove
- the converters, restoring any converters that pandas overwrote.
-"""
-
-
-def register_converter_cb(key) -> None:
- from pandas.plotting import (
- deregister_matplotlib_converters,
- register_matplotlib_converters,
- )
-
- if cf.get_option(key):
- register_matplotlib_converters()
- else:
- deregister_matplotlib_converters()
-
-
-with cf.config_prefix("plotting.matplotlib"):
- cf.register_option(
- "register_converters",
- "auto",
- register_converter_doc,
- validator=is_one_of_factory(["auto", True, False]),
- cb=register_converter_cb,
- )
-
-# ------
-# Styler
-# ------
-
-styler_sparse_index_doc = """
-: bool
- Whether to sparsify the display of a hierarchical index. Setting to False will
- display each explicit level element in a hierarchical key for each row.
-"""
-
-styler_sparse_columns_doc = """
-: bool
- Whether to sparsify the display of hierarchical columns. Setting to False will
- display each explicit level element in a hierarchical key for each column.
-"""
-
-styler_render_repr = """
-: str
- Determine which output to use in Jupyter Notebook in {"html", "latex"}.
-"""
-
-styler_max_elements = """
-: int
- The maximum number of data-cell (<td>) elements that will be rendered before
- trimming will occur over columns, rows or both if needed.
-"""
-
-styler_max_rows = """
-: int, optional
- The maximum number of rows that will be rendered. May still be reduced to
- satisfy ``max_elements``, which takes precedence.
-"""
-
-styler_max_columns = """
-: int, optional
- The maximum number of columns that will be rendered. May still be reduced to
- satisfy ``max_elements``, which takes precedence.
-"""
-
-styler_precision = """
-: int
- The precision for floats and complex numbers.
-"""
-
-styler_decimal = """
-: str
- The character representation for the decimal separator for floats and complex.
-"""
-
-styler_thousands = """
-: str, optional
- The character representation for thousands separator for floats, int and complex.
-"""
-
-styler_na_rep = """
-: str, optional
- The string representation for values identified as missing.
-"""
-
-styler_escape = """
-: str, optional
- Whether to escape certain characters according to the given context; html or latex.
-"""
-
-styler_formatter = """
-: str, callable, dict, optional
- A formatter object to be used as default within ``Styler.format``.
-"""
-
-styler_multirow_align = """
-: {"c", "t", "b"}
- The specifier for vertical alignment of sparsified LaTeX multirows.
-"""
-
-styler_multicol_align = r"""
-: {"r", "c", "l", "naive-l", "naive-r"}
- The specifier for horizontal alignment of sparsified LaTeX multicolumns. Pipe
- decorators can also be added to non-naive values to draw vertical
- rules, e.g. "\|r" will draw a rule on the left side of right aligned merged cells.
-"""
-
-styler_hrules = """
-: bool
- Whether to add horizontal rules at the top and bottom of the table and
- below the headers.
-"""
-
-styler_environment = """
-: str
- The environment to replace ``\\begin{table}``. Using "longtable" results in a
- specific longtable environment format.
-"""
-
-styler_encoding = """
-: str
- The encoding used for output HTML and LaTeX files.
-"""
-
-styler_mathjax = """
-: bool
- If False, special CSS classes are rendered into the table attributes signalling
- that MathJax should not be applied in Jupyter Notebook.
-"""
-
-with cf.config_prefix("styler"):
- cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=is_bool)
-
- cf.register_option(
- "sparse.columns", True, styler_sparse_columns_doc, validator=is_bool
- )
-
- cf.register_option(
- "render.repr",
- "html",
- styler_render_repr,
- validator=is_one_of_factory(["html", "latex"]),
- )
-
- cf.register_option(
- "render.max_elements",
- 2**18,
- styler_max_elements,
- validator=is_nonnegative_int,
- )
-
- cf.register_option(
- "render.max_rows",
- None,
- styler_max_rows,
- validator=is_nonnegative_int,
- )
-
- cf.register_option(
- "render.max_columns",
- None,
- styler_max_columns,
- validator=is_nonnegative_int,
- )
-
- cf.register_option("render.encoding", "utf-8", styler_encoding, validator=is_str)
-
- cf.register_option("format.decimal", ".", styler_decimal, validator=is_str)
-
- cf.register_option(
- "format.precision", 6, styler_precision, validator=is_nonnegative_int
- )
-
- cf.register_option(
- "format.thousands",
- None,
- styler_thousands,
- validator=is_instance_factory([type(None), str]),
- )
-
- cf.register_option(
- "format.na_rep",
- None,
- styler_na_rep,
- validator=is_instance_factory([type(None), str]),
- )
-
- cf.register_option(
- "format.escape",
- None,
- styler_escape,
- validator=is_one_of_factory([None, "html", "latex"]),
- )
-
- cf.register_option(
- "format.formatter",
- None,
- styler_formatter,
- validator=is_instance_factory([type(None), dict, Callable, str]),
- )
-
- cf.register_option("html.mathjax", True, styler_mathjax, validator=is_bool)
-
- cf.register_option(
- "latex.multirow_align",
- "c",
- styler_multirow_align,
- validator=is_one_of_factory(["c", "t", "b", "naive"]),
- )
-
- val_mca = ["r", "|r|", "|r", "r|", "c", "|c|", "|c", "c|", "l", "|l|", "|l", "l|"]
- val_mca += ["naive-l", "naive-r"]
- cf.register_option(
- "latex.multicol_align",
- "r",
- styler_multicol_align,
- validator=is_one_of_factory(val_mca),
- )
-
- cf.register_option("latex.hrules", False, styler_hrules, validator=is_bool)
-
- cf.register_option(
- "latex.environment",
- None,
- styler_environment,
- validator=is_instance_factory([type(None), str]),
- )
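The styler options above feed the defaults of ``Styler.format`` and the HTML/LaTeX renderers. A short sketch; Styler rendering needs jinja2 installed:

import pandas as pd

df = pd.DataFrame({"price": [1234.5678, None]})

with pd.option_context(
    "styler.format.precision", 2,
    "styler.format.thousands", ",",
    "styler.format.na_rep", "-",
):
    html = df.style.to_html()   # defaults applied: "1,234.57" and "-" in the output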
diff --git a/contrib/python/pandas/py3/pandas/core/construction.py b/contrib/python/pandas/py3/pandas/core/construction.py
deleted file mode 100644
index 8c5f291742b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/construction.py
+++ /dev/null
@@ -1,767 +0,0 @@
-"""
-Constructor functions intended to be shared by pd.array, Series.__init__,
-and Index.__new__.
-
-These should not depend on core.internals.
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Optional,
- Sequence,
- Union,
- cast,
- overload,
-)
-
-import numpy as np
-from numpy import ma
-
-from pandas._libs import lib
-from pandas._libs.tslibs.period import Period
-from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- Dtype,
- DtypeObj,
- T,
-)
-
-from pandas.core.dtypes.base import (
- ExtensionDtype,
- _registry as registry,
-)
-from pandas.core.dtypes.cast import (
- construct_1d_arraylike_from_scalar,
- construct_1d_object_array_from_listlike,
- maybe_cast_to_datetime,
- maybe_cast_to_integer_array,
- maybe_convert_platform,
- maybe_infer_to_datetimelike,
- maybe_promote,
-)
-from pandas.core.dtypes.common import (
- is_datetime64_ns_dtype,
- is_dtype_equal,
- is_extension_array_dtype,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_timedelta64_ns_dtype,
-)
-from pandas.core.dtypes.dtypes import PandasDtype
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCExtensionArray,
- ABCIndex,
- ABCPandasArray,
- ABCRangeIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import isna
-
-import pandas.core.common as com
-
-if TYPE_CHECKING:
- from pandas import (
- Index,
- Series,
- )
- from pandas.core.arrays.base import ExtensionArray
-
-
-def array(
- data: Sequence[object] | AnyArrayLike,
- dtype: Dtype | None = None,
- copy: bool = True,
-) -> ExtensionArray:
- """
- Create an array.
-
- Parameters
- ----------
- data : Sequence of objects
- The scalars inside `data` should be instances of the
- scalar type for `dtype`. It's expected that `data`
- represents a 1-dimensional array of data.
-
- When `data` is an Index or Series, the underlying array
- will be extracted from `data`.
-
- dtype : str, np.dtype, or ExtensionDtype, optional
- The dtype to use for the array. This may be a NumPy
- dtype or an extension type registered with pandas using
- :meth:`pandas.api.extensions.register_extension_dtype`.
-
- If not specified, there are two possibilities:
-
- 1. When `data` is a :class:`Series`, :class:`Index`, or
- :class:`ExtensionArray`, the `dtype` will be taken
- from the data.
- 2. Otherwise, pandas will attempt to infer the `dtype`
- from the data.
-
- Note that when `data` is a NumPy array, ``data.dtype`` is
- *not* used for inferring the array type. This is because
- NumPy cannot represent all the types of data that can be
- held in extension arrays.
-
- Currently, pandas will infer an extension dtype for sequences of
-
- ============================== =======================================
- Scalar Type Array Type
- ============================== =======================================
- :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray`
- :class:`pandas.Period` :class:`pandas.arrays.PeriodArray`
- :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`
- :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
- :class:`int` :class:`pandas.arrays.IntegerArray`
- :class:`float` :class:`pandas.arrays.FloatingArray`
- :class:`str` :class:`pandas.arrays.StringArray` or
- :class:`pandas.arrays.ArrowStringArray`
- :class:`bool` :class:`pandas.arrays.BooleanArray`
- ============================== =======================================
-
- The ExtensionArray created when the scalar type is :class:`str` is determined by
- ``pd.options.mode.string_storage`` if the dtype is not explicitly given.
-
- For all other cases, NumPy's usual inference rules will be used.
-
- .. versionchanged:: 1.2.0
-
- Pandas now also infers nullable-floating dtype for float-like
- input data
-
- copy : bool, default True
- Whether to copy the data, even if not necessary. Depending
- on the type of `data`, creating the new array may require
- copying data, even if ``copy=False``.
-
- Returns
- -------
- ExtensionArray
- The newly created array.
-
- Raises
- ------
- ValueError
- When `data` is not 1-dimensional.
-
- See Also
- --------
- numpy.array : Construct a NumPy array.
- Series : Construct a pandas Series.
- Index : Construct a pandas Index.
- arrays.PandasArray : ExtensionArray wrapping a NumPy array.
- Series.array : Extract the array stored within a Series.
-
- Notes
- -----
- Omitting the `dtype` argument means pandas will attempt to infer the
- best array type from the values in the data. As new array types are
- added by pandas and 3rd party libraries, the "best" array type may
- change. We recommend specifying `dtype` to ensure that
-
- 1. the correct array type for the data is returned
- 2. the returned array type doesn't change as new extension types
- are added by pandas and third-party libraries
-
- Additionally, if the underlying memory representation of the returned
- array matters, we recommend specifying the `dtype` as a concrete object
- rather than a string alias or allowing it to be inferred. For example,
- a future version of pandas or a 3rd-party library may include a
- dedicated ExtensionArray for string data. In this event, the following
- would no longer return a :class:`arrays.PandasArray` backed by a NumPy
- array.
-
- >>> pd.array(['a', 'b'], dtype=str)
- <PandasArray>
- ['a', 'b']
- Length: 2, dtype: str32
-
- This would instead return the new ExtensionArray dedicated for string
- data. If you really need the new array to be backed by a NumPy array,
- specify that in the dtype.
-
- >>> pd.array(['a', 'b'], dtype=np.dtype("<U1"))
- <PandasArray>
- ['a', 'b']
- Length: 2, dtype: str32
-
- Finally, Pandas has arrays that mostly overlap with NumPy
-
- * :class:`arrays.DatetimeArray`
- * :class:`arrays.TimedeltaArray`
-
- When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
- passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
- rather than a ``PandasArray``. This is for symmetry with the case of
- timezone-aware data, which NumPy does not natively support.
-
- >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')
- <DatetimeArray>
- ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
- Length: 2, dtype: datetime64[ns]
-
- >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]')
- <TimedeltaArray>
- ['0 days 01:00:00', '0 days 02:00:00']
- Length: 2, dtype: timedelta64[ns]
-
- Examples
- --------
- If a dtype is not specified, pandas will infer the best dtype from the values.
- See the description of `dtype` for the types pandas infers for.
-
- >>> pd.array([1, 2])
- <IntegerArray>
- [1, 2]
- Length: 2, dtype: Int64
-
- >>> pd.array([1, 2, np.nan])
- <IntegerArray>
- [1, 2, <NA>]
- Length: 3, dtype: Int64
-
- >>> pd.array([1.1, 2.2])
- <FloatingArray>
- [1.1, 2.2]
- Length: 2, dtype: Float64
-
- >>> pd.array(["a", None, "c"])
- <StringArray>
- ['a', <NA>, 'c']
- Length: 3, dtype: string
-
- >>> with pd.option_context("string_storage", "pyarrow"):
- ... arr = pd.array(["a", None, "c"])
- ...
- >>> arr
- <ArrowStringArray>
- ['a', <NA>, 'c']
- Length: 3, dtype: string
-
- >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
- <PeriodArray>
- ['2000-01-01', '2000-01-01']
- Length: 2, dtype: period[D]
-
- You can use the string alias for `dtype`
-
- >>> pd.array(['a', 'b', 'a'], dtype='category')
- ['a', 'b', 'a']
- Categories (2, object): ['a', 'b']
-
- Or specify the actual dtype
-
- >>> pd.array(['a', 'b', 'a'],
- ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))
- ['a', 'b', 'a']
- Categories (3, object): ['a' < 'b' < 'c']
-
- If pandas does not infer a dedicated extension type, a
- :class:`arrays.PandasArray` is returned.
-
- >>> pd.array([1 + 1j, 3 + 2j])
- <PandasArray>
- [(1+1j), (3+2j)]
- Length: 2, dtype: complex128
-
- As mentioned in the "Notes" section, new extension types may be added
- in the future (by pandas or 3rd party libraries), causing the return
- value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype`
- as a NumPy dtype if you need to ensure there's no future change in
- behavior.
-
- >>> pd.array([1, 2], dtype=np.dtype("int32"))
- <PandasArray>
- [1, 2]
- Length: 2, dtype: int32
-
- `data` must be 1-dimensional. A ValueError is raised when the input
- has the wrong dimensionality.
-
- >>> pd.array(1)
- Traceback (most recent call last):
- ...
- ValueError: Cannot pass scalar '1' to 'pandas.array'.
- """
- from pandas.core.arrays import (
- BooleanArray,
- DatetimeArray,
- ExtensionArray,
- FloatingArray,
- IntegerArray,
- IntervalArray,
- PandasArray,
- PeriodArray,
- TimedeltaArray,
- )
- from pandas.core.arrays.string_ import StringDtype
-
- if lib.is_scalar(data):
- msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
- raise ValueError(msg)
- elif isinstance(data, ABCDataFrame):
- raise TypeError("Cannot pass DataFrame to 'pandas.array'")
-
- if dtype is None and isinstance(data, (ABCSeries, ABCIndex, ExtensionArray)):
- # Note: we exclude np.ndarray here, will do type inference on it
- dtype = data.dtype
-
- data = extract_array(data, extract_numpy=True)
-
- # this returns None for not-found dtypes.
- if isinstance(dtype, str):
- dtype = registry.find(dtype) or dtype
-
- if isinstance(data, ExtensionArray) and (
- dtype is None or is_dtype_equal(dtype, data.dtype)
- ):
- # e.g. TimedeltaArray[s], avoid casting to PandasArray
- if copy:
- return data.copy()
- return data
-
- if is_extension_array_dtype(dtype):
- cls = cast(ExtensionDtype, dtype).construct_array_type()
- return cls._from_sequence(data, dtype=dtype, copy=copy)
-
- if dtype is None:
- inferred_dtype = lib.infer_dtype(data, skipna=True)
- if inferred_dtype == "period":
- period_data = cast(Union[Sequence[Optional[Period]], AnyArrayLike], data)
- return PeriodArray._from_sequence(period_data, copy=copy)
-
- elif inferred_dtype == "interval":
- return IntervalArray(data, copy=copy)
-
- elif inferred_dtype.startswith("datetime"):
- # datetime, datetime64
- try:
- return DatetimeArray._from_sequence(data, copy=copy)
- except ValueError:
- # Mixture of timezones, fall back to PandasArray
- pass
-
- elif inferred_dtype.startswith("timedelta"):
- # timedelta, timedelta64
- return TimedeltaArray._from_sequence(data, copy=copy)
-
- elif inferred_dtype == "string":
- # StringArray/ArrowStringArray depending on pd.options.mode.string_storage
- return StringDtype().construct_array_type()._from_sequence(data, copy=copy)
-
- elif inferred_dtype == "integer":
- return IntegerArray._from_sequence(data, copy=copy)
-
- elif (
- inferred_dtype in ("floating", "mixed-integer-float")
- and getattr(data, "dtype", None) != np.float16
- ):
- # GH#44715 Exclude np.float16 bc FloatingArray does not support it;
- # we will fall back to PandasArray.
- return FloatingArray._from_sequence(data, copy=copy)
-
- elif inferred_dtype == "boolean":
- return BooleanArray._from_sequence(data, copy=copy)
-
- # Pandas overrides NumPy for
- # 1. datetime64[ns]
- # 2. timedelta64[ns]
- # so that a DatetimeArray is returned.
- if is_datetime64_ns_dtype(dtype):
- return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
- elif is_timedelta64_ns_dtype(dtype):
- return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)
-
- return PandasArray._from_sequence(data, dtype=dtype, copy=copy)
-
-
-@overload
-def extract_array(
- obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ...
-) -> ArrayLike:
- ...
-
-
-@overload
-def extract_array(
- obj: T, extract_numpy: bool = ..., extract_range: bool = ...
-) -> T | ArrayLike:
- ...
-
-
-def extract_array(
- obj: T, extract_numpy: bool = False, extract_range: bool = False
-) -> T | ArrayLike:
- """
- Extract the ndarray or ExtensionArray from a Series or Index.
-
- For all other types, `obj` is just returned as is.
-
- Parameters
- ----------
- obj : object
- For Series / Index, the underlying ExtensionArray is unboxed.
-
- extract_numpy : bool, default False
- Whether to extract the ndarray from a PandasArray.
-
- extract_range : bool, default False
- If we have a RangeIndex, return range._values if True
- (which is a materialized integer ndarray), otherwise return unchanged.
-
- Returns
- -------
- arr : object
-
- Examples
- --------
- >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category'))
- ['a', 'b', 'c']
- Categories (3, object): ['a', 'b', 'c']
-
- Other objects like lists, arrays, and DataFrames are just passed through.
-
- >>> extract_array([1, 2, 3])
- [1, 2, 3]
-
- For an ndarray-backed Series / Index the ndarray is returned.
-
- >>> extract_array(pd.Series([1, 2, 3]))
- array([1, 2, 3])
-
- To extract all the way down to the ndarray, pass ``extract_numpy=True``.
-
- >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
- array([1, 2, 3])
- """
- if isinstance(obj, (ABCIndex, ABCSeries)):
- if isinstance(obj, ABCRangeIndex):
- if extract_range:
- return obj._values
- # https://github.com/python/mypy/issues/1081
- # error: Incompatible return value type (got "RangeIndex", expected
- # "Union[T, Union[ExtensionArray, ndarray[Any, Any]]]")
- return obj # type: ignore[return-value]
-
- return obj._values
-
- elif extract_numpy and isinstance(obj, ABCPandasArray):
- return obj.to_numpy()
-
- return obj
-
-
-def ensure_wrapped_if_datetimelike(arr):
- """
- Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
- """
- if isinstance(arr, np.ndarray):
- if arr.dtype.kind == "M":
- from pandas.core.arrays import DatetimeArray
-
- return DatetimeArray._from_sequence(arr)
-
- elif arr.dtype.kind == "m":
- from pandas.core.arrays import TimedeltaArray
-
- return TimedeltaArray._from_sequence(arr)
-
- return arr
-
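A small illustration of the helper above; note it is a private function, so the import path is an implementation detail of this pandas version:

import numpy as np
from pandas.core.construction import ensure_wrapped_if_datetimelike

stamps = np.array(["2021-01-01", "2021-01-02"], dtype="datetime64[ns]")
print(type(ensure_wrapped_if_datetimelike(stamps)).__name__)   # DatetimeArray

deltas = np.array([1, 2], dtype="timedelta64[ns]")
print(type(ensure_wrapped_if_datetimelike(deltas)).__name__)   # TimedeltaArray

ints = np.array([1, 2, 3])
print(ensure_wrapped_if_datetimelike(ints) is ints)            # True, unchanged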
-
-def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray:
- """
- Convert numpy MaskedArray to ensure mask is softened.
- """
- mask = ma.getmaskarray(data)
- if mask.any():
- dtype, fill_value = maybe_promote(data.dtype, np.nan)
- dtype = cast(np.dtype, dtype)
- # Incompatible types in assignment (expression has type "ndarray[Any,
- # dtype[Any]]", variable has type "MaskedArray[Any, Any]")
- data = data.astype(dtype, copy=True) # type: ignore[assignment]
- data.soften_mask() # set hardmask False if it was True
- data[mask] = fill_value
- else:
- data = data.copy()
- return data
-
-
-def sanitize_array(
- data,
- index: Index | None,
- dtype: DtypeObj | None = None,
- copy: bool = False,
- *,
- allow_2d: bool = False,
-) -> ArrayLike:
- """
- Sanitize input data to an ndarray or ExtensionArray, copy if specified,
- coerce to the dtype if specified.
-
- Parameters
- ----------
- data : Any
- index : Index or None, default None
- dtype : np.dtype, ExtensionDtype, or None, default None
- copy : bool, default False
- allow_2d : bool, default False
- If False, raise if we have a 2D Arraylike.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- if isinstance(data, ma.MaskedArray):
- data = sanitize_masked_array(data)
-
- if isinstance(dtype, PandasDtype):
- # Avoid ending up with a PandasArray
- dtype = dtype.numpy_dtype
-
- # extract ndarray or ExtensionArray, ensure we have no PandasArray
- data = extract_array(data, extract_numpy=True, extract_range=True)
-
- if isinstance(data, np.ndarray) and data.ndim == 0:
- if dtype is None:
- dtype = data.dtype
- data = lib.item_from_zerodim(data)
- elif isinstance(data, range):
- # GH#16804
- data = range_to_ndarray(data)
- copy = False
-
- if not is_list_like(data):
- if index is None:
- raise ValueError("index must be specified when data is not list-like")
- data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
- return data
-
- elif isinstance(data, ABCExtensionArray):
- # it is already ensured above this is not a PandasArray
- # Until GH#49309 is fixed this check needs to come before the
- # ExtensionDtype check
- if dtype is not None:
- subarr = data.astype(dtype, copy=copy)
- elif copy:
- subarr = data.copy()
- else:
- subarr = data
-
- elif isinstance(dtype, ExtensionDtype):
- # create an extension array from its dtype
- _sanitize_non_ordered(data)
- cls = dtype.construct_array_type()
- subarr = cls._from_sequence(data, dtype=dtype, copy=copy)
-
- # GH#846
- elif isinstance(data, np.ndarray):
- if isinstance(data, np.matrix):
- data = data.A
-
- if dtype is None:
- subarr = data
- if data.dtype == object:
- subarr = maybe_infer_to_datetimelike(data)
-
- if subarr is data and copy:
- subarr = subarr.copy()
-
- else:
- # we will try to copy by-definition here
- subarr = _try_cast(data, dtype, copy)
-
- elif hasattr(data, "__array__"):
- # e.g. dask array GH#38645
- data = np.array(data, copy=copy)
- return sanitize_array(
- data,
- index=index,
- dtype=dtype,
- copy=False,
- allow_2d=allow_2d,
- )
-
- else:
- _sanitize_non_ordered(data)
- # materialize e.g. generators, convert e.g. tuples, abc.ValueView
- data = list(data)
-
- if len(data) == 0 and dtype is None:
- # We default to float64, matching numpy
- subarr = np.array([], dtype=np.float64)
-
- elif dtype is not None:
- subarr = _try_cast(data, dtype, copy)
-
- else:
- subarr = maybe_convert_platform(data)
- if subarr.dtype == object:
- subarr = cast(np.ndarray, subarr)
- subarr = maybe_infer_to_datetimelike(subarr)
-
- subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
-
- if isinstance(subarr, np.ndarray):
- # at this point we should have dtype be None or subarr.dtype == dtype
- dtype = cast(np.dtype, dtype)
- subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)
-
- return subarr
-
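sanitize_array is the workhorse behind the Series constructor. A hedged sketch of its main branches, again via a private import path that may move between pandas versions:

import numpy as np
import pandas as pd
from pandas.core.construction import sanitize_array

# list-like -> ndarray (platform integer dtype via maybe_convert_platform)
print(sanitize_array([1, 2, 3], index=None))

# scalar + index -> the scalar is broadcast to the index length
idx = pd.RangeIndex(3)
print(sanitize_array(1.5, index=idx, dtype=np.dtype("float64")))

# extension dtype -> the matching ExtensionArray is constructed
print(sanitize_array([1, 2, None], index=None, dtype=pd.Int64Dtype()))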
-
-def range_to_ndarray(rng: range) -> np.ndarray:
- """
- Cast a range object to ndarray.
- """
- # GH#30171 perf avoid realizing range as a list in np.array
- try:
- arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64")
- except OverflowError:
- # GH#30173 handling for ranges that overflow int64
- if (rng.start >= 0 and rng.step > 0) or (rng.step < 0 <= rng.stop):
- try:
- arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64")
- except OverflowError:
- arr = construct_1d_object_array_from_listlike(list(rng))
- else:
- arr = construct_1d_object_array_from_listlike(list(rng))
- return arr
-
-
-def _sanitize_non_ordered(data) -> None:
- """
- Raise only for unordered sets, e.g., not for dict_keys
- """
- if isinstance(data, (set, frozenset)):
- raise TypeError(f"'{type(data).__name__}' type is unordered")
-
-
-def _sanitize_ndim(
- result: ArrayLike,
- data,
- dtype: DtypeObj | None,
- index: Index | None,
- *,
- allow_2d: bool = False,
-) -> ArrayLike:
- """
- Ensure we have a 1-dimensional result array.
- """
- if getattr(result, "ndim", 0) == 0:
- raise ValueError("result should be arraylike with ndim > 0")
-
- if result.ndim == 1:
- # the result that we want
- result = _maybe_repeat(result, index)
-
- elif result.ndim > 1:
- if isinstance(data, np.ndarray):
- if allow_2d:
- return result
- raise ValueError(
- f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
- )
- if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
- # i.e. PandasDtype("O")
-
- result = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
- cls = dtype.construct_array_type()
- result = cls._from_sequence(result, dtype=dtype)
- else:
- # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type
- # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str,
- # dtype[Any], None]"
- result = com.asarray_tuplesafe(data, dtype=dtype) # type: ignore[arg-type]
- return result
-
-
-def _sanitize_str_dtypes(
- result: np.ndarray, data, dtype: np.dtype | None, copy: bool
-) -> np.ndarray:
- """
- Ensure we have a dtype that is supported by pandas.
- """
-
- # This is to prevent a mixed-type Series from being cast entirely to
- # NumPy string type, e.g. NaN --> '-1#IND'.
- if issubclass(result.dtype.type, str):
- # GH#16605
- # If not empty convert the data to dtype
- # GH#19853: If data is a scalar, result has already the result
- if not lib.is_scalar(data):
- if not np.all(isna(data)):
- data = np.array(data, dtype=dtype, copy=False)
- result = np.array(data, dtype=object, copy=copy)
- return result
-
-
-def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike:
- """
- If we have a length-1 array and an index describing how long we expect
- the result to be, repeat the array.
- """
- if index is not None:
- if 1 == len(arr) != len(index):
- arr = arr.repeat(len(index))
- return arr
-
-
-def _try_cast(
- arr: list | np.ndarray,
- dtype: np.dtype,
- copy: bool,
-) -> ArrayLike:
- """
- Convert input to numpy ndarray and optionally cast to a given dtype.
-
- Parameters
- ----------
- arr : ndarray or list
- Excludes: ExtensionArray, Series, Index.
- dtype : np.dtype
- copy : bool
- If False, don't copy the data if not needed.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- is_ndarray = isinstance(arr, np.ndarray)
-
- if is_object_dtype(dtype):
- if not is_ndarray:
- subarr = construct_1d_object_array_from_listlike(arr)
- return subarr
- return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)
-
- elif dtype.kind == "U":
- # TODO: test cases with arr.dtype.kind in ["m", "M"]
- if is_ndarray:
- arr = cast(np.ndarray, arr)
- shape = arr.shape
- if arr.ndim > 1:
- arr = arr.ravel()
- else:
- shape = (len(arr),)
- return lib.ensure_string_array(arr, convert_na_value=False, copy=copy).reshape(
- shape
- )
-
- elif dtype.kind in ["m", "M"]:
- return maybe_cast_to_datetime(arr, dtype)
-
- # GH#15832: Check if we are requesting a numeric dtype and
- # that we can convert the data to the requested dtype.
- elif is_integer_dtype(dtype):
- # this will raise if we have e.g. floats
-
- subarr = maybe_cast_to_integer_array(arr, dtype)
- else:
- subarr = np.array(arr, dtype=dtype, copy=copy)
-
- return subarr
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/__init__.py b/contrib/python/pandas/py3/pandas/core/dtypes/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/api.py b/contrib/python/pandas/py3/pandas/core/dtypes/api.py
deleted file mode 100644
index 254abe330b8..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/api.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from pandas.core.dtypes.common import (
- is_any_real_numeric_dtype,
- is_array_like,
- is_bool,
- is_bool_dtype,
- is_categorical_dtype,
- is_complex,
- is_complex_dtype,
- is_datetime64_any_dtype,
- is_datetime64_dtype,
- is_datetime64_ns_dtype,
- is_datetime64tz_dtype,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_file_like,
- is_float,
- is_float_dtype,
- is_hashable,
- is_int64_dtype,
- is_integer,
- is_integer_dtype,
- is_interval,
- is_interval_dtype,
- is_iterator,
- is_list_like,
- is_named_tuple,
- is_number,
- is_numeric_dtype,
- is_object_dtype,
- is_period_dtype,
- is_re,
- is_re_compilable,
- is_scalar,
- is_signed_integer_dtype,
- is_sparse,
- is_string_dtype,
- is_timedelta64_dtype,
- is_timedelta64_ns_dtype,
- is_unsigned_integer_dtype,
- pandas_dtype,
-)
-
-__all__ = [
- "is_any_real_numeric_dtype",
- "is_array_like",
- "is_bool",
- "is_bool_dtype",
- "is_categorical_dtype",
- "is_complex",
- "is_complex_dtype",
- "is_datetime64_any_dtype",
- "is_datetime64_dtype",
- "is_datetime64_ns_dtype",
- "is_datetime64tz_dtype",
- "is_dict_like",
- "is_dtype_equal",
- "is_extension_array_dtype",
- "is_file_like",
- "is_float",
- "is_float_dtype",
- "is_hashable",
- "is_int64_dtype",
- "is_integer",
- "is_integer_dtype",
- "is_interval",
- "is_interval_dtype",
- "is_iterator",
- "is_list_like",
- "is_named_tuple",
- "is_number",
- "is_numeric_dtype",
- "is_object_dtype",
- "is_period_dtype",
- "is_re",
- "is_re_compilable",
- "is_scalar",
- "is_signed_integer_dtype",
- "is_sparse",
- "is_string_dtype",
- "is_timedelta64_dtype",
- "is_timedelta64_ns_dtype",
- "is_unsigned_integer_dtype",
- "pandas_dtype",
-]
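These introspection helpers are re-exported publicly as ``pandas.api.types``; a few representative calls:

import pandas as pd
from pandas.api import types as ptypes

ser = pd.Series([1, 2, 3], dtype="Int64")

print(ptypes.is_integer_dtype(ser))             # True, nullable Int64 included
print(ptypes.is_extension_array_dtype(ser))     # True
print(ptypes.is_list_like((1, 2)))              # True
print(ptypes.is_scalar(3.14))                   # True
print(ptypes.pandas_dtype("datetime64[ns]"))    # datetime64[ns]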
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/astype.py b/contrib/python/pandas/py3/pandas/core/dtypes/astype.py
deleted file mode 100644
index 87a36aed418..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/astype.py
+++ /dev/null
@@ -1,306 +0,0 @@
-"""
-Functions for implementing 'astype' methods according to pandas conventions,
-particularly ones that differ from numpy.
-"""
-from __future__ import annotations
-
-import inspect
-from typing import (
- TYPE_CHECKING,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.tslibs.timedeltas import array_to_timedelta64
-from pandas._typing import (
- ArrayLike,
- DtypeObj,
- IgnoreRaise,
-)
-from pandas.errors import IntCastingNaNError
-
-from pandas.core.dtypes.common import (
- is_datetime64_dtype,
- is_dtype_equal,
- is_integer_dtype,
- is_object_dtype,
- is_string_dtype,
- is_timedelta64_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- ExtensionDtype,
- PandasDtype,
-)
-
-if TYPE_CHECKING:
- from pandas.core.arrays import ExtensionArray
-
-
-_dtype_obj = np.dtype(object)
-
-
-@overload
-def _astype_nansafe(
- arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ...
-) -> np.ndarray:
- ...
-
-
-@overload
-def _astype_nansafe(
- arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ...
-) -> ExtensionArray:
- ...
-
-
-def _astype_nansafe(
- arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False
-) -> ArrayLike:
- """
- Cast the elements of an array to a given dtype in a nan-safe manner.
-
- Parameters
- ----------
- arr : ndarray
- dtype : np.dtype or ExtensionDtype
- copy : bool, default True
- If False, a view will be attempted but may fail, if
- e.g. the item sizes don't align.
- skipna : bool, default False
- Whether or not we should skip NaN when casting as a string-type.
-
- Raises
- ------
- ValueError
- The dtype was a datetime64/timedelta64 dtype, but it had no unit.
- """
-
- # dispatch on extension dtype if needed
- if isinstance(dtype, ExtensionDtype):
- return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
-
- elif not isinstance(dtype, np.dtype): # pragma: no cover
- raise ValueError("dtype must be np.dtype or ExtensionDtype")
-
- if arr.dtype.kind in ["m", "M"]:
- from pandas.core.construction import ensure_wrapped_if_datetimelike
-
- arr = ensure_wrapped_if_datetimelike(arr)
- res = arr.astype(dtype, copy=copy)
- return np.asarray(res)
-
- if issubclass(dtype.type, str):
- shape = arr.shape
- if arr.ndim > 1:
- arr = arr.ravel()
- return lib.ensure_string_array(
- arr, skipna=skipna, convert_na_value=False
- ).reshape(shape)
-
- elif np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype):
- return _astype_float_to_int_nansafe(arr, dtype, copy)
-
- elif is_object_dtype(arr.dtype):
- # if we have a datetime/timedelta array of objects
- # then coerce to datetime64[ns] and use DatetimeArray.astype
-
- if is_datetime64_dtype(dtype):
- from pandas import to_datetime
-
- dti = to_datetime(arr.ravel())
- dta = dti._data.reshape(arr.shape)
- return dta.astype(dtype, copy=False)._ndarray
-
- elif is_timedelta64_dtype(dtype):
- from pandas.core.construction import ensure_wrapped_if_datetimelike
-
- # bc we know arr.dtype == object, this is equivalent to
- # `np.asarray(to_timedelta(arr))`, but using a lower-level API that
- # does not require a circular import.
- tdvals = array_to_timedelta64(arr).view("m8[ns]")
-
- tda = ensure_wrapped_if_datetimelike(tdvals)
- return tda.astype(dtype, copy=False)._ndarray
-
- if dtype.name in ("datetime64", "timedelta64"):
- msg = (
- f"The '{dtype.name}' dtype has no unit. Please pass in "
- f"'{dtype.name}[ns]' instead."
- )
- raise ValueError(msg)
-
- if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
- # Explicit copy, or required since NumPy can't view from / to object.
- return arr.astype(dtype, copy=True)
-
- return arr.astype(dtype, copy=copy)
-
-
-def _astype_float_to_int_nansafe(
- values: np.ndarray, dtype: np.dtype, copy: bool
-) -> np.ndarray:
- """
- astype with a check that prevents converting NaN to a meaningless integer value.
- """
- if not np.isfinite(values).all():
- raise IntCastingNaNError(
- "Cannot convert non-finite values (NA or inf) to integer"
- )
- if dtype.kind == "u":
- # GH#45151
- if not (values >= 0).all():
- raise ValueError(f"Cannot losslessly cast from {values.dtype} to {dtype}")
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=RuntimeWarning)
- return values.astype(dtype, copy=copy)
-
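The float-to-int guard above is what users hit through the public ``astype``; a small sketch of the observable behaviour:

import numpy as np
import pandas as pd

ser = pd.Series([1.0, 2.5, np.nan])

try:
    ser.astype("int64")
except pd.errors.IntCastingNaNError as err:
    print(err)    # Cannot convert non-finite values (NA or inf) to integer

print(ser.dropna().astype("int64").tolist())    # [1, 2] once the NaN is removed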
-
-def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
- """
- Cast array (ndarray or ExtensionArray) to the new dtype.
-
- Parameters
- ----------
- values : ndarray or ExtensionArray
- dtype : dtype object
- copy : bool, default False
- copy if indicated
-
- Returns
- -------
- ndarray or ExtensionArray
- """
- if is_dtype_equal(values.dtype, dtype):
- if copy:
- return values.copy()
- return values
-
- if not isinstance(values, np.ndarray):
- # i.e. ExtensionArray
- values = values.astype(dtype, copy=copy)
-
- else:
- values = _astype_nansafe(values, dtype, copy=copy)
-
- # in pandas we don't store numpy str dtypes, so convert to object
- if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str):
- values = np.array(values, dtype=object)
-
- return values
-
-
-def astype_array_safe(
- values: ArrayLike, dtype, copy: bool = False, errors: IgnoreRaise = "raise"
-) -> ArrayLike:
- """
- Cast array (ndarray or ExtensionArray) to the new dtype.
-
- This basically is the implementation for DataFrame/Series.astype and
- includes all custom logic for pandas (NaN-safety, converting str to object,
- not allowing )
-
- Parameters
- ----------
- values : ndarray or ExtensionArray
- dtype : str, dtype convertible
- copy : bool, default False
- copy if indicated
- errors : str, {'raise', 'ignore'}, default 'raise'
- - ``raise`` : allow exceptions to be raised
- - ``ignore`` : suppress exceptions. On error return original object
-
- Returns
- -------
- ndarray or ExtensionArray
- """
- errors_legal_values = ("raise", "ignore")
-
- if errors not in errors_legal_values:
- invalid_arg = (
- "Expected value of kwarg 'errors' to be one of "
- f"{list(errors_legal_values)}. Supplied value is '{errors}'"
- )
- raise ValueError(invalid_arg)
-
- if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype):
- msg = (
- f"Expected an instance of {dtype.__name__}, "
- "but got the class instead. Try instantiating 'dtype'."
- )
- raise TypeError(msg)
-
- dtype = pandas_dtype(dtype)
- if isinstance(dtype, PandasDtype):
- # Ensure we don't end up with a PandasArray
- dtype = dtype.numpy_dtype
-
- try:
- new_values = astype_array(values, dtype, copy=copy)
- except (ValueError, TypeError):
- # e.g. _astype_nansafe can fail on object-dtype of strings
- # trying to convert to float
- if errors == "ignore":
- new_values = values
- else:
- raise
-
- return new_values
-
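The ``errors`` handling above backs the public ``Series/DataFrame.astype(..., errors=...)``; a short sketch:

import pandas as pd

ser = pd.Series(["1", "2", "not-a-number"])

try:
    ser.astype("float64")                         # errors="raise" is the default
except ValueError as err:
    print("raised:", err)

print(ser.astype("float64", errors="ignore").dtype)   # object: original returned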
-
-def astype_is_view(dtype: DtypeObj, new_dtype: DtypeObj) -> bool:
- """Checks if astype avoided copying the data.
-
- Parameters
- ----------
- dtype : Original dtype
- new_dtype : target dtype
-
- Returns
- -------
- True if new data is a view or not guaranteed to be a copy, False otherwise
- """
- if isinstance(dtype, np.dtype) and not isinstance(new_dtype, np.dtype):
- new_dtype, dtype = dtype, new_dtype
-
- if dtype == new_dtype:
- return True
-
- elif isinstance(dtype, np.dtype) and isinstance(new_dtype, np.dtype):
- # Only equal numpy dtypes avoid a copy
- return False
-
- elif is_string_dtype(dtype) and is_string_dtype(new_dtype):
- # Potentially! a view when converting from object to string
- return True
-
- elif is_object_dtype(dtype) and new_dtype.kind == "O":
- # When the underlying array has dtype object, we don't have to make a copy
- return True
-
- elif dtype.kind in "mM" and new_dtype.kind in "mM":
- dtype = getattr(dtype, "numpy_dtype", dtype)
- new_dtype = getattr(new_dtype, "numpy_dtype", new_dtype)
- return getattr(dtype, "unit", None) == getattr(new_dtype, "unit", None)
-
- numpy_dtype = getattr(dtype, "numpy_dtype", None)
- new_numpy_dtype = getattr(new_dtype, "numpy_dtype", None)
-
- if numpy_dtype is None and isinstance(dtype, np.dtype):
- numpy_dtype = dtype
-
- if new_numpy_dtype is None and isinstance(new_dtype, np.dtype):
- new_numpy_dtype = new_dtype
-
- if numpy_dtype is not None and new_numpy_dtype is not None:
- # if both have NumPy dtype or one of them is a numpy dtype
- # they are only a view when the numpy dtypes are equal, e.g.
- # int64 -> Int64 or int64[pyarrow]
- # int64 -> Int32 copies
- return numpy_dtype == new_numpy_dtype
-
- # Assume this is a view since we don't know for sure if a copy was made
- return True
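A hedged illustration of the view check above; the function is private, so the import path is specific to this pandas version:

import numpy as np
import pandas as pd
from pandas.core.dtypes.astype import astype_is_view

print(astype_is_view(np.dtype("int64"), np.dtype("int64")))   # True: same dtype
print(astype_is_view(np.dtype("int64"), np.dtype("int32")))   # False: real cast
print(astype_is_view(np.dtype("int64"), pd.Int64Dtype()))     # True: same numpy repr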
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/base.py b/contrib/python/pandas/py3/pandas/core/dtypes/base.py
deleted file mode 100644
index bce2a82f057..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/base.py
+++ /dev/null
@@ -1,528 +0,0 @@
-"""
-Extend pandas with custom array types.
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Any,
- TypeVar,
- cast,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import missing as libmissing
-from pandas._libs.hashtable import object_hash
-from pandas._typing import (
- DtypeObj,
- Shape,
- npt,
- type_t,
-)
-from pandas.errors import AbstractMethodError
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCSeries,
-)
-
-if TYPE_CHECKING:
- from pandas.core.arrays import ExtensionArray
-
- # To parameterize on same ExtensionDtype
- ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype")
-
-
-class ExtensionDtype:
- """
- A custom data type, to be paired with an ExtensionArray.
-
- See Also
- --------
- extensions.register_extension_dtype: Register an ExtensionType
- with pandas as class decorator.
- extensions.ExtensionArray: Abstract base class for custom 1-D array types.
-
- Notes
- -----
- The interface includes the following abstract methods that must
- be implemented by subclasses:
-
- * type
- * name
- * construct_array_type
-
- The following attributes and methods influence the behavior of the dtype in
- pandas operations
-
- * _is_numeric
- * _is_boolean
- * _get_common_dtype
-
- The `na_value` class attribute can be used to set the default NA value
- for this type. :attr:`numpy.nan` is used by default.
-
- ExtensionDtypes are required to be hashable. The base class provides
- a default implementation, which relies on the ``_metadata`` class
- attribute. ``_metadata`` should be a tuple containing the strings
- that define your data type. For example, with ``PeriodDtype`` that's
- the ``freq`` attribute.
-
- **If you have a parametrized dtype you should set the ``_metadata``
- class property**.
-
- Ideally, the attributes in ``_metadata`` will match the
- parameters to your ``ExtensionDtype.__init__`` (if any). If any of
- the attributes in ``_metadata`` don't implement the standard
- ``__eq__`` or ``__hash__``, the default implementations here will not
- work.
-
- For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method
- can be implemented: this method receives a pyarrow Array or ChunkedArray
- as only argument and is expected to return the appropriate pandas
- ExtensionArray for this dtype and the passed values::
-
- class ExtensionDtype:
-
- def __from_arrow__(
- self, array: Union[pyarrow.Array, pyarrow.ChunkedArray]
- ) -> ExtensionArray:
- ...
-
- This class does not inherit from 'abc.ABCMeta' for performance reasons.
- Methods and properties required by the interface raise
- ``pandas.errors.AbstractMethodError`` and no ``register`` method is
- provided for registering virtual subclasses.
- """
-
- _metadata: tuple[str, ...] = ()
-
- def __str__(self) -> str:
- return self.name
-
- def __eq__(self, other: Any) -> bool:
- """
- Check whether 'other' is equal to self.
-
- By default, 'other' is considered equal if either
-
- * it's a string matching 'self.name'.
- * it's an instance of this type and all of the attributes
- in ``self._metadata`` are equal between `self` and `other`.
-
- Parameters
- ----------
- other : Any
-
- Returns
- -------
- bool
- """
- if isinstance(other, str):
- try:
- other = self.construct_from_string(other)
- except TypeError:
- return False
- if isinstance(other, type(self)):
- return all(
- getattr(self, attr) == getattr(other, attr) for attr in self._metadata
- )
- return False
-
- def __hash__(self) -> int:
- # for python>=3.10, different nan objects have different hashes
- # we need to avoid that and thus use hash function with old behavior
- return object_hash(tuple(getattr(self, attr) for attr in self._metadata))
-
- def __ne__(self, other: Any) -> bool:
- return not self.__eq__(other)
-
- @property
- def na_value(self) -> object:
- """
- Default NA value to use for this type.
-
- This is used in e.g. ExtensionArray.take. This should be the
- user-facing "boxed" version of the NA value, not the physical NA value
- for storage. e.g. for JSONArray, this is an empty dictionary.
- """
- return np.nan
-
- @property
- def type(self) -> type_t[Any]:
- """
- The scalar type for the array, e.g. ``int``
-
- It's expected ``ExtensionArray[item]`` returns an instance
- of ``ExtensionDtype.type`` for scalar ``item``, assuming
- that value is valid (not NA). NA values do not need to be
- instances of `type`.
- """
- raise AbstractMethodError(self)
-
- @property
- def kind(self) -> str:
- """
- A character code (one of 'biufcmMOSUV'), default 'O'
-
- This should match the NumPy dtype used when the array is
- converted to an ndarray, which is probably 'O' for object if
- the extension type cannot be represented as a built-in NumPy
- type.
-
- See Also
- --------
- numpy.dtype.kind
- """
- return "O"
-
- @property
- def name(self) -> str:
- """
- A string identifying the data type.
-
- Will be used for display in, e.g. ``Series.dtype``
- """
- raise AbstractMethodError(self)
-
- @property
- def names(self) -> list[str] | None:
- """
- Ordered list of field names, or None if there are no fields.
-
- This is for compatibility with NumPy arrays, and may be removed in the
- future.
- """
- return None
-
- @classmethod
- def construct_array_type(cls) -> type_t[ExtensionArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- raise AbstractMethodError(cls)
-
- def empty(self, shape: Shape) -> type_t[ExtensionArray]:
- """
- Construct an ExtensionArray of this dtype with the given shape.
-
- Analogous to numpy.empty.
-
- Parameters
- ----------
- shape : int or tuple[int]
-
- Returns
- -------
- ExtensionArray
- """
- cls = self.construct_array_type()
- return cls._empty(shape, dtype=self)
-
- @classmethod
- def construct_from_string(
- cls: type_t[ExtensionDtypeT], string: str
- ) -> ExtensionDtypeT:
- r"""
- Construct this type from a string.
-
- This is useful mainly for data types that accept parameters.
- For example, a period dtype accepts a frequency parameter that
- can be set as ``period[H]`` (where H means hourly frequency).
-
- By default, in the abstract class, just the name of the type is
- expected. But subclasses can overwrite this method to accept
- parameters.
-
- Parameters
- ----------
- string : str
- The name of the type, for example ``category``.
-
- Returns
- -------
- ExtensionDtype
- Instance of the dtype.
-
- Raises
- ------
- TypeError
- If a class cannot be constructed from this 'string'.
-
- Examples
- --------
- For extension dtypes with arguments the following may be an
- adequate implementation.
-
- >>> @classmethod
- ... def construct_from_string(cls, string):
- ... pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
- ... match = pattern.match(string)
- ... if match:
- ... return cls(**match.groupdict())
- ... else:
- ... raise TypeError(
- ... f"Cannot construct a '{cls.__name__}' from '{string}'"
- ... )
- """
- if not isinstance(string, str):
- raise TypeError(
- f"'construct_from_string' expects a string, got {type(string)}"
- )
- # error: Non-overlapping equality check (left operand type: "str", right
- # operand type: "Callable[[ExtensionDtype], str]") [comparison-overlap]
- assert isinstance(cls.name, str), (cls, type(cls.name))
- if string != cls.name:
- raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
- return cls()
-
- @classmethod
- def is_dtype(cls, dtype: object) -> bool:
- """
- Check if we match 'dtype'.
-
- Parameters
- ----------
- dtype : object
- The object to check.
-
- Returns
- -------
- bool
-
- Notes
- -----
- The default implementation is True if
-
- 1. ``cls.construct_from_string(dtype)`` is an instance
- of ``cls``.
- 2. ``dtype`` is an object and is an instance of ``cls``
- 3. ``dtype`` has a ``dtype`` attribute, and any of the above
- conditions is true for ``dtype.dtype``.
- """
- dtype = getattr(dtype, "dtype", dtype)
-
- if isinstance(dtype, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)):
- # https://github.com/pandas-dev/pandas/issues/22960
- # avoid passing data to `construct_from_string`. This could
- # cause a FutureWarning from numpy about failing elementwise
- # comparison from, e.g., comparing DataFrame == 'category'.
- return False
- elif dtype is None:
- return False
- elif isinstance(dtype, cls):
- return True
- if isinstance(dtype, str):
- try:
- return cls.construct_from_string(dtype) is not None
- except TypeError:
- return False
- return False
-
- @property
- def _is_numeric(self) -> bool:
- """
- Whether columns with this dtype should be considered numeric.
-
- By default ExtensionDtypes are assumed to be non-numeric.
- They'll be excluded from operations that exclude non-numeric
- columns, like (groupby) reductions, plotting, etc.
- """
- return False
-
- @property
- def _is_boolean(self) -> bool:
- """
- Whether this dtype should be considered boolean.
-
- By default, ExtensionDtypes are assumed to be non-numeric.
- Setting this to True will affect the behavior of several places,
- e.g.
-
- * is_bool
- * boolean indexing
-
- Returns
- -------
- bool
- """
- return False
-
- def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
- """
- Return the common dtype, if one exists.
-
- Used in `find_common_type` implementation. This is for example used
- to determine the resulting dtype in a concat operation.
-
- If no common dtype exists, return None (which gives the other dtypes
- the chance to determine a common dtype). If all dtypes in the list
- return None, then the common dtype will be "object" dtype (this means
- it is never needed to return "object" dtype from this method itself).
-
- Parameters
- ----------
- dtypes : list of dtypes
- The dtypes for which to determine a common dtype. This is a list
- of np.dtype or ExtensionDtype instances.
-
- Returns
- -------
- Common dtype (np.dtype or ExtensionDtype) or None
- """
- if len(set(dtypes)) == 1:
- # only itself
- return self
- else:
- return None
-
- @property
- def _can_hold_na(self) -> bool:
- """
- Can arrays of this dtype hold NA values?
- """
- return True
-
-
-class StorageExtensionDtype(ExtensionDtype):
- """ExtensionDtype that may be backed by more than one implementation."""
-
- name: str
- _metadata = ("storage",)
-
- def __init__(self, storage=None) -> None:
- self.storage = storage
-
- def __repr__(self) -> str:
- return f"{self.name}[{self.storage}]"
-
- def __str__(self) -> str:
- return self.name
-
- def __eq__(self, other: Any) -> bool:
- if isinstance(other, str) and other == self.name:
- return True
- return super().__eq__(other)
-
- def __hash__(self) -> int:
- # custom __eq__ so have to override __hash__
- return super().__hash__()
-
- @property
- def na_value(self) -> libmissing.NAType:
- return libmissing.NA
-
-
-def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
- """
- Register an ExtensionType with pandas as class decorator.
-
- This enables operations like ``.astype(name)`` for the name
- of the ExtensionDtype.
-
- Returns
- -------
- callable
- A class decorator.
-
- Examples
- --------
- >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
- >>> @register_extension_dtype
- ... class MyExtensionDtype(ExtensionDtype):
- ... name = "myextension"
- """
- _registry.register(cls)
- return cls
-
-
-class Registry:
- """
- Registry for dtype inference.
-
- The registry allows one to map a string repr of an extension
- dtype to an extension dtype. The string alias can be used in several
- places, including
-
- * Series and Index constructors
- * :meth:`pandas.array`
- * :meth:`pandas.Series.astype`
-
- Multiple extension types can be registered.
- These are tried in order.
- """
-
- def __init__(self) -> None:
- self.dtypes: list[type_t[ExtensionDtype]] = []
-
- def register(self, dtype: type_t[ExtensionDtype]) -> None:
- """
- Parameters
- ----------
- dtype : ExtensionDtype class
- """
- if not issubclass(dtype, ExtensionDtype):
- raise ValueError("can only register pandas extension dtypes")
-
- self.dtypes.append(dtype)
-
- @overload
- def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
- ...
-
- @overload
- def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT:
- ...
-
- @overload
- def find(self, dtype: str) -> ExtensionDtype | None:
- ...
-
- @overload
- def find(
- self, dtype: npt.DTypeLike
- ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
- ...
-
- def find(
- self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike
- ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
- """
- Parameters
- ----------
- dtype : ExtensionDtype class or instance or str or numpy dtype or python type
-
- Returns
- -------
- The first matching dtype, otherwise None.
- """
- if not isinstance(dtype, str):
- dtype_type: type_t
- if not isinstance(dtype, type):
- dtype_type = type(dtype)
- else:
- dtype_type = dtype
- if issubclass(dtype_type, ExtensionDtype):
- # cast needed here as mypy doesn't know we have figured
- # out it is an ExtensionDtype or type_t[ExtensionDtype]
- return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype)
-
- return None
-
- for dtype_type in self.dtypes:
- try:
- return dtype_type.construct_from_string(dtype)
- except TypeError:
- pass
-
- return None
-
-
-_registry = Registry()
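The base-class defaults above (``construct_from_string`` accepting only the exact ``name``, and ``is_dtype`` trying string construction before falling back to ``isinstance``) are easiest to see with an already-registered dtype such as ``CategoricalDtype``. A minimal sketch against the pandas version removed in this diff (~2.0); ``_registry`` is internal, and the outputs in the comments are expected rather than verified:

    import numpy as np
    import pandas as pd
    from pandas import CategoricalDtype
    from pandas.core.dtypes.base import _registry

    # String aliases registered via @register_extension_dtype resolve here.
    _registry.find("category")      # a CategoricalDtype instance
    _registry.find("not-a-dtype")   # None: no registered dtype accepts the string

    # Default is_dtype: strings go through construct_from_string,
    # other objects fall back to isinstance checks on the class.
    CategoricalDtype.is_dtype("category")                              # True
    CategoricalDtype.is_dtype(np.dtype("int64"))                       # False
    CategoricalDtype.is_dtype(pd.Series([1], dtype="category").dtype)  # True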
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/cast.py b/contrib/python/pandas/py3/pandas/core/dtypes/cast.py
deleted file mode 100644
index 75dd4cfb23a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/cast.py
+++ /dev/null
@@ -1,1921 +0,0 @@
-"""
-Routines for casting.
-"""
-
-from __future__ import annotations
-
-import datetime as dt
-import functools
-from typing import (
- TYPE_CHECKING,
- Any,
- Literal,
- Sized,
- TypeVar,
- cast,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.missing import (
- NA,
- NAType,
- checknull,
-)
-from pandas._libs.tslibs import (
- NaT,
- OutOfBoundsDatetime,
- OutOfBoundsTimedelta,
- Timedelta,
- Timestamp,
- get_unit_from_dtype,
- is_supported_unit,
-)
-from pandas._libs.tslibs.timedeltas import array_to_timedelta64
-from pandas._typing import (
- ArrayLike,
- Dtype,
- DtypeObj,
- NumpyIndexT,
- Scalar,
- npt,
-)
-from pandas.errors import (
- IntCastingNaNError,
- LossySetitemError,
-)
-
-from pandas.core.dtypes.common import (
- ensure_int8,
- ensure_int16,
- ensure_int32,
- ensure_int64,
- ensure_object,
- ensure_str,
- is_bool,
- is_bool_dtype,
- is_complex,
- is_complex_dtype,
- is_datetime64_dtype,
- is_extension_array_dtype,
- is_float,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- is_signed_integer_dtype,
- is_string_dtype,
- is_timedelta64_dtype,
- is_unsigned_integer_dtype,
- pandas_dtype as pandas_dtype_func,
-)
-from pandas.core.dtypes.dtypes import (
- BaseMaskedDtype,
- CategoricalDtype,
- DatetimeTZDtype,
- ExtensionDtype,
- IntervalDtype,
- PandasExtensionDtype,
- PeriodDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCExtensionArray,
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.inference import is_list_like
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
- na_value_for_dtype,
- notna,
-)
-
-from pandas.io._util import _arrow_dtype_mapping
-
-if TYPE_CHECKING:
- from pandas import Index
- from pandas.core.arrays import (
- Categorical,
- DatetimeArray,
- ExtensionArray,
- IntervalArray,
- PeriodArray,
- TimedeltaArray,
- )
-
-
-_int8_max = np.iinfo(np.int8).max
-_int16_max = np.iinfo(np.int16).max
-_int32_max = np.iinfo(np.int32).max
-_int64_max = np.iinfo(np.int64).max
-
-_dtype_obj = np.dtype(object)
-
-NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray)
-
-
-def maybe_convert_platform(
- values: list | tuple | range | np.ndarray | ExtensionArray,
-) -> ArrayLike:
- """try to do platform conversion, allow ndarray or list here"""
- arr: ArrayLike
-
- if isinstance(values, (list, tuple, range)):
- arr = construct_1d_object_array_from_listlike(values)
- else:
- # The caller is responsible for ensuring that we have np.ndarray
- # or ExtensionArray here.
- arr = values
-
- if arr.dtype == _dtype_obj:
- arr = cast(np.ndarray, arr)
- arr = lib.maybe_convert_objects(arr)
-
- return arr
-
-
-def is_nested_object(obj) -> bool:
- """
- return a boolean if we have a nested object, e.g. a Series with 1 or
- more Series elements
-
- This may not necessarily be performant.
-
- """
- return bool(
- isinstance(obj, ABCSeries)
- and is_object_dtype(obj.dtype)
- and any(isinstance(v, ABCSeries) for v in obj._values)
- )
-
-
-def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar:
- """
- Cast scalar to Timestamp or Timedelta if scalar is datetime-like
- and dtype is not object.
-
- Parameters
- ----------
- value : scalar
- dtype : Dtype, optional
-
- Returns
- -------
- scalar
- """
- if dtype == _dtype_obj:
- pass
- elif isinstance(value, (np.datetime64, dt.datetime)):
- value = Timestamp(value)
- elif isinstance(value, (np.timedelta64, dt.timedelta)):
- value = Timedelta(value)
-
- return value
-
-
-def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType:
- """
- If passed a scalar cast the scalar to a python native type.
-
- Parameters
- ----------
- value : scalar or Series
-
- Returns
- -------
- scalar or Series
- """
- if is_float(value):
- # error: Argument 1 to "float" has incompatible type
- # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]";
- # expected "Union[SupportsFloat, _SupportsIndex, str]"
- value = float(value) # type: ignore[arg-type]
- elif is_integer(value):
- # error: Argument 1 to "int" has incompatible type
- # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]";
- # expected "Union[str, SupportsInt, _SupportsIndex, _SupportsTrunc]"
- value = int(value) # type: ignore[arg-type]
- elif is_bool(value):
- value = bool(value)
- elif isinstance(value, (np.datetime64, np.timedelta64)):
- value = maybe_box_datetimelike(value)
- elif value is NA:
- value = None
- return value
-
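A short illustration of the two boxing helpers above, sketched against the internal ``pandas.core.dtypes.cast`` module from the sources removed in this diff (results in comments are expected, not verified):

    import numpy as np
    import pandas as pd
    from pandas.core.dtypes.cast import maybe_box_datetimelike, maybe_box_native

    maybe_box_native(np.int64(3))                  # 3, a Python int
    maybe_box_native(np.float64(1.5))              # 1.5, a Python float
    maybe_box_native(np.datetime64("2021-01-01"))  # Timestamp("2021-01-01")
    maybe_box_native(pd.NA)                        # None

    # maybe_box_datetimelike leaves the value alone for an object target dtype.
    maybe_box_datetimelike(np.datetime64("2021-01-01"), dtype=np.dtype(object))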
-
-def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar:
- """
- Convert a Timedelta or Timestamp to timedelta64 or datetime64 for setting
- into a numpy array. Failing to unbox would risk dropping nanoseconds.
-
- Notes
- -----
- Caller is responsible for checking dtype.kind in ["m", "M"]
- """
- if is_valid_na_for_dtype(value, dtype):
- # GH#36541: can't fill array directly with pd.NaT
- # > np.empty(10, dtype="datetime64[ns]").fill(pd.NaT)
- # ValueError: cannot convert float NaN to integer
- value = dtype.type("NaT", "ns")
- elif isinstance(value, Timestamp):
- if value.tz is None:
- value = value.to_datetime64()
- elif not isinstance(dtype, DatetimeTZDtype):
- raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype")
- elif isinstance(value, Timedelta):
- value = value.to_timedelta64()
-
- _disallow_mismatched_datetimelike(value, dtype)
- return value
-
-
-def _disallow_mismatched_datetimelike(value, dtype: DtypeObj):
- """
- numpy allows np.array(dt64values, dtype="timedelta64[ns]") and
- vice-versa, but we do not want to allow this, so we need to
- check explicitly
- """
- vdtype = getattr(value, "dtype", None)
- if vdtype is None:
- return
- elif (vdtype.kind == "m" and dtype.kind == "M") or (
- vdtype.kind == "M" and dtype.kind == "m"
- ):
- raise TypeError(f"Cannot cast {repr(value)} to {dtype}")
-
-
-@overload
-def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray:
- ...
-
-
-@overload
-def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike:
- ...
-
-
-def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike:
- """
- try to cast to the specified dtype (e.g. convert back to bool/int,
- or an astype of float64->float32)
- """
- do_round = False
-
- if isinstance(dtype, str):
- if dtype == "infer":
- inferred_type = lib.infer_dtype(result, skipna=False)
- if inferred_type == "boolean":
- dtype = "bool"
- elif inferred_type == "integer":
- dtype = "int64"
- elif inferred_type == "datetime64":
- dtype = "datetime64[ns]"
- elif inferred_type in ["timedelta", "timedelta64"]:
- dtype = "timedelta64[ns]"
-
- # try to upcast here
- elif inferred_type == "floating":
- dtype = "int64"
- if issubclass(result.dtype.type, np.number):
- do_round = True
-
- else:
- # TODO: complex? what if result is already non-object?
- dtype = "object"
-
- dtype = np.dtype(dtype)
-
- if not isinstance(dtype, np.dtype):
- # enforce our signature annotation
- raise TypeError(dtype) # pragma: no cover
-
- converted = maybe_downcast_numeric(result, dtype, do_round)
- if converted is not result:
- return converted
-
- # a datetimelike
- # GH12821, iNaT is cast to float
- if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
- result = result.astype(dtype)
-
- elif dtype.kind == "m" and result.dtype == _dtype_obj:
- # test_where_downcast_to_td64
- result = cast(np.ndarray, result)
- result = array_to_timedelta64(result)
-
- elif dtype == np.dtype("M8[ns]") and result.dtype == _dtype_obj:
- result = cast(np.ndarray, result)
- return np.asarray(maybe_cast_to_datetime(result, dtype=dtype))
-
- return result
-
-
-@overload
-def maybe_downcast_numeric(
- result: np.ndarray, dtype: np.dtype, do_round: bool = False
-) -> np.ndarray:
- ...
-
-
-@overload
-def maybe_downcast_numeric(
- result: ExtensionArray, dtype: DtypeObj, do_round: bool = False
-) -> ArrayLike:
- ...
-
-
-def maybe_downcast_numeric(
- result: ArrayLike, dtype: DtypeObj, do_round: bool = False
-) -> ArrayLike:
- """
- Subset of maybe_downcast_to_dtype restricted to numeric dtypes.
-
- Parameters
- ----------
- result : ndarray or ExtensionArray
- dtype : np.dtype or ExtensionDtype
- do_round : bool
-
- Returns
- -------
- ndarray or ExtensionArray
- """
- if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype):
- # e.g. SparseDtype has no itemsize attr
- return result
-
- def trans(x):
- if do_round:
- return x.round()
- return x
-
- if dtype.kind == result.dtype.kind:
- # don't allow upcasts here (except if empty)
- if result.dtype.itemsize <= dtype.itemsize and result.size:
- return result
-
- if is_bool_dtype(dtype) or is_integer_dtype(dtype):
- if not result.size:
- # if we don't have any elements, just astype it
- return trans(result).astype(dtype)
-
- # do a test on the first element, if it fails then we are done
- r = result.ravel()
- arr = np.array([r[0]])
-
- if isna(arr).any():
- # if we have any nulls, then we are done
- return result
-
- elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)):
- # a comparable, e.g. a Decimal may slip in here
- return result
-
- if (
- issubclass(result.dtype.type, (np.object_, np.number))
- and notna(result).all()
- ):
- new_result = trans(result).astype(dtype)
- if new_result.dtype.kind == "O" or result.dtype.kind == "O":
- # np.allclose may raise TypeError on object-dtype
- if (new_result == result).all():
- return new_result
- else:
- if np.allclose(new_result, result, rtol=0):
- return new_result
-
- elif (
- issubclass(dtype.type, np.floating)
- and not is_bool_dtype(result.dtype)
- and not is_string_dtype(result.dtype)
- ):
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore", "overflow encountered in cast", RuntimeWarning
- )
- new_result = result.astype(dtype)
-
- # Adjust tolerances based on floating point size
- size_tols = {4: 5e-4, 8: 5e-8, 16: 5e-16}
-
- atol = size_tols.get(new_result.dtype.itemsize, 0.0)
-
- # Check downcast float values are still equal within 7 digits when
- # converting from float64 to float32
- if np.allclose(new_result, result, equal_nan=True, rtol=0.0, atol=atol):
- return new_result
-
- elif dtype.kind == result.dtype.kind == "c":
- new_result = result.astype(dtype)
-
- if np.array_equal(new_result, result, equal_nan=True):
- # TODO: use tolerance like we do for float?
- return new_result
-
- return result
-
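A small sketch of the lossless-downcast rule described above, using the internal helper from the removed module (expected results in comments):

    import numpy as np
    from pandas.core.dtypes.cast import maybe_downcast_numeric

    # Integral floats downcast losslessly to the requested integer dtype.
    maybe_downcast_numeric(np.array([1.0, 2.0, 3.0]), np.dtype("int64"))  # array([1, 2, 3])

    # Values that would lose precision are returned unchanged.
    maybe_downcast_numeric(np.array([1.5, 2.0]), np.dtype("int64"))       # array([1.5, 2. ])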
-
-def maybe_upcast_numeric_to_64bit(arr: NumpyIndexT) -> NumpyIndexT:
- """
- If the array has an int/uint/float dtype smaller than 64 bits, upcast it to 64 bits.
-
- Parameters
- ----------
- arr : ndarray or ExtensionArray
-
- Returns
- -------
- ndarray or ExtensionArray
- """
- dtype = arr.dtype
- if is_signed_integer_dtype(dtype) and dtype != np.int64:
- return arr.astype(np.int64)
- elif is_unsigned_integer_dtype(dtype) and dtype != np.uint64:
- return arr.astype(np.uint64)
- elif is_float_dtype(dtype) and dtype != np.float64:
- return arr.astype(np.float64)
- else:
- return arr
-
-
-def maybe_cast_pointwise_result(
- result: ArrayLike,
- dtype: DtypeObj,
- numeric_only: bool = False,
- same_dtype: bool = True,
-) -> ArrayLike:
- """
- Try casting result of a pointwise operation back to the original dtype if
- appropriate.
-
- Parameters
- ----------
- result : array-like
- Result to cast.
- dtype : np.dtype or ExtensionDtype
- Dtype of the original values from which ``result`` was calculated.
- numeric_only : bool, default False
- Whether to cast only numerics or datetimes as well.
- same_dtype : bool, default True
- Specify dtype when calling _from_sequence
-
- Returns
- -------
- result : array-like
- The result, possibly cast to the dtype.
- """
-
- assert not is_scalar(result)
-
- if isinstance(dtype, ExtensionDtype):
- if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
- # TODO: avoid this special-casing
- # We have to special case categorical so as not to upcast
- # things like counts back to categorical
-
- cls = dtype.construct_array_type()
- if same_dtype:
- result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
- else:
- result = maybe_cast_to_extension_array(cls, result)
-
- elif (numeric_only and is_numeric_dtype(dtype)) or not numeric_only:
- result = maybe_downcast_to_dtype(result, dtype)
-
- return result
-
-
-def maybe_cast_to_extension_array(
- cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
-) -> ArrayLike:
- """
- Call to `_from_sequence` that returns the object unchanged on Exception.
-
- Parameters
- ----------
- cls : class, subclass of ExtensionArray
- obj : arraylike
- Values to pass to cls._from_sequence
- dtype : ExtensionDtype, optional
-
- Returns
- -------
- ExtensionArray or obj
- """
- from pandas.core.arrays.string_ import BaseStringArray
-
- assert isinstance(cls, type), f"must pass a type: {cls}"
- assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
- assert issubclass(cls, ABCExtensionArray), assertion_msg
-
- # Everything can be converted to StringArrays, but we may not want to convert
- if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
- return obj
-
- try:
- result = cls._from_sequence(obj, dtype=dtype)
- except Exception:
- # We can't predict what downstream EA constructors may raise
- result = obj
- return result
-
-
-@overload
-def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype:
- ...
-
-
-@overload
-def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype:
- ...
-
-
-def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj:
- """
- If we have a dtype that cannot hold NA values, find the best match that can.
- """
- if isinstance(dtype, ExtensionDtype):
- if dtype._can_hold_na:
- return dtype
- elif isinstance(dtype, IntervalDtype):
- # TODO(GH#45349): don't special-case IntervalDtype, allow
- # overriding instead of returning object below.
- return IntervalDtype(np.float64, closed=dtype.closed)
- return _dtype_obj
- elif dtype.kind == "b":
- return _dtype_obj
- elif dtype.kind in ["i", "u"]:
- return np.dtype(np.float64)
- return dtype
-
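A minimal sketch of the NA-promotion rules above (internal helper; expected results in comments):

    import numpy as np
    from pandas.core.dtypes.cast import ensure_dtype_can_hold_na

    ensure_dtype_can_hold_na(np.dtype("int64"))    # float64
    ensure_dtype_can_hold_na(np.dtype("bool"))     # object
    ensure_dtype_can_hold_na(np.dtype("float32"))  # float32, already NA-capable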
-
-_canonical_nans = {
- np.datetime64: np.datetime64("NaT", "ns"),
- np.timedelta64: np.timedelta64("NaT", "ns"),
- type(np.nan): np.nan,
-}
-
-
-def maybe_promote(dtype: np.dtype, fill_value=np.nan):
- """
- Find the minimal dtype that can hold both the given dtype and fill_value.
-
- Parameters
- ----------
- dtype : np.dtype
- fill_value : scalar, default np.nan
-
- Returns
- -------
- dtype
- Upcasted from dtype argument if necessary.
- fill_value
- Upcasted from fill_value argument if necessary.
-
- Raises
- ------
- ValueError
- If fill_value is a non-scalar and dtype is not object.
- """
- orig = fill_value
- orig_is_nat = False
- if checknull(fill_value):
- # https://github.com/pandas-dev/pandas/pull/39692#issuecomment-1441051740
- # avoid cache misses with NaN/NaT values that are not singletons
- if fill_value is not NA:
- try:
- orig_is_nat = np.isnat(fill_value)
- except TypeError:
- pass
-
- fill_value = _canonical_nans.get(type(fill_value), fill_value)
-
- # for performance, we are using a cached version of the actual implementation
- # of the function in _maybe_promote. However, this doesn't always work (in case
- # of non-hashable arguments), so we fall back to the actual implementation if needed
- try:
- # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type
- # "Type[Any]"; expected "Hashable" [arg-type]
- dtype, fill_value = _maybe_promote_cached(
- dtype, fill_value, type(fill_value) # type: ignore[arg-type]
- )
- except TypeError:
- # if fill_value is not hashable (required for caching)
- dtype, fill_value = _maybe_promote(dtype, fill_value)
-
- if (dtype == _dtype_obj and orig is not None) or (
- orig_is_nat and np.datetime_data(orig)[0] != "ns"
- ):
- # GH#51592,53497 restore our potentially non-canonical fill_value
- fill_value = orig
- return dtype, fill_value
-
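A sketch of ``maybe_promote`` on a few representative inputs, assuming the behaviour documented above (internal helper; outputs are expected, not verified):

    import numpy as np
    from pandas.core.dtypes.cast import maybe_promote

    maybe_promote(np.dtype("int64"), np.nan)  # (dtype('float64'), nan)
    maybe_promote(np.dtype("int8"), -1000)    # (dtype('int16'), -1000)
    maybe_promote(np.dtype("bool"), "x")      # (dtype('O'), 'x')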
-
-@functools.lru_cache(maxsize=128)
-def _maybe_promote_cached(dtype, fill_value, fill_value_type):
- # The cached version of _maybe_promote below
- # This also uses fill_value_type as an (otherwise unused) argument for the
- # cache lookup -> to differentiate e.g. 1 and True
- return _maybe_promote(dtype, fill_value)
-
-
-def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
- # The actual implementation of the function, use `maybe_promote` above for
- # a cached version.
- if not is_scalar(fill_value):
- # with object dtype there is nothing to promote, and the user can
- # pass pretty much any weird fill_value they like
- if not is_object_dtype(dtype):
- raise ValueError("fill_value must be a scalar")
- dtype = _dtype_obj
- return dtype, fill_value
-
- kinds = ["i", "u", "f", "c", "m", "M"]
- if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in kinds:
- dtype = ensure_dtype_can_hold_na(dtype)
- fv = na_value_for_dtype(dtype)
- return dtype, fv
-
- elif isinstance(dtype, CategoricalDtype):
- if fill_value in dtype.categories or isna(fill_value):
- return dtype, fill_value
- else:
- return object, ensure_object(fill_value)
-
- elif isna(fill_value):
- dtype = _dtype_obj
- if fill_value is None:
- # but we retain e.g. pd.NA
- fill_value = np.nan
- return dtype, fill_value
-
- # returns tuple of (dtype, fill_value)
- if issubclass(dtype.type, np.datetime64):
- inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
- if inferred == dtype:
- return dtype, fv
-
- from pandas.core.arrays import DatetimeArray
-
- dta = DatetimeArray._from_sequence([], dtype="M8[ns]")
- try:
- fv = dta._validate_setitem_value(fill_value)
- return dta.dtype, fv
- except (ValueError, TypeError):
- return _dtype_obj, fill_value
-
- elif issubclass(dtype.type, np.timedelta64):
- inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
- if inferred == dtype:
- return dtype, fv
-
- return np.dtype("object"), fill_value
-
- elif is_float(fill_value):
- if issubclass(dtype.type, np.bool_):
- dtype = np.dtype(np.object_)
-
- elif issubclass(dtype.type, np.integer):
- dtype = np.dtype(np.float64)
-
- elif dtype.kind == "f":
- mst = np.min_scalar_type(fill_value)
- if mst > dtype:
- # e.g. mst is np.float64 and dtype is np.float32
- dtype = mst
-
- elif dtype.kind == "c":
- mst = np.min_scalar_type(fill_value)
- dtype = np.promote_types(dtype, mst)
-
- elif is_bool(fill_value):
- if not issubclass(dtype.type, np.bool_):
- dtype = np.dtype(np.object_)
-
- elif is_integer(fill_value):
- if issubclass(dtype.type, np.bool_):
- dtype = np.dtype(np.object_)
-
- elif issubclass(dtype.type, np.integer):
- if not np.can_cast(fill_value, dtype):
- # upcast to prevent overflow
- mst = np.min_scalar_type(fill_value)
- dtype = np.promote_types(dtype, mst)
- if dtype.kind == "f":
- # Case where we disagree with numpy
- dtype = np.dtype(np.object_)
-
- elif is_complex(fill_value):
- if issubclass(dtype.type, np.bool_):
- dtype = np.dtype(np.object_)
-
- elif issubclass(dtype.type, (np.integer, np.floating)):
- mst = np.min_scalar_type(fill_value)
- dtype = np.promote_types(dtype, mst)
-
- elif dtype.kind == "c":
- mst = np.min_scalar_type(fill_value)
- if mst > dtype:
- # e.g. mst is np.complex128 and dtype is np.complex64
- dtype = mst
-
- else:
- dtype = np.dtype(np.object_)
-
- # in case we have a string that looked like a number
- if issubclass(dtype.type, (bytes, str)):
- dtype = np.dtype(np.object_)
-
- fill_value = _ensure_dtype_type(fill_value, dtype)
- return dtype, fill_value
-
-
-def _ensure_dtype_type(value, dtype: np.dtype):
- """
- Ensure that the given value is an instance of the given dtype.
-
- e.g. if our dtype is np.complex64, we should have an instance of that
- as opposed to a python complex object.
-
- Parameters
- ----------
- value : object
- dtype : np.dtype
-
- Returns
- -------
- object
- """
- # Start with exceptions in which we do _not_ cast to numpy types
-
- if dtype == _dtype_obj:
- return value
-
- # Note: before we get here we have already excluded isna(value)
- return dtype.type(value)
-
-
-def infer_dtype_from(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
- """
- Interpret the dtype from a scalar or array.
-
- Parameters
- ----------
- val : object
- pandas_dtype : bool, default False
- whether to infer dtype including pandas extension types.
- If False, a scalar/array belonging to a pandas extension type is
- inferred as object.
- """
- if not is_list_like(val):
- return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
- return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
-
-
-def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
- """
- Interpret the dtype from a scalar.
-
- Parameters
- ----------
- val : object
- pandas_dtype : bool, default False
- whether to infer dtype including pandas extension types.
- If False, a scalar belonging to a pandas extension type is inferred as
- object.
- """
- dtype: DtypeObj = _dtype_obj
-
- # a 1-element ndarray
- if isinstance(val, np.ndarray):
- if val.ndim != 0:
- msg = "invalid ndarray passed to infer_dtype_from_scalar"
- raise ValueError(msg)
-
- dtype = val.dtype
- val = lib.item_from_zerodim(val)
-
- elif isinstance(val, str):
- # If we create an empty array using a string to infer
- # the dtype, NumPy will only allocate one character per entry
- # so this is kind of bad. Alternatively we could use np.repeat
- # instead of np.empty (but then you still don't want things
- # coming out as np.str_!)
-
- dtype = _dtype_obj
-
- elif isinstance(val, (np.datetime64, dt.datetime)):
- try:
- val = Timestamp(val)
- if val is not NaT:
- val = val.as_unit("ns")
- except OutOfBoundsDatetime:
- return _dtype_obj, val
-
- if val is NaT or val.tz is None:
- val = val.to_datetime64()
- dtype = val.dtype
- # TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
- else:
- if pandas_dtype:
- dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
- else:
- # return datetimetz as object
- return _dtype_obj, val
-
- elif isinstance(val, (np.timedelta64, dt.timedelta)):
- try:
- val = Timedelta(val)
- except (OutOfBoundsTimedelta, OverflowError):
- dtype = _dtype_obj
- else:
- dtype = np.dtype("m8[ns]")
- val = np.timedelta64(val.value, "ns")
-
- elif is_bool(val):
- dtype = np.dtype(np.bool_)
-
- elif is_integer(val):
- if isinstance(val, np.integer):
- dtype = np.dtype(type(val))
- else:
- dtype = np.dtype(np.int64)
-
- try:
- np.array(val, dtype=dtype)
- except OverflowError:
- dtype = np.array(val).dtype
-
- elif is_float(val):
- if isinstance(val, np.floating):
- dtype = np.dtype(type(val))
- else:
- dtype = np.dtype(np.float64)
-
- elif is_complex(val):
- dtype = np.dtype(np.complex_)
-
- elif pandas_dtype:
- if lib.is_period(val):
- dtype = PeriodDtype(freq=val.freq)
- elif lib.is_interval(val):
- subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
- dtype = IntervalDtype(subtype=subtype, closed=val.closed)
-
- return dtype, val
-
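The effect of the ``pandas_dtype`` switch described above, sketched against the removed module (expected outputs in comments):

    import pandas as pd
    from pandas.core.dtypes.cast import infer_dtype_from_scalar

    infer_dtype_from_scalar(5)      # (dtype('int64'), 5)
    infer_dtype_from_scalar(1.5)    # (dtype('float64'), 1.5)
    infer_dtype_from_scalar("a")    # (dtype('O'), 'a')

    ts = pd.Timestamp("2021-01-01", tz="UTC")
    infer_dtype_from_scalar(ts)                     # (dtype('O'), ts): tz-aware stays object
    infer_dtype_from_scalar(ts, pandas_dtype=True)  # (datetime64[ns, UTC], ts)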
-
-def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
- """
- Convert datetimelike-keyed dicts to a Timestamp-keyed dict.
-
- Parameters
- ----------
- d: dict-like object
-
- Returns
- -------
- dict
- """
- return {maybe_box_datetimelike(key): value for key, value in d.items()}
-
-
-def infer_dtype_from_array(
- arr, pandas_dtype: bool = False
-) -> tuple[DtypeObj, ArrayLike]:
- """
- Infer the dtype from an array.
-
- Parameters
- ----------
- arr : array
- pandas_dtype : bool, default False
- whether to infer dtype including pandas extension types.
- If False, an array belonging to a pandas extension type
- is inferred as object.
-
- Returns
- -------
- tuple (numpy-compat/pandas-compat dtype, array)
-
- Notes
- -----
- If pandas_dtype=False, these infer to numpy dtypes
- exactly, with the exception that mixed / object dtypes
- are not coerced by stringifying or conversion.
-
- If pandas_dtype=True, datetime64tz-aware/categorical
- types will retain their character.
-
- Examples
- --------
- >>> np.asarray([1, '1'])
- array(['1', '1'], dtype='<U21')
-
- >>> infer_dtype_from_array([1, '1'])
- (dtype('O'), [1, '1'])
- """
- if isinstance(arr, np.ndarray):
- return arr.dtype, arr
-
- if not is_list_like(arr):
- raise TypeError("'arr' must be list-like")
-
- if pandas_dtype and is_extension_array_dtype(arr):
- return arr.dtype, arr
-
- elif isinstance(arr, ABCSeries):
- return arr.dtype, np.asarray(arr)
-
- # don't force numpy coerce with nan's
- inferred = lib.infer_dtype(arr, skipna=False)
- if inferred in ["string", "bytes", "mixed", "mixed-integer"]:
- return (np.dtype(np.object_), arr)
-
- arr = np.asarray(arr)
- return arr.dtype, arr
-
-
-def _maybe_infer_dtype_type(element):
- """
- Try to infer an object's dtype, for use in arithmetic ops.
-
- Uses `element.dtype` if that's available.
- Objects implementing the iterator protocol are cast to a NumPy array,
- and from there the array's type is used.
-
- Parameters
- ----------
- element : object
- Possibly has a `.dtype` attribute, and possibly the iterator
- protocol.
-
- Returns
- -------
- tipo : type
-
- Examples
- --------
- >>> from collections import namedtuple
- >>> Foo = namedtuple("Foo", "dtype")
- >>> _maybe_infer_dtype_type(Foo(np.dtype("i8")))
- dtype('int64')
- """
- tipo = None
- if hasattr(element, "dtype"):
- tipo = element.dtype
- elif is_list_like(element):
- element = np.asarray(element)
- tipo = element.dtype
- return tipo
-
-
-def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
- """
- Change string like dtypes to object for
- ``DataFrame.select_dtypes()``.
- """
- # error: Argument 1 to <set> has incompatible type "Type[generic]"; expected
- # "Union[dtype[Any], ExtensionDtype, None]"
- # error: Argument 2 to <set> has incompatible type "Type[generic]"; expected
- # "Union[dtype[Any], ExtensionDtype, None]"
- non_string_dtypes = dtype_set - {
- np.dtype("S").type, # type: ignore[arg-type]
- np.dtype("<U").type, # type: ignore[arg-type]
- }
- if non_string_dtypes != dtype_set:
- raise TypeError("string dtypes are not allowed, use 'object' instead")
-
-
-def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
- """coerce the indexer input array to the smallest dtype possible"""
- length = len(categories)
- if length < _int8_max:
- return ensure_int8(indexer)
- elif length < _int16_max:
- return ensure_int16(indexer)
- elif length < _int32_max:
- return ensure_int32(indexer)
- return ensure_int64(indexer)
-
-
-def convert_dtypes(
- input_array: ArrayLike,
- convert_string: bool = True,
- convert_integer: bool = True,
- convert_boolean: bool = True,
- convert_floating: bool = True,
- infer_objects: bool = False,
- dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
-) -> DtypeObj:
- """
- Convert objects to best possible type, and optionally,
- to types supporting ``pd.NA``.
-
- Parameters
- ----------
- input_array : ExtensionArray or np.ndarray
- convert_string : bool, default True
- Whether object dtypes should be converted to ``StringDtype()``.
- convert_integer : bool, default True
- Whether, if possible, conversion can be done to integer extension types.
- convert_boolean : bool, default True
- Whether object dtypes should be converted to ``BooleanDtype()``.
- convert_floating : bool, default True
- Whether, if possible, conversion can be done to floating extension types.
- If `convert_integer` is also True, preference will be given to integer
- dtypes if the floats can be faithfully cast to integers.
- infer_objects : bool, default False
- Whether to also infer objects to float/int if possible. Is only hit if the
- object array contains pd.NA.
- dtype_backend : str, default "numpy_nullable"
- Nullable dtype implementation to use.
-
- * "numpy_nullable" returns numpy-backed nullable types
- * "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype``
-
- Returns
- -------
- np.dtype, or ExtensionDtype
- """
- inferred_dtype: str | DtypeObj
-
- from pandas.core.arrays.arrow.dtype import ArrowDtype
-
- if (
- convert_string or convert_integer or convert_boolean or convert_floating
- ) and isinstance(input_array, np.ndarray):
- if is_object_dtype(input_array.dtype):
- inferred_dtype = lib.infer_dtype(input_array)
- else:
- inferred_dtype = input_array.dtype
-
- if is_string_dtype(inferred_dtype):
- if not convert_string or inferred_dtype == "bytes":
- inferred_dtype = input_array.dtype
- else:
- inferred_dtype = pandas_dtype_func("string")
-
- if convert_integer:
- target_int_dtype = pandas_dtype_func("Int64")
-
- if is_integer_dtype(input_array.dtype):
- from pandas.core.arrays.integer import INT_STR_TO_DTYPE
-
- inferred_dtype = INT_STR_TO_DTYPE.get(
- input_array.dtype.name, target_int_dtype
- )
- elif is_numeric_dtype(input_array.dtype):
- # TODO: de-dup with maybe_cast_to_integer_array?
- arr = input_array[notna(input_array)]
- if (arr.astype(int) == arr).all():
- inferred_dtype = target_int_dtype
- else:
- inferred_dtype = input_array.dtype
- elif (
- infer_objects
- and is_object_dtype(input_array.dtype)
- and (isinstance(inferred_dtype, str) and inferred_dtype == "integer")
- ):
- inferred_dtype = target_int_dtype
-
- if convert_floating:
- if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
- input_array.dtype
- ):
- from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
-
- inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get(
- input_array.dtype.name, pandas_dtype_func("Float64")
- )
- # if we could also convert to integer, check if all floats
- # are actually integers
- if convert_integer:
- # TODO: de-dup with maybe_cast_to_integer_array?
- arr = input_array[notna(input_array)]
- if (arr.astype(int) == arr).all():
- inferred_dtype = pandas_dtype_func("Int64")
- else:
- inferred_dtype = inferred_float_dtype
- else:
- inferred_dtype = inferred_float_dtype
- elif (
- infer_objects
- and is_object_dtype(input_array.dtype)
- and (
- isinstance(inferred_dtype, str)
- and inferred_dtype == "mixed-integer-float"
- )
- ):
- inferred_dtype = pandas_dtype_func("Float64")
-
- if convert_boolean:
- if is_bool_dtype(input_array.dtype):
- inferred_dtype = pandas_dtype_func("boolean")
- elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
- inferred_dtype = pandas_dtype_func("boolean")
-
- if isinstance(inferred_dtype, str):
- # If we couldn't do anything else, then we retain the dtype
- inferred_dtype = input_array.dtype
-
- else:
- inferred_dtype = input_array.dtype
-
- if dtype_backend == "pyarrow":
- from pandas.core.arrays.arrow.array import to_pyarrow_type
- from pandas.core.arrays.string_ import StringDtype
-
- assert not isinstance(inferred_dtype, str)
-
- if (
- (convert_integer and inferred_dtype.kind in "iu")
- or (convert_floating and inferred_dtype.kind in "fc")
- or (convert_boolean and inferred_dtype.kind == "b")
- or (convert_string and isinstance(inferred_dtype, StringDtype))
- or (
- inferred_dtype.kind not in "iufcb"
- and not isinstance(inferred_dtype, StringDtype)
- )
- ):
- if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance(
- inferred_dtype, DatetimeTZDtype
- ):
- base_dtype = inferred_dtype.base
- elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
- base_dtype = inferred_dtype.numpy_dtype
- elif isinstance(inferred_dtype, StringDtype):
- base_dtype = np.dtype(str)
- else:
- base_dtype = inferred_dtype
- pa_type = to_pyarrow_type(base_dtype)
- if pa_type is not None:
- inferred_dtype = ArrowDtype(pa_type)
- elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype):
- # GH 53648
- inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype]
-
- # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
- # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
- return inferred_dtype # type: ignore[return-value]
-
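A sketch of how the conversion switches above map plain numpy arrays to nullable dtypes, assuming the default ``numpy_nullable`` backend (internal helper; results are expected, not verified):

    import numpy as np
    from pandas.core.dtypes.cast import convert_dtypes

    convert_dtypes(np.array([1, 2, 3]))                 # Int64 (nullable integer)
    convert_dtypes(np.array([1.0, 2.0]))                # Int64, the floats are integral
    convert_dtypes(np.array([1.5, 2.0]))                # Float64
    convert_dtypes(np.array([True, False]))             # boolean (BooleanDtype)
    convert_dtypes(np.array(["a", "b"], dtype=object))  # string (StringDtype)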
-
-def maybe_infer_to_datetimelike(
- value: npt.NDArray[np.object_],
-) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
- """
- We might have an array (or single object) that is datetime-like
- and no dtype is passed; don't change the value unless we find a
- datetime/timedelta set.
-
- this is pretty strict in that a datetime/timedelta is REQUIRED
- in addition to possible nulls/string likes
-
- Parameters
- ----------
- value : np.ndarray[object]
-
- Returns
- -------
- np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray
-
- """
- if not isinstance(value, np.ndarray) or value.dtype != object:
- # Caller is responsible for passing only ndarray[object]
- raise TypeError(type(value)) # pragma: no cover
- if value.ndim != 1:
- # Caller is responsible
- raise ValueError(value.ndim) # pragma: no cover
-
- if not len(value):
- return value
-
- # error: Incompatible return value type (got "Union[ExtensionArray,
- # ndarray[Any, Any]]", expected "Union[ndarray[Any, Any], DatetimeArray,
- # TimedeltaArray, PeriodArray, IntervalArray]")
- return lib.maybe_convert_objects( # type: ignore[return-value]
- value,
- # Here we do not convert numeric dtypes, as if we wanted that,
- # numpy would have done it for us.
- convert_numeric=False,
- convert_period=True,
- convert_interval=True,
- convert_timedelta=True,
- convert_datetime=True,
- dtype_if_all_nat=np.dtype("M8[ns]"),
- )
-
-
-def maybe_cast_to_datetime(
- value: np.ndarray | list, dtype: np.dtype
-) -> ExtensionArray | np.ndarray:
- """
- try to cast the array/value to a datetimelike dtype, converting float
- nan to iNaT
-
- Caller is responsible for handling ExtensionDtype cases and non dt64/td64
- cases.
- """
- from pandas.core.arrays.datetimes import DatetimeArray
- from pandas.core.arrays.timedeltas import TimedeltaArray
-
- assert dtype.kind in ["m", "M"]
- if not is_list_like(value):
- raise TypeError("value must be listlike")
-
- # TODO: _from_sequence would raise ValueError in cases where
- # _ensure_nanosecond_dtype raises TypeError
- _ensure_nanosecond_dtype(dtype)
-
- if is_timedelta64_dtype(dtype):
- res = TimedeltaArray._from_sequence(value, dtype=dtype)
- return res
- else:
- try:
- dta = DatetimeArray._from_sequence(value, dtype=dtype)
- except ValueError as err:
- # We can give a Series-specific exception message.
- if "cannot supply both a tz and a timezone-naive dtype" in str(err):
- raise ValueError(
- "Cannot convert timezone-aware data to "
- "timezone-naive dtype. Use "
- "pd.Series(values).dt.tz_localize(None) instead."
- ) from err
- raise
-
- return dta
-
-
-def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
- """
- Validate that a datetime64/timedelta64 dtype has a supported resolution, raising otherwise.
-
- >>> _ensure_nanosecond_dtype(np.dtype("M8[us]"))
-
- >>> _ensure_nanosecond_dtype(np.dtype("M8[D]"))
- Traceback (most recent call last):
- ...
- TypeError: dtype=datetime64[D] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'
-
- >>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
- Traceback (most recent call last):
- ...
- TypeError: dtype=timedelta64[ps] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'
- """ # noqa:E501
- msg = (
- f"The '{dtype.name}' dtype has no unit. "
- f"Please pass in '{dtype.name}[ns]' instead."
- )
-
- # unpack e.g. SparseDtype
- dtype = getattr(dtype, "subtype", dtype)
-
- if not isinstance(dtype, np.dtype):
- # i.e. datetime64tz
- pass
-
- elif dtype.kind in ["m", "M"]:
- reso = get_unit_from_dtype(dtype)
- if not is_supported_unit(reso):
- # pre-2.0 we would silently swap in nanos for lower-resolutions,
- # raise for above-nano resolutions
- if dtype.name in ["datetime64", "timedelta64"]:
- raise ValueError(msg)
- # TODO: ValueError or TypeError? existing test
- # test_constructor_generic_timestamp_bad_frequency expects TypeError
- raise TypeError(
- f"dtype={dtype} is not supported. Supported resolutions are 's', "
- "'ms', 'us', and 'ns'"
- )
-
-
-# TODO: other value-dependent functions to standardize here include
-# Index._find_common_type_compat
-def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
- """
- Find the type/dtype for the result of an operation between these objects.
-
- This is similar to find_common_type, but looks at the objects instead
- of just their dtypes. This can be useful in particular when one of the
- objects does not have a `dtype`.
-
- Parameters
- ----------
- left : np.ndarray or ExtensionArray
- right : Any
-
- Returns
- -------
- np.dtype or ExtensionDtype
-
- See also
- --------
- find_common_type
- numpy.result_type
- """
- new_dtype: DtypeObj
-
- if (
- isinstance(left, np.ndarray)
- and left.dtype.kind in ["i", "u", "c"]
- and (lib.is_integer(right) or lib.is_float(right))
- ):
- # e.g. with int8 dtype and right=512, we want to end up with
- # np.int16, whereas infer_dtype_from(512) gives np.int64,
- # which will make us upcast too far.
- if lib.is_float(right) and right.is_integer() and left.dtype.kind != "f":
- right = int(right)
-
- new_dtype = np.result_type(left, right)
-
- elif is_valid_na_for_dtype(right, left.dtype):
- # e.g. IntervalDtype[int] and None/np.nan
- new_dtype = ensure_dtype_can_hold_na(left.dtype)
-
- else:
- dtype, _ = infer_dtype_from(right, pandas_dtype=True)
-
- new_dtype = find_common_type([left.dtype, dtype])
-
- return new_dtype
-
-
-def common_dtype_categorical_compat(
- objs: list[Index | ArrayLike], dtype: DtypeObj
-) -> DtypeObj:
- """
- Update the result of find_common_type to account for NAs in a Categorical.
-
- Parameters
- ----------
- objs : list[np.ndarray | ExtensionArray | Index]
- dtype : np.dtype or ExtensionDtype
-
- Returns
- -------
- np.dtype or ExtensionDtype
- """
- # GH#38240
-
- # TODO: more generally, could do `not can_hold_na(dtype)`
- if isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]:
- for obj in objs:
- # We don't want to accidentally allow e.g. "categorical" str here
- obj_dtype = getattr(obj, "dtype", None)
- if isinstance(obj_dtype, CategoricalDtype):
- if isinstance(obj, ABCIndex):
- # This check may already be cached
- hasnas = obj.hasnans
- else:
- # Categorical
- hasnas = cast("Categorical", obj)._hasna
-
- if hasnas:
- # see test_union_int_categorical_with_nan
- dtype = np.dtype(np.float64)
- break
- return dtype
-
-
-def np_find_common_type(*dtypes: np.dtype) -> np.dtype:
- """
- np.find_common_type implementation pre-1.25 deprecation using np.result_type
- https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065
-
- Parameters
- ----------
- dtypes : np.dtypes
-
- Returns
- -------
- np.dtype
- """
- try:
- common_dtype = np.result_type(*dtypes)
- if common_dtype.kind in "mMSU":
- # NumPy promotion currently (1.25) misbehaves for times and strings,
- # so fall back to object (np.find_common_type did, unless there
- # was only one dtype)
- common_dtype = np.dtype("O")
-
- except TypeError:
- common_dtype = np.dtype("O")
- return common_dtype
-
-
-@overload
-def find_common_type(types: list[np.dtype]) -> np.dtype:
- ...
-
-
-@overload
-def find_common_type(types: list[ExtensionDtype]) -> DtypeObj:
- ...
-
-
-@overload
-def find_common_type(types: list[DtypeObj]) -> DtypeObj:
- ...
-
-
-def find_common_type(types):
- """
- Find a common data type among the given dtypes.
-
- Parameters
- ----------
- types : list of dtypes
-
- Returns
- -------
- pandas extension or numpy dtype
-
- See Also
- --------
- numpy.find_common_type
-
- """
- if not types:
- raise ValueError("no types given")
-
- first = types[0]
-
- # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
- # => object
- if lib.dtypes_all_equal(list(types)):
- return first
-
- # get unique types (dict.fromkeys is used as order-preserving set())
- types = list(dict.fromkeys(types).keys())
-
- if any(isinstance(t, ExtensionDtype) for t in types):
- for t in types:
- if isinstance(t, ExtensionDtype):
- res = t._get_common_dtype(types)
- if res is not None:
- return res
- return np.dtype("object")
-
- # take lowest unit
- if all(is_datetime64_dtype(t) for t in types):
- return np.dtype(max(types))
- if all(is_timedelta64_dtype(t) for t in types):
- return np.dtype(max(types))
-
- # don't mix bool / int or float or complex
- # this is different from numpy, which casts bool with float/int as int
- has_bools = any(is_bool_dtype(t) for t in types)
- if has_bools:
- for t in types:
- if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
- return np.dtype("object")
-
- return np_find_common_type(*types)
-
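The bool/int rule and the equal-dtypes shortcut described above, in a short sketch (expected results in comments):

    import numpy as np
    from pandas.core.dtypes.cast import find_common_type

    find_common_type([np.dtype("int64"), np.dtype("float32")])  # float64
    find_common_type([np.dtype("int64"), np.dtype("bool")])     # object, bool never mixes
    find_common_type([np.dtype("datetime64[ns]")] * 2)          # datetime64[ns]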
-
-def construct_2d_arraylike_from_scalar(
- value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
-) -> np.ndarray:
- shape = (length, width)
-
- if dtype.kind in ["m", "M"]:
- value = _maybe_box_and_unbox_datetimelike(value, dtype)
- elif dtype == _dtype_obj:
- if isinstance(value, (np.timedelta64, np.datetime64)):
- # calling np.array below would cast to pytimedelta/pydatetime
- out = np.empty(shape, dtype=object)
- out.fill(value)
- return out
-
- # Attempt to coerce to a numpy array
- try:
- arr = np.array(value, dtype=dtype, copy=copy)
- except (ValueError, TypeError) as err:
- raise TypeError(
- f"DataFrame constructor called with incompatible data and dtype: {err}"
- ) from err
-
- if arr.ndim != 0:
- raise ValueError("DataFrame constructor not properly called!")
-
- return np.full(shape, arr)
-
-
-def construct_1d_arraylike_from_scalar(
- value: Scalar, length: int, dtype: DtypeObj | None
-) -> ArrayLike:
- """
- Create an np.ndarray / pandas type of the specified length and dtype,
- filled with the given value.
-
- Parameters
- ----------
- value : scalar value
- length : int
- dtype : pandas_dtype or np.dtype
-
- Returns
- -------
- np.ndarray / pandas type of length, filled with value
-
- """
-
- if dtype is None:
- try:
- dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
- except OutOfBoundsDatetime:
- dtype = _dtype_obj
-
- if isinstance(dtype, ExtensionDtype):
- cls = dtype.construct_array_type()
- seq = [] if length == 0 else [value]
- subarr = cls._from_sequence(seq, dtype=dtype).repeat(length)
-
- else:
- if length and is_integer_dtype(dtype) and isna(value):
- # coerce if we have nan for an integer dtype
- dtype = np.dtype("float64")
- elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
- # we need to coerce to object dtype to
- # allow numpy to take our string as a scalar value
- dtype = np.dtype("object")
- if not isna(value):
- value = ensure_str(value)
- elif dtype.kind in ["M", "m"]:
- value = _maybe_box_and_unbox_datetimelike(value, dtype)
-
- subarr = np.empty(length, dtype=dtype)
- if length:
- # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes
- subarr.fill(value)
-
- return subarr
-
-
-def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj):
- # Caller is responsible for checking dtype.kind in ["m", "M"]
-
- if isinstance(value, dt.datetime):
- # we dont want to box dt64, in particular datetime64("NaT")
- value = maybe_box_datetimelike(value, dtype)
-
- return _maybe_unbox_datetimelike(value, dtype)
-
-
-def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
- """
- Transform any list-like object into a 1-dimensional numpy array of object
- dtype.
-
- Parameters
- ----------
- values : any iterable which has a len()
-
- Raises
- ------
- TypeError
- * If `values` does not have a len()
-
- Returns
- -------
- 1-dimensional numpy array of dtype object
- """
- # numpy will try to interpret nested lists as further dimensions, hence
- # making a 1D array that contains list-likes is a bit tricky:
- result = np.empty(len(values), dtype="object")
- result[:] = values
- return result
-
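A tiny usage sketch of the helper above (internal, from the removed module; results in comments are expected):

    from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

    arr = construct_1d_object_array_from_listlike(["a", 3, 4.5, None])
    arr.dtype  # dtype('O')
    arr.shape  # (4,)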
-
-def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray:
- """
- Casts the array to the given integer dtype, raising when the data is
- incompatible with integer/unsigned integer dtypes.
-
- Parameters
- ----------
- arr : np.ndarray or list
- The array to cast.
- dtype : np.dtype
- The integer dtype to cast the array to.
-
- Returns
- -------
- ndarray
- Array of integer or unsigned integer dtype.
-
- Raises
- ------
- OverflowError : the dtype is incompatible with the data
- ValueError : loss of precision has occurred during casting
-
- Examples
- --------
- If you try to coerce negative values to unsigned integers, it raises:
-
- >>> pd.Series([-1], dtype="uint64")
- Traceback (most recent call last):
- ...
- OverflowError: Trying to coerce negative values to unsigned integers
-
- Also, if you try to coerce float values to integers, it raises:
-
- >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64"))
- Traceback (most recent call last):
- ...
- ValueError: Trying to coerce float values to integers
- """
- assert is_integer_dtype(dtype)
-
- try:
- if not isinstance(arr, np.ndarray):
- with warnings.catch_warnings():
- # We already disallow dtype=uint w/ negative numbers
- # (test_constructor_coercion_signed_to_unsigned) so safe to ignore.
- warnings.filterwarnings(
- "ignore",
- "NumPy will stop allowing conversion of out-of-bound Python int",
- DeprecationWarning,
- )
- casted = np.array(arr, dtype=dtype, copy=False)
- else:
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=RuntimeWarning)
- casted = arr.astype(dtype, copy=False)
- except OverflowError as err:
- raise OverflowError(
- "The elements provided in the data cannot all be "
- f"casted to the dtype {dtype}"
- ) from err
-
- if isinstance(arr, np.ndarray) and arr.dtype == dtype:
- # avoid expensive array_equal check
- return casted
-
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore", category=RuntimeWarning)
- if np.array_equal(arr, casted):
- return casted
-
- # We do this casting to allow for proper
- # data and dtype checking.
- #
- # We didn't do this earlier because NumPy
- # doesn't handle `uint64` correctly.
- arr = np.asarray(arr)
-
- if np.issubdtype(arr.dtype, str):
- if (casted.astype(str) == arr).all():
- return casted
- raise ValueError(f"string values cannot be losslessly cast to {dtype}")
-
- if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
- raise OverflowError("Trying to coerce negative values to unsigned integers")
-
- if is_float_dtype(arr.dtype):
- if not np.isfinite(arr).all():
- raise IntCastingNaNError(
- "Cannot convert non-finite values (NA or inf) to integer"
- )
- raise ValueError("Trying to coerce float values to integers")
- if is_object_dtype(arr.dtype):
- raise ValueError("Trying to coerce float values to integers")
-
- if casted.dtype < arr.dtype:
- # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows
- raise ValueError(
- f"Values are too large to be losslessly converted to {dtype}. "
- f"To cast anyway, use pd.Series(values).astype({dtype})"
- )
-
- if arr.dtype.kind in ["m", "M"]:
- # test_constructor_maskedarray_nonfloat
- raise TypeError(
- f"Constructing a Series or DataFrame from {arr.dtype} values and "
- f"dtype={dtype} is not supported. Use values.view({dtype}) instead."
- )
-
- # No known cases that get here, but raising explicitly to cover our bases.
- raise ValueError(f"values cannot be losslessly cast to {dtype}")
-
-
-def can_hold_element(arr: ArrayLike, element: Any) -> bool:
- """
- Can we do an inplace setitem with this element in an array with this dtype?
-
- Parameters
- ----------
- arr : np.ndarray or ExtensionArray
- element : Any
-
- Returns
- -------
- bool
- """
- dtype = arr.dtype
- if not isinstance(dtype, np.dtype) or dtype.kind in ["m", "M"]:
- if isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)):
- # np.dtype here catches datetime64ns and timedelta64ns; we assume
- # in this case that we have DatetimeArray/TimedeltaArray
- arr = cast(
- "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr
- )
- try:
- arr._validate_setitem_value(element)
- return True
- except (ValueError, TypeError):
- # TODO: re-use _catch_deprecated_value_error to ensure we are
- # strict about what exceptions we allow through here.
- return False
-
- # This is technically incorrect, but maintains the behavior of
- # ExtensionBlock._can_hold_element
- return True
-
- try:
- np_can_hold_element(dtype, element)
- return True
- except (TypeError, LossySetitemError):
- return False
-
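A sketch of the setitem compatibility check above (internal helper; results are expected, not verified):

    import numpy as np
    from pandas.core.dtypes.cast import can_hold_element

    can_hold_element(np.array([1, 2, 3], dtype="int8"), 1000)  # False, would overflow int8
    can_hold_element(np.array([1, 2, 3], dtype="int64"), 7)    # True
    can_hold_element(np.array([1.0, 2.0]), "x")                # False, string into floats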
-
-def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
- """
- Raise if we cannot losslessly set this element into an ndarray with this dtype.
-
- Specifically about places where we disagree with numpy. i.e. there are
- cases where numpy will raise in doing the setitem that we do not check
- for here, e.g. setting str "X" into a numeric ndarray.
-
- Returns
- -------
- Any
- The element, potentially cast to the dtype.
-
- Raises
- ------
- LossySetitemError : If we cannot losslessly store this element with this dtype.
- """
- if dtype == _dtype_obj:
- return element
-
- tipo = _maybe_infer_dtype_type(element)
-
- if dtype.kind in ["i", "u"]:
- if isinstance(element, range):
- if _dtype_can_hold_range(element, dtype):
- return element
- raise LossySetitemError
-
- if is_integer(element) or (is_float(element) and element.is_integer()):
- # e.g. test_setitem_series_int8 if we have a python int 1
- # tipo may be np.int32, despite the fact that it will fit
- # in smaller int dtypes.
- info = np.iinfo(dtype)
- if info.min <= element <= info.max:
- return dtype.type(element)
- raise LossySetitemError
-
- if tipo is not None:
- if tipo.kind not in ["i", "u"]:
- if isinstance(element, np.ndarray) and element.dtype.kind == "f":
- # If all can be losslessly cast to integers, then we can hold them
- with np.errstate(invalid="ignore"):
- # We check afterwards if cast was losslessly, so no need to show
- # the warning
- casted = element.astype(dtype)
- comp = casted == element
- if comp.all():
- # Return the casted values bc they can be passed to
- # np.putmask, whereas the raw values cannot.
- # see TestSetitemFloatNDarrayIntoIntegerSeries
- return casted
- raise LossySetitemError
-
- # Anything other than integer we cannot hold
- raise LossySetitemError
- if (
- dtype.kind == "u"
- and isinstance(element, np.ndarray)
- and element.dtype.kind == "i"
- ):
- # see test_where_uint64
- casted = element.astype(dtype)
- if (casted == element).all():
- # TODO: faster to check (element >=0).all()? potential
- # itemsize issues there?
- return casted
- raise LossySetitemError
- if dtype.itemsize < tipo.itemsize:
- raise LossySetitemError
- if not isinstance(tipo, np.dtype):
- # i.e. nullable IntegerDtype; we can put this into an ndarray
- # losslessly iff it has no NAs
- if element._hasna:
- raise LossySetitemError
- return element
-
- return element
-
- raise LossySetitemError
-
- if dtype.kind == "f":
- if lib.is_integer(element) or lib.is_float(element):
- casted = dtype.type(element)
- if np.isnan(casted) or casted == element:
- return casted
- # otherwise e.g. overflow see TestCoercionFloat32
- raise LossySetitemError
-
- if tipo is not None:
- # TODO: itemsize check?
- if tipo.kind not in ["f", "i", "u"]:
- # Anything other than float/integer we cannot hold
- raise LossySetitemError
- if not isinstance(tipo, np.dtype):
- # i.e. nullable IntegerDtype or FloatingDtype;
- # we can put this into an ndarray losslessly iff it has no NAs
- if element._hasna:
- raise LossySetitemError
- return element
- elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind:
- if isinstance(element, np.ndarray):
- # e.g. TestDataFrameIndexingWhere::test_where_alignment
- casted = element.astype(dtype)
- if np.array_equal(casted, element, equal_nan=True):
- return casted
- raise LossySetitemError
-
- return element
-
- raise LossySetitemError
-
- if dtype.kind == "c":
- if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element):
- if np.isnan(element):
- # see test_where_complex GH#6345
- return dtype.type(element)
-
- with warnings.catch_warnings():
- warnings.filterwarnings("ignore")
- casted = dtype.type(element)
- if casted == element:
- return casted
- # otherwise e.g. overflow see test_32878_complex_itemsize
- raise LossySetitemError
-
- if tipo is not None:
- if tipo.kind in ["c", "f", "i", "u"]:
- return element
- raise LossySetitemError
- raise LossySetitemError
-
- if dtype.kind == "b":
- if tipo is not None:
- if tipo.kind == "b":
- if not isinstance(tipo, np.dtype):
- # i.e. we have a BooleanArray
- if element._hasna:
- # i.e. there are pd.NA elements
- raise LossySetitemError
- return element
- raise LossySetitemError
- if lib.is_bool(element):
- return element
- raise LossySetitemError
-
- if dtype.kind == "S":
- # TODO: test tests.frame.methods.test_replace tests get here,
- # need more targeted tests. xref phofl has a PR about this
- if tipo is not None:
- if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize:
- return element
- raise LossySetitemError
- if isinstance(element, bytes) and len(element) <= dtype.itemsize:
- return element
- raise LossySetitemError
-
- if dtype.kind == "V":
- # i.e. np.void, which cannot hold _anything_
- raise LossySetitemError
-
- raise NotImplementedError(dtype)
-
-
-def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
- """
- _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints),
- but in many cases a range can be held by a smaller integer dtype.
- Check if this is one of those cases.
- """
- if not len(rng):
- return True
- return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype)
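
The lossless-setitem logic above repeatedly applies the same "cast, then compare back" pattern. A minimal standalone sketch of that idea (illustrative only, not pandas API; the helper name is made up):

import numpy as np

def can_hold_losslessly(values: np.ndarray, dtype: np.dtype) -> bool:
    # Mirrors the check used above: the cast is acceptable only if the
    # round trip loses no information.
    casted = values.astype(dtype)
    return bool((casted == values).all())

print(can_hold_losslessly(np.array([1.0, 2.0]), np.dtype("int64")))  # True
print(can_hold_losslessly(np.array([1.5, 2.0]), np.dtype("int64")))  # False
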
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/common.py b/contrib/python/pandas/py3/pandas/core/dtypes/common.py
deleted file mode 100644
index 9461812332b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/common.py
+++ /dev/null
@@ -1,1792 +0,0 @@
-"""
-Common type operations.
-"""
-from __future__ import annotations
-
-from typing import (
- Any,
- Callable,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- Interval,
- Period,
- algos,
- lib,
-)
-from pandas._libs.tslibs import conversion
-from pandas._typing import (
- ArrayLike,
- DtypeObj,
-)
-
-from pandas.core.dtypes.base import _registry as registry
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- DatetimeTZDtype,
- ExtensionDtype,
- IntervalDtype,
- PeriodDtype,
-)
-from pandas.core.dtypes.generic import ABCIndex
-from pandas.core.dtypes.inference import (
- is_array_like,
- is_bool,
- is_complex,
- is_dataclass,
- is_decimal,
- is_dict_like,
- is_file_like,
- is_float,
- is_hashable,
- is_integer,
- is_interval,
- is_iterator,
- is_list_like,
- is_named_tuple,
- is_nested_list_like,
- is_number,
- is_re,
- is_re_compilable,
- is_scalar,
- is_sequence,
-)
-
-DT64NS_DTYPE = conversion.DT64NS_DTYPE
-TD64NS_DTYPE = conversion.TD64NS_DTYPE
-INT64_DTYPE = np.dtype(np.int64)
-
-# oh the troubles to reduce import time
-_is_scipy_sparse = None
-
-ensure_float64 = algos.ensure_float64
-
-
-def ensure_float(arr):
- """
- Ensure that an array object has a float dtype if possible.
-
- Parameters
- ----------
- arr : array-like
- The array whose data type we want to enforce as float.
-
- Returns
- -------
- float_arr : The original array cast to the float dtype if
- possible. Otherwise, the original array is returned.
- """
- if is_extension_array_dtype(arr.dtype):
- if is_float_dtype(arr.dtype):
- arr = arr.to_numpy(dtype=arr.dtype.numpy_dtype, na_value=np.nan)
- else:
- arr = arr.to_numpy(dtype="float64", na_value=np.nan)
- elif issubclass(arr.dtype.type, (np.integer, np.bool_)):
- arr = arr.astype(float)
- return arr
-
-
-ensure_int64 = algos.ensure_int64
-ensure_int32 = algos.ensure_int32
-ensure_int16 = algos.ensure_int16
-ensure_int8 = algos.ensure_int8
-ensure_platform_int = algos.ensure_platform_int
-ensure_object = algos.ensure_object
-ensure_uint64 = algos.ensure_uint64
-
-
-def ensure_str(value: bytes | Any) -> str:
- """
- Ensure that bytes and non-strings get converted into ``str`` objects.
- """
- if isinstance(value, bytes):
- value = value.decode("utf-8")
- elif not isinstance(value, str):
- value = str(value)
- return value
-
-
-def ensure_python_int(value: int | np.integer) -> int:
- """
- Ensure that a value is a python int.
-
- Parameters
- ----------
- value: int or numpy.integer
-
- Returns
- -------
- int
-
- Raises
- ------
- TypeError: if the value isn't an int or can't be converted to one.
- """
- if not (is_integer(value) or is_float(value)):
- if not is_scalar(value):
- raise TypeError(
- f"Value needs to be a scalar value, was type {type(value).__name__}"
- )
- raise TypeError(f"Wrong type {type(value)} for value {value}")
- try:
- new_value = int(value)
- assert new_value == value
- except (TypeError, ValueError, AssertionError) as err:
- raise TypeError(f"Wrong type {type(value)} for value {value}") from err
- return new_value
-
-
-def classes(*klasses) -> Callable:
- """Evaluate if the tipo is a subclass of the klasses."""
- return lambda tipo: issubclass(tipo, klasses)
-
-
-def classes_and_not_datetimelike(*klasses) -> Callable:
- """
- Evaluate if the tipo is a subclass of the klasses
- and not a datetimelike.
- """
- return lambda tipo: (
- issubclass(tipo, klasses)
- and not issubclass(tipo, (np.datetime64, np.timedelta64))
- )
-
-
-def is_object_dtype(arr_or_dtype) -> bool:
- """
- Check whether an array-like or dtype is of the object dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like or dtype is of the object dtype.
-
- Examples
- --------
- >>> from pandas.api.types import is_object_dtype
- >>> is_object_dtype(object)
- True
- >>> is_object_dtype(int)
- False
- >>> is_object_dtype(np.array([], dtype=object))
- True
- >>> is_object_dtype(np.array([], dtype=int))
- False
- >>> is_object_dtype([1, 2, 3])
- False
- """
- return _is_dtype_type(arr_or_dtype, classes(np.object_))
-
-
-def is_sparse(arr) -> bool:
- """
- Check whether an array-like is a 1-D pandas sparse array.
-
- Check that the one-dimensional array-like is a pandas sparse array.
- Returns True if it is a pandas sparse array, not another type of
- sparse array.
-
- Parameters
- ----------
- arr : array-like
- Array-like to check.
-
- Returns
- -------
- bool
- Whether or not the array-like is a pandas sparse array.
-
- Examples
- --------
- Returns `True` if the parameter is a 1-D pandas sparse array.
-
- >>> is_sparse(pd.arrays.SparseArray([0, 0, 1, 0]))
- True
- >>> is_sparse(pd.Series(pd.arrays.SparseArray([0, 0, 1, 0])))
- True
-
- Returns `False` if the parameter is not sparse.
-
- >>> is_sparse(np.array([0, 0, 1, 0]))
- False
- >>> is_sparse(pd.Series([0, 1, 0, 0]))
- False
-
- Returns `False` if the parameter is not a pandas sparse array.
-
- >>> from scipy.sparse import bsr_matrix
- >>> is_sparse(bsr_matrix([0, 1, 0, 0]))
- False
-
- Returns `False` if the parameter has more than one dimension.
- """
- from pandas.core.arrays.sparse import SparseDtype
-
- dtype = getattr(arr, "dtype", arr)
- return isinstance(dtype, SparseDtype)
-
-
-def is_scipy_sparse(arr) -> bool:
- """
- Check whether an array-like is a scipy.sparse.spmatrix instance.
-
- Parameters
- ----------
- arr : array-like
- The array-like to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like is a scipy.sparse.spmatrix instance.
-
- Notes
- -----
- If scipy is not installed, this function will always return False.
-
- Examples
- --------
- >>> from scipy.sparse import bsr_matrix
- >>> is_scipy_sparse(bsr_matrix([1, 2, 3]))
- True
- >>> is_scipy_sparse(pd.arrays.SparseArray([1, 2, 3]))
- False
- """
- global _is_scipy_sparse
-
- if _is_scipy_sparse is None: # pylint: disable=used-before-assignment
- try:
- from scipy.sparse import issparse as _is_scipy_sparse
- except ImportError:
- _is_scipy_sparse = lambda _: False
-
- assert _is_scipy_sparse is not None
- return _is_scipy_sparse(arr)
-
-
-def is_datetime64_dtype(arr_or_dtype) -> bool:
- """
- Check whether an array-like or dtype is of the datetime64 dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like or dtype is of the datetime64 dtype.
-
- Examples
- --------
- >>> from pandas.api.types import is_datetime64_dtype
- >>> is_datetime64_dtype(object)
- False
- >>> is_datetime64_dtype(np.datetime64)
- True
- >>> is_datetime64_dtype(np.array([], dtype=int))
- False
- >>> is_datetime64_dtype(np.array([], dtype=np.datetime64))
- True
- >>> is_datetime64_dtype([1, 2, 3])
- False
- """
- if isinstance(arr_or_dtype, np.dtype):
- # GH#33400 fastpath for dtype object
- return arr_or_dtype.kind == "M"
- return _is_dtype_type(arr_or_dtype, classes(np.datetime64))
-
-
-def is_datetime64tz_dtype(arr_or_dtype) -> bool:
- """
- Check whether an array-like or dtype is of a DatetimeTZDtype dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like or dtype is of a DatetimeTZDtype dtype.
-
- Examples
- --------
- >>> is_datetime64tz_dtype(object)
- False
- >>> is_datetime64tz_dtype([1, 2, 3])
- False
- >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) # tz-naive
- False
- >>> is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
- True
-
- >>> dtype = DatetimeTZDtype("ns", tz="US/Eastern")
- >>> s = pd.Series([], dtype=dtype)
- >>> is_datetime64tz_dtype(dtype)
- True
- >>> is_datetime64tz_dtype(s)
- True
- """
- if isinstance(arr_or_dtype, DatetimeTZDtype):
- # GH#33400 fastpath for dtype object
- # GH 34986
- return True
-
- if arr_or_dtype is None:
- return False
- return DatetimeTZDtype.is_dtype(arr_or_dtype)
-
-
-def is_timedelta64_dtype(arr_or_dtype) -> bool:
- """
- Check whether an array-like or dtype is of the timedelta64 dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like or dtype is of the timedelta64 dtype.
-
- Examples
- --------
- >>> from pandas.core.dtypes.common import is_timedelta64_dtype
- >>> is_timedelta64_dtype(object)
- False
- >>> is_timedelta64_dtype(np.timedelta64)
- True
- >>> is_timedelta64_dtype([1, 2, 3])
- False
- >>> is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]"))
- True
- >>> is_timedelta64_dtype('0 days')
- False
- """
- if isinstance(arr_or_dtype, np.dtype):
- # GH#33400 fastpath for dtype object
- return arr_or_dtype.kind == "m"
-
- return _is_dtype_type(arr_or_dtype, classes(np.timedelta64))
-
-
-def is_period_dtype(arr_or_dtype) -> bool:
- """
- Check whether an array-like or dtype is of the Period dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like or dtype is of the Period dtype.
-
- Examples
- --------
- >>> is_period_dtype(object)
- False
- >>> is_period_dtype(PeriodDtype(freq="D"))
- True
- >>> is_period_dtype([1, 2, 3])
- False
- >>> is_period_dtype(pd.Period("2017-01-01"))
- False
- >>> is_period_dtype(pd.PeriodIndex([], freq="A"))
- True
- """
- if isinstance(arr_or_dtype, ExtensionDtype):
- # GH#33400 fastpath for dtype object
- return arr_or_dtype.type is Period
-
- if arr_or_dtype is None:
- return False
- return PeriodDtype.is_dtype(arr_or_dtype)
-
-
-def is_interval_dtype(arr_or_dtype) -> bool:
- """
- Check whether an array-like or dtype is of the Interval dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like or dtype is of the Interval dtype.
-
- Examples
- --------
- >>> is_interval_dtype(object)
- False
- >>> is_interval_dtype(IntervalDtype())
- True
- >>> is_interval_dtype([1, 2, 3])
- False
- >>>
- >>> interval = pd.Interval(1, 2, closed="right")
- >>> is_interval_dtype(interval)
- False
- >>> is_interval_dtype(pd.IntervalIndex([interval]))
- True
- """
- if isinstance(arr_or_dtype, ExtensionDtype):
- # GH#33400 fastpath for dtype object
- return arr_or_dtype.type is Interval
-
- if arr_or_dtype is None:
- return False
- return IntervalDtype.is_dtype(arr_or_dtype)
-
-
-def is_categorical_dtype(arr_or_dtype) -> bool:
- """
- Check whether an array-like or dtype is of the Categorical dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array-like or dtype is of the Categorical dtype.
-
- Examples
- --------
- >>> from pandas.api.types import is_categorical_dtype
- >>> from pandas import CategoricalDtype
- >>> is_categorical_dtype(object)
- False
- >>> is_categorical_dtype(CategoricalDtype())
- True
- >>> is_categorical_dtype([1, 2, 3])
- False
- >>> is_categorical_dtype(pd.Categorical([1, 2, 3]))
- True
- >>> is_categorical_dtype(pd.CategoricalIndex([1, 2, 3]))
- True
- """
- if isinstance(arr_or_dtype, ExtensionDtype):
- # GH#33400 fastpath for dtype object
- return arr_or_dtype.name == "category"
-
- if arr_or_dtype is None:
- return False
- return CategoricalDtype.is_dtype(arr_or_dtype)
-
-
-def is_string_or_object_np_dtype(dtype: np.dtype) -> bool:
- """
- Faster alternative to is_string_dtype, assumes we have a np.dtype object.
- """
- return dtype == object or dtype.kind in "SU"
-
-
-def is_string_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of the string dtype.
-
- If an array is passed with an object dtype, the elements must be
- inferred as strings.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of the string dtype.
-
- Examples
- --------
- >>> is_string_dtype(str)
- True
- >>> is_string_dtype(object)
- True
- >>> is_string_dtype(int)
- False
- >>> is_string_dtype(np.array(['a', 'b']))
- True
- >>> is_string_dtype(pd.Series([1, 2]))
- False
- >>> is_string_dtype(pd.Series([1, 2], dtype=object))
- False
- """
- if hasattr(arr_or_dtype, "dtype") and get_dtype(arr_or_dtype).kind == "O":
- return is_all_strings(arr_or_dtype)
-
- def condition(dtype) -> bool:
- if is_string_or_object_np_dtype(dtype):
- return True
- try:
- return dtype == "string"
- except TypeError:
- return False
-
- return _is_dtype(arr_or_dtype, condition)
-
-
-def is_dtype_equal(source, target) -> bool:
- """
- Check if two dtypes are equal.
-
- Parameters
- ----------
- source : The first dtype to compare
- target : The second dtype to compare
-
- Returns
- -------
- boolean
- Whether or not the two dtypes are equal.
-
- Examples
- --------
- >>> is_dtype_equal(int, float)
- False
- >>> is_dtype_equal("int", int)
- True
- >>> is_dtype_equal(object, "category")
- False
- >>> is_dtype_equal(CategoricalDtype(), "category")
- True
- >>> is_dtype_equal(DatetimeTZDtype(tz="UTC"), "datetime64")
- False
- """
- if isinstance(target, str):
- if not isinstance(source, str):
- # GH#38516 ensure we get the same behavior from
- # is_dtype_equal(CDT, "category") and CDT == "category"
- try:
- src = get_dtype(source)
- if isinstance(src, ExtensionDtype):
- return src == target
- except (TypeError, AttributeError, ImportError):
- return False
- elif isinstance(source, str):
- return is_dtype_equal(target, source)
-
- try:
- source = get_dtype(source)
- target = get_dtype(target)
- return source == target
- except (TypeError, AttributeError, ImportError):
- # invalid comparison
- # object == category will hit this
- return False
-
-
-def is_any_int_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of an integer dtype.
-
- In this function, timedelta64 instances are also considered "any-integer"
- type objects and will return True.
-
- This function is internal and should not be exposed in the public API.
-
- The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
- as integer by this function.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of an integer dtype.
-
- Examples
- --------
- >>> is_any_int_dtype(str)
- False
- >>> is_any_int_dtype(int)
- True
- >>> is_any_int_dtype(float)
- False
- >>> is_any_int_dtype(np.uint64)
- True
- >>> is_any_int_dtype(np.datetime64)
- False
- >>> is_any_int_dtype(np.timedelta64)
- True
- >>> is_any_int_dtype(np.array(['a', 'b']))
- False
- >>> is_any_int_dtype(pd.Series([1, 2]))
- True
- >>> is_any_int_dtype(np.array([], dtype=np.timedelta64))
- True
- >>> is_any_int_dtype(pd.Index([1, 2.])) # float
- False
- """
- return _is_dtype_type(
- arr_or_dtype, classes(np.integer, np.timedelta64)
- ) or _is_dtype(
- arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "iu"
- )
-
-
-def is_integer_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of an integer dtype.
-
- Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
-
- The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
- as integer by this function.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of an integer dtype and
- not an instance of timedelta64.
-
- Examples
- --------
- >>> is_integer_dtype(str)
- False
- >>> is_integer_dtype(int)
- True
- >>> is_integer_dtype(float)
- False
- >>> is_integer_dtype(np.uint64)
- True
- >>> is_integer_dtype('int8')
- True
- >>> is_integer_dtype('Int8')
- True
- >>> is_integer_dtype(pd.Int8Dtype)
- True
- >>> is_integer_dtype(np.datetime64)
- False
- >>> is_integer_dtype(np.timedelta64)
- False
- >>> is_integer_dtype(np.array(['a', 'b']))
- False
- >>> is_integer_dtype(pd.Series([1, 2]))
- True
- >>> is_integer_dtype(np.array([], dtype=np.timedelta64))
- False
- >>> is_integer_dtype(pd.Index([1, 2.])) # float
- False
- """
- return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.integer)
- ) or _is_dtype(
- arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "iu"
- )
-
-
-def is_signed_integer_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of a signed integer dtype.
-
- Unlike in `is_any_int_dtype`, timedelta64 instances will return False.
-
- The nullable Integer dtypes (e.g. pandas.Int64Dtype) are also considered
- as integer by this function.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of a signed integer dtype
- and not an instance of timedelta64.
-
- Examples
- --------
- >>> is_signed_integer_dtype(str)
- False
- >>> is_signed_integer_dtype(int)
- True
- >>> is_signed_integer_dtype(float)
- False
- >>> is_signed_integer_dtype(np.uint64) # unsigned
- False
- >>> is_signed_integer_dtype('int8')
- True
- >>> is_signed_integer_dtype('Int8')
- True
- >>> is_signed_integer_dtype(pd.Int8Dtype)
- True
- >>> is_signed_integer_dtype(np.datetime64)
- False
- >>> is_signed_integer_dtype(np.timedelta64)
- False
- >>> is_signed_integer_dtype(np.array(['a', 'b']))
- False
- >>> is_signed_integer_dtype(pd.Series([1, 2]))
- True
- >>> is_signed_integer_dtype(np.array([], dtype=np.timedelta64))
- False
- >>> is_signed_integer_dtype(pd.Index([1, 2.])) # float
- False
- >>> is_signed_integer_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned
- False
- """
- return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.signedinteger)
- ) or _is_dtype(
- arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "i"
- )
-
-
-def is_unsigned_integer_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of an unsigned integer dtype.
-
- The nullable Integer dtypes (e.g. pandas.UInt64Dtype) are also
- considered as integer by this function.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of an unsigned integer dtype.
-
- Examples
- --------
- >>> is_unsigned_integer_dtype(str)
- False
- >>> is_unsigned_integer_dtype(int) # signed
- False
- >>> is_unsigned_integer_dtype(float)
- False
- >>> is_unsigned_integer_dtype(np.uint64)
- True
- >>> is_unsigned_integer_dtype('uint8')
- True
- >>> is_unsigned_integer_dtype('UInt8')
- True
- >>> is_unsigned_integer_dtype(pd.UInt8Dtype)
- True
- >>> is_unsigned_integer_dtype(np.array(['a', 'b']))
- False
- >>> is_unsigned_integer_dtype(pd.Series([1, 2])) # signed
- False
- >>> is_unsigned_integer_dtype(pd.Index([1, 2.])) # float
- False
- >>> is_unsigned_integer_dtype(np.array([1, 2], dtype=np.uint32))
- True
- """
- return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.unsignedinteger)
- ) or _is_dtype(
- arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind == "u"
- )
-
-
-def is_int64_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of the int64 dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of the int64 dtype.
-
- Notes
- -----
- Depending on system architecture, the return value of `is_int64_dtype(int)`
- will be True if the OS uses 64-bit integers and False if the OS uses
- 32-bit integers.
-
- Examples
- --------
- >>> from pandas.api.types import is_int64_dtype
- >>> is_int64_dtype(str)
- False
- >>> is_int64_dtype(np.int32)
- False
- >>> is_int64_dtype(np.int64)
- True
- >>> is_int64_dtype('int8')
- False
- >>> is_int64_dtype('Int8')
- False
- >>> is_int64_dtype(pd.Int64Dtype)
- True
- >>> is_int64_dtype(float)
- False
- >>> is_int64_dtype(np.uint64) # unsigned
- False
- >>> is_int64_dtype(np.array(['a', 'b']))
- False
- >>> is_int64_dtype(np.array([1, 2], dtype=np.int64))
- True
- >>> is_int64_dtype(pd.Index([1, 2.])) # float
- False
- >>> is_int64_dtype(np.array([1, 2], dtype=np.uint32)) # unsigned
- False
- """
- return _is_dtype_type(arr_or_dtype, classes(np.int64))
-
-
-def is_datetime64_any_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of the datetime64 dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- bool
- Whether or not the array or dtype is of the datetime64 dtype.
-
- Examples
- --------
- >>> is_datetime64_any_dtype(str)
- False
- >>> is_datetime64_any_dtype(int)
- False
- >>> is_datetime64_any_dtype(np.datetime64) # can be tz-naive
- True
- >>> is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern"))
- True
- >>> is_datetime64_any_dtype(np.array(['a', 'b']))
- False
- >>> is_datetime64_any_dtype(np.array([1, 2]))
- False
- >>> is_datetime64_any_dtype(np.array([], dtype="datetime64[ns]"))
- True
- >>> is_datetime64_any_dtype(pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]"))
- True
- """
- if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)):
- # GH#33400 fastpath for dtype object
- return arr_or_dtype.kind == "M"
-
- if arr_or_dtype is None:
- return False
- return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype)
-
-
-def is_datetime64_ns_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of the datetime64[ns] dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- bool
- Whether or not the array or dtype is of the datetime64[ns] dtype.
-
- Examples
- --------
- >>> is_datetime64_ns_dtype(str)
- False
- >>> is_datetime64_ns_dtype(int)
- False
- >>> is_datetime64_ns_dtype(np.datetime64) # no unit
- False
- >>> is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern"))
- True
- >>> is_datetime64_ns_dtype(np.array(['a', 'b']))
- False
- >>> is_datetime64_ns_dtype(np.array([1, 2]))
- False
- >>> is_datetime64_ns_dtype(np.array([], dtype="datetime64")) # no unit
- False
- >>> is_datetime64_ns_dtype(np.array([], dtype="datetime64[ps]")) # wrong unit
- False
- >>> is_datetime64_ns_dtype(pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]"))
- True
- """
- if arr_or_dtype is None:
- return False
- try:
- tipo = get_dtype(arr_or_dtype)
- except TypeError:
- if is_datetime64tz_dtype(arr_or_dtype):
- tipo = get_dtype(arr_or_dtype.dtype)
- else:
- return False
- return tipo == DT64NS_DTYPE or (
- isinstance(tipo, DatetimeTZDtype) and tipo.unit == "ns"
- )
-
-
-def is_timedelta64_ns_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of the timedelta64[ns] dtype.
-
- This is a very specific dtype, so generic ones like `np.timedelta64`
- will return False if passed into this function.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of the timedelta64[ns] dtype.
-
- Examples
- --------
- >>> from pandas.core.dtypes.common import is_timedelta64_ns_dtype
- >>> is_timedelta64_ns_dtype(np.dtype('m8[ns]'))
- True
- >>> is_timedelta64_ns_dtype(np.dtype('m8[ps]')) # Wrong frequency
- False
- >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype='m8[ns]'))
- True
- >>> is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64))
- False
- """
- return _is_dtype(arr_or_dtype, lambda dtype: dtype == TD64NS_DTYPE)
-
-
-def is_datetime_or_timedelta_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of
- a timedelta64 or datetime64 dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of a timedelta64,
- or datetime64 dtype.
-
- Examples
- --------
- >>> is_datetime_or_timedelta_dtype(str)
- False
- >>> is_datetime_or_timedelta_dtype(int)
- False
- >>> is_datetime_or_timedelta_dtype(np.datetime64)
- True
- >>> is_datetime_or_timedelta_dtype(np.timedelta64)
- True
- >>> is_datetime_or_timedelta_dtype(np.array(['a', 'b']))
- False
- >>> is_datetime_or_timedelta_dtype(pd.Series([1, 2]))
- False
- >>> is_datetime_or_timedelta_dtype(np.array([], dtype=np.timedelta64))
- True
- >>> is_datetime_or_timedelta_dtype(np.array([], dtype=np.datetime64))
- True
- """
- return _is_dtype_type(arr_or_dtype, classes(np.datetime64, np.timedelta64))
-
-
-# This exists to silence numpy deprecation warnings, see GH#29553
-def is_numeric_v_string_like(a: ArrayLike, b) -> bool:
- """
- Check if we are comparing a string-like object to a numeric ndarray.
- NumPy doesn't like to compare such objects, especially numeric arrays
- and scalar string-likes.
-
- Parameters
- ----------
- a : array-like, scalar
- The first object to check.
- b : array-like, scalar
- The second object to check.
-
- Returns
- -------
- boolean
- Whether we are comparing a string-like object to a numeric array.
-
- Examples
- --------
- >>> is_numeric_v_string_like(np.array([1]), "foo")
- True
- >>> is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"]))
- True
- >>> is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2]))
- True
- >>> is_numeric_v_string_like(np.array([1]), np.array([2]))
- False
- >>> is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"]))
- False
- """
- is_a_array = isinstance(a, np.ndarray)
- is_b_array = isinstance(b, np.ndarray)
-
- is_a_numeric_array = is_a_array and a.dtype.kind in ("u", "i", "f", "c", "b")
- is_b_numeric_array = is_b_array and b.dtype.kind in ("u", "i", "f", "c", "b")
- is_a_string_array = is_a_array and a.dtype.kind in ("S", "U")
- is_b_string_array = is_b_array and b.dtype.kind in ("S", "U")
-
- is_b_scalar_string_like = not is_b_array and isinstance(b, str)
-
- return (
- (is_a_numeric_array and is_b_scalar_string_like)
- or (is_a_numeric_array and is_b_string_array)
- or (is_b_numeric_array and is_a_string_array)
- )
-
-
-def needs_i8_conversion(arr_or_dtype) -> bool:
- """
- Check whether the array or dtype should be converted to int64.
-
- An array-like or dtype "needs" such a conversion if the array-like
- or dtype is of a datetime-like dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype should be converted to int64.
-
- Examples
- --------
- >>> needs_i8_conversion(str)
- False
- >>> needs_i8_conversion(np.int64)
- False
- >>> needs_i8_conversion(np.datetime64)
- True
- >>> needs_i8_conversion(np.array(['a', 'b']))
- False
- >>> needs_i8_conversion(pd.Series([1, 2]))
- False
- >>> needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]"))
- True
- >>> needs_i8_conversion(pd.DatetimeIndex([1, 2, 3], tz="US/Eastern"))
- True
- """
- if arr_or_dtype is None:
- return False
- if isinstance(arr_or_dtype, np.dtype):
- return arr_or_dtype.kind in ["m", "M"]
- elif isinstance(arr_or_dtype, ExtensionDtype):
- return isinstance(arr_or_dtype, (PeriodDtype, DatetimeTZDtype))
-
- try:
- dtype = get_dtype(arr_or_dtype)
- except (TypeError, ValueError):
- return False
- if isinstance(dtype, np.dtype):
- return dtype.kind in ["m", "M"]
- return isinstance(dtype, (PeriodDtype, DatetimeTZDtype))
-
-
-def is_numeric_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of a numeric dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of a numeric dtype.
-
- Examples
- --------
- >>> from pandas.api.types import is_numeric_dtype
- >>> is_numeric_dtype(str)
- False
- >>> is_numeric_dtype(int)
- True
- >>> is_numeric_dtype(float)
- True
- >>> is_numeric_dtype(np.uint64)
- True
- >>> is_numeric_dtype(np.datetime64)
- False
- >>> is_numeric_dtype(np.timedelta64)
- False
- >>> is_numeric_dtype(np.array(['a', 'b']))
- False
- >>> is_numeric_dtype(pd.Series([1, 2]))
- True
- >>> is_numeric_dtype(pd.Index([1, 2.]))
- True
- >>> is_numeric_dtype(np.array([], dtype=np.timedelta64))
- False
- """
- return _is_dtype_type(
- arr_or_dtype, classes_and_not_datetimelike(np.number, np.bool_)
- ) or _is_dtype(
- arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ._is_numeric
- )
-
-
-def is_any_real_numeric_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of a real number dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of a real number dtype.
-
- Examples
- --------
- >>> from pandas.api.types import is_any_real_numeric_dtype
- >>> is_any_real_numeric_dtype(int)
- True
- >>> is_any_real_numeric_dtype(float)
- True
- >>> is_any_real_numeric_dtype(object)
- False
- >>> is_any_real_numeric_dtype(str)
- False
- >>> is_any_real_numeric_dtype(complex(1, 2))
- False
- >>> is_any_real_numeric_dtype(bool)
- False
- """
- return (
- is_numeric_dtype(arr_or_dtype)
- and not is_complex_dtype(arr_or_dtype)
- and not is_bool_dtype(arr_or_dtype)
- )
-
-
-def is_float_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of a float dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of a float dtype.
-
- Examples
- --------
- >>> from pandas.api.types import is_float_dtype
- >>> is_float_dtype(str)
- False
- >>> is_float_dtype(int)
- False
- >>> is_float_dtype(float)
- True
- >>> is_float_dtype(np.array(['a', 'b']))
- False
- >>> is_float_dtype(pd.Series([1, 2]))
- False
- >>> is_float_dtype(pd.Index([1, 2.]))
- True
- """
- return _is_dtype_type(arr_or_dtype, classes(np.floating)) or _is_dtype(
- arr_or_dtype, lambda typ: isinstance(typ, ExtensionDtype) and typ.kind in "f"
- )
-
-
-def is_bool_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of a boolean dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of a boolean dtype.
-
- Notes
- -----
- An ExtensionArray is considered boolean when the ``_is_boolean``
- attribute is set to True.
-
- Examples
- --------
- >>> from pandas.api.types import is_bool_dtype
- >>> is_bool_dtype(str)
- False
- >>> is_bool_dtype(int)
- False
- >>> is_bool_dtype(bool)
- True
- >>> is_bool_dtype(np.bool_)
- True
- >>> is_bool_dtype(np.array(['a', 'b']))
- False
- >>> is_bool_dtype(pd.Series([1, 2]))
- False
- >>> is_bool_dtype(np.array([True, False]))
- True
- >>> is_bool_dtype(pd.Categorical([True, False]))
- True
- >>> is_bool_dtype(pd.arrays.SparseArray([True, False]))
- True
- """
- if arr_or_dtype is None:
- return False
- try:
- dtype = get_dtype(arr_or_dtype)
- except (TypeError, ValueError):
- return False
-
- if isinstance(dtype, CategoricalDtype):
- arr_or_dtype = dtype.categories
- # now we use the special definition for Index
-
- if isinstance(arr_or_dtype, ABCIndex):
- # Allow Index[object] that is all-bools or Index["boolean"]
- return arr_or_dtype.inferred_type == "boolean"
- elif isinstance(dtype, ExtensionDtype):
- return getattr(dtype, "_is_boolean", False)
-
- return issubclass(dtype.type, np.bool_)
-
-
-def is_1d_only_ea_obj(obj: Any) -> bool:
- """
- ExtensionArray that does not support 2D, or more specifically that does
- not use HybridBlock.
- """
- from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
- PeriodArray,
- TimedeltaArray,
- )
-
- return isinstance(obj, ExtensionArray) and not isinstance(
- obj, (DatetimeArray, TimedeltaArray, PeriodArray)
- )
-
-
-def is_1d_only_ea_dtype(dtype: DtypeObj | None) -> bool:
- """
- Analogue to is_extension_array_dtype but excluding DatetimeTZDtype and PeriodDtype.
- """
- # Note: if other EA dtypes are ever held in HybridBlock, exclude those
- # here too.
- # NB: need to check DatetimeTZDtype and not is_datetime64tz_dtype
- # to exclude ArrowTimestampUSDtype
- return isinstance(dtype, ExtensionDtype) and not isinstance(
- dtype, (DatetimeTZDtype, PeriodDtype)
- )
-
-
-def is_extension_array_dtype(arr_or_dtype) -> bool:
- """
- Check if an object is a pandas extension array type.
-
- See the :ref:`User Guide <extending.extension-types>` for more.
-
- Parameters
- ----------
- arr_or_dtype : object
- For array-like input, the ``.dtype`` attribute will
- be extracted.
-
- Returns
- -------
- bool
- Whether the `arr_or_dtype` is an extension array type.
-
- Notes
- -----
- This checks whether an object implements the pandas extension
- array interface. In pandas, this includes:
-
- * Categorical
- * Sparse
- * Interval
- * Period
- * DatetimeArray
- * TimedeltaArray
-
- Third-party libraries may implement arrays or types satisfying
- this interface as well.
-
- Examples
- --------
- >>> from pandas.api.types import is_extension_array_dtype
- >>> arr = pd.Categorical(['a', 'b'])
- >>> is_extension_array_dtype(arr)
- True
- >>> is_extension_array_dtype(arr.dtype)
- True
-
- >>> arr = np.array(['a', 'b'])
- >>> is_extension_array_dtype(arr.dtype)
- False
- """
- dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype)
- if isinstance(dtype, ExtensionDtype):
- return True
- elif isinstance(dtype, np.dtype):
- return False
- else:
- return registry.find(dtype) is not None
-
-
-def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool:
- """
- Check for ExtensionDtype, datetime64 dtype, or timedelta64 dtype.
-
- Notes
- -----
- Checks only for dtype objects, not dtype-castable strings or types.
- """
- return isinstance(dtype, ExtensionDtype) or (
- isinstance(dtype, np.dtype) and dtype.kind in ["m", "M"]
- )
-
-
-def is_complex_dtype(arr_or_dtype) -> bool:
- """
- Check whether the provided array or dtype is of a complex dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array or dtype to check.
-
- Returns
- -------
- boolean
- Whether or not the array or dtype is of a complex dtype.
-
- Examples
- --------
- >>> from pandas.api.types import is_complex_dtype
- >>> is_complex_dtype(str)
- False
- >>> is_complex_dtype(int)
- False
- >>> is_complex_dtype(np.complex_)
- True
- >>> is_complex_dtype(np.array(['a', 'b']))
- False
- >>> is_complex_dtype(pd.Series([1, 2]))
- False
- >>> is_complex_dtype(np.array([1 + 1j, 5]))
- True
- """
- return _is_dtype_type(arr_or_dtype, classes(np.complexfloating))
-
-
-def _is_dtype(arr_or_dtype, condition) -> bool:
- """
- Return true if the condition is satisfied for the arr_or_dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like, str, np.dtype, or ExtensionArrayType
- The array-like or dtype object whose dtype we want to extract.
- condition : callable[Union[np.dtype, ExtensionDtype]]
-
- Returns
- -------
- bool
-
- """
- if arr_or_dtype is None:
- return False
- try:
- dtype = get_dtype(arr_or_dtype)
- except (TypeError, ValueError):
- return False
- return condition(dtype)
-
-
-def get_dtype(arr_or_dtype) -> DtypeObj:
- """
- Get the dtype instance associated with an array
- or dtype object.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype object whose dtype we want to extract.
-
- Returns
- -------
- obj_dtype : The extracted dtype instance from the
- passed in array or dtype object.
-
- Raises
- ------
- TypeError : The passed in object is None.
- """
- if arr_or_dtype is None:
- raise TypeError("Cannot deduce dtype from null object")
-
- # fastpath
- if isinstance(arr_or_dtype, np.dtype):
- return arr_or_dtype
- elif isinstance(arr_or_dtype, type):
- return np.dtype(arr_or_dtype)
-
- # if we have an array-like
- elif hasattr(arr_or_dtype, "dtype"):
- arr_or_dtype = arr_or_dtype.dtype
-
- return pandas_dtype(arr_or_dtype)
-
-
-def _is_dtype_type(arr_or_dtype, condition) -> bool:
- """
- Return true if the condition is satisfied for the arr_or_dtype.
-
- Parameters
- ----------
- arr_or_dtype : array-like or dtype
- The array-like or dtype object whose dtype we want to extract.
- condition : callable[Union[np.dtype, ExtensionDtypeType]]
-
- Returns
- -------
- bool : if the condition is satisfied for the arr_or_dtype
- """
- if arr_or_dtype is None:
- return condition(type(None))
-
- # fastpath
- if isinstance(arr_or_dtype, np.dtype):
- return condition(arr_or_dtype.type)
- elif isinstance(arr_or_dtype, type):
- if issubclass(arr_or_dtype, ExtensionDtype):
- arr_or_dtype = arr_or_dtype.type
- return condition(np.dtype(arr_or_dtype).type)
-
- # if we have an array-like
- if hasattr(arr_or_dtype, "dtype"):
- arr_or_dtype = arr_or_dtype.dtype
-
- # we cannot possibly be a dtype
- elif is_list_like(arr_or_dtype):
- return condition(type(None))
-
- try:
- tipo = pandas_dtype(arr_or_dtype).type
- except (TypeError, ValueError):
- if is_scalar(arr_or_dtype):
- return condition(type(None))
-
- return False
-
- return condition(tipo)
-
-
-def infer_dtype_from_object(dtype) -> type:
- """
- Get a numpy dtype.type-style object for a dtype object.
-
- This method also includes handling of the datetime64[ns] and
- datetime64[ns, TZ] objects.
-
- If no dtype can be found, we return ``object``.
-
- Parameters
- ----------
- dtype : dtype, type
- The dtype object whose numpy dtype.type-style
- object we want to extract.
-
- Returns
- -------
- type
- """
- if isinstance(dtype, type) and issubclass(dtype, np.generic):
- # Type object from a dtype
-
- return dtype
- elif isinstance(dtype, (np.dtype, ExtensionDtype)):
- # dtype object
- try:
- _validate_date_like_dtype(dtype)
- except TypeError:
- # Should still pass if we don't have a date-like
- pass
- if hasattr(dtype, "numpy_dtype"):
- # TODO: Implement this properly
- # https://github.com/pandas-dev/pandas/issues/52576
- return dtype.numpy_dtype.type
- return dtype.type
-
- try:
- dtype = pandas_dtype(dtype)
- except TypeError:
- pass
-
- if is_extension_array_dtype(dtype):
- return dtype.type
- elif isinstance(dtype, str):
- # TODO(jreback)
- # should deprecate these
- if dtype in ["datetimetz", "datetime64tz"]:
- return DatetimeTZDtype.type
- elif dtype in ["period"]:
- raise NotImplementedError
-
- if dtype in ["datetime", "timedelta"]:
- dtype += "64"
- try:
- return infer_dtype_from_object(getattr(np, dtype))
- except (AttributeError, TypeError):
- # Handles cases like get_dtype(int) i.e.,
- # Python objects that are valid dtypes
- # (unlike user-defined types, in general)
- #
- # TypeError handles the float16 type code of 'e'
- # further handle internal types
- pass
-
- return infer_dtype_from_object(np.dtype(dtype))
-
-
-def _validate_date_like_dtype(dtype) -> None:
- """
- Check whether the dtype is a date-like dtype. Raises an error if invalid.
-
- Parameters
- ----------
- dtype : dtype, type
- The dtype to check.
-
- Raises
- ------
- TypeError : The dtype could not be cast to a date-like dtype.
- ValueError : The dtype is an illegal date-like dtype (e.g. the
- frequency provided is too specific)
- """
- try:
- typ = np.datetime_data(dtype)[0]
- except ValueError as e:
- raise TypeError(e) from e
- if typ not in ["generic", "ns"]:
- raise ValueError(
- f"{repr(dtype.name)} is too specific of a frequency, "
- f"try passing {repr(dtype.type.__name__)}"
- )
-
-
-def validate_all_hashable(*args, error_name: str | None = None) -> None:
- """
- Return None if all args are hashable, else raise a TypeError.
-
- Parameters
- ----------
- *args
- Arguments to validate.
- error_name : str, optional
- The name to use in the error message if an argument is not hashable.
-
- Raises
- ------
- TypeError : If an argument is not hashable
-
- Returns
- -------
- None
- """
- if not all(is_hashable(arg) for arg in args):
- if error_name:
- raise TypeError(f"{error_name} must be a hashable type")
- raise TypeError("All elements must be hashable")
-
-
-def pandas_dtype(dtype) -> DtypeObj:
- """
- Convert input into a pandas only dtype object or a numpy dtype object.
-
- Parameters
- ----------
- dtype : object to be converted
-
- Returns
- -------
- np.dtype or a pandas dtype
-
- Raises
- ------
- TypeError if not a dtype
- """
- # short-circuit
- if isinstance(dtype, np.ndarray):
- return dtype.dtype
- elif isinstance(dtype, (np.dtype, ExtensionDtype)):
- return dtype
-
- # registered extension types
- result = registry.find(dtype)
- if result is not None:
- return result
-
- # try a numpy dtype
- # raise a consistent TypeError if failed
- try:
- with warnings.catch_warnings():
- # GH#51523 - Series.astype(np.integer) doesn't show
- # numpy deprecation warning of np.integer
- # Hence enabling DeprecationWarning
- warnings.simplefilter("always", DeprecationWarning)
- npdtype = np.dtype(dtype)
- except SyntaxError as err:
- # np.dtype uses `eval` which can raise SyntaxError
- raise TypeError(f"data type '{dtype}' not understood") from err
-
- # Any invalid dtype (such as pd.Timestamp) should raise an error.
- # np.dtype(invalid_type).kind = 0 for such objects. However, this will
- # also catch some valid dtypes such as object, np.object_ and 'object'
- # which we safeguard against by catching them earlier and returning
- # np.dtype(valid_dtype) before this condition is evaluated.
- if is_hashable(dtype) and dtype in [object, np.object_, "object", "O"]:
- # check hashability to avoid errors/DeprecationWarning when we get
- # here and `dtype` is an array
- return npdtype
- elif npdtype.kind == "O":
- raise TypeError(f"dtype '{dtype}' not understood")
-
- return npdtype
-
-
-def is_all_strings(value: ArrayLike) -> bool:
- """
- Check if this is an array of strings that we should try parsing.
-
- Includes object-dtype ndarray containing all-strings, StringArray,
- and Categorical with all-string categories.
- Does not include numpy string dtypes.
- """
- dtype = value.dtype
-
- if isinstance(dtype, np.dtype):
- return (
- dtype == np.dtype("object")
- and lib.infer_dtype(value, skipna=False) == "string"
- )
- elif isinstance(dtype, CategoricalDtype):
- return dtype.categories.inferred_type == "string"
- return dtype == "string"
-
-
-__all__ = [
- "classes",
- "classes_and_not_datetimelike",
- "DT64NS_DTYPE",
- "ensure_float",
- "ensure_float64",
- "ensure_python_int",
- "ensure_str",
- "get_dtype",
- "infer_dtype_from_object",
- "INT64_DTYPE",
- "is_1d_only_ea_dtype",
- "is_1d_only_ea_obj",
- "is_all_strings",
- "is_any_int_dtype",
- "is_any_real_numeric_dtype",
- "is_array_like",
- "is_bool",
- "is_bool_dtype",
- "is_categorical_dtype",
- "is_complex",
- "is_complex_dtype",
- "is_dataclass",
- "is_datetime64_any_dtype",
- "is_datetime64_dtype",
- "is_datetime64_ns_dtype",
- "is_datetime64tz_dtype",
- "is_datetime_or_timedelta_dtype",
- "is_decimal",
- "is_dict_like",
- "is_dtype_equal",
- "is_ea_or_datetimelike_dtype",
- "is_extension_array_dtype",
- "is_file_like",
- "is_float_dtype",
- "is_int64_dtype",
- "is_integer_dtype",
- "is_interval",
- "is_interval_dtype",
- "is_iterator",
- "is_named_tuple",
- "is_nested_list_like",
- "is_number",
- "is_numeric_dtype",
- "is_object_dtype",
- "is_period_dtype",
- "is_re",
- "is_re_compilable",
- "is_scipy_sparse",
- "is_sequence",
- "is_signed_integer_dtype",
- "is_sparse",
- "is_string_dtype",
- "is_string_or_object_np_dtype",
- "is_timedelta64_dtype",
- "is_timedelta64_ns_dtype",
- "is_unsigned_integer_dtype",
- "needs_i8_conversion",
- "pandas_dtype",
- "TD64NS_DTYPE",
- "validate_all_hashable",
-]
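
For reference, a short usage sketch of the public checks that this module backs through pandas.api.types (illustrative only; expected results shown as comments, based on the docstrings above):

import pandas as pd
from pandas.api.types import (
    is_extension_array_dtype,
    is_integer_dtype,
    pandas_dtype,
)

pandas_dtype("Int64")   # Int64Dtype(), resolved via the extension-type registry
pandas_dtype("int64")   # dtype('int64'), resolved by falling back to np.dtype()
is_integer_dtype("Int8")                              # True; nullable ints count as integer
is_extension_array_dtype(pd.Categorical(["a", "b"]))  # True
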
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/concat.py b/contrib/python/pandas/py3/pandas/core/dtypes/concat.py
deleted file mode 100644
index 70956302077..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/concat.py
+++ /dev/null
@@ -1,323 +0,0 @@
-"""
-Utility functions related to concat.
-"""
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import numpy as np
-
-from pandas._typing import AxisInt
-
-from pandas.core.dtypes.astype import astype_array
-from pandas.core.dtypes.cast import (
- common_dtype_categorical_compat,
- find_common_type,
- np_find_common_type,
-)
-from pandas.core.dtypes.common import is_dtype_equal
-from pandas.core.dtypes.dtypes import (
- DatetimeTZDtype,
- ExtensionDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCCategoricalIndex,
- ABCExtensionArray,
- ABCSeries,
-)
-
-if TYPE_CHECKING:
- from pandas.core.arrays import Categorical
-
-
-def concat_compat(to_concat, axis: AxisInt = 0, ea_compat_axis: bool = False):
- """
- provide concatenation of an array of arrays, each of which is a single
- 'normalized' dtype (in that, for example, if it's object, then it is a
- non-datetimelike), and provide a combined dtype for the resulting array
- that preserves the overall dtype if possible.
-
- Parameters
- ----------
- to_concat : array of arrays
- axis : axis to provide concatenation
- ea_compat_axis : bool, default False
- For ExtensionArray compat, behave as if axis == 1 when determining
- whether to drop empty arrays.
-
- Returns
- -------
- a single array, preserving the combined dtypes
- """
-
- # filter empty arrays
- # 1-d dtypes always are included here
- def is_nonempty(x) -> bool:
- if x.ndim <= axis:
- return True
- return x.shape[axis] > 0
-
- # If all arrays are empty, there's nothing to convert, just short-cut to
- # the concatenation, #3121.
- #
- # Creating an empty array directly is tempting, but the gains would be
- # marginal given that it would still require shape & dtype calculation,
- # and np.concatenate, which has both implemented, is compiled.
- non_empties = [x for x in to_concat if is_nonempty(x)]
- if non_empties and axis == 0 and not ea_compat_axis:
- # ea_compat_axis see GH#39574
- to_concat = non_empties
-
- dtypes = {obj.dtype for obj in to_concat}
- kinds = {obj.dtype.kind for obj in to_concat}
- contains_datetime = any(
- isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in ["m", "M"]
- for dtype in dtypes
- ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat)
-
- all_empty = not len(non_empties)
- single_dtype = len({x.dtype for x in to_concat}) == 1
- any_ea = any(isinstance(x.dtype, ExtensionDtype) for x in to_concat)
-
- if contains_datetime:
- return _concat_datetime(to_concat, axis=axis)
-
- if any_ea:
- # we ignore axis here, as internally concatting with EAs is always
- # for axis=0
- if not single_dtype:
- target_dtype = find_common_type([x.dtype for x in to_concat])
- target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
- to_concat = [
- astype_array(arr, target_dtype, copy=False) for arr in to_concat
- ]
-
- if isinstance(to_concat[0], ABCExtensionArray):
- # TODO: what about EA-backed Index?
- cls = type(to_concat[0])
- return cls._concat_same_type(to_concat)
- else:
- return np.concatenate(to_concat)
-
- elif all_empty:
- # we have all empties, but may need to coerce the result dtype to
- # object if we have non-numeric type operands (numpy would otherwise
- # cast this to float)
- if len(kinds) != 1:
- if not len(kinds - {"i", "u", "f"}) or not len(kinds - {"b", "i", "u"}):
- # let numpy coerce
- pass
- else:
- # coerce to object
- to_concat = [x.astype("object") for x in to_concat]
- kinds = {"o"}
- else:
- target_dtype = np_find_common_type(*dtypes)
-
- result = np.concatenate(to_concat, axis=axis)
- if "b" in kinds and result.dtype.kind in ["i", "u", "f"]:
- # GH#39817 cast to object instead of casting bools to numeric
- result = result.astype(object, copy=False)
- return result
-
-
-def union_categoricals(
- to_union, sort_categories: bool = False, ignore_order: bool = False
-) -> Categorical:
- """
- Combine list-like of Categorical-like, unioning categories.
-
- All categories must have the same dtype.
-
- Parameters
- ----------
- to_union : list-like
- Categorical, CategoricalIndex, or Series with dtype='category'.
- sort_categories : bool, default False
- If true, resulting categories will be lexsorted, otherwise
- they will be ordered as they appear in the data.
- ignore_order : bool, default False
- If true, the ordered attribute of the Categoricals will be ignored.
- Results in an unordered categorical.
-
- Returns
- -------
- Categorical
-
- Raises
- ------
- TypeError
- - all inputs do not have the same dtype
- - all inputs do not have the same ordered property
- - all inputs are ordered and their categories are not identical
- - sort_categories=True and Categoricals are ordered
- ValueError
- Empty list of categoricals passed
-
- Notes
- -----
- To learn more about categories, see `link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__
-
- Examples
- --------
- If you want to combine categoricals that do not necessarily have
- the same categories, `union_categoricals` will combine a list-like
- of categoricals. The new categories will be the union of the
- categories being combined.
-
- >>> a = pd.Categorical(["b", "c"])
- >>> b = pd.Categorical(["a", "b"])
- >>> pd.api.types.union_categoricals([a, b])
- ['b', 'c', 'a', 'b']
- Categories (3, object): ['b', 'c', 'a']
-
- By default, the resulting categories will be ordered as they appear
- in the `categories` of the data. If you want the categories to be
- lexsorted, use `sort_categories=True` argument.
-
- >>> pd.api.types.union_categoricals([a, b], sort_categories=True)
- ['b', 'c', 'a', 'b']
- Categories (3, object): ['a', 'b', 'c']
-
- `union_categoricals` also works with the case of combining two
- categoricals of the same categories and order information (i.e. what
- you could also use `append` for).
-
- >>> a = pd.Categorical(["a", "b"], ordered=True)
- >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
- >>> pd.api.types.union_categoricals([a, b])
- ['a', 'b', 'a', 'b', 'a']
- Categories (2, object): ['a' < 'b']
-
- Raises `TypeError` because the categories are ordered and not identical.
-
- >>> a = pd.Categorical(["a", "b"], ordered=True)
- >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
- >>> pd.api.types.union_categoricals([a, b])
- Traceback (most recent call last):
- ...
- TypeError: to union ordered Categoricals, all categories must be the same
-
- New in version 0.20.0
-
- Ordered categoricals with different categories or orderings can be
- combined by using the `ignore_order=True` argument.
-
- >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
- >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
- >>> pd.api.types.union_categoricals([a, b], ignore_order=True)
- ['a', 'b', 'c', 'c', 'b', 'a']
- Categories (3, object): ['a', 'b', 'c']
-
- `union_categoricals` also works with a `CategoricalIndex`, or `Series`
- containing categorical data, but note that the resulting array will
- always be a plain `Categorical`
-
- >>> a = pd.Series(["b", "c"], dtype='category')
- >>> b = pd.Series(["a", "b"], dtype='category')
- >>> pd.api.types.union_categoricals([a, b])
- ['b', 'c', 'a', 'b']
- Categories (3, object): ['b', 'c', 'a']
- """
- from pandas import Categorical
- from pandas.core.arrays.categorical import recode_for_categories
-
- if len(to_union) == 0:
- raise ValueError("No Categoricals to union")
-
- def _maybe_unwrap(x):
- if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
- return x._values
- elif isinstance(x, Categorical):
- return x
- else:
- raise TypeError("all components to combine must be Categorical")
-
- to_union = [_maybe_unwrap(x) for x in to_union]
- first = to_union[0]
-
- if not all(
- is_dtype_equal(other.categories.dtype, first.categories.dtype)
- for other in to_union[1:]
- ):
- raise TypeError("dtype of categories must be the same")
-
- ordered = False
- if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
- # identical categories - fastpath
- categories = first.categories
- ordered = first.ordered
-
- all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
- new_codes = np.concatenate(all_codes)
-
- if sort_categories and not ignore_order and ordered:
- raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
-
- if sort_categories and not categories.is_monotonic_increasing:
- categories = categories.sort_values()
- indexer = categories.get_indexer(first.categories)
-
- from pandas.core.algorithms import take_nd
-
- new_codes = take_nd(indexer, new_codes, fill_value=-1)
- elif ignore_order or all(not c.ordered for c in to_union):
- # different categories - union and recode
- cats = first.categories.append([c.categories for c in to_union[1:]])
- categories = cats.unique()
- if sort_categories:
- categories = categories.sort_values()
-
- new_codes = [
- recode_for_categories(c.codes, c.categories, categories) for c in to_union
- ]
- new_codes = np.concatenate(new_codes)
- else:
- # ordered - to show a proper error message
- if all(c.ordered for c in to_union):
- msg = "to union ordered Categoricals, all categories must be the same"
- raise TypeError(msg)
- raise TypeError("Categorical.ordered must be the same")
-
- if ignore_order:
- ordered = False
-
- return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
-
-
-def _concatenate_2d(to_concat, axis: AxisInt):
- # coerce to 2d if needed & concatenate
- if axis == 1:
- to_concat = [np.atleast_2d(x) for x in to_concat]
- return np.concatenate(to_concat, axis=axis)
-
-
-def _concat_datetime(to_concat, axis: AxisInt = 0):
- """
- provide concatenation of a datetimelike array of arrays, each of which is a
- single M8[ns], datetime64[ns, tz] or m8[ns] dtype
-
- Parameters
- ----------
- to_concat : array of arrays
- axis : axis to provide concatenation
-
- Returns
- -------
- a single array, preserving the combined dtypes
- """
- from pandas.core.construction import ensure_wrapped_if_datetimelike
-
- to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat]
-
- single_dtype = len({x.dtype for x in to_concat}) == 1
-
- # multiple types, need to coerce to object
- if not single_dtype:
- # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
- # in Timestamp/Timedelta
- return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis)
-
- result = type(to_concat[0])._concat_same_type(to_concat, axis=axis)
- return result
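
The bool-vs-numeric special case near the end of concat_compat (GH#39817) can be reproduced standalone. A minimal sketch, assuming plain numpy inputs:

import numpy as np

parts = [np.array([True, False]), np.array([1, 2])]
kinds = {arr.dtype.kind for arr in parts}

result = np.concatenate(parts)  # numpy alone would upcast the bools to int64
if "b" in kinds and result.dtype.kind in "iuf":
    # same rule as above: bools mixed with numeric kinds fall back to object
    result = result.astype(object)
print(result.dtype)  # object
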
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/dtypes.py b/contrib/python/pandas/py3/pandas/core/dtypes/dtypes.py
deleted file mode 100644
index 123e20d6258..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/dtypes.py
+++ /dev/null
@@ -1,1478 +0,0 @@
-"""
-Define extension dtypes.
-"""
-from __future__ import annotations
-
-import re
-from typing import (
- TYPE_CHECKING,
- Any,
- MutableMapping,
- cast,
-)
-
-import numpy as np
-import pytz
-
-from pandas._libs import missing as libmissing
-from pandas._libs.interval import Interval
-from pandas._libs.properties import cache_readonly
-from pandas._libs.tslibs import (
- BaseOffset,
- NaT,
- NaTType,
- Period,
- Timestamp,
- timezones,
- to_offset,
- tz_compare,
-)
-from pandas._libs.tslibs.dtypes import (
- PeriodDtypeBase,
- abbrev_to_npy_unit,
-)
-from pandas._typing import (
- Dtype,
- DtypeObj,
- Ordered,
- npt,
- type_t,
-)
-
-from pandas.core.dtypes.base import (
- ExtensionDtype,
- register_extension_dtype,
-)
-from pandas.core.dtypes.generic import (
- ABCCategoricalIndex,
- ABCIndex,
-)
-from pandas.core.dtypes.inference import (
- is_bool,
- is_list_like,
-)
-
-if TYPE_CHECKING:
- from datetime import tzinfo
-
- import pyarrow
-
- from pandas import (
- Categorical,
- Index,
- )
- from pandas.core.arrays import (
- BaseMaskedArray,
- DatetimeArray,
- IntervalArray,
- PandasArray,
- PeriodArray,
- )
-
-str_type = str
-
-
-class PandasExtensionDtype(ExtensionDtype):
- """
- A np.dtype duck-typed class, suitable for holding a custom dtype.
-
- THIS IS NOT A REAL NUMPY DTYPE
- """
-
- type: Any
- kind: Any
- # The Any type annotations above are here only because mypy seems to have a
- # problem dealing with multiple inheritance from PandasExtensionDtype
- # and ExtensionDtype's @properties in the subclasses below. The kind and
- # type variables in those subclasses are explicitly typed below.
- subdtype = None
- str: str_type
- num = 100
- shape: tuple[int, ...] = ()
- itemsize = 8
- base: DtypeObj | None = None
- isbuiltin = 0
- isnative = 0
- _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
-
- def __repr__(self) -> str_type:
- """
- Return a string representation for a particular object.
- """
- return str(self)
-
- def __hash__(self) -> int:
- raise NotImplementedError("sub-classes should implement an __hash__ method")
-
- def __getstate__(self) -> dict[str_type, Any]:
- # pickle support; we don't want to pickle the cache
- return {k: getattr(self, k, None) for k in self._metadata}
-
- @classmethod
- def reset_cache(cls) -> None:
- """clear the cache"""
- cls._cache_dtypes = {}
-
-
-class CategoricalDtypeType(type):
- """
- the type of CategoricalDtype; this metaclass determines subclass ability
- """
-
-
-@register_extension_dtype
-class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
- """
- Type for categorical data with the categories and orderedness.
-
- Parameters
- ----------
- categories : sequence, optional
- Must be unique, and must not contain any nulls.
- The categories are stored in an Index,
- and if an index is provided the dtype of that index will be used.
- ordered : bool or None, default False
- Whether or not this categorical is treated as an ordered categorical.
- None can be used to maintain the ordered value of existing categoricals when
- used in operations that combine categoricals, e.g. astype, and will resolve to
- False if there is no existing ordered to maintain.
-
- Attributes
- ----------
- categories
- ordered
-
- Methods
- -------
- None
-
- See Also
- --------
- Categorical : Represent a categorical variable in classic R / S-plus fashion.
-
- Notes
- -----
- This class is useful for specifying the type of a ``Categorical``
- independent of the values. See :ref:`categorical.categoricaldtype`
- for more.
-
- Examples
- --------
- >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True)
- >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t)
- 0 a
- 1 b
- 2 a
- 3 NaN
- dtype: category
- Categories (2, object): ['b' < 'a']
-
- An empty CategoricalDtype with a specific dtype can be created
- by providing an empty index, as follows:
-
- >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype
- dtype('<M8[ns]')
- """
-
- # TODO: Document public vs. private API
- name = "category"
- type: type[CategoricalDtypeType] = CategoricalDtypeType
- kind: str_type = "O"
- str = "|O08"
- base = np.dtype("O")
- _metadata = ("categories", "ordered")
- _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
-
- def __init__(self, categories=None, ordered: Ordered = False) -> None:
- self._finalize(categories, ordered, fastpath=False)
-
- @classmethod
- def _from_fastpath(
- cls, categories=None, ordered: bool | None = None
- ) -> CategoricalDtype:
- self = cls.__new__(cls)
- self._finalize(categories, ordered, fastpath=True)
- return self
-
- @classmethod
- def _from_categorical_dtype(
- cls, dtype: CategoricalDtype, categories=None, ordered: Ordered = None
- ) -> CategoricalDtype:
- if categories is ordered is None:
- return dtype
- if categories is None:
- categories = dtype.categories
- if ordered is None:
- ordered = dtype.ordered
- return cls(categories, ordered)
-
- @classmethod
- def _from_values_or_dtype(
- cls,
- values=None,
- categories=None,
- ordered: bool | None = None,
- dtype: Dtype | None = None,
- ) -> CategoricalDtype:
- """
- Construct dtype from the input parameters used in :class:`Categorical`.
-
- This constructor method specifically does not do the factorization
- step, if that is needed to find the categories. This constructor may
- therefore return ``CategoricalDtype(categories=None, ordered=None)``,
- which may not be useful. Additional steps may therefore have to be
- taken to create the final dtype.
-
- The return dtype is specified from the inputs in this prioritized
- order:
- 1. if dtype is a CategoricalDtype, return dtype
- 2. if dtype is the string 'category', create a CategoricalDtype from
- the supplied categories and ordered parameters, and return that.
- 3. if values is a categorical, use values.dtype, but override it with
- categories and ordered if either/both of those are not None.
- 4. if dtype is None and values is not a categorical, construct the
- dtype from categories and ordered, even if either of those is None.
-
- Parameters
- ----------
- values : list-like, optional
- The list-like must be 1-dimensional.
- categories : list-like, optional
- Categories for the CategoricalDtype.
- ordered : bool, optional
- Designating if the categories are ordered.
- dtype : CategoricalDtype or the string "category", optional
- If ``CategoricalDtype``, cannot be used together with
- `categories` or `ordered`.
-
- Returns
- -------
- CategoricalDtype
-
- Examples
- --------
- >>> pd.CategoricalDtype._from_values_or_dtype()
- CategoricalDtype(categories=None, ordered=None)
- >>> pd.CategoricalDtype._from_values_or_dtype(
- ... categories=['a', 'b'], ordered=True
- ... )
- CategoricalDtype(categories=['a', 'b'], ordered=True)
- >>> dtype1 = pd.CategoricalDtype(['a', 'b'], ordered=True)
- >>> dtype2 = pd.CategoricalDtype(['x', 'y'], ordered=False)
- >>> c = pd.Categorical([0, 1], dtype=dtype1, fastpath=True)
- >>> pd.CategoricalDtype._from_values_or_dtype(
- ... c, ['x', 'y'], ordered=True, dtype=dtype2
- ... )
- Traceback (most recent call last):
- ...
- ValueError: Cannot specify `categories` or `ordered` together with
- `dtype`.
-
- The supplied dtype takes precedence over values' dtype:
-
- >>> pd.CategoricalDtype._from_values_or_dtype(c, dtype=dtype2)
- CategoricalDtype(categories=['x', 'y'], ordered=False)
- """
-
- if dtype is not None:
- # The dtype argument takes precedence over values.dtype (if any)
- if isinstance(dtype, str):
- if dtype == "category":
- if ordered is None and cls.is_dtype(values):
- # GH#49309 preserve orderedness
- ordered = values.dtype.ordered
-
- dtype = CategoricalDtype(categories, ordered)
- else:
- raise ValueError(f"Unknown dtype {repr(dtype)}")
- elif categories is not None or ordered is not None:
- raise ValueError(
- "Cannot specify `categories` or `ordered` together with `dtype`."
- )
- elif not isinstance(dtype, CategoricalDtype):
- raise ValueError(f"Cannot construct CategoricalDtype from {dtype}")
- elif cls.is_dtype(values):
- # If no "dtype" was passed, use the one from "values", but honor
- # the "ordered" and "categories" arguments
- dtype = values.dtype._from_categorical_dtype(
- values.dtype, categories, ordered
- )
- else:
- # If dtype=None and values is not categorical, create a new dtype.
- # Note: This could potentially have categories=None and
- # ordered=None.
- dtype = CategoricalDtype(categories, ordered)
-
- return cast(CategoricalDtype, dtype)
-
- @classmethod
- def construct_from_string(cls, string: str_type) -> CategoricalDtype:
- """
- Construct a CategoricalDtype from a string.
-
- Parameters
- ----------
- string : str
- Must be the string "category" in order to be successfully constructed.
-
- Returns
- -------
- CategoricalDtype
- Instance of the dtype.
-
- Raises
- ------
- TypeError
- If a CategoricalDtype cannot be constructed from the input.
- """
- if not isinstance(string, str):
- raise TypeError(
- f"'construct_from_string' expects a string, got {type(string)}"
- )
- if string != cls.name:
- raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'")
-
- # need ordered=None to ensure that operations specifying dtype="category" don't
- # override the ordered value for existing categoricals
- return cls(ordered=None)
-
- def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None:
- if ordered is not None:
- self.validate_ordered(ordered)
-
- if categories is not None:
- categories = self.validate_categories(categories, fastpath=fastpath)
-
- self._categories = categories
- self._ordered = ordered
-
- def __setstate__(self, state: MutableMapping[str_type, Any]) -> None:
- # for pickle compat. __get_state__ is defined in the
- # PandasExtensionDtype superclass and uses the public properties to
- # pickle -> need to set the settable private ones here (see GH26067)
- self._categories = state.pop("categories", None)
- self._ordered = state.pop("ordered", False)
-
- def __hash__(self) -> int:
- # _hash_categories returns a uint64, so use the negative
- # space for when we have unknown categories to avoid a conflict
- if self.categories is None:
- if self.ordered:
- return -1
- else:
- return -2
- # We *do* want to include the real self.ordered here
- return int(self._hash_categories)
-
- def __eq__(self, other: Any) -> bool:
- """
- Rules for CDT equality:
- 1) Any CDT is equal to the string 'category'
- 2) Any CDT is equal to itself
- 3) Any CDT is equal to a CDT with categories=None regardless of ordered
- 4) A CDT with ordered=True is only equal to another CDT with
- ordered=True and identical categories in the same order
- 5) A CDT with ordered={False, None} is only equal to another CDT with
- ordered={False, None} and identical categories, but same order is
- not required. There is no distinction between False/None.
- 6) Any other comparison returns False
- """
- if isinstance(other, str):
- return other == self.name
- elif other is self:
- return True
- elif not (hasattr(other, "ordered") and hasattr(other, "categories")):
- return False
- elif self.categories is None or other.categories is None:
- # For non-fully-initialized dtypes, these are only equal to
- # - the string "category" (handled above)
- # - other CategoricalDtype with categories=None
- return self.categories is other.categories
- elif self.ordered or other.ordered:
- # At least one has ordered=True; equal if both have ordered=True
- # and the same values for categories in the same order.
- return (self.ordered == other.ordered) and self.categories.equals(
- other.categories
- )
- else:
- # Neither has ordered=True; equal if both have the same categories,
- # but same order is not necessary. There is no distinction between
- # ordered=False and ordered=None: CDT(., False) and CDT(., None)
- # will be equal if they have the same categories.
- left = self.categories
- right = other.categories
-
- # GH#36280 the ordering of checks here is for performance
- if not left.dtype == right.dtype:
- return False
-
- if len(left) != len(right):
- return False
-
- if self.categories.equals(other.categories):
- # Check and see if they happen to be identical categories
- return True
-
- if left.dtype != object:
- # Faster than calculating hash
- indexer = left.get_indexer(right)
- # Because left and right have the same length and are unique,
- # `indexer` not having any -1s implies that there is a
- # bijection between `left` and `right`.
- return (indexer != -1).all()
-
- # With object-dtype we need a comparison that identifies
- # e.g. int(2) as distinct from float(2)
- return hash(self) == hash(other)
-
- def __repr__(self) -> str_type:
- if self.categories is None:
- data = "None"
- else:
- data = self.categories._format_data(name=type(self).__name__)
- if data is None:
- # self.categories is RangeIndex
- data = str(self.categories._range)
- data = data.rstrip(", ")
- return f"CategoricalDtype(categories={data}, ordered={self.ordered})"
-
- @cache_readonly
- def _hash_categories(self) -> int:
- from pandas.core.util.hashing import (
- combine_hash_arrays,
- hash_array,
- hash_tuples,
- )
-
- categories = self.categories
- ordered = self.ordered
-
- if len(categories) and isinstance(categories[0], tuple):
- # assumes if any individual category is a tuple, then all are. ATM
- # I don't really want to support just some of the categories being
- # tuples.
- cat_list = list(categories) # breaks if a np.array of categories
- cat_array = hash_tuples(cat_list)
- else:
- if categories.dtype == "O" and len({type(x) for x in categories}) != 1:
- # TODO: hash_array doesn't handle mixed types. It casts
- # everything to a str first, which means we treat
- # {'1', '2'} the same as {'1', 2}
- # find a better solution
- hashed = hash((tuple(categories), ordered))
- return hashed
-
- if DatetimeTZDtype.is_dtype(categories.dtype):
- # Avoid future warning.
- categories = categories.view("datetime64[ns]")
-
- cat_array = hash_array(np.asarray(categories), categorize=False)
- if ordered:
- cat_array = np.vstack(
- [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
- )
- else:
- cat_array = np.array([cat_array])
- combined_hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
- return np.bitwise_xor.reduce(combined_hashed)
-
- @classmethod
- def construct_array_type(cls) -> type_t[Categorical]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas import Categorical
-
- return Categorical
-
- @staticmethod
- def validate_ordered(ordered: Ordered) -> None:
- """
- Validates that we have a valid ordered parameter. If
- it is not a boolean, a TypeError will be raised.
-
- Parameters
- ----------
- ordered : object
- The parameter to be verified.
-
- Raises
- ------
- TypeError
- If 'ordered' is not a boolean.
- """
- if not is_bool(ordered):
- raise TypeError("'ordered' must either be 'True' or 'False'")
-
- @staticmethod
- def validate_categories(categories, fastpath: bool = False) -> Index:
- """
- Validates that we have good categories
-
- Parameters
- ----------
- categories : array-like
- fastpath : bool
- Whether to skip nan and uniqueness checks
-
- Returns
- -------
- categories : Index
- """
- from pandas.core.indexes.base import Index
-
- if not fastpath and not is_list_like(categories):
- raise TypeError(
- f"Parameter 'categories' must be list-like, was {repr(categories)}"
- )
- if not isinstance(categories, ABCIndex):
- categories = Index._with_infer(categories, tupleize_cols=False)
-
- if not fastpath:
- if categories.hasnans:
- raise ValueError("Categorical categories cannot be null")
-
- if not categories.is_unique:
- raise ValueError("Categorical categories must be unique")
-
- if isinstance(categories, ABCCategoricalIndex):
- categories = categories.categories
-
- return categories
-
- def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype:
- """
- Returns a CategoricalDtype with categories and ordered taken from dtype
- if specified, otherwise falling back to those of self
-
- Parameters
- ----------
- dtype : CategoricalDtype
-
- Returns
- -------
- new_dtype : CategoricalDtype
- """
- if isinstance(dtype, str) and dtype == "category":
- # dtype='category' should not change anything
- return self
- elif not self.is_dtype(dtype):
- raise ValueError(
- f"a CategoricalDtype must be passed to perform an update, "
- f"got {repr(dtype)}"
- )
- else:
- # from here on, dtype is a CategoricalDtype
- dtype = cast(CategoricalDtype, dtype)
-
- # update categories/ordered unless they've been explicitly passed as None
- new_categories = (
- dtype.categories if dtype.categories is not None else self.categories
- )
- new_ordered = dtype.ordered if dtype.ordered is not None else self.ordered
-
- return CategoricalDtype(new_categories, new_ordered)
-
- @property
- def categories(self) -> Index:
- """
- An ``Index`` containing the unique categories allowed.
- """
- return self._categories
-
- @property
- def ordered(self) -> Ordered:
- """
- Whether the categories have an ordered relationship.
- """
- return self._ordered
-
- @property
- def _is_boolean(self) -> bool:
- from pandas.core.dtypes.common import is_bool_dtype
-
- return is_bool_dtype(self.categories)
-
- def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
- from pandas.core.arrays.sparse import SparseDtype
-
- # check if we have all categorical dtype with identical categories
- if all(isinstance(x, CategoricalDtype) for x in dtypes):
- first = dtypes[0]
- if all(first == other for other in dtypes[1:]):
- return first
-
- # special case non-initialized categorical
- # TODO we should figure out the expected return value in general
- non_init_cats = [
- isinstance(x, CategoricalDtype) and x.categories is None for x in dtypes
- ]
- if all(non_init_cats):
- return self
- elif any(non_init_cats):
- return None
-
- # categorical is aware of Sparse -> extract sparse subdtypes
- dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
- # extract the categories' dtype
- non_cat_dtypes = [
- x.categories.dtype if isinstance(x, CategoricalDtype) else x for x in dtypes
- ]
- # TODO should categorical always give an answer?
- from pandas.core.dtypes.cast import find_common_type
-
- return find_common_type(non_cat_dtypes)
-
-
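As an aside, the equality rules spelled out in `CategoricalDtype.__eq__` above can be exercised entirely through the public constructor; a small sketch, assuming stock pandas:

    import pandas as pd

    unordered = pd.CategoricalDtype(categories=["a", "b"])
    reordered = pd.CategoricalDtype(categories=["b", "a"])
    ordered = pd.CategoricalDtype(categories=["a", "b"], ordered=True)

    print(unordered == "category")  # True: any CategoricalDtype equals the string 'category'
    print(unordered == reordered)   # True: without ordered=True, category order is ignored
    print(ordered == reordered)     # False: ordered dtypes require identical categories in order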
-@register_extension_dtype
-class DatetimeTZDtype(PandasExtensionDtype):
- """
- An ExtensionDtype for timezone-aware datetime data.
-
- **This is not an actual numpy dtype**, but a duck type.
-
- Parameters
- ----------
- unit : str, default "ns"
- The precision of the datetime data. Currently limited
- to ``"ns"``.
- tz : str, int, or datetime.tzinfo
- The timezone.
-
- Attributes
- ----------
- unit
- tz
-
- Methods
- -------
- None
-
- Raises
- ------
- pytz.UnknownTimeZoneError
- When the requested timezone cannot be found.
-
- Examples
- --------
- >>> pd.DatetimeTZDtype(tz='UTC')
- datetime64[ns, UTC]
-
- >>> pd.DatetimeTZDtype(tz='dateutil/US/Central')
- datetime64[ns, tzfile('/usr/share/zoneinfo/US/Central')]
- """
-
- type: type[Timestamp] = Timestamp
- kind: str_type = "M"
- num = 101
- base = np.dtype("M8[ns]") # TODO: depend on reso?
- _metadata = ("unit", "tz")
- _match = re.compile(r"(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
- _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
-
- @property
- def na_value(self) -> NaTType:
- return NaT
-
- # error: Signature of "str" incompatible with supertype "PandasExtensionDtype"
- @cache_readonly
- def str(self) -> str: # type: ignore[override]
- return f"|M8[{self.unit}]"
-
- def __init__(self, unit: str_type | DatetimeTZDtype = "ns", tz=None) -> None:
- if isinstance(unit, DatetimeTZDtype):
- # error: "str" has no attribute "tz"
- unit, tz = unit.unit, unit.tz # type: ignore[attr-defined]
-
- if unit != "ns":
- if isinstance(unit, str) and tz is None:
- # maybe a string like datetime64[ns, tz], which we support for
- # now.
- result = type(self).construct_from_string(unit)
- unit = result.unit
- tz = result.tz
- msg = (
- f"Passing a dtype alias like 'datetime64[ns, {tz}]' "
- "to DatetimeTZDtype is no longer supported. Use "
- "'DatetimeTZDtype.construct_from_string()' instead."
- )
- raise ValueError(msg)
- if unit not in ["s", "ms", "us", "ns"]:
- raise ValueError("DatetimeTZDtype only supports s, ms, us, ns units")
-
- if tz:
- tz = timezones.maybe_get_tz(tz)
- tz = timezones.tz_standardize(tz)
- elif tz is not None:
- raise pytz.UnknownTimeZoneError(tz)
- if tz is None:
- raise TypeError("A 'tz' is required.")
-
- self._unit = unit
- self._tz = tz
-
- @cache_readonly
- def _creso(self) -> int:
- """
- The NPY_DATETIMEUNIT corresponding to this dtype's resolution.
- """
- return abbrev_to_npy_unit(self.unit)
-
- @property
- def unit(self) -> str_type:
- """
- The precision of the datetime data.
- """
- return self._unit
-
- @property
- def tz(self) -> tzinfo:
- """
- The timezone.
- """
- return self._tz
-
- @classmethod
- def construct_array_type(cls) -> type_t[DatetimeArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas.core.arrays import DatetimeArray
-
- return DatetimeArray
-
- @classmethod
- def construct_from_string(cls, string: str_type) -> DatetimeTZDtype:
- """
- Construct a DatetimeTZDtype from a string.
-
- Parameters
- ----------
- string : str
- The string alias for this DatetimeTZDtype.
- Should be formatted like ``datetime64[ns, <tz>]``,
- where ``<tz>`` is the timezone name.
-
- Examples
- --------
- >>> DatetimeTZDtype.construct_from_string('datetime64[ns, UTC]')
- datetime64[ns, UTC]
- """
- if not isinstance(string, str):
- raise TypeError(
- f"'construct_from_string' expects a string, got {type(string)}"
- )
-
- msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
- match = cls._match.match(string)
- if match:
- d = match.groupdict()
- try:
- return cls(unit=d["unit"], tz=d["tz"])
- except (KeyError, TypeError, ValueError) as err:
- # KeyError if maybe_get_tz tries and fails to get a
- # pytz timezone (actually pytz.UnknownTimeZoneError).
- # TypeError if we pass a nonsense tz;
- # ValueError if we pass a unit other than "ns"
- raise TypeError(msg) from err
- raise TypeError(msg)
-
- def __str__(self) -> str_type:
- return f"datetime64[{self.unit}, {self.tz}]"
-
- @property
- def name(self) -> str_type:
- """A string representation of the dtype."""
- return str(self)
-
- def __hash__(self) -> int:
- # make myself hashable
- # TODO: update this.
- return hash(str(self))
-
- def __eq__(self, other: Any) -> bool:
- if isinstance(other, str):
- if other.startswith("M8["):
- other = f"datetime64[{other[3:]}"
- return other == self.name
-
- return (
- isinstance(other, DatetimeTZDtype)
- and self.unit == other.unit
- and tz_compare(self.tz, other.tz)
- )
-
- def __setstate__(self, state) -> None:
- # for pickle compat. __get_state__ is defined in the
- # PandasExtensionDtype superclass and uses the public properties to
- # pickle -> need to set the settable private ones here (see GH26067)
- self._tz = state["tz"]
- self._unit = state["unit"]
-
-
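A comparable sketch for the DatetimeTZDtype deleted above, using only documented public calls (assuming pandas and pytz are installed as usual):

    import pandas as pd

    dtype = pd.DatetimeTZDtype(tz="UTC")
    print(dtype)                           # datetime64[ns, UTC]
    print(dtype == "datetime64[ns, UTC]")  # True: equal to its string alias

    same = pd.DatetimeTZDtype.construct_from_string("datetime64[ns, UTC]")
    print(same == dtype)                   # True

    ser = pd.Series(pd.to_datetime(["2021-01-01", "2021-06-01"]).tz_localize("UTC"))
    print(ser.dtype)                       # datetime64[ns, UTC]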
-@register_extension_dtype
-class PeriodDtype(PeriodDtypeBase, PandasExtensionDtype):
- """
- An ExtensionDtype for Period data.
-
- **This is not an actual numpy dtype**, but a duck type.
-
- Parameters
- ----------
- freq : str or DateOffset
- The frequency of this PeriodDtype.
-
- Attributes
- ----------
- freq
-
- Methods
- -------
- None
-
- Examples
- --------
- >>> pd.PeriodDtype(freq='D')
- period[D]
-
- >>> pd.PeriodDtype(freq=pd.offsets.MonthEnd())
- period[M]
- """
-
- type: type[Period] = Period
- kind: str_type = "O"
- str = "|O08"
- base = np.dtype("O")
- num = 102
- _metadata = ("freq",)
- _match = re.compile(r"(P|p)eriod\[(?P<freq>.+)\]")
- _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
-
- def __new__(cls, freq=None):
- """
- Parameters
- ----------
- freq : frequency
- """
- if isinstance(freq, PeriodDtype):
- return freq
-
- elif freq is None:
- # empty constructor for pickle compat
- # -10_000 corresponds to PeriodDtypeCode.UNDEFINED
- u = PeriodDtypeBase.__new__(cls, -10_000)
- u._freq = None
- return u
-
- if not isinstance(freq, BaseOffset):
- freq = cls._parse_dtype_strict(freq)
-
- try:
- return cls._cache_dtypes[freq.freqstr]
- except KeyError:
- dtype_code = freq._period_dtype_code
- u = PeriodDtypeBase.__new__(cls, dtype_code)
- u._freq = freq
- cls._cache_dtypes[freq.freqstr] = u
- return u
-
- def __reduce__(self):
- return type(self), (self.freq,)
-
- @property
- def freq(self):
- """
- The frequency object of this PeriodDtype.
- """
- return self._freq
-
- @classmethod
- def _parse_dtype_strict(cls, freq: str_type) -> BaseOffset:
- if isinstance(freq, str): # note: freq is already of type str!
- if freq.startswith("period[") or freq.startswith("Period["):
- m = cls._match.search(freq)
- if m is not None:
- freq = m.group("freq")
-
- freq_offset = to_offset(freq)
- if freq_offset is not None:
- return freq_offset
-
- raise ValueError("could not construct PeriodDtype")
-
- @classmethod
- def construct_from_string(cls, string: str_type) -> PeriodDtype:
- """
- Strict construction from a string; raise a TypeError if not
- possible.
- """
- if (
- isinstance(string, str)
- and (string.startswith("period[") or string.startswith("Period["))
- or isinstance(string, BaseOffset)
- ):
- # do not parse string like U as period[U]
- # avoid tuple to be regarded as freq
- try:
- return cls(freq=string)
- except ValueError:
- pass
- if isinstance(string, str):
- msg = f"Cannot construct a 'PeriodDtype' from '{string}'"
- else:
- msg = f"'construct_from_string' expects a string, got {type(string)}"
- raise TypeError(msg)
-
- def __str__(self) -> str_type:
- return self.name
-
- @property
- def name(self) -> str_type:
- return f"period[{self.freq.freqstr}]"
-
- @property
- def na_value(self) -> NaTType:
- return NaT
-
- def __hash__(self) -> int:
- # make myself hashable
- return hash(str(self))
-
- def __eq__(self, other: Any) -> bool:
- if isinstance(other, str):
- return other in [self.name, self.name.title()]
-
- elif isinstance(other, PeriodDtype):
- # For freqs that can be held by a PeriodDtype, this check is
- # equivalent to (and much faster than) self.freq == other.freq
- sfreq = self.freq
- ofreq = other.freq
- return (
- sfreq.n == ofreq.n
- and sfreq._period_dtype_code == ofreq._period_dtype_code
- )
-
- return False
-
- def __ne__(self, other: Any) -> bool:
- return not self.__eq__(other)
-
- def __setstate__(self, state) -> None:
- # for pickle compat. __getstate__ is defined in the
- # PandasExtensionDtype superclass and uses the public properties to
- # pickle -> need to set the settable private ones here (see GH26067)
- self._freq = state["freq"]
-
- @classmethod
- def is_dtype(cls, dtype: object) -> bool:
- """
- Return a boolean indicating whether the passed type is an actual dtype
- that we can match (via string or type).
- """
- if isinstance(dtype, str):
- # PeriodDtype can be instantiated from freq string like "U",
- # but doesn't regard freq str like "U" as dtype.
- if dtype.startswith("period[") or dtype.startswith("Period["):
- try:
- return cls._parse_dtype_strict(dtype) is not None
- except ValueError:
- return False
- else:
- return False
- return super().is_dtype(dtype)
-
- @classmethod
- def construct_array_type(cls) -> type_t[PeriodArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas.core.arrays import PeriodArray
-
- return PeriodArray
-
- def __from_arrow__(
- self, array: pyarrow.Array | pyarrow.ChunkedArray
- ) -> PeriodArray:
- """
- Construct PeriodArray from pyarrow Array/ChunkedArray.
- """
- import pyarrow
-
- from pandas.core.arrays import PeriodArray
- from pandas.core.arrays.arrow._arrow_utils import (
- pyarrow_array_to_numpy_and_mask,
- )
-
- if isinstance(array, pyarrow.Array):
- chunks = [array]
- else:
- chunks = array.chunks
-
- results = []
- for arr in chunks:
- data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=np.dtype(np.int64))
- parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
- # error: Invalid index type "ndarray[Any, dtype[bool_]]" for "PeriodArray";
- # expected type "Union[int, Sequence[int], Sequence[bool], slice]"
- parr[~mask] = NaT # type: ignore[index]
- results.append(parr)
-
- if not results:
- return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False)
- return PeriodArray._concat_same_type(results)
-
-
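And likewise for PeriodDtype, a sketch against the public API rather than this vendored copy:

    import pandas as pd

    dtype = pd.PeriodDtype(freq="D")
    print(dtype)                 # period[D]
    print(dtype == "period[D]")  # True: equal to its string alias
    print(pd.PeriodDtype(pd.offsets.MonthEnd()))  # period[M]

    ser = pd.Series(pd.period_range("2021-01", periods=3, freq="M"))
    print(ser.dtype)             # period[M]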
-@register_extension_dtype
-class IntervalDtype(PandasExtensionDtype):
- """
- An ExtensionDtype for Interval data.
-
- **This is not an actual numpy dtype**, but a duck type.
-
- Parameters
- ----------
- subtype : str, np.dtype
- The dtype of the Interval bounds.
-
- Attributes
- ----------
- subtype
-
- Methods
- -------
- None
-
- Examples
- --------
- >>> pd.IntervalDtype(subtype='int64', closed='both')
- interval[int64, both]
- """
-
- name = "interval"
- kind: str_type = "O"
- str = "|O08"
- base = np.dtype("O")
- num = 103
- _metadata = (
- "subtype",
- "closed",
- )
-
- _match = re.compile(
- r"(I|i)nterval\[(?P<subtype>[^,]+(\[.+\])?)"
- r"(, (?P<closed>(right|left|both|neither)))?\]"
- )
-
- _cache_dtypes: dict[str_type, PandasExtensionDtype] = {}
-
- def __new__(cls, subtype=None, closed: str_type | None = None):
- from pandas.core.dtypes.common import (
- is_string_dtype,
- pandas_dtype,
- )
-
- if closed is not None and closed not in {"right", "left", "both", "neither"}:
- raise ValueError("closed must be one of 'right', 'left', 'both', 'neither'")
-
- if isinstance(subtype, IntervalDtype):
- if closed is not None and closed != subtype.closed:
- raise ValueError(
- "dtype.closed and 'closed' do not match. "
- "Try IntervalDtype(dtype.subtype, closed) instead."
- )
- return subtype
- elif subtype is None:
- # we are called as an empty constructor
- # generally for pickle compat
- u = object.__new__(cls)
- u._subtype = None
- u._closed = closed
- return u
- elif isinstance(subtype, str) and subtype.lower() == "interval":
- subtype = None
- else:
- if isinstance(subtype, str):
- m = cls._match.search(subtype)
- if m is not None:
- gd = m.groupdict()
- subtype = gd["subtype"]
- if gd.get("closed", None) is not None:
- if closed is not None:
- if closed != gd["closed"]:
- raise ValueError(
- "'closed' keyword does not match value "
- "specified in dtype string"
- )
- closed = gd["closed"]
-
- try:
- subtype = pandas_dtype(subtype)
- except TypeError as err:
- raise TypeError("could not construct IntervalDtype") from err
-
- if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype):
- # GH 19016
- msg = (
- "category, object, and string subtypes are not supported "
- "for IntervalDtype"
- )
- raise TypeError(msg)
-
- key = f"{subtype}{closed}"
- try:
- return cls._cache_dtypes[key]
- except KeyError:
- u = object.__new__(cls)
- u._subtype = subtype
- u._closed = closed
- cls._cache_dtypes[key] = u
- return u
-
- @cache_readonly
- def _can_hold_na(self) -> bool:
- subtype = self._subtype
- if subtype is None:
- # partially-initialized
- raise NotImplementedError(
- "_can_hold_na is not defined for partially-initialized IntervalDtype"
- )
- if subtype.kind in ["i", "u"]:
- return False
- return True
-
- @property
- def closed(self):
- return self._closed
-
- @property
- def subtype(self):
- """
- The dtype of the Interval bounds.
- """
- return self._subtype
-
- @classmethod
- def construct_array_type(cls) -> type[IntervalArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas.core.arrays import IntervalArray
-
- return IntervalArray
-
- @classmethod
- def construct_from_string(cls, string: str_type) -> IntervalDtype:
- """
- Attempt to construct this type from a string; raise a TypeError
- if it's not possible.
- """
- if not isinstance(string, str):
- raise TypeError(
- f"'construct_from_string' expects a string, got {type(string)}"
- )
-
- if string.lower() == "interval" or cls._match.search(string) is not None:
- return cls(string)
-
- msg = (
- f"Cannot construct an 'IntervalDtype' from '{string}'.\n\n"
- "Incorrectly formatted string passed to constructor. "
- "Valid formats include Interval or Interval[dtype] "
- "where dtype is numeric, datetime, or timedelta"
- )
- raise TypeError(msg)
-
- @property
- def type(self) -> type[Interval]:
- return Interval
-
- def __str__(self) -> str_type:
- if self.subtype is None:
- return "interval"
- if self.closed is None:
- # Only partially initialized GH#38394
- return f"interval[{self.subtype}]"
- return f"interval[{self.subtype}, {self.closed}]"
-
- def __hash__(self) -> int:
- # make myself hashable
- return hash(str(self))
-
- def __eq__(self, other: Any) -> bool:
- if isinstance(other, str):
- return other.lower() in (self.name.lower(), str(self).lower())
- elif not isinstance(other, IntervalDtype):
- return False
- elif self.subtype is None or other.subtype is None:
- # None should match any subtype
- return True
- elif self.closed != other.closed:
- return False
- else:
- from pandas.core.dtypes.common import is_dtype_equal
-
- return is_dtype_equal(self.subtype, other.subtype)
-
- def __setstate__(self, state) -> None:
- # for pickle compat. __get_state__ is defined in the
- # PandasExtensionDtype superclass and uses the public properties to
- # pickle -> need to set the settable private ones here (see GH26067)
- self._subtype = state["subtype"]
-
- # backward-compat older pickles won't have "closed" key
- self._closed = state.pop("closed", None)
-
- @classmethod
- def is_dtype(cls, dtype: object) -> bool:
- """
- Return a boolean indicating whether the passed type is an actual dtype
- that we can match (via string or type).
- """
- if isinstance(dtype, str):
- if dtype.lower().startswith("interval"):
- try:
- return cls.construct_from_string(dtype) is not None
- except (ValueError, TypeError):
- return False
- else:
- return False
- return super().is_dtype(dtype)
-
- def __from_arrow__(
- self, array: pyarrow.Array | pyarrow.ChunkedArray
- ) -> IntervalArray:
- """
- Construct IntervalArray from pyarrow Array/ChunkedArray.
- """
- import pyarrow
-
- from pandas.core.arrays import IntervalArray
-
- if isinstance(array, pyarrow.Array):
- chunks = [array]
- else:
- chunks = array.chunks
-
- results = []
- for arr in chunks:
- if isinstance(arr, pyarrow.ExtensionArray):
- arr = arr.storage
- left = np.asarray(arr.field("left"), dtype=self.subtype)
- right = np.asarray(arr.field("right"), dtype=self.subtype)
- iarr = IntervalArray.from_arrays(left, right, closed=self.closed)
- results.append(iarr)
-
- if not results:
- return IntervalArray.from_arrays(
- np.array([], dtype=self.subtype),
- np.array([], dtype=self.subtype),
- closed=self.closed,
- )
- return IntervalArray._concat_same_type(results)
-
- def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
- if not all(isinstance(x, IntervalDtype) for x in dtypes):
- return None
-
- closed = cast("IntervalDtype", dtypes[0]).closed
- if not all(cast("IntervalDtype", x).closed == closed for x in dtypes):
- return np.dtype(object)
-
- from pandas.core.dtypes.cast import find_common_type
-
- common = find_common_type([cast("IntervalDtype", x).subtype for x in dtypes])
- if common == object:
- return np.dtype(object)
- return IntervalDtype(common, closed=closed)
-
-
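IntervalDtype rounds out the set; its string form carries both the subtype and the closed side (sketch, assuming stock pandas):

    import pandas as pd

    dtype = pd.IntervalDtype("int64", closed="both")
    print(dtype)                                               # interval[int64, both]
    print(dtype == "interval[int64, both]")                    # True
    print(dtype == pd.IntervalDtype("int64", closed="right"))  # False: closed differs

    idx = pd.interval_range(start=0, end=3)
    print(idx.dtype)                                           # interval[int64, right]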
-class PandasDtype(ExtensionDtype):
- """
- A Pandas ExtensionDtype for NumPy dtypes.
-
- This is mostly for internal compatibility, and is not especially
- useful on its own.
-
- Parameters
- ----------
- dtype : object
- Object to be converted to a NumPy data type object.
-
- See Also
- --------
- numpy.dtype
- """
-
- _metadata = ("_dtype",)
-
- def __init__(self, dtype: npt.DTypeLike | PandasDtype | None) -> None:
- if isinstance(dtype, PandasDtype):
- # make constructor univalent
- dtype = dtype.numpy_dtype
- self._dtype = np.dtype(dtype)
-
- def __repr__(self) -> str:
- return f"PandasDtype({repr(self.name)})"
-
- @property
- def numpy_dtype(self) -> np.dtype:
- """
- The NumPy dtype this PandasDtype wraps.
- """
- return self._dtype
-
- @property
- def name(self) -> str:
- """
- A bit-width name for this data-type.
- """
- return self._dtype.name
-
- @property
- def type(self) -> type[np.generic]:
- """
- The type object used to instantiate a scalar of this NumPy data-type.
- """
- return self._dtype.type
-
- @property
- def _is_numeric(self) -> bool:
- # exclude object, str, unicode, void.
- return self.kind in set("biufc")
-
- @property
- def _is_boolean(self) -> bool:
- return self.kind == "b"
-
- @classmethod
- def construct_from_string(cls, string: str) -> PandasDtype:
- try:
- dtype = np.dtype(string)
- except TypeError as err:
- if not isinstance(string, str):
- msg = f"'construct_from_string' expects a string, got {type(string)}"
- else:
- msg = f"Cannot construct a 'PandasDtype' from '{string}'"
- raise TypeError(msg) from err
- return cls(dtype)
-
- @classmethod
- def construct_array_type(cls) -> type_t[PandasArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- from pandas.core.arrays import PandasArray
-
- return PandasArray
-
- @property
- def kind(self) -> str:
- """
- A character code (one of 'biufcmMOSUV') identifying the general kind of data.
- """
- return self._dtype.kind
-
- @property
- def itemsize(self) -> int:
- """
- The element size of this data-type object.
- """
- return self._dtype.itemsize
-
-
-class BaseMaskedDtype(ExtensionDtype):
- """
- Base class for dtypes for BaseMaskedArray subclasses.
- """
-
- name: str
- base = None
- type: type
-
- @property
- def na_value(self) -> libmissing.NAType:
- return libmissing.NA
-
- @cache_readonly
- def numpy_dtype(self) -> np.dtype:
- """Return an instance of our numpy dtype"""
- return np.dtype(self.type)
-
- @cache_readonly
- def kind(self) -> str:
- return self.numpy_dtype.kind
-
- @cache_readonly
- def itemsize(self) -> int:
- """Return the number of bytes in this dtype"""
- return self.numpy_dtype.itemsize
-
- @classmethod
- def construct_array_type(cls) -> type_t[BaseMaskedArray]:
- """
- Return the array type associated with this dtype.
-
- Returns
- -------
- type
- """
- raise NotImplementedError
-
- @classmethod
- def from_numpy_dtype(cls, dtype: np.dtype) -> BaseMaskedDtype:
- """
- Construct the MaskedDtype corresponding to the given numpy dtype.
- """
- if dtype.kind == "b":
- from pandas.core.arrays.boolean import BooleanDtype
-
- return BooleanDtype()
- elif dtype.kind in ["i", "u"]:
- from pandas.core.arrays.integer import INT_STR_TO_DTYPE
-
- return INT_STR_TO_DTYPE[dtype.name]
- elif dtype.kind == "f":
- from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
-
- return FLOAT_STR_TO_DTYPE[dtype.name]
- else:
- raise NotImplementedError(dtype)
-
- def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
- # We unwrap any masked dtypes, find the common dtype we would use
- # for that, then re-mask the result.
- from pandas.core.dtypes.cast import find_common_type
-
- new_dtype = find_common_type(
- [
- dtype.numpy_dtype if isinstance(dtype, BaseMaskedDtype) else dtype
- for dtype in dtypes
- ]
- )
- if not isinstance(new_dtype, np.dtype):
- # If we ever support e.g. Masked[DatetimeArray] then this will change
- return None
- try:
- return type(self).from_numpy_dtype(new_dtype)
- except (KeyError, NotImplementedError):
- return None
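The BaseMaskedDtype subclasses defined elsewhere (BooleanDtype and the Int*/UInt*/Float* dtypes) rely on the numpy_dtype / na_value plumbing shown above; a short user-side sketch, assuming a regular pandas installation:

    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    print(arr.dtype)                           # Int64
    print(arr.dtype.numpy_dtype)               # int64
    print(arr.dtype.na_value)                  # <NA>
    print(arr.dtype.kind, arr.dtype.itemsize)  # i 8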
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/generic.py b/contrib/python/pandas/py3/pandas/core/dtypes/generic.py
deleted file mode 100644
index 5904ba4895a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/generic.py
+++ /dev/null
@@ -1,147 +0,0 @@
-""" define generic base classes for pandas objects """
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Type,
- cast,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- Categorical,
- CategoricalIndex,
- DataFrame,
- DatetimeIndex,
- Index,
- IntervalIndex,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- Series,
- TimedeltaIndex,
- )
- from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
- PandasArray,
- PeriodArray,
- TimedeltaArray,
- )
- from pandas.core.generic import NDFrame
-
-
-# define abstract base classes to enable isinstance type checking on our
-# objects
-def create_pandas_abc_type(name, attr, comp):
- def _check(inst) -> bool:
- return getattr(inst, attr, "_typ") in comp
-
- # https://github.com/python/mypy/issues/1006
- # error: 'classmethod' used with a non-method
- @classmethod # type: ignore[misc]
- def _instancecheck(cls, inst) -> bool:
- return _check(inst) and not isinstance(inst, type)
-
- @classmethod # type: ignore[misc]
- def _subclasscheck(cls, inst) -> bool:
- # Raise instead of returning False
- # This is consistent with default __subclasscheck__ behavior
- if not isinstance(inst, type):
- raise TypeError("issubclass() arg 1 must be a class")
-
- return _check(inst)
-
- dct = {"__instancecheck__": _instancecheck, "__subclasscheck__": _subclasscheck}
- meta = type("ABCBase", (type,), dct)
- return meta(name, (), dct)
-
-
-ABCRangeIndex = cast(
- "Type[RangeIndex]",
- create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)),
-)
-ABCMultiIndex = cast(
- "Type[MultiIndex]",
- create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)),
-)
-ABCDatetimeIndex = cast(
- "Type[DatetimeIndex]",
- create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)),
-)
-ABCTimedeltaIndex = cast(
- "Type[TimedeltaIndex]",
- create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)),
-)
-ABCPeriodIndex = cast(
- "Type[PeriodIndex]",
- create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)),
-)
-ABCCategoricalIndex = cast(
- "Type[CategoricalIndex]",
- create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)),
-)
-ABCIntervalIndex = cast(
- "Type[IntervalIndex]",
- create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)),
-)
-ABCIndex = cast(
- "Type[Index]",
- create_pandas_abc_type(
- "ABCIndex",
- "_typ",
- {
- "index",
- "rangeindex",
- "multiindex",
- "datetimeindex",
- "timedeltaindex",
- "periodindex",
- "categoricalindex",
- "intervalindex",
- },
- ),
-)
-
-
-ABCNDFrame = cast(
- "Type[NDFrame]",
- create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")),
-)
-ABCSeries = cast(
- "Type[Series]",
- create_pandas_abc_type("ABCSeries", "_typ", ("series",)),
-)
-ABCDataFrame = cast(
- "Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",))
-)
-
-ABCCategorical = cast(
- "Type[Categorical]",
- create_pandas_abc_type("ABCCategorical", "_typ", ("categorical",)),
-)
-ABCDatetimeArray = cast(
- "Type[DatetimeArray]",
- create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray",)),
-)
-ABCTimedeltaArray = cast(
- "Type[TimedeltaArray]",
- create_pandas_abc_type("ABCTimedeltaArray", "_typ", ("timedeltaarray",)),
-)
-ABCPeriodArray = cast(
- "Type[PeriodArray]",
- create_pandas_abc_type("ABCPeriodArray", "_typ", ("periodarray",)),
-)
-ABCExtensionArray = cast(
- "Type[ExtensionArray]",
- create_pandas_abc_type(
- "ABCExtensionArray",
- "_typ",
- # Note: IntervalArray and SparseArray are included bc they have _typ="extension"
- {"extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"},
- ),
-)
-ABCPandasArray = cast(
- "Type[PandasArray]",
- create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)),
-)
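The ABC* objects removed above perform isinstance checks through the private `_typ` attribute rather than real inheritance. The sketch below imports them from the internal `pandas.core.dtypes.generic` module (not a stable public API) purely to illustrate that duck-typed behaviour:

    import pandas as pd
    from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCSeries

    s = pd.Series([1, 2, 3])
    df = pd.DataFrame({"a": [1]})

    # isinstance dispatches on the object's _typ attribute, not its MRO.
    print(isinstance(s, ABCSeries))       # True
    print(isinstance(df, ABCDataFrame))   # True
    print(isinstance(s.index, ABCIndex))  # True

    # issubclass with a non-class raises, mirroring _subclasscheck above.
    try:
        issubclass(s, ABCSeries)
    except TypeError as err:
        print(err)                        # issubclass() arg 1 must be a class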
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/inference.py b/contrib/python/pandas/py3/pandas/core/dtypes/inference.py
deleted file mode 100644
index 28e034de869..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/inference.py
+++ /dev/null
@@ -1,431 +0,0 @@
-""" basic inference routines """
-
-from __future__ import annotations
-
-from collections import abc
-from numbers import Number
-import re
-from typing import Pattern
-
-import numpy as np
-
-from pandas._libs import lib
-
-is_bool = lib.is_bool
-
-is_integer = lib.is_integer
-
-is_float = lib.is_float
-
-is_complex = lib.is_complex
-
-is_scalar = lib.is_scalar
-
-is_decimal = lib.is_decimal
-
-is_interval = lib.is_interval
-
-is_list_like = lib.is_list_like
-
-is_iterator = lib.is_iterator
-
-
-def is_number(obj) -> bool:
- """
- Check if the object is a number.
-
- Returns True when the object is a number, and False if is not.
-
- Parameters
- ----------
- obj : any type
- The object to check if is a number.
-
- Returns
- -------
- bool
- Whether `obj` is a number or not.
-
- See Also
- --------
- api.types.is_integer: Checks a subgroup of numbers.
-
- Examples
- --------
- >>> from pandas.api.types import is_number
- >>> is_number(1)
- True
- >>> is_number(7.15)
- True
-
- Booleans are valid because they are int subclass.
-
- >>> is_number(False)
- True
-
- >>> is_number("foo")
- False
- >>> is_number("5")
- False
- """
- return isinstance(obj, (Number, np.number))
-
-
-def iterable_not_string(obj) -> bool:
- """
- Check if the object is an iterable but not a string.
-
- Parameters
- ----------
- obj : The object to check.
-
- Returns
- -------
- is_iter_not_string : bool
- Whether `obj` is a non-string iterable.
-
- Examples
- --------
- >>> iterable_not_string([1, 2, 3])
- True
- >>> iterable_not_string("foo")
- False
- >>> iterable_not_string(1)
- False
- """
- return isinstance(obj, abc.Iterable) and not isinstance(obj, str)
-
-
-def is_file_like(obj) -> bool:
- """
- Check if the object is a file-like object.
-
- For objects to be considered file-like, they must
- be an iterator AND have a `read` and/or `write`
- method as an attribute.
-
- Note: file-like objects must be iterable, but
- iterable objects need not be file-like.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- bool
- Whether `obj` has file-like properties.
-
- Examples
- --------
- >>> import io
- >>> from pandas.api.types import is_file_like
- >>> buffer = io.StringIO("data")
- >>> is_file_like(buffer)
- True
- >>> is_file_like([1, 2, 3])
- False
- """
- if not (hasattr(obj, "read") or hasattr(obj, "write")):
- return False
-
- return bool(hasattr(obj, "__iter__"))
-
-
-def is_re(obj) -> bool:
- """
- Check if the object is a regex pattern instance.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- bool
- Whether `obj` is a regex pattern.
-
- Examples
- --------
- >>> from pandas.api.types import is_re
- >>> import re
- >>> is_re(re.compile(".*"))
- True
- >>> is_re("foo")
- False
- """
- return isinstance(obj, Pattern)
-
-
-def is_re_compilable(obj) -> bool:
- """
- Check if the object can be compiled into a regex pattern instance.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- bool
- Whether `obj` can be compiled as a regex pattern.
-
- Examples
- --------
- >>> from pandas.api.types import is_re_compilable
- >>> is_re_compilable(".*")
- True
- >>> is_re_compilable(1)
- False
- """
- try:
- re.compile(obj)
- except TypeError:
- return False
- else:
- return True
-
-
-def is_array_like(obj) -> bool:
- """
- Check if the object is array-like.
-
- For an object to be considered array-like, it must be list-like and
- have a `dtype` attribute.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- is_array_like : bool
- Whether `obj` has array-like properties.
-
- Examples
- --------
- >>> is_array_like(np.array([1, 2, 3]))
- True
- >>> is_array_like(pd.Series(["a", "b"]))
- True
- >>> is_array_like(pd.Index(["2016-01-01"]))
- True
- >>> is_array_like([1, 2, 3])
- False
- >>> is_array_like(("a", "b"))
- False
- """
- return is_list_like(obj) and hasattr(obj, "dtype")
-
-
-def is_nested_list_like(obj) -> bool:
- """
- Check if the object is list-like, and that all of its elements
- are also list-like.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- is_list_like : bool
- Whether `obj` has list-like properties.
-
- Examples
- --------
- >>> is_nested_list_like([[1, 2, 3]])
- True
- >>> is_nested_list_like([{1, 2, 3}, {1, 2, 3}])
- True
- >>> is_nested_list_like(["foo"])
- False
- >>> is_nested_list_like([])
- False
- >>> is_nested_list_like([[1, 2, 3], 1])
- False
-
- Notes
- -----
- This won't reliably detect whether a consumable iterator (e.g.
- a generator) is a nested-list-like without consuming the iterator.
- To avoid consuming it, we always return False if the outer container
- doesn't define `__len__`.
-
- See Also
- --------
- is_list_like
- """
- return (
- is_list_like(obj)
- and hasattr(obj, "__len__")
- and len(obj) > 0
- and all(is_list_like(item) for item in obj)
- )
-
-
-def is_dict_like(obj) -> bool:
- """
- Check if the object is dict-like.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- bool
- Whether `obj` has dict-like properties.
-
- Examples
- --------
- >>> from pandas.api.types import is_dict_like
- >>> is_dict_like({1: 2})
- True
- >>> is_dict_like([1, 2, 3])
- False
- >>> is_dict_like(dict)
- False
- >>> is_dict_like(dict())
- True
- """
- dict_like_attrs = ("__getitem__", "keys", "__contains__")
- return (
- all(hasattr(obj, attr) for attr in dict_like_attrs)
- # [GH 25196] exclude classes
- and not isinstance(obj, type)
- )
-
-
-def is_named_tuple(obj) -> bool:
- """
- Check if the object is a named tuple.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- bool
- Whether `obj` is a named tuple.
-
- Examples
- --------
- >>> from collections import namedtuple
- >>> from pandas.api.types import is_named_tuple
- >>> Point = namedtuple("Point", ["x", "y"])
- >>> p = Point(1, 2)
- >>>
- >>> is_named_tuple(p)
- True
- >>> is_named_tuple((1, 2))
- False
- """
- return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields")
-
-
-def is_hashable(obj) -> bool:
- """
- Return True if hash(obj) will succeed, False otherwise.
-
- Some types will pass a test against collections.abc.Hashable but fail when
- they are actually hashed with hash().
-
- Distinguish between these and other types by trying the call to hash() and
- seeing if they raise TypeError.
-
- Returns
- -------
- bool
-
- Examples
- --------
- >>> import collections
- >>> from pandas.api.types import is_hashable
- >>> a = ([],)
- >>> isinstance(a, collections.abc.Hashable)
- True
- >>> is_hashable(a)
- False
- """
- # Unfortunately, we can't use isinstance(obj, collections.abc.Hashable),
- # which can be faster than calling hash. That is because numpy scalars
- # fail this test.
-
- # Reconsider this decision once this numpy bug is fixed:
- # https://github.com/numpy/numpy/issues/5562
-
- try:
- hash(obj)
- except TypeError:
- return False
- else:
- return True
-
-
-def is_sequence(obj) -> bool:
- """
- Check if the object is a sequence of objects.
- String types are not included as sequences here.
-
- Parameters
- ----------
- obj : The object to check
-
- Returns
- -------
- is_sequence : bool
- Whether `obj` is a sequence of objects.
-
- Examples
- --------
- >>> l = [1, 2, 3]
- >>>
- >>> is_sequence(l)
- True
- >>> is_sequence(iter(l))
- False
- """
- try:
- iter(obj) # Can iterate over it.
- len(obj) # Has a length associated with it.
- return not isinstance(obj, (str, bytes))
- except (TypeError, AttributeError):
- return False
-
-
-def is_dataclass(item):
- """
- Checks if the object is a data-class instance
-
- Parameters
- ----------
- item : object
-
- Returns
- Returns
- -------
- is_dataclass : bool
- True if the item is an instance of a data-class;
- returns False if you pass the data-class itself
- Examples
- --------
- >>> from dataclasses import dataclass
- >>> @dataclass
- ... class Point:
- ... x: int
- ... y: int
-
- >>> is_dataclass(Point)
- False
- >>> is_dataclass(Point(0,2))
- True
-
- """
- try:
- import dataclasses
-
- return dataclasses.is_dataclass(item) and not isinstance(item, type)
- except ImportError:
- return False
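These inference helpers are re-exported under `pandas.api.types`; a few representative checks, consistent with the doctests in the removed module (assuming a standard pandas install):

    import io
    from pandas.api.types import (
        is_dict_like,
        is_file_like,
        is_hashable,
        is_list_like,
        is_number,
        is_re_compilable,
    )

    print(is_number(7.15), is_number("5"))              # True False
    print(is_list_like([1, 2]), is_list_like("abc"))    # True False
    print(is_file_like(io.StringIO("data")))            # True
    print(is_dict_like({1: 2}), is_dict_like(dict))     # True False
    print(is_hashable(([],)))                           # False: hash() fails on the nested list
    print(is_re_compilable(".*"), is_re_compilable(1))  # True False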
diff --git a/contrib/python/pandas/py3/pandas/core/dtypes/missing.py b/contrib/python/pandas/py3/pandas/core/dtypes/missing.py
deleted file mode 100644
index 99c0553998d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/dtypes/missing.py
+++ /dev/null
@@ -1,761 +0,0 @@
-"""
-missing types & inference
-"""
-from __future__ import annotations
-
-from decimal import Decimal
-from functools import partial
-from typing import (
- TYPE_CHECKING,
- overload,
-)
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import lib
-import pandas._libs.missing as libmissing
-from pandas._libs.tslibs import (
- NaT,
- iNaT,
-)
-
-from pandas.core.dtypes.common import (
- DT64NS_DTYPE,
- TD64NS_DTYPE,
- ensure_object,
- is_bool_dtype,
- is_categorical_dtype,
- is_complex_dtype,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_object_dtype,
- is_scalar,
- is_string_or_object_np_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- DatetimeTZDtype,
- ExtensionDtype,
- IntervalDtype,
- PeriodDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCExtensionArray,
- ABCIndex,
- ABCMultiIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.inference import is_list_like
-
-if TYPE_CHECKING:
- from pandas._typing import (
- ArrayLike,
- DtypeObj,
- NDFrame,
- NDFrameT,
- Scalar,
- npt,
- )
-
- from pandas.core.indexes.base import Index
-
-
-isposinf_scalar = libmissing.isposinf_scalar
-isneginf_scalar = libmissing.isneginf_scalar
-
-nan_checker = np.isnan
-INF_AS_NA = False
-_dtype_object = np.dtype("object")
-_dtype_str = np.dtype(str)
-
-
-@overload
-def isna(obj: Scalar) -> bool:
- ...
-
-
-@overload
-def isna(
- obj: ArrayLike | Index | list,
-) -> npt.NDArray[np.bool_]:
- ...
-
-
-@overload
-def isna(obj: NDFrameT) -> NDFrameT:
- ...
-
-
-# handle unions
-@overload
-def isna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
- ...
-
-
-@overload
-def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
- ...
-
-
-def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
- """
- Detect missing values for an array-like object.
-
- This function takes a scalar or array-like object and indicates
- whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``
- in object arrays, ``NaT`` in datetimelike).
-
- Parameters
- ----------
- obj : scalar or array-like
- Object to check for null or missing values.
-
- Returns
- -------
- bool or array-like of bool
- For scalar input, returns a scalar boolean.
- For array input, returns an array of boolean indicating whether each
- corresponding element is missing.
-
- See Also
- --------
- notna : Boolean inverse of pandas.isna.
- Series.isna : Detect missing values in a Series.
- DataFrame.isna : Detect missing values in a DataFrame.
- Index.isna : Detect missing values in an Index.
-
- Examples
- --------
- Scalar arguments (including strings) result in a scalar boolean.
-
- >>> pd.isna('dog')
- False
-
- >>> pd.isna(pd.NA)
- True
-
- >>> pd.isna(np.nan)
- True
-
- ndarrays result in an ndarray of booleans.
-
- >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
- >>> array
- array([[ 1., nan, 3.],
- [ 4., 5., nan]])
- >>> pd.isna(array)
- array([[False, True, False],
- [False, False, True]])
-
- For indexes, an ndarray of booleans is returned.
-
- >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
- ... "2017-07-08"])
- >>> index
- DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
- dtype='datetime64[ns]', freq=None)
- >>> pd.isna(index)
- array([False, False, True, False])
-
- For Series and DataFrame, the same type is returned, containing booleans.
-
- >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
- >>> df
- 0 1 2
- 0 ant bee cat
- 1 dog None fly
- >>> pd.isna(df)
- 0 1 2
- 0 False False False
- 1 False True False
-
- >>> pd.isna(df[1])
- 0 False
- 1 True
- Name: 1, dtype: bool
- """
- return _isna(obj)
-
-
-isnull = isna
-
-
-def _isna(obj, inf_as_na: bool = False):
- """
- Detect missing values, treating None, NaN or NA as null. Infinite
- values will also be treated as null if inf_as_na is True.
-
- Parameters
- ----------
- obj: ndarray or object value
- Input array or scalar value.
- inf_as_na: bool
- Whether to treat infinity as null.
-
- Returns
- -------
- boolean ndarray or boolean
- """
- if is_scalar(obj):
- return libmissing.checknull(obj, inf_as_na=inf_as_na)
- elif isinstance(obj, ABCMultiIndex):
- raise NotImplementedError("isna is not defined for MultiIndex")
- elif isinstance(obj, type):
- return False
- elif isinstance(obj, (np.ndarray, ABCExtensionArray)):
- return _isna_array(obj, inf_as_na=inf_as_na)
- elif isinstance(obj, ABCIndex):
- # Try to use cached isna, which also short-circuits for integer dtypes
- # and avoids materializing RangeIndex._values
- if not obj._can_hold_na:
- return obj.isna()
- return _isna_array(obj._values, inf_as_na=inf_as_na)
-
- elif isinstance(obj, ABCSeries):
- result = _isna_array(obj._values, inf_as_na=inf_as_na)
- # box
- result = obj._constructor(result, index=obj.index, name=obj.name, copy=False)
- return result
- elif isinstance(obj, ABCDataFrame):
- return obj.isna()
- elif isinstance(obj, list):
- return _isna_array(np.asarray(obj, dtype=object), inf_as_na=inf_as_na)
- elif hasattr(obj, "__array__"):
- return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)
- else:
- return False
-
-
-def _use_inf_as_na(key) -> None:
- """
- Option change callback for na/inf behaviour.
-
- Choose which replacement for numpy.isnan / -numpy.isfinite is used.
-
- Parameters
- ----------
- key : str
- The option key whose boolean value is read: True means treat None,
- NaN, INF, -INF as null (old way); False means None and NaN are null,
- but INF, -INF are not null (new way).
-
- Notes
- -----
- This approach to setting global module values is discussed and
- approved here:
-
- * https://stackoverflow.com/questions/4859217/
- programmatically-creating-variables-in-python/4859312#4859312
- """
- inf_as_na = get_option(key)
- globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na)
- if inf_as_na:
- globals()["nan_checker"] = lambda x: ~np.isfinite(x)
- globals()["INF_AS_NA"] = True
- else:
- globals()["nan_checker"] = np.isnan
- globals()["INF_AS_NA"] = False
-
-
-def _isna_array(values: ArrayLike, inf_as_na: bool = False):
- """
- Return an array indicating which values of the input array are NaN / NA.
-
- Parameters
- ----------
- values : ndarray or ExtensionArray
- The input array whose elements are to be checked.
- inf_as_na: bool
- Whether or not to treat infinite values as NA.
-
- Returns
- -------
- array-like
- Array of boolean values denoting the NA status of each element.
- """
- dtype = values.dtype
-
- if not isinstance(values, np.ndarray):
- # i.e. ExtensionArray
- if inf_as_na and is_categorical_dtype(dtype):
- result = libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na)
- else:
- # error: Incompatible types in assignment (expression has type
- # "Union[ndarray[Any, Any], ExtensionArraySupportsAnyAll]", variable has
- # type "ndarray[Any, dtype[bool_]]")
- result = values.isna() # type: ignore[assignment]
- elif is_string_or_object_np_dtype(values.dtype):
- result = _isna_string_dtype(values, inf_as_na=inf_as_na)
- elif needs_i8_conversion(dtype):
- # this is the NaT pattern
- result = values.view("i8") == iNaT
- else:
- if inf_as_na:
- result = ~np.isfinite(values)
- else:
- result = np.isnan(values)
-
- return result
-
-
-def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]:
- # Working around NumPy ticket 1542
- dtype = values.dtype
-
- if dtype.kind in ("S", "U"):
- result = np.zeros(values.shape, dtype=bool)
- else:
- if values.ndim in {1, 2}:
- result = libmissing.isnaobj(values, inf_as_na=inf_as_na)
- else:
- # 0-D, reached via e.g. mask_missing
- result = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na)
- result = result.reshape(values.shape)
-
- return result
-
-
-@overload
-def notna(obj: Scalar) -> bool:
- ...
-
-
-@overload
-def notna(
- obj: ArrayLike | Index | list,
-) -> npt.NDArray[np.bool_]:
- ...
-
-
-@overload
-def notna(obj: NDFrameT) -> NDFrameT:
- ...
-
-
-# handle unions
-@overload
-def notna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
- ...
-
-
-@overload
-def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
- ...
-
-
-def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
- """
- Detect non-missing values for an array-like object.
-
- This function takes a scalar or array-like object and indicates
- whether values are valid (not missing, which is ``NaN`` in numeric
- arrays, ``None`` or ``NaN`` in object arrays, ``NaT`` in datetimelike).
-
- Parameters
- ----------
- obj : array-like or object value
- Object to check for *not* null or *non*-missing values.
-
- Returns
- -------
- bool or array-like of bool
- For scalar input, returns a scalar boolean.
- For array input, returns an array of boolean indicating whether each
- corresponding element is valid.
-
- See Also
- --------
- isna : Boolean inverse of pandas.notna.
- Series.notna : Detect valid values in a Series.
- DataFrame.notna : Detect valid values in a DataFrame.
- Index.notna : Detect valid values in an Index.
-
- Examples
- --------
- Scalar arguments (including strings) result in a scalar boolean.
-
- >>> pd.notna('dog')
- True
-
- >>> pd.notna(pd.NA)
- False
-
- >>> pd.notna(np.nan)
- False
-
- ndarrays result in an ndarray of booleans.
-
- >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
- >>> array
- array([[ 1., nan, 3.],
- [ 4., 5., nan]])
- >>> pd.notna(array)
- array([[ True, False, True],
- [ True, True, False]])
-
- For indexes, an ndarray of booleans is returned.
-
- >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
- ... "2017-07-08"])
- >>> index
- DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
- dtype='datetime64[ns]', freq=None)
- >>> pd.notna(index)
- array([ True, True, False, True])
-
- For Series and DataFrame, the same type is returned, containing booleans.
-
- >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
- >>> df
- 0 1 2
- 0 ant bee cat
- 1 dog None fly
- >>> pd.notna(df)
- 0 1 2
- 0 True True True
- 1 True False True
-
- >>> pd.notna(df[1])
- 0 True
- 1 False
- Name: 1, dtype: bool
- """
- res = isna(obj)
- if isinstance(res, bool):
- return not res
- return ~res
-
-
-notnull = notna
-
-
-def isna_compat(arr, fill_value=np.nan) -> bool:
- """
- Parameters
- ----------
- arr: a numpy array
- fill_value: fill value, defaults to np.nan
-
- Returns
- -------
- True if we can fill using this fill_value
- """
- if isna(fill_value):
- dtype = arr.dtype
- return not (is_bool_dtype(dtype) or is_integer_dtype(dtype))
- return True
-
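# Sketch of what isna_compat reports, assuming the private module path
# pandas.core.dtypes.missing is importable in this build.
import numpy as np
from pandas.core.dtypes.missing import isna_compat

print(isna_compat(np.array([1, 2, 3])))             # int64 cannot hold NaN -> False
print(isna_compat(np.array([1.0, 2.0])))            # float64 can -> True
print(isna_compat(np.array([1, 2]), fill_value=0))  # non-NA fill value -> True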
-
-def array_equivalent(
- left,
- right,
- strict_nan: bool = False,
- dtype_equal: bool = False,
-) -> bool:
- """
- True if two arrays, left and right, have equal non-NaN elements, and NaNs
- in corresponding locations. False otherwise. It is assumed that left and
- right are NumPy arrays of the same dtype. The behavior of this function
- (particularly with respect to NaNs) is not defined if the dtypes are
- different.
-
- Parameters
- ----------
- left, right : ndarrays
- strict_nan : bool, default False
- If True, consider NaN and None to be different.
- dtype_equal : bool, default False
- Whether `left` and `right` are known to have the same dtype
- according to `is_dtype_equal`. Some methods, like `BlockManager.equals`,
- require that the dtypes match. Setting this to ``True`` can improve
- performance, but will give different results for arrays that are
- equal but have different dtypes.
-
- Returns
- -------
- b : bool
- Returns True if the arrays are equivalent.
-
- Examples
- --------
- >>> array_equivalent(
- ... np.array([1, 2, np.nan]),
- ... np.array([1, 2, np.nan]))
- True
- >>> array_equivalent(
- ... np.array([1, np.nan, 2]),
- ... np.array([1, 2, np.nan]))
- False
- """
- left, right = np.asarray(left), np.asarray(right)
-
- # shape compat
- if left.shape != right.shape:
- return False
-
- if dtype_equal:
- # fastpath when we require that the dtypes match (Block.equals)
- if left.dtype.kind in ["f", "c"]:
- return _array_equivalent_float(left, right)
- elif needs_i8_conversion(left.dtype):
- return _array_equivalent_datetimelike(left, right)
- elif is_string_or_object_np_dtype(left.dtype):
- # TODO: fastpath for pandas' StringDtype
- return _array_equivalent_object(left, right, strict_nan)
- else:
- return np.array_equal(left, right)
-
- # Slow path when we allow comparing different dtypes.
- # Object arrays can contain None, NaN and NaT.
- # string dtypes must come through this path for NumPy 1.7.1 compat
- if left.dtype.kind in "OSU" or right.dtype.kind in "OSU":
- # Note: `in "OSU"` is non-trivially faster than `in ["O", "S", "U"]`
- # or `in ("O", "S", "U")`
- return _array_equivalent_object(left, right, strict_nan)
-
- # NaNs can occur in float and complex arrays.
- if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
- if not (left.size and right.size):
- return True
- return ((left == right) | (isna(left) & isna(right))).all()
-
- elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
- # datetime64, timedelta64, Period
- if not is_dtype_equal(left.dtype, right.dtype):
- return False
-
- left = left.view("i8")
- right = right.view("i8")
-
- # if we have structured dtypes, compare first
- if (
- left.dtype.type is np.void or right.dtype.type is np.void
- ) and left.dtype != right.dtype:
- return False
-
- return np.array_equal(left, right)
-
-
-def _array_equivalent_float(left, right) -> bool:
- return bool(((left == right) | (np.isnan(left) & np.isnan(right))).all())
-
-
-def _array_equivalent_datetimelike(left, right):
- return np.array_equal(left.view("i8"), right.view("i8"))
-
-
-def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool):
- if not strict_nan:
- # isna considers NaN and None to be equivalent.
-
- return lib.array_equivalent_object(ensure_object(left), ensure_object(right))
-
- for left_value, right_value in zip(left, right):
- if left_value is NaT and right_value is not NaT:
- return False
-
- elif left_value is libmissing.NA and right_value is not libmissing.NA:
- return False
-
- elif isinstance(left_value, float) and np.isnan(left_value):
- if not isinstance(right_value, float) or not np.isnan(right_value):
- return False
- else:
- try:
- if np.any(np.asarray(left_value != right_value)):
- return False
- except TypeError as err:
- if "boolean value of NA is ambiguous" in str(err):
- return False
- raise
- except ValueError:
- # numpy can raise a ValueError if left and right cannot be
- # compared (e.g. nested arrays)
- return False
- return True
-
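# Sketch of the strict_nan branch above: by default NaN and None compare as
# equivalent, while strict_nan=True distinguishes them. Assumes the private
# module path pandas.core.dtypes.missing is importable.
import numpy as np
from pandas.core.dtypes.missing import array_equivalent

left = np.array([1.0, np.nan], dtype=object)
right = np.array([1.0, None], dtype=object)
print(array_equivalent(left, right))                   # True
print(array_equivalent(left, right, strict_nan=True))  # False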
-
-def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
- """
- ExtensionArray-compatible implementation of array_equivalent.
- """
- if not is_dtype_equal(left.dtype, right.dtype):
- return False
- elif isinstance(left, ABCExtensionArray):
- return left.equals(right)
- else:
- return array_equivalent(left, right, dtype_equal=True)
-
-
-def infer_fill_value(val):
- """
- Infer the fill value for the nan/NaT from the provided
- scalar/ndarray/list-like. If we have a NaT, return the correctly
- dtyped element to provide proper block construction.
- """
- if not is_list_like(val):
- val = [val]
- val = np.array(val, copy=False)
- if needs_i8_conversion(val.dtype):
- return np.array("NaT", dtype=val.dtype)
- elif is_object_dtype(val.dtype):
- dtype = lib.infer_dtype(ensure_object(val), skipna=False)
- if dtype in ["datetime", "datetime64"]:
- return np.array("NaT", dtype=DT64NS_DTYPE)
- elif dtype in ["timedelta", "timedelta64"]:
- return np.array("NaT", dtype=TD64NS_DTYPE)
- return np.nan
-
-
-def maybe_fill(arr: np.ndarray) -> np.ndarray:
- """
- Fill numpy.ndarray with NaN, unless we have an integer or boolean dtype.
- """
- if arr.dtype.kind not in ("u", "i", "b"):
- arr.fill(np.nan)
- return arr
-
-
-def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
- """
- Return a dtype compat na value
-
- Parameters
- ----------
- dtype : string / dtype
- compat : bool, default True
-
- Returns
- -------
- np.dtype or a pandas dtype
-
- Examples
- --------
- >>> na_value_for_dtype(np.dtype('int64'))
- 0
- >>> na_value_for_dtype(np.dtype('int64'), compat=False)
- nan
- >>> na_value_for_dtype(np.dtype('float64'))
- nan
- >>> na_value_for_dtype(np.dtype('bool'))
- False
- >>> na_value_for_dtype(np.dtype('datetime64[ns]'))
- numpy.datetime64('NaT')
- """
-
- if isinstance(dtype, ExtensionDtype):
- return dtype.na_value
- elif needs_i8_conversion(dtype):
- return dtype.type("NaT", "ns")
- elif is_float_dtype(dtype):
- return np.nan
- elif is_integer_dtype(dtype):
- if compat:
- return 0
- return np.nan
- elif is_bool_dtype(dtype):
- if compat:
- return False
- return np.nan
- return np.nan
-
-
-def remove_na_arraylike(arr):
- """
- Return array-like containing only true/non-NaN values, possibly empty.
- """
- if is_extension_array_dtype(arr):
- return arr[notna(arr)]
- else:
- return arr[notna(np.asarray(arr))]
-
-
-def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
- """
- isna check that excludes incompatible dtypes
-
- Parameters
- ----------
- obj : object
- dtype : np.datetime64, np.timedelta64, DatetimeTZDtype, or PeriodDtype
-
- Returns
- -------
- bool
- """
- if not lib.is_scalar(obj) or not isna(obj):
- return False
- elif dtype.kind == "M":
- if isinstance(dtype, np.dtype):
- # i.e. not tzaware
- return not isinstance(obj, (np.timedelta64, Decimal))
- # we have to rule out tznaive dt64("NaT")
- return not isinstance(obj, (np.timedelta64, np.datetime64, Decimal))
- elif dtype.kind == "m":
- return not isinstance(obj, (np.datetime64, Decimal))
- elif dtype.kind in ["i", "u", "f", "c"]:
- # Numeric
- return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64))
- elif dtype.kind == "b":
- # We allow pd.NA, None, np.nan in BooleanArray (same as IntervalDtype)
- return lib.is_float(obj) or obj is None or obj is libmissing.NA
-
- elif dtype == _dtype_str:
- # numpy string dtypes to avoid float np.nan
- return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float))
-
- elif dtype == _dtype_object:
- # This is needed for Categorical, but is kind of weird
- return True
-
- elif isinstance(dtype, PeriodDtype):
- return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
-
- elif isinstance(dtype, IntervalDtype):
- return lib.is_float(obj) or obj is None or obj is libmissing.NA
-
- elif isinstance(dtype, CategoricalDtype):
- return is_valid_na_for_dtype(obj, dtype.categories.dtype)
-
- # fallback, default to allowing NaN, None, NA, NaT
- return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal))
-
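# Sketch of the dtype-aware NA check above, again assuming
# pandas.core.dtypes.missing is importable.
import numpy as np
from pandas.core.dtypes.missing import is_valid_na_for_dtype

dt64 = np.dtype("datetime64[ns]")
print(is_valid_na_for_dtype(np.datetime64("NaT"), dt64))   # True
print(is_valid_na_for_dtype(np.timedelta64("NaT"), dt64))  # False: a timedelta NaT is rejected
print(is_valid_na_for_dtype(np.nan, np.dtype("float64")))  # True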
-
-def isna_all(arr: ArrayLike) -> bool:
- """
- Optimized equivalent to isna(arr).all()
- """
- total_len = len(arr)
-
- # Usually it's enough to check only a small fraction of values to see if
- # a block is NOT null; chunking should help in such cases.
- # parameters 1000 and 40 were chosen arbitrarily
- chunk_len = max(total_len // 40, 1000)
-
- dtype = arr.dtype
- if dtype.kind == "f" and isinstance(dtype, np.dtype):
- checker = nan_checker
-
- elif (isinstance(dtype, np.dtype) and dtype.kind in ["m", "M"]) or isinstance(
- dtype, (DatetimeTZDtype, PeriodDtype)
- ):
- # error: Incompatible types in assignment (expression has type
- # "Callable[[Any], Any]", variable has type "ufunc")
- checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment]
-
- else:
- # error: Incompatible types in assignment (expression has type "Callable[[Any],
- # Any]", variable has type "ufunc")
- checker = lambda x: _isna_array( # type: ignore[assignment]
- x, inf_as_na=INF_AS_NA
- )
-
- return all(
- checker(arr[i : i + chunk_len]).all() for i in range(0, total_len, chunk_len)
- )
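# Minimal check that the chunked scan above agrees with the naive reduction,
# assuming pandas.core.dtypes.missing is importable.
import numpy as np
from pandas.core.dtypes.missing import isna, isna_all

arr = np.full(100_000, np.nan)
assert isna_all(arr) == isna(arr).all()
arr[0] = 1.0
assert isna_all(arr) == isna(arr).all()   # a non-null chunk short-circuits the scan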
diff --git a/contrib/python/pandas/py3/pandas/core/flags.py b/contrib/python/pandas/py3/pandas/core/flags.py
deleted file mode 100644
index f07c6917d91..00000000000
--- a/contrib/python/pandas/py3/pandas/core/flags.py
+++ /dev/null
@@ -1,115 +0,0 @@
-from __future__ import annotations
-
-import weakref
-
-
-class Flags:
- """
- Flags that apply to pandas objects.
-
- .. versionadded:: 1.2.0
-
- Parameters
- ----------
- obj : Series or DataFrame
- The object these flags are associated with.
- allows_duplicate_labels : bool, default True
- Whether to allow duplicate labels in this object. By default,
- duplicate labels are permitted. Setting this to ``False`` will
- cause an :class:`errors.DuplicateLabelError` to be raised when
- `index` (or columns for DataFrame) is not unique, or any
- subsequent operation introduces duplicates.
- See :ref:`duplicates.disallow` for more.
-
- .. warning::
-
- This is an experimental feature. Currently, many methods fail to
- propagate the ``allows_duplicate_labels`` value. In future versions
- it is expected that every method taking or returning one or more
- DataFrame or Series objects will propagate ``allows_duplicate_labels``.
-
- Notes
- -----
- Attributes can be set in two ways
-
- >>> df = pd.DataFrame()
- >>> df.flags
- <Flags(allows_duplicate_labels=True)>
- >>> df.flags.allows_duplicate_labels = False
- >>> df.flags
- <Flags(allows_duplicate_labels=False)>
-
- >>> df.flags['allows_duplicate_labels'] = True
- >>> df.flags
- <Flags(allows_duplicate_labels=True)>
- """
-
- _keys = {"allows_duplicate_labels"}
-
- def __init__(self, obj, *, allows_duplicate_labels) -> None:
- self._allows_duplicate_labels = allows_duplicate_labels
- self._obj = weakref.ref(obj)
-
- @property
- def allows_duplicate_labels(self) -> bool:
- """
- Whether this object allows duplicate labels.
-
- Setting ``allows_duplicate_labels=False`` ensures that the
- index (and columns of a DataFrame) are unique. Most methods
- that accept and return a Series or DataFrame will propagate
- the value of ``allows_duplicate_labels``.
-
- See :ref:`duplicates` for more.
-
- See Also
- --------
- DataFrame.attrs : Set global metadata on this object.
- DataFrame.set_flags : Set global flags on this object.
-
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a'])
- >>> df.flags.allows_duplicate_labels
- True
- >>> df.flags.allows_duplicate_labels = False
- Traceback (most recent call last):
- ...
- pandas.errors.DuplicateLabelError: Index has duplicates.
- positions
- label
- a [0, 1]
- """
- return self._allows_duplicate_labels
-
- @allows_duplicate_labels.setter
- def allows_duplicate_labels(self, value: bool) -> None:
- value = bool(value)
- obj = self._obj()
- if obj is None:
- raise ValueError("This flag's object has been deleted.")
-
- if not value:
- for ax in obj.axes:
- ax._maybe_check_unique()
-
- self._allows_duplicate_labels = value
-
- def __getitem__(self, key):
- if key not in self._keys:
- raise KeyError(key)
-
- return getattr(self, key)
-
- def __setitem__(self, key, value) -> None:
- if key not in self._keys:
- raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}")
- setattr(self, key, value)
-
- def __repr__(self) -> str:
- return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>"
-
- def __eq__(self, other):
- if isinstance(other, type(self)):
- return self.allows_duplicate_labels == other.allows_duplicate_labels
- return False
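# Quick sketch of the dict-style access validated by __getitem__/__setitem__
# above; DataFrame.set_flags is the public way to construct a flagged copy.
import pandas as pd

df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
print(df.flags["allows_duplicate_labels"])   # False
try:
    df.flags["no_such_flag"] = True
except ValueError as exc:
    print(exc)                               # Unknown flag no_such_flag. Must be one of ...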
diff --git a/contrib/python/pandas/py3/pandas/core/frame.py b/contrib/python/pandas/py3/pandas/core/frame.py
deleted file mode 100644
index 7e1d8711aee..00000000000
--- a/contrib/python/pandas/py3/pandas/core/frame.py
+++ /dev/null
@@ -1,11620 +0,0 @@
-"""
-DataFrame
----------
-An efficient 2D container for potentially mixed-type time series or other
-labeled data series.
-
-Similar to its R counterpart, data.frame, except providing automatic data
-alignment and a host of useful data manipulation methods having to do with the
-labeling information.
-"""
-from __future__ import annotations
-
-import collections
-from collections import abc
-import datetime
-import functools
-from io import StringIO
-import itertools
-import sys
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Iterable,
- Iterator,
- Literal,
- Mapping,
- Sequence,
- cast,
- overload,
-)
-import warnings
-
-import numpy as np
-from numpy import ma
-
-from pandas._config import (
- get_option,
- using_copy_on_write,
-)
-
-from pandas._libs import (
- algos as libalgos,
- lib,
- properties,
-)
-from pandas._libs.hashtable import duplicated
-from pandas._libs.lib import (
- NoDefault,
- is_range_indexer,
- no_default,
-)
-from pandas._typing import (
- AggFuncType,
- AlignJoin,
- AnyAll,
- AnyArrayLike,
- ArrayLike,
- Axes,
- Axis,
- AxisInt,
- ColspaceArgType,
- CompressionOptions,
- CorrelationMethod,
- DropKeep,
- Dtype,
- DtypeObj,
- FilePath,
- FillnaOptions,
- FloatFormatType,
- FormattersType,
- Frequency,
- IgnoreRaise,
- IndexKeyFunc,
- IndexLabel,
- Level,
- MergeHow,
- NaPosition,
- PythonFuncType,
- QuantileInterpolation,
- ReadBuffer,
- Renamer,
- Scalar,
- SortKind,
- StorageOptions,
- Suffixes,
- TimedeltaConvertibleTypes,
- TimestampConvertibleTypes,
- ValueKeyFunc,
- WriteBuffer,
- npt,
-)
-from pandas.compat import PYPY
-from pandas.compat._optional import import_optional_dependency
-from pandas.compat.numpy import (
- function as nv,
- np_percentile_argname,
-)
-from pandas.errors import (
- ChainedAssignmentError,
- InvalidIndexError,
- _chained_assignment_msg,
-)
-from pandas.util._decorators import (
- Appender,
- Substitution,
- doc,
-)
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import (
- validate_ascending,
- validate_bool_kwarg,
- validate_percentile,
-)
-
-from pandas.core.dtypes.cast import (
- LossySetitemError,
- can_hold_element,
- construct_1d_arraylike_from_scalar,
- construct_2d_arraylike_from_scalar,
- find_common_type,
- infer_dtype_from_scalar,
- invalidate_string_dtypes,
- maybe_box_native,
- maybe_downcast_to_dtype,
-)
-from pandas.core.dtypes.common import (
- infer_dtype_from_object,
- is_1d_only_ea_dtype,
- is_bool_dtype,
- is_dataclass,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float,
- is_float_dtype,
- is_hashable,
- is_integer,
- is_integer_dtype,
- is_iterator,
- is_list_like,
- is_scalar,
- is_sequence,
- needs_i8_conversion,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.missing import (
- isna,
- notna,
-)
-
-from pandas.core import (
- algorithms,
- common as com,
- nanops,
- ops,
-)
-from pandas.core.accessor import CachedAccessor
-from pandas.core.apply import (
- reconstruct_func,
- relabel_result,
-)
-from pandas.core.array_algos.take import take_2d_multi
-from pandas.core.arraylike import OpsMixin
-from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
- PeriodArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.arrow import ArrowDtype
-from pandas.core.arrays.sparse import SparseFrameAccessor
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
- sanitize_array,
- sanitize_masked_array,
-)
-from pandas.core.generic import NDFrame
-from pandas.core.indexers import check_key_length
-from pandas.core.indexes.api import (
- DatetimeIndex,
- Index,
- PeriodIndex,
- default_index,
- ensure_index,
- ensure_index_from_sequences,
-)
-from pandas.core.indexes.multi import (
- MultiIndex,
- maybe_droplevels,
-)
-from pandas.core.indexing import (
- check_bool_indexer,
- check_dict_or_set_indexers,
-)
-from pandas.core.internals import (
- ArrayManager,
- BlockManager,
-)
-from pandas.core.internals.construction import (
- arrays_to_mgr,
- dataclasses_to_dicts,
- dict_to_mgr,
- mgr_to_mgr,
- ndarray_to_mgr,
- nested_data_to_arrays,
- rec_array_to_mgr,
- reorder_arrays,
- to_arrays,
- treat_as_nested,
-)
-from pandas.core.methods import selectn
-from pandas.core.reshape.melt import melt
-from pandas.core.series import Series
-from pandas.core.shared_docs import _shared_docs
-from pandas.core.sorting import (
- get_group_index,
- lexsort_indexer,
- nargsort,
-)
-
-from pandas.io.common import get_handle
-from pandas.io.formats import (
- console,
- format as fmt,
-)
-from pandas.io.formats.info import (
- INFO_DOCSTRING,
- DataFrameInfo,
- frame_sub_kwargs,
-)
-import pandas.plotting
-
-if TYPE_CHECKING:
- from pandas.core.groupby.generic import DataFrameGroupBy
- from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
- from pandas.core.internals import SingleDataManager
- from pandas.core.resample import Resampler
-
- from pandas.io.formats.style import Styler
-
-# ---------------------------------------------------------------------
-# Docstring templates
-
-_shared_doc_kwargs = {
- "axes": "index, columns",
- "klass": "DataFrame",
- "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
- "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
- If 0 or 'index': apply function to each column.
- If 1 or 'columns': apply function to each row.""",
- "inplace": """
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.""",
- "optional_by": """
-by : str or list of str
- Name or list of names to sort by.
-
- - if `axis` is 0 or `'index'` then `by` may contain index
- levels and/or column labels.
- - if `axis` is 1 or `'columns'` then `by` may contain column
- levels and/or index labels.""",
- "optional_reindex": """
-labels : array-like, optional
- New labels / index to conform the axis specified by 'axis' to.
-index : array-like, optional
- New labels for the index. Preferably an Index object to avoid
- duplicating data.
-columns : array-like, optional
- New labels for the columns. Preferably an Index object to avoid
- duplicating data.
-axis : int or str, optional
- Axis to target. Can be either the axis name ('index', 'columns')
- or number (0, 1).""",
- "replace_iloc": """
- This differs from updating with ``.loc`` or ``.iloc``, which require
- you to specify a location to update with some value.""",
-}
-
-_numeric_only_doc = """numeric_only : bool, default False
- Include only float, int, boolean data.
-"""
-
-_merge_doc = """
-Merge DataFrame or named Series objects with a database-style join.
-
-A named Series object is treated as a DataFrame with a single named column.
-
-The join is done on columns or indexes. If joining columns on
-columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
-on indexes or indexes on a column or columns, the index will be passed on.
-When performing a cross merge, no column specifications to merge on are
-allowed.
-
-.. warning::
-
- If both key columns contain rows where the key is a null value, those
- rows will be matched against each other. This is different from usual SQL
- join behaviour and can lead to unexpected results.
-
-Parameters
-----------%s
-right : DataFrame or named Series
- Object to merge with.
-how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
- Type of merge to be performed.
-
- * left: use only keys from left frame, similar to a SQL left outer join;
- preserve key order.
- * right: use only keys from right frame, similar to a SQL right outer join;
- preserve key order.
- * outer: use union of keys from both frames, similar to a SQL full outer
- join; sort keys lexicographically.
- * inner: use intersection of keys from both frames, similar to a SQL inner
- join; preserve the order of the left keys.
- * cross: creates the cartesian product from both frames, preserves the order
- of the left keys.
-
- .. versionadded:: 1.2.0
-
-on : label or list
- Column or index level names to join on. These must be found in both
- DataFrames. If `on` is None and not merging on indexes then this defaults
- to the intersection of the columns in both DataFrames.
-left_on : label or list, or array-like
- Column or index level names to join on in the left DataFrame. Can also
- be an array or list of arrays of the length of the left DataFrame.
- These arrays are treated as if they are columns.
-right_on : label or list, or array-like
- Column or index level names to join on in the right DataFrame. Can also
- be an array or list of arrays of the length of the right DataFrame.
- These arrays are treated as if they are columns.
-left_index : bool, default False
- Use the index from the left DataFrame as the join key(s). If it is a
- MultiIndex, the number of keys in the other DataFrame (either the index
- or a number of columns) must match the number of levels.
-right_index : bool, default False
- Use the index from the right DataFrame as the join key. Same caveats as
- left_index.
-sort : bool, default False
- Sort the join keys lexicographically in the result DataFrame. If False,
- the order of the join keys depends on the join type (how keyword).
-suffixes : list-like, default is ("_x", "_y")
- A length-2 sequence where each element is optionally a string
- indicating the suffix to add to overlapping column names in
- `left` and `right` respectively. Pass a value of `None` instead
- of a string to indicate that the column name from `left` or
- `right` should be left as-is, with no suffix. At least one of the
- values must not be None.
-copy : bool, default True
- If False, avoid copy if possible.
-indicator : bool or str, default False
- If True, adds a column to the output DataFrame called "_merge" with
- information on the source of each row. The column can be given a different
- name by providing a string argument. The column will have a Categorical
- type with the value of "left_only" for observations whose merge key only
- appears in the left DataFrame, "right_only" for observations
- whose merge key only appears in the right DataFrame, and "both"
- if the observation's merge key is found in both DataFrames.
-
-validate : str, optional
- If specified, checks if merge is of specified type.
-
- * "one_to_one" or "1:1": check if merge keys are unique in both
- left and right datasets.
- * "one_to_many" or "1:m": check if merge keys are unique in left
- dataset.
- * "many_to_one" or "m:1": check if merge keys are unique in right
- dataset.
- * "many_to_many" or "m:m": allowed, but does not result in checks.
-
-Returns
--------
-DataFrame
- A DataFrame of the two merged objects.
-
-See Also
---------
-merge_ordered : Merge with optional filling/interpolation.
-merge_asof : Merge on nearest keys.
-DataFrame.join : Similar method using indices.
-
-Notes
------
-Support for specifying index levels as the `on`, `left_on`, and
-`right_on` parameters was added in version 0.23.0.
-Support for merging named Series objects was added in version 0.24.0.
-
-Examples
---------
->>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
-... 'value': [1, 2, 3, 5]})
->>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
-... 'value': [5, 6, 7, 8]})
->>> df1
- lkey value
-0 foo 1
-1 bar 2
-2 baz 3
-3 foo 5
->>> df2
- rkey value
-0 foo 5
-1 bar 6
-2 baz 7
-3 foo 8
-
-Merge df1 and df2 on the lkey and rkey columns. The value columns have
-the default suffixes, _x and _y, appended.
-
->>> df1.merge(df2, left_on='lkey', right_on='rkey')
- lkey value_x rkey value_y
-0 foo 1 foo 5
-1 foo 1 foo 8
-2 foo 5 foo 5
-3 foo 5 foo 8
-4 bar 2 bar 6
-5 baz 3 baz 7
-
-Merge DataFrames df1 and df2 with specified left and right suffixes
-appended to any overlapping columns.
-
->>> df1.merge(df2, left_on='lkey', right_on='rkey',
-... suffixes=('_left', '_right'))
- lkey value_left rkey value_right
-0 foo 1 foo 5
-1 foo 1 foo 8
-2 foo 5 foo 5
-3 foo 5 foo 8
-4 bar 2 bar 6
-5 baz 3 baz 7
-
-Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
-any overlapping columns.
-
->>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
-Traceback (most recent call last):
-...
-ValueError: columns overlap but no suffix specified:
- Index(['value'], dtype='object')
-
->>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
->>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
->>> df1
- a b
-0 foo 1
-1 bar 2
->>> df2
- a c
-0 foo 3
-1 baz 4
-
->>> df1.merge(df2, how='inner', on='a')
- a b c
-0 foo 1 3
-
->>> df1.merge(df2, how='left', on='a')
- a b c
-0 foo 1 3.0
-1 bar 2 NaN
-
->>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
->>> df2 = pd.DataFrame({'right': [7, 8]})
->>> df1
- left
-0 foo
-1 bar
->>> df2
- right
-0 7
-1 8
-
->>> df1.merge(df2, how='cross')
- left right
-0 foo 7
-1 foo 8
-2 bar 7
-3 bar 8
-"""
-
-
-# -----------------------------------------------------------------------
-# DataFrame class
-
-
-class DataFrame(NDFrame, OpsMixin):
- """
- Two-dimensional, size-mutable, potentially heterogeneous tabular data.
-
- Data structure also contains labeled axes (rows and columns).
- Arithmetic operations align on both row and column labels. Can be
- thought of as a dict-like container for Series objects. The primary
- pandas data structure.
-
- Parameters
- ----------
- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
- Dict can contain Series, arrays, constants, dataclass or list-like objects. If
- data is a dict, column order follows insertion-order. If a dict contains Series
- which have an index defined, it is aligned by its index. This alignment also
- occurs if data is a Series or a DataFrame itself. Alignment is done on
- Series/DataFrame inputs.
-
- If data is a list of dicts, column order follows insertion-order.
-
- index : Index or array-like
- Index to use for resulting frame. Will default to RangeIndex if
- no indexing information part of input data and no index provided.
- columns : Index or array-like
- Column labels to use for resulting frame when data does not have them,
- defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
- will perform column selection instead.
- dtype : dtype, default None
- Data type to force. Only a single dtype is allowed. If None, infer.
- copy : bool or None, default None
- Copy data from inputs.
- For dict data, the default of None behaves like ``copy=True``. For DataFrame
- or 2d ndarray input, the default of None behaves like ``copy=False``.
- If data is a dict containing one or more Series (possibly of different dtypes),
- ``copy=False`` will ensure that these inputs are not copied.
-
- .. versionchanged:: 1.3.0
-
- See Also
- --------
- DataFrame.from_records : Constructor from tuples, also record arrays.
- DataFrame.from_dict : From dicts of Series, arrays, or dicts.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- read_table : Read general delimited file into DataFrame.
- read_clipboard : Read text from clipboard into DataFrame.
-
- Notes
- -----
- Please reference the :ref:`User Guide <basics.dataframe>` for more information.
-
- Examples
- --------
- Constructing DataFrame from a dictionary.
-
- >>> d = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df = pd.DataFrame(data=d)
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
-
- Notice that the inferred dtype is int64.
-
- >>> df.dtypes
- col1 int64
- col2 int64
- dtype: object
-
- To enforce a single dtype:
-
- >>> df = pd.DataFrame(data=d, dtype=np.int8)
- >>> df.dtypes
- col1 int8
- col2 int8
- dtype: object
-
- Constructing DataFrame from a dictionary including Series:
-
- >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
- >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
- col1 col2
- 0 0 NaN
- 1 1 NaN
- 2 2 2.0
- 3 3 3.0
-
- Constructing DataFrame from numpy ndarray:
-
- >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
- ... columns=['a', 'b', 'c'])
- >>> df2
- a b c
- 0 1 2 3
- 1 4 5 6
- 2 7 8 9
-
- Constructing DataFrame from a numpy ndarray that has labeled columns:
-
- >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
- ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
- >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
- ...
- >>> df3
- c a
- 0 3 1
- 1 6 4
- 2 9 7
-
- Constructing DataFrame from dataclass:
-
- >>> from dataclasses import make_dataclass
- >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
- >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
- x y
- 0 0 0
- 1 0 3
- 2 2 3
-
- Constructing DataFrame from Series/DataFrame:
-
- >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
- >>> df = pd.DataFrame(data=ser, index=["a", "c"])
- >>> df
- 0
- a 1
- c 3
-
- >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
- >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
- >>> df2
- x
- a 1
- c 3
- """
-
- _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
- _typ = "dataframe"
- _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
- _accessors: set[str] = {"sparse"}
- _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
- _mgr: BlockManager | ArrayManager
-
- @property
- def _constructor(self) -> Callable[..., DataFrame]:
- return DataFrame
-
- _constructor_sliced: Callable[..., Series] = Series
-
- # ----------------------------------------------------------------------
- # Constructors
-
- def __init__(
- self,
- data=None,
- index: Axes | None = None,
- columns: Axes | None = None,
- dtype: Dtype | None = None,
- copy: bool | None = None,
- ) -> None:
- if dtype is not None:
- dtype = self._validate_dtype(dtype)
-
- if isinstance(data, DataFrame):
- data = data._mgr
- if not copy:
- # if not copying data, ensure to still return a shallow copy
- # to avoid the result sharing the same Manager
- data = data.copy(deep=False)
-
- if isinstance(data, (BlockManager, ArrayManager)):
- if using_copy_on_write():
- data = data.copy(deep=False)
- # first check if a Manager is passed without any other arguments
- # -> use fastpath (without checking Manager type)
- if index is None and columns is None and dtype is None and not copy:
- # GH#33357 fastpath
- NDFrame.__init__(self, data)
- return
-
- manager = get_option("mode.data_manager")
-
- # GH47215
- if index is not None and isinstance(index, set):
- raise ValueError("index cannot be a set")
- if columns is not None and isinstance(columns, set):
- raise ValueError("columns cannot be a set")
-
- if copy is None:
- if isinstance(data, dict):
- # retain pre-GH#38939 default behavior
- copy = True
- elif (
- manager == "array"
- and isinstance(data, (np.ndarray, ExtensionArray))
- and data.ndim == 2
- ):
- # INFO(ArrayManager) by default copy the 2D input array to get
- # contiguous 1D arrays
- copy = True
- elif using_copy_on_write() and not isinstance(
- data, (Index, DataFrame, Series)
- ):
- copy = True
- else:
- copy = False
-
- if data is None:
- index = index if index is not None else default_index(0)
- columns = columns if columns is not None else default_index(0)
- dtype = dtype if dtype is not None else pandas_dtype(object)
- data = []
-
- if isinstance(data, (BlockManager, ArrayManager)):
- mgr = self._init_mgr(
- data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
- )
-
- elif isinstance(data, dict):
- # GH#38939 de facto copy defaults to False only in non-dict cases
- mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
- elif isinstance(data, ma.MaskedArray):
- from numpy.ma import mrecords
-
- # masked recarray
- if isinstance(data, mrecords.MaskedRecords):
- raise TypeError(
- "MaskedRecords are not supported. Pass "
- "{name: data[name] for name in data.dtype.names} "
- "instead"
- )
-
- # a masked array
- data = sanitize_masked_array(data)
- mgr = ndarray_to_mgr(
- data,
- index,
- columns,
- dtype=dtype,
- copy=copy,
- typ=manager,
- )
-
- elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
- if data.dtype.names:
- # i.e. numpy structured array
- data = cast(np.ndarray, data)
- mgr = rec_array_to_mgr(
- data,
- index,
- columns,
- dtype,
- copy,
- typ=manager,
- )
- elif getattr(data, "name", None) is not None:
- # i.e. Series/Index with non-None name
- _copy = copy if using_copy_on_write() else True
- mgr = dict_to_mgr(
- # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
- # attribute "name"
- {data.name: data}, # type: ignore[union-attr]
- index,
- columns,
- dtype=dtype,
- typ=manager,
- copy=_copy,
- )
- else:
- mgr = ndarray_to_mgr(
- data,
- index,
- columns,
- dtype=dtype,
- copy=copy,
- typ=manager,
- )
-
- # For data is list-like, or Iterable (will consume into list)
- elif is_list_like(data):
- if not isinstance(data, abc.Sequence):
- if hasattr(data, "__array__"):
- # GH#44616 big perf improvement for e.g. pytorch tensor
- data = np.asarray(data)
- else:
- data = list(data)
- if len(data) > 0:
- if is_dataclass(data[0]):
- data = dataclasses_to_dicts(data)
- if not isinstance(data, np.ndarray) and treat_as_nested(data):
- # exclude ndarray as we may have cast it a few lines above
- if columns is not None:
- columns = ensure_index(columns)
- arrays, columns, index = nested_data_to_arrays(
- # error: Argument 3 to "nested_data_to_arrays" has incompatible
- # type "Optional[Collection[Any]]"; expected "Optional[Index]"
- data,
- columns,
- index, # type: ignore[arg-type]
- dtype,
- )
- mgr = arrays_to_mgr(
- arrays,
- columns,
- index,
- dtype=dtype,
- typ=manager,
- )
- else:
- mgr = ndarray_to_mgr(
- data,
- index,
- columns,
- dtype=dtype,
- copy=copy,
- typ=manager,
- )
- else:
- mgr = dict_to_mgr(
- {},
- index,
- columns if columns is not None else default_index(0),
- dtype=dtype,
- typ=manager,
- )
- # For data is scalar
- else:
- if index is None or columns is None:
- raise ValueError("DataFrame constructor not properly called!")
-
- index = ensure_index(index)
- columns = ensure_index(columns)
-
- if not dtype:
- dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)
-
- # For data is a scalar extension dtype
- if isinstance(dtype, ExtensionDtype):
- # TODO(EA2D): special case not needed with 2D EAs
-
- values = [
- construct_1d_arraylike_from_scalar(data, len(index), dtype)
- for _ in range(len(columns))
- ]
- mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
- else:
- arr2d = construct_2d_arraylike_from_scalar(
- data,
- len(index),
- len(columns),
- dtype,
- copy,
- )
-
- mgr = ndarray_to_mgr(
- arr2d,
- index,
- columns,
- dtype=arr2d.dtype,
- copy=False,
- typ=manager,
- )
-
- # ensure correct Manager type according to settings
- mgr = mgr_to_mgr(mgr, typ=manager)
-
- NDFrame.__init__(self, mgr)
-
- # ----------------------------------------------------------------------
- def __dataframe__(
- self, nan_as_null: bool = False, allow_copy: bool = True
- ) -> DataFrameXchg:
- """
- Return the dataframe interchange object implementing the interchange protocol.
-
- Parameters
- ----------
- nan_as_null : bool, default False
- Whether to tell the DataFrame to overwrite null values in the data
- with ``NaN`` (or ``NaT``).
- allow_copy : bool, default True
- Whether to allow memory copying when exporting. If set to False
- it would cause non-zero-copy exports to fail.
-
- Returns
- -------
- DataFrame interchange object
- The object which the consuming library can use to ingest the dataframe.
-
- Notes
- -----
- Details on the interchange protocol:
- https://data-apis.org/dataframe-protocol/latest/index.html
-
- `nan_as_null` currently has no effect; once support for nullable extension
- dtypes is added, this value should be propagated to columns.
- """
-
- from pandas.core.interchange.dataframe import PandasDataFrameXchg
-
- return PandasDataFrameXchg(self, nan_as_null, allow_copy)
-
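# Round-trip sketch of the interchange protocol described above, assuming
# pandas.api.interchange.from_dataframe is available (pandas >= 1.5).
import pandas as pd
from pandas.api.interchange import from_dataframe

df = pd.DataFrame({"x": [1, 2, 3]})
xchg = df.__dataframe__()           # interchange object
print(from_dataframe(xchg))         # any protocol-aware consumer could ingest xchg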
- # ----------------------------------------------------------------------
-
- @property
- def axes(self) -> list[Index]:
- """
- Return a list representing the axes of the DataFrame.
-
- It has the row axis labels and column axis labels as the only members.
- They are returned in that order.
-
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.axes
- [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
- dtype='object')]
- """
- return [self.index, self.columns]
-
- @property
- def shape(self) -> tuple[int, int]:
- """
- Return a tuple representing the dimensionality of the DataFrame.
-
- See Also
- --------
- ndarray.shape : Tuple of array dimensions.
-
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.shape
- (2, 2)
-
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
- ... 'col3': [5, 6]})
- >>> df.shape
- (2, 3)
- """
- return len(self.index), len(self.columns)
-
- @property
- def _is_homogeneous_type(self) -> bool:
- """
- Whether all the columns in a DataFrame have the same type.
-
- Returns
- -------
- bool
-
- See Also
- --------
- Index._is_homogeneous_type : Whether the object has a single
- dtype.
- MultiIndex._is_homogeneous_type : Whether all the levels of a
- MultiIndex have the same dtype.
-
- Examples
- --------
- >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
- True
- >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
- False
-
- Items with the same type but different sizes are considered
- different types.
-
- >>> DataFrame({
- ... "A": np.array([1, 2], dtype=np.int32),
- ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
- False
- """
- if isinstance(self._mgr, ArrayManager):
- return len({arr.dtype for arr in self._mgr.arrays}) == 1
- if self._mgr.any_extension_types:
- return len({block.dtype for block in self._mgr.blocks}) == 1
- else:
- return not self._is_mixed_type
-
- @property
- def _can_fast_transpose(self) -> bool:
- """
- Can we transpose this DataFrame without creating any new array objects.
- """
- if isinstance(self._mgr, ArrayManager):
- return False
- blocks = self._mgr.blocks
- if len(blocks) != 1:
- return False
-
- dtype = blocks[0].dtype
- # TODO(EA2D) special case would be unnecessary with 2D EAs
- return not is_1d_only_ea_dtype(dtype)
-
- @property
- def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
- """
- Analogue to ._values that may return a 2D ExtensionArray.
- """
- mgr = self._mgr
-
- if isinstance(mgr, ArrayManager):
- if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
- # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
- # has no attribute "reshape"
- return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr]
- return ensure_wrapped_if_datetimelike(self.values)
-
- blocks = mgr.blocks
- if len(blocks) != 1:
- return ensure_wrapped_if_datetimelike(self.values)
-
- arr = blocks[0].values
- if arr.ndim == 1:
- # non-2D ExtensionArray
- return self.values
-
- # more generally, whatever we allow in NDArrayBackedExtensionBlock
- arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
- return arr.T
-
- # ----------------------------------------------------------------------
- # Rendering Methods
-
- def _repr_fits_vertical_(self) -> bool:
- """
- Check length against max_rows.
- """
- max_rows = get_option("display.max_rows")
- return len(self) <= max_rows
-
- def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
- """
- Check if full repr fits in horizontal boundaries imposed by the display
- options width and max_columns.
-
- In the case of a non-interactive session, no boundaries apply.
-
- `ignore_width` is here so ipynb+HTML output can behave the way
- users expect. display.max_columns remains in effect.
- GH3541, GH3573
- """
- width, height = console.get_console_size()
- max_columns = get_option("display.max_columns")
- nb_columns = len(self.columns)
-
- # exceed max columns
- if (max_columns and nb_columns > max_columns) or (
- (not ignore_width) and width and nb_columns > (width // 2)
- ):
- return False
-
- # used by repr_html under IPython notebook or scripts ignore terminal
- # dims
- if ignore_width or width is None or not console.in_interactive_session():
- return True
-
- if get_option("display.width") is not None or console.in_ipython_frontend():
- # check at least the column row for excessive width
- max_rows = 1
- else:
- max_rows = get_option("display.max_rows")
-
- # when auto-detecting, so width=None and not in ipython front end
- # check whether repr fits horizontal by actually checking
- # the width of the rendered repr
- buf = StringIO()
-
- # only care about the stuff we'll actually print out
- # and to_string on entire frame may be expensive
- d = self
-
- if max_rows is not None: # a row limit is in effect
- # min of two, where one may be None
- d = d.iloc[: min(max_rows, len(d))]
- else:
- return True
-
- d.to_string(buf=buf)
- value = buf.getvalue()
- repr_width = max(len(line) for line in value.split("\n"))
-
- return repr_width < width
-
- def _info_repr(self) -> bool:
- """
- True if the repr should show the info view.
- """
- info_repr_option = get_option("display.large_repr") == "info"
- return info_repr_option and not (
- self._repr_fits_horizontal_() and self._repr_fits_vertical_()
- )
-
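# Sketch of the option that drives _info_repr above, assuming the standard
# display.large_repr and display.max_rows option keys.
import pandas as pd

df = pd.DataFrame({"a": range(1000)})
with pd.option_context("display.large_repr", "info", "display.max_rows", 10):
    print(df)   # repr falls back to the info view instead of a truncated table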
- def __repr__(self) -> str:
- """
- Return a string representation for a particular DataFrame.
- """
- if self._info_repr():
- buf = StringIO()
- self.info(buf=buf)
- return buf.getvalue()
-
- repr_params = fmt.get_dataframe_repr_params()
- return self.to_string(**repr_params)
-
- def _repr_html_(self) -> str | None:
- """
- Return a html representation for a particular DataFrame.
-
- Mainly for IPython notebook.
- """
- if self._info_repr():
- buf = StringIO()
- self.info(buf=buf)
- # need to escape the <class>, should be the first line.
- val = buf.getvalue().replace("<", r"&lt;", 1)
- val = val.replace(">", r"&gt;", 1)
- return f"<pre>{val}</pre>"
-
- if get_option("display.notebook_repr_html"):
- max_rows = get_option("display.max_rows")
- min_rows = get_option("display.min_rows")
- max_cols = get_option("display.max_columns")
- show_dimensions = get_option("display.show_dimensions")
-
- formatter = fmt.DataFrameFormatter(
- self,
- columns=None,
- col_space=None,
- na_rep="NaN",
- formatters=None,
- float_format=None,
- sparsify=None,
- justify=None,
- index_names=True,
- header=True,
- index=True,
- bold_rows=True,
- escape=True,
- max_rows=max_rows,
- min_rows=min_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=".",
- )
- return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
- else:
- return None
-
- @overload
- def to_string(
- self,
- buf: None = ...,
- columns: Sequence[str] | None = ...,
- col_space: int | list[int] | dict[Hashable, int] | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: fmt.FormattersType | None = ...,
- float_format: fmt.FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool = ...,
- decimal: str = ...,
- line_width: int | None = ...,
- min_rows: int | None = ...,
- max_colwidth: int | None = ...,
- encoding: str | None = ...,
- ) -> str:
- ...
-
- @overload
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str],
- columns: Sequence[str] | None = ...,
- col_space: int | list[int] | dict[Hashable, int] | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: fmt.FormattersType | None = ...,
- float_format: fmt.FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool = ...,
- decimal: str = ...,
- line_width: int | None = ...,
- min_rows: int | None = ...,
- max_colwidth: int | None = ...,
- encoding: str | None = ...,
- ) -> None:
- ...
-
- @Substitution(
- header_type="bool or sequence of str",
- header="Write out the column names. If a list of strings "
- "is given, it is assumed to be aliases for the "
- "column names",
- col_space_type="int, list or dict of int",
- col_space="The minimum width of each column. If a list of ints is given "
- "every integers corresponds with one column. If a dict is given, the key "
- "references the column, while the value defines the space to use.",
- )
- @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- columns: Sequence[str] | None = None,
- col_space: int | list[int] | dict[Hashable, int] | None = None,
- header: bool | Sequence[str] = True,
- index: bool = True,
- na_rep: str = "NaN",
- formatters: fmt.FormattersType | None = None,
- float_format: fmt.FloatFormatType | None = None,
- sparsify: bool | None = None,
- index_names: bool = True,
- justify: str | None = None,
- max_rows: int | None = None,
- max_cols: int | None = None,
- show_dimensions: bool = False,
- decimal: str = ".",
- line_width: int | None = None,
- min_rows: int | None = None,
- max_colwidth: int | None = None,
- encoding: str | None = None,
- ) -> str | None:
- """
- Render a DataFrame to a console-friendly tabular output.
- %(shared_params)s
- line_width : int, optional
- Width to wrap a line in characters.
- min_rows : int, optional
- The number of rows to display in the console in a truncated repr
- (when number of rows is above `max_rows`).
- max_colwidth : int, optional
- Max width to truncate each column in characters. By default, no limit.
- encoding : str, default "utf-8"
- Set character encoding.
- %(returns)s
- See Also
- --------
- to_html : Convert DataFrame to HTML.
-
- Examples
- --------
- >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
- >>> df = pd.DataFrame(d)
- >>> print(df.to_string())
- col1 col2
- 0 1 4
- 1 2 5
- 2 3 6
- """
- from pandas import option_context
-
- with option_context("display.max_colwidth", max_colwidth):
- formatter = fmt.DataFrameFormatter(
- self,
- columns=columns,
- col_space=col_space,
- na_rep=na_rep,
- formatters=formatters,
- float_format=float_format,
- sparsify=sparsify,
- justify=justify,
- index_names=index_names,
- header=header,
- index=index,
- min_rows=min_rows,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=decimal,
- )
- return fmt.DataFrameRenderer(formatter).to_string(
- buf=buf,
- encoding=encoding,
- line_width=line_width,
- )
-
- # ----------------------------------------------------------------------
-
- @property
- def style(self) -> Styler:
- """
- Returns a Styler object.
-
- Contains methods for building a styled HTML representation of the DataFrame.
-
- See Also
- --------
- io.formats.style.Styler : Helps style a DataFrame or Series according to the
- data with HTML and CSS.
- """
- from pandas.io.formats.style import Styler
-
- return Styler(self)
-
- _shared_docs[
- "items"
- ] = r"""
- Iterate over (column name, Series) pairs.
-
- Iterates over the DataFrame columns, returning a tuple with
- the column name and the content as a Series.
-
- Yields
- ------
- label : object
- The column names for the DataFrame being iterated over.
- content : Series
- The column entries belonging to each label, as a Series.
-
- See Also
- --------
- DataFrame.iterrows : Iterate over DataFrame rows as
- (index, Series) pairs.
- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
- of the values.
-
- Examples
- --------
- >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
- ... 'population': [1864, 22000, 80000]},
- ... index=['panda', 'polar', 'koala'])
- >>> df
- species population
- panda bear 1864
- polar bear 22000
- koala marsupial 80000
- >>> for label, content in df.items():
- ... print(f'label: {label}')
- ... print(f'content: {content}', sep='\n')
- ...
- label: species
- content:
- panda bear
- polar bear
- koala marsupial
- Name: species, dtype: object
- label: population
- content:
- panda 1864
- polar 22000
- koala 80000
- Name: population, dtype: int64
- """
-
- @Appender(_shared_docs["items"])
- def items(self) -> Iterable[tuple[Hashable, Series]]:
- if self.columns.is_unique and hasattr(self, "_item_cache"):
- for k in self.columns:
- yield k, self._get_item_cache(k)
- else:
- for i, k in enumerate(self.columns):
- yield k, self._ixs(i, axis=1)
-
- def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
- """
- Iterate over DataFrame rows as (index, Series) pairs.
-
- Yields
- ------
- index : label or tuple of label
- The index of the row. A tuple for a `MultiIndex`.
- data : Series
- The data of the row as a Series.
-
- See Also
- --------
- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
- DataFrame.items : Iterate over (column name, Series) pairs.
-
- Notes
- -----
- 1. Because ``iterrows`` returns a Series for each row,
- it does **not** preserve dtypes across the rows (dtypes are
- preserved across columns for DataFrames). For example,
-
- >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
- >>> row = next(df.iterrows())[1]
- >>> row
- int 1.0
- float 1.5
- Name: 0, dtype: float64
- >>> print(row['int'].dtype)
- float64
- >>> print(df['int'].dtype)
- int64
-
- To preserve dtypes while iterating over the rows, it is better
- to use :meth:`itertuples` which returns namedtuples of the values
- and which is generally faster than ``iterrows``.
-
- 2. You should **never modify** something you are iterating over.
- This is not guaranteed to work in all cases. Depending on the
- data types, the iterator returns a copy and not a view, and writing
- to it will have no effect.
- """
- columns = self.columns
- klass = self._constructor_sliced
- using_cow = using_copy_on_write()
- for k, v in zip(self.index, self.values):
- s = klass(v, index=columns, name=k).__finalize__(self)
- if using_cow and self._mgr.is_single_block:
- s._mgr.add_references(self._mgr) # type: ignore[arg-type]
- yield k, s
-
- def itertuples(
- self, index: bool = True, name: str | None = "Pandas"
- ) -> Iterable[tuple[Any, ...]]:
- """
- Iterate over DataFrame rows as namedtuples.
-
- Parameters
- ----------
- index : bool, default True
- If True, return the index as the first element of the tuple.
- name : str or None, default "Pandas"
- The name of the returned namedtuples or None to return regular
- tuples.
-
- Returns
- -------
- iterator
- An object to iterate over namedtuples for each row in the
- DataFrame with the first field possibly being the index and
- following fields being the column values.
-
- See Also
- --------
- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
- pairs.
- DataFrame.items : Iterate over (column name, Series) pairs.
-
- Notes
- -----
- The column names will be renamed to positional names if they are
- invalid Python identifiers, repeated, or start with an underscore.
-
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
- ... index=['dog', 'hawk'])
- >>> df
- num_legs num_wings
- dog 4 0
- hawk 2 2
- >>> for row in df.itertuples():
- ... print(row)
- ...
- Pandas(Index='dog', num_legs=4, num_wings=0)
- Pandas(Index='hawk', num_legs=2, num_wings=2)
-
- By setting the `index` parameter to False we can remove the index
- as the first element of the tuple:
-
- >>> for row in df.itertuples(index=False):
- ... print(row)
- ...
- Pandas(num_legs=4, num_wings=0)
- Pandas(num_legs=2, num_wings=2)
-
- With the `name` parameter set we set a custom name for the yielded
- namedtuples:
-
- >>> for row in df.itertuples(name='Animal'):
- ... print(row)
- ...
- Animal(Index='dog', num_legs=4, num_wings=0)
- Animal(Index='hawk', num_legs=2, num_wings=2)
- """
- arrays = []
- fields = list(self.columns)
- if index:
- arrays.append(self.index)
- fields.insert(0, "Index")
-
- # use integer indexing because of possible duplicate column names
- arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
-
- if name is not None:
- # https://github.com/python/mypy/issues/9046
- # error: namedtuple() expects a string literal as the first argument
- itertuple = collections.namedtuple( # type: ignore[misc]
- name, fields, rename=True
- )
- return map(itertuple._make, zip(*arrays))
-
- # fallback to regular tuples
- return zip(*arrays)
-
- def __len__(self) -> int:
- """
- Returns the length of the info axis, which for a DataFrame is the index.
- """
- return len(self.index)
-
- @overload
- def dot(self, other: Series) -> Series:
- ...
-
- @overload
- def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
- ...
-
- def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
- """
- Compute the matrix multiplication between the DataFrame and other.
-
- This method computes the matrix product between the DataFrame and the
- values of another Series, DataFrame or a numpy array.
-
- It can also be called using ``self @ other`` in Python >= 3.5.
-
- Parameters
- ----------
- other : Series, DataFrame or array-like
- The other object to compute the matrix product with.
-
- Returns
- -------
- Series or DataFrame
- If other is a Series, return the matrix product between self and
- other as a Series. If other is a DataFrame or a numpy.array, return
- the matrix product of self and other as a DataFrame.
-
- See Also
- --------
- Series.dot: Similar method for Series.
-
- Notes
- -----
- The dimensions of DataFrame and other must be compatible in order to
- compute the matrix multiplication. In addition, the column names of
- DataFrame and the index of other must contain the same values, as they
- will be aligned prior to the multiplication.
-
- The dot method for Series computes the inner product, instead of the
- matrix product here.
-
- Examples
- --------
- Here we multiply a DataFrame with a Series.
-
- >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
- >>> s = pd.Series([1, 1, 2, 1])
- >>> df.dot(s)
- 0 -4
- 1 5
- dtype: int64
-
- Here we multiply a DataFrame with another DataFrame.
-
- >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
- >>> df.dot(other)
- 0 1
- 0 1 4
- 1 2 2
-
- Note that the dot method gives the same result as ``@``:
-
- >>> df @ other
- 0 1
- 0 1 4
- 1 2 2
-
- The dot method also works if other is a np.array.
-
- >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
- >>> df.dot(arr)
- 0 1
- 0 1 4
- 1 2 2
-
- Note how shuffling of the objects does not change the result.
-
- >>> s2 = s.reindex([1, 0, 2, 3])
- >>> df.dot(s2)
- 0 -4
- 1 5
- dtype: int64
- """
- if isinstance(other, (Series, DataFrame)):
- common = self.columns.union(other.index)
- if len(common) > len(self.columns) or len(common) > len(other.index):
- raise ValueError("matrices are not aligned")
-
- left = self.reindex(columns=common, copy=False)
- right = other.reindex(index=common, copy=False)
- lvals = left.values
- rvals = right._values
- else:
- left = self
- lvals = self.values
- rvals = np.asarray(other)
- if lvals.shape[1] != rvals.shape[0]:
- raise ValueError(
- f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
- )
-
- if isinstance(other, DataFrame):
- return self._constructor(
- np.dot(lvals, rvals),
- index=left.index,
- columns=other.columns,
- copy=False,
- )
- elif isinstance(other, Series):
- return self._constructor_sliced(
- np.dot(lvals, rvals), index=left.index, copy=False
- )
- elif isinstance(rvals, (np.ndarray, Index)):
- result = np.dot(lvals, rvals)
- if result.ndim == 2:
- return self._constructor(result, index=left.index, copy=False)
- else:
- return self._constructor_sliced(result, index=left.index, copy=False)
- else: # pragma: no cover
- raise TypeError(f"unsupported type: {type(other)}")
-
- @overload
- def __matmul__(self, other: Series) -> Series:
- ...
-
- @overload
- def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
- ...
-
- def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
- """
- Matrix multiplication using the binary `@` operator in Python >= 3.5.
- """
- return self.dot(other)
-
- def __rmatmul__(self, other) -> DataFrame:
- """
- Matrix multiplication using the binary `@` operator in Python >= 3.5.
- """
- try:
- return self.T.dot(np.transpose(other)).T
- except ValueError as err:
- if "shape mismatch" not in str(err):
- raise
- # GH#21581 give exception message for original shapes
- msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
- raise ValueError(msg) from err
-
- # ----------------------------------------------------------------------
- # IO methods (to / from other formats)
-
- @classmethod
- def from_dict(
- cls,
- data: dict,
- orient: str = "columns",
- dtype: Dtype | None = None,
- columns: Axes | None = None,
- ) -> DataFrame:
- """
- Construct DataFrame from dict of array-like or dicts.
-
- Creates DataFrame object from dictionary by columns or by index
- allowing dtype specification.
-
- Parameters
- ----------
- data : dict
- Of the form {field : array-like} or {field : dict}.
- orient : {'columns', 'index', 'tight'}, default 'columns'
- The "orientation" of the data. If the keys of the passed dict
- should be the columns of the resulting DataFrame, pass 'columns'
- (default). Otherwise if the keys should be rows, pass 'index'.
- If 'tight', assume a dict with keys ['index', 'columns', 'data',
- 'index_names', 'column_names'].
-
- .. versionadded:: 1.4.0
- 'tight' as an allowed value for the ``orient`` argument
-
- dtype : dtype, default None
- Data type to force after DataFrame construction, otherwise infer.
- columns : list, default None
- Column labels to use when ``orient='index'``. Raises a ValueError
- if used with ``orient='columns'`` or ``orient='tight'``.
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- DataFrame.from_records : DataFrame from structured ndarray, sequence
- of tuples or dicts, or DataFrame.
- DataFrame : DataFrame object creation using constructor.
- DataFrame.to_dict : Convert the DataFrame to a dictionary.
-
- Examples
- --------
- By default the keys of the dict become the DataFrame columns:
-
- >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
-
- Specify ``orient='index'`` to create the DataFrame using dictionary
- keys as rows:
-
- >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data, orient='index')
- 0 1 2 3
- row_1 3 2 1 0
- row_2 a b c d
-
- When using the 'index' orientation, the column names can be
- specified manually:
-
- >>> pd.DataFrame.from_dict(data, orient='index',
- ... columns=['A', 'B', 'C', 'D'])
- A B C D
- row_1 3 2 1 0
- row_2 a b c d
-
- Specify ``orient='tight'`` to create the DataFrame using a 'tight'
- format:
-
- >>> data = {'index': [('a', 'b'), ('a', 'c')],
- ... 'columns': [('x', 1), ('y', 2)],
- ... 'data': [[1, 3], [2, 4]],
- ... 'index_names': ['n1', 'n2'],
- ... 'column_names': ['z1', 'z2']}
- >>> pd.DataFrame.from_dict(data, orient='tight')
- z1 x y
- z2 1 2
- n1 n2
- a b 1 3
- c 2 4
- """
- index = None
- orient = orient.lower()
- if orient == "index":
- if len(data) > 0:
- # TODO speed up Series case
- if isinstance(list(data.values())[0], (Series, dict)):
- data = _from_nested_dict(data)
- else:
- index = list(data.keys())
- # error: Incompatible types in assignment (expression has type
- # "List[Any]", variable has type "Dict[Any, Any]")
- data = list(data.values()) # type: ignore[assignment]
- elif orient in ("columns", "tight"):
- if columns is not None:
- raise ValueError(f"cannot use columns parameter with orient='{orient}'")
- else: # pragma: no cover
- raise ValueError(
- f"Expected 'index', 'columns' or 'tight' for orient parameter. "
- f"Got '{orient}' instead"
- )
-
- if orient != "tight":
- return cls(data, index=index, columns=columns, dtype=dtype)
- else:
- realdata = data["data"]
-
- def create_index(indexlist, namelist):
- index: Index
- if len(namelist) > 1:
- index = MultiIndex.from_tuples(indexlist, names=namelist)
- else:
- index = Index(indexlist, name=namelist[0])
- return index
-
- index = create_index(data["index"], data["index_names"])
- columns = create_index(data["columns"], data["column_names"])
- return cls(realdata, index=index, columns=columns, dtype=dtype)
-
- def to_numpy(
- self,
- dtype: npt.DTypeLike | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- """
- Convert the DataFrame to a NumPy array.
-
- By default, the dtype of the returned array will be the common NumPy
- dtype of all types in the DataFrame. For example, if the dtypes are
- ``float16`` and ``float32``, the resulting dtype will be ``float32``.
- This may require copying data and coercing values, which may be
- expensive.
-
- Parameters
- ----------
- dtype : str or numpy.dtype, optional
- The dtype to pass to :meth:`numpy.asarray`.
- copy : bool, default False
- Whether to ensure that the returned value is not a view on
- another array. Note that ``copy=False`` does not *ensure* that
- ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
- a copy is made, even if not strictly necessary.
- na_value : Any, optional
- The value to use for missing values. The default value depends
- on `dtype` and the dtypes of the DataFrame columns.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- numpy.ndarray
-
- See Also
- --------
- Series.to_numpy : Similar method for Series.
-
- Examples
- --------
- >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
- array([[1, 3],
- [2, 4]])
-
- With heterogeneous data, the lowest common type will have to
- be used.
-
- >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
- >>> df.to_numpy()
- array([[1. , 3. ],
- [2. , 4.5]])
-
- For a mix of numeric and non-numeric types, the output array will
- have object dtype.
-
- >>> df['C'] = pd.date_range('2000', periods=2)
- >>> df.to_numpy()
- array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
- [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
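-
- The ``na_value`` argument substitutes missing values during the
- conversion; a small sketch (the exact output depends on the column
- dtypes involved):
-
- >>> pd.DataFrame({"A": [1.0, None]}).to_numpy(na_value=0) # doctest: +SKIP
- array([[1.],
- [0.]])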
- """
- if dtype is not None:
- dtype = np.dtype(dtype)
- result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
- if result.dtype is not dtype:
- result = np.array(result, dtype=dtype, copy=False)
-
- return result
-
- def _create_data_for_split_and_tight_to_dict(
- self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
- ) -> list:
- """
- Simple helper method to create the main output data for
- ``to_dict(orient="split")`` and ``to_dict(orient="tight")``.
- """
- if are_all_object_dtype_cols:
- data = [
- list(map(maybe_box_native, t))
- for t in self.itertuples(index=False, name=None)
- ]
- else:
- data = [list(t) for t in self.itertuples(index=False, name=None)]
- if object_dtype_indices:
- # If we have object_dtype_cols, apply maybe_box_native after list
- # comprehension for perf
- for row in data:
- for i in object_dtype_indices:
- row[i] = maybe_box_native(row[i])
- return data
-
- @overload
- def to_dict(
- self,
- orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
- into: type[dict] = ...,
- ) -> dict:
- ...
-
- @overload
- def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
- ...
-
- def to_dict(
- self,
- orient: Literal[
- "dict", "list", "series", "split", "tight", "records", "index"
- ] = "dict",
- into: type[dict] = dict,
- index: bool = True,
- ) -> dict | list[dict]:
- """
- Convert the DataFrame to a dictionary.
-
- The type of the key-value pairs can be customized with the parameters
- (see below).
-
- Parameters
- ----------
- orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
- Determines the type of the values of the dictionary.
-
- - 'dict' (default) : dict like {column -> {index -> value}}
- - 'list' : dict like {column -> [values]}
- - 'series' : dict like {column -> Series(values)}
- - 'split' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
- - 'tight' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
- 'index_names' -> [index.names], 'column_names' -> [column.names]}
- - 'records' : list like
- [{column -> value}, ... , {column -> value}]
- - 'index' : dict like {index -> {column -> value}}
-
- .. versionadded:: 1.4.0
- 'tight' as an allowed value for the ``orient`` argument
-
- into : class, default dict
- The collections.abc.Mapping subclass used for all Mappings
- in the return value. Can be the actual class or an empty
- instance of the mapping type you want. If you want a
- collections.defaultdict, you must pass it initialized.
-
- index : bool, default True
- Whether to include the index item (and index_names item if `orient`
- is 'tight') in the returned dictionary. Can only be ``False``
- when `orient` is 'split' or 'tight'.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- dict, list or collections.abc.Mapping
- Return a collections.abc.Mapping object representing the DataFrame.
- The resulting transformation depends on the `orient` parameter.
-
- See Also
- --------
- DataFrame.from_dict: Create a DataFrame from a dictionary.
- DataFrame.to_json: Convert a DataFrame to JSON format.
-
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2],
- ... 'col2': [0.5, 0.75]},
- ... index=['row1', 'row2'])
- >>> df
- col1 col2
- row1 1 0.50
- row2 2 0.75
- >>> df.to_dict()
- {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
-
- You can specify the return orientation.
-
- >>> df.to_dict('series')
- {'col1': row1 1
- row2 2
- Name: col1, dtype: int64,
- 'col2': row1 0.50
- row2 0.75
- Name: col2, dtype: float64}
-
- >>> df.to_dict('split')
- {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
- 'data': [[1, 0.5], [2, 0.75]]}
-
- >>> df.to_dict('records')
- [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
-
- >>> df.to_dict('index')
- {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
-
- >>> df.to_dict('tight')
- {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
- 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
-
- You can also specify the mapping type.
-
- >>> from collections import OrderedDict, defaultdict
- >>> df.to_dict(into=OrderedDict)
- OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
- ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
-
- If you want a `defaultdict`, you need to initialize it:
-
- >>> dd = defaultdict(list)
- >>> df.to_dict('records', into=dd)
- [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
- defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
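-
- With ``orient='split'`` or ``orient='tight'`` the index-related items
- can be dropped via ``index=False``; a small sketch:
-
- >>> df.to_dict('split', index=False) # doctest: +SKIP
- {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}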
- """
- from pandas.core.methods.to_dict import to_dict
-
- return to_dict(self, orient, into, index)
-
- def to_gbq(
- self,
- destination_table: str,
- project_id: str | None = None,
- chunksize: int | None = None,
- reauth: bool = False,
- if_exists: str = "fail",
- auth_local_webserver: bool = True,
- table_schema: list[dict[str, str]] | None = None,
- location: str | None = None,
- progress_bar: bool = True,
- credentials=None,
- ) -> None:
- """
- Write a DataFrame to a Google BigQuery table.
-
- This function requires the `pandas-gbq package
- <https://pandas-gbq.readthedocs.io>`__.
-
- See the `How to authenticate with Google BigQuery
- <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
- guide for authentication instructions.
-
- Parameters
- ----------
- destination_table : str
- Name of table to be written, in the form ``dataset.tablename``.
- project_id : str, optional
- Google BigQuery Account project ID. Optional when available from
- the environment.
- chunksize : int, optional
- Number of rows to be inserted in each chunk from the dataframe.
- Set to ``None`` to load the whole dataframe at once.
- reauth : bool, default False
- Force Google BigQuery to re-authenticate the user. This is useful
- if multiple accounts are used.
- if_exists : str, default 'fail'
- Behavior when the destination table exists. Value can be one of:
-
- ``'fail'``
- If table exists, raise pandas_gbq.gbq.TableCreationError.
- ``'replace'``
- If table exists, drop it, recreate it, and insert data.
- ``'append'``
- If table exists, insert data. Create the table if it does not exist.
- auth_local_webserver : bool, default True
- Use the `local webserver flow`_ instead of the `console flow`_
- when getting user credentials.
-
- .. _local webserver flow:
- https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
- .. _console flow:
- https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
-
- *New in version 0.2.0 of pandas-gbq*.
-
- .. versionchanged:: 1.5.0
- Default value is changed to ``True``. Google has deprecated the
- ``auth_local_webserver = False`` `"out of band" (copy-paste)
- flow
- <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
- table_schema : list of dicts, optional
- List of BigQuery table fields to which the DataFrame
- columns conform, e.g. ``[{'name': 'col1', 'type':
- 'STRING'},...]``. If schema is not provided, it will be
- generated according to dtypes of DataFrame columns. See
- BigQuery API documentation on available names of a field.
-
- *New in version 0.3.1 of pandas-gbq*.
- location : str, optional
- Location where the load job should run. See the `BigQuery locations
- documentation
- <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
- list of available locations. The location must match that of the
- target dataset.
-
- *New in version 0.5.0 of pandas-gbq*.
- progress_bar : bool, default True
- Use the library `tqdm` to show the progress bar for the upload,
- chunk by chunk.
-
- *New in version 0.5.0 of pandas-gbq*.
- credentials : google.auth.credentials.Credentials, optional
- Credentials for accessing Google APIs. Use this parameter to
- override default credentials, such as to use Compute Engine
- :class:`google.auth.compute_engine.Credentials` or Service
- Account :class:`google.oauth2.service_account.Credentials`
- directly.
-
- *New in version 0.8.0 of pandas-gbq*.
-
- See Also
- --------
- pandas_gbq.to_gbq : This function in the pandas-gbq library.
- read_gbq : Read a DataFrame from Google BigQuery.
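-
- Examples
- --------
- A minimal sketch; ``'my_dataset.my_table'`` and ``'my-project'`` are
- placeholder names, and the pandas-gbq package must be installed:
-
- >>> df = pd.DataFrame({'my_string': ['a', 'b'], 'my_int': [1, 2]})
- >>> df.to_gbq('my_dataset.my_table',
- ... project_id='my-project',
- ... if_exists='append') # doctest: +SKIP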
- """
- from pandas.io import gbq
-
- gbq.to_gbq(
- self,
- destination_table,
- project_id=project_id,
- chunksize=chunksize,
- reauth=reauth,
- if_exists=if_exists,
- auth_local_webserver=auth_local_webserver,
- table_schema=table_schema,
- location=location,
- progress_bar=progress_bar,
- credentials=credentials,
- )
-
- @classmethod
- def from_records(
- cls,
- data,
- index=None,
- exclude=None,
- columns=None,
- coerce_float: bool = False,
- nrows: int | None = None,
- ) -> DataFrame:
- """
- Convert structured or record ndarray to DataFrame.
-
- Creates a DataFrame object from a structured ndarray, sequence of
- tuples or dicts, or DataFrame.
-
- Parameters
- ----------
- data : structured ndarray, sequence of tuples or dicts, or DataFrame
- Structured input data.
- index : str, list of fields, array-like
- Field of array to use as the index, alternately a specific set of
- input labels to use.
- exclude : sequence, default None
- Columns or fields to exclude.
- columns : sequence, default None
- Column names to use. If the passed data do not have names
- associated with them, this argument provides names for the
- columns. Otherwise this argument indicates the order of the columns
- in the result (any names not found in the data will become all-NA
- columns).
- coerce_float : bool, default False
- Attempt to convert values of non-string, non-numeric objects (like
- decimal.Decimal) to floating point, useful for SQL result sets.
- nrows : int, default None
- Number of rows to read if data is an iterator.
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- DataFrame.from_dict : DataFrame from dict of array-like or dicts.
- DataFrame : DataFrame object creation using constructor.
-
- Examples
- --------
- Data can be provided as a structured ndarray:
-
- >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
- ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
- >>> pd.DataFrame.from_records(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
-
- Data can be provided as a list of dicts:
-
- >>> data = [{'col_1': 3, 'col_2': 'a'},
- ... {'col_1': 2, 'col_2': 'b'},
- ... {'col_1': 1, 'col_2': 'c'},
- ... {'col_1': 0, 'col_2': 'd'}]
- >>> pd.DataFrame.from_records(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
-
- Data can be provided as a list of tuples with corresponding columns:
-
- >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
- >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
- """
- if isinstance(data, DataFrame):
- if columns is not None:
- if is_scalar(columns):
- columns = [columns]
- data = data[columns]
- if index is not None:
- data = data.set_index(index)
- if exclude is not None:
- data = data.drop(columns=exclude)
- return data.copy(deep=False)
-
- result_index = None
-
- # Make a copy of the input columns so we can modify it
- if columns is not None:
- columns = ensure_index(columns)
-
- def maybe_reorder(
- arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
- ) -> tuple[list[ArrayLike], Index, Index | None]:
- """
- If our desired 'columns' do not match the data's pre-existing 'arr_columns',
- we re-order our arrays. This is like a pre-emptive (cheap) reindex.
- """
- if len(arrays):
- length = len(arrays[0])
- else:
- length = 0
-
- result_index = None
- if len(arrays) == 0 and index is None and length == 0:
- result_index = default_index(0)
-
- arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
- return arrays, arr_columns, result_index
-
- if is_iterator(data):
- if nrows == 0:
- return cls()
-
- try:
- first_row = next(data)
- except StopIteration:
- return cls(index=index, columns=columns)
-
- dtype = None
- if hasattr(first_row, "dtype") and first_row.dtype.names:
- dtype = first_row.dtype
-
- values = [first_row]
-
- if nrows is None:
- values += data
- else:
- values.extend(itertools.islice(data, nrows - 1))
-
- if dtype is not None:
- data = np.array(values, dtype=dtype)
- else:
- data = values
-
- if isinstance(data, dict):
- if columns is None:
- columns = arr_columns = ensure_index(sorted(data))
- arrays = [data[k] for k in columns]
- else:
- arrays = []
- arr_columns_list = []
- for k, v in data.items():
- if k in columns:
- arr_columns_list.append(k)
- arrays.append(v)
-
- arr_columns = Index(arr_columns_list)
- arrays, arr_columns, result_index = maybe_reorder(
- arrays, arr_columns, columns, index
- )
-
- elif isinstance(data, (np.ndarray, DataFrame)):
- arrays, columns = to_arrays(data, columns)
- arr_columns = columns
- else:
- arrays, arr_columns = to_arrays(data, columns)
- if coerce_float:
- for i, arr in enumerate(arrays):
- if arr.dtype == object:
- # error: Argument 1 to "maybe_convert_objects" has
- # incompatible type "Union[ExtensionArray, ndarray]";
- # expected "ndarray"
- arrays[i] = lib.maybe_convert_objects(
- arr, # type: ignore[arg-type]
- try_float=True,
- )
-
- arr_columns = ensure_index(arr_columns)
- if columns is None:
- columns = arr_columns
- else:
- arrays, arr_columns, result_index = maybe_reorder(
- arrays, arr_columns, columns, index
- )
-
- if exclude is None:
- exclude = set()
- else:
- exclude = set(exclude)
-
- if index is not None:
- if isinstance(index, str) or not hasattr(index, "__iter__"):
- i = columns.get_loc(index)
- exclude.add(index)
- if len(arrays) > 0:
- result_index = Index(arrays[i], name=index)
- else:
- result_index = Index([], name=index)
- else:
- try:
- index_data = [arrays[arr_columns.get_loc(field)] for field in index]
- except (KeyError, TypeError):
- # raised by get_loc, see GH#29258
- result_index = index
- else:
- result_index = ensure_index_from_sequences(index_data, names=index)
- exclude.update(index)
-
- if any(exclude):
- arr_exclude = [x for x in exclude if x in arr_columns]
- to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
- arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
-
- columns = columns.drop(exclude)
-
- manager = get_option("mode.data_manager")
- mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
-
- return cls(mgr)
-
- def to_records(
- self, index: bool = True, column_dtypes=None, index_dtypes=None
- ) -> np.recarray:
- """
- Convert DataFrame to a NumPy record array.
-
- Index will be included as the first field of the record array if
- requested.
-
- Parameters
- ----------
- index : bool, default True
- Include index in resulting record array, stored in 'index'
- field or using the index label, if set.
- column_dtypes : str, type, dict, default None
- If a string or type, the data type to store all columns. If
- a dictionary, a mapping of column names and indices (zero-indexed)
- to specific data types.
- index_dtypes : str, type, dict, default None
- If a string or type, the data type to store all index levels. If
- a dictionary, a mapping of index level names and indices
- (zero-indexed) to specific data types.
-
- This mapping is applied only if `index=True`.
-
- Returns
- -------
- numpy.recarray
- NumPy ndarray with the DataFrame labels as fields and each row
- of the DataFrame as entries.
-
- See Also
- --------
- DataFrame.from_records: Convert structured or record ndarray
- to DataFrame.
- numpy.recarray: An ndarray that allows field access using
- attributes, analogous to typed columns in a
- spreadsheet.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
- ... index=['a', 'b'])
- >>> df
- A B
- a 1 0.50
- b 2 0.75
- >>> df.to_records()
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
-
- If the DataFrame index has no label then the recarray field name
- is set to 'index'. If the index has a label then this is used as the
- field name:
-
- >>> df.index = df.index.rename("I")
- >>> df.to_records()
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
-
- The index can be excluded from the record array:
-
- >>> df.to_records(index=False)
- rec.array([(1, 0.5 ), (2, 0.75)],
- dtype=[('A', '<i8'), ('B', '<f8')])
-
- Data types can be specified for the columns:
-
- >>> df.to_records(column_dtypes={"A": "int32"})
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
-
- As well as for the index:
-
- >>> df.to_records(index_dtypes="<S2")
- rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
- dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
-
- >>> index_dtypes = f"<S{df.index.str.len().max()}"
- >>> df.to_records(index_dtypes=index_dtypes)
- rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
- dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
- """
- if index:
- ix_vals = [
- np.asarray(self.index.get_level_values(i))
- for i in range(self.index.nlevels)
- ]
-
- arrays = ix_vals + [
- np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
- ]
-
- index_names = list(self.index.names)
-
- if isinstance(self.index, MultiIndex):
- index_names = com.fill_missing_names(index_names)
- elif index_names[0] is None:
- index_names = ["index"]
-
- names = [str(name) for name in itertools.chain(index_names, self.columns)]
- else:
- arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
- names = [str(c) for c in self.columns]
- index_names = []
-
- index_len = len(index_names)
- formats = []
-
- for i, v in enumerate(arrays):
- index_int = i
-
- # When the names and arrays are collected, we
- # first collect those in the DataFrame's index,
- # followed by those in its columns.
- #
- # Thus, the total length of the array is:
- # len(index_names) + len(DataFrame.columns).
- #
- # This check allows us to see whether we are
- # handling a name / array in the index or column.
- if index_int < index_len:
- dtype_mapping = index_dtypes
- name = index_names[index_int]
- else:
- index_int -= index_len
- dtype_mapping = column_dtypes
- name = self.columns[index_int]
-
- # If we have a dictionary, we get the data type
- # associated with the index or column (which can
- # be denoted by its name in the DataFrame or its
- # position in the DataFrame's array of indices or
- # columns, whichever is applicable).
- if is_dict_like(dtype_mapping):
- if name in dtype_mapping:
- dtype_mapping = dtype_mapping[name]
- elif index_int in dtype_mapping:
- dtype_mapping = dtype_mapping[index_int]
- else:
- dtype_mapping = None
-
- # If no mapping can be found, use the array's
- # dtype attribute for formatting.
- #
- # A valid dtype must either be a type or
- # string naming a type.
- if dtype_mapping is None:
- formats.append(v.dtype)
- elif isinstance(dtype_mapping, (type, np.dtype, str)):
- # error: Argument 1 to "append" of "list" has incompatible
- # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
- formats.append(dtype_mapping) # type: ignore[arg-type]
- else:
- element = "row" if i < index_len else "column"
- msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
- raise ValueError(msg)
-
- return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
-
- @classmethod
- def _from_arrays(
- cls,
- arrays,
- columns,
- index,
- dtype: Dtype | None = None,
- verify_integrity: bool = True,
- ) -> DataFrame:
- """
- Create DataFrame from a list of arrays corresponding to the columns.
-
- Parameters
- ----------
- arrays : list-like of arrays
- Each array in the list corresponds to one column, in order.
- columns : list-like, Index
- The column names for the resulting DataFrame.
- index : list-like, Index
- The row labels for the resulting DataFrame.
- dtype : dtype, optional
- Optional dtype to enforce for all arrays.
- verify_integrity : bool, default True
- Validate and homogenize all input. If set to False, it is assumed
- that all elements of `arrays` are actual arrays as they will be
- stored in a block (numpy ndarray or ExtensionArray), have the same
- length as and are aligned with the index, and that `columns` and
- `index` are already Index objects.
-
- Returns
- -------
- DataFrame
- """
- if dtype is not None:
- dtype = pandas_dtype(dtype)
-
- manager = get_option("mode.data_manager")
- columns = ensure_index(columns)
- if len(columns) != len(arrays):
- raise ValueError("len(columns) must match len(arrays)")
- mgr = arrays_to_mgr(
- arrays,
- columns,
- index,
- dtype=dtype,
- verify_integrity=verify_integrity,
- typ=manager,
- )
- return cls(mgr)
-
- @doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path",
- )
- def to_stata(
- self,
- path: FilePath | WriteBuffer[bytes],
- *,
- convert_dates: dict[Hashable, str] | None = None,
- write_index: bool = True,
- byteorder: str | None = None,
- time_stamp: datetime.datetime | None = None,
- data_label: str | None = None,
- variable_labels: dict[Hashable, str] | None = None,
- version: int | None = 114,
- convert_strl: Sequence[Hashable] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- value_labels: dict[Hashable, dict[float, str]] | None = None,
- ) -> None:
- """
- Export DataFrame object to Stata dta format.
-
- Writes the DataFrame to a Stata dataset file.
- "dta" files contain a Stata dataset.
-
- Parameters
- ----------
- path : str, path object, or buffer
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function.
-
- convert_dates : dict
- Dictionary mapping columns containing datetime types to stata
- internal format to use when writing the dates. Options are 'tc',
- 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
- or a name. Datetime columns that do not have a conversion type
- specified will be converted to 'tc'. Raises NotImplementedError if
- a datetime column has timezone information.
- write_index : bool
- Write the index to Stata dataset.
- byteorder : str
- Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
- time_stamp : datetime
- A datetime to use as file creation date. Default is the current
- time.
- data_label : str, optional
- A label for the data set. Must be 80 characters or smaller.
- variable_labels : dict
- Dictionary containing columns as keys and variable labels as
- values. Each label must be 80 characters or smaller.
- version : {{114, 117, 118, 119, None}}, default 114
- Version to use in the output dta file. Set to None to let pandas
- decide between 118 or 119 formats depending on the number of
- columns in the frame. Version 114 can be read by Stata 10 and
- later. Version 117 can be read by Stata 13 or later. Version 118
- is supported in Stata 14 and later. Version 119 is supported in
- Stata 15 and later. Version 114 limits string variables to 244
- characters or fewer while versions 117 and later allow strings
- with lengths up to 2,000,000 characters. Versions 118 and 119
- support Unicode characters, and version 119 supports more than
- 32,767 variables.
-
- Version 119 should usually only be used when the number of
- variables exceeds the capacity of dta format 118. Exporting
- smaller datasets in format 119 may have unintended consequences,
- and, as of November 2020, Stata SE cannot read version 119 files.
-
- convert_strl : list, optional
- List of string column names to convert to the Stata StrL
- format. Only available if version is 117. Storing strings in the
- StrL format can produce smaller dta files if strings have more than
- 8 characters and values are repeated.
- {compression_options}
-
- .. versionadded:: 1.1.0
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- value_labels : dict of dicts
- Dictionary containing columns as keys and dictionaries of column value
- to labels as values. Labels for a single variable must be 32,000
- characters or smaller.
-
- .. versionadded:: 1.4.0
-
- Raises
- ------
- NotImplementedError
- * If datetimes contain timezone information
- * Column dtype is not representable in Stata
- ValueError
- * Columns listed in convert_dates are neither datetime64[ns]
- nor datetime.datetime
- * Column listed in convert_dates is not in DataFrame
- * Categorical label contains more than 32,000 characters
-
- See Also
- --------
- read_stata : Import Stata data files.
- io.stata.StataWriter : Low-level writer for Stata data files.
- io.stata.StataWriter117 : Low-level writer for version 117 files.
-
- Examples
- --------
- >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
- ... 'parrot'],
- ... 'speed': [350, 18, 361, 15]}})
- >>> df.to_stata('animals.dta') # doctest: +SKIP
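-
- The written file can then be read back with :func:`read_stata`
- (a sketch, assuming the file created above exists):
-
- >>> pd.read_stata('animals.dta') # doctest: +SKIP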
- """
- if version not in (114, 117, 118, 119, None):
- raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
- if version == 114:
- if convert_strl is not None:
- raise ValueError("strl is not supported in format 114")
- from pandas.io.stata import StataWriter as statawriter
- elif version == 117:
- # Incompatible import of "statawriter" (imported name has type
- # "Type[StataWriter117]", local name has type "Type[StataWriter]")
- from pandas.io.stata import ( # type: ignore[assignment]
- StataWriter117 as statawriter,
- )
- else: # versions 118 and 119
- # Incompatible import of "statawriter" (imported name has type
- # "Type[StataWriter117]", local name has type "Type[StataWriter]")
- from pandas.io.stata import ( # type: ignore[assignment]
- StataWriterUTF8 as statawriter,
- )
-
- kwargs: dict[str, Any] = {}
- if version is None or version >= 117:
- # strl conversion is only supported >= 117
- kwargs["convert_strl"] = convert_strl
- if version is None or version >= 118:
- # Specifying the version is only supported for UTF8 (118 or 119)
- kwargs["version"] = version
-
- writer = statawriter(
- path,
- self,
- convert_dates=convert_dates,
- byteorder=byteorder,
- time_stamp=time_stamp,
- data_label=data_label,
- write_index=write_index,
- variable_labels=variable_labels,
- compression=compression,
- storage_options=storage_options,
- value_labels=value_labels,
- **kwargs,
- )
- writer.write_file()
-
- def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
- """
- Write a DataFrame to the binary Feather format.
-
- Parameters
- ----------
- path : str, path object, file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function. If a string or a path,
- it will be used as Root Directory path when writing a partitioned dataset.
- **kwargs :
- Additional keywords passed to :func:`pyarrow.feather.write_feather`.
- Starting with pyarrow 0.17, this includes the `compression`,
- `compression_level`, `chunksize` and `version` keywords.
-
- .. versionadded:: 1.1.0
-
- Notes
- -----
- This function writes the dataframe as a `feather file
- <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
- index. For saving the DataFrame with your custom index use a method that
- supports custom indices e.g. `to_parquet`.
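-
- Examples
- --------
- A minimal sketch; ``"out.feather"`` is a placeholder path and pyarrow
- must be installed:
-
- >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
- >>> df.to_feather("out.feather") # doctest: +SKIP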
- """
- from pandas.io.feather_format import to_feather
-
- to_feather(self, path, **kwargs)
-
- @doc(
- Series.to_markdown,
- klass=_shared_doc_kwargs["klass"],
- storage_options=_shared_docs["storage_options"],
- examples="""Examples
- --------
- >>> df = pd.DataFrame(
- ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
- ... )
- >>> print(df.to_markdown())
- | | animal_1 | animal_2 |
- |---:|:-----------|:-----------|
- | 0 | elk | dog |
- | 1 | pig | quetzal |
-
- Output markdown with a tabulate option.
-
- >>> print(df.to_markdown(tablefmt="grid"))
- +----+------------+------------+
- | | animal_1 | animal_2 |
- +====+============+============+
- | 0 | elk | dog |
- +----+------------+------------+
- | 1 | pig | quetzal |
- +----+------------+------------+""",
- )
- def to_markdown(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- mode: str = "wt",
- index: bool = True,
- storage_options: StorageOptions = None,
- **kwargs,
- ) -> str | None:
- if "showindex" in kwargs:
- raise ValueError("Pass 'index' instead of 'showindex'")
-
- kwargs.setdefault("headers", "keys")
- kwargs.setdefault("tablefmt", "pipe")
- kwargs.setdefault("showindex", index)
- tabulate = import_optional_dependency("tabulate")
- result = tabulate.tabulate(self, **kwargs)
- if buf is None:
- return result
-
- with get_handle(buf, mode, storage_options=storage_options) as handles:
- handles.handle.write(result)
- return None
-
- @overload
- def to_parquet(
- self,
- path: None = ...,
- engine: str = ...,
- compression: str | None = ...,
- index: bool | None = ...,
- partition_cols: list[str] | None = ...,
- storage_options: StorageOptions = ...,
- **kwargs,
- ) -> bytes:
- ...
-
- @overload
- def to_parquet(
- self,
- path: FilePath | WriteBuffer[bytes],
- engine: str = ...,
- compression: str | None = ...,
- index: bool | None = ...,
- partition_cols: list[str] | None = ...,
- storage_options: StorageOptions = ...,
- **kwargs,
- ) -> None:
- ...
-
- @doc(storage_options=_shared_docs["storage_options"])
- def to_parquet(
- self,
- path: FilePath | WriteBuffer[bytes] | None = None,
- engine: str = "auto",
- compression: str | None = "snappy",
- index: bool | None = None,
- partition_cols: list[str] | None = None,
- storage_options: StorageOptions = None,
- **kwargs,
- ) -> bytes | None:
- """
- Write a DataFrame to the binary parquet format.
-
- This function writes the dataframe as a `parquet file
- <https://parquet.apache.org/>`_. You can choose different parquet
- backends, and have the option of compression. See
- :ref:`the user guide <io.parquet>` for more details.
-
- Parameters
- ----------
- path : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function. If None, the result is
- returned as bytes. If a string or path, it will be used as Root Directory
- path when writing a partitioned dataset.
-
- .. versionchanged:: 1.2.0
-
- Previously this was "fname"
-
- engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
- Parquet library to use. If 'auto', then the option
- ``io.parquet.engine`` is used. The default ``io.parquet.engine``
- behavior is to try 'pyarrow', falling back to 'fastparquet' if
- 'pyarrow' is unavailable.
- compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
- Name of the compression to use. Use ``None`` for no compression.
- index : bool, default None
- If ``True``, include the dataframe's index(es) in the file output.
- If ``False``, they will not be written to the file.
- If ``None``, similar to ``True`` the dataframe's index(es)
- will be saved. However, instead of being saved as values,
- the RangeIndex will be stored as a range in the metadata so it
- doesn't require much space and is faster. Other indexes will
- be included as columns in the file output.
- partition_cols : list, optional, default None
- Column names by which to partition the dataset.
- Columns are partitioned in the order they are given.
- Must be None if path is not a string.
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- **kwargs
- Additional arguments passed to the parquet library. See
- :ref:`pandas io <io.parquet>` for more details.
-
- Returns
- -------
- bytes if no path argument is provided else None
-
- See Also
- --------
- read_parquet : Read a parquet file.
- DataFrame.to_orc : Write an orc file.
- DataFrame.to_csv : Write a csv file.
- DataFrame.to_sql : Write to a sql table.
- DataFrame.to_hdf : Write to hdf.
-
- Notes
- -----
- This function requires either the `fastparquet
- <https://pypi.org/project/fastparquet>`_ or `pyarrow
- <https://arrow.apache.org/docs/python/>`_ library.
-
- Examples
- --------
- >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
- >>> df.to_parquet('df.parquet.gzip',
- ... compression='gzip') # doctest: +SKIP
- >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
- col1 col2
- 0 1 3
- 1 2 4
-
- If you want to get a buffer to the parquet content you can use a io.BytesIO
- object, as long as you don't use partition_cols, which creates multiple files.
-
- >>> import io
- >>> f = io.BytesIO()
- >>> df.to_parquet(f)
- >>> f.seek(0)
- 0
- >>> content = f.read()
- """
- from pandas.io.parquet import to_parquet
-
- return to_parquet(
- self,
- path,
- engine,
- compression=compression,
- index=index,
- partition_cols=partition_cols,
- storage_options=storage_options,
- **kwargs,
- )
-
- def to_orc(
- self,
- path: FilePath | WriteBuffer[bytes] | None = None,
- *,
- engine: Literal["pyarrow"] = "pyarrow",
- index: bool | None = None,
- engine_kwargs: dict[str, Any] | None = None,
- ) -> bytes | None:
- """
- Write a DataFrame to the ORC format.
-
- .. versionadded:: 1.5.0
-
- Parameters
- ----------
- path : str, file-like object or None, default None
- If a string, it will be used as Root Directory path
- when writing a partitioned dataset. By file-like object,
- we refer to objects with a write() method, such as a file handle
- (e.g. via builtin open function). If path is None,
- a bytes object is returned.
- engine : str, default 'pyarrow'
- ORC library to use. Pyarrow must be >= 7.0.0.
- index : bool, optional
- If ``True``, include the dataframe's index(es) in the file output.
- If ``False``, they will not be written to the file.
- If ``None``, similar to ``True`` the dataframe's index(es)
- will be saved. However, instead of being saved as values,
- the RangeIndex will be stored as a range in the metadata so it
- doesn't require much space and is faster. Other indexes will
- be included as columns in the file output.
- engine_kwargs : dict[str, Any] or None, default None
- Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
-
- Returns
- -------
- bytes if no path argument is provided else None
-
- Raises
- ------
- NotImplementedError
- Dtype of one or more columns is category, unsigned integers, interval,
- period or sparse.
- ValueError
- engine is not pyarrow.
-
- See Also
- --------
- read_orc : Read an ORC file.
- DataFrame.to_parquet : Write a parquet file.
- DataFrame.to_csv : Write a csv file.
- DataFrame.to_sql : Write to a sql table.
- DataFrame.to_hdf : Write to hdf.
-
- Notes
- -----
- * Before using this function you should read the :ref:`user guide about
- ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
- * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
- library.
- * For supported dtypes please refer to `supported ORC features in Arrow
- <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
- * Currently timezones in datetime columns are not preserved when a
- dataframe is converted into ORC files.
-
- Examples
- --------
- >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
- >>> df.to_orc('df.orc') # doctest: +SKIP
- >>> pd.read_orc('df.orc') # doctest: +SKIP
- col1 col2
- 0 1 4
- 1 2 3
-
- If you want to get a buffer to the ORC content you can write it to io.BytesIO:
-
- >>> import io
- >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
- >>> b.seek(0) # doctest: +SKIP
- 0
- >>> content = b.read() # doctest: +SKIP
- """
- from pandas.io.orc import to_orc
-
- return to_orc(
- self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
- )
-
- @overload
- def to_html(
- self,
- buf: FilePath | WriteBuffer[str],
- columns: Sequence[Level] | None = ...,
- col_space: ColspaceArgType | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: FormattersType | None = ...,
- float_format: FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool | str = ...,
- decimal: str = ...,
- bold_rows: bool = ...,
- classes: str | list | tuple | None = ...,
- escape: bool = ...,
- notebook: bool = ...,
- border: int | bool | None = ...,
- table_id: str | None = ...,
- render_links: bool = ...,
- encoding: str | None = ...,
- ) -> None:
- ...
-
- @overload
- def to_html(
- self,
- buf: None = ...,
- columns: Sequence[Level] | None = ...,
- col_space: ColspaceArgType | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: FormattersType | None = ...,
- float_format: FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool | str = ...,
- decimal: str = ...,
- bold_rows: bool = ...,
- classes: str | list | tuple | None = ...,
- escape: bool = ...,
- notebook: bool = ...,
- border: int | bool | None = ...,
- table_id: str | None = ...,
- render_links: bool = ...,
- encoding: str | None = ...,
- ) -> str:
- ...
-
- @Substitution(
- header_type="bool",
- header="Whether to print column labels, default True",
- col_space_type="str or int, list or dict of int or str",
- col_space="The minimum width of each column in CSS length "
- "units. An int is assumed to be px units.",
- )
- @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
- def to_html(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- columns: Sequence[Level] | None = None,
- col_space: ColspaceArgType | None = None,
- header: bool | Sequence[str] = True,
- index: bool = True,
- na_rep: str = "NaN",
- formatters: FormattersType | None = None,
- float_format: FloatFormatType | None = None,
- sparsify: bool | None = None,
- index_names: bool = True,
- justify: str | None = None,
- max_rows: int | None = None,
- max_cols: int | None = None,
- show_dimensions: bool | str = False,
- decimal: str = ".",
- bold_rows: bool = True,
- classes: str | list | tuple | None = None,
- escape: bool = True,
- notebook: bool = False,
- border: int | bool | None = None,
- table_id: str | None = None,
- render_links: bool = False,
- encoding: str | None = None,
- ) -> str | None:
- """
- Render a DataFrame as an HTML table.
- %(shared_params)s
- bold_rows : bool, default True
- Make the row labels bold in the output.
- classes : str or list or tuple, default None
- CSS class(es) to apply to the resulting html table.
- escape : bool, default True
- Convert the characters <, >, and & to HTML-safe sequences.
- notebook : {True, False}, default False
- Whether the generated HTML is for IPython Notebook.
- border : int
- A ``border=border`` attribute is included in the opening
- `<table>` tag. Default ``pd.options.display.html.border``.
- table_id : str, optional
- A css id is included in the opening `<table>` tag if specified.
- render_links : bool, default False
- Convert URLs to HTML links.
- encoding : str, default "utf-8"
- Set character encoding.
-
- .. versionadded:: 1.0
- %(returns)s
- See Also
- --------
- to_string : Convert DataFrame to a string.
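-
- Examples
- --------
- A minimal sketch; the emitted markup is abbreviated here:
-
- >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
- >>> print(df.to_html()) # doctest: +SKIP
- <table border="1" class="dataframe">
- ...
- </table>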
- """
- if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
- raise ValueError("Invalid value for justify parameter")
-
- formatter = fmt.DataFrameFormatter(
- self,
- columns=columns,
- col_space=col_space,
- na_rep=na_rep,
- header=header,
- index=index,
- formatters=formatters,
- float_format=float_format,
- bold_rows=bold_rows,
- sparsify=sparsify,
- justify=justify,
- index_names=index_names,
- escape=escape,
- decimal=decimal,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- )
- # TODO: a generic formatter would be in DataFrameFormatter
- return fmt.DataFrameRenderer(formatter).to_html(
- buf=buf,
- classes=classes,
- notebook=notebook,
- border=border,
- encoding=encoding,
- table_id=table_id,
- render_links=render_links,
- )
-
- @doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path_or_buffer",
- )
- def to_xml(
- self,
- path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
- index: bool = True,
- root_name: str | None = "data",
- row_name: str | None = "row",
- na_rep: str | None = None,
- attr_cols: list[str] | None = None,
- elem_cols: list[str] | None = None,
- namespaces: dict[str | None, str] | None = None,
- prefix: str | None = None,
- encoding: str = "utf-8",
- xml_declaration: bool | None = True,
- pretty_print: bool | None = True,
- parser: str | None = "lxml",
- stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- ) -> str | None:
- """
- Render a DataFrame to an XML document.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- path_or_buffer : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a ``write()`` function. If None, the result is returned
- as a string.
- index : bool, default True
- Whether to include index in XML document.
- root_name : str, default 'data'
- The name of root element in XML document.
- row_name : str, default 'row'
- The name of row element in XML document.
- na_rep : str, optional
- Missing data representation.
- attr_cols : list-like, optional
- List of columns to write as attributes in row element.
- Hierarchical columns will be flattened with underscore
- delimiting the different levels.
- elem_cols : list-like, optional
- List of columns to write as children in row element. By default,
- all columns output as children of row element. Hierarchical
- columns will be flattened with underscore delimiting the
- different levels.
- namespaces : dict, optional
- All namespaces to be defined in the root element. Keys of the dict
- should be prefix names and values the corresponding URIs.
- Default namespaces should be given an empty string key. For
- example, ::
-
- namespaces = {{"": "https://example.com"}}
-
- prefix : str, optional
- Namespace prefix to be used for every element and/or attribute
- in document. This should be one of the keys in ``namespaces``
- dict.
- encoding : str, default 'utf-8'
- Encoding of the resulting document.
- xml_declaration : bool, default True
- Whether to include the XML declaration at start of document.
- pretty_print : bool, default True
- Whether output should be pretty printed with indentation and
- line breaks.
- parser : {{'lxml','etree'}}, default 'lxml'
- Parser module to use for building of tree. Only 'lxml' and
- 'etree' are supported. With 'lxml', the ability to use XSLT
- stylesheet is supported.
- stylesheet : str, path object or file-like object, optional
- A URL, file-like object, or a raw string containing an XSLT
- script used to transform the raw XML output. Script should use
- layout of elements and attributes from original output. This
- argument requires ``lxml`` to be installed. Only XSLT 1.0
- scripts, and not later versions, are currently supported.
- {compression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- {storage_options}
-
- Returns
- -------
- None or str
- If ``io`` is None, returns the resulting XML format as a
- string. Otherwise returns None.
-
- See Also
- --------
- to_json : Convert the pandas object to a JSON string.
- to_html : Convert DataFrame to HTML.
-
- Examples
- --------
- >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
- ... 'degrees': [360, 360, 180],
- ... 'sides': [4, np.nan, 3]}})
-
- >>> df.to_xml() # doctest: +SKIP
- <?xml version='1.0' encoding='utf-8'?>
- <data>
- <row>
- <index>0</index>
- <shape>square</shape>
- <degrees>360</degrees>
- <sides>4.0</sides>
- </row>
- <row>
- <index>1</index>
- <shape>circle</shape>
- <degrees>360</degrees>
- <sides/>
- </row>
- <row>
- <index>2</index>
- <shape>triangle</shape>
- <degrees>180</degrees>
- <sides>3.0</sides>
- </row>
- </data>
-
- >>> df.to_xml(attr_cols=[
- ... 'index', 'shape', 'degrees', 'sides'
- ... ]) # doctest: +SKIP
- <?xml version='1.0' encoding='utf-8'?>
- <data>
- <row index="0" shape="square" degrees="360" sides="4.0"/>
- <row index="1" shape="circle" degrees="360"/>
- <row index="2" shape="triangle" degrees="180" sides="3.0"/>
- </data>
-
- >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
- ... prefix="doc") # doctest: +SKIP
- <?xml version='1.0' encoding='utf-8'?>
- <doc:data xmlns:doc="https://example.com">
- <doc:row>
- <doc:index>0</doc:index>
- <doc:shape>square</doc:shape>
- <doc:degrees>360</doc:degrees>
- <doc:sides>4.0</doc:sides>
- </doc:row>
- <doc:row>
- <doc:index>1</doc:index>
- <doc:shape>circle</doc:shape>
- <doc:degrees>360</doc:degrees>
- <doc:sides/>
- </doc:row>
- <doc:row>
- <doc:index>2</doc:index>
- <doc:shape>triangle</doc:shape>
- <doc:degrees>180</doc:degrees>
- <doc:sides>3.0</doc:sides>
- </doc:row>
- </doc:data>
- """
-
- from pandas.io.formats.xml import (
- EtreeXMLFormatter,
- LxmlXMLFormatter,
- )
-
- lxml = import_optional_dependency("lxml.etree", errors="ignore")
-
- TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]
-
- if parser == "lxml":
- if lxml is not None:
- TreeBuilder = LxmlXMLFormatter
- else:
- raise ImportError(
- "lxml not found, please install or use the etree parser."
- )
-
- elif parser == "etree":
- TreeBuilder = EtreeXMLFormatter
-
- else:
- raise ValueError("Values for parser can only be lxml or etree.")
-
- xml_formatter = TreeBuilder(
- self,
- path_or_buffer=path_or_buffer,
- index=index,
- root_name=root_name,
- row_name=row_name,
- na_rep=na_rep,
- attr_cols=attr_cols,
- elem_cols=elem_cols,
- namespaces=namespaces,
- prefix=prefix,
- encoding=encoding,
- xml_declaration=xml_declaration,
- pretty_print=pretty_print,
- stylesheet=stylesheet,
- compression=compression,
- storage_options=storage_options,
- )
-
- return xml_formatter.write_output()
-
- # ----------------------------------------------------------------------
- @doc(INFO_DOCSTRING, **frame_sub_kwargs)
- def info(
- self,
- verbose: bool | None = None,
- buf: WriteBuffer[str] | None = None,
- max_cols: int | None = None,
- memory_usage: bool | str | None = None,
- show_counts: bool | None = None,
- ) -> None:
- info = DataFrameInfo(
- data=self,
- memory_usage=memory_usage,
- )
- info.render(
- buf=buf,
- max_cols=max_cols,
- verbose=verbose,
- show_counts=show_counts,
- )
-
- def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
- """
- Return the memory usage of each column in bytes.
-
- The memory usage can optionally include the contribution of
- the index and elements of `object` dtype.
-
- This value is displayed in `DataFrame.info` by default. This can be
- suppressed by setting ``pandas.options.display.memory_usage`` to False.
-
- Parameters
- ----------
- index : bool, default True
- Specifies whether to include the memory usage of the DataFrame's
- index in the returned Series. If ``index=True``, the memory usage of
- the index is the first item in the output.
- deep : bool, default False
- If True, introspect the data deeply by interrogating
- `object` dtypes for system-level memory consumption, and include
- it in the returned values.
-
- Returns
- -------
- Series
- A Series whose index is the original column names and whose values
- are the memory usage of each column in bytes.
-
- See Also
- --------
- numpy.ndarray.nbytes : Total bytes consumed by the elements of an
- ndarray.
- Series.memory_usage : Bytes consumed by a Series.
- Categorical : Memory-efficient array for string values with
- many repeated values.
- DataFrame.info : Concise summary of a DataFrame.
-
- Notes
- -----
- See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
- details.
-
- Examples
- --------
- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
- >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
- ... for t in dtypes])
- >>> df = pd.DataFrame(data)
- >>> df.head()
- int64 float64 complex128 object bool
- 0 1 1.0 1.0+0.0j 1 True
- 1 1 1.0 1.0+0.0j 1 True
- 2 1 1.0 1.0+0.0j 1 True
- 3 1 1.0 1.0+0.0j 1 True
- 4 1 1.0 1.0+0.0j 1 True
-
- >>> df.memory_usage()
- Index 128
- int64 40000
- float64 40000
- complex128 80000
- object 40000
- bool 5000
- dtype: int64
-
- >>> df.memory_usage(index=False)
- int64 40000
- float64 40000
- complex128 80000
- object 40000
- bool 5000
- dtype: int64
-
- The memory footprint of `object` dtype columns is ignored by default;
- pass ``deep=True`` to include it:
-
- >>> df.memory_usage(deep=True)
- Index 128
- int64 40000
- float64 40000
- complex128 80000
- object 180000
- bool 5000
- dtype: int64
-
- Use a Categorical for efficient storage of an object-dtype column with
- many repeated values.
-
- >>> df['object'].astype('category').memory_usage(deep=True)
- 5244
- """
- result = self._constructor_sliced(
- [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
- index=self.columns,
- dtype=np.intp,
- )
- if index:
- index_memory_usage = self._constructor_sliced(
- self.index.memory_usage(deep=deep), index=["Index"]
- )
- result = index_memory_usage._append(result)
- return result
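- # A minimal follow-up sketch, assuming any DataFrame ``df`` (as in the
- # docstring examples above): summing the returned Series gives the total
- # footprint in bytes.
- #
- #     total_bytes = df.memory_usage(deep=True).sum()   # index + all columns
- #     per_row = total_bytes / max(len(df), 1)          # rough bytes per row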
-
- def transpose(self, *args, copy: bool = False) -> DataFrame:
- """
- Transpose index and columns.
-
- Reflect the DataFrame over its main diagonal by writing rows as columns
- and vice-versa. The property :attr:`.T` is an accessor to the method
- :meth:`transpose`.
-
- Parameters
- ----------
- *args : tuple, optional
- Accepted for compatibility with NumPy.
- copy : bool, default False
- Whether to copy the data after transposing, even for DataFrames
- with a single dtype.
-
- Note that a copy is always required for mixed dtype DataFrames,
- or for DataFrames with any extension types.
-
- Returns
- -------
- DataFrame
- The transposed DataFrame.
-
- See Also
- --------
- numpy.transpose : Permute the dimensions of a given array.
-
- Notes
- -----
- Transposing a DataFrame with mixed dtypes will result in a homogeneous
- DataFrame with the `object` dtype. In such a case, a copy of the data
- is always made.
-
- Examples
- --------
- **Square DataFrame with homogeneous dtype**
-
- >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df1 = pd.DataFrame(data=d1)
- >>> df1
- col1 col2
- 0 1 3
- 1 2 4
-
- >>> df1_transposed = df1.T # or df1.transpose()
- >>> df1_transposed
- 0 1
- col1 1 2
- col2 3 4
-
- When the dtype is homogeneous in the original DataFrame, we get a
- transposed DataFrame with the same dtype:
-
- >>> df1.dtypes
- col1 int64
- col2 int64
- dtype: object
- >>> df1_transposed.dtypes
- 0 int64
- 1 int64
- dtype: object
-
- **Non-square DataFrame with mixed dtypes**
-
- >>> d2 = {'name': ['Alice', 'Bob'],
- ... 'score': [9.5, 8],
- ... 'employed': [False, True],
- ... 'kids': [0, 0]}
- >>> df2 = pd.DataFrame(data=d2)
- >>> df2
- name score employed kids
- 0 Alice 9.5 False 0
- 1 Bob 8.0 True 0
-
- >>> df2_transposed = df2.T # or df2.transpose()
- >>> df2_transposed
- 0 1
- name Alice Bob
- score 9.5 8.0
- employed False True
- kids 0 0
-
- When the DataFrame has mixed dtypes, we get a transposed DataFrame with
- the `object` dtype:
-
- >>> df2.dtypes
- name object
- score float64
- employed bool
- kids int64
- dtype: object
- >>> df2_transposed.dtypes
- 0 object
- 1 object
- dtype: object
- """
- nv.validate_transpose(args, {})
- # construct the args
-
- dtypes = list(self.dtypes)
-
- if self._can_fast_transpose:
- # Note: tests pass without this, but this improves perf quite a bit.
- new_vals = self._values.T
- if copy and not using_copy_on_write():
- new_vals = new_vals.copy()
-
- result = self._constructor(
- new_vals, index=self.columns, columns=self.index, copy=False
- )
- if using_copy_on_write() and len(self) > 0:
- result._mgr.add_references(self._mgr) # type: ignore[arg-type]
-
- elif (
- self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0])
- ):
- # We have EAs with the same dtype. We can preserve that dtype in transpose.
- dtype = dtypes[0]
- arr_type = dtype.construct_array_type()
- values = self.values
-
- new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
- result = type(self)._from_arrays(
- new_values, index=self.columns, columns=self.index
- )
-
- else:
- new_arr = self.values.T
- if copy and not using_copy_on_write():
- new_arr = new_arr.copy()
- result = self._constructor(
- new_arr,
- index=self.columns,
- columns=self.index,
- # We already made a copy (more than one block)
- copy=False,
- )
-
- return result.__finalize__(self, method="transpose")
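- # A minimal sketch of the extension-dtype branch above, assuming a frame
- # whose columns share one nullable dtype: that dtype is preserved instead of
- # falling back to ``object``.
- #
- #     df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64"),
- #                        "b": pd.array([3, 4], dtype="Int64")})
- #     df.T.dtypes          # both transposed columns stay Int64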
-
- @property
- def T(self) -> DataFrame:
- """
- The transpose of the DataFrame.
-
- Returns
- -------
- DataFrame
- The transposed DataFrame.
-
- See Also
- --------
- DataFrame.transpose : Transpose index and columns.
-
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
-
- >>> df.T
- 0 1
- col1 1 2
- col2 3 4
- """
- return self.transpose()
-
- # ----------------------------------------------------------------------
- # Indexing Methods
-
- def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
- """
- Parameters
- ----------
- i : int
- axis : int
-
- Returns
- -------
- Series
- """
- # irow
- if axis == 0:
- new_mgr = self._mgr.fast_xs(i)
-
- # if we are a copy, mark as such
- copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
- result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
- self
- )
- result._set_is_copy(self, copy=copy)
- return result
-
- # icol
- else:
- label = self.columns[i]
-
- col_mgr = self._mgr.iget(i)
- result = self._box_col_values(col_mgr, i)
-
- # this is a cached value, mark it so
- result._set_as_cached(label, self)
- return result
-
- def _get_column_array(self, i: int) -> ArrayLike:
- """
- Get the values of the i'th column (ndarray or ExtensionArray, as stored
- in the Block)
-
- Warning! The returned array is a view but doesn't handle Copy-on-Write,
- so this should be used with caution (for read-only purposes).
- """
- return self._mgr.iget_values(i)
-
- def _iter_column_arrays(self) -> Iterator[ArrayLike]:
- """
- Iterate over the arrays of all columns in order.
- This returns the values as stored in the Block (ndarray or ExtensionArray).
-
- Warning! The returned array is a view but doesn't handle Copy-on-Write,
- so this should be used with caution (for read-only purposes).
- """
- for i in range(len(self.columns)):
- yield self._get_column_array(i)
-
- def _getitem_nocopy(self, key: list):
- """
- Behaves like __getitem__, but returns a view in cases where __getitem__
- would make a copy.
- """
- # TODO(CoW): can be removed if/when we are always Copy-on-Write
- indexer = self.columns._get_indexer_strict(key, "columns")[1]
- new_axis = self.columns[indexer]
-
- new_mgr = self._mgr.reindex_indexer(
- new_axis,
- indexer,
- axis=0,
- allow_dups=True,
- copy=False,
- only_slice=True,
- )
- return self._constructor(new_mgr)
-
- def __getitem__(self, key):
- check_dict_or_set_indexers(key)
- key = lib.item_from_zerodim(key)
- key = com.apply_if_callable(key, self)
-
- if is_hashable(key) and not is_iterator(key):
- # is_iterator to exclude generator e.g. test_getitem_listlike
- # shortcut if the key is in columns
- is_mi = isinstance(self.columns, MultiIndex)
- # GH#45316 Return view if key is not duplicated
- # Only use drop_duplicates with duplicates for performance
- if not is_mi and (
- self.columns.is_unique
- and key in self.columns
- or key in self.columns.drop_duplicates(keep=False)
- ):
- return self._get_item_cache(key)
-
- elif is_mi and self.columns.is_unique and key in self.columns:
- return self._getitem_multilevel(key)
- # Do we have a slicer (on rows)?
- if isinstance(key, slice):
- indexer = self.index._convert_slice_indexer(key, kind="getitem")
- if isinstance(indexer, np.ndarray):
- # reachable with DatetimeIndex
- indexer = lib.maybe_indices_to_slice(
- indexer.astype(np.intp, copy=False), len(self)
- )
- if isinstance(indexer, np.ndarray):
- # GH#43223 If we can not convert, use take
- return self.take(indexer, axis=0)
- return self._slice(indexer, axis=0)
-
- # Do we have a (boolean) DataFrame?
- if isinstance(key, DataFrame):
- return self.where(key)
-
- # Do we have a (boolean) 1d indexer?
- if com.is_bool_indexer(key):
- return self._getitem_bool_array(key)
-
- # We are left with two options: a single key, and a collection of keys,
- # We interpret tuples as collections only for non-MultiIndex
- is_single_key = isinstance(key, tuple) or not is_list_like(key)
-
- if is_single_key:
- if self.columns.nlevels > 1:
- return self._getitem_multilevel(key)
- indexer = self.columns.get_loc(key)
- if is_integer(indexer):
- indexer = [indexer]
- else:
- if is_iterator(key):
- key = list(key)
- indexer = self.columns._get_indexer_strict(key, "columns")[1]
-
- # take() does not accept boolean indexers
- if getattr(indexer, "dtype", None) == bool:
- indexer = np.where(indexer)[0]
-
- data = self._take_with_is_copy(indexer, axis=1)
-
- if is_single_key:
- # What does looking for a single key in a non-unique index return?
- # The behavior is inconsistent. It returns a Series, except when
- # - the key itself is repeated (test on data.shape, #9519), or
- # - we have a MultiIndex on columns (test on self.columns, #21309)
- if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
- # GH#26490 using data[key] can cause RecursionError
- return data._get_item_cache(key)
-
- return data
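- # A minimal sketch of the dispatch above, assuming a small frame ``df``:
- #
- #     df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
- #     df["a"]            # hashable key      -> single column (Series)
- #     df[["a", "b"]]     # list of keys      -> column subset (DataFrame)
- #     df[1:3]            # slice             -> row slice
- #     df[df["a"] > 1]    # boolean Series    -> row filter
- #     df[df > 2]         # boolean DataFrame -> element-wise where()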
-
- def _getitem_bool_array(self, key):
- # also raises Exception if object array with NA values
- # warning here just in case -- previously __setitem__ was
- # reindexing but __getitem__ was not; it seems more reasonable to
- # go with the __setitem__ behavior since that is more consistent
- # with all other indexing behavior
- if isinstance(key, Series) and not key.index.equals(self.index):
- warnings.warn(
- "Boolean Series key will be reindexed to match DataFrame index.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- elif len(key) != len(self.index):
- raise ValueError(
- f"Item wrong length {len(key)} instead of {len(self.index)}."
- )
-
- # check_bool_indexer will throw exception if Series key cannot
- # be reindexed to match DataFrame rows
- key = check_bool_indexer(self.index, key)
-
- if key.all():
- return self.copy(deep=None)
-
- indexer = key.nonzero()[0]
- return self._take_with_is_copy(indexer, axis=0)
-
- def _getitem_multilevel(self, key):
- # self.columns is a MultiIndex
- loc = self.columns.get_loc(key)
- if isinstance(loc, (slice, np.ndarray)):
- new_columns = self.columns[loc]
- result_columns = maybe_droplevels(new_columns, key)
- result = self.iloc[:, loc]
- result.columns = result_columns
-
- # If there is only one column being returned, and its name is
- # either an empty string, or a tuple with an empty string as its
- # first element, then treat the empty string as a placeholder
- # and return the column as if the user had provided that empty
- # string in the key. If the result is a Series, exclude the
- # implied empty string from its name.
- if len(result.columns) == 1:
- # e.g. test_frame_getitem_multicolumn_empty_level,
- # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
- top = result.columns[0]
- if isinstance(top, tuple):
- top = top[0]
- if top == "":
- result = result[""]
- if isinstance(result, Series):
- result = self._constructor_sliced(
- result, index=self.index, name=key
- )
-
- result._set_is_copy(self)
- return result
- else:
- # loc is neither a slice nor ndarray, so must be an int
- return self._ixs(loc, axis=1)
-
- def _get_value(self, index, col, takeable: bool = False) -> Scalar:
- """
- Quickly retrieve single value at passed column and index.
-
- Parameters
- ----------
- index : row label
- col : column label
- takeable : interpret the index/col as indexers, default False
-
- Returns
- -------
- scalar
-
- Notes
- -----
- Assumes that both `self.index._index_as_unique` and
- `self.columns._index_as_unique` hold; the caller is responsible for checking.
- """
- if takeable:
- series = self._ixs(col, axis=1)
- return series._values[index]
-
- series = self._get_item_cache(col)
- engine = self.index._engine
-
- if not isinstance(self.index, MultiIndex):
- # CategoricalIndex: Trying to use the engine fastpath may give incorrect
- results if our categories are integers that don't match our codes
- # IntervalIndex: IntervalTree has no get_loc
- row = self.index.get_loc(index)
- return series._values[row]
-
- # For MultiIndex going through engine effectively restricts us to
- # same-length tuples; see test_get_set_value_no_partial_indexing
- loc = engine.get_loc(index)
- return series._values[loc]
-
- def isetitem(self, loc, value) -> None:
- """
- Set the given value in the column with position `loc`.
-
- This is a positional analogue to ``__setitem__``.
-
- Parameters
- ----------
- loc : int or sequence of ints
- Index position for the column.
- value : scalar or arraylike
- Value(s) for the column.
-
- Notes
- -----
- ``frame.isetitem(loc, value)`` is an in-place method as it will
- modify the DataFrame in place (not returning a new object). In contrast to
- ``frame.iloc[:, i] = value`` which will try to update the existing values in
- place, ``frame.isetitem(loc, value)`` will not update the values of the column
- itself in place; it will instead insert a new array.
-
- In cases where ``frame.columns`` is unique, this is equivalent to
- ``frame[frame.columns[i]] = value``.
- """
- if isinstance(value, DataFrame):
- if is_scalar(loc):
- loc = [loc]
-
- for i, idx in enumerate(loc):
- arraylike = self._sanitize_column(value.iloc[:, i])
- self._iset_item_mgr(idx, arraylike, inplace=False)
- return
-
- arraylike = self._sanitize_column(value)
- self._iset_item_mgr(loc, arraylike, inplace=False)
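- # A minimal sketch of the Notes above, assuming a two-column frame ``df``:
- #
- #     df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
- #     df.isetitem(0, [10, 20])    # swaps in a new array at column position 0
- #     df.iloc[:, 0] = [10, 20]    # by contrast, writes into the existing values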
-
- def __setitem__(self, key, value):
- if not PYPY and using_copy_on_write():
- if sys.getrefcount(self) <= 3:
- warnings.warn(
- _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
- )
-
- key = com.apply_if_callable(key, self)
-
- # see if we can slice the rows
- if isinstance(key, slice):
- slc = self.index._convert_slice_indexer(key, kind="getitem")
- return self._setitem_slice(slc, value)
-
- if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
- self._setitem_frame(key, value)
- elif isinstance(key, (Series, np.ndarray, list, Index)):
- self._setitem_array(key, value)
- elif isinstance(value, DataFrame):
- self._set_item_frame_value(key, value)
- elif (
- is_list_like(value)
- and not self.columns.is_unique
- and 1 < len(self.columns.get_indexer_for([key])) == len(value)
- ):
- # Column to set is duplicated
- self._setitem_array([key], value)
- else:
- # set column
- self._set_item(key, value)
-
- def _setitem_slice(self, key: slice, value) -> None:
- # NB: we can't just use self.loc[key] = value because that
- # operates on labels and we need to operate positional for
- # backwards-compat, xref GH#31469
- self._check_setitem_copy()
- self.iloc[key] = value
-
- def _setitem_array(self, key, value):
- # also raises Exception if object array with NA values
- if com.is_bool_indexer(key):
- # bool indexer is indexing along rows
- if len(key) != len(self.index):
- raise ValueError(
- f"Item wrong length {len(key)} instead of {len(self.index)}!"
- )
- key = check_bool_indexer(self.index, key)
- indexer = key.nonzero()[0]
- self._check_setitem_copy()
- if isinstance(value, DataFrame):
- # GH#39931 reindex since iloc does not align
- value = value.reindex(self.index.take(indexer))
- self.iloc[indexer] = value
-
- else:
- # Note: unlike self.iloc[:, indexer] = value, this will
- # never try to overwrite values inplace
-
- if isinstance(value, DataFrame):
- check_key_length(self.columns, key, value)
- for k1, k2 in zip(key, value.columns):
- self[k1] = value[k2]
-
- elif not is_list_like(value):
- for col in key:
- self[col] = value
-
- elif isinstance(value, np.ndarray) and value.ndim == 2:
- self._iset_not_inplace(key, value)
-
- elif np.ndim(value) > 1:
- # list of lists
- value = DataFrame(value).values
- return self._setitem_array(key, value)
-
- else:
- self._iset_not_inplace(key, value)
-
- def _iset_not_inplace(self, key, value):
- # GH#39510 when setting with df[key] = obj with a list-like key and
- # list-like value, we iterate over those listlikes and set columns
- # one at a time. This is different from dispatching to
- # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
- # data inplace, whereas this will insert new arrays.
-
- def igetitem(obj, i: int):
- # Note: we catch DataFrame obj before getting here, but
- # hypothetically would return obj.iloc[:, i]
- if isinstance(obj, np.ndarray):
- return obj[..., i]
- else:
- return obj[i]
-
- if self.columns.is_unique:
- if np.shape(value)[-1] != len(key):
- raise ValueError("Columns must be same length as key")
-
- for i, col in enumerate(key):
- self[col] = igetitem(value, i)
-
- else:
- ilocs = self.columns.get_indexer_non_unique(key)[0]
- if (ilocs < 0).any():
- # key entries not in self.columns
- raise NotImplementedError
-
- if np.shape(value)[-1] != len(ilocs):
- raise ValueError("Columns must be same length as key")
-
- assert np.ndim(value) <= 2
-
- orig_columns = self.columns
-
- # Using self.iloc[:, i] = ... may set values inplace, which
- # by convention we do not do in __setitem__
- try:
- self.columns = Index(range(len(self.columns)))
- for i, iloc in enumerate(ilocs):
- self[iloc] = igetitem(value, i)
- finally:
- self.columns = orig_columns
-
- def _setitem_frame(self, key, value):
- # support boolean setting with DataFrame input, e.g.
- # df[df > df2] = 0
- if isinstance(key, np.ndarray):
- if key.shape != self.shape:
- raise ValueError("Array conditional must be same shape as self")
- key = self._constructor(key, **self._construct_axes_dict(), copy=False)
-
- if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
- raise TypeError(
- "Must pass DataFrame or 2-d ndarray with boolean values only"
- )
-
- self._check_inplace_setting(value)
- self._check_setitem_copy()
- self._where(-key, value, inplace=True)
-
- def _set_item_frame_value(self, key, value: DataFrame) -> None:
- self._ensure_valid_index(value)
-
- # align columns
- if key in self.columns:
- loc = self.columns.get_loc(key)
- cols = self.columns[loc]
- len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
- if len_cols != len(value.columns):
- raise ValueError("Columns must be same length as key")
-
- # align right-hand-side columns if self.columns
- # is multi-index and self[key] is a sub-frame
- if isinstance(self.columns, MultiIndex) and isinstance(
- loc, (slice, Series, np.ndarray, Index)
- ):
- cols_droplevel = maybe_droplevels(cols, key)
- if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
- value = value.reindex(cols_droplevel, axis=1)
-
- for col, col_droplevel in zip(cols, cols_droplevel):
- self[col] = value[col_droplevel]
- return
-
- if is_scalar(cols):
- self[cols] = value[value.columns[0]]
- return
-
- # now align rows
- arraylike = _reindex_for_setitem(value, self.index)
- self._set_item_mgr(key, arraylike)
- return
-
- if len(value.columns) != 1:
- raise ValueError(
- "Cannot set a DataFrame with multiple columns to the single "
- f"column {key}"
- )
-
- self[key] = value[value.columns[0]]
-
- def _iset_item_mgr(
- self, loc: int | slice | np.ndarray, value, inplace: bool = False
- ) -> None:
- # when called from _set_item_mgr loc can be anything returned from get_loc
- self._mgr.iset(loc, value, inplace=inplace)
- self._clear_item_cache()
-
- def _set_item_mgr(self, key, value: ArrayLike) -> None:
- try:
- loc = self._info_axis.get_loc(key)
- except KeyError:
- # This item wasn't present, just insert at end
- self._mgr.insert(len(self._info_axis), key, value)
- else:
- self._iset_item_mgr(loc, value)
-
- # check if we are modifying a copy
- # try to set first as we want an invalid
- # value exception to occur first
- if len(self):
- self._check_setitem_copy()
-
- def _iset_item(self, loc: int, value) -> None:
- arraylike = self._sanitize_column(value)
- self._iset_item_mgr(loc, arraylike, inplace=True)
-
- # check if we are modifying a copy
- # try to set first as we want an invalid
- # value exception to occur first
- if len(self):
- self._check_setitem_copy()
-
- def _set_item(self, key, value) -> None:
- """
- Add a series to the DataFrame in the specified column.
-
- If series is a numpy-array (not a Series/TimeSeries), it must be the
- same length as the DataFrame's index or an error will be thrown.
-
- Series/TimeSeries will be conformed to the DataFrame's index to
- ensure homogeneity.
- """
- value = self._sanitize_column(value)
-
- if (
- key in self.columns
- and value.ndim == 1
- and not is_extension_array_dtype(value)
- ):
- # broadcast across multiple columns if necessary
- if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
- existing_piece = self[key]
- if isinstance(existing_piece, DataFrame):
- value = np.tile(value, (len(existing_piece.columns), 1)).T
-
- self._set_item_mgr(key, value)
-
- def _set_value(
- self, index: IndexLabel, col, value: Scalar, takeable: bool = False
- ) -> None:
- """
- Put single value at passed column and index.
-
- Parameters
- ----------
- index : Label
- row label
- col : Label
- column label
- value : scalar
- takeable : bool, default False
- Sets whether or not index/col are interpreted as indexers
- """
- try:
- if takeable:
- icol = col
- iindex = cast(int, index)
- else:
- icol = self.columns.get_loc(col)
- iindex = self.index.get_loc(index)
- self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
- self._clear_item_cache()
-
- except (KeyError, TypeError, ValueError, LossySetitemError):
- # get_loc might raise a KeyError for missing labels (falling back
- # to (i)loc will do expansion of the index)
- # column_setitem will do validation that may raise TypeError,
- # ValueError, or LossySetitemError
- # set using a non-recursive method & reset the cache
- if takeable:
- self.iloc[index, col] = value
- else:
- self.loc[index, col] = value
- self._item_cache.pop(col, None)
-
- except InvalidIndexError as ii_err:
- # GH48729: Seems like you are trying to assign a value to a
- # row when only scalar options are permitted
- raise InvalidIndexError(
- f"You can only assign a scalar value not a {type(value)}"
- ) from ii_err
-
- def _ensure_valid_index(self, value) -> None:
- """
- Ensure that, if we don't have an index, we can create one from the
- passed value.
- """
- # GH5632, make sure that we are a Series convertible
- if not len(self.index) and is_list_like(value) and len(value):
- if not isinstance(value, DataFrame):
- try:
- value = Series(value)
- except (ValueError, NotImplementedError, TypeError) as err:
- raise ValueError(
- "Cannot set a frame with no defined index "
- "and a value that cannot be converted to a Series"
- ) from err
-
- # GH31368 preserve name of index
- index_copy = value.index.copy()
- if self.index.name is not None:
- index_copy.name = self.index.name
-
- self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
-
- def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
- """
- Provide boxed values for a column.
- """
- # Lookup in columns so that if e.g. a str datetime was passed
- # we attach the Timestamp object as the name.
- name = self.columns[loc]
- klass = self._constructor_sliced
- # We get index=self.index bc values is a SingleDataManager
- return klass(values, name=name, fastpath=True).__finalize__(self)
-
- # ----------------------------------------------------------------------
- # Lookup Caching
-
- def _clear_item_cache(self) -> None:
- self._item_cache.clear()
-
- def _get_item_cache(self, item: Hashable) -> Series:
- """Return the cached item, item represents a label indexer."""
- if using_copy_on_write():
- loc = self.columns.get_loc(item)
- return self._ixs(loc, axis=1)
-
- cache = self._item_cache
- res = cache.get(item)
- if res is None:
- # All places that call _get_item_cache have unique columns,
- # pending resolution of GH#33047
-
- loc = self.columns.get_loc(item)
- res = self._ixs(loc, axis=1)
-
- cache[item] = res
-
- # for a chain
- res._is_copy = self._is_copy
- return res
-
- def _reset_cacher(self) -> None:
- # no-op for DataFrame
- pass
-
- def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
- """
- The object has called back to us saying maybe it has changed.
- """
- loc = self._info_axis.get_loc(item)
- arraylike = value._values
-
- old = self._ixs(loc, axis=1)
- if old._values is value._values and inplace:
- # GH#46149 avoid making unnecessary copies/block-splitting
- return
-
- self._mgr.iset(loc, arraylike, inplace=inplace)
-
- # ----------------------------------------------------------------------
- # Unsorted
-
- @overload
- def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
- ...
-
- @overload
- def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
- ...
-
- @overload
- def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
- ...
-
- def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
- """
- Query the columns of a DataFrame with a boolean expression.
-
- Parameters
- ----------
- expr : str
- The query string to evaluate.
-
- You can refer to variables
- in the environment by prefixing them with an '@' character like
- ``@a + b``.
-
- You can refer to column names that are not valid Python variable names
- by surrounding them in backticks. Thus, column names containing spaces
- or punctuation (besides underscores) or starting with digits must be
- surrounded by backticks. (For example, a column named "Area (cm^2)" would
- be referenced as ```Area (cm^2)```). Column names which are Python keywords
- (like "list", "for", "import", etc) cannot be used.
-
- For example, if one of your columns is called ``a a`` and you want
- to sum it with ``b``, your query should be ```a a` + b``.
-
- inplace : bool
- Whether to modify the DataFrame rather than creating a new one.
- **kwargs
- See the documentation for :func:`eval` for complete details
- on the keyword arguments accepted by :meth:`DataFrame.query`.
-
- Returns
- -------
- DataFrame or None
- DataFrame resulting from the provided query expression or
- None if ``inplace=True``.
-
- See Also
- --------
- eval : Evaluate a string describing operations on
- DataFrame columns.
- DataFrame.eval : Evaluate a string describing operations on
- DataFrame columns.
-
- Notes
- -----
- The result of the evaluation of this expression is first passed to
- :attr:`DataFrame.loc` and if that fails because of a
- multidimensional key (e.g., a DataFrame) then the result will be passed
- to :meth:`DataFrame.__getitem__`.
-
- This method uses the top-level :func:`eval` function to
- evaluate the passed query.
-
- The :meth:`~pandas.DataFrame.query` method uses a slightly
- modified Python syntax by default. For example, the ``&`` and ``|``
- (bitwise) operators have the precedence of their boolean cousins,
- :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python;
- however, the semantics are different.
-
- You can change the semantics of the expression by passing the keyword
- argument ``parser='python'``. This enforces the same semantics as
- evaluation in Python space. Likewise, you can pass ``engine='python'``
- to evaluate an expression using Python itself as a backend. This is not
- recommended as it is inefficient compared to using ``numexpr`` as the
- engine.
-
- The :attr:`DataFrame.index` and
- :attr:`DataFrame.columns` attributes of the
- :class:`~pandas.DataFrame` instance are placed in the query namespace
- by default, which allows you to treat both the index and columns of the
- frame as a column in the frame.
- The identifier ``index`` is used for the frame index; you can also
- use the name of the index to identify it in a query. Please note that
- Python keywords may not be used as identifiers.
-
- For further details and examples see the ``query`` documentation in
- :ref:`indexing <indexing.query>`.
-
- *Backtick quoted variables*
-
- Backtick quoted variables are parsed as literal Python code and
- are converted internally to a valid Python identifier.
- This can lead to the following problems.
-
- During parsing a number of disallowed characters inside the backtick
- quoted string are replaced by strings that are allowed as a Python identifier.
- These characters include all operators in Python, the space character, the
- question mark, the exclamation mark, the dollar sign, and the euro sign.
- For other characters that fall outside the ASCII range (U+0001..U+007F)
- and those that are not further specified in PEP 3131,
- the query parser will raise an error.
- This excludes whitespace characters other than the space, as well as
- the hash character (as it is used for comments) and the backtick
- itself (the backtick cannot be escaped).
-
- In a special case, quotes that make a pair around a backtick can
- confuse the parser.
- For example, ```it's` > `that's``` will raise an error,
- as it forms a quoted string (``'s > `that'``) with a backtick inside.
-
- See also the Python documentation about lexical analysis
- (https://docs.python.org/3/reference/lexical_analysis.html)
- in combination with the source code in :mod:`pandas.core.computation.parsing`.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': range(1, 6),
- ... 'B': range(10, 0, -2),
- ... 'C C': range(10, 5, -1)})
- >>> df
- A B C C
- 0 1 10 10
- 1 2 8 9
- 2 3 6 8
- 3 4 4 7
- 4 5 2 6
- >>> df.query('A > B')
- A B C C
- 4 5 2 6
-
- The previous expression is equivalent to
-
- >>> df[df.A > df.B]
- A B C C
- 4 5 2 6
-
- For columns with spaces in their name, you can use backtick quoting.
-
- >>> df.query('B == `C C`')
- A B C C
- 0 1 10 10
-
- The previous expression is equivalent to
-
- >>> df[df.B == df['C C']]
- A B C C
- 0 1 10 10
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if not isinstance(expr, str):
- msg = f"expr must be a string to be evaluated, {type(expr)} given"
- raise ValueError(msg)
- kwargs["level"] = kwargs.pop("level", 0) + 1
- kwargs["target"] = None
- res = self.eval(expr, **kwargs)
-
- try:
- result = self.loc[res]
- except ValueError:
- # when res is multi-dimensional loc raises, but this is sometimes a
- # valid query
- result = self[res]
-
- if inplace:
- self._update_inplace(result)
- return None
- else:
- return result
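- # A minimal sketch of the ``@`` syntax described above, assuming ``df`` as in
- # the docstring examples and a local variable ``threshold``:
- #
- #     threshold = 3
- #     df.query("A > @threshold")              # compare column A to the variable
- #     df.query("A > @threshold and B < 8")    # combine with boolean keywords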
-
- @overload
- def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
- ...
-
- @overload
- def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
- ...
-
- def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
- """
- Evaluate a string describing operations on DataFrame columns.
-
- Operates on columns only, not specific rows or elements. This allows
- `eval` to run arbitrary code, which can make you vulnerable to code
- injection if you pass user input to this function.
-
- Parameters
- ----------
- expr : str
- The expression string to evaluate.
- inplace : bool, default False
- If the expression contains an assignment, whether to perform the
- operation inplace and mutate the existing DataFrame. Otherwise,
- a new DataFrame is returned.
- **kwargs
- See the documentation for :func:`eval` for complete details
- on the keyword arguments accepted by
- :meth:`~pandas.DataFrame.query`.
-
- Returns
- -------
- ndarray, scalar, pandas object, or None
- The result of the evaluation or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.query : Evaluates a boolean expression to query the columns
- of a frame.
- DataFrame.assign : Can evaluate an expression or function to create new
- values for a column.
- eval : Evaluate a Python expression as a string using various
- backends.
-
- Notes
- -----
- For more details see the API documentation for :func:`~eval`.
- For detailed examples see :ref:`enhancing performance with eval
- <enhancingperf.eval>`.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
- >>> df
- A B
- 0 1 10
- 1 2 8
- 2 3 6
- 3 4 4
- 4 5 2
- >>> df.eval('A + B')
- 0 11
- 1 10
- 2 9
- 3 8
- 4 7
- dtype: int64
-
- Assignment is allowed, though by default the original DataFrame is not
- modified.
-
- >>> df.eval('C = A + B')
- A B C
- 0 1 10 11
- 1 2 8 10
- 2 3 6 9
- 3 4 4 8
- 4 5 2 7
- >>> df
- A B
- 0 1 10
- 1 2 8
- 2 3 6
- 3 4 4
- 4 5 2
-
- Multiple columns can be assigned to using multi-line expressions:
-
- >>> df.eval(
- ... '''
- ... C = A + B
- ... D = A - B
- ... '''
- ... )
- A B C D
- 0 1 10 11 -9
- 1 2 8 10 -6
- 2 3 6 9 -3
- 3 4 4 8 0
- 4 5 2 7 3
- """
- from pandas.core.computation.eval import eval as _eval
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- kwargs["level"] = kwargs.pop("level", 0) + 1
- index_resolvers = self._get_index_resolvers()
- column_resolvers = self._get_cleaned_column_resolvers()
- resolvers = column_resolvers, index_resolvers
- if "target" not in kwargs:
- kwargs["target"] = self
- kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
-
- return _eval(expr, inplace=inplace, **kwargs)
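- # A minimal sketch, assuming ``df`` as in the docstring examples above: with
- # ``inplace=True`` the assignment mutates the frame instead of returning a copy.
- #
- #     df.eval("C = A + B", inplace=True)
- #     "C" in df.columns    # True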
-
- def select_dtypes(self, include=None, exclude=None) -> DataFrame:
- """
- Return a subset of the DataFrame's columns based on the column dtypes.
-
- Parameters
- ----------
- include, exclude : scalar or list-like
- A selection of dtypes or strings to be included/excluded. At least
- one of these parameters must be supplied.
-
- Returns
- -------
- DataFrame
- The subset of the frame including the dtypes in ``include`` and
- excluding the dtypes in ``exclude``.
-
- Raises
- ------
- ValueError
- * If both of ``include`` and ``exclude`` are empty
- * If ``include`` and ``exclude`` have overlapping elements
- * If any kind of string dtype is passed in.
-
- See Also
- --------
- DataFrame.dtypes: Return Series with the data type of each column.
-
- Notes
- -----
- * To select all *numeric* types, use ``np.number`` or ``'number'``
- * To select strings you must use the ``object`` dtype, but note that
- this will return *all* object dtype columns
- * See the `numpy dtype hierarchy
- <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
- * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
- ``'datetime64'``
- * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
- ``'timedelta64'``
- * To select Pandas categorical dtypes, use ``'category'``
- * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
- 0.20.0) or ``'datetime64[ns, tz]'``
-
- Examples
- --------
- >>> df = pd.DataFrame({'a': [1, 2] * 3,
- ... 'b': [True, False] * 3,
- ... 'c': [1.0, 2.0] * 3})
- >>> df
- a b c
- 0 1 True 1.0
- 1 2 False 2.0
- 2 1 True 1.0
- 3 2 False 2.0
- 4 1 True 1.0
- 5 2 False 2.0
-
- >>> df.select_dtypes(include='bool')
- b
- 0 True
- 1 False
- 2 True
- 3 False
- 4 True
- 5 False
-
- >>> df.select_dtypes(include=['float64'])
- c
- 0 1.0
- 1 2.0
- 2 1.0
- 3 2.0
- 4 1.0
- 5 2.0
-
- >>> df.select_dtypes(exclude=['int64'])
- b c
- 0 True 1.0
- 1 False 2.0
- 2 True 1.0
- 3 False 2.0
- 4 True 1.0
- 5 False 2.0
- """
- if not is_list_like(include):
- include = (include,) if include is not None else ()
- if not is_list_like(exclude):
- exclude = (exclude,) if exclude is not None else ()
-
- selection = (frozenset(include), frozenset(exclude))
-
- if not any(selection):
- raise ValueError("at least one of include or exclude must be nonempty")
-
- # convert the myriad valid dtype objects to a single representation
- def check_int_infer_dtype(dtypes):
- converted_dtypes: list[type] = []
- for dtype in dtypes:
- # NumPy maps int to different types (int32, int64) on Windows and Linux
- # see https://github.com/numpy/numpy/issues/9464
- if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
- converted_dtypes.append(np.int32)
- converted_dtypes.append(np.int64)
- elif dtype == "float" or dtype is float:
- # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
- converted_dtypes.extend([np.float64, np.float32])
- else:
- converted_dtypes.append(infer_dtype_from_object(dtype))
- return frozenset(converted_dtypes)
-
- include = check_int_infer_dtype(include)
- exclude = check_int_infer_dtype(exclude)
-
- for dtypes in (include, exclude):
- invalidate_string_dtypes(dtypes)
-
- # can't both include AND exclude!
- if not include.isdisjoint(exclude):
- raise ValueError(f"include and exclude overlap on {(include & exclude)}")
-
- def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
- # GH 46870: BooleanDtype._is_numeric == True but should be excluded
- dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
- return issubclass(dtype.type, tuple(dtypes_set)) or (
- np.number in dtypes_set
- and getattr(dtype, "_is_numeric", False)
- and not is_bool_dtype(dtype)
- )
-
- def predicate(arr: ArrayLike) -> bool:
- dtype = arr.dtype
- if include:
- if not dtype_predicate(dtype, include):
- return False
-
- if exclude:
- if dtype_predicate(dtype, exclude):
- return False
-
- return True
-
- mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
- return type(self)(mgr).__finalize__(self)
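- # A minimal sketch of the Notes above, assuming ``df`` as in the docstring
- # examples: ``'number'`` selects the numeric columns and excludes ``bool``.
- #
- #     df.select_dtypes(include="number")             # the int64 and float64 columns
- #     df.select_dtypes(include=["number", "bool"])   # numerics plus the bool column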
-
- def insert(
- self,
- loc: int,
- column: Hashable,
- value: Scalar | AnyArrayLike,
- allow_duplicates: bool | lib.NoDefault = lib.no_default,
- ) -> None:
- """
- Insert column into DataFrame at specified location.
-
- Raises a ValueError if `column` is already contained in the DataFrame,
- unless `allow_duplicates` is set to True.
-
- Parameters
- ----------
- loc : int
- Insertion index. Must satisfy 0 <= loc <= len(columns).
- column : str, number, or hashable object
- Label of the inserted column.
- value : Scalar, Series, or array-like
- allow_duplicates : bool, optional, default lib.no_default
-
- See Also
- --------
- Index.insert : Insert new item by index.
-
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
- >>> df.insert(1, "newcol", [99, 99])
- >>> df
- col1 newcol col2
- 0 1 99 3
- 1 2 99 4
- >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
- >>> df
- col1 col1 newcol col2
- 0 100 1 99 3
- 1 100 2 99 4
-
- Notice that pandas uses index alignment when `value` is a `Series`:
-
- >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
- >>> df
- col0 col1 col1 newcol col2
- 0 NaN 100 1 99 3
- 1 5.0 100 2 99 4
- """
- if allow_duplicates is lib.no_default:
- allow_duplicates = False
- if allow_duplicates and not self.flags.allows_duplicate_labels:
- raise ValueError(
- "Cannot specify 'allow_duplicates=True' when "
- "'self.flags.allows_duplicate_labels' is False."
- )
- if not allow_duplicates and column in self.columns:
- # Should this be a different kind of error??
- raise ValueError(f"cannot insert {column}, already exists")
- if not isinstance(loc, int):
- raise TypeError("loc must be int")
-
- value = self._sanitize_column(value)
- self._mgr.insert(loc, column, value)
-
- def assign(self, **kwargs) -> DataFrame:
- r"""
- Assign new columns to a DataFrame.
-
- Returns a new object with all original columns in addition to new ones.
- Existing columns that are re-assigned will be overwritten.
-
- Parameters
- ----------
- **kwargs : dict of {str: callable or Series}
- The column names are keywords. If the values are
- callable, they are computed on the DataFrame and
- assigned to the new columns. The callable must not
- change the input DataFrame (though pandas doesn't check it).
- If the values are not callable, (e.g. a Series, scalar, or array),
- they are simply assigned.
-
- Returns
- -------
- DataFrame
- A new DataFrame with the new columns in addition to
- all the existing columns.
-
- Notes
- -----
- Assigning multiple columns within the same ``assign`` is possible.
- Later items in '\*\*kwargs' may refer to newly created or modified
- columns in 'df'; items are computed and assigned into 'df' in order.
-
- Examples
- --------
- >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
- ... index=['Portland', 'Berkeley'])
- >>> df
- temp_c
- Portland 17.0
- Berkeley 25.0
-
- Where the value is a callable, evaluated on `df`:
-
- >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
- temp_c temp_f
- Portland 17.0 62.6
- Berkeley 25.0 77.0
-
- Alternatively, the same behavior can be achieved by directly
- referencing an existing Series or sequence:
-
- >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
- temp_c temp_f
- Portland 17.0 62.6
- Berkeley 25.0 77.0
-
- You can create multiple columns within the same assign where one
- of the columns depends on another one defined within the same assign:
-
- >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
- ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
- temp_c temp_f temp_k
- Portland 17.0 62.6 290.15
- Berkeley 25.0 77.0 298.15
- """
- data = self.copy(deep=None)
-
- for k, v in kwargs.items():
- data[k] = com.apply_if_callable(v, data)
- return data
-
- def _sanitize_column(self, value) -> ArrayLike:
- """
- Ensures new columns (which go into the BlockManager as new blocks) are
- always copied and converted into an array.
-
- Parameters
- ----------
- value : scalar, Series, or array-like
-
- Returns
- -------
- numpy.ndarray or ExtensionArray
- """
- self._ensure_valid_index(value)
-
- # We can get there through isetitem with a DataFrame
- # or through loc single_block_path
- if isinstance(value, DataFrame):
- return _reindex_for_setitem(value, self.index)
- elif is_dict_like(value):
- return _reindex_for_setitem(Series(value), self.index)
-
- if is_list_like(value):
- com.require_length_match(value, self.index)
- return sanitize_array(value, self.index, copy=True, allow_2d=True)
-
- @property
- def _series(self):
- return {
- item: Series(
- self._mgr.iget(idx), index=self.index, name=item, fastpath=True
- )
- for idx, item in enumerate(self.columns)
- }
-
- # ----------------------------------------------------------------------
- # Reindexing and alignment
-
- def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
- frame = self
-
- columns = axes["columns"]
- if columns is not None:
- frame = frame._reindex_columns(
- columns, method, copy, level, fill_value, limit, tolerance
- )
-
- index = axes["index"]
- if index is not None:
- frame = frame._reindex_index(
- index, method, copy, level, fill_value, limit, tolerance
- )
-
- return frame
-
- def _reindex_index(
- self,
- new_index,
- method,
- copy: bool,
- level: Level,
- fill_value=np.nan,
- limit=None,
- tolerance=None,
- ):
- new_index, indexer = self.index.reindex(
- new_index, method=method, level=level, limit=limit, tolerance=tolerance
- )
- return self._reindex_with_indexers(
- {0: [new_index, indexer]},
- copy=copy,
- fill_value=fill_value,
- allow_dups=False,
- )
-
- def _reindex_columns(
- self,
- new_columns,
- method,
- copy: bool,
- level: Level,
- fill_value=None,
- limit=None,
- tolerance=None,
- ):
- new_columns, indexer = self.columns.reindex(
- new_columns, method=method, level=level, limit=limit, tolerance=tolerance
- )
- return self._reindex_with_indexers(
- {1: [new_columns, indexer]},
- copy=copy,
- fill_value=fill_value,
- allow_dups=False,
- )
-
- def _reindex_multi(
- self, axes: dict[str, Index], copy: bool, fill_value
- ) -> DataFrame:
- """
- We are guaranteed non-Nones in the axes.
- """
-
- new_index, row_indexer = self.index.reindex(axes["index"])
- new_columns, col_indexer = self.columns.reindex(axes["columns"])
-
- if row_indexer is not None and col_indexer is not None:
- # Fastpath. By doing two 'take's at once we avoid making an
- # unnecessary copy.
- # We only get here with `not self._is_mixed_type`, which (almost)
- # ensures that self.values is cheap. It may be worth making this
- # condition more specific.
- indexer = row_indexer, col_indexer
- new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
- return self._constructor(
- new_values, index=new_index, columns=new_columns, copy=False
- )
- else:
- return self._reindex_with_indexers(
- {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
- copy=copy,
- fill_value=fill_value,
- )
-
- @doc(NDFrame.align, **_shared_doc_kwargs)
- def align(
- self,
- other: DataFrame,
- join: AlignJoin = "outer",
- axis: Axis | None = None,
- level: Level = None,
- copy: bool | None = None,
- fill_value=None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- fill_axis: Axis = 0,
- broadcast_axis: Axis | None = None,
- ) -> DataFrame:
- return super().align(
- other,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- broadcast_axis=broadcast_axis,
- )
-
- @Appender(
- """
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
-
- Change the row labels.
-
- >>> df.set_axis(['a', 'b', 'c'], axis='index')
- A B
- a 1 4
- b 2 5
- c 3 6
-
- Change the column labels.
-
- >>> df.set_axis(['I', 'II'], axis='columns')
- I II
- 0 1 4
- 1 2 5
- 2 3 6
- """
- )
- @Substitution(
- **_shared_doc_kwargs,
- extended_summary_sub=" column or",
- axis_description_sub=", and 1 identifies the columns",
- see_also_sub=" or columns",
- )
- @Appender(NDFrame.set_axis.__doc__)
- def set_axis(
- self,
- labels,
- *,
- axis: Axis = 0,
- copy: bool | None = None,
- ) -> DataFrame:
- return super().set_axis(labels, axis=axis, copy=copy)
-
- @doc(
- NDFrame.reindex,
- klass=_shared_doc_kwargs["klass"],
- optional_reindex=_shared_doc_kwargs["optional_reindex"],
- )
- def reindex( # type: ignore[override]
- self,
- labels=None,
- *,
- index=None,
- columns=None,
- axis: Axis | None = None,
- method: str | None = None,
- copy: bool | None = None,
- level: Level | None = None,
- fill_value: Scalar | None = np.nan,
- limit: int | None = None,
- tolerance=None,
- ) -> DataFrame:
- return super().reindex(
- labels=labels,
- index=index,
- columns=columns,
- axis=axis,
- method=method,
- copy=copy,
- level=level,
- fill_value=fill_value,
- limit=limit,
- tolerance=tolerance,
- )
-
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level = ...,
- inplace: Literal[True],
- errors: IgnoreRaise = ...,
- ) -> None:
- ...
-
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level = ...,
- inplace: Literal[False] = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level = ...,
- inplace: bool = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame | None:
- ...
-
- def drop(
- self,
- labels: IndexLabel = None,
- *,
- axis: Axis = 0,
- index: IndexLabel = None,
- columns: IndexLabel = None,
- level: Level = None,
- inplace: bool = False,
- errors: IgnoreRaise = "raise",
- ) -> DataFrame | None:
- """
- Drop specified labels from rows or columns.
-
- Remove rows or columns by specifying label names and corresponding
- axis, or by directly specifying index or column names. When using a
- multi-index, labels on different levels can be removed by specifying
- the level. See the :ref:`user guide <advanced.shown_levels>`
- for more information about the now unused levels.
-
- Parameters
- ----------
- labels : single label or list-like
- Index or column labels to drop. A tuple will be used as a single
- label and not treated as a list-like.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Whether to drop labels from the index (0 or 'index') or
- columns (1 or 'columns').
- index : single label or list-like
- Alternative to specifying axis (``labels, axis=0``
- is equivalent to ``index=labels``).
- columns : single label or list-like
- Alternative to specifying axis (``labels, axis=1``
- is equivalent to ``columns=labels``).
- level : int or level name, optional
- For MultiIndex, level from which the labels will be removed.
- inplace : bool, default False
- If False, return a copy. Otherwise, do operation
- inplace and return None.
- errors : {'ignore', 'raise'}, default 'raise'
- If 'ignore', suppress error and only existing labels are
- dropped.
-
- Returns
- -------
- DataFrame or None
- DataFrame without the removed index or column labels or
- None if ``inplace=True``.
-
- Raises
- ------
- KeyError
- If any of the labels is not found in the selected axis.
-
- See Also
- --------
- DataFrame.loc : Label-location based indexer for selection by label.
- DataFrame.dropna : Return DataFrame with labels on given axis omitted
- where (all or any) data are missing.
- DataFrame.drop_duplicates : Return DataFrame with duplicate rows
- removed, optionally only considering certain columns.
- Series.drop : Return Series with specified index labels removed.
-
- Examples
- --------
- >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
- ... columns=['A', 'B', 'C', 'D'])
- >>> df
- A B C D
- 0 0 1 2 3
- 1 4 5 6 7
- 2 8 9 10 11
-
- Drop columns
-
- >>> df.drop(['B', 'C'], axis=1)
- A D
- 0 0 3
- 1 4 7
- 2 8 11
-
- >>> df.drop(columns=['B', 'C'])
- A D
- 0 0 3
- 1 4 7
- 2 8 11
-
- Drop a row by index
-
- >>> df.drop([0, 1])
- A B C D
- 2 8 9 10 11
-
- Drop columns and/or rows of MultiIndex DataFrame
-
- >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
- ... ['speed', 'weight', 'length']],
- ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
- ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
- >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
- ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
- ... [250, 150], [1.5, 0.8], [320, 250],
- ... [1, 0.8], [0.3, 0.2]])
- >>> df
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- length 1.5 1.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- length 1.5 0.8
- falcon speed 320.0 250.0
- weight 1.0 0.8
- length 0.3 0.2
-
- Drop a specific index combination from the MultiIndex
- DataFrame, i.e., drop the combination ``'falcon'`` and
- ``'weight'``, which deletes only the corresponding row
-
- >>> df.drop(index=('falcon', 'weight'))
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- length 1.5 1.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- length 1.5 0.8
- falcon speed 320.0 250.0
- length 0.3 0.2
-
- >>> df.drop(index='cow', columns='small')
- big
- lama speed 45.0
- weight 200.0
- length 1.5
- falcon speed 320.0
- weight 1.0
- length 0.3
-
- >>> df.drop(index='length', level=1)
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- falcon speed 320.0 250.0
- weight 1.0 0.8
- """
- return super().drop(
- labels=labels,
- axis=axis,
- index=index,
- columns=columns,
- level=level,
- inplace=inplace,
- errors=errors,
- )
-
- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | None = ...,
- inplace: Literal[True],
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> None:
- ...
-
- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | None = ...,
- inplace: Literal[False] = ...,
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | None = ...,
- inplace: bool = ...,
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame | None:
- ...
-
- def rename(
- self,
- mapper: Renamer | None = None,
- *,
- index: Renamer | None = None,
- columns: Renamer | None = None,
- axis: Axis | None = None,
- copy: bool | None = None,
- inplace: bool = False,
- level: Level = None,
- errors: IgnoreRaise = "ignore",
- ) -> DataFrame | None:
- """
- Rename columns or index labels.
-
- Function / dict values must be unique (1-to-1). Labels not contained in
- a dict / Series will be left as-is. Extra labels listed don't throw an
- error.
-
- See the :ref:`user guide <basics.rename>` for more.
-
- Parameters
- ----------
- mapper : dict-like or function
- Dict-like or function transformations to apply to
- that axis' values. Use either ``mapper`` and ``axis`` to
- specify the axis to target with ``mapper``, or ``index`` and
- ``columns``.
- index : dict-like or function
- Alternative to specifying axis (``mapper, axis=0``
- is equivalent to ``index=mapper``).
- columns : dict-like or function
- Alternative to specifying axis (``mapper, axis=1``
- is equivalent to ``columns=mapper``).
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Axis to target with ``mapper``. Can be either the axis name
- ('index', 'columns') or number (0, 1). The default is 'index'.
- copy : bool, default True
- Also copy underlying data.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- If True, the value of `copy` is ignored.
- level : int or level name, default None
- In case of a MultiIndex, only rename labels in the specified
- level.
- errors : {'ignore', 'raise'}, default 'ignore'
- If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
- or `columns` contains labels that are not present in the Index
- being transformed.
- If 'ignore', existing keys will be renamed and extra keys will be
- ignored.
-
- Returns
- -------
- DataFrame or None
- DataFrame with the renamed axis labels or None if ``inplace=True``.
-
- Raises
- ------
- KeyError
- If any of the labels is not found in the selected axis and
- "errors='raise'".
-
- See Also
- --------
- DataFrame.rename_axis : Set the name of the axis.
-
- Examples
- --------
- ``DataFrame.rename`` supports two calling conventions
-
- * ``(index=index_mapper, columns=columns_mapper, ...)``
- * ``(mapper, axis={'index', 'columns'}, ...)``
-
- We *highly* recommend using keyword arguments to clarify your
- intent.
-
- Rename columns using a mapping:
-
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
- >>> df.rename(columns={"A": "a", "B": "c"})
- a c
- 0 1 4
- 1 2 5
- 2 3 6
-
- Rename index using a mapping:
-
- >>> df.rename(index={0: "x", 1: "y", 2: "z"})
- A B
- x 1 4
- y 2 5
- z 3 6
-
- Cast index labels to a different type:
-
- >>> df.index
- RangeIndex(start=0, stop=3, step=1)
- >>> df.rename(index=str).index
- Index(['0', '1', '2'], dtype='object')
-
- >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
- Traceback (most recent call last):
- KeyError: ['C'] not found in axis
-
- Using axis-style parameters:
-
- >>> df.rename(str.lower, axis='columns')
- a b
- 0 1 4
- 1 2 5
- 2 3 6
-
- >>> df.rename({1: 2, 2: 4}, axis='index')
- A B
- 0 1 4
- 2 2 5
- 4 3 6
- """
- return super()._rename(
- mapper=mapper,
- index=index,
- columns=columns,
- axis=axis,
- copy=copy,
- inplace=inplace,
- level=level,
- errors=errors,
- )
-
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[True],
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: bool = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> DataFrame | None:
- ...
-
- @doc(NDFrame.fillna, **_shared_doc_kwargs)
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = None,
- *,
- method: FillnaOptions | None = None,
- axis: Axis | None = None,
- inplace: bool = False,
- limit: int | None = None,
- downcast: dict | None = None,
- ) -> DataFrame | None:
- return super().fillna(
- value=value,
- method=method,
- axis=axis,
- inplace=inplace,
- limit=limit,
- downcast=downcast,
- )
-
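-    # Editor's note (sketch, not part of the original source): the trio of
-    # ``@overload`` stubs above exists only for static type checkers -- it
-    # encodes that ``inplace=True`` returns ``None`` while ``inplace=False``
-    # returns a new ``DataFrame``.  A minimal, hypothetical illustration of
-    # the same pattern:
-    #
-    #     from typing import Literal, overload
-    #
-    #     @overload
-    #     def fill(inplace: Literal[False] = ...) -> "DataFrame": ...
-    #     @overload
-    #     def fill(inplace: Literal[True]) -> None: ...
-    #     def fill(inplace: bool = False) -> "DataFrame | None":
-    #         ...  # single runtime implementation
-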
- def pop(self, item: Hashable) -> Series:
- """
- Return item and drop from frame. Raise KeyError if not found.
-
- Parameters
- ----------
- item : label
- Label of column to be popped.
-
- Returns
- -------
- Series
-
- Examples
- --------
- >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
- ... ('parrot', 'bird', 24.0),
- ... ('lion', 'mammal', 80.5),
- ... ('monkey', 'mammal', np.nan)],
- ... columns=('name', 'class', 'max_speed'))
- >>> df
- name class max_speed
- 0 falcon bird 389.0
- 1 parrot bird 24.0
- 2 lion mammal 80.5
- 3 monkey mammal NaN
-
- >>> df.pop('class')
- 0 bird
- 1 bird
- 2 mammal
- 3 mammal
- Name: class, dtype: object
-
- >>> df
- name max_speed
- 0 falcon 389.0
- 1 parrot 24.0
- 2 lion 80.5
- 3 monkey NaN
- """
- return super().pop(item=item)
-
- @overload
- def replace(
- self,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- regex: bool = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def replace(
- self,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[True],
- limit: int | None = ...,
- regex: bool = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> None:
- ...
-
- @doc(NDFrame.replace, **_shared_doc_kwargs)
- def replace(
- self,
- to_replace=None,
- value=lib.no_default,
- *,
- inplace: bool = False,
- limit: int | None = None,
- regex: bool = False,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
- ) -> DataFrame | None:
- return super().replace(
- to_replace=to_replace,
- value=value,
- inplace=inplace,
- limit=limit,
- regex=regex,
- method=method,
- )
-
- def _replace_columnwise(
- self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
- ):
- """
- Dispatch to Series.replace column-wise.
-
- Parameters
- ----------
- mapping : dict
- of the form {col: (target, value)}
- inplace : bool
- regex : bool or same types as `to_replace` in DataFrame.replace
-
- Returns
- -------
- DataFrame or None
- """
- # Operate column-wise
- res = self if inplace else self.copy(deep=None)
- ax = self.columns
-
- for i, ax_value in enumerate(ax):
- if ax_value in mapping:
- ser = self.iloc[:, i]
-
- target, value = mapping[ax_value]
- newobj = ser.replace(target, value, regex=regex)
-
- res._iset_item(i, newobj)
-
- if inplace:
- return
- return res.__finalize__(self)
-
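-    # Editor's sketch (illustrative only; ``_replace_columnwise`` is a private
-    # helper used by ``DataFrame.replace``): each ``mapping`` entry pairs a
-    # column label with a ``(target, value)`` tuple, e.g.
-    #
-    #     >>> df = pd.DataFrame({"a": [1, 2], "b": [1, 3]})
-    #     >>> df._replace_columnwise({"a": (1, 10)}, inplace=False, regex=False)
-    #
-    # replaces 1 with 10 in column "a" only, leaving column "b" untouched.
-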
- @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
- def shift(
- self,
- periods: int = 1,
- freq: Frequency | None = None,
- axis: Axis = 0,
- fill_value: Hashable = lib.no_default,
- ) -> DataFrame:
- axis = self._get_axis_number(axis)
-
- ncols = len(self.columns)
- if (
- axis == 1
- and periods != 0
- and freq is None
- and fill_value is lib.no_default
- and ncols > 0
- ):
- # We will infer fill_value to match the closest column
-
- # Use a column that we know is valid for our column's dtype GH#38434
- label = self.columns[0]
-
- if periods > 0:
- result = self.iloc[:, :-periods]
- for col in range(min(ncols, abs(periods))):
- # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, 0].shift(len(self))
- result.insert(0, label, filler, allow_duplicates=True)
- else:
- result = self.iloc[:, -periods:]
- for col in range(min(ncols, abs(periods))):
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, -1].shift(len(self))
- result.insert(
- len(result.columns), label, filler, allow_duplicates=True
- )
-
- result.columns = self.columns.copy()
- return result
- elif (
- axis == 1
- and periods != 0
- and fill_value is not lib.no_default
- and ncols > 0
- ):
- arrays = self._mgr.arrays
- if len(arrays) > 1 or (
- # If we only have one block and we know that we can't
- # keep the same dtype (i.e. the _can_hold_element check)
- # then we can go through the reindex_indexer path
- # (and avoid casting logic in the Block method).
- not can_hold_element(arrays[0], fill_value)
- ):
- # GH#35488 we need to watch out for multi-block cases
- # We only get here with fill_value not-lib.no_default
- nper = abs(periods)
- nper = min(nper, ncols)
- if periods > 0:
- indexer = np.array(
- [-1] * nper + list(range(ncols - periods)), dtype=np.intp
- )
- else:
- indexer = np.array(
- list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
- )
- mgr = self._mgr.reindex_indexer(
- self.columns,
- indexer,
- axis=0,
- fill_value=fill_value,
- allow_dups=True,
- )
- res_df = self._constructor(mgr)
- return res_df.__finalize__(self, method="shift")
-
- return super().shift(
- periods=periods, freq=freq, axis=axis, fill_value=fill_value
- )
-
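-    # Editor's note (conceptual, public behaviour only): shifting along
-    # ``axis=1`` moves values across columns.  Without an explicit
-    # ``fill_value`` the branch above infers an NA filler of a matching dtype
-    # column by column, e.g.
-    #
-    #     >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
-    #     >>> df.shift(periods=1, axis=1)
-    #
-    # leaves column "a" all-NaN and moves the old "a" values into column "b".
-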
- @overload
- def set_index(
- self,
- keys,
- *,
- drop: bool = ...,
- append: bool = ...,
- inplace: Literal[False] = ...,
- verify_integrity: bool = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def set_index(
- self,
- keys,
- *,
- drop: bool = ...,
- append: bool = ...,
- inplace: Literal[True],
- verify_integrity: bool = ...,
- ) -> None:
- ...
-
- def set_index(
- self,
- keys,
- *,
- drop: bool = True,
- append: bool = False,
- inplace: bool = False,
- verify_integrity: bool = False,
- ) -> DataFrame | None:
- """
- Set the DataFrame index using existing columns.
-
- Set the DataFrame index (row labels) using one or more existing
- columns or arrays (of the correct length). The index can replace the
- existing index or expand on it.
-
- Parameters
- ----------
- keys : label or array-like or list of labels/arrays
- This parameter can be either a single column key, a single array of
- the same length as the calling DataFrame, or a list containing an
- arbitrary combination of column keys and arrays. Here, "array"
- encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
- instances of :class:`~collections.abc.Iterator`.
- drop : bool, default True
- Delete columns to be used as the new index.
- append : bool, default False
- Whether to append columns to existing index.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- verify_integrity : bool, default False
- Check the new index for duplicates. Otherwise defer the check until
- necessary. Setting to False will improve the performance of this
- method.
-
- Returns
- -------
- DataFrame or None
- Changed row labels or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.reset_index : Opposite of set_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
-
- Examples
- --------
- >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
- ... 'year': [2012, 2014, 2013, 2014],
- ... 'sale': [55, 40, 84, 31]})
- >>> df
- month year sale
- 0 1 2012 55
- 1 4 2014 40
- 2 7 2013 84
- 3 10 2014 31
-
- Set the index to become the 'month' column:
-
- >>> df.set_index('month')
- year sale
- month
- 1 2012 55
- 4 2014 40
- 7 2013 84
- 10 2014 31
-
- Create a MultiIndex using columns 'year' and 'month':
-
- >>> df.set_index(['year', 'month'])
- sale
- year month
- 2012 1 55
- 2014 4 40
- 2013 7 84
- 2014 10 31
-
- Create a MultiIndex using an Index and a column:
-
- >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
- month sale
- year
- 1 2012 1 55
- 2 2014 4 40
- 3 2013 7 84
- 4 2014 10 31
-
- Create a MultiIndex using two Series:
-
- >>> s = pd.Series([1, 2, 3, 4])
- >>> df.set_index([s, s**2])
- month year sale
- 1 1 1 2012 55
- 2 4 4 2014 40
- 3 9 7 2013 84
- 4 16 10 2014 31
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- self._check_inplace_and_allows_duplicate_labels(inplace)
- if not isinstance(keys, list):
- keys = [keys]
-
- err_msg = (
- 'The parameter "keys" may be a column key, one-dimensional '
- "array, or a list containing only valid column keys and "
- "one-dimensional arrays."
- )
-
- missing: list[Hashable] = []
- for col in keys:
- if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
- # arrays are fine as long as they are one-dimensional
- # iterators get converted to list below
- if getattr(col, "ndim", 1) != 1:
- raise ValueError(err_msg)
- else:
- # everything else gets tried as a key; see GH 24969
- try:
- found = col in self.columns
- except TypeError as err:
- raise TypeError(
- f"{err_msg}. Received column of type {type(col)}"
- ) from err
- else:
- if not found:
- missing.append(col)
-
- if missing:
- raise KeyError(f"None of {missing} are in the columns")
-
- if inplace:
- frame = self
- else:
- # GH 49473 Use "lazy copy" with Copy-on-Write
- frame = self.copy(deep=None)
-
- arrays = []
- names: list[Hashable] = []
- if append:
- names = list(self.index.names)
- if isinstance(self.index, MultiIndex):
- for i in range(self.index.nlevels):
- arrays.append(self.index._get_level_values(i))
- else:
- arrays.append(self.index)
-
- to_remove: list[Hashable] = []
- for col in keys:
- if isinstance(col, MultiIndex):
- for n in range(col.nlevels):
- arrays.append(col._get_level_values(n))
- names.extend(col.names)
- elif isinstance(col, (Index, Series)):
- # if Index then not MultiIndex (treated above)
-
- # error: Argument 1 to "append" of "list" has incompatible type
- # "Union[Index, Series]"; expected "Index"
- arrays.append(col) # type:ignore[arg-type]
- names.append(col.name)
- elif isinstance(col, (list, np.ndarray)):
- # error: Argument 1 to "append" of "list" has incompatible type
- # "Union[List[Any], ndarray]"; expected "Index"
- arrays.append(col) # type: ignore[arg-type]
- names.append(None)
- elif isinstance(col, abc.Iterator):
- # error: Argument 1 to "append" of "list" has incompatible type
- # "List[Any]"; expected "Index"
- arrays.append(list(col)) # type: ignore[arg-type]
- names.append(None)
- # from here, col can only be a column label
- else:
- arrays.append(frame[col])
- names.append(col)
- if drop:
- to_remove.append(col)
-
- if len(arrays[-1]) != len(self):
- # check newest element against length of calling frame, since
- # ensure_index_from_sequences would not raise for append=False.
- raise ValueError(
- f"Length mismatch: Expected {len(self)} rows, "
- f"received array of length {len(arrays[-1])}"
- )
-
- index = ensure_index_from_sequences(arrays, names)
-
- if verify_integrity and not index.is_unique:
- duplicates = index[index.duplicated()].unique()
- raise ValueError(f"Index has duplicate keys: {duplicates}")
-
- # use set to handle duplicate column names gracefully in case of drop
- for c in set(to_remove):
- del frame[c]
-
- # clear up memory usage
- index._cleanup()
-
- frame.index = index
-
- if not inplace:
- return frame
- return None
-
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: Literal[False] = ...,
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] = None,
- ) -> DataFrame:
- ...
-
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: Literal[True],
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] = None,
- ) -> None:
- ...
-
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: bool = ...,
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] = None,
- ) -> DataFrame | None:
- ...
-
- def reset_index(
- self,
- level: IndexLabel = None,
- *,
- drop: bool = False,
- inplace: bool = False,
- col_level: Hashable = 0,
- col_fill: Hashable = "",
- allow_duplicates: bool | lib.NoDefault = lib.no_default,
- names: Hashable | Sequence[Hashable] = None,
- ) -> DataFrame | None:
- """
- Reset the index, or a level of it.
-
- Reset the index of the DataFrame, and use the default one instead.
- If the DataFrame has a MultiIndex, this method can remove one or more
- levels.
-
- Parameters
- ----------
- level : int, str, tuple, or list, default None
- Only remove the given levels from the index. Removes all levels by
- default.
- drop : bool, default False
- Do not try to insert index into dataframe columns. This resets
- the index to the default integer index.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- col_level : int or str, default 0
- If the columns have multiple levels, determines which level the
- labels are inserted into. By default it is inserted into the first
- level.
- col_fill : object, default ''
- If the columns have multiple levels, determines how the other
- levels are named. If None then the index name is repeated.
- allow_duplicates : bool, optional, default lib.no_default
- Allow duplicate column labels to be created.
-
- .. versionadded:: 1.5.0
-
- names : int, str or 1-dimensional list, default None
- Using the given string, rename the DataFrame column which contains the
- index data. If the DataFrame has a MultiIndex, this has to be a list or
- tuple with length equal to the number of levels.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- DataFrame or None
- DataFrame with the new index or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.set_index : Opposite of reset_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
-
- Examples
- --------
- >>> df = pd.DataFrame([('bird', 389.0),
- ... ('bird', 24.0),
- ... ('mammal', 80.5),
- ... ('mammal', np.nan)],
- ... index=['falcon', 'parrot', 'lion', 'monkey'],
- ... columns=('class', 'max_speed'))
- >>> df
- class max_speed
- falcon bird 389.0
- parrot bird 24.0
- lion mammal 80.5
- monkey mammal NaN
-
- When we reset the index, the old index is added as a column, and a
- new sequential index is used:
-
- >>> df.reset_index()
- index class max_speed
- 0 falcon bird 389.0
- 1 parrot bird 24.0
- 2 lion mammal 80.5
- 3 monkey mammal NaN
-
- We can use the `drop` parameter to avoid the old index being added as
- a column:
-
- >>> df.reset_index(drop=True)
- class max_speed
- 0 bird 389.0
- 1 bird 24.0
- 2 mammal 80.5
- 3 mammal NaN
-
- You can also use `reset_index` with `MultiIndex`.
-
- >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
- ... ('bird', 'parrot'),
- ... ('mammal', 'lion'),
- ... ('mammal', 'monkey')],
- ... names=['class', 'name'])
- >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
- ... ('species', 'type')])
- >>> df = pd.DataFrame([(389.0, 'fly'),
- ... (24.0, 'fly'),
- ... (80.5, 'run'),
- ... (np.nan, 'jump')],
- ... index=index,
- ... columns=columns)
- >>> df
- speed species
- max type
- class name
- bird falcon 389.0 fly
- parrot 24.0 fly
- mammal lion 80.5 run
- monkey NaN jump
-
- Using the `names` parameter, choose a name for the index column:
-
- >>> df.reset_index(names=['classes', 'names'])
- classes names speed species
- max type
- 0 bird falcon 389.0 fly
- 1 bird parrot 24.0 fly
- 2 mammal lion 80.5 run
- 3 mammal monkey NaN jump
-
- If the index has multiple levels, we can reset a subset of them:
-
- >>> df.reset_index(level='class')
- class speed species
- max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
-
- If we are not dropping the index, by default, it is placed in the top
- level. We can place it in another level:
-
- >>> df.reset_index(level='class', col_level=1)
- speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
-
- When the index is inserted under another level, we can specify under
- which one with the parameter `col_fill`:
-
- >>> df.reset_index(level='class', col_level=1, col_fill='species')
- species speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
-
- If we specify a nonexistent level for `col_fill`, it is created:
-
- >>> df.reset_index(level='class', col_level=1, col_fill='genus')
- genus speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- self._check_inplace_and_allows_duplicate_labels(inplace)
- if inplace:
- new_obj = self
- else:
- new_obj = self.copy(deep=None)
- if allow_duplicates is not lib.no_default:
- allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
-
- new_index = default_index(len(new_obj))
- if level is not None:
- if not isinstance(level, (tuple, list)):
- level = [level]
- level = [self.index._get_level_number(lev) for lev in level]
- if len(level) < self.index.nlevels:
- new_index = self.index.droplevel(level)
-
- if not drop:
- to_insert: Iterable[tuple[Any, Any | None]]
-
- default = "index" if "index" not in self else "level_0"
- names = self.index._get_default_index_names(names, default)
-
- if isinstance(self.index, MultiIndex):
- to_insert = zip(self.index.levels, self.index.codes)
- else:
- to_insert = ((self.index, None),)
-
- multi_col = isinstance(self.columns, MultiIndex)
- for i, (lev, lab) in reversed(list(enumerate(to_insert))):
- if level is not None and i not in level:
- continue
- name = names[i]
- if multi_col:
- col_name = list(name) if isinstance(name, tuple) else [name]
- if col_fill is None:
- if len(col_name) not in (1, self.columns.nlevels):
- raise ValueError(
- "col_fill=None is incompatible "
- f"with incomplete column name {name}"
- )
- col_fill = col_name[0]
-
- lev_num = self.columns._get_level_number(col_level)
- name_lst = [col_fill] * lev_num + col_name
- missing = self.columns.nlevels - len(name_lst)
- name_lst += [col_fill] * missing
- name = tuple(name_lst)
-
- # to ndarray and maybe infer different dtype
- level_values = lev._values
- if level_values.dtype == np.object_:
- level_values = lib.maybe_convert_objects(level_values)
-
- if lab is not None:
- # if we have the codes, extract the values with a mask
- level_values = algorithms.take(
- level_values, lab, allow_fill=True, fill_value=lev._na_value
- )
-
- new_obj.insert(
- 0,
- name,
- level_values,
- allow_duplicates=allow_duplicates,
- )
-
- new_obj.index = new_index
- if not inplace:
- return new_obj
-
- return None
-
- # ----------------------------------------------------------------------
- # Reindex-based selection methods
-
- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
- def isna(self) -> DataFrame:
- result = self._constructor(self._mgr.isna(func=isna))
- return result.__finalize__(self, method="isna")
-
- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
- def isnull(self) -> DataFrame:
- """
- DataFrame.isnull is an alias for DataFrame.isna.
- """
- return self.isna()
-
- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
- def notna(self) -> DataFrame:
- return ~self.isna()
-
- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
- def notnull(self) -> DataFrame:
- """
- DataFrame.notnull is an alias for DataFrame.notna.
- """
- return ~self.isna()
-
- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- how: AnyAll | NoDefault = ...,
- thresh: int | NoDefault = ...,
- subset: IndexLabel = ...,
- inplace: Literal[False] = ...,
- ignore_index: bool = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- how: AnyAll | NoDefault = ...,
- thresh: int | NoDefault = ...,
- subset: IndexLabel = ...,
- inplace: Literal[True],
- ignore_index: bool = ...,
- ) -> None:
- ...
-
- def dropna(
- self,
- *,
- axis: Axis = 0,
- how: AnyAll | NoDefault = no_default,
- thresh: int | NoDefault = no_default,
- subset: IndexLabel = None,
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> DataFrame | None:
- """
- Remove missing values.
-
- See the :ref:`User Guide <missing_data>` for more on which values are
- considered missing, and how to work with missing data.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine if rows or columns which contain missing values are
- removed.
-
- * 0, or 'index' : Drop rows which contain missing values.
- * 1, or 'columns' : Drop columns which contain missing value.
-
-            Only a single axis is allowed; passing a tuple or list of axes
-            raises a ``TypeError``.
-
- how : {'any', 'all'}, default 'any'
-            Determine whether a row or column is removed from the DataFrame
-            when it has at least one NA or all NA values.
-
- * 'any' : If any NA values are present, drop that row or column.
- * 'all' : If all values are NA, drop that row or column.
-
- thresh : int, optional
-            Require at least this many non-NA values in order to keep a
-            row/column. Cannot be combined with ``how``.
- subset : column label or sequence of labels, optional
- Labels along other axis to consider, e.g. if you are dropping rows
- these would be a list of columns to include.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- DataFrame or None
- DataFrame with NA entries dropped from it or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.isna: Indicate missing values.
- DataFrame.notna : Indicate existing (non-missing) values.
- DataFrame.fillna : Replace missing values.
- Series.dropna : Drop missing values.
- Index.dropna : Drop missing indices.
-
- Examples
- --------
- >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
- ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
- ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
- ... pd.NaT]})
- >>> df
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
-
- Drop the rows where at least one element is missing.
-
- >>> df.dropna()
- name toy born
- 1 Batman Batmobile 1940-04-25
-
- Drop the columns where at least one element is missing.
-
- >>> df.dropna(axis='columns')
- name
- 0 Alfred
- 1 Batman
- 2 Catwoman
-
- Drop the rows where all elements are missing.
-
- >>> df.dropna(how='all')
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
-
- Keep only the rows with at least 2 non-NA values.
-
- >>> df.dropna(thresh=2)
- name toy born
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
-
- Define in which columns to look for missing values.
-
- >>> df.dropna(subset=['name', 'toy'])
- name toy born
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
- """
- if (how is not no_default) and (thresh is not no_default):
- raise TypeError(
- "You cannot set both the how and thresh arguments at the same time."
- )
-
- if how is no_default:
- how = "any"
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- if isinstance(axis, (tuple, list)):
- # GH20987
- raise TypeError("supplying multiple axes to axis is no longer supported.")
-
- axis = self._get_axis_number(axis)
- agg_axis = 1 - axis
-
- agg_obj = self
- if subset is not None:
- # subset needs to be list
- if not is_list_like(subset):
- subset = [subset]
- ax = self._get_axis(agg_axis)
- indices = ax.get_indexer_for(subset)
- check = indices == -1
- if check.any():
- raise KeyError(np.array(subset)[check].tolist())
- agg_obj = self.take(indices, axis=agg_axis)
-
- if thresh is not no_default:
- count = agg_obj.count(axis=agg_axis)
- mask = count >= thresh
- elif how == "any":
- # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
- mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
- elif how == "all":
- # faster equivalent to 'agg_obj.count(agg_axis) > 0'
- mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
- else:
- raise ValueError(f"invalid how option: {how}")
-
- if np.all(mask):
- result = self.copy(deep=None)
- else:
- result = self.loc(axis=axis)[mask]
-
- if ignore_index:
- result.index = default_index(len(result))
-
- if not inplace:
- return result
- self._update_inplace(result)
- return None
-
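-    # Editor's sketch (conceptual equivalent using only public API): for the
-    # default ``axis=0`` the keep-mask built above amounts to
-    #
-    #     >>> keep = df.notna().all(axis=1)              # how="any"
-    #     >>> keep = df.notna().any(axis=1)              # how="all"
-    #     >>> keep = df.notna().sum(axis=1) >= thresh    # thresh given
-    #
-    # after which the result is essentially ``df.loc[keep]`` (or an in-place
-    # update when ``inplace=True``).  ``thresh`` here stands for the integer
-    # passed by the caller.
-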
- def drop_duplicates(
- self,
- subset: Hashable | Sequence[Hashable] | None = None,
- *,
- keep: DropKeep = "first",
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> DataFrame | None:
- """
- Return DataFrame with duplicate rows removed.
-
-        Considering certain columns is optional. Indexes, including time
-        indexes, are ignored.
-
- Parameters
- ----------
- subset : column label or sequence of labels, optional
-            Only consider certain columns for identifying duplicates;
-            by default, use all of the columns.
- keep : {'first', 'last', ``False``}, default 'first'
- Determines which duplicates (if any) to keep.
-
- - 'first' : Drop duplicates except for the first occurrence.
- - 'last' : Drop duplicates except for the last occurrence.
- - ``False`` : Drop all duplicates.
-
- inplace : bool, default ``False``
- Whether to modify the DataFrame rather than creating a new one.
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
-
- Returns
- -------
- DataFrame or None
- DataFrame with duplicates removed or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.value_counts: Count unique combinations of columns.
-
- Examples
- --------
- Consider dataset containing ramen rating.
-
- >>> df = pd.DataFrame({
- ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
- ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
- ... 'rating': [4, 4, 3.5, 15, 5]
- ... })
- >>> df
- brand style rating
- 0 Yum Yum cup 4.0
- 1 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 3 Indomie pack 15.0
- 4 Indomie pack 5.0
-
- By default, it removes duplicate rows based on all columns.
-
- >>> df.drop_duplicates()
- brand style rating
- 0 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 3 Indomie pack 15.0
- 4 Indomie pack 5.0
-
- To remove duplicates on specific column(s), use ``subset``.
-
- >>> df.drop_duplicates(subset=['brand'])
- brand style rating
- 0 Yum Yum cup 4.0
- 2 Indomie cup 3.5
-
- To remove duplicates and keep last occurrences, use ``keep``.
-
- >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
- brand style rating
- 1 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 4 Indomie pack 5.0
- """
- if self.empty:
- return self.copy(deep=None)
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
-
- result = self[-self.duplicated(subset, keep=keep)]
- if ignore_index:
- result.index = default_index(len(result))
-
- if inplace:
- self._update_inplace(result)
- return None
- else:
- return result
-
- def duplicated(
- self,
- subset: Hashable | Sequence[Hashable] | None = None,
- keep: DropKeep = "first",
- ) -> Series:
- """
- Return boolean Series denoting duplicate rows.
-
- Considering certain columns is optional.
-
- Parameters
- ----------
- subset : column label or sequence of labels, optional
-            Only consider certain columns for identifying duplicates;
-            by default, use all of the columns.
- keep : {'first', 'last', False}, default 'first'
- Determines which duplicates (if any) to mark.
-
- - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
- - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
- - False : Mark all duplicates as ``True``.
-
- Returns
- -------
- Series
-            Boolean Series indicating whether each row is a duplicate.
-
- See Also
- --------
- Index.duplicated : Equivalent method on index.
- Series.duplicated : Equivalent method on Series.
- Series.drop_duplicates : Remove duplicate values from Series.
- DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
-
- Examples
- --------
- Consider dataset containing ramen rating.
-
- >>> df = pd.DataFrame({
- ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
- ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
- ... 'rating': [4, 4, 3.5, 15, 5]
- ... })
- >>> df
- brand style rating
- 0 Yum Yum cup 4.0
- 1 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 3 Indomie pack 15.0
- 4 Indomie pack 5.0
-
-        By default, for each set of duplicated values, the first occurrence
-        is marked ``False`` and all others ``True``.
-
- >>> df.duplicated()
- 0 False
- 1 True
- 2 False
- 3 False
- 4 False
- dtype: bool
-
-        By using 'last', the last occurrence of each set of duplicated values
-        is marked ``False`` and all others ``True``.
-
- >>> df.duplicated(keep='last')
- 0 True
- 1 False
- 2 False
- 3 False
- 4 False
- dtype: bool
-
-        By setting ``keep`` to ``False``, all duplicates are marked ``True``.
-
- >>> df.duplicated(keep=False)
- 0 True
- 1 True
- 2 False
- 3 False
- 4 False
- dtype: bool
-
- To find duplicates on specific column(s), use ``subset``.
-
- >>> df.duplicated(subset=['brand'])
- 0 False
- 1 True
- 2 False
- 3 True
- 4 True
- dtype: bool
- """
-
- if self.empty:
- return self._constructor_sliced(dtype=bool)
-
- def f(vals) -> tuple[np.ndarray, int]:
- labels, shape = algorithms.factorize(vals, size_hint=len(self))
- return labels.astype("i8", copy=False), len(shape)
-
- if subset is None:
- # https://github.com/pandas-dev/pandas/issues/28770
- # Incompatible types in assignment (expression has type "Index", variable
- # has type "Sequence[Any]")
- subset = self.columns # type: ignore[assignment]
- elif (
- not np.iterable(subset)
- or isinstance(subset, str)
- or isinstance(subset, tuple)
- and subset in self.columns
- ):
- subset = (subset,)
-
- # needed for mypy since can't narrow types using np.iterable
- subset = cast(Sequence, subset)
-
- # Verify all columns in subset exist in the queried dataframe
- # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
- # key that doesn't exist.
- diff = set(subset) - set(self.columns)
- if diff:
- raise KeyError(Index(diff))
-
- if len(subset) == 1 and self.columns.is_unique:
- # GH#45236 This is faster than get_group_index below
- result = self[subset[0]].duplicated(keep)
- result.name = None
- else:
- vals = (col.values for name, col in self.items() if name in subset)
- labels, shape = map(list, zip(*map(f, vals)))
-
- ids = get_group_index(
- labels,
- # error: Argument 1 to "tuple" has incompatible type "List[_T]";
- # expected "Iterable[int]"
- tuple(shape), # type: ignore[arg-type]
- sort=False,
- xnull=False,
- )
- result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
- return result.__finalize__(self, method="duplicated")
-
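-    # Editor's sketch (conceptual analogue, not the exact internals): each
-    # subset column is factorized to integer codes and the per-row codes are
-    # combined into one group id, so duplicate detection reduces to duplicate
-    # detection on a single integer array.  Roughly, for the docstring example:
-    #
-    #     >>> codes_a, uniques_a = pd.factorize(df["brand"])
-    #     >>> codes_b, uniques_b = pd.factorize(df["style"])
-    #     >>> row_ids = codes_a * len(uniques_b) + codes_b
-    #     >>> pd.Series(row_ids).duplicated(keep="first")
-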
- # ----------------------------------------------------------------------
- # Sorting
- # error: Signature of "sort_values" incompatible with supertype "NDFrame"
- @overload # type: ignore[override]
- def sort_values(
- self,
- by: IndexLabel,
- *,
- axis: Axis = ...,
- ascending=...,
- inplace: Literal[False] = ...,
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool = ...,
- key: ValueKeyFunc = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def sort_values(
- self,
- by: IndexLabel,
- *,
- axis: Axis = ...,
- ascending=...,
- inplace: Literal[True],
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool = ...,
- key: ValueKeyFunc = ...,
- ) -> None:
- ...
-
- # TODO: Just move the sort_values doc here.
- @Substitution(**_shared_doc_kwargs)
- @Appender(NDFrame.sort_values.__doc__)
- def sort_values(
- self,
- by: IndexLabel,
- *,
- axis: Axis = 0,
- ascending: bool | list[bool] | tuple[bool, ...] = True,
- inplace: bool = False,
- kind: str = "quicksort",
- na_position: str = "last",
- ignore_index: bool = False,
- key: ValueKeyFunc = None,
- ) -> DataFrame | None:
- inplace = validate_bool_kwarg(inplace, "inplace")
- axis = self._get_axis_number(axis)
- ascending = validate_ascending(ascending)
- if not isinstance(by, list):
- by = [by]
- # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
- # expected "Sized"
- if is_sequence(ascending) and (
- len(by) != len(ascending) # type: ignore[arg-type]
- ):
- # error: Argument 1 to "len" has incompatible type "Union[bool,
- # List[bool]]"; expected "Sized"
- raise ValueError(
- f"Length of ascending ({len(ascending)})" # type: ignore[arg-type]
- f" != length of by ({len(by)})"
- )
- if len(by) > 1:
- keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
-
- # need to rewrap columns in Series to apply key function
- if key is not None:
- # error: List comprehension has incompatible type List[Series];
- # expected List[ndarray]
- keys = [
- Series(k, name=name) # type: ignore[misc]
- for (k, name) in zip(keys, by)
- ]
-
- indexer = lexsort_indexer(
- keys, orders=ascending, na_position=na_position, key=key
- )
- elif len(by):
- # len(by) == 1
-
- by = by[0]
- k = self._get_label_or_level_values(by, axis=axis)
-
- # need to rewrap column in Series to apply key function
- if key is not None:
- # error: Incompatible types in assignment (expression has type
- # "Series", variable has type "ndarray")
- k = Series(k, name=by) # type: ignore[assignment]
-
- if isinstance(ascending, (tuple, list)):
- ascending = ascending[0]
-
- indexer = nargsort(
- k, kind=kind, ascending=ascending, na_position=na_position, key=key
- )
- else:
- if inplace:
- return self._update_inplace(self)
- else:
- return self.copy(deep=None)
-
- if is_range_indexer(indexer, len(indexer)):
- result = self.copy(deep=(not inplace and not using_copy_on_write()))
- if ignore_index:
- result.index = default_index(len(result))
-
- if inplace:
- return self._update_inplace(result)
- else:
- return result
-
- new_data = self._mgr.take(
- indexer, axis=self._get_block_manager_axis(axis), verify=False
- )
-
- if ignore_index:
- new_data.set_axis(
- self._get_block_manager_axis(axis), default_index(len(indexer))
- )
-
- result = self._constructor(new_data)
- if inplace:
- return self._update_inplace(result)
- else:
- return result.__finalize__(self, method="sort_values")
-
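-    # Editor's sketch (conceptual analogue of the multi-key branch above, not
-    # the exact internals): sorting by several columns builds one positional
-    # indexer from the stacked keys; with NumPy the same effect is, for a
-    # hypothetical frame with columns "A" and "B",
-    #
-    #     >>> import numpy as np
-    #     >>> order = np.lexsort((df["B"].to_numpy(), df["A"].to_numpy()))
-    #     >>> df.take(order)   # sorted by "A" first, then "B", both ascending
-    #
-    # ``lexsort_indexer`` additionally handles per-key ``ascending``,
-    # ``na_position`` and the optional ``key`` callable.
-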
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: Literal[True],
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> None:
- ...
-
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: Literal[False] = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: bool = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> DataFrame | None:
- ...
-
- def sort_index(
- self,
- *,
- axis: Axis = 0,
- level: IndexLabel = None,
- ascending: bool | Sequence[bool] = True,
- inplace: bool = False,
- kind: SortKind = "quicksort",
- na_position: NaPosition = "last",
- sort_remaining: bool = True,
- ignore_index: bool = False,
- key: IndexKeyFunc = None,
- ) -> DataFrame | None:
- """
- Sort object by labels (along an axis).
-
- Returns a new DataFrame sorted by label if `inplace` argument is
- ``False``, otherwise updates the original DataFrame and returns None.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis along which to sort. The value 0 identifies the rows,
- and 1 identifies the columns.
- level : int or level name or list of ints or list of level names
- If not None, sort on values in specified index level(s).
- ascending : bool or list-like of bools, default True
- Sort ascending vs. descending. When the index is a MultiIndex the
- sort direction can be controlled for each level individually.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
- Choice of sorting algorithm. See also :func:`numpy.sort` for more
- information. `mergesort` and `stable` are the only stable algorithms. For
- DataFrames, this option is only applied when sorting on a single
- column or label.
- na_position : {'first', 'last'}, default 'last'
- Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
- Not implemented for MultiIndex.
- sort_remaining : bool, default True
- If True and sorting by level and index is multilevel, sort by other
- levels too (in order) after sorting by specified level.
- ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- key : callable, optional
- If not None, apply the key function to the index values
- before sorting. This is similar to the `key` argument in the
- builtin :meth:`sorted` function, with the notable difference that
- this `key` function should be *vectorized*. It should expect an
- ``Index`` and return an ``Index`` of the same shape. For MultiIndex
- inputs, the key is applied *per level*.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- DataFrame or None
- The original DataFrame sorted by the labels or None if ``inplace=True``.
-
- See Also
- --------
- Series.sort_index : Sort Series by the index.
- DataFrame.sort_values : Sort DataFrame by the value.
- Series.sort_values : Sort Series by the value.
-
- Examples
- --------
- >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
- ... columns=['A'])
- >>> df.sort_index()
- A
- 1 4
- 29 2
- 100 1
- 150 5
- 234 3
-
-        By default, it sorts in ascending order; to sort in descending order,
-        use ``ascending=False``.
-
- >>> df.sort_index(ascending=False)
- A
- 234 3
- 150 5
- 100 1
- 29 2
- 1 4
-
- A key function can be specified which is applied to the index before
- sorting. For a ``MultiIndex`` this is applied to each level separately.
-
- >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
- >>> df.sort_index(key=lambda x: x.str.lower())
- a
- A 1
- b 2
- C 3
- d 4
- """
- return super().sort_index(
- axis=axis,
- level=level,
- ascending=ascending,
- inplace=inplace,
- kind=kind,
- na_position=na_position,
- sort_remaining=sort_remaining,
- ignore_index=ignore_index,
- key=key,
- )
-
- def value_counts(
- self,
- subset: Sequence[Hashable] | None = None,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- dropna: bool = True,
- ) -> Series:
- """
- Return a Series containing counts of unique rows in the DataFrame.
-
- .. versionadded:: 1.1.0
-
- Parameters
- ----------
- subset : label or list of labels, optional
- Columns to use when counting unique combinations.
- normalize : bool, default False
- Return proportions rather than frequencies.
- sort : bool, default True
- Sort by frequencies.
- ascending : bool, default False
- Sort in ascending order.
- dropna : bool, default True
-            Don't include counts of rows that contain NA values.
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- Series
-
- See Also
- --------
- Series.value_counts: Equivalent method on Series.
-
- Notes
- -----
- The returned Series will have a MultiIndex with one level per input
- column but an Index (non-multi) for a single label. By default, rows
- that contain any NA values are omitted from the result. By default,
- the resulting Series will be in descending order so that the first
- element is the most frequently-occurring row.
-
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
- ... 'num_wings': [2, 0, 0, 0]},
- ... index=['falcon', 'dog', 'cat', 'ant'])
- >>> df
- num_legs num_wings
- falcon 2 2
- dog 4 0
- cat 4 0
- ant 6 0
-
- >>> df.value_counts()
- num_legs num_wings
- 4 0 2
- 2 2 1
- 6 0 1
- Name: count, dtype: int64
-
- >>> df.value_counts(sort=False)
- num_legs num_wings
- 2 2 1
- 4 0 2
- 6 0 1
- Name: count, dtype: int64
-
- >>> df.value_counts(ascending=True)
- num_legs num_wings
- 2 2 1
- 6 0 1
- 4 0 2
- Name: count, dtype: int64
-
- >>> df.value_counts(normalize=True)
- num_legs num_wings
- 4 0 0.50
- 2 2 0.25
- 6 0 0.25
- Name: proportion, dtype: float64
-
- With `dropna` set to `False` we can also count rows with NA values.
-
- >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
- ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
- >>> df
- first_name middle_name
- 0 John Smith
- 1 Anne <NA>
- 2 John <NA>
- 3 Beth Louise
-
- >>> df.value_counts()
- first_name middle_name
- Beth Louise 1
- John Smith 1
- Name: count, dtype: int64
-
- >>> df.value_counts(dropna=False)
- first_name middle_name
- Anne NaN 1
- Beth Louise 1
- John Smith 1
- NaN 1
- Name: count, dtype: int64
-
- >>> df.value_counts("first_name")
- first_name
- John 2
- Anne 1
- Beth 1
- Name: count, dtype: int64
- """
- if subset is None:
- subset = self.columns.tolist()
-
- name = "proportion" if normalize else "count"
- counts = self.groupby(subset, dropna=dropna).grouper.size()
- counts.name = name
-
- if sort:
- counts = counts.sort_values(ascending=ascending)
- if normalize:
- counts /= counts.sum()
-
- # Force MultiIndex for single column
- if is_list_like(subset) and len(subset) == 1:
- counts.index = MultiIndex.from_arrays(
- [counts.index], names=[counts.index.name]
- )
-
- return counts
-
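-    # Editor's note (conceptual, public API only): the implementation above is
-    # essentially a grouped size, e.g. for the docstring example
-    #
-    #     >>> df.groupby(["num_legs", "num_wings"], dropna=True).size()
-    #
-    # followed by an optional sort, division by the total for
-    # ``normalize=True``, and wrapping a single-label subset in a MultiIndex.
-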
- def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
- """
- Return the first `n` rows ordered by `columns` in descending order.
-
- Return the first `n` rows with the largest values in `columns`, in
- descending order. The columns that are not specified are returned as
- well, but not used for ordering.
-
- This method is equivalent to
- ``df.sort_values(columns, ascending=False).head(n)``, but more
- performant.
-
- Parameters
- ----------
- n : int
- Number of rows to return.
- columns : label or list of labels
- Column label(s) to order by.
- keep : {'first', 'last', 'all'}, default 'first'
- Where there are duplicate values:
-
- - ``first`` : prioritize the first occurrence(s)
- - ``last`` : prioritize the last occurrence(s)
-            - ``all`` : do not drop any duplicates, even if it means
-              selecting more than `n` items.
-
- Returns
- -------
- DataFrame
- The first `n` rows ordered by the given columns in descending
- order.
-
- See Also
- --------
- DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
- ascending order.
- DataFrame.sort_values : Sort DataFrame by the values.
- DataFrame.head : Return the first `n` rows without re-ordering.
-
- Notes
- -----
- This function cannot be used with all column types. For example, when
- specifying columns with `object` or `category` dtypes, ``TypeError`` is
- raised.
-
- Examples
- --------
- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
- ... 434000, 434000, 337000, 11300,
- ... 11300, 11300],
- ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
- ... 17036, 182, 38, 311],
- ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
- ... "IS", "NR", "TV", "AI"]},
- ... index=["Italy", "France", "Malta",
- ... "Maldives", "Brunei", "Iceland",
- ... "Nauru", "Tuvalu", "Anguilla"])
- >>> df
- population GDP alpha-2
- Italy 59000000 1937894 IT
- France 65000000 2583560 FR
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- Iceland 337000 17036 IS
- Nauru 11300 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
-
- In the following example, we will use ``nlargest`` to select the three
- rows having the largest values in column "population".
-
- >>> df.nlargest(3, 'population')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Malta 434000 12011 MT
-
- When using ``keep='last'``, ties are resolved in reverse order:
-
- >>> df.nlargest(3, 'population', keep='last')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Brunei 434000 12128 BN
-
- When using ``keep='all'``, all duplicate items are maintained:
-
- >>> df.nlargest(3, 'population', keep='all')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
-
- To order by the largest values in column "population" and then "GDP",
- we can specify multiple columns like in the next example.
-
- >>> df.nlargest(3, ['population', 'GDP'])
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Brunei 434000 12128 BN
- """
- return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
-
- def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
- """
- Return the first `n` rows ordered by `columns` in ascending order.
-
- Return the first `n` rows with the smallest values in `columns`, in
- ascending order. The columns that are not specified are returned as
- well, but not used for ordering.
-
- This method is equivalent to
- ``df.sort_values(columns, ascending=True).head(n)``, but more
- performant.
-
- Parameters
- ----------
- n : int
- Number of items to retrieve.
- columns : list or str
- Column name or names to order by.
- keep : {'first', 'last', 'all'}, default 'first'
- Where there are duplicate values:
-
- - ``first`` : take the first occurrence.
- - ``last`` : take the last occurrence.
-            - ``all`` : do not drop any duplicates, even if it means
-              selecting more than `n` items.
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
- descending order.
- DataFrame.sort_values : Sort DataFrame by the values.
- DataFrame.head : Return the first `n` rows without re-ordering.
-
- Examples
- --------
- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
- ... 434000, 434000, 337000, 337000,
- ... 11300, 11300],
- ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
- ... 17036, 182, 38, 311],
- ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
- ... "IS", "NR", "TV", "AI"]},
- ... index=["Italy", "France", "Malta",
- ... "Maldives", "Brunei", "Iceland",
- ... "Nauru", "Tuvalu", "Anguilla"])
- >>> df
- population GDP alpha-2
- Italy 59000000 1937894 IT
- France 65000000 2583560 FR
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- Iceland 337000 17036 IS
- Nauru 337000 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
-
- In the following example, we will use ``nsmallest`` to select the
- three rows having the smallest values in column "population".
-
- >>> df.nsmallest(3, 'population')
- population GDP alpha-2
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- Iceland 337000 17036 IS
-
- When using ``keep='last'``, ties are resolved in reverse order:
-
- >>> df.nsmallest(3, 'population', keep='last')
- population GDP alpha-2
- Anguilla 11300 311 AI
- Tuvalu 11300 38 TV
- Nauru 337000 182 NR
-
- When using ``keep='all'``, all duplicate items are maintained:
-
- >>> df.nsmallest(3, 'population', keep='all')
- population GDP alpha-2
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- Iceland 337000 17036 IS
- Nauru 337000 182 NR
-
- To order by the smallest values in column "population" and then "GDP", we can
- specify multiple columns like in the next example.
-
- >>> df.nsmallest(3, ['population', 'GDP'])
- population GDP alpha-2
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- Nauru 337000 182 NR
- """
- return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
-
- @doc(
- Series.swaplevel,
- klass=_shared_doc_kwargs["klass"],
- extra_params=dedent(
- """axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to swap levels on. 0 or 'index' for row-wise, 1 or
- 'columns' for column-wise."""
- ),
- examples=dedent(
- """\
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {"Grade": ["A", "B", "A", "C"]},
- ... index=[
- ... ["Final exam", "Final exam", "Coursework", "Coursework"],
- ... ["History", "Geography", "History", "Geography"],
- ... ["January", "February", "March", "April"],
- ... ],
- ... )
- >>> df
- Grade
- Final exam History January A
- Geography February B
- Coursework History March A
- Geography April C
-
-            In the following example, we will swap the levels of the row
-            index (``axis=0``, the default); column levels can be swapped in
-            a similar manner by passing ``axis=1``. By not supplying any
-            arguments for i and j, we swap the last and second to last
-            levels.
-
- >>> df.swaplevel()
- Grade
- Final exam January History A
- February Geography B
- Coursework March History A
- April Geography C
-
- By supplying one argument, we can choose which index to swap the last
- index with. We can for example swap the first index with the last one as
- follows.
-
- >>> df.swaplevel(0)
- Grade
- January History Final exam A
- February Geography Final exam B
- March History Coursework A
- April Geography Coursework C
-
- We can also define explicitly which indices we want to swap by supplying values
- for both i and j. Here, we for example swap the first and second indices.
-
- >>> df.swaplevel(0, 1)
- Grade
- History Final exam January A
- Geography Final exam February B
- History Coursework March A
- Geography Coursework April C"""
- ),
- )
- def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
- result = self.copy(deep=None)
-
- axis = self._get_axis_number(axis)
-
- if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
- raise TypeError("Can only swap levels on a hierarchical axis.")
-
- if axis == 0:
- assert isinstance(result.index, MultiIndex)
- result.index = result.index.swaplevel(i, j)
- else:
- assert isinstance(result.columns, MultiIndex)
- result.columns = result.columns.swaplevel(i, j)
- return result
-
- def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
- """
- Rearrange index levels using input order. May not drop or duplicate levels.
-
- Parameters
- ----------
- order : list of int or list of str
- List representing new level order. Reference level by number
- (position) or by key (label).
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Where to reorder levels.
-
- Returns
- -------
- DataFrame
-
- Examples
- --------
- >>> data = {
- ... "class": ["Mammals", "Mammals", "Reptiles"],
- ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
- ... "species": ["Humans", "Dogs", "Snakes"],
- ... }
- >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
- >>> df = df.set_index(["class", "diet"])
- >>> df
- species
- class diet
- Mammals Omnivore Humans
- Carnivore Dogs
- Reptiles Carnivore Snakes
-
- Let's reorder the levels of the index:
-
- >>> df.reorder_levels(["diet", "class"])
- species
- diet class
- Omnivore Mammals Humans
- Carnivore Mammals Dogs
- Reptiles Snakes
- """
- axis = self._get_axis_number(axis)
- if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
- raise TypeError("Can only reorder levels on a hierarchical axis.")
-
- result = self.copy(deep=None)
-
- if axis == 0:
- assert isinstance(result.index, MultiIndex)
- result.index = result.index.reorder_levels(order)
- else:
- assert isinstance(result.columns, MultiIndex)
- result.columns = result.columns.reorder_levels(order)
- return result
-
- # ----------------------------------------------------------------------
- # Arithmetic Methods
-
- def _cmp_method(self, other, op):
- axis: Literal[1] = 1 # only relevant for Series other case
-
- self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
-
- # See GH#4537 for discussion of scalar op behavior
- new_data = self._dispatch_frame_op(other, op, axis=axis)
- return self._construct_result(new_data)
-
- def _arith_method(self, other, op):
- if ops.should_reindex_frame_op(self, other, op, 1, None, None):
- return ops.frame_arith_method_with_reindex(self, other, op)
-
- axis: Literal[1] = 1 # only relevant for Series other case
- other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
-
- self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
-
- new_data = self._dispatch_frame_op(other, op, axis=axis)
- return self._construct_result(new_data)
-
- _logical_method = _arith_method
-
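-    # Editor's note (conceptual, observable behaviour): ``align_method_FRAME``
-    # reindexes both operands to the union of their labels before the
-    # column-wise dispatch below, which is why adding two frames with
-    # non-overlapping columns yields all-NaN results for those columns:
-    #
-    #     >>> pd.DataFrame({"a": [1]}) + pd.DataFrame({"b": [1]})
-    #
-    # produces columns "a" and "b" that are both NaN.
-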
- def _dispatch_frame_op(self, right, func: Callable, axis: AxisInt | None = None):
- """
- Evaluate the frame operation func(left, right) by evaluating
- column-by-column, dispatching to the Series implementation.
-
- Parameters
- ----------
- right : scalar, Series, or DataFrame
- func : arithmetic or comparison operator
- axis : {None, 0, 1}
-
- Returns
- -------
- DataFrame
- """
- # Get the appropriate array-op to apply to each column/block's values.
- array_op = ops.get_array_op(func)
-
- right = lib.item_from_zerodim(right)
- if not is_list_like(right):
- # i.e. scalar, faster than checking np.ndim(right) == 0
- with np.errstate(all="ignore"):
- bm = self._mgr.apply(array_op, right=right)
- return self._constructor(bm)
-
- elif isinstance(right, DataFrame):
- assert self.index.equals(right.index)
- assert self.columns.equals(right.columns)
- # TODO: The previous assertion `assert right._indexed_same(self)`
- # fails in cases with empty columns reached via
- # _frame_arith_method_with_reindex
-
- # TODO operate_blockwise expects a manager of the same type
- with np.errstate(all="ignore"):
- bm = self._mgr.operate_blockwise(
- # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
- # incompatible type "Union[ArrayManager, BlockManager]"; expected
- # "ArrayManager"
- # error: Argument 1 to "operate_blockwise" of "BlockManager" has
- # incompatible type "Union[ArrayManager, BlockManager]"; expected
- # "BlockManager"
- right._mgr, # type: ignore[arg-type]
- array_op,
- )
- return self._constructor(bm)
-
- elif isinstance(right, Series) and axis == 1:
- # axis=1 means we want to operate row-by-row
- assert right.index.equals(self.columns)
-
- right = right._values
- # maybe_align_as_frame ensures we do not have an ndarray here
- assert not isinstance(right, np.ndarray)
-
- with np.errstate(all="ignore"):
- arrays = [
- array_op(_left, _right)
- for _left, _right in zip(self._iter_column_arrays(), right)
- ]
-
- elif isinstance(right, Series):
- assert right.index.equals(self.index) # Handle other cases later
- right = right._values
-
- with np.errstate(all="ignore"):
- arrays = [array_op(left, right) for left in self._iter_column_arrays()]
-
- else:
- # Remaining cases have less-obvious dispatch rules
- raise NotImplementedError(right)
-
- return type(self)._from_arrays(
- arrays, self.columns, self.index, verify_integrity=False
- )
-
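-    # Editor's note (conceptual, public behaviour): the ``axis == 1`` Series
-    # branch above pairs each column with the matching element of ``other``,
-    # which is what users observe as column-wise broadcasting, e.g.
-    #
-    #     >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
-    #     >>> df + pd.Series({"a": 10, "b": 100})
-    #
-    # adds 10 to column "a" and 100 to column "b".
-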
- def _combine_frame(self, other: DataFrame, func, fill_value=None):
- # at this point we have `self._indexed_same(other)`
-
- if fill_value is None:
- # since _arith_op may be called in a loop, avoid function call
- # overhead if possible by doing this check once
- _arith_op = func
-
- else:
-
- def _arith_op(left, right):
- # for the mixed_type case where we iterate over columns,
- # _arith_op(left, right) is equivalent to
- # left._binop(right, func, fill_value=fill_value)
- left, right = ops.fill_binop(left, right, fill_value)
- return func(left, right)
-
- new_data = self._dispatch_frame_op(other, _arith_op)
- return new_data
-
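-    # Editor's note (conceptual, public behaviour; ``df1``/``df2`` are
-    # hypothetical frames): ``ops.fill_binop`` substitutes ``fill_value`` only
-    # where exactly one of the two aligned operands is missing, e.g.
-    #
-    #     >>> df1 = pd.DataFrame({"a": [1, None]})
-    #     >>> df2 = pd.DataFrame({"a": [10, None]})
-    #     >>> df1.add(df2, fill_value=0)
-    #
-    # gives 11 for the first row, while the second row stays NaN because the
-    # value is missing in *both* operands.
-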
- def _construct_result(self, result) -> DataFrame:
- """
- Wrap the result of an arithmetic, comparison, or logical operation.
-
- Parameters
- ----------
- result : DataFrame
-
- Returns
- -------
- DataFrame
- """
- out = self._constructor(result, copy=False).__finalize__(self)
- # Pin columns instead of passing to constructor for compat with
- # non-unique columns case
- out.columns = self.columns
- out.index = self.index
- return out
-
- def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
- # Naive implementation, room for optimization
- div = self // other
- mod = self - div * other
- return div, mod
-
- def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
- # Naive implementation, room for optimization
- div = other // self
- mod = other - div * self
- return div, mod
-
- # ----------------------------------------------------------------------
- # Combination-Related
-
- @doc(
- _shared_docs["compare"],
- """
-Returns
--------
-DataFrame
- DataFrame that shows the differences stacked side by side.
-
- The resulting index will be a MultiIndex with 'self' and 'other'
- stacked alternately at the inner level.
-
-Raises
-------
-ValueError
- When the two DataFrames don't have identical labels or shape.
-
-See Also
---------
-Series.compare : Compare with another Series and show differences.
-DataFrame.equals : Test whether two objects contain the same elements.
-
-Notes
------
-Matching NaNs will not appear as a difference.
-
-Can only compare identically-labeled DataFrames
-(i.e. those with the same shape and identical row and column labels).
-
-Examples
---------
->>> df = pd.DataFrame(
-... {{
-... "col1": ["a", "a", "b", "b", "a"],
-... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
-... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
-... }},
-... columns=["col1", "col2", "col3"],
-... )
->>> df
- col1 col2 col3
-0 a 1.0 1.0
-1 a 2.0 2.0
-2 b 3.0 3.0
-3 b NaN 4.0
-4 a 5.0 5.0
-
->>> df2 = df.copy()
->>> df2.loc[0, 'col1'] = 'c'
->>> df2.loc[2, 'col3'] = 4.0
->>> df2
- col1 col2 col3
-0 c 1.0 1.0
-1 a 2.0 2.0
-2 b 3.0 4.0
-3 b NaN 4.0
-4 a 5.0 5.0
-
-Align the differences on columns
-
->>> df.compare(df2)
- col1 col3
- self other self other
-0 a c NaN NaN
-2 NaN NaN 3.0 4.0
-
-Assign result_names
-
->>> df.compare(df2, result_names=("left", "right"))
- col1 col3
- left right left right
-0 a c NaN NaN
-2 NaN NaN 3.0 4.0
-
-Stack the differences on rows
-
->>> df.compare(df2, align_axis=0)
- col1 col3
-0 self a NaN
- other c NaN
-2 self NaN 3.0
- other NaN 4.0
-
-Keep the equal values
-
->>> df.compare(df2, keep_equal=True)
- col1 col3
- self other self other
-0 a c 1.0 1.0
-2 b b 3.0 4.0
-
-Keep all original rows and columns
-
->>> df.compare(df2, keep_shape=True)
- col1 col2 col3
- self other self other self other
-0 a c NaN NaN NaN NaN
-1 NaN NaN NaN NaN NaN NaN
-2 NaN NaN NaN NaN 3.0 4.0
-3 NaN NaN NaN NaN NaN NaN
-4 NaN NaN NaN NaN NaN NaN
-
-Keep all original rows and columns and also all original values
-
->>> df.compare(df2, keep_shape=True, keep_equal=True)
- col1 col2 col3
- self other self other self other
-0 a c 1.0 1.0 1.0 1.0
-1 a a 2.0 2.0 2.0 2.0
-2 b b 3.0 3.0 3.0 4.0
-3 b b NaN NaN 4.0 4.0
-4 a a 5.0 5.0 5.0 5.0
-""",
- klass=_shared_doc_kwargs["klass"],
- )
- def compare(
- self,
- other: DataFrame,
- align_axis: Axis = 1,
- keep_shape: bool = False,
- keep_equal: bool = False,
- result_names: Suffixes = ("self", "other"),
- ) -> DataFrame:
- return super().compare(
- other=other,
- align_axis=align_axis,
- keep_shape=keep_shape,
- keep_equal=keep_equal,
- result_names=result_names,
- )
-
- def combine(
- self,
- other: DataFrame,
- func: Callable[[Series, Series], Series | Hashable],
- fill_value=None,
- overwrite: bool = True,
- ) -> DataFrame:
- """
- Perform column-wise combine with another DataFrame.
-
- Combines a DataFrame with `other` DataFrame using `func`
- to element-wise combine columns. The row and column indexes of the
- resulting DataFrame will be the union of the two.
-
- Parameters
- ----------
- other : DataFrame
- The DataFrame to merge column-wise.
- func : function
-            Function that takes two Series as inputs and returns a Series or a
-            scalar. Used to merge the two dataframes column by column.
- fill_value : scalar value, default None
- The value to fill NaNs with prior to passing any column to the
- merge func.
- overwrite : bool, default True
- If True, columns in `self` that do not exist in `other` will be
- overwritten with NaNs.
-
- Returns
- -------
- DataFrame
- Combination of the provided DataFrames.
-
- See Also
- --------
- DataFrame.combine_first : Combine two DataFrame objects and default to
- non-null values in frame calling the method.
-
- Examples
- --------
- Combine using a simple function that chooses the smaller column.
-
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
- >>> df1.combine(df2, take_smaller)
- A B
- 0 0 3
- 1 0 3
-
- Example using a true element-wise combine function.
-
- >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine(df2, np.minimum)
- A B
- 0 1 2
- 1 0 3
-
- Using `fill_value` fills Nones prior to passing the column to the
- merge function.
-
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine(df2, take_smaller, fill_value=-5)
- A B
- 0 0 -5.0
- 1 0 4.0
-
- However, if the same element in both dataframes is None, that None
-        is preserved.
-
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
- >>> df1.combine(df2, take_smaller, fill_value=-5)
- A B
- 0 0 -5.0
- 1 0 3.0
-
- Example that demonstrates the use of `overwrite` and behavior when
- the axis differ between the dataframes.
-
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
- >>> df1.combine(df2, take_smaller)
- A B C
- 0 NaN NaN NaN
- 1 NaN 3.0 -10.0
- 2 NaN 3.0 1.0
-
- >>> df1.combine(df2, take_smaller, overwrite=False)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 -10.0
- 2 NaN 3.0 1.0
-
- Demonstrating the preference of the passed in dataframe.
-
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
- >>> df2.combine(df1, take_smaller)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 NaN
- 2 NaN 3.0 NaN
-
- >>> df2.combine(df1, take_smaller, overwrite=False)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 1.0
- 2 NaN 3.0 1.0
- """
- other_idxlen = len(other.index) # save for compare
-
- this, other = self.align(other, copy=False)
- new_index = this.index
-
- if other.empty and len(new_index) == len(self.index):
- return self.copy()
-
- if self.empty and len(other) == other_idxlen:
- return other.copy()
-
- # sorts if possible; otherwise align above ensures that these are set-equal
- new_columns = this.columns.union(other.columns)
- do_fill = fill_value is not None
- result = {}
- for col in new_columns:
- series = this[col]
- other_series = other[col]
-
- this_dtype = series.dtype
- other_dtype = other_series.dtype
-
- this_mask = isna(series)
- other_mask = isna(other_series)
-
- # don't overwrite columns unnecessarily
- # DO propagate if this column is not in the intersection
- if not overwrite and other_mask.all():
- result[col] = this[col].copy()
- continue
-
- if do_fill:
- series = series.copy()
- other_series = other_series.copy()
- series[this_mask] = fill_value
- other_series[other_mask] = fill_value
-
- if col not in self.columns:
- # If self DataFrame does not have col in other DataFrame,
- # try to promote series, which is all NaN, as other_dtype.
- new_dtype = other_dtype
- try:
- series = series.astype(new_dtype, copy=False)
- except ValueError:
- # e.g. new_dtype is integer types
- pass
- else:
- # if we have different dtypes, possibly promote
- new_dtype = find_common_type([this_dtype, other_dtype])
- series = series.astype(new_dtype, copy=False)
- other_series = other_series.astype(new_dtype, copy=False)
-
- arr = func(series, other_series)
- if isinstance(new_dtype, np.dtype):
- # if new_dtype is an EA Dtype, then `func` is expected to return
- # the correct dtype without any additional casting
- # error: No overload variant of "maybe_downcast_to_dtype" matches
- # argument types "Union[Series, Hashable]", "dtype[Any]"
- arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
- arr, new_dtype
- )
-
- result[col] = arr
-
- # convert_objects just in case
- return self._constructor(result, index=new_index, columns=new_columns)
-
- def combine_first(self, other: DataFrame) -> DataFrame:
- """
- Update null elements with value in the same location in `other`.
-
- Combine two DataFrame objects by filling null values in one DataFrame
- with non-null values from other DataFrame. The row and column indexes
-        of the resulting DataFrame will be the union of the two. When calling
-        first.combine_first(second), the resulting dataframe keeps the values
-        of the 'first' dataframe and overrides the values of the second one
-        wherever both first.loc[index, col] and second.loc[index, col] are
-        not missing values.
-
- Parameters
- ----------
- other : DataFrame
- Provided DataFrame to use to fill null values.
-
- Returns
- -------
- DataFrame
- The result of combining the provided DataFrame with the other object.
-
- See Also
- --------
- DataFrame.combine : Perform series-wise operation on two DataFrames
- using a given function.
-
- Examples
- --------
- >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine_first(df2)
- A B
- 0 1.0 3.0
- 1 0.0 4.0
-
- Null values still persist if the location of that null value
-        does not exist in `other`.
-
- >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
- >>> df1.combine_first(df2)
- A B C
- 0 NaN 4.0 NaN
- 1 0.0 3.0 1.0
- 2 NaN 3.0 1.0
- """
- from pandas.core.computation import expressions
-
- def combiner(x, y):
- mask = extract_array(isna(x))
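-            # mask is True where the calling frame is missing; those positions
-            # are filled from y by expressions.where below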
-
- x_values = extract_array(x, extract_numpy=True)
- y_values = extract_array(y, extract_numpy=True)
-
- # If the column y in other DataFrame is not in first DataFrame,
- # just return y_values.
- if y.name not in self.columns:
- return y_values
-
- return expressions.where(mask, y_values, x_values)
-
- combined = self.combine(other, combiner, overwrite=False)
-
- dtypes = {
- col: find_common_type([self.dtypes[col], other.dtypes[col]])
- for col in self.columns.intersection(other.columns)
- if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
- }
-
- if dtypes:
- combined = combined.astype(dtypes)
-
- return combined
-
- def update(
- self,
- other,
- join: str = "left",
- overwrite: bool = True,
- filter_func=None,
- errors: str = "ignore",
- ) -> None:
- """
- Modify in place using non-NA values from another DataFrame.
-
- Aligns on indices. There is no return value.
-
- Parameters
- ----------
- other : DataFrame, or object coercible into a DataFrame
- Should have at least one matching index/column label
- with the original DataFrame. If a Series is passed,
- its name attribute must be set, and that will be
- used as the column name to align with the original DataFrame.
- join : {'left'}, default 'left'
- Only left join is implemented, keeping the index and columns of the
- original object.
- overwrite : bool, default True
- How to handle non-NA values for overlapping keys:
-
- * True: overwrite original DataFrame's values
- with values from `other`.
- * False: only update values that are NA in
- the original DataFrame.
-
- filter_func : callable(1d-array) -> bool 1d-array, optional
- Can choose to replace values other than NA. Return True for values
- that should be updated.
- errors : {'raise', 'ignore'}, default 'ignore'
- If 'raise', will raise a ValueError if the DataFrame and `other`
- both contain non-NA data in the same place.
-
- Returns
- -------
- None
-            This method directly changes the calling object.
-
- Raises
- ------
- ValueError
- * When `errors='raise'` and there's overlapping non-NA data.
-            * When `errors` is neither `'ignore'` nor `'raise'`.
- NotImplementedError
- * If `join != 'left'`
-
- See Also
- --------
- dict.update : Similar method for dictionaries.
- DataFrame.merge : For column(s)-on-column(s) operations.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2, 3],
- ... 'B': [400, 500, 600]})
- >>> new_df = pd.DataFrame({'B': [4, 5, 6],
- ... 'C': [7, 8, 9]})
- >>> df.update(new_df)
- >>> df
- A B
- 0 1 4
- 1 2 5
- 2 3 6
-
- The DataFrame's length does not increase as a result of the update,
- only values at matching index/column labels are updated.
-
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
- >>> df.update(new_df)
- >>> df
- A B
- 0 a d
- 1 b e
- 2 c f
-
- For Series, its name attribute must be set.
-
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
- >>> df.update(new_column)
- >>> df
- A B
- 0 a d
- 1 b y
- 2 c e
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
- >>> df.update(new_df)
- >>> df
- A B
- 0 a x
- 1 b d
- 2 c e
-
- If `other` contains NaNs the corresponding values are not updated
- in the original dataframe.
-
- >>> df = pd.DataFrame({'A': [1, 2, 3],
- ... 'B': [400, 500, 600]})
- >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
- >>> df.update(new_df)
- >>> df
- A B
- 0 1 4
- 1 2 500
- 2 3 6
- """
- from pandas.core.computation import expressions
-
- # TODO: Support other joins
- if join != "left": # pragma: no cover
- raise NotImplementedError("Only left join is supported")
- if errors not in ["ignore", "raise"]:
- raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
-
- if not isinstance(other, DataFrame):
- other = DataFrame(other)
-
- other = other.reindex(self.index)
-
- for col in self.columns.intersection(other.columns):
- this = self[col]._values
- that = other[col]._values
-
- if filter_func is not None:
- with np.errstate(all="ignore"):
- mask = ~filter_func(this) | isna(that)
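-                    # True entries keep the original value: either filter_func
-                    # rejected the update or the replacement from `other` is NA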
- else:
- if errors == "raise":
- mask_this = notna(that)
- mask_that = notna(this)
- if any(mask_this & mask_that):
- raise ValueError("Data overlaps.")
-
- if overwrite:
- mask = isna(that)
- else:
- mask = notna(this)
-
- # don't overwrite columns unnecessarily
- if mask.all():
- continue
-
- self.loc[:, col] = expressions.where(mask, this, that)
-
- # ----------------------------------------------------------------------
- # Data reshaping
- @Appender(
- """
-Examples
---------
->>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
-... 'Parrot', 'Parrot'],
-... 'Max Speed': [380., 370., 24., 26.]})
->>> df
- Animal Max Speed
-0 Falcon 380.0
-1 Falcon 370.0
-2 Parrot 24.0
-3 Parrot 26.0
->>> df.groupby(['Animal']).mean()
- Max Speed
-Animal
-Falcon 375.0
-Parrot 25.0
-
-**Hierarchical Indexes**
-
-We can groupby different levels of a hierarchical index
-using the `level` parameter:
-
->>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
-... ['Captive', 'Wild', 'Captive', 'Wild']]
->>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
->>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
-... index=index)
->>> df
- Max Speed
-Animal Type
-Falcon Captive 390.0
- Wild 350.0
-Parrot Captive 30.0
- Wild 20.0
->>> df.groupby(level=0).mean()
- Max Speed
-Animal
-Falcon 370.0
-Parrot 25.0
->>> df.groupby(level="Type").mean()
- Max Speed
-Type
-Captive 210.0
-Wild 185.0
-
-We can also choose whether to include NA in the group keys by setting the
-`dropna` parameter; the default setting is `True`.
-
->>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
->>> df = pd.DataFrame(l, columns=["a", "b", "c"])
-
->>> df.groupby(by=["b"]).sum()
- a c
-b
-1.0 2 3
-2.0 2 5
-
->>> df.groupby(by=["b"], dropna=False).sum()
- a c
-b
-1.0 2 3
-2.0 2 5
-NaN 1 4
-
->>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
->>> df = pd.DataFrame(l, columns=["a", "b", "c"])
-
->>> df.groupby(by="a").sum()
- b c
-a
-a 13.0 13.0
-b 12.3 123.0
-
->>> df.groupby(by="a", dropna=False).sum()
- b c
-a
-a 13.0 13.0
-b 12.3 123.0
-NaN 12.3 33.0
-
-When using ``.apply()``, use ``group_keys`` to include or exclude the group keys.
-The ``group_keys`` argument defaults to ``True`` (include).
-
->>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
-... 'Parrot', 'Parrot'],
-... 'Max Speed': [380., 370., 24., 26.]})
->>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
- Animal Max Speed
-Animal
-Falcon 0 Falcon 380.0
- 1 Falcon 370.0
-Parrot 2 Parrot 24.0
- 3 Parrot 26.0
-
->>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
- Animal Max Speed
-0 Falcon 380.0
-1 Falcon 370.0
-2 Parrot 24.0
-3 Parrot 26.0
-"""
- )
- @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
- def groupby(
- self,
- by=None,
- axis: Axis = 0,
- level: IndexLabel | None = None,
- as_index: bool = True,
- sort: bool = True,
- group_keys: bool = True,
- observed: bool = False,
- dropna: bool = True,
- ) -> DataFrameGroupBy:
- from pandas.core.groupby.generic import DataFrameGroupBy
-
- if level is None and by is None:
- raise TypeError("You have to supply one of 'by' and 'level'")
- axis = self._get_axis_number(axis)
-
- return DataFrameGroupBy(
- obj=self,
- keys=by,
- axis=axis,
- level=level,
- as_index=as_index,
- sort=sort,
- group_keys=group_keys,
- observed=observed,
- dropna=dropna,
- )
-
- _shared_docs[
- "pivot"
- ] = """
- Return reshaped DataFrame organized by given index / column values.
-
- Reshape data (produce a "pivot" table) based on column values. Uses
- unique values from specified `index` / `columns` to form axes of the
- resulting DataFrame. This function does not support data
-        aggregation; multiple values will result in a MultiIndex in the
- columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
-
- Parameters
- ----------%s
- columns : str or object or a list of str
- Column to use to make new frame's columns.
-
- .. versionchanged:: 1.1.0
- Also accept list of columns names.
-
- index : str or object or a list of str, optional
- Column to use to make new frame's index. If not given, uses existing index.
-
- .. versionchanged:: 1.1.0
- Also accept list of index names.
-
- values : str, object or a list of the previous, optional
- Column(s) to use for populating new frame's values. If not
- specified, all remaining columns will be used and the result will
- have hierarchically indexed columns.
-
- Returns
- -------
- DataFrame
- Returns reshaped DataFrame.
-
- Raises
- ------
- ValueError:
- When there are any `index`, `columns` combinations with multiple
-            values. Use `DataFrame.pivot_table` when you need to aggregate.
-
- See Also
- --------
- DataFrame.pivot_table : Generalization of pivot that can handle
- duplicate values for one index/column pair.
- DataFrame.unstack : Pivot based on the index values instead of a
- column.
- wide_to_long : Wide panel to long format. Less flexible but more
- user-friendly than melt.
-
- Notes
- -----
- For finer-tuned control, see hierarchical indexing documentation along
- with the related stack/unstack methods.
-
- Reference :ref:`the user guide <reshaping.pivot>` for more examples.
-
- Examples
- --------
- >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
- ... 'two'],
- ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
- ... 'baz': [1, 2, 3, 4, 5, 6],
- ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
- >>> df
- foo bar baz zoo
- 0 one A 1 x
- 1 one B 2 y
- 2 one C 3 z
- 3 two A 4 q
- 4 two B 5 w
- 5 two C 6 t
-
- >>> df.pivot(index='foo', columns='bar', values='baz')
- bar A B C
- foo
- one 1 2 3
- two 4 5 6
-
- >>> df.pivot(index='foo', columns='bar')['baz']
- bar A B C
- foo
- one 1 2 3
- two 4 5 6
-
- >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
- baz zoo
- bar A B C A B C
- foo
- one 1 2 3 x y z
- two 4 5 6 q w t
-
- You could also assign a list of column names or a list of index names.
-
- >>> df = pd.DataFrame({
- ... "lev1": [1, 1, 1, 2, 2, 2],
- ... "lev2": [1, 1, 2, 1, 1, 2],
- ... "lev3": [1, 2, 1, 2, 1, 2],
- ... "lev4": [1, 2, 3, 4, 5, 6],
- ... "values": [0, 1, 2, 3, 4, 5]})
- >>> df
- lev1 lev2 lev3 lev4 values
- 0 1 1 1 1 0
- 1 1 1 2 2 1
- 2 1 2 1 3 2
- 3 2 1 2 4 3
- 4 2 1 1 5 4
- 5 2 2 2 6 5
-
- >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
- lev2 1 2
- lev3 1 2 1 2
- lev1
- 1 0.0 1.0 2.0 NaN
- 2 4.0 3.0 NaN 5.0
-
- >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
- lev3 1 2
- lev1 lev2
- 1 1 0.0 1.0
- 2 2.0 NaN
- 2 1 4.0 3.0
- 2 NaN 5.0
-
- A ValueError is raised if there are any duplicates.
-
- >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
- ... "bar": ['A', 'A', 'B', 'C'],
- ... "baz": [1, 2, 3, 4]})
- >>> df
- foo bar baz
- 0 one A 1
- 1 one A 2
- 2 two B 3
- 3 two C 4
-
- Notice that the first two rows are the same for our `index`
- and `columns` arguments.
-
- >>> df.pivot(index='foo', columns='bar', values='baz')
- Traceback (most recent call last):
- ...
- ValueError: Index contains duplicate entries, cannot reshape
- """
-
- @Substitution("")
- @Appender(_shared_docs["pivot"])
- def pivot(self, *, columns, index=lib.NoDefault, values=lib.NoDefault) -> DataFrame:
- from pandas.core.reshape.pivot import pivot
-
- return pivot(self, index=index, columns=columns, values=values)
-
- _shared_docs[
- "pivot_table"
- ] = """
- Create a spreadsheet-style pivot table as a DataFrame.
-
- The levels in the pivot table will be stored in MultiIndex objects
- (hierarchical indexes) on the index and columns of the result DataFrame.
-
- Parameters
- ----------%s
- values : list-like or scalar, optional
- Column or columns to aggregate.
- index : column, Grouper, array, or list of the previous
- If an array is passed, it must be the same length as the data. The
- list can contain any of the other types (except list).
- Keys to group by on the pivot table index. If an array is passed,
-            it is used in the same manner as the column values.
- columns : column, Grouper, array, or list of the previous
- If an array is passed, it must be the same length as the data. The
- list can contain any of the other types (except list).
- Keys to group by on the pivot table column. If an array is passed,
-            it is used in the same manner as the column values.
- aggfunc : function, list of functions, dict, default numpy.mean
-            If a list of functions is passed, the resulting pivot table will
-            have hierarchical columns whose top level are the function names
-            (inferred from the function objects themselves).
-            If a dict is passed, the key is the column to aggregate and the
-            value is the function or list of functions. If ``margins=True``,
-            aggfunc will be used to calculate the partial aggregates.
- fill_value : scalar, default None
- Value to replace missing values with (in the resulting pivot table,
- after aggregation).
- margins : bool, default False
- If ``margins=True``, special ``All`` columns and rows
- will be added with partial group aggregates across the categories
- on the rows and columns.
- dropna : bool, default True
- Do not include columns whose entries are all NaN. If True,
- rows with a NaN value in any column will be omitted before
- computing margins.
- margins_name : str, default 'All'
- Name of the row / column that will contain the totals
- when margins is True.
- observed : bool, default False
- This only applies if any of the groupers are Categoricals.
- If True: only show observed values for categorical groupers.
- If False: show all values for categorical groupers.
-
- sort : bool, default True
- Specifies if the result should be sorted.
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- DataFrame
- An Excel style pivot table.
-
- See Also
- --------
- DataFrame.pivot : Pivot without aggregation that can handle
- non-numeric data.
- DataFrame.melt: Unpivot a DataFrame from wide to long format,
- optionally leaving identifiers set.
- wide_to_long : Wide panel to long format. Less flexible but more
- user-friendly than melt.
-
- Notes
- -----
- Reference :ref:`the user guide <reshaping.pivot>` for more examples.
-
- Examples
- --------
- >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
- ... "bar", "bar", "bar", "bar"],
- ... "B": ["one", "one", "one", "two", "two",
- ... "one", "one", "two", "two"],
- ... "C": ["small", "large", "large", "small",
- ... "small", "large", "small", "small",
- ... "large"],
- ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
- ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
- >>> df
- A B C D E
- 0 foo one small 1 2
- 1 foo one large 2 4
- 2 foo one large 2 5
- 3 foo two small 3 5
- 4 foo two small 3 6
- 5 bar one large 4 6
- 6 bar one small 5 8
- 7 bar two small 6 9
- 8 bar two large 7 9
-
- This first example aggregates values by taking the sum.
-
- >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
- ... columns=['C'], aggfunc=np.sum)
- >>> table
- C large small
- A B
- bar one 4.0 5.0
- two 7.0 6.0
- foo one 4.0 1.0
- two NaN 6.0
-
- We can also fill missing values using the `fill_value` parameter.
-
- >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
- ... columns=['C'], aggfunc=np.sum, fill_value=0)
- >>> table
- C large small
- A B
- bar one 4 5
- two 7 6
- foo one 4 1
- two 0 6
-
- The next example aggregates by taking the mean across multiple columns.
-
- >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
- ... aggfunc={'D': np.mean, 'E': np.mean})
- >>> table
- D E
- A C
- bar large 5.500000 7.500000
- small 5.500000 8.500000
- foo large 2.000000 4.500000
- small 2.333333 4.333333
-
- We can also calculate multiple types of aggregations for any given
- value column.
-
- >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
- ... aggfunc={'D': np.mean,
- ... 'E': [min, max, np.mean]})
- >>> table
- D E
- mean max mean min
- A C
- bar large 5.500000 9 7.500000 6
- small 5.500000 9 8.500000 8
- foo large 2.000000 5 4.500000 4
- small 2.333333 6 4.333333 2
- """
-
- @Substitution("")
- @Appender(_shared_docs["pivot_table"])
- def pivot_table(
- self,
- values=None,
- index=None,
- columns=None,
- aggfunc: AggFuncType = "mean",
- fill_value=None,
- margins: bool = False,
- dropna: bool = True,
- margins_name: Level = "All",
- observed: bool = False,
- sort: bool = True,
- ) -> DataFrame:
- from pandas.core.reshape.pivot import pivot_table
-
- return pivot_table(
- self,
- values=values,
- index=index,
- columns=columns,
- aggfunc=aggfunc,
- fill_value=fill_value,
- margins=margins,
- dropna=dropna,
- margins_name=margins_name,
- observed=observed,
- sort=sort,
- )
-
- def stack(self, level: Level = -1, dropna: bool = True):
- """
- Stack the prescribed level(s) from columns to index.
-
- Return a reshaped DataFrame or Series having a multi-level
- index with one or more new inner-most levels compared to the current
- DataFrame. The new inner-most levels are created by pivoting the
- columns of the current dataframe:
-
- - if the columns have a single level, the output is a Series;
- - if the columns have multiple levels, the new index
- level(s) is (are) taken from the prescribed level(s) and
- the output is a DataFrame.
-
- Parameters
- ----------
- level : int, str, list, default -1
- Level(s) to stack from the column axis onto the index
- axis, defined as one index or label, or a list of indices
- or labels.
- dropna : bool, default True
- Whether to drop rows in the resulting Frame/Series with
- missing values. Stacking a column level onto the index
- axis can create combinations of index and column values
- that are missing from the original dataframe. See Examples
- section.
-
- Returns
- -------
- DataFrame or Series
- Stacked dataframe or series.
-
- See Also
- --------
- DataFrame.unstack : Unstack prescribed level(s) from index axis
- onto column axis.
- DataFrame.pivot : Reshape dataframe from long format to wide
- format.
- DataFrame.pivot_table : Create a spreadsheet-style pivot table
- as a DataFrame.
-
- Notes
- -----
- The function is named by analogy with a collection of books
- being reorganized from being side by side on a horizontal
- position (the columns of the dataframe) to being stacked
- vertically on top of each other (in the index of the
- dataframe).
-
- Reference :ref:`the user guide <reshaping.stacking>` for more examples.
-
- Examples
- --------
- **Single level columns**
-
- >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
- ... index=['cat', 'dog'],
- ... columns=['weight', 'height'])
-
- Stacking a dataframe with a single level column axis returns a Series:
-
- >>> df_single_level_cols
- weight height
- cat 0 1
- dog 2 3
- >>> df_single_level_cols.stack()
- cat weight 0
- height 1
- dog weight 2
- height 3
- dtype: int64
-
- **Multi level columns: simple case**
-
- >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
- ... ('weight', 'pounds')])
- >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
- ... index=['cat', 'dog'],
- ... columns=multicol1)
-
- Stacking a dataframe with a multi-level column axis:
-
- >>> df_multi_level_cols1
- weight
- kg pounds
- cat 1 2
- dog 2 4
- >>> df_multi_level_cols1.stack()
- weight
- cat kg 1
- pounds 2
- dog kg 2
- pounds 4
-
- **Missing values**
-
- >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
- ... ('height', 'm')])
- >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
- ... index=['cat', 'dog'],
- ... columns=multicol2)
-
- It is common to have missing values when stacking a dataframe
- with multi-level columns, as the stacked dataframe typically
- has more values than the original dataframe. Missing values
- are filled with NaNs:
-
- >>> df_multi_level_cols2
- weight height
- kg m
- cat 1.0 2.0
- dog 3.0 4.0
- >>> df_multi_level_cols2.stack()
- height weight
- cat kg NaN 1.0
- m 2.0 NaN
- dog kg NaN 3.0
- m 4.0 NaN
-
- **Prescribing the level(s) to be stacked**
-
- The first parameter controls which level or levels are stacked:
-
- >>> df_multi_level_cols2.stack(0)
- kg m
- cat height NaN 2.0
- weight 1.0 NaN
- dog height NaN 4.0
- weight 3.0 NaN
- >>> df_multi_level_cols2.stack([0, 1])
- cat height m 2.0
- weight kg 1.0
- dog height m 4.0
- weight kg 3.0
- dtype: float64
-
- **Dropping missing values**
-
- >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
- ... index=['cat', 'dog'],
- ... columns=multicol2)
-
- Note that rows where all values are missing are dropped by
- default but this behaviour can be controlled via the dropna
- keyword parameter:
-
- >>> df_multi_level_cols3
- weight height
- kg m
- cat NaN 1.0
- dog 2.0 3.0
- >>> df_multi_level_cols3.stack(dropna=False)
- height weight
- cat kg NaN NaN
- m 1.0 NaN
- dog kg NaN 2.0
- m 3.0 NaN
- >>> df_multi_level_cols3.stack(dropna=True)
- height weight
- cat m 1.0 NaN
- dog kg NaN 2.0
- m 3.0 NaN
- """
- from pandas.core.reshape.reshape import (
- stack,
- stack_multiple,
- )
-
- if isinstance(level, (tuple, list)):
- result = stack_multiple(self, level, dropna=dropna)
- else:
- result = stack(self, level, dropna=dropna)
-
- return result.__finalize__(self, method="stack")
-
- def explode(
- self,
- column: IndexLabel,
- ignore_index: bool = False,
- ) -> DataFrame:
- """
- Transform each element of a list-like to a row, replicating index values.
-
- Parameters
- ----------
- column : IndexLabel
- Column(s) to explode.
-            For multiple columns, specify a non-empty list in which each
-            element is a str or tuple; the list-like data in all specified
-            columns must have matching lengths within each row of the frame.
-
- .. versionadded:: 1.3.0
- Multi-column explode
-
- ignore_index : bool, default False
- If True, the resulting index will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- DataFrame
- Exploded lists to rows of the subset columns;
- index will be duplicated for these rows.
-
- Raises
- ------
- ValueError :
- * If columns of the frame are not unique.
-            * If the specified columns to explode form an empty list.
-            * If the specified columns to explode do not have matching counts
-              of elements row-wise in the frame.
-
- See Also
- --------
- DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
- index labels.
- DataFrame.melt : Unpivot a DataFrame from wide format to long format.
-        Series.explode : Explode a Series of list-likes to long format.
-
- Notes
- -----
- This routine will explode list-likes including lists, tuples, sets,
- Series, and np.ndarray. The result dtype of the subset rows will
- be object. Scalars will be returned unchanged, and empty list-likes will
- result in a np.nan for that row. In addition, the ordering of rows in the
- output will be non-deterministic when exploding sets.
-
- Reference :ref:`the user guide <reshaping.explode>` for more examples.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
- ... 'B': 1,
- ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
- >>> df
- A B C
- 0 [0, 1, 2] 1 [a, b, c]
- 1 foo 1 NaN
- 2 [] 1 []
- 3 [3, 4] 1 [d, e]
-
- Single-column explode.
-
- >>> df.explode('A')
- A B C
- 0 0 1 [a, b, c]
- 0 1 1 [a, b, c]
- 0 2 1 [a, b, c]
- 1 foo 1 NaN
- 2 NaN 1 []
- 3 3 1 [d, e]
- 3 4 1 [d, e]
-
- Multi-column explode.
-
- >>> df.explode(list('AC'))
- A B C
- 0 0 1 a
- 0 1 1 b
- 0 2 1 c
- 1 foo 1 NaN
- 2 NaN 1 NaN
- 3 3 1 d
- 3 4 1 e
- """
- if not self.columns.is_unique:
- duplicate_cols = self.columns[self.columns.duplicated()].tolist()
- raise ValueError(
- f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
- )
-
- columns: list[Hashable]
- if is_scalar(column) or isinstance(column, tuple):
- columns = [column]
- elif isinstance(column, list) and all(
- is_scalar(c) or isinstance(c, tuple) for c in column
- ):
- if not column:
- raise ValueError("column must be nonempty")
- if len(column) > len(set(column)):
- raise ValueError("column must be unique")
- columns = column
- else:
- raise ValueError("column must be a scalar, tuple, or list thereof")
-
- df = self.reset_index(drop=True)
- if len(columns) == 1:
- result = df[columns[0]].explode()
- else:
- mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
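-            # scalars and empty list-likes count as one element, matching the
-            # single row that explode() produces for them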
- counts0 = self[columns[0]].apply(mylen)
- for c in columns[1:]:
- if not all(counts0 == self[c].apply(mylen)):
- raise ValueError("columns must have matching element counts")
- result = DataFrame({c: df[c].explode() for c in columns})
- result = df.drop(columns, axis=1).join(result)
- if ignore_index:
- result.index = default_index(len(result))
- else:
- result.index = self.index.take(result.index)
- result = result.reindex(columns=self.columns, copy=False)
-
- return result.__finalize__(self, method="explode")
-
- def unstack(self, level: Level = -1, fill_value=None):
- """
- Pivot a level of the (necessarily hierarchical) index labels.
-
- Returns a DataFrame having a new level of column labels whose inner-most level
- consists of the pivoted index labels.
-
- If the index is not a MultiIndex, the output will be a Series
- (the analogue of stack when the columns are not a MultiIndex).
-
- Parameters
- ----------
- level : int, str, or list of these, default -1 (last level)
- Level(s) of index to unstack, can pass level name.
- fill_value : int, str or dict
- Replace NaN with this value if the unstack produces missing values.
-
- Returns
- -------
- Series or DataFrame
-
- See Also
- --------
- DataFrame.pivot : Pivot a table based on column values.
- DataFrame.stack : Pivot a level of the column labels (inverse operation
- from `unstack`).
-
- Notes
- -----
- Reference :ref:`the user guide <reshaping.stacking>` for more examples.
-
- Examples
- --------
- >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
- ... ('two', 'a'), ('two', 'b')])
- >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
- >>> s
- one a 1.0
- b 2.0
- two a 3.0
- b 4.0
- dtype: float64
-
- >>> s.unstack(level=-1)
- a b
- one 1.0 2.0
- two 3.0 4.0
-
- >>> s.unstack(level=0)
- one two
- a 1.0 3.0
- b 2.0 4.0
-
- >>> df = s.unstack(level=0)
- >>> df.unstack()
- one a 1.0
- b 2.0
- two a 3.0
- b 4.0
- dtype: float64
- """
- from pandas.core.reshape.reshape import unstack
-
- result = unstack(self, level, fill_value)
-
- return result.__finalize__(self, method="unstack")
-
- @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
- def melt(
- self,
- id_vars=None,
- value_vars=None,
- var_name=None,
- value_name: Hashable = "value",
- col_level: Level = None,
- ignore_index: bool = True,
- ) -> DataFrame:
- return melt(
- self,
- id_vars=id_vars,
- value_vars=value_vars,
- var_name=var_name,
- value_name=value_name,
- col_level=col_level,
- ignore_index=ignore_index,
- ).__finalize__(self, method="melt")
-
- # ----------------------------------------------------------------------
- # Time series-related
-
- @doc(
- Series.diff,
- klass="DataFrame",
- extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
- "Take difference over rows (0) or columns (1).\n",
- other_klass="Series",
- examples=dedent(
- """
- Difference with previous row
-
- >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
- ... 'b': [1, 1, 2, 3, 5, 8],
- ... 'c': [1, 4, 9, 16, 25, 36]})
- >>> df
- a b c
- 0 1 1 1
- 1 2 1 4
- 2 3 2 9
- 3 4 3 16
- 4 5 5 25
- 5 6 8 36
-
- >>> df.diff()
- a b c
- 0 NaN NaN NaN
- 1 1.0 0.0 3.0
- 2 1.0 1.0 5.0
- 3 1.0 1.0 7.0
- 4 1.0 2.0 9.0
- 5 1.0 3.0 11.0
-
- Difference with previous column
-
- >>> df.diff(axis=1)
- a b c
- 0 NaN 0 0
- 1 NaN -1 3
- 2 NaN -1 7
- 3 NaN -1 13
- 4 NaN 0 20
- 5 NaN 2 28
-
- Difference with 3rd previous row
-
- >>> df.diff(periods=3)
- a b c
- 0 NaN NaN NaN
- 1 NaN NaN NaN
- 2 NaN NaN NaN
- 3 3.0 2.0 15.0
- 4 3.0 4.0 21.0
- 5 3.0 6.0 27.0
-
- Difference with following row
-
- >>> df.diff(periods=-1)
- a b c
- 0 -1.0 0.0 -3.0
- 1 -1.0 -1.0 -5.0
- 2 -1.0 -1.0 -7.0
- 3 -1.0 -2.0 -9.0
- 4 -1.0 -3.0 -11.0
- 5 NaN NaN NaN
-
- Overflow in input dtype
-
- >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
- >>> df.diff()
- a
- 0 NaN
- 1 255.0"""
- ),
- )
- def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
- if not lib.is_integer(periods):
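-            # accept floats that represent whole numbers, e.g. periods=2.0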
- if not (
- is_float(periods)
- # error: "int" has no attribute "is_integer"
- and periods.is_integer() # type: ignore[attr-defined]
- ):
- raise ValueError("periods must be an integer")
- periods = int(periods)
-
- axis = self._get_axis_number(axis)
- if axis == 1:
- if periods != 0:
- # in the periods == 0 case, this is equivalent diff of 0 periods
- # along axis=0, and the Manager method may be somewhat more
- # performant, so we dispatch in that case.
- return self - self.shift(periods, axis=axis)
- # With periods=0 this is equivalent to a diff with axis=0
- axis = 0
-
- new_data = self._mgr.diff(n=periods, axis=axis)
- return self._constructor(new_data).__finalize__(self, "diff")
-
- # ----------------------------------------------------------------------
- # Function application
-
- def _gotitem(
- self,
- key: IndexLabel,
- ndim: int,
- subset: DataFrame | Series | None = None,
- ) -> DataFrame | Series:
- """
- Sub-classes to define. Return a sliced object.
-
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- if subset is None:
- subset = self
- elif subset.ndim == 1: # is Series
- return subset
-
- # TODO: _shallow_copy(subset)?
- return subset[key]
-
- _agg_summary_and_see_also_doc = dedent(
- """
- The aggregation operations are always performed over an axis, either the
- index (default) or the column axis. This behavior is different from
- `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
- `var`), where the default is to compute the aggregation of the flattened
- array, e.g., ``numpy.mean(arr_2d)`` as opposed to
- ``numpy.mean(arr_2d, axis=0)``.
-
- `agg` is an alias for `aggregate`. Use the alias.
-
- See Also
- --------
-    DataFrame.apply : Perform any type of operation.
- DataFrame.transform : Perform transformation type operations.
- core.groupby.GroupBy : Perform operations over groups.
- core.resample.Resampler : Perform operations over resampled bins.
- core.window.Rolling : Perform operations over rolling window.
- core.window.Expanding : Perform operations over expanding window.
- core.window.ExponentialMovingWindow : Perform operation over exponential weighted
- window.
- """
- )
-
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2, 3],
- ... [4, 5, 6],
- ... [7, 8, 9],
- ... [np.nan, np.nan, np.nan]],
- ... columns=['A', 'B', 'C'])
-
- Aggregate these functions over the rows.
-
- >>> df.agg(['sum', 'min'])
- A B C
- sum 12.0 15.0 18.0
- min 1.0 2.0 3.0
-
- Different aggregations per column.
-
- >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
- A B
- sum 12.0 NaN
- min 1.0 2.0
- max NaN 8.0
-
- Aggregate different functions over the columns and rename the index of the resulting
- DataFrame.
-
- >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
- A B C
- x 7.0 NaN NaN
- y NaN 2.0 NaN
- z NaN NaN 6.0
-
- Aggregate over the columns.
-
- >>> df.agg("mean", axis="columns")
- 0 2.0
- 1 5.0
- 2 8.0
- 3 NaN
- dtype: float64
- """
- )
-
- @doc(
- _shared_docs["aggregate"],
- klass=_shared_doc_kwargs["klass"],
- axis=_shared_doc_kwargs["axis"],
- see_also=_agg_summary_and_see_also_doc,
- examples=_agg_examples_doc,
- )
- def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
- from pandas.core.apply import frame_apply
-
- axis = self._get_axis_number(axis)
-
- relabeling, func, columns, order = reconstruct_func(func, **kwargs)
-
- op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
- result = op.agg()
-
- if relabeling:
- # This is to keep the order to columns occurrence unchanged, and also
- # keep the order of new columns occurrence unchanged
-
- # For the return values of reconstruct_func, if relabeling is
- # False, columns and order will be None.
- assert columns is not None
- assert order is not None
-
- result_in_dict = relabel_result(result, func, columns, order)
- result = DataFrame(result_in_dict, index=columns)
-
- return result
-
- agg = aggregate
-
- # error: Signature of "any" incompatible with supertype "NDFrame" [override]
- @overload # type: ignore[override]
- def any(
- self,
- *,
- axis: Axis = ...,
- bool_only: bool | None = ...,
- skipna: bool = ...,
- level: None = ...,
- **kwargs,
- ) -> Series:
- ...
-
- @overload
- def any(
- self,
- *,
- axis: Axis = ...,
- bool_only: bool | None = ...,
- skipna: bool = ...,
- level: Level,
- **kwargs,
- ) -> DataFrame | Series:
- ...
-
- # error: Missing return statement
- @doc(NDFrame.any, **_shared_doc_kwargs)
- def any( # type: ignore[empty-body]
- self,
- axis: Axis = 0,
- bool_only: bool | None = None,
- skipna: bool = True,
- level: Level = None,
- **kwargs,
- ) -> DataFrame | Series:
- ...
-
- @doc(
- _shared_docs["transform"],
- klass=_shared_doc_kwargs["klass"],
- axis=_shared_doc_kwargs["axis"],
- )
- def transform(
- self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
- ) -> DataFrame:
- from pandas.core.apply import frame_apply
-
- op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
- result = op.transform()
- assert isinstance(result, DataFrame)
- return result
-
- def apply(
- self,
- func: AggFuncType,
- axis: Axis = 0,
- raw: bool = False,
- result_type: Literal["expand", "reduce", "broadcast"] | None = None,
- args=(),
- **kwargs,
- ):
- """
- Apply a function along an axis of the DataFrame.
-
- Objects passed to the function are Series objects whose index is
- either the DataFrame's index (``axis=0``) or the DataFrame's columns
- (``axis=1``). By default (``result_type=None``), the final return type
- is inferred from the return type of the applied function. Otherwise,
- it depends on the `result_type` argument.
-
- Parameters
- ----------
- func : function
- Function to apply to each column or row.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Axis along which the function is applied:
-
- * 0 or 'index': apply function to each column.
- * 1 or 'columns': apply function to each row.
-
- raw : bool, default False
- Determines if row or column is passed as a Series or ndarray object:
-
- * ``False`` : passes each row or column as a Series to the
- function.
- * ``True`` : the passed function will receive ndarray objects
- instead.
- If you are just applying a NumPy reduction function this will
- achieve much better performance.
-
- result_type : {'expand', 'reduce', 'broadcast', None}, default None
- These only act when ``axis=1`` (columns):
-
- * 'expand' : list-like results will be turned into columns.
- * 'reduce' : returns a Series if possible rather than expanding
- list-like results. This is the opposite of 'expand'.
- * 'broadcast' : results will be broadcast to the original shape
- of the DataFrame, the original index and columns will be
- retained.
-
- The default behaviour (None) depends on the return value of the
- applied function: list-like results will be returned as a Series
-            of those. However, if the apply function returns a Series, these
- are expanded to columns.
- args : tuple
- Positional arguments to pass to `func` in addition to the
- array/series.
- **kwargs
-            Additional keyword arguments to pass as keyword arguments to
- `func`.
-
- Returns
- -------
- Series or DataFrame
- Result of applying ``func`` along the given axis of the
- DataFrame.
-
- See Also
- --------
- DataFrame.applymap: For elementwise operations.
- DataFrame.aggregate: Only perform aggregating type operations.
- DataFrame.transform: Only perform transforming type operations.
-
- Notes
- -----
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
-
- Examples
- --------
- >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
- >>> df
- A B
- 0 4 9
- 1 4 9
- 2 4 9
-
- Using a numpy universal function (in this case the same as
- ``np.sqrt(df)``):
-
- >>> df.apply(np.sqrt)
- A B
- 0 2.0 3.0
- 1 2.0 3.0
- 2 2.0 3.0
-
- Using a reducing function on either axis
-
- >>> df.apply(np.sum, axis=0)
- A 12
- B 27
- dtype: int64
-
- >>> df.apply(np.sum, axis=1)
- 0 13
- 1 13
- 2 13
- dtype: int64
-
- Returning a list-like will result in a Series
-
- >>> df.apply(lambda x: [1, 2], axis=1)
- 0 [1, 2]
- 1 [1, 2]
- 2 [1, 2]
- dtype: object
-
- Passing ``result_type='expand'`` will expand list-like results
- to columns of a Dataframe
-
- >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
- 0 1
- 0 1 2
- 1 1 2
- 2 1 2
-
- Returning a Series inside the function is similar to passing
- ``result_type='expand'``. The resulting column names
- will be the Series index.
-
- >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
- foo bar
- 0 1 2
- 1 1 2
- 2 1 2
-
- Passing ``result_type='broadcast'`` will ensure the same shape
- result, whether list-like or scalar is returned by the function,
- and broadcast it along the axis. The resulting column names will
- be the originals.
-
- >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
- A B
- 0 1 2
- 1 1 2
- 2 1 2
- """
- from pandas.core.apply import frame_apply
-
- op = frame_apply(
- self,
- func=func,
- axis=axis,
- raw=raw,
- result_type=result_type,
- args=args,
- kwargs=kwargs,
- )
- return op.apply().__finalize__(self, method="apply")
-
- def applymap(
- self, func: PythonFuncType, na_action: str | None = None, **kwargs
- ) -> DataFrame:
- """
- Apply a function to a Dataframe elementwise.
-
- This method applies a function that accepts and returns a scalar
- to every element of a DataFrame.
-
- Parameters
- ----------
- func : callable
- Python function, returns a single value from a single value.
- na_action : {None, 'ignore'}, default None
-            If 'ignore', propagate NaN values without passing them to func.
-
- .. versionadded:: 1.2
-
- **kwargs
-            Additional keyword arguments to pass as keyword arguments to
- `func`.
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- DataFrame
- Transformed DataFrame.
-
- See Also
- --------
- DataFrame.apply : Apply a function along input axis of DataFrame.
-
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
- >>> df
- 0 1
- 0 1.000 2.120
- 1 3.356 4.567
-
- >>> df.applymap(lambda x: len(str(x)))
- 0 1
- 0 3 4
- 1 5 5
-
- Like Series.map, NA values can be ignored:
-
- >>> df_copy = df.copy()
- >>> df_copy.iloc[0, 0] = pd.NA
- >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
- 0 1
- 0 NaN 4
- 1 5.0 5
-
- Note that a vectorized version of `func` often exists, which will
- be much faster. You could square each number elementwise.
-
- >>> df.applymap(lambda x: x**2)
- 0 1
- 0 1.000000 4.494400
- 1 11.262736 20.857489
-
- But it's better to avoid applymap in that case.
-
- >>> df ** 2
- 0 1
- 0 1.000000 4.494400
- 1 11.262736 20.857489
- """
- if na_action not in {"ignore", None}:
- raise ValueError(
- f"na_action must be 'ignore' or None. Got {repr(na_action)}"
- )
- ignore_na = na_action == "ignore"
- func = functools.partial(func, **kwargs)
-
- # if we have a dtype == 'M8[ns]', provide boxed values
- def infer(x):
- if x.empty:
- return lib.map_infer(x, func, ignore_na=ignore_na)
- return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)
-
- return self.apply(infer).__finalize__(self, "applymap")
-
- # ----------------------------------------------------------------------
- # Merging / joining methods
-
- def _append(
- self,
- other,
- ignore_index: bool = False,
- verify_integrity: bool = False,
- sort: bool = False,
- ) -> DataFrame:
- if isinstance(other, (Series, dict)):
- if isinstance(other, dict):
- if not ignore_index:
- raise TypeError("Can only append a dict if ignore_index=True")
- other = Series(other)
- if other.name is None and not ignore_index:
- raise TypeError(
- "Can only append a Series if ignore_index=True "
- "or if the Series has a name"
- )
-
- index = Index(
- [other.name],
- name=self.index.names
- if isinstance(self.index, MultiIndex)
- else self.index.name,
- )
- row_df = other.to_frame().T
- # infer_objects is needed for
- # test_append_empty_frame_to_series_with_dateutil_tz
- other = row_df.infer_objects(copy=False).rename_axis(
- index.names, copy=False
- )
- elif isinstance(other, list):
- if not other:
- pass
- elif not isinstance(other[0], DataFrame):
- other = DataFrame(other)
- if self.index.name is not None and not ignore_index:
- other.index.name = self.index.name
-
- from pandas.core.reshape.concat import concat
-
- if isinstance(other, (list, tuple)):
- to_concat = [self, *other]
- else:
- to_concat = [self, other]
-
- result = concat(
- to_concat,
- ignore_index=ignore_index,
- verify_integrity=verify_integrity,
- sort=sort,
- )
- return result.__finalize__(self, method="append")
-
- def join(
- self,
- other: DataFrame | Series | Iterable[DataFrame | Series],
- on: IndexLabel | None = None,
- how: MergeHow = "left",
- lsuffix: str = "",
- rsuffix: str = "",
- sort: bool = False,
- validate: str | None = None,
- ) -> DataFrame:
- """
- Join columns of another DataFrame.
-
- Join columns with `other` DataFrame either on index or on a key
- column. Efficiently join multiple DataFrame objects by index at once by
- passing a list.
-
- Parameters
- ----------
- other : DataFrame, Series, or a list containing any combination of them
- Index should be similar to one of the columns in this one. If a
- Series is passed, its name attribute must be set, and that will be
- used as the column name in the resulting joined DataFrame.
- on : str, list of str, or array-like, optional
- Column or index level name(s) in the caller to join on the index
- in `other`, otherwise joins index-on-index. If multiple
- values given, the `other` DataFrame must have a MultiIndex. Can
- pass an array as the join key if it is not already contained in
- the calling DataFrame. Like an Excel VLOOKUP operation.
- how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
- How to handle the operation of the two objects.
-
- * left: use calling frame's index (or column if on is specified)
- * right: use `other`'s index.
- * outer: form union of calling frame's index (or column if on is
-              specified) with `other`'s index, and sort it
- lexicographically.
- * inner: form intersection of calling frame's index (or column if
- on is specified) with `other`'s index, preserving the order
-              of the calling frame's index.
- * cross: creates the cartesian product from both frames, preserves the order
- of the left keys.
-
- .. versionadded:: 1.2.0
-
- lsuffix : str, default ''
- Suffix to use from left frame's overlapping columns.
- rsuffix : str, default ''
- Suffix to use from right frame's overlapping columns.
- sort : bool, default False
- Order result DataFrame lexicographically by the join key. If False,
- the order of the join key depends on the join type (how keyword).
- validate : str, optional
- If specified, checks if join is of specified type.
- * "one_to_one" or "1:1": check if join keys are unique in both left
- and right datasets.
- * "one_to_many" or "1:m": check if join keys are unique in left dataset.
- * "many_to_one" or "m:1": check if join keys are unique in right dataset.
-            * "many_to_many" or "m:m": allowed, but does not result in checks.
-
-            .. versionadded:: 1.5.0
-
- Returns
- -------
- DataFrame
- A dataframe containing columns from both the caller and `other`.
-
- See Also
- --------
- DataFrame.merge : For column(s)-on-column(s) operations.
-
- Notes
- -----
- Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
- passing a list of `DataFrame` objects.
-
- Support for specifying index levels as the `on` parameter was added
- in version 0.23.0.
-
- Examples
- --------
- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
- ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
-
- >>> df
- key A
- 0 K0 A0
- 1 K1 A1
- 2 K2 A2
- 3 K3 A3
- 4 K4 A4
- 5 K5 A5
-
- >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
- ... 'B': ['B0', 'B1', 'B2']})
-
- >>> other
- key B
- 0 K0 B0
- 1 K1 B1
- 2 K2 B2
-
- Join DataFrames using their indexes.
-
- >>> df.join(other, lsuffix='_caller', rsuffix='_other')
- key_caller A key_other B
- 0 K0 A0 K0 B0
- 1 K1 A1 K1 B1
- 2 K2 A2 K2 B2
- 3 K3 A3 NaN NaN
- 4 K4 A4 NaN NaN
- 5 K5 A5 NaN NaN
-
- If we want to join using the key columns, we need to set key to be
- the index in both `df` and `other`. The joined DataFrame will have
- key as its index.
-
- >>> df.set_index('key').join(other.set_index('key'))
- A B
- key
- K0 A0 B0
- K1 A1 B1
- K2 A2 B2
- K3 A3 NaN
- K4 A4 NaN
- K5 A5 NaN
-
- Another option to join using the key columns is to use the `on`
- parameter. DataFrame.join always uses `other`'s index but we can use
- any column in `df`. This method preserves the original DataFrame's
- index in the result.
-
- >>> df.join(other.set_index('key'), on='key')
- key A B
- 0 K0 A0 B0
- 1 K1 A1 B1
- 2 K2 A2 B2
- 3 K3 A3 NaN
- 4 K4 A4 NaN
- 5 K5 A5 NaN
-
- Using non-unique key values shows how they are matched.
-
- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
- ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
-
- >>> df
- key A
- 0 K0 A0
- 1 K1 A1
- 2 K1 A2
- 3 K3 A3
- 4 K0 A4
- 5 K1 A5
-
- >>> df.join(other.set_index('key'), on='key', validate='m:1')
- key A B
- 0 K0 A0 B0
- 1 K1 A1 B1
- 2 K1 A2 B1
- 3 K3 A3 NaN
- 4 K0 A4 B0
- 5 K1 A5 B1
- """
- return self._join_compat(
- other,
- on=on,
- how=how,
- lsuffix=lsuffix,
- rsuffix=rsuffix,
- sort=sort,
- validate=validate,
- )
-
- def _join_compat(
- self,
- other: DataFrame | Series | Iterable[DataFrame | Series],
- on: IndexLabel | None = None,
- how: MergeHow = "left",
- lsuffix: str = "",
- rsuffix: str = "",
- sort: bool = False,
- validate: str | None = None,
- ):
- from pandas.core.reshape.concat import concat
- from pandas.core.reshape.merge import merge
-
- if isinstance(other, Series):
- if other.name is None:
- raise ValueError("Other Series must have a name")
- other = DataFrame({other.name: other})
-
- if isinstance(other, DataFrame):
- if how == "cross":
- return merge(
- self,
- other,
- how=how,
- on=on,
- suffixes=(lsuffix, rsuffix),
- sort=sort,
- validate=validate,
- )
- return merge(
- self,
- other,
- left_on=on,
- how=how,
- left_index=on is None,
- right_index=True,
- suffixes=(lsuffix, rsuffix),
- sort=sort,
- validate=validate,
- )
- else:
- if on is not None:
- raise ValueError(
- "Joining multiple DataFrames only supported for joining on index"
- )
-
- if rsuffix or lsuffix:
- raise ValueError(
- "Suffixes not supported when joining multiple DataFrames"
- )
-
- # Mypy thinks the RHS is a
- # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
- # the LHS is an "Iterable[DataFrame]", but in reality both types are
- # "Iterable[Union[DataFrame, Series]]" due to the if statements
- frames = [cast("DataFrame | Series", self)] + list(other)
-
- can_concat = all(df.index.is_unique for df in frames)
-
- # join indexes only using concat
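-            # (when indexes are not unique we fall back to pairwise merges below)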
- if can_concat:
- if how == "left":
- res = concat(
- frames, axis=1, join="outer", verify_integrity=True, sort=sort
- )
- return res.reindex(self.index, copy=False)
- else:
- return concat(
- frames, axis=1, join=how, verify_integrity=True, sort=sort
- )
-
- joined = frames[0]
-
- for frame in frames[1:]:
- joined = merge(
- joined,
- frame,
- how=how,
- left_index=True,
- right_index=True,
- validate=validate,
- )
-
- return joined
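-
- # Usage sketch (illustrative only; the frames below are made up): joining a
- # list of DataFrames takes the concat path above when every index is unique.
- #
- #     import pandas as pd
- #     left = pd.DataFrame({"A": [1, 2]}, index=["x", "y"])
- #     others = [pd.DataFrame({"B": [3, 4]}, index=["x", "y"]),
- #               pd.DataFrame({"C": [5, 6]}, index=["x", "y"])]
- #     left.join(others)  # single column-wise concat, aligned on the shared index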
-
- @Substitution("")
- @Appender(_merge_doc, indents=2)
- def merge(
- self,
- right: DataFrame | Series,
- how: MergeHow = "inner",
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- left_index: bool = False,
- right_index: bool = False,
- sort: bool = False,
- suffixes: Suffixes = ("_x", "_y"),
- copy: bool | None = None,
- indicator: str | bool = False,
- validate: str | None = None,
- ) -> DataFrame:
- from pandas.core.reshape.merge import merge
-
- return merge(
- self,
- right,
- how=how,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- sort=sort,
- suffixes=suffixes,
- copy=copy,
- indicator=indicator,
- validate=validate,
- )
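-
- # Usage sketch (illustrative): DataFrame.merge is a thin wrapper around the
- # pandas.core.reshape.merge.merge function imported in the method body above.
- #
- #     left = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]})
- #     right = pd.DataFrame({"key": ["a", "b"], "y": [3, 4]})
- #     left.merge(right, on="key", how="inner")  # columns: key, x, y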
-
- def round(
- self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
- ) -> DataFrame:
- """
- Round a DataFrame to a variable number of decimal places.
-
- Parameters
- ----------
- decimals : int, dict, Series
- Number of decimal places to round each column to. If an int is
- given, round each column to the same number of places.
- Otherwise dict and Series round to variable numbers of places.
- Column names should be in the keys if `decimals` is a
- dict-like, or in the index if `decimals` is a Series. Any
- columns not included in `decimals` will be left as is. Elements
- of `decimals` which are not columns of the input will be
- ignored.
- *args
- Additional keywords have no effect but might be accepted for
- compatibility with numpy.
- **kwargs
- Additional keywords have no effect but might be accepted for
- compatibility with numpy.
-
- Returns
- -------
- DataFrame
- A DataFrame with the affected columns rounded to the specified
- number of decimal places.
-
- See Also
- --------
- numpy.around : Round a numpy array to the given number of decimals.
- Series.round : Round a Series to the given number of decimals.
-
- Examples
- --------
- >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
- ... columns=['dogs', 'cats'])
- >>> df
- dogs cats
- 0 0.21 0.32
- 1 0.01 0.67
- 2 0.66 0.03
- 3 0.21 0.18
-
- By providing an integer, each column is rounded to the same number
- of decimal places.
-
- >>> df.round(1)
- dogs cats
- 0 0.2 0.3
- 1 0.0 0.7
- 2 0.7 0.0
- 3 0.2 0.2
-
- With a dict, the number of places for specific columns can be
- specified with the column names as key and the number of decimal
- places as value
-
- >>> df.round({'dogs': 1, 'cats': 0})
- dogs cats
- 0 0.2 0.0
- 1 0.0 1.0
- 2 0.7 0.0
- 3 0.2 0.0
-
- Using a Series, the number of places for specific columns can be
- specified with the column names as index and the number of
- decimal places as value
-
- >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
- >>> df.round(decimals)
- dogs cats
- 0 0.2 0.0
- 1 0.0 1.0
- 2 0.7 0.0
- 3 0.2 0.0
- """
- from pandas.core.reshape.concat import concat
-
- def _dict_round(df: DataFrame, decimals):
- for col, vals in df.items():
- try:
- yield _series_round(vals, decimals[col])
- except KeyError:
- yield vals
-
- def _series_round(ser: Series, decimals: int) -> Series:
- if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
- return ser.round(decimals)
- return ser
-
- nv.validate_round(args, kwargs)
-
- if isinstance(decimals, (dict, Series)):
- if isinstance(decimals, Series) and not decimals.index.is_unique:
- raise ValueError("Index of decimals must be unique")
- if is_dict_like(decimals) and not all(
- is_integer(value) for _, value in decimals.items()
- ):
- raise TypeError("Values in decimals must be integers")
- new_cols = list(_dict_round(self, decimals))
- elif is_integer(decimals):
- # Dispatch to Block.round
- return self._constructor(
- self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()),
- ).__finalize__(self, method="round")
- else:
- raise TypeError("decimals must be an integer, a dict-like or a Series")
-
- if new_cols is not None and len(new_cols) > 0:
- return self._constructor(
- concat(new_cols, axis=1), index=self.index, columns=self.columns
- ).__finalize__(self, method="round")
- else:
- return self.copy(deep=False)
-
- # ----------------------------------------------------------------------
- # Statistical methods, etc.
-
- def corr(
- self,
- method: CorrelationMethod = "pearson",
- min_periods: int = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Compute pairwise correlation of columns, excluding NA/null values.
-
- Parameters
- ----------
- method : {'pearson', 'kendall', 'spearman'} or callable
- Method of correlation:
-
- * pearson : standard correlation coefficient
- * kendall : Kendall Tau correlation coefficient
- * spearman : Spearman rank correlation
- * callable: callable with input two 1d ndarrays
- and returning a float. Note that the returned matrix from corr
- will have 1 along the diagonals and will be symmetric
- regardless of the callable's behavior.
- min_periods : int, optional
- Minimum number of observations required per pair of columns
- to have a valid result. Currently only available for Pearson
- and Spearman correlation.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
-
- Returns
- -------
- DataFrame
- Correlation matrix.
-
- See Also
- --------
- DataFrame.corrwith : Compute pairwise correlation with another
- DataFrame or Series.
- Series.corr : Compute the correlation between two Series.
-
- Notes
- -----
- Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
-
- * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
- * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
- * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
-
- Examples
- --------
- >>> def histogram_intersection(a, b):
- ... v = np.minimum(a, b).sum().round(decimals=1)
- ... return v
- >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
- ... columns=['dogs', 'cats'])
- >>> df.corr(method=histogram_intersection)
- dogs cats
- dogs 1.0 0.3
- cats 0.3 1.0
-
- >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
- ... columns=['dogs', 'cats'])
- >>> df.corr(min_periods=3)
- dogs cats
- dogs 1.0 NaN
- cats NaN 1.0
- """ # noqa:E501
- data = self._get_numeric_data() if numeric_only else self
- cols = data.columns
- idx = cols.copy()
- mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
-
- if method == "pearson":
- correl = libalgos.nancorr(mat, minp=min_periods)
- elif method == "spearman":
- correl = libalgos.nancorr_spearman(mat, minp=min_periods)
- elif method == "kendall" or callable(method):
- if min_periods is None:
- min_periods = 1
- mat = mat.T
- corrf = nanops.get_corr_func(method)
- K = len(cols)
- correl = np.empty((K, K), dtype=float)
- mask = np.isfinite(mat)
- for i, ac in enumerate(mat):
- for j, bc in enumerate(mat):
- if i > j:
- continue
-
- valid = mask[i] & mask[j]
- if valid.sum() < min_periods:
- c = np.nan
- elif i == j:
- c = 1.0
- elif not valid.all():
- c = corrf(ac[valid], bc[valid])
- else:
- c = corrf(ac, bc)
- correl[i, j] = c
- correl[j, i] = c
- else:
- raise ValueError(
- "method must be either 'pearson', "
- "'spearman', 'kendall', or a callable, "
- f"'{method}' was supplied"
- )
-
- result = self._constructor(correl, index=idx, columns=cols, copy=False)
- return result.__finalize__(self, method="corr")
-
- def cov(
- self,
- min_periods: int | None = None,
- ddof: int | None = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Compute pairwise covariance of columns, excluding NA/null values.
-
- Compute the pairwise covariance among the series of a DataFrame.
- The returned data frame is the `covariance matrix
- <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
- of the DataFrame.
-
- Both NA and null values are automatically excluded from the
- calculation. (See the note below about bias from missing values.)
- A threshold can be set for the minimum number of
- observations for each value created. Comparisons with observations
- below this threshold will be returned as ``NaN``.
-
- This method is generally used for the analysis of time series data to
- understand the relationship between different measures
- across time.
-
- Parameters
- ----------
- min_periods : int, optional
- Minimum number of observations required per pair of columns
- to have a valid result.
-
- ddof : int, default 1
- Delta degrees of freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
-
- .. versionadded:: 1.1.0
-
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
-
- Returns
- -------
- DataFrame
- The covariance matrix of the series of the DataFrame.
-
- See Also
- --------
- Series.cov : Compute covariance with another Series.
- core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
- covariance.
- core.window.expanding.Expanding.cov : Expanding sample covariance.
- core.window.rolling.Rolling.cov : Rolling sample covariance.
-
- Notes
- -----
- Returns the covariance matrix of the DataFrame's time series.
- The covariance is normalized by N-ddof.
-
- For DataFrames that have Series that are missing data (assuming that
- data is `missing at random
- <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
- the returned covariance matrix will be an unbiased estimate
- of the variance and covariance between the member Series.
-
- However, for many applications this estimate may not be acceptable
- because the estimated covariance matrix is not guaranteed to be positive
- semi-definite. This could lead to estimated correlations having
- absolute values which are greater than one, and/or a non-invertible
- covariance matrix. See `Estimation of covariance matrices
- <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`__
- for more details.
-
- Examples
- --------
- >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
- ... columns=['dogs', 'cats'])
- >>> df.cov()
- dogs cats
- dogs 0.666667 -1.000000
- cats -1.000000 1.666667
-
- >>> np.random.seed(42)
- >>> df = pd.DataFrame(np.random.randn(1000, 5),
- ... columns=['a', 'b', 'c', 'd', 'e'])
- >>> df.cov()
- a b c d e
- a 0.998438 -0.020161 0.059277 -0.008943 0.014144
- b -0.020161 1.059352 -0.008543 -0.024738 0.009826
- c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
- d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
- e 0.014144 0.009826 -0.000271 -0.013692 0.977795
-
- **Minimum number of periods**
-
- This method also supports an optional ``min_periods`` keyword
- that specifies the required minimum number of non-NA observations for
- each column pair in order to have a valid result:
-
- >>> np.random.seed(42)
- >>> df = pd.DataFrame(np.random.randn(20, 3),
- ... columns=['a', 'b', 'c'])
- >>> df.loc[df.index[:5], 'a'] = np.nan
- >>> df.loc[df.index[5:10], 'b'] = np.nan
- >>> df.cov(min_periods=12)
- a b c
- a 0.316741 NaN -0.150812
- b NaN 1.248003 0.191417
- c -0.150812 0.191417 0.895202
- """
- data = self._get_numeric_data() if numeric_only else self
- cols = data.columns
- idx = cols.copy()
- mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
-
- if notna(mat).all():
- if min_periods is not None and min_periods > len(mat):
- base_cov = np.empty((mat.shape[1], mat.shape[1]))
- base_cov.fill(np.nan)
- else:
- base_cov = np.cov(mat.T, ddof=ddof)
- base_cov = base_cov.reshape((len(cols), len(cols)))
- else:
- base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
-
- result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
- return result.__finalize__(self, method="cov")
-
- def corrwith(
- self,
- other: DataFrame | Series,
- axis: Axis = 0,
- drop: bool = False,
- method: CorrelationMethod = "pearson",
- numeric_only: bool = False,
- ) -> Series:
- """
- Compute pairwise correlation.
-
- Pairwise correlation is computed between rows or columns of
- DataFrame with rows or columns of Series or DataFrame. DataFrames
- are first aligned along both axes before computing the
- correlations.
-
- Parameters
- ----------
- other : DataFrame, Series
- Object with which to compute correlations.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
- column-wise.
- drop : bool, default False
- Drop missing indices from result.
- method : {'pearson', 'kendall', 'spearman'} or callable
- Method of correlation:
-
- * pearson : standard correlation coefficient
- * kendall : Kendall Tau correlation coefficient
- * spearman : Spearman rank correlation
- * callable: callable with input two 1d ndarrays
- and returning a float.
-
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
-
- Returns
- -------
- Series
- Pairwise correlations.
-
- See Also
- --------
- DataFrame.corr : Compute pairwise correlation of columns.
-
- Examples
- --------
- >>> index = ["a", "b", "c", "d", "e"]
- >>> columns = ["one", "two", "three", "four"]
- >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
- >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
- >>> df1.corrwith(df2)
- one 1.0
- two 1.0
- three 1.0
- four 1.0
- dtype: float64
-
- >>> df2.corrwith(df1, axis=1)
- a 1.0
- b 1.0
- c 1.0
- d 1.0
- e NaN
- dtype: float64
- """ # noqa:E501
- axis = self._get_axis_number(axis)
- this = self._get_numeric_data() if numeric_only else self
-
- if isinstance(other, Series):
- return this.apply(lambda x: other.corr(x, method=method), axis=axis)
-
- if numeric_only:
- other = other._get_numeric_data()
- left, right = this.align(other, join="inner", copy=False)
-
- if axis == 1:
- left = left.T
- right = right.T
-
- if method == "pearson":
- # mask missing values
- left = left + right * 0
- right = right + left * 0
-
- # demeaned data
- ldem = left - left.mean(numeric_only=numeric_only)
- rdem = right - right.mean(numeric_only=numeric_only)
-
- num = (ldem * rdem).sum()
- dom = (
- (left.count() - 1)
- * left.std(numeric_only=numeric_only)
- * right.std(numeric_only=numeric_only)
- )
-
- correl = num / dom
-
- elif method in ["kendall", "spearman"] or callable(method):
-
- def c(x):
- return nanops.nancorr(x[0], x[1], method=method)
-
- correl = self._constructor_sliced(
- map(c, zip(left.values.T, right.values.T)),
- index=left.columns,
- copy=False,
- )
-
- else:
- raise ValueError(
- f"Invalid method {method} was passed, "
- "valid methods are: 'pearson', 'kendall', "
- "'spearman', or callable"
- )
-
- if not drop:
- # Find non-matching labels along the given axis
- # and append missing correlations (GH 22375)
- raxis: AxisInt = 1 if axis == 0 else 0
- result_index = this._get_axis(raxis).union(other._get_axis(raxis))
- idx_diff = result_index.difference(correl.index)
-
- if len(idx_diff) > 0:
- correl = correl._append(
- Series([np.nan] * len(idx_diff), index=idx_diff)
- )
-
- return correl
-
- # ----------------------------------------------------------------------
- # ndarray-like stats methods
-
- def count(self, axis: Axis = 0, numeric_only: bool = False):
- """
- Count non-NA cells for each column or row.
-
- The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
- on `pandas.options.mode.use_inf_as_na`) are considered NA.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- If 0 or 'index' counts are generated for each column.
- If 1 or 'columns' counts are generated for each row.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- Returns
- -------
- Series
- For each column/row the number of non-NA/null entries.
-
- See Also
- --------
- Series.count: Number of non-NA elements in a Series.
- DataFrame.value_counts: Count unique combinations of columns.
- DataFrame.shape: Number of DataFrame rows and columns (including NA
- elements).
- DataFrame.isna: Boolean same-sized DataFrame showing places of NA
- elements.
-
- Examples
- --------
- Constructing DataFrame from a dictionary:
-
- >>> df = pd.DataFrame({"Person":
- ... ["John", "Myla", "Lewis", "John", "Myla"],
- ... "Age": [24., np.nan, 21., 33, 26],
- ... "Single": [False, True, True, True, False]})
- >>> df
- Person Age Single
- 0 John 24.0 False
- 1 Myla NaN True
- 2 Lewis 21.0 True
- 3 John 33.0 True
- 4 Myla 26.0 False
-
- Notice the uncounted NA values:
-
- >>> df.count()
- Person 5
- Age 4
- Single 5
- dtype: int64
-
- Counts for each **row**:
-
- >>> df.count(axis='columns')
- 0 3
- 1 2
- 2 3
- 3 3
- 4 3
- dtype: int64
- """
- axis = self._get_axis_number(axis)
-
- if numeric_only:
- frame = self._get_numeric_data()
- else:
- frame = self
-
- # GH #423
- if len(frame._get_axis(axis)) == 0:
- result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
- else:
- if frame._is_mixed_type or frame._mgr.any_extension_types:
- # the or any_extension_types is really only hit for single-
- # column frames with an extension array
- result = notna(frame).sum(axis=axis)
- else:
- # GH13407
- series_counts = notna(frame).sum(axis=axis)
- counts = series_counts._values
- result = self._constructor_sliced(
- counts, index=frame._get_agg_axis(axis), copy=False
- )
-
- return result.astype("int64").__finalize__(self, method="count")
-
- def _reduce(
- self,
- op,
- name: str,
- *,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- filter_type=None,
- **kwds,
- ):
- assert filter_type is None or filter_type == "bool", filter_type
- out_dtype = "bool" if filter_type == "bool" else None
-
- if axis is not None:
- axis = self._get_axis_number(axis)
-
- def func(values: np.ndarray):
- # We only use this in the case that operates on self.values
- return op(values, axis=axis, skipna=skipna, **kwds)
-
- def blk_func(values, axis: Axis = 1):
- if isinstance(values, ExtensionArray):
- if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
- self._mgr, ArrayManager
- ):
- return values._reduce(name, axis=1, skipna=skipna, **kwds)
- return values._reduce(name, skipna=skipna, **kwds)
- else:
- return op(values, axis=axis, skipna=skipna, **kwds)
-
- def _get_data() -> DataFrame:
- if filter_type is None:
- data = self._get_numeric_data()
- else:
- # GH#25101, GH#24434
- assert filter_type == "bool"
- data = self._get_bool_data()
- return data
-
- # Case with EAs see GH#35881
- df = self
- if numeric_only:
- df = _get_data()
- if axis is None:
- return func(df.values)
- elif axis == 1:
- if len(df.index) == 0:
- # Taking a transpose would result in no columns, losing the dtype.
- # In the empty case, reducing along axis 0 or 1 gives the same
- # result dtype, so reduce with axis=0 and ignore values
- result = df._reduce(
- op,
- name,
- axis=0,
- skipna=skipna,
- numeric_only=False,
- filter_type=filter_type,
- **kwds,
- ).iloc[:0]
- result.index = df.index
- return result
- df = df.T
-
- # After possibly _get_data and transposing, we are now in the
- # simple case where we can use BlockManager.reduce
- res = df._mgr.reduce(blk_func)
- out = df._constructor(res).iloc[0]
- if out_dtype is not None:
- out = out.astype(out_dtype)
- elif (df._mgr.get_dtypes() == object).any():
- out = out.astype(object)
- elif len(self) == 0 and name in ("sum", "prod"):
- # Even if we are object dtype, follow numpy and return
- # float64, see test_apply_funcs_over_empty
- out = out.astype(np.float64)
-
- return out
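-
- # Sketch (illustrative, simplified relative to the real call chain): the
- # public reductions funnel into _reduce with the matching nanops function.
- #
- #     df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
- #     df.sum()                                   # a 3.0, b 7.0
- #     df._reduce(nanops.nansum, "sum", axis=0)   # effectively the same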
-
- def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
- """
- Special case for _reduce to try to avoid a potentially-expensive transpose.
-
- Apply the reduction block-wise along axis=1 and then reduce the resulting
- 1D arrays.
- """
- if name == "all":
- result = np.ones(len(self), dtype=bool)
- ufunc = np.logical_and
- elif name == "any":
- result = np.zeros(len(self), dtype=bool)
- # error: Incompatible types in assignment
- # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
- # Literal[20], Literal[False]]", variable has type
- # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
- # Literal[True]]")
- ufunc = np.logical_or # type: ignore[assignment]
- else:
- raise NotImplementedError(name)
-
- for arr in self._mgr.arrays:
- middle = func(arr, axis=0, skipna=skipna)
- result = ufunc(result, middle)
-
- res_ser = self._constructor_sliced(result, index=self.index, copy=False)
- return res_ser
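-
- # Sketch (illustrative): for ``any`` the block-wise loop above OR-s the
- # per-block reductions into one row-wise result, i.e. the path behind
- #
- #     df = pd.DataFrame({"a": [True, False], "b": [False, False]})
- #     df.any(axis=1)  # 0 True, 1 False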
-
- def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
- """
- Count number of distinct elements in specified axis.
-
- Return Series with number of distinct elements. Can ignore NaN
- values.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
- column-wise.
- dropna : bool, default True
- Don't include NaN in the counts.
-
- Returns
- -------
- Series
-
- See Also
- --------
- Series.nunique: Method nunique for Series.
- DataFrame.count: Count non-NA cells for each column or row.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
- >>> df.nunique()
- A 3
- B 2
- dtype: int64
-
- >>> df.nunique(axis=1)
- 0 1
- 1 2
- 2 2
- dtype: int64
- """
- return self.apply(Series.nunique, axis=axis, dropna=dropna)
-
- @doc(_shared_docs["idxmin"], numeric_only_default="False")
- def idxmin(
- self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
- ) -> Series:
- axis = self._get_axis_number(axis)
- if numeric_only:
- data = self._get_numeric_data()
- else:
- data = self
-
- res = data._reduce(
- nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
-
- # indices will always be np.ndarray since axis is not None and
- # values is a 2d array for DataFrame
- # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
- assert isinstance(indices, np.ndarray) # for mypy
-
- index = data._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
- return final_result.__finalize__(self, method="idxmin")
-
- @doc(_shared_docs["idxmax"], numeric_only_default="False")
- def idxmax(
- self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
- ) -> Series:
- axis = self._get_axis_number(axis)
- if numeric_only:
- data = self._get_numeric_data()
- else:
- data = self
-
- res = data._reduce(
- nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
-
- # indices will always be np.ndarray since axis is not None and
- # values is a 2d array for DataFrame
- # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
- assert isinstance(indices, np.ndarray) # for mypy
-
- index = data._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
- return final_result.__finalize__(self, method="idxmax")
-
- def _get_agg_axis(self, axis_num: int) -> Index:
- """
- Let's be explicit about this.
- """
- if axis_num == 0:
- return self.columns
- elif axis_num == 1:
- return self.index
- else:
- raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
-
- def mode(
- self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
- ) -> DataFrame:
- """
- Get the mode(s) of each element along the selected axis.
-
- The mode of a set of values is the value that appears most often.
- It can be multiple values.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to iterate over while searching for the mode:
-
- * 0 or 'index' : get mode of each column
- * 1 or 'columns' : get mode of each row.
-
- numeric_only : bool, default False
- If True, only apply to numeric columns.
- dropna : bool, default True
- Don't consider counts of NaN/NaT.
-
- Returns
- -------
- DataFrame
- The modes of each column or row.
-
- See Also
- --------
- Series.mode : Return the highest frequency value in a Series.
- Series.value_counts : Return the counts of values in a Series.
-
- Examples
- --------
- >>> df = pd.DataFrame([('bird', 2, 2),
- ... ('mammal', 4, np.nan),
- ... ('arthropod', 8, 0),
- ... ('bird', 2, np.nan)],
- ... index=('falcon', 'horse', 'spider', 'ostrich'),
- ... columns=('species', 'legs', 'wings'))
- >>> df
- species legs wings
- falcon bird 2 2.0
- horse mammal 4 NaN
- spider arthropod 8 0.0
- ostrich bird 2 NaN
-
- By default, missing values are not considered, and the modes of ``wings``
- are 0 and 2. Because the resulting DataFrame has two rows,
- the second row of ``species`` and ``legs`` contains ``NaN``.
-
- >>> df.mode()
- species legs wings
- 0 bird 2.0 0.0
- 1 NaN NaN 2.0
-
- Setting ``dropna=False``, ``NaN`` values are considered and can be
- the mode (as for ``wings``).
-
- >>> df.mode(dropna=False)
- species legs wings
- 0 bird 2 NaN
-
- Setting ``numeric_only=True``, only the mode of numeric columns is
- computed, and columns of other types are ignored.
-
- >>> df.mode(numeric_only=True)
- legs wings
- 0 2.0 0.0
- 1 NaN 2.0
-
- To compute the mode over columns and not rows, use the axis parameter:
-
- >>> df.mode(axis='columns', numeric_only=True)
- 0 1
- falcon 2.0 NaN
- horse 4.0 NaN
- spider 0.0 8.0
- ostrich 2.0 NaN
- """
- data = self if not numeric_only else self._get_numeric_data()
-
- def f(s):
- return s.mode(dropna=dropna)
-
- data = data.apply(f, axis=axis)
- # Ensure index is type stable (should always use int index)
- if data.empty:
- data.index = default_index(0)
-
- return data
-
- @overload
- def quantile(
- self,
- q: float = ...,
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- ) -> Series:
- ...
-
- @overload
- def quantile(
- self,
- q: AnyArrayLike | Sequence[float],
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- ) -> Series | DataFrame:
- ...
-
- @overload
- def quantile(
- self,
- q: float | AnyArrayLike | Sequence[float] = ...,
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- ) -> Series | DataFrame:
- ...
-
- def quantile(
- self,
- q: float | AnyArrayLike | Sequence[float] = 0.5,
- axis: Axis = 0,
- numeric_only: bool = False,
- interpolation: QuantileInterpolation = "linear",
- method: Literal["single", "table"] = "single",
- ) -> Series | DataFrame:
- """
- Return values at the given quantile over requested axis.
-
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- Value(s) between 0 and 1, the quantile(s) to compute.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
-
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- This optional parameter specifies the interpolation method to use,
- when the desired quantile lies between two data points `i` and `j`:
-
- * linear: `i + (j - i) * fraction`, where `fraction` is the
- fractional part of the index surrounded by `i` and `j`.
- * lower: `i`.
- * higher: `j`.
- * nearest: `i` or `j` whichever is nearest.
- * midpoint: (`i` + `j`) / 2.
- method : {'single', 'table'}, default 'single'
- Whether to compute quantiles per-column ('single') or over all columns
- ('table'). When 'table', the only allowed interpolation methods are
- 'nearest', 'lower', and 'higher'.
-
- Returns
- -------
- Series or DataFrame
-
- If ``q`` is an array, a DataFrame will be returned where the
- index is ``q``, the columns are the columns of self, and the
- values are the quantiles.
- If ``q`` is a float, a Series will be returned where the
- index is the columns of self and the values are the quantiles.
-
- See Also
- --------
- core.window.rolling.Rolling.quantile: Rolling quantile.
- numpy.percentile: Numpy function to compute the percentile.
-
- Examples
- --------
- >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
- ... columns=['a', 'b'])
- >>> df.quantile(.1)
- a 1.3
- b 3.7
- Name: 0.1, dtype: float64
- >>> df.quantile([.1, .5])
- a b
- 0.1 1.3 3.7
- 0.5 2.5 55.0
-
- Specifying `method='table'` will compute the quantile over all columns.
-
- >>> df.quantile(.1, method="table", interpolation="nearest")
- a 1
- b 1
- Name: 0.1, dtype: int64
- >>> df.quantile([.1, .5], method="table", interpolation="nearest")
- a b
- 0.1 1 1
- 0.5 3 100
-
- Specifying `numeric_only=False` will also compute the quantile of
- datetime and timedelta data.
-
- >>> df = pd.DataFrame({'A': [1, 2],
- ... 'B': [pd.Timestamp('2010'),
- ... pd.Timestamp('2011')],
- ... 'C': [pd.Timedelta('1 days'),
- ... pd.Timedelta('2 days')]})
- >>> df.quantile(0.5, numeric_only=False)
- A 1.5
- B 2010-07-02 12:00:00
- C 1 days 12:00:00
- Name: 0.5, dtype: object
- """
- validate_percentile(q)
- axis = self._get_axis_number(axis)
-
- if not is_list_like(q):
- # BlockManager.quantile expects listlike, so we wrap and unwrap here
- # error: List item 0 has incompatible type "Union[float, Union[Union[
- # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
- # expected "float"
- res_df = self.quantile( # type: ignore[call-overload]
- [q],
- axis=axis,
- numeric_only=numeric_only,
- interpolation=interpolation,
- method=method,
- )
- if method == "single":
- res = res_df.iloc[0]
- else:
- # cannot directly iloc over sparse arrays
- res = res_df.T.iloc[:, 0]
- if axis == 1 and len(self) == 0:
- # GH#41544 try to get an appropriate dtype
- dtype = find_common_type(list(self.dtypes))
- if needs_i8_conversion(dtype):
- return res.astype(dtype)
- return res
-
- q = Index(q, dtype=np.float64)
- data = self._get_numeric_data() if numeric_only else self
-
- if axis == 1:
- data = data.T
-
- if len(data.columns) == 0:
- # GH#23925 _get_numeric_data may have dropped all columns
- cols = Index([], name=self.columns.name)
-
- dtype = np.float64
- if axis == 1:
- # GH#41544 try to get an appropriate dtype
- cdtype = find_common_type(list(self.dtypes))
- if needs_i8_conversion(cdtype):
- dtype = cdtype
-
- res = self._constructor([], index=q, columns=cols, dtype=dtype)
- return res.__finalize__(self, method="quantile")
-
- valid_method = {"single", "table"}
- if method not in valid_method:
- raise ValueError(
- f"Invalid method: {method}. Method must be in {valid_method}."
- )
- if method == "single":
- res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)
- elif method == "table":
- valid_interpolation = {"nearest", "lower", "higher"}
- if interpolation not in valid_interpolation:
- raise ValueError(
- f"Invalid interpolation: {interpolation}. "
- f"Interpolation must be in {valid_interpolation}"
- )
- # handle degenerate case
- if len(data) == 0:
- if data.ndim == 2:
- dtype = find_common_type(list(self.dtypes))
- else:
- dtype = self.dtype
- return self._constructor([], index=q, columns=data.columns, dtype=dtype)
-
- q_idx = np.quantile( # type: ignore[call-overload]
- np.arange(len(data)), q, **{np_percentile_argname: interpolation}
- )
-
- by = data.columns
- if len(by) > 1:
- keys = [data._get_label_or_level_values(x) for x in by]
- indexer = lexsort_indexer(keys)
- else:
- by = by[0]
- k = data._get_label_or_level_values(by) # type: ignore[arg-type]
- indexer = nargsort(k)
-
- res = data._mgr.take(indexer[q_idx], verify=False)
- res.axes[1] = q
-
- result = self._constructor(res)
- return result.__finalize__(self, method="quantile")
-
- @doc(NDFrame.asfreq, **_shared_doc_kwargs)
- def asfreq(
- self,
- freq: Frequency,
- method: FillnaOptions | None = None,
- how: str | None = None,
- normalize: bool = False,
- fill_value: Hashable = None,
- ) -> DataFrame:
- return super().asfreq(
- freq=freq,
- method=method,
- how=how,
- normalize=normalize,
- fill_value=fill_value,
- )
-
- @doc(NDFrame.resample, **_shared_doc_kwargs)
- def resample(
- self,
- rule,
- axis: Axis = 0,
- closed: str | None = None,
- label: str | None = None,
- convention: str = "start",
- kind: str | None = None,
- on: Level = None,
- level: Level = None,
- origin: str | TimestampConvertibleTypes = "start_day",
- offset: TimedeltaConvertibleTypes | None = None,
- group_keys: bool = False,
- ) -> Resampler:
- return super().resample(
- rule=rule,
- axis=axis,
- closed=closed,
- label=label,
- convention=convention,
- kind=kind,
- on=on,
- level=level,
- origin=origin,
- offset=offset,
- group_keys=group_keys,
- )
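-
- # Usage sketch (illustrative):
- #
- #     idx = pd.date_range("2023-01-01", periods=4, freq="D")
- #     df = pd.DataFrame({"v": [1, 2, 3, 4]}, index=idx)
- #     df.resample("2D").sum()  # v: 3 for Jan 1-2, 7 for Jan 3-4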
-
- def to_timestamp(
- self,
- freq: Frequency | None = None,
- how: str = "start",
- axis: Axis = 0,
- copy: bool | None = None,
- ) -> DataFrame:
- """
- Cast to DatetimeIndex of timestamps, at *beginning* of period.
-
- Parameters
- ----------
- freq : str, default frequency of PeriodIndex
- Desired frequency.
- how : {'s', 'e', 'start', 'end'}
- Convention for converting period to timestamp; start of period
- vs. end.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to convert (the index by default).
- copy : bool, default True
- If False then underlying input data is not copied.
-
- Returns
- -------
- DataFrame
- The DataFrame has a DatetimeIndex.
-
- Examples
- --------
- >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
- >>> d = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df1 = pd.DataFrame(data=d, index=idx)
- >>> df1
- col1 col2
- 2023 1 3
- 2024 2 4
-
- The resulting timestamps will be at the beginning of the year in this case
-
- >>> df1 = df1.to_timestamp()
- >>> df1
- col1 col2
- 2023-01-01 1 3
- 2024-01-01 2 4
- >>> df1.index
- DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
-
- Using `freq`, which is the offset that the resulting Timestamps will have
-
- >>> df2 = pd.DataFrame(data=d, index=idx)
- >>> df2 = df2.to_timestamp(freq='M')
- >>> df2
- col1 col2
- 2023-01-31 1 3
- 2024-01-31 2 4
- >>> df2.index
- DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
- """
- new_obj = self.copy(deep=copy and not using_copy_on_write())
-
- axis_name = self._get_axis_name(axis)
- old_ax = getattr(self, axis_name)
- if not isinstance(old_ax, PeriodIndex):
- raise TypeError(f"unsupported Type {type(old_ax).__name__}")
-
- new_ax = old_ax.to_timestamp(freq=freq, how=how)
-
- setattr(new_obj, axis_name, new_ax)
- return new_obj
-
- def to_period(
- self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
- ) -> DataFrame:
- """
- Convert DataFrame from DatetimeIndex to PeriodIndex.
-
- Convert DataFrame from DatetimeIndex to PeriodIndex with desired
- frequency (inferred from index if not passed).
-
- Parameters
- ----------
- freq : str, optional
- Frequency of the PeriodIndex; inferred from the index if not passed.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to convert (the index by default).
- copy : bool, default True
- If False then underlying input data is not copied.
-
- Returns
- -------
- DataFrame
- The DataFrame has a PeriodIndex.
-
- Examples
- --------
- >>> idx = pd.to_datetime(
- ... [
- ... "2001-03-31 00:00:00",
- ... "2002-05-31 00:00:00",
- ... "2003-08-31 00:00:00",
- ... ]
- ... )
-
- >>> idx
- DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
- dtype='datetime64[ns]', freq=None)
-
- >>> idx.to_period("M")
- PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
-
- For the yearly frequency
-
- >>> idx.to_period("Y")
- PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]')
- """
- new_obj = self.copy(deep=copy and not using_copy_on_write())
-
- axis_name = self._get_axis_name(axis)
- old_ax = getattr(self, axis_name)
- if not isinstance(old_ax, DatetimeIndex):
- raise TypeError(f"unsupported Type {type(old_ax).__name__}")
-
- new_ax = old_ax.to_period(freq=freq)
-
- setattr(new_obj, axis_name, new_ax)
- return new_obj
-
- def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
- """
- Whether each element in the DataFrame is contained in values.
-
- Parameters
- ----------
- values : iterable, Series, DataFrame or dict
- The result will only be true at a location if all the
- labels match. If `values` is a Series, matching is done on
- its index. If `values` is a dict, the keys must be the
- column names, which must match. If `values` is a DataFrame,
- then both the index and column labels must match.
-
- Returns
- -------
- DataFrame
- DataFrame of booleans showing whether each element in the DataFrame
- is contained in values.
-
- See Also
- --------
- DataFrame.eq: Equality test for DataFrame.
- Series.isin: Equivalent method on Series.
- Series.str.contains: Test if pattern or regex is contained within a
- string of a Series or Index.
-
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
- ... index=['falcon', 'dog'])
- >>> df
- num_legs num_wings
- falcon 2 2
- dog 4 0
-
- When ``values`` is a list, check whether every value in the DataFrame
- is present in the list (which animals have 0 or 2 legs or wings).
-
- >>> df.isin([0, 2])
- num_legs num_wings
- falcon True True
- dog False True
-
- To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
-
- >>> ~df.isin([0, 2])
- num_legs num_wings
- falcon False False
- dog True False
-
- When ``values`` is a dict, we can pass values to check for each
- column separately:
-
- >>> df.isin({'num_wings': [0, 3]})
- num_legs num_wings
- falcon False False
- dog False True
-
- When ``values`` is a Series or DataFrame, the index and columns must
- match. Note that 'falcon' does not match based on the number of legs
- in ``other``.
-
- >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
- ... index=['spider', 'falcon'])
- >>> df.isin(other)
- num_legs num_wings
- falcon False True
- dog False False
- """
- if isinstance(values, dict):
- from pandas.core.reshape.concat import concat
-
- values = collections.defaultdict(list, values)
- result = concat(
- (
- self.iloc[:, [i]].isin(values[col])
- for i, col in enumerate(self.columns)
- ),
- axis=1,
- )
- elif isinstance(values, Series):
- if not values.index.is_unique:
- raise ValueError("cannot compute isin with a duplicate axis.")
- result = self.eq(values.reindex_like(self), axis="index")
- elif isinstance(values, DataFrame):
- if not (values.columns.is_unique and values.index.is_unique):
- raise ValueError("cannot compute isin with a duplicate axis.")
- result = self.eq(values.reindex_like(self))
- else:
- if not is_list_like(values):
- raise TypeError(
- "only list-like or dict-like objects are allowed "
- "to be passed to DataFrame.isin(), "
- f"you passed a '{type(values).__name__}'"
- )
- # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any],
- # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray,
- # ndarray[Any, Any]], Index, Series]"
- result = self._constructor(
- algorithms.isin(
- self.values.ravel(), values # type: ignore[arg-type]
- ).reshape(self.shape),
- self.index,
- self.columns,
- copy=False,
- )
- return result.__finalize__(self, method="isin")
-
- # ----------------------------------------------------------------------
- # Add index and columns
- _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
- _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
- **NDFrame._AXIS_TO_AXIS_NUMBER,
- 1: 1,
- "columns": 1,
- }
- _AXIS_LEN = len(_AXIS_ORDERS)
- _info_axis_number: Literal[1] = 1
- _info_axis_name: Literal["columns"] = "columns"
-
- index = properties.AxisProperty(
- axis=1, doc="The index (row labels) of the DataFrame."
- )
- columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")
-
- # ----------------------------------------------------------------------
- # Add plotting methods to DataFrame
- plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
- hist = pandas.plotting.hist_frame
- boxplot = pandas.plotting.boxplot_frame
- sparse = CachedAccessor("sparse", SparseFrameAccessor)
-
- # ----------------------------------------------------------------------
- # Internal Interface Methods
-
- def _to_dict_of_blocks(self, copy: bool = True):
- """
- Return a dict of dtype -> DataFrame, where each DataFrame
- holds a single homogeneous dtype.
-
- Internal ONLY - only works for BlockManager
- """
- mgr = self._mgr
- # convert to BlockManager if needed -> this way support ArrayManager as well
- mgr = mgr_to_mgr(mgr, "block")
- mgr = cast(BlockManager, mgr)
- return {
- k: self._constructor(v).__finalize__(self)
- for k, v in mgr.to_dict(copy=copy).items()
- }
-
- @property
- def values(self) -> np.ndarray:
- """
- Return a Numpy representation of the DataFrame.
-
- .. warning::
-
- We recommend using :meth:`DataFrame.to_numpy` instead.
-
- Only the values in the DataFrame will be returned, the axes labels
- will be removed.
-
- Returns
- -------
- numpy.ndarray
- The values of the DataFrame.
-
- See Also
- --------
- DataFrame.to_numpy : Recommended alternative to this method.
- DataFrame.index : Retrieve the index labels.
- DataFrame.columns : Retrieving the column names.
-
- Notes
- -----
- The dtype will be a lower-common-denominator dtype (implicit
- upcasting); that is to say if the dtypes (even of numeric types)
- are mixed, the one that accommodates all will be chosen. Use this
- with care if you are not dealing with the blocks.
-
- e.g. If the dtypes are float16 and float32, dtype will be upcast to
- float32. If dtypes are int32 and uint8, dtype will be upcast to
- int32. By :func:`numpy.find_common_type` convention, mixing int64
- and uint64 will result in a float64 dtype.
-
- Examples
- --------
- A DataFrame where all columns are the same type (e.g., int64) results
- in an array of the same type.
-
- >>> df = pd.DataFrame({'age': [ 3, 29],
- ... 'height': [94, 170],
- ... 'weight': [31, 115]})
- >>> df
- age height weight
- 0 3 94 31
- 1 29 170 115
- >>> df.dtypes
- age int64
- height int64
- weight int64
- dtype: object
- >>> df.values
- array([[ 3, 94, 31],
- [ 29, 170, 115]])
-
- A DataFrame with mixed type columns (e.g., str/object, int64, float32)
- results in an ndarray of the broadest type that accommodates these
- mixed types (e.g., object).
-
- >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
- ... ('lion', 80.5, 1),
- ... ('monkey', np.nan, None)],
- ... columns=('name', 'max_speed', 'rank'))
- >>> df2.dtypes
- name object
- max_speed float64
- rank object
- dtype: object
- >>> df2.values
- array([['parrot', 24.0, 'second'],
- ['lion', 80.5, 1],
- ['monkey', nan, None]], dtype=object)
- """
- return self._mgr.as_array()
-
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: bool = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> DataFrame | None:
- ...
-
- def ffill(
- self,
- *,
- axis: None | Axis = None,
- inplace: bool = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> DataFrame | None:
- return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
-
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast=...,
- ) -> DataFrame:
- ...
-
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast=...,
- ) -> None:
- ...
-
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: bool = ...,
- limit: None | int = ...,
- downcast=...,
- ) -> DataFrame | None:
- ...
-
- def bfill(
- self,
- *,
- axis: None | Axis = None,
- inplace: bool = False,
- limit: None | int = None,
- downcast=None,
- ) -> DataFrame | None:
- return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
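-
- # Usage sketch (illustrative) for the two fill directions:
- #
- #     df = pd.DataFrame({"a": [1.0, None, 3.0]})
- #     df.ffill()  # a: 1.0, 1.0, 3.0 (carry the last valid value forward)
- #     df.bfill()  # a: 1.0, 3.0, 3.0 (pull the next valid value backward)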
-
- def clip(
- self: DataFrame,
- lower: float | None = None,
- upper: float | None = None,
- *,
- axis: Axis | None = None,
- inplace: bool = False,
- **kwargs,
- ) -> DataFrame | None:
- return super().clip(lower, upper, axis=axis, inplace=inplace, **kwargs)
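-
- # Usage sketch (illustrative):
- #
- #     df = pd.DataFrame({"a": [-2, 0, 5]})
- #     df.clip(lower=-1, upper=3)  # a: -1, 0, 3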
-
- def interpolate(
- self: DataFrame,
- method: str = "linear",
- *,
- axis: Axis = 0,
- limit: int | None = None,
- inplace: bool = False,
- limit_direction: str | None = None,
- limit_area: str | None = None,
- downcast: str | None = None,
- **kwargs,
- ) -> DataFrame | None:
- return super().interpolate(
- method=method,
- axis=axis,
- limit=limit,
- inplace=inplace,
- limit_direction=limit_direction,
- limit_area=limit_area,
- downcast=downcast,
- **kwargs,
- )
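-
- # Usage sketch (illustrative): linear interpolation of interior NaNs.
- #
- #     df = pd.DataFrame({"a": [1.0, None, 3.0]})
- #     df.interpolate()  # a: 1.0, 2.0, 3.0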
-
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
-
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: bool = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame | None:
- ...
-
- def where(
- self,
- cond,
- other=lib.no_default,
- *,
- inplace: bool = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> DataFrame | None:
- return super().where(
- cond,
- other,
- inplace=inplace,
- axis=axis,
- level=level,
- )
-
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
-
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: bool = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame | None:
- ...
-
- def mask(
- self,
- cond,
- other=lib.no_default,
- *,
- inplace: bool = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> DataFrame | None:
- return super().mask(
- cond,
- other,
- inplace=inplace,
- axis=axis,
- level=level,
- )
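-
- # Usage sketch (illustrative): ``where`` keeps values where the condition
- # holds, ``mask`` replaces them there.
- #
- #     df = pd.DataFrame({"a": [1, 2, 3]})
- #     df.where(df["a"] > 1, other=0)  # a: 0, 2, 3
- #     df.mask(df["a"] > 1, other=0)   # a: 1, 0, 0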
-
-
-DataFrame._add_numeric_operations()
-
-ops.add_flex_arithmetic_methods(DataFrame)
-
-
-def _from_nested_dict(data) -> collections.defaultdict:
- new_data: collections.defaultdict = collections.defaultdict(dict)
- for index, s in data.items():
- for col, v in s.items():
- new_data[col][index] = v
- return new_data
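-
-# Sketch (illustrative): _from_nested_dict flips the orientation of a nested
-# mapping, e.g.
-#
-#     _from_nested_dict({"r1": {"c1": 1, "c2": 2}})
-#     # returns {"c1": {"r1": 1}, "c2": {"r1": 2}} (as a defaultdict)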
-
-
-def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
- # reindex if necessary
-
- if value.index.equals(index) or not len(index):
- return value._values.copy()
-
- # GH#4107
- try:
- reindexed_value = value.reindex(index)._values
- except ValueError as err:
- # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
- if not value.index.is_unique:
- # duplicate axis
- raise err
-
- raise TypeError(
- "incompatible index of inserted column with frame index"
- ) from err
- return reindexed_value
diff --git a/contrib/python/pandas/py3/pandas/core/generic.py b/contrib/python/pandas/py3/pandas/core/generic.py
deleted file mode 100644
index 0243ec38d3b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/generic.py
+++ /dev/null
@@ -1,12604 +0,0 @@
-# pyright: reportPropertyTypeMismatch=false
-from __future__ import annotations
-
-import collections
-import datetime as dt
-from functools import partial
-import gc
-from json import loads
-import operator
-import pickle
-import re
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- ClassVar,
- Hashable,
- Iterator,
- Literal,
- Mapping,
- NoReturn,
- Sequence,
- Type,
- cast,
- final,
- overload,
-)
-import warnings
-import weakref
-
-import numpy as np
-
-from pandas._config import (
- config,
- using_copy_on_write,
-)
-
-from pandas._libs import lib
-from pandas._libs.lib import is_range_indexer
-from pandas._libs.tslibs import (
- Period,
- Tick,
- Timestamp,
- to_offset,
-)
-from pandas._typing import (
- AlignJoin,
- AnyArrayLike,
- ArrayLike,
- Axis,
- AxisInt,
- CompressionOptions,
- Dtype,
- DtypeArg,
- DtypeBackend,
- DtypeObj,
- FilePath,
- FillnaOptions,
- FloatFormatType,
- FormattersType,
- Frequency,
- IgnoreRaise,
- IndexKeyFunc,
- IndexLabel,
- IntervalClosedType,
- JSONSerializable,
- Level,
- Manager,
- NaPosition,
- NDFrameT,
- RandomState,
- Renamer,
- Scalar,
- SortKind,
- StorageOptions,
- Suffixes,
- T,
- TimeAmbiguous,
- TimedeltaConvertibleTypes,
- TimeNonexistent,
- TimestampConvertibleTypes,
- ValueKeyFunc,
- WriteBuffer,
- npt,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.compat.numpy import function as nv
-from pandas.errors import (
- AbstractMethodError,
- InvalidIndexError,
- SettingWithCopyError,
- SettingWithCopyWarning,
-)
-from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import (
- check_dtype_backend,
- validate_ascending,
- validate_bool_kwarg,
- validate_fillna_kwargs,
- validate_inclusive,
-)
-
-from pandas.core.dtypes.astype import astype_is_view
-from pandas.core.dtypes.common import (
- ensure_object,
- ensure_platform_int,
- ensure_str,
- is_bool,
- is_bool_dtype,
- is_datetime64_any_dtype,
- is_datetime64tz_dtype,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float,
- is_list_like,
- is_number,
- is_numeric_dtype,
- is_re_compilable,
- is_scalar,
- is_timedelta64_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.inference import (
- is_hashable,
- is_nested_list_like,
-)
-from pandas.core.dtypes.missing import (
- isna,
- notna,
-)
-
-from pandas.core import (
- algorithms as algos,
- arraylike,
- common,
- indexing,
- nanops,
- sample,
-)
-from pandas.core.array_algos.replace import should_use_regex
-from pandas.core.arrays import ExtensionArray
-from pandas.core.base import PandasObject
-from pandas.core.construction import extract_array
-from pandas.core.flags import Flags
-from pandas.core.indexes.api import (
- DatetimeIndex,
- Index,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- default_index,
- ensure_index,
-)
-from pandas.core.internals import (
- ArrayManager,
- BlockManager,
- SingleArrayManager,
-)
-from pandas.core.internals.construction import (
- mgr_to_mgr,
- ndarray_to_mgr,
-)
-from pandas.core.methods.describe import describe_ndframe
-from pandas.core.missing import (
- clean_fill_method,
- clean_reindex_fill_method,
- find_valid_index,
-)
-from pandas.core.ops import align_method_FRAME
-from pandas.core.reshape.concat import concat
-from pandas.core.shared_docs import _shared_docs
-from pandas.core.sorting import get_indexer_indexer
-from pandas.core.window import (
- Expanding,
- ExponentialMovingWindow,
- Rolling,
- Window,
-)
-
-from pandas.io.formats.format import (
- DataFrameFormatter,
- DataFrameRenderer,
-)
-from pandas.io.formats.printing import pprint_thing
-
-if TYPE_CHECKING:
- from pandas._libs.tslibs import BaseOffset
-
- from pandas.core.frame import DataFrame
- from pandas.core.indexers.objects import BaseIndexer
- from pandas.core.resample import Resampler
- from pandas.core.series import Series
-
- from pandas.io.pytables import HDFStore
-
-
-# goal is to be able to define the docs close to function, while still being
-# able to share
-_shared_docs = {**_shared_docs}
-_shared_doc_kwargs = {
- "axes": "keywords for axes",
- "klass": "Series/DataFrame",
- "axes_single_arg": "int or labels for object",
- "args_transpose": "axes to permute (int or label for object)",
- "inplace": """
- inplace : bool, default False
- If True, performs operation inplace and returns None.""",
- "optional_by": """
- by : str or list of str
- Name or list of names to sort by""",
- "replace_iloc": """
- This differs from updating with ``.loc`` or ``.iloc``, which require
- you to specify a location to update with some value.""",
-}
-
-
-bool_t = bool # Need alias because NDFrame has def bool:
-
-
-class NDFrame(PandasObject, indexing.IndexingMixin):
- """
- N-dimensional analogue of DataFrame. Stores multi-dimensional data in a
- size-mutable, labeled data structure.
-
- Parameters
- ----------
- data : BlockManager
- axes : list
- copy : bool, default False
- """
-
- _internal_names: list[str] = [
- "_mgr",
- "_cacher",
- "_item_cache",
- "_cache",
- "_is_copy",
- "_subtyp",
- "_name",
- "_default_kind",
- "_default_fill_value",
- "_metadata",
- "__array_struct__",
- "__array_interface__",
- "_flags",
- ]
- _internal_names_set: set[str] = set(_internal_names)
- _accessors: set[str] = set()
- _hidden_attrs: frozenset[str] = frozenset([])
- _metadata: list[str] = []
- _is_copy: weakref.ReferenceType[NDFrame] | None = None
- _mgr: Manager
- _attrs: dict[Hashable, Any]
- _typ: str
-
- # ----------------------------------------------------------------------
- # Constructors
-
- def __init__(
- self,
- data: Manager,
- copy: bool_t = False,
- attrs: Mapping[Hashable, Any] | None = None,
- ) -> None:
- # copy kwarg is retained for mypy compat, is not used
-
- object.__setattr__(self, "_is_copy", None)
- object.__setattr__(self, "_mgr", data)
- object.__setattr__(self, "_item_cache", {})
- if attrs is None:
- attrs = {}
- else:
- attrs = dict(attrs)
- object.__setattr__(self, "_attrs", attrs)
- object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
-
- @classmethod
- def _init_mgr(
- cls,
- mgr: Manager,
- axes,
- dtype: Dtype | None = None,
- copy: bool_t = False,
- ) -> Manager:
- """passed a manager and an axes dict"""
- for a, axe in axes.items():
- if axe is not None:
- axe = ensure_index(axe)
- bm_axis = cls._get_block_manager_axis(a)
- mgr = mgr.reindex_axis(axe, axis=bm_axis)
-
- # make a copy if explicitly requested
- if copy:
- mgr = mgr.copy()
- if dtype is not None:
- # avoid further copies if we can
- if (
- isinstance(mgr, BlockManager)
- and len(mgr.blocks) == 1
- and is_dtype_equal(mgr.blocks[0].values.dtype, dtype)
- ):
- pass
- else:
- mgr = mgr.astype(dtype=dtype)
- return mgr
-
- def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT:
- """
- Private helper function to create a DataFrame with a specific manager.
-
- Parameters
- ----------
- typ : {"block", "array"}
- copy : bool, default True
- Only controls whether the conversion from Block->ArrayManager
- copies the 1D arrays (to ensure proper/contiguous memory layout).
-
- Returns
- -------
- DataFrame
- New DataFrame using the specified manager type. May or may not
- be a copy of the original object.
- """
- new_mgr: Manager
- new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
- # fastpath of passing a manager doesn't check the option/manager class
- return self._constructor(new_mgr).__finalize__(self)
-
- # ----------------------------------------------------------------------
- # attrs and flags
-
- @property
- def attrs(self) -> dict[Hashable, Any]:
- """
- Dictionary of global attributes of this dataset.
-
- .. warning::
-
- attrs is experimental and may change without warning.
-
- See Also
- --------
- DataFrame.flags : Global flags applying to this object.
- """
- if self._attrs is None:
- self._attrs = {}
- return self._attrs
-
- @attrs.setter
- def attrs(self, value: Mapping[Hashable, Any]) -> None:
- self._attrs = dict(value)
-
- @final
- @property
- def flags(self) -> Flags:
- """
- Get the properties associated with this pandas object.
-
- The available flags are
-
- * :attr:`Flags.allows_duplicate_labels`
-
- See Also
- --------
- Flags : Flags that apply to pandas objects.
- DataFrame.attrs : Global metadata applying to this dataset.
-
- Notes
- -----
- "Flags" differ from "metadata". Flags reflect properties of the
- pandas object (the Series or DataFrame). Metadata refer to properties
- of the dataset, and should be stored in :attr:`DataFrame.attrs`.
-
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2]})
- >>> df.flags
- <Flags(allows_duplicate_labels=True)>
-
- Flags can be read or set using attribute access (``.``)
-
- >>> df.flags.allows_duplicate_labels
- True
- >>> df.flags.allows_duplicate_labels = False
-
- Or by slicing with a key
-
- >>> df.flags["allows_duplicate_labels"]
- False
- >>> df.flags["allows_duplicate_labels"] = True
- """
- return self._flags
-
- @final
- def set_flags(
- self: NDFrameT,
- *,
- copy: bool_t = False,
- allows_duplicate_labels: bool_t | None = None,
- ) -> NDFrameT:
- """
- Return a new object with updated flags.
-
- Parameters
- ----------
- copy : bool, default False
- Specify if a copy of the object should be made.
- allows_duplicate_labels : bool, optional
- Whether the returned object allows duplicate labels.
-
- Returns
- -------
- Series or DataFrame
- The same type as the caller.
-
- See Also
- --------
- DataFrame.attrs : Global metadata applying to this dataset.
- DataFrame.flags : Global flags applying to this object.
-
- Notes
- -----
- This method returns a new object that's a view on the same data
- as the input. Mutating the input or the output values will be reflected
- in the other.
-
- This method is intended to be used in method chains.
-
- "Flags" differ from "metadata". Flags reflect properties of the
- pandas object (the Series or DataFrame). Metadata refer to properties
- of the dataset, and should be stored in :attr:`DataFrame.attrs`.
-
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2]})
- >>> df.flags.allows_duplicate_labels
- True
- >>> df2 = df.set_flags(allows_duplicate_labels=False)
- >>> df2.flags.allows_duplicate_labels
- False
- """
- df = self.copy(deep=copy and not using_copy_on_write())
- if allows_duplicate_labels is not None:
- df.flags["allows_duplicate_labels"] = allows_duplicate_labels
- return df
-
- @final
- @classmethod
- def _validate_dtype(cls, dtype) -> DtypeObj | None:
- """validate the passed dtype"""
- if dtype is not None:
- dtype = pandas_dtype(dtype)
-
- # a compound dtype
- if dtype.kind == "V":
- raise NotImplementedError(
- "compound dtypes are not implemented "
- f"in the {cls.__name__} constructor"
- )
-
- return dtype
-
- # ----------------------------------------------------------------------
- # Construction
-
- @property
- def _constructor(self: NDFrameT) -> Callable[..., NDFrameT]:
- """
- Used when a manipulation result has the same dimensions as the
- original.
- """
- raise AbstractMethodError(self)
-
- # ----------------------------------------------------------------------
- # Internals
-
- @final
- @property
- def _data(self):
- # GH#33054 retained because some downstream packages use this,
- # e.g. fastparquet
- return self._mgr
-
- # ----------------------------------------------------------------------
- # Axis
- _stat_axis_number = 0
- _stat_axis_name = "index"
- _AXIS_ORDERS: list[Literal["index", "columns"]]
- _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
- _info_axis_number: int
- _info_axis_name: Literal["index", "columns"]
- _AXIS_LEN: int
-
- @final
- def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
- """Return an axes dictionary for myself."""
- d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
- # error: Argument 1 to "update" of "MutableMapping" has incompatible type
- # "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
- d.update(kwargs) # type: ignore[arg-type]
- return d
-
- @final
- @classmethod
- def _get_axis_number(cls, axis: Axis) -> AxisInt:
- try:
- return cls._AXIS_TO_AXIS_NUMBER[axis]
- except KeyError:
- raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
-
- @final
- @classmethod
- def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
- axis_number = cls._get_axis_number(axis)
- return cls._AXIS_ORDERS[axis_number]
-
- @final
- def _get_axis(self, axis: Axis) -> Index:
- axis_number = self._get_axis_number(axis)
- assert axis_number in {0, 1}
- return self.index if axis_number == 0 else self.columns
-
- @final
- @classmethod
- def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
- """Map the axis to the block_manager axis."""
- axis = cls._get_axis_number(axis)
- ndim = cls._AXIS_LEN
- if ndim == 2:
- # i.e. DataFrame
- return 1 - axis
- return axis
-
- @final
- def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
- # index or columns
- axis_index = getattr(self, axis)
- d = {}
- prefix = axis[0]
-
- for i, name in enumerate(axis_index.names):
- if name is not None:
- key = level = name
- else:
- # prefix with 'i' or 'c' depending on the input axis
- # e.g., you must do ilevel_0 for the 0th level of an unnamed
- # multiindex
- key = f"{prefix}level_{i}"
- level = i
-
- level_values = axis_index.get_level_values(level)
- s = level_values.to_series()
- s.index = axis_index
- d[key] = s
-
- # put the index/columns itself in the dict
- if isinstance(axis_index, MultiIndex):
- dindex = axis_index
- else:
- dindex = axis_index.to_series()
-
- d[axis] = dindex
- return d
-
- @final
- def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
- from pandas.core.computation.parsing import clean_column_name
-
- d: dict[str, Series | MultiIndex] = {}
- for axis_name in self._AXIS_ORDERS:
- d.update(self._get_axis_resolvers(axis_name))
-
- return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
-
- @final
- def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
- """
- Return the special-character-free column resolvers of a DataFrame.
-
- Column names with special characters are 'cleaned up' so that they can
- be referred to by backtick quoting.
- Used in :meth:`DataFrame.eval`.
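-
- For example (sketch), a column named ``"A A"`` becomes addressable in
- :meth:`DataFrame.query` / :meth:`DataFrame.eval` via backticks:
-
- >>> df = pd.DataFrame({"A A": [1, 2]})
- >>> list(df.query("`A A` > 1")["A A"])
- [2]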
- """
- from pandas.core.computation.parsing import clean_column_name
-
- if isinstance(self, ABCSeries):
- return {clean_column_name(self.name): self}
-
- return {
- clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
- }
-
- @property
- def _info_axis(self) -> Index:
- return getattr(self, self._info_axis_name)
-
- @property
- def _stat_axis(self) -> Index:
- return getattr(self, self._stat_axis_name)
-
- @property
- def shape(self) -> tuple[int, ...]:
- """
- Return a tuple of axis dimensions.
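-
- Examples
- --------
- Illustrative only:
-
- >>> pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}).shape
- (2, 2)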
- """
- return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
-
- @property
- def axes(self) -> list[Index]:
- """
- Return index label(s) of the internal NDFrame.
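-
- Examples
- --------
- Illustrative only:
-
- >>> pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}).axes
- [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='object')]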
- """
- # we do it this way because if we have reversed axes, then
- # the block manager shows them reversed
- return [self._get_axis(a) for a in self._AXIS_ORDERS]
-
- @property
- def ndim(self) -> int:
- """
- Return an int representing the number of axes / array dimensions.
-
- Return 1 if Series. Otherwise return 2 if DataFrame.
-
- See Also
- --------
- ndarray.ndim : Number of array dimensions.
-
- Examples
- --------
- >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
- >>> s.ndim
- 1
-
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.ndim
- 2
- """
- return self._mgr.ndim
-
- @property
- def size(self) -> int:
- """
- Return an int representing the number of elements in this object.
-
- Return the number of rows if Series. Otherwise return the number of
- rows times number of columns if DataFrame.
-
- See Also
- --------
- ndarray.size : Number of elements in the array.
-
- Examples
- --------
- >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
- >>> s.size
- 3
-
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.size
- 4
- """
- # error: Incompatible return value type (got "signedinteger[_64Bit]",
- # expected "int") [return-value]
- return np.prod(self.shape) # type: ignore[return-value]
-
- def set_axis(
- self: NDFrameT,
- labels,
- *,
- axis: Axis = 0,
- copy: bool_t | None = None,
- ) -> NDFrameT:
- """
- Assign desired index to given axis.
-
- Indexes for%(extended_summary_sub)s row labels can be changed by assigning
- a list-like or Index.
-
- Parameters
- ----------
- labels : list-like, Index
- The values for the new index.
-
- axis : %(axes_single_arg)s, default 0
- The axis to update. The value 0 identifies the rows. For `Series`
- this parameter is unused and defaults to 0.
-
- copy : bool, default True
- Whether to make a copy of the underlying data.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- %(klass)s
- An object of type %(klass)s.
-
- See Also
- --------
- %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
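-
- Examples
- --------
- A DataFrame-based sketch; the subclass docstrings carry the canonical
- examples.
-
- >>> df = pd.DataFrame({"A": [1, 2, 3]})
- >>> df.set_axis(['a', 'b', 'c'], axis='index').index
- Index(['a', 'b', 'c'], dtype='object')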
- """
- return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
-
- @final
- def _set_axis_nocheck(
- self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
- ):
- if inplace:
- setattr(self, self._get_axis_name(axis), labels)
- else:
- # With copy=False, we create a new object but don't copy the
- # underlying data.
- obj = self.copy(deep=copy and not using_copy_on_write())
- setattr(obj, obj._get_axis_name(axis), labels)
- return obj
-
- @final
- def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
- """
- This is called from the cython code when we set the `index` attribute
- directly, e.g. `series.index = [1, 2, 3]`.
- """
- labels = ensure_index(labels)
- self._mgr.set_axis(axis, labels)
- self._clear_item_cache()
-
- @final
- def swapaxes(
- self: NDFrameT, axis1: Axis, axis2: Axis, copy: bool_t | None = None
- ) -> NDFrameT:
- """
- Interchange axes and swap values accordingly.
-
- Returns
- -------
- same as input
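-
- Examples
- --------
- A small sketch; for a two-dimensional DataFrame this is effectively the
- transpose.
-
- >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
- >>> df.swapaxes('index', 'columns').equals(df.T)
- True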
- """
- i = self._get_axis_number(axis1)
- j = self._get_axis_number(axis2)
-
- if i == j:
- return self.copy(deep=copy and not using_copy_on_write())
-
- mapping = {i: j, j: i}
-
- new_axes = [self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)]
- new_values = self._values.swapaxes(i, j) # type: ignore[union-attr]
- if (
- using_copy_on_write()
- and self._mgr.is_single_block
- and isinstance(self._mgr, BlockManager)
- ):
- # This should only get hit in case of having a single block, otherwise a
- # copy is made, we don't have to set up references.
- new_mgr = ndarray_to_mgr(
- new_values,
- new_axes[0],
- new_axes[1],
- dtype=None,
- copy=False,
- typ="block",
- )
- assert isinstance(new_mgr, BlockManager)
- assert isinstance(self._mgr, BlockManager)
- new_mgr.blocks[0].refs = self._mgr.blocks[0].refs
- new_mgr.blocks[0].refs.add_reference(
- new_mgr.blocks[0] # type: ignore[arg-type]
- )
- return self._constructor(new_mgr).__finalize__(self, method="swapaxes")
-
- elif (copy or copy is None) and self._mgr.is_single_block:
- new_values = new_values.copy()
-
- return self._constructor(
- new_values,
- *new_axes,
- # The no-copy case for CoW is handled above
- copy=False,
- ).__finalize__(self, method="swapaxes")
-
- @final
- @doc(klass=_shared_doc_kwargs["klass"])
- def droplevel(self: NDFrameT, level: IndexLabel, axis: Axis = 0) -> NDFrameT:
- """
- Return {klass} with requested index / column level(s) removed.
-
- Parameters
- ----------
- level : int, str, or list-like
- If a string is given, must be the name of a level.
- If list-like, elements must be names or positional indexes
- of levels.
-
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- Axis along which the level(s) is removed:
-
- * 0 or 'index': remove level(s) from the index.
- * 1 or 'columns': remove level(s) from the columns.
-
- For `Series` this parameter is unused and defaults to 0.
-
- Returns
- -------
- {klass}
- {klass} with requested index / column level(s) removed.
-
- Examples
- --------
- >>> df = pd.DataFrame([
- ... [1, 2, 3, 4],
- ... [5, 6, 7, 8],
- ... [9, 10, 11, 12]
- ... ]).set_index([0, 1]).rename_axis(['a', 'b'])
-
- >>> df.columns = pd.MultiIndex.from_tuples([
- ... ('c', 'e'), ('d', 'f')
- ... ], names=['level_1', 'level_2'])
-
- >>> df
- level_1 c d
- level_2 e f
- a b
- 1 2 3 4
- 5 6 7 8
- 9 10 11 12
-
- >>> df.droplevel('a')
- level_1 c d
- level_2 e f
- b
- 2 3 4
- 6 7 8
- 10 11 12
-
- >>> df.droplevel('level_2', axis=1)
- level_1 c d
- a b
- 1 2 3 4
- 5 6 7 8
- 9 10 11 12
- """
- labels = self._get_axis(axis)
- new_labels = labels.droplevel(level)
- return self.set_axis(new_labels, axis=axis, copy=None)
-
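- # Shared helper behind ``Series.pop`` and ``DataFrame.pop``: remove ``item``
- # from the info axis and return it.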
- def pop(self, item: Hashable) -> Series | Any:
- result = self[item]
- del self[item]
-
- return result
-
- @final
- def squeeze(self, axis: Axis | None = None):
- """
- Squeeze 1 dimensional axis objects into scalars.
-
- Series or DataFrames with a single element are squeezed to a scalar.
- DataFrames with a single column or a single row are squeezed to a
- Series. Otherwise the object is unchanged.
-
- This method is most useful when you don't know if your
- object is a Series or DataFrame, but you do know it has just a single
- column. In that case you can safely call `squeeze` to ensure you have a
- Series.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns', None}, default None
- A specific axis to squeeze. By default, all length-1 axes are
- squeezed. For `Series` this parameter is unused and defaults to `None`.
-
- Returns
- -------
- DataFrame, Series, or scalar
- The projection after squeezing `axis` or all the axes.
-
- See Also
- --------
- Series.iloc : Integer-location based indexing for selecting scalars.
- DataFrame.iloc : Integer-location based indexing for selecting Series.
- Series.to_frame : Inverse of DataFrame.squeeze for a
- single-column DataFrame.
-
- Examples
- --------
- >>> primes = pd.Series([2, 3, 5, 7])
-
- Slicing might produce a Series with a single value:
-
- >>> even_primes = primes[primes % 2 == 0]
- >>> even_primes
- 0 2
- dtype: int64
-
- >>> even_primes.squeeze()
- 2
-
- Squeezing objects with more than one value in every axis does nothing:
-
- >>> odd_primes = primes[primes % 2 == 1]
- >>> odd_primes
- 1 3
- 2 5
- 3 7
- dtype: int64
-
- >>> odd_primes.squeeze()
- 1 3
- 2 5
- 3 7
- dtype: int64
-
- Squeezing is even more effective when used with DataFrames.
-
- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
- >>> df
- a b
- 0 1 2
- 1 3 4
-
- Slicing a single column will produce a DataFrame with the columns
- having only one value:
-
- >>> df_a = df[['a']]
- >>> df_a
- a
- 0 1
- 1 3
-
- So the columns can be squeezed down, resulting in a Series:
-
- >>> df_a.squeeze('columns')
- 0 1
- 1 3
- Name: a, dtype: int64
-
- Slicing a single row from a single column will produce a single
- scalar DataFrame:
-
- >>> df_0a = df.loc[df.index < 1, ['a']]
- >>> df_0a
- a
- 0 1
-
- Squeezing the rows produces a single scalar Series:
-
- >>> df_0a.squeeze('rows')
- a 1
- Name: 0, dtype: int64
-
- Squeezing all axes will project directly into a scalar:
-
- >>> df_0a.squeeze()
- 1
- """
- axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
- return self.iloc[
- tuple(
- 0 if i in axes and len(a) == 1 else slice(None)
- for i, a in enumerate(self.axes)
- )
- ]
-
- # ----------------------------------------------------------------------
- # Rename
-
- def _rename(
- self: NDFrameT,
- mapper: Renamer | None = None,
- *,
- index: Renamer | None = None,
- columns: Renamer | None = None,
- axis: Axis | None = None,
- copy: bool_t | None = None,
- inplace: bool_t = False,
- level: Level | None = None,
- errors: str = "ignore",
- ) -> NDFrameT | None:
- # called by Series.rename and DataFrame.rename
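- # e.g. (illustrative) ``df.rename(columns={"a": "b"})`` arrives here with
- # ``columns={"a": "b"}``, while ``df.rename(str.upper, axis=1)`` arrives with
- # ``mapper=str.upper`` and ``axis=1`` and is normalized to ``columns`` below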
-
- if mapper is None and index is None and columns is None:
- raise TypeError("must pass an index to rename")
-
- if index is not None or columns is not None:
- if axis is not None:
- raise TypeError(
- "Cannot specify both 'axis' and any of 'index' or 'columns'"
- )
- if mapper is not None:
- raise TypeError(
- "Cannot specify both 'mapper' and any of 'index' or 'columns'"
- )
- else:
- # use the mapper argument
- if axis and self._get_axis_number(axis) == 1:
- columns = mapper
- else:
- index = mapper
-
- self._check_inplace_and_allows_duplicate_labels(inplace)
- result = self if inplace else self.copy(deep=copy and not using_copy_on_write())
-
- for axis_no, replacements in enumerate((index, columns)):
- if replacements is None:
- continue
-
- ax = self._get_axis(axis_no)
- f = common.get_rename_function(replacements)
-
- if level is not None:
- level = ax._get_level_number(level)
-
- # GH 13473
- if not callable(replacements):
- if ax._is_multi and level is not None:
- indexer = ax.get_level_values(level).get_indexer_for(replacements)
- else:
- indexer = ax.get_indexer_for(replacements)
-
- if errors == "raise" and len(indexer[indexer == -1]):
- missing_labels = [
- label
- for index, label in enumerate(replacements)
- if indexer[index] == -1
- ]
- raise KeyError(f"{missing_labels} not found in axis")
-
- new_index = ax._transform_index(f, level=level)
- result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False)
- result._clear_item_cache()
-
- if inplace:
- self._update_inplace(result)
- return None
- else:
- return result.__finalize__(self, method="rename")
-
- @overload
- def rename_axis(
- self: NDFrameT,
- mapper: IndexLabel | lib.NoDefault = ...,
- *,
- index=...,
- columns=...,
- axis: Axis = ...,
- copy: bool_t | None = ...,
- inplace: Literal[False] = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def rename_axis(
- self,
- mapper: IndexLabel | lib.NoDefault = ...,
- *,
- index=...,
- columns=...,
- axis: Axis = ...,
- copy: bool_t | None = ...,
- inplace: Literal[True],
- ) -> None:
- ...
-
- @overload
- def rename_axis(
- self: NDFrameT,
- mapper: IndexLabel | lib.NoDefault = ...,
- *,
- index=...,
- columns=...,
- axis: Axis = ...,
- copy: bool_t | None = ...,
- inplace: bool_t = ...,
- ) -> NDFrameT | None:
- ...
-
- def rename_axis(
- self: NDFrameT,
- mapper: IndexLabel | lib.NoDefault = lib.no_default,
- *,
- index=lib.no_default,
- columns=lib.no_default,
- axis: Axis = 0,
- copy: bool_t | None = None,
- inplace: bool_t = False,
- ) -> NDFrameT | None:
- """
- Set the name of the axis for the index or columns.
-
- Parameters
- ----------
- mapper : scalar, list-like, optional
- Value to set the axis name attribute.
- index, columns : scalar, list-like, dict-like or function, optional
- A scalar, list-like, dict-like or function transformations to
- apply to that axis' values.
- Note that the ``columns`` parameter is not allowed if the
- object is a Series. This parameter only applies to DataFrame
- objects.
-
- Use either ``mapper`` and ``axis`` to
- specify the axis to target with ``mapper``, or ``index``
- and/or ``columns``.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to rename. For `Series` this parameter is unused and defaults to 0.
- copy : bool, default None
- Also copy underlying data.
- inplace : bool, default False
- Modifies the object directly, instead of creating a new Series
- or DataFrame.
-
- Returns
- -------
- Series, DataFrame, or None
- The same type as the caller or None if ``inplace=True``.
-
- See Also
- --------
- Series.rename : Alter Series index labels or name.
- DataFrame.rename : Alter DataFrame index labels or name.
- Index.rename : Set new names on index.
-
- Notes
- -----
- ``DataFrame.rename_axis`` supports two calling conventions
-
- * ``(index=index_mapper, columns=columns_mapper, ...)``
- * ``(mapper, axis={'index', 'columns'}, ...)``
-
- The first calling convention will only modify the names of
- the index and/or the names of the Index object that is the columns.
- In this case, the parameter ``copy`` is ignored.
-
- The second calling convention will modify the names of the
- corresponding index if mapper is a list or a scalar.
- However, if mapper is dict-like or a function, a ``ValueError`` is
- raised; use :meth:`~DataFrame.rename` to alter the axis *labels*.
-
- We *highly* recommend using keyword arguments to clarify your
- intent.
-
- Examples
- --------
- **Series**
-
- >>> s = pd.Series(["dog", "cat", "monkey"])
- >>> s
- 0 dog
- 1 cat
- 2 monkey
- dtype: object
- >>> s.rename_axis("animal")
- animal
- 0 dog
- 1 cat
- 2 monkey
- dtype: object
-
- **DataFrame**
-
- >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
- ... "num_arms": [0, 0, 2]},
- ... ["dog", "cat", "monkey"])
- >>> df
- num_legs num_arms
- dog 4 0
- cat 4 0
- monkey 2 2
- >>> df = df.rename_axis("animal")
- >>> df
- num_legs num_arms
- animal
- dog 4 0
- cat 4 0
- monkey 2 2
- >>> df = df.rename_axis("limbs", axis="columns")
- >>> df
- limbs num_legs num_arms
- animal
- dog 4 0
- cat 4 0
- monkey 2 2
-
- **MultiIndex**
-
- >>> df.index = pd.MultiIndex.from_product([['mammal'],
- ... ['dog', 'cat', 'monkey']],
- ... names=['type', 'name'])
- >>> df
- limbs num_legs num_arms
- type name
- mammal dog 4 0
- cat 4 0
- monkey 2 2
-
- >>> df.rename_axis(index={'type': 'class'})
- limbs num_legs num_arms
- class name
- mammal dog 4 0
- cat 4 0
- monkey 2 2
-
- >>> df.rename_axis(columns=str.upper)
- LIMBS num_legs num_arms
- type name
- mammal dog 4 0
- cat 4 0
- monkey 2 2
- """
- axes = {"index": index, "columns": columns}
-
- if axis is not None:
- axis = self._get_axis_number(axis)
-
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- if copy and using_copy_on_write():
- copy = False
-
- if mapper is not lib.no_default:
- # Use v0.23 behavior if a scalar or list
- non_mapper = is_scalar(mapper) or (
- is_list_like(mapper) and not is_dict_like(mapper)
- )
- if non_mapper:
- return self._set_axis_name(
- mapper, axis=axis, inplace=inplace, copy=copy
- )
- else:
- raise ValueError("Use `.rename` to alter labels with a mapper.")
- else:
- # Use new behavior. Means that index and/or columns
- # is specified
- result = self if inplace else self.copy(deep=copy)
-
- for axis in range(self._AXIS_LEN):
- v = axes.get(self._get_axis_name(axis))
- if v is lib.no_default:
- continue
- non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
- if non_mapper:
- newnames = v
- else:
- f = common.get_rename_function(v)
- curnames = self._get_axis(axis).names
- newnames = [f(name) for name in curnames]
- result._set_axis_name(newnames, axis=axis, inplace=True, copy=copy)
- if not inplace:
- return result
- return None
-
- @final
- def _set_axis_name(
- self, name, axis: Axis = 0, inplace: bool_t = False, copy: bool_t | None = True
- ):
- """
- Set the name(s) of the axis.
-
- Parameters
- ----------
- name : str or list of str
- Name(s) to set.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to set the label. The value 0 or 'index' specifies index,
- and the value 1 or 'columns' specifies columns.
- inplace : bool, default False
- If `True`, do operation inplace and return None.
- copy : bool, default True
- Whether to make a copy of the result.
-
- Returns
- -------
- Series, DataFrame, or None
- The same type as the caller or `None` if `inplace` is `True`.
-
- See Also
- --------
- DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
- Series.rename : Alter the index labels or set the index name
- of :class:`Series`.
- Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.
-
- Examples
- --------
- >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
- ... ["dog", "cat", "monkey"])
- >>> df
- num_legs
- dog 4
- cat 4
- monkey 2
- >>> df._set_axis_name("animal")
- num_legs
- animal
- dog 4
- cat 4
- monkey 2
- >>> df.index = pd.MultiIndex.from_product(
- ... [["mammal"], ['dog', 'cat', 'monkey']])
- >>> df._set_axis_name(["type", "name"])
- num_legs
- type name
- mammal dog 4
- cat 4
- monkey 2
- """
- axis = self._get_axis_number(axis)
- idx = self._get_axis(axis).set_names(name)
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- renamed = self if inplace else self.copy(deep=copy)
- if axis == 0:
- renamed.index = idx
- else:
- renamed.columns = idx
-
- if not inplace:
- return renamed
-
- # ----------------------------------------------------------------------
- # Comparison Methods
-
- @final
- def _indexed_same(self, other) -> bool_t:
- return all(
- self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
- )
-
- @final
- def equals(self, other: object) -> bool_t:
- """
- Test whether two objects contain the same elements.
-
- This function allows two Series or DataFrames to be compared against
- each other to see if they have the same shape and elements. NaNs in
- the same location are considered equal.
-
- The row/column index does not need to have the same type, as long
- as the values are considered equal. Corresponding columns must be of
- the same dtype.
-
- Parameters
- ----------
- other : Series or DataFrame
- The other Series or DataFrame to be compared with the first.
-
- Returns
- -------
- bool
- True if all elements are the same in both objects, False
- otherwise.
-
- See Also
- --------
- Series.eq : Compare two Series objects of the same length
- and return a Series where each element is True if the element
- in each Series is equal, False otherwise.
- DataFrame.eq : Compare two DataFrame objects of the same shape and
- return a DataFrame where each element is True if the respective
- element in each DataFrame is equal, False otherwise.
- testing.assert_series_equal : Raises an AssertionError if left and
- right are not equal. Provides an easy interface to ignore
- inequality in dtypes, indexes and precision among others.
- testing.assert_frame_equal : Like assert_series_equal, but targets
- DataFrames.
- numpy.array_equal : Return True if two arrays have the same shape
- and elements, False otherwise.
-
- Examples
- --------
- >>> df = pd.DataFrame({1: [10], 2: [20]})
- >>> df
- 1 2
- 0 10 20
-
- DataFrames df and exactly_equal have the same types and values for
- their elements and column labels, which will return True.
-
- >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
- >>> exactly_equal
- 1 2
- 0 10 20
- >>> df.equals(exactly_equal)
- True
-
- DataFrames df and different_column_type have the same element
- types and values, but have different types for the column labels,
- which will still return True.
-
- >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
- >>> different_column_type
- 1.0 2.0
- 0 10 20
- >>> df.equals(different_column_type)
- True
-
- DataFrames df and different_data_type have different types for the
- same values for their elements, and will return False even though
- their column labels are the same values and types.
-
- >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
- >>> different_data_type
- 1 2
- 0 10.0 20.0
- >>> df.equals(different_data_type)
- False
- """
- if not (isinstance(other, type(self)) or isinstance(self, type(other))):
- return False
- other = cast(NDFrame, other)
- return self._mgr.equals(other._mgr)
-
- # -------------------------------------------------------------------------
- # Unary Methods
-
- @final
- def __neg__(self: NDFrameT) -> NDFrameT:
- def blk_func(values: ArrayLike):
- if is_bool_dtype(values.dtype):
- # error: Argument 1 to "inv" has incompatible type "Union
- # [ExtensionArray, ndarray[Any, Any]]"; expected
- # "_SupportsInversion[ndarray[Any, dtype[bool_]]]"
- return operator.inv(values) # type: ignore[arg-type]
- else:
- # error: Argument 1 to "neg" has incompatible type "Union
- # [ExtensionArray, ndarray[Any, Any]]"; expected
- # "_SupportsNeg[ndarray[Any, dtype[Any]]]"
- return operator.neg(values) # type: ignore[arg-type]
-
- new_data = self._mgr.apply(blk_func)
- res = self._constructor(new_data)
- return res.__finalize__(self, method="__neg__")
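-
- # Note (sketch): for boolean blocks, unary ``-`` maps to elementwise inversion,
- # e.g. ``-pd.Series([True, False])`` evaluates to ``[False, True]``; numeric
- # blocks are negated as usual.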
-
- @final
- def __pos__(self: NDFrameT) -> NDFrameT:
- def blk_func(values: ArrayLike):
- if is_bool_dtype(values.dtype):
- return values.copy()
- else:
- # error: Argument 1 to "pos" has incompatible type "Union
- # [ExtensionArray, ndarray[Any, Any]]"; expected
- # "_SupportsPos[ndarray[Any, dtype[Any]]]"
- return operator.pos(values) # type: ignore[arg-type]
-
- new_data = self._mgr.apply(blk_func)
- res = self._constructor(new_data)
- return res.__finalize__(self, method="__pos__")
-
- @final
- def __invert__(self: NDFrameT) -> NDFrameT:
- if not self.size:
- # inv fails with 0 len
- return self.copy(deep=False)
-
- new_data = self._mgr.apply(operator.invert)
- return self._constructor(new_data).__finalize__(self, method="__invert__")
-
- @final
- def __nonzero__(self) -> NoReturn:
- raise ValueError(
- f"The truth value of a {type(self).__name__} is ambiguous. "
- "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
- )
-
- __bool__ = __nonzero__
-
- @final
- def bool(self) -> bool_t:
- """
- Return the bool of a single element Series or DataFrame.
-
- This must be a boolean scalar value, either True or False. It will raise a
- ValueError if the Series or DataFrame does not have exactly 1 element, or if
- that element is not boolean (integer values 0 and 1 will also raise an exception).
-
- Returns
- -------
- bool
- The value in the Series or DataFrame.
-
- See Also
- --------
- Series.astype : Change the data type of a Series, including to boolean.
- DataFrame.astype : Change the data type of a DataFrame, including to boolean.
- numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.
-
- Examples
- --------
- The method will only work for single element objects with a boolean value:
-
- >>> pd.Series([True]).bool()
- True
- >>> pd.Series([False]).bool()
- False
-
- >>> pd.DataFrame({'col': [True]}).bool()
- True
- >>> pd.DataFrame({'col': [False]}).bool()
- False
- """
- v = self.squeeze()
- if isinstance(v, (bool, np.bool_)):
- return bool(v)
- elif is_scalar(v):
- raise ValueError(
- "bool cannot act on a non-boolean single element "
- f"{type(self).__name__}"
- )
-
- self.__nonzero__()
- # for mypy (__nonzero__ raises)
- return True
-
- @final
- def abs(self: NDFrameT) -> NDFrameT:
- """
- Return a Series/DataFrame with absolute numeric value of each element.
-
- This function only applies to elements that are all numeric.
-
- Returns
- -------
- abs
- Series/DataFrame containing the absolute value of each element.
-
- See Also
- --------
- numpy.absolute : Calculate the absolute value element-wise.
-
- Notes
- -----
- For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
- :math:`\\sqrt{ a^2 + b^2 }`.
-
- Examples
- --------
- Absolute numeric values in a Series.
-
- >>> s = pd.Series([-1.10, 2, -3.33, 4])
- >>> s.abs()
- 0 1.10
- 1 2.00
- 2 3.33
- 3 4.00
- dtype: float64
-
- Absolute numeric values in a Series with complex numbers.
-
- >>> s = pd.Series([1.2 + 1j])
- >>> s.abs()
- 0 1.56205
- dtype: float64
-
- Absolute numeric values in a Series with a Timedelta element.
-
- >>> s = pd.Series([pd.Timedelta('1 days')])
- >>> s.abs()
- 0 1 days
- dtype: timedelta64[ns]
-
- Select rows with data closest to certain value using argsort (from
- `StackOverflow <https://stackoverflow.com/a/17758115>`__).
-
- >>> df = pd.DataFrame({
- ... 'a': [4, 5, 6, 7],
- ... 'b': [10, 20, 30, 40],
- ... 'c': [100, 50, -30, -50]
- ... })
- >>> df
- a b c
- 0 4 10 100
- 1 5 20 50
- 2 6 30 -30
- 3 7 40 -50
- >>> df.loc[(df.c - 43).abs().argsort()]
- a b c
- 1 5 20 50
- 0 4 10 100
- 2 6 30 -30
- 3 7 40 -50
- """
- res_mgr = self._mgr.apply(np.abs)
- return self._constructor(res_mgr).__finalize__(self, name="abs")
-
- @final
- def __abs__(self: NDFrameT) -> NDFrameT:
- return self.abs()
-
- @final
- def __round__(self: NDFrameT, decimals: int = 0) -> NDFrameT:
- return self.round(decimals).__finalize__(self, method="__round__")
-
- # -------------------------------------------------------------------------
- # Label or Level Combination Helpers
- #
- # A collection of helper methods for DataFrame/Series operations that
- # accept a combination of column/index labels and levels. All such
- # operations should utilize/extend these methods when possible so that we
- # have consistent precedence and validation logic throughout the library.
-
- @final
- def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t:
- """
- Test whether a key is a level reference for a given axis.
-
- To be considered a level reference, `key` must be a string that:
- - (axis=0): Matches the name of an index level and does NOT match
- a column label.
- - (axis=1): Matches the name of a column level and does NOT match
- an index label.
-
- Parameters
- ----------
- key : Hashable
- Potential level name for the given axis
- axis : int, default 0
- Axis that levels are associated with (0 for index, 1 for columns)
-
- Returns
- -------
- is_level : bool
- """
- axis_int = self._get_axis_number(axis)
-
- return (
- key is not None
- and is_hashable(key)
- and key in self.axes[axis_int].names
- and not self._is_label_reference(key, axis=axis_int)
- )
-
- @final
- def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t:
- """
- Test whether a key is a label reference for a given axis.
-
- To be considered a label reference, `key` must be a string that:
- - (axis=0): Matches a column label
- - (axis=1): Matches an index label
-
- Parameters
- ----------
- key : Hashable
- Potential label name, i.e. Index entry.
- axis : int, default 0
- Axis perpendicular to the axis that labels are associated with
- (0 means search for column labels, 1 means search for index labels)
-
- Returns
- -------
- is_label: bool
- """
- axis_int = self._get_axis_number(axis)
- other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)
-
- return (
- key is not None
- and is_hashable(key)
- and any(key in self.axes[ax] for ax in other_axes)
- )
-
- @final
- def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool_t:
- """
- Test whether a key is a label or level reference for a given axis.
-
- To be considered either a label or a level reference, `key` must be a
- string that:
- - (axis=0): Matches a column label or an index level
- - (axis=1): Matches an index label or a column level
-
- Parameters
- ----------
- key : Hashable
- Potential label or level name
- axis : int, default 0
- Axis that levels are associated with (0 for index, 1 for columns)
-
- Returns
- -------
- bool
- """
- return self._is_level_reference(key, axis=axis) or self._is_label_reference(
- key, axis=axis
- )
-
- @final
- def _check_label_or_level_ambiguity(self, key: Level, axis: Axis = 0) -> None:
- """
- Check whether `key` is ambiguous.
-
- By ambiguous, we mean that it matches both a level of the input
- `axis` and a label of the other axis.
-
- Parameters
- ----------
- key : Hashable
- Label or level name.
- axis : int, default 0
- Axis that levels are associated with (0 for index, 1 for columns).
-
- Raises
- ------
- ValueError: `key` is ambiguous
- """
-
- axis_int = self._get_axis_number(axis)
- other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)
-
- if (
- key is not None
- and is_hashable(key)
- and key in self.axes[axis_int].names
- and any(key in self.axes[ax] for ax in other_axes)
- ):
- # Build an informative and grammatical warning
- level_article, level_type = (
- ("an", "index") if axis_int == 0 else ("a", "column")
- )
-
- label_article, label_type = (
- ("a", "column") if axis_int == 0 else ("an", "index")
- )
-
- msg = (
- f"'{key}' is both {level_article} {level_type} level and "
- f"{label_article} {label_type} label, which is ambiguous."
- )
- raise ValueError(msg)
-
- @final
- def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike:
- """
- Return a 1-D array of values associated with `key`, a label or level
- from the given `axis`.
-
- Retrieval logic:
- - (axis=0): Return column values if `key` matches a column label.
- Otherwise return index level values if `key` matches an index
- level.
- - (axis=1): Return row values if `key` matches an index label.
- Otherwise return column level values if 'key' matches a column
- level
-
- Parameters
- ----------
- key : Hashable
- Label or level name.
- axis : int, default 0
- Axis that levels are associated with (0 for index, 1 for columns)
-
- Returns
- -------
- np.ndarray or ExtensionArray
-
- Raises
- ------
- KeyError
- if `key` matches neither a label nor a level
- ValueError
- if `key` matches multiple labels
- """
- axis = self._get_axis_number(axis)
- other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
-
- if self._is_label_reference(key, axis=axis):
- self._check_label_or_level_ambiguity(key, axis=axis)
- values = self.xs(key, axis=other_axes[0])._values
- elif self._is_level_reference(key, axis=axis):
- values = self.axes[axis].get_level_values(key)._values
- else:
- raise KeyError(key)
-
- # Check for duplicates
- if values.ndim > 1:
- if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
- multi_message = (
- "\n"
- "For a multi-index, the label must be a "
- "tuple with elements corresponding to each level."
- )
- else:
- multi_message = ""
-
- label_axis_name = "column" if axis == 0 else "index"
- raise ValueError(
- f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
- )
-
- return values
-
- @final
- def _drop_labels_or_levels(self, keys, axis: AxisInt = 0):
- """
- Drop labels and/or levels for the given `axis`.
-
- For each key in `keys`:
- - (axis=0): If key matches a column label then drop the column.
- Otherwise if key matches an index level then drop the level.
- - (axis=1): If key matches an index label then drop the row.
- Otherwise if key matches a column level then drop the level.
-
- Parameters
- ----------
- keys : str or list of str
- labels or levels to drop
- axis : int, default 0
- Axis that levels are associated with (0 for index, 1 for columns)
-
- Returns
- -------
- dropped: DataFrame
-
- Raises
- ------
- ValueError
- if any `keys` match neither a label nor a level
- """
- axis = self._get_axis_number(axis)
-
- # Validate keys
- keys = common.maybe_make_list(keys)
- invalid_keys = [
- k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
- ]
-
- if invalid_keys:
- raise ValueError(
- "The following keys are not valid labels or "
- f"levels for axis {axis}: {invalid_keys}"
- )
-
- # Compute levels and labels to drop
- levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]
-
- labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]
-
- # Perform copy upfront and then use inplace operations below.
- # This ensures that we always perform exactly one copy.
- # ``copy`` and/or ``inplace`` options could be added in the future.
- dropped = self.copy(deep=False)
-
- if axis == 0:
- # Handle dropping index levels
- if levels_to_drop:
- dropped.reset_index(levels_to_drop, drop=True, inplace=True)
-
- # Handle dropping columns labels
- if labels_to_drop:
- dropped.drop(labels_to_drop, axis=1, inplace=True)
- else:
- # Handle dropping column levels
- if levels_to_drop:
- if isinstance(dropped.columns, MultiIndex):
- # Drop the specified levels from the MultiIndex
- dropped.columns = dropped.columns.droplevel(levels_to_drop)
- else:
- # Drop the last level of Index by replacing with
- # a RangeIndex
- dropped.columns = RangeIndex(dropped.columns.size)
-
- # Handle dropping index labels
- if labels_to_drop:
- dropped.drop(labels_to_drop, axis=0, inplace=True)
-
- return dropped
-
- # ----------------------------------------------------------------------
- # Iteration
-
- # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
- # Incompatible types in assignment (expression has type "None", base class
- # "object" defined the type as "Callable[[object], int]")
- __hash__: ClassVar[None] # type: ignore[assignment]
-
- def __iter__(self) -> Iterator:
- """
- Iterate over info axis.
-
- Returns
- -------
- iterator
- Info axis as iterator.
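-
- Examples
- --------
- Iterating a DataFrame yields its column labels (illustrative):
-
- >>> list(pd.DataFrame({'a': [1], 'b': [2]}))
- ['a', 'b']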
- """
- return iter(self._info_axis)
-
- # can we get a better explanation of this?
- def keys(self) -> Index:
- """
- Get the 'info axis' (see Indexing for more).
-
- This is index for Series, columns for DataFrame.
-
- Returns
- -------
- Index
- Info axis.
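-
- Examples
- --------
- Illustrative only:
-
- >>> pd.DataFrame({'a': [1], 'b': [2]}).keys()
- Index(['a', 'b'], dtype='object')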
- """
- return self._info_axis
-
- def items(self):
- """
- Iterate over (label, values) on the info axis.
-
- This is index for Series and columns for DataFrame.
-
- Returns
- -------
- Generator
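-
- Examples
- --------
- A DataFrame-based sketch:
-
- >>> df = pd.DataFrame({'a': [1, 2]})
- >>> [(label, list(values)) for label, values in df.items()]
- [('a', [1, 2])]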
- """
- for h in self._info_axis:
- yield h, self[h]
-
- def __len__(self) -> int:
- """Returns length of info axis"""
- return len(self._info_axis)
-
- @final
- def __contains__(self, key) -> bool_t:
- """True if the key is in the info axis"""
- return key in self._info_axis
-
- @property
- def empty(self) -> bool_t:
- """
- Indicator whether Series/DataFrame is empty.
-
- True if Series/DataFrame is entirely empty (no items), meaning any of the
- axes are of length 0.
-
- Returns
- -------
- bool
- If Series/DataFrame is empty, return True, if not return False.
-
- See Also
- --------
- Series.dropna : Return series without null values.
- DataFrame.dropna : Return DataFrame with labels on given axis omitted
- where (all or any) data are missing.
-
- Notes
- -----
- If Series/DataFrame contains only NaNs, it is still not considered empty. See
- the example below.
-
- Examples
- --------
- An example of an actual empty DataFrame. Notice the index is empty:
-
- >>> df_empty = pd.DataFrame({'A' : []})
- >>> df_empty
- Empty DataFrame
- Columns: [A]
- Index: []
- >>> df_empty.empty
- True
-
- If we only have NaNs in our DataFrame, it is not considered empty! We
- will need to drop the NaNs to make the DataFrame empty:
-
- >>> df = pd.DataFrame({'A' : [np.nan]})
- >>> df
- A
- 0 NaN
- >>> df.empty
- False
- >>> df.dropna().empty
- True
-
- >>> ser_empty = pd.Series({'A' : []})
- >>> ser_empty
- A []
- dtype: object
- >>> ser_empty.empty
- False
- >>> ser_empty = pd.Series()
- >>> ser_empty.empty
- True
- """
- return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)
-
- # ----------------------------------------------------------------------
- # Array Interface
-
- # This is also set in IndexOpsMixin
- # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
- __array_priority__: int = 1000
-
- def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
- values = self._values
- arr = np.asarray(values, dtype=dtype)
- if (
- astype_is_view(values.dtype, arr.dtype)
- and using_copy_on_write()
- and self._mgr.is_single_block
- ):
- # Check if both conversions can be done without a copy
- if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
- values.dtype, arr.dtype
- ):
- arr = arr.view()
- arr.flags.writeable = False
- return arr
-
- @final
- def __array_ufunc__(
- self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
- ):
- return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
-
- # ----------------------------------------------------------------------
- # Picklability
-
- @final
- def __getstate__(self) -> dict[str, Any]:
- meta = {k: getattr(self, k, None) for k in self._metadata}
- return {
- "_mgr": self._mgr,
- "_typ": self._typ,
- "_metadata": self._metadata,
- "attrs": self.attrs,
- "_flags": {k: self.flags[k] for k in self.flags._keys},
- **meta,
- }
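-
- # Pickle round-trip sketch (illustrative): ``pickle.loads(pickle.dumps(obj))``
- # feeds this dict to ``__setstate__`` below, restoring the manager, metadata,
- # ``attrs`` and ``_flags``.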
-
- @final
- def __setstate__(self, state) -> None:
- if isinstance(state, BlockManager):
- self._mgr = state
- elif isinstance(state, dict):
- if "_data" in state and "_mgr" not in state:
- # compat for older pickles
- state["_mgr"] = state.pop("_data")
- typ = state.get("_typ")
- if typ is not None:
- attrs = state.get("_attrs", {})
- object.__setattr__(self, "_attrs", attrs)
- flags = state.get("_flags", {"allows_duplicate_labels": True})
- object.__setattr__(self, "_flags", Flags(self, **flags))
-
- # set in the order of internal names
- # to avoid definitional recursion
- # e.g. say fill_value needing _mgr to be
- # defined
- meta = set(self._internal_names + self._metadata)
- for k in list(meta):
- if k in state and k != "_flags":
- v = state[k]
- object.__setattr__(self, k, v)
-
- for k, v in state.items():
- if k not in meta:
- object.__setattr__(self, k, v)
-
- else:
- raise NotImplementedError("Pre-0.12 pickles are no longer supported")
- elif len(state) == 2:
- raise NotImplementedError("Pre-0.12 pickles are no longer supported")
-
- self._item_cache: dict[Hashable, Series] = {}
-
- # ----------------------------------------------------------------------
- # Rendering Methods
-
- def __repr__(self) -> str:
- # string representation based upon iterating over self
- # (since, by definition, `PandasContainers` are iterable)
- prepr = f"[{','.join(map(pprint_thing, self))}]"
- return f"{type(self).__name__}({prepr})"
-
- @final
- def _repr_latex_(self):
- """
- Returns a LaTeX representation for a particular object.
- Mainly for use with nbconvert (jupyter notebook conversion to pdf).
- """
- if config.get_option("styler.render.repr") == "latex":
- return self.to_latex()
- else:
- return None
-
- @final
- def _repr_data_resource_(self):
- """
- Not a real Jupyter special repr method, but we use the same
- naming convention.
- """
- if config.get_option("display.html.table_schema"):
- data = self.head(config.get_option("display.max_rows"))
-
- as_json = data.to_json(orient="table")
- as_json = cast(str, as_json)
- return loads(as_json, object_pairs_hook=collections.OrderedDict)
-
- # ----------------------------------------------------------------------
- # I/O Methods
-
- @final
- @doc(
- klass="object",
- storage_options=_shared_docs["storage_options"],
- storage_options_versionadded="1.2.0",
- )
- def to_excel(
- self,
- excel_writer,
- sheet_name: str = "Sheet1",
- na_rep: str = "",
- float_format: str | None = None,
- columns: Sequence[Hashable] | None = None,
- header: Sequence[Hashable] | bool_t = True,
- index: bool_t = True,
- index_label: IndexLabel = None,
- startrow: int = 0,
- startcol: int = 0,
- engine: str | None = None,
- merge_cells: bool_t = True,
- inf_rep: str = "inf",
- freeze_panes: tuple[int, int] | None = None,
- storage_options: StorageOptions = None,
- ) -> None:
- """
- Write {klass} to an Excel sheet.
-
- To write a single {klass} to an Excel .xlsx file it is only necessary to
- specify a target file name. To write to multiple sheets it is necessary to
- create an `ExcelWriter` object with a target file name, and specify a sheet
- in the file to write to.
-
- Multiple sheets may be written to by specifying unique `sheet_name`.
- With all data written to the file it is necessary to save the changes.
- Note that creating an `ExcelWriter` object with a file name that already
- exists will result in the contents of the existing file being erased.
-
- Parameters
- ----------
- excel_writer : path-like, file-like, or ExcelWriter object
- File path or existing ExcelWriter.
- sheet_name : str, default 'Sheet1'
- Name of sheet which will contain DataFrame.
- na_rep : str, default ''
- Missing data representation.
- float_format : str, optional
- Format string for floating point numbers. For example
- ``float_format="%.2f"`` will format 0.1234 to 0.12.
- columns : sequence or list of str, optional
- Columns to write.
- header : bool or list of str, default True
- Write out the column names. If a list of strings is given it is
- assumed to be aliases for the column names.
- index : bool, default True
- Write row names (index).
- index_label : str or sequence, optional
- Column label for index column(s) if desired. If not specified, and
- `header` and `index` are True, then the index names are used. A
- sequence should be given if the DataFrame uses MultiIndex.
- startrow : int, default 0
- Upper left cell row to dump data frame.
- startcol : int, default 0
- Upper left cell column to dump data frame.
- engine : str, optional
- Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
- via the options ``io.excel.xlsx.writer`` or
- ``io.excel.xlsm.writer``.
-
- merge_cells : bool, default True
- Write MultiIndex and Hierarchical Rows as merged cells.
- inf_rep : str, default 'inf'
- Representation for infinity (there is no native representation for
- infinity in Excel).
- freeze_panes : tuple of int (length 2), optional
- Specifies the one-based bottommost row and rightmost column that
- is to be frozen.
- {storage_options}
-
- .. versionadded:: {storage_options_versionadded}
-
- See Also
- --------
- to_csv : Write DataFrame to a comma-separated values (csv) file.
- ExcelWriter : Class for writing DataFrame objects into excel sheets.
- read_excel : Read an Excel file into a pandas DataFrame.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- io.formats.style.Styler.to_excel : Add styles to Excel sheet.
-
- Notes
- -----
- For compatibility with :meth:`~DataFrame.to_csv`,
- to_excel serializes lists and dicts to strings before writing.
-
- Once a workbook has been saved it is not possible to write further
- data without rewriting the whole workbook.
-
- Examples
- --------
-
- Create, write to and save a workbook:
-
- >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
- ... index=['row 1', 'row 2'],
- ... columns=['col 1', 'col 2'])
- >>> df1.to_excel("output.xlsx") # doctest: +SKIP
-
- To specify the sheet name:
-
- >>> df1.to_excel("output.xlsx",
- ... sheet_name='Sheet_name_1') # doctest: +SKIP
-
- If you wish to write to more than one sheet in the workbook, it is
- necessary to specify an ExcelWriter object:
-
- >>> df2 = df1.copy()
- >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
- ... df1.to_excel(writer, sheet_name='Sheet_name_1')
- ... df2.to_excel(writer, sheet_name='Sheet_name_2')
-
- ExcelWriter can also be used to append to an existing Excel file:
-
- >>> with pd.ExcelWriter('output.xlsx',
- ... mode='a') as writer: # doctest: +SKIP
- ... df.to_excel(writer, sheet_name='Sheet_name_3')
-
- To set the library that is used to write the Excel file,
- you can pass the `engine` keyword (the default engine is
- automatically chosen depending on the file extension):
-
- >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
- """
-
- df = self if isinstance(self, ABCDataFrame) else self.to_frame()
-
- from pandas.io.formats.excel import ExcelFormatter
-
- formatter = ExcelFormatter(
- df,
- na_rep=na_rep,
- cols=columns,
- header=header,
- float_format=float_format,
- index=index,
- index_label=index_label,
- merge_cells=merge_cells,
- inf_rep=inf_rep,
- )
- formatter.write(
- excel_writer,
- sheet_name=sheet_name,
- startrow=startrow,
- startcol=startcol,
- freeze_panes=freeze_panes,
- engine=engine,
- storage_options=storage_options,
- )
-
- @final
- @doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path_or_buf",
- )
- def to_json(
- self,
- path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
- orient: str | None = None,
- date_format: str | None = None,
- double_precision: int = 10,
- force_ascii: bool_t = True,
- date_unit: str = "ms",
- default_handler: Callable[[Any], JSONSerializable] | None = None,
- lines: bool_t = False,
- compression: CompressionOptions = "infer",
- index: bool_t = True,
- indent: int | None = None,
- storage_options: StorageOptions = None,
- mode: Literal["a", "w"] = "w",
- ) -> str | None:
- """
- Convert the object to a JSON string.
-
- Note NaN's and None will be converted to null and datetime objects
- will be converted to UNIX timestamps.
-
- Parameters
- ----------
- path_or_buf : str, path object, file-like object, or None, default None
- String, path object (implementing os.PathLike[str]), or file-like
- object implementing a write() function. If None, the result is
- returned as a string.
- orient : str
- Indication of expected JSON string format.
-
- * Series:
-
- - default is 'index'
- - allowed values are: {{'split', 'records', 'index', 'table'}}.
-
- * DataFrame:
-
- - default is 'columns'
- - allowed values are: {{'split', 'records', 'index', 'columns',
- 'values', 'table'}}.
-
- * The format of the JSON string:
-
- - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
- 'data' -> [values]}}
- - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
- - 'index' : dict like {{index -> {{column -> value}}}}
- - 'columns' : dict like {{column -> {{index -> value}}}}
- - 'values' : just the values array
- - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
-
- Describing the data, where data component is like ``orient='records'``.
-
- date_format : {{None, 'epoch', 'iso'}}
- Type of date conversion. 'epoch' = epoch milliseconds,
- 'iso' = ISO8601. The default depends on the `orient`. For
- ``orient='table'``, the default is 'iso'. For all other orients,
- the default is 'epoch'.
- double_precision : int, default 10
- The number of decimal places to use when encoding
- floating point values.
- force_ascii : bool, default True
- Force encoded string to be ASCII.
- date_unit : str, default 'ms' (milliseconds)
- The time unit to encode to, governs timestamp and ISO8601
- precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
- microsecond, and nanosecond respectively.
- default_handler : callable, default None
- Handler to call if object cannot otherwise be converted to a
- suitable format for JSON. Should receive a single argument which is
- the object to convert and return a serialisable object.
- lines : bool, default False
- If 'orient' is 'records', write out line-delimited json format. A
- ValueError is raised for any other 'orient', since the other formats
- are not list-like.
- {compression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- index : bool, default True
- Whether to include the index values in the JSON string. Not
- including the index (``index=False``) is only supported when
- orient is 'split' or 'table'.
- indent : int, optional
- Length of whitespace used to indent each record.
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- mode : str, default 'w' (writing)
- Specify the IO mode for output when supplying a path_or_buf.
- Accepted args are 'w' (writing) and 'a' (append) only.
- mode='a' is only supported when lines is True and orient is 'records'.
-
- Returns
- -------
- None or str
- If path_or_buf is None, returns the resulting json format as a
- string. Otherwise returns None.
-
- See Also
- --------
- read_json : Convert a JSON string to pandas object.
-
- Notes
- -----
- The behavior of ``indent=0`` varies from the stdlib, which does not
- indent the output but does insert newlines. Currently, ``indent=0``
- and the default ``indent=None`` are equivalent in pandas, though this
- may change in a future release.
-
- ``orient='table'`` contains a 'pandas_version' field under 'schema'.
- This stores the version of `pandas` used in the latest revision of the
- schema.
-
- Examples
- --------
- >>> from json import loads, dumps
- >>> df = pd.DataFrame(
- ... [["a", "b"], ["c", "d"]],
- ... index=["row 1", "row 2"],
- ... columns=["col 1", "col 2"],
- ... )
-
- >>> result = df.to_json(orient="split")
- >>> parsed = loads(result)
- >>> dumps(parsed, indent=4) # doctest: +SKIP
- {{
- "columns": [
- "col 1",
- "col 2"
- ],
- "index": [
- "row 1",
- "row 2"
- ],
- "data": [
- [
- "a",
- "b"
- ],
- [
- "c",
- "d"
- ]
- ]
- }}
-
- Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
- Note that index labels are not preserved with this encoding.
-
- >>> result = df.to_json(orient="records")
- >>> parsed = loads(result)
- >>> dumps(parsed, indent=4) # doctest: +SKIP
- [
- {{
- "col 1": "a",
- "col 2": "b"
- }},
- {{
- "col 1": "c",
- "col 2": "d"
- }}
- ]
-
- Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
-
- >>> result = df.to_json(orient="index")
- >>> parsed = loads(result)
- >>> dumps(parsed, indent=4) # doctest: +SKIP
- {{
- "row 1": {{
- "col 1": "a",
- "col 2": "b"
- }},
- "row 2": {{
- "col 1": "c",
- "col 2": "d"
- }}
- }}
-
- Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
-
- >>> result = df.to_json(orient="columns")
- >>> parsed = loads(result)
- >>> dumps(parsed, indent=4) # doctest: +SKIP
- {{
- "col 1": {{
- "row 1": "a",
- "row 2": "c"
- }},
- "col 2": {{
- "row 1": "b",
- "row 2": "d"
- }}
- }}
-
- Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
-
- >>> result = df.to_json(orient="values")
- >>> parsed = loads(result)
- >>> dumps(parsed, indent=4) # doctest: +SKIP
- [
- [
- "a",
- "b"
- ],
- [
- "c",
- "d"
- ]
- ]
-
- Encoding with Table Schema:
-
- >>> result = df.to_json(orient="table")
- >>> parsed = loads(result)
- >>> dumps(parsed, indent=4) # doctest: +SKIP
- {{
- "schema": {{
- "fields": [
- {{
- "name": "index",
- "type": "string"
- }},
- {{
- "name": "col 1",
- "type": "string"
- }},
- {{
- "name": "col 2",
- "type": "string"
- }}
- ],
- "primaryKey": [
- "index"
- ],
- "pandas_version": "1.4.0"
- }},
- "data": [
- {{
- "index": "row 1",
- "col 1": "a",
- "col 2": "b"
- }},
- {{
- "index": "row 2",
- "col 1": "c",
- "col 2": "d"
- }}
- ]
- }}
- """
- from pandas.io import json
-
- if date_format is None and orient == "table":
- date_format = "iso"
- elif date_format is None:
- date_format = "epoch"
-
- config.is_nonnegative_int(indent)
- indent = indent or 0
-
- return json.to_json(
- path_or_buf=path_or_buf,
- obj=self,
- orient=orient,
- date_format=date_format,
- double_precision=double_precision,
- force_ascii=force_ascii,
- date_unit=date_unit,
- default_handler=default_handler,
- lines=lines,
- compression=compression,
- index=index,
- indent=indent,
- storage_options=storage_options,
- mode=mode,
- )
-
- @final
- def to_hdf(
- self,
- path_or_buf: FilePath | HDFStore,
- key: str,
- mode: str = "a",
- complevel: int | None = None,
- complib: str | None = None,
- append: bool_t = False,
- format: str | None = None,
- index: bool_t = True,
- min_itemsize: int | dict[str, int] | None = None,
- nan_rep=None,
- dropna: bool_t | None = None,
- data_columns: Literal[True] | list[str] | None = None,
- errors: str = "strict",
- encoding: str = "UTF-8",
- ) -> None:
- """
- Write the contained data to an HDF5 file using HDFStore.
-
- Hierarchical Data Format (HDF) is self-describing, allowing an
- application to interpret the structure and contents of a file with
- no outside information. One HDF file can hold a mix of related objects
- which can be accessed as a group or as individual objects.
-
- To add another DataFrame or Series to an existing HDF file,
- use append mode and a different key.
-
- .. warning::
-
- One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
- but the type of the subclass is lost upon storing.
-
- For more information see the :ref:`user guide <io.hdf5>`.
-
- Parameters
- ----------
- path_or_buf : str or pandas.HDFStore
- File path or HDFStore object.
- key : str
- Identifier for the group in the store.
- mode : {'a', 'w', 'r+'}, default 'a'
- Mode to open file:
-
- - 'w': write, a new file is created (an existing file with
- the same name would be deleted).
- - 'a': append, an existing file is opened for reading and
- writing, and if the file does not exist it is created.
- - 'r+': similar to 'a', but the file must already exist.
- complevel : {0-9}, default None
- Specifies a compression level for data.
- A value of 0 or None disables compression.
- complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
- Specifies the compression library to be used.
- As of v0.20.2 these additional compressors for Blosc are supported
- (default if no compressor specified: 'blosc:blosclz'):
- {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
- 'blosc:zlib', 'blosc:zstd'}.
- Specifying a compression library which is not available raises
- a ValueError.
- append : bool, default False
- For Table formats, append the input data to the existing table.
- format : {'fixed', 'table', None}, default 'fixed'
- Possible values:
-
- - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
- nor searchable.
- - 'table': Table format. Write as a PyTables Table structure
- which may perform worse but allow more flexible operations
- like searching / selecting subsets of the data.
- - If None, pd.get_option('io.hdf.default_format') is checked,
- followed by fallback to "fixed".
- index : bool, default True
- Write DataFrame index as a column.
- min_itemsize : dict or int, optional
- Map column names to minimum string sizes for columns.
- nan_rep : Any, optional
- How to represent null values as str.
- Not allowed with append=True.
- dropna : bool, default False
- Remove missing values.
- data_columns : list of columns or True, optional
- List of columns to create as indexed data columns for on-disk
- queries, or True to use all columns. By default only the axes
- of the object are indexed. See
- :ref:`Query via data columns<io.hdf5-query-data-columns>`. for
- more information.
- Applicable only to format='table'.
- errors : str, default 'strict'
- Specifies how encoding and decoding errors are to be handled.
- See the errors argument for :func:`open` for a full list
- of options.
- encoding : str, default "UTF-8"
- Character encoding to use when writing the file.
-
- See Also
- --------
- read_hdf : Read from HDF file.
- DataFrame.to_orc : Write a DataFrame to the binary orc format.
- DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
- DataFrame.to_sql : Write to a SQL table.
- DataFrame.to_feather : Write out feather-format for DataFrames.
- DataFrame.to_csv : Write out to a csv file.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
- ... index=['a', 'b', 'c']) # doctest: +SKIP
- >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
-
- We can add another object to the same file:
-
- >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP
- >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
-
- Reading from HDF file:
-
- >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP
- A B
- a 1 4
- b 2 5
- c 3 6
- >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP
- 0 1
- 1 2
- 2 3
- 3 4
- dtype: int64
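-
- As a further illustrative sketch (assuming PyTables is available; the
- calls are not executed here), writing in ``'table'`` format with
- ``data_columns`` allows querying the file on disk when reading back:
-
- >>> df.to_hdf('data.h5', key='df_table', format='table',
- ...           data_columns=['A']) # doctest: +SKIP
- >>> pd.read_hdf('data.h5', 'df_table', where='A > 1') # doctest: +SKIP
- A B
- b 2 5
- c 3 6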
- """
- from pandas.io import pytables
-
- # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
- # "Union[DataFrame, Series]" [arg-type]
- pytables.to_hdf(
- path_or_buf,
- key,
- self, # type: ignore[arg-type]
- mode=mode,
- complevel=complevel,
- complib=complib,
- append=append,
- format=format,
- index=index,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- dropna=dropna,
- data_columns=data_columns,
- errors=errors,
- encoding=encoding,
- )
-
- @final
- def to_sql(
- self,
- name: str,
- con,
- schema: str | None = None,
- if_exists: Literal["fail", "replace", "append"] = "fail",
- index: bool_t = True,
- index_label: IndexLabel = None,
- chunksize: int | None = None,
- dtype: DtypeArg | None = None,
- method: str | None = None,
- ) -> int | None:
- """
- Write records stored in a DataFrame to a SQL database.
-
- Databases supported by SQLAlchemy [1]_ are supported. Tables can be
- newly created, appended to, or overwritten.
-
- Parameters
- ----------
- name : str
- Name of SQL table.
- con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
- Using SQLAlchemy makes it possible to use any DB supported by that
- library. Legacy support is provided for sqlite3.Connection objects. The user
- is responsible for engine disposal and connection closure for the SQLAlchemy
- connectable. See `here \
- <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
- If passing a sqlalchemy.engine.Connection which is already in a transaction,
- the transaction will not be committed. If passing a sqlite3.Connection,
- it will not be possible to roll back the record insertion.
-
- schema : str, optional
- Specify the schema (if database flavor supports this). If None, use
- default schema.
- if_exists : {'fail', 'replace', 'append'}, default 'fail'
- How to behave if the table already exists.
-
- * fail: Raise a ValueError.
- * replace: Drop the table before inserting new values.
- * append: Insert new values to the existing table.
-
- index : bool, default True
- Write DataFrame index as a column. Uses `index_label` as the column
- name in the table.
- index_label : str or sequence, default None
- Column label for index column(s). If None is given (default) and
- `index` is True, then the index names are used.
- A sequence should be given if the DataFrame uses MultiIndex.
- chunksize : int, optional
- Specify the number of rows in each batch to be written at a time.
- By default, all rows will be written at once.
- dtype : dict or scalar, optional
- Specifying the datatype for columns. If a dictionary is used, the
- keys should be the column names and the values should be the
- SQLAlchemy types or strings for the sqlite3 legacy mode. If a
- scalar is provided, it will be applied to all columns.
- method : {None, 'multi', callable}, optional
- Controls the SQL insertion clause used:
-
- * None : Uses standard SQL ``INSERT`` clause (one per row).
- * 'multi': Pass multiple values in a single ``INSERT`` clause.
- * callable with signature ``(pd_table, conn, keys, data_iter)``.
-
- Details and a sample callable implementation can be found in the
- section :ref:`insert method <io.sql.method>`.
-
- Returns
- -------
- None or int
- Number of rows affected by to_sql. None is returned if the callable
- passed into ``method`` does not return an integer number of rows.
-
- The returned count of affected rows is the sum of the ``rowcount``
- attribute of ``sqlite3.Cursor`` or the SQLAlchemy connectable, which may
- not reflect the exact number of written rows as documented for
- `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ and
- `SQLAlchemy <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.CursorResult.rowcount>`__.
-
- .. versionadded:: 1.4.0
-
- Raises
- ------
- ValueError
- When the table already exists and `if_exists` is 'fail' (the
- default).
-
- See Also
- --------
- read_sql : Read a DataFrame from a table.
-
- Notes
- -----
- Timezone aware datetime columns will be written as
- ``Timestamp with timezone`` type with SQLAlchemy if supported by the
- database. Otherwise, the datetimes will be stored as timezone unaware
- timestamps local to the original timezone.
-
- References
- ----------
- .. [1] https://docs.sqlalchemy.org
- .. [2] https://www.python.org/dev/peps/pep-0249/
-
- Examples
- --------
- Create an in-memory SQLite database.
-
- >>> from sqlalchemy import create_engine
- >>> engine = create_engine('sqlite://', echo=False)
-
- Create a table from scratch with 3 rows.
-
- >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
- >>> df
- name
- 0 User 1
- 1 User 2
- 2 User 3
-
- >>> df.to_sql('users', con=engine)
- 3
- >>> from sqlalchemy import text
- >>> with engine.connect() as conn:
- ... conn.execute(text("SELECT * FROM users")).fetchall()
- [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
-
- An `sqlalchemy.engine.Connection` can also be passed to `con`:
-
- >>> with engine.begin() as connection:
- ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
- ... df1.to_sql('users', con=connection, if_exists='append')
- 2
-
- This is allowed to support operations that require that the same
- DBAPI connection is used for the entire operation.
-
- >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
- >>> df2.to_sql('users', con=engine, if_exists='append')
- 2
- >>> with engine.connect() as conn:
- ... conn.execute(text("SELECT * FROM users")).fetchall()
- [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
- (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
- (1, 'User 7')]
-
- Overwrite the table with just ``df2``.
-
- >>> df2.to_sql('users', con=engine, if_exists='replace',
- ... index_label='id')
- 2
- >>> with engine.connect() as conn:
- ... conn.execute(text("SELECT * FROM users")).fetchall()
- [(0, 'User 6'), (1, 'User 7')]
-
- Specify the dtype (especially useful for integers with missing values).
- Notice that while pandas is forced to store the data as floating point,
- the database supports nullable integers. When fetching the data with
- Python, we get back integer scalars.
-
- >>> df = pd.DataFrame({"A": [1, None, 2]})
- >>> df
- A
- 0 1.0
- 1 NaN
- 2 2.0
-
- >>> from sqlalchemy.types import Integer
- >>> df.to_sql('integers', con=engine, index=False,
- ... dtype={"A": Integer()})
- 3
-
- >>> with engine.connect() as conn:
- ... conn.execute(text("SELECT * FROM integers")).fetchall()
- [(1,), (None,), (2,)]
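-
- A minimal sketch of a custom ``method`` callable (the name ``insert_rows``
- is illustrative, and it assumes the wrapped SQLAlchemy ``Table`` is
- reachable as ``pd_table.table``):
-
- >>> def insert_rows(pd_table, conn, keys, data_iter):
- ...     # keys: column names; data_iter: iterator of row tuples
- ...     rows = [dict(zip(keys, row)) for row in data_iter]
- ...     result = conn.execute(pd_table.table.insert(), rows)
- ...     return result.rowcount
- >>> df2.to_sql('users', con=engine, if_exists='append',
- ...            method=insert_rows) # doctest: +SKIP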
- """ # noqa:E501
- from pandas.io import sql
-
- return sql.to_sql(
- self,
- name,
- con,
- schema=schema,
- if_exists=if_exists,
- index=index,
- index_label=index_label,
- chunksize=chunksize,
- dtype=dtype,
- method=method,
- )
-
- @final
- @doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path",
- )
- def to_pickle(
- self,
- path: FilePath | WriteBuffer[bytes],
- compression: CompressionOptions = "infer",
- protocol: int = pickle.HIGHEST_PROTOCOL,
- storage_options: StorageOptions = None,
- ) -> None:
- """
- Pickle (serialize) object to file.
-
- Parameters
- ----------
- path : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function. File path where
- the pickled object will be stored.
- {compression_options}
- protocol : int
- Int which indicates which protocol should be used by the pickler,
- default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
- values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
- parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
-
- .. [1] https://docs.python.org/3/library/pickle.html.
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- See Also
- --------
- read_pickle : Load pickled pandas object (or any object) from file.
- DataFrame.to_hdf : Write DataFrame to an HDF5 file.
- DataFrame.to_sql : Write DataFrame to a SQL database.
- DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
-
- Examples
- --------
- >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
- >>> original_df # doctest: +SKIP
- foo bar
- 0 0 5
- 1 1 6
- 2 2 7
- 3 3 8
- 4 4 9
- >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP
-
- >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
- >>> unpickled_df # doctest: +SKIP
- foo bar
- 0 0 5
- 1 1 6
- 2 2 7
- 3 3 8
- 4 4 9
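-
- A hedged sketch of writing a compressed pickle (the path is illustrative;
- with the default ``compression="infer"`` the ``.gz`` suffix selects gzip,
- and ``read_pickle`` infers it the same way):
-
- >>> original_df.to_pickle("./dummy.pkl.gz") # doctest: +SKIP
- >>> unpickled_df = pd.read_pickle("./dummy.pkl.gz") # doctest: +SKIP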
- """ # noqa: E501
- from pandas.io.pickle import to_pickle
-
- to_pickle(
- self,
- path,
- compression=compression,
- protocol=protocol,
- storage_options=storage_options,
- )
-
- @final
- def to_clipboard(
- self, excel: bool_t = True, sep: str | None = None, **kwargs
- ) -> None:
- r"""
- Copy object to the system clipboard.
-
- Write a text representation of object to the system clipboard.
- This can be pasted into Excel, for example.
-
- Parameters
- ----------
- excel : bool, default True
- Produce output in a csv format for easy pasting into excel.
-
- - True, use the provided separator for csv pasting.
- - False, write a string representation of the object to the clipboard.
-
- sep : str, default ``'\t'``
- Field delimiter.
- **kwargs
- These parameters will be passed to DataFrame.to_csv.
-
- See Also
- --------
- DataFrame.to_csv : Write a DataFrame to a comma-separated values
- (csv) file.
- read_clipboard : Read text from clipboard and pass to read_csv.
-
- Notes
- -----
- Requirements for your platform.
-
- - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
- - Windows : none
- - macOS : none
-
- This method relies on the clipboard mechanisms provided by the `pyperclip`
- package. An approach for copying any string output format is shown in the
- examples.
-
- Examples
- --------
- Copy the contents of a DataFrame to the clipboard.
-
- >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
-
- >>> df.to_clipboard(sep=',') # doctest: +SKIP
- ... # Wrote the following to the system clipboard:
- ... # ,A,B,C
- ... # 0,1,2,3
- ... # 1,4,5,6
-
- We can omit the index by passing the keyword `index` and setting
- it to false.
-
- >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
- ... # Wrote the following to the system clipboard:
- ... # A,B,C
- ... # 1,2,3
- ... # 4,5,6
-
- Using the original `pyperclip` package for any string output format.
-
- .. code-block:: python
-
- import pyperclip
- html = df.style.to_html()
- pyperclip.copy(html)
- """
- from pandas.io import clipboards
-
- clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
-
- @final
- def to_xarray(self):
- """
- Return an xarray object from the pandas object.
-
- Returns
- -------
- xarray.DataArray or xarray.Dataset
- Data in the pandas structure converted to Dataset if the object is
- a DataFrame, or a DataArray if the object is a Series.
-
- See Also
- --------
- DataFrame.to_hdf : Write DataFrame to an HDF5 file.
- DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
-
- Notes
- -----
- See the `xarray docs <https://xarray.pydata.org/en/stable/>`__
-
- Examples
- --------
- >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
- ... ('parrot', 'bird', 24.0, 2),
- ... ('lion', 'mammal', 80.5, 4),
- ... ('monkey', 'mammal', np.nan, 4)],
- ... columns=['name', 'class', 'max_speed',
- ... 'num_legs'])
- >>> df
- name class max_speed num_legs
- 0 falcon bird 389.0 2
- 1 parrot bird 24.0 2
- 2 lion mammal 80.5 4
- 3 monkey mammal NaN 4
-
- >>> df.to_xarray()
- <xarray.Dataset>
- Dimensions: (index: 4)
- Coordinates:
- * index (index) int64 0 1 2 3
- Data variables:
- name (index) object 'falcon' 'parrot' 'lion' 'monkey'
- class (index) object 'bird' 'bird' 'mammal' 'mammal'
- max_speed (index) float64 389.0 24.0 80.5 nan
- num_legs (index) int64 2 2 4 4
-
- >>> df['max_speed'].to_xarray()
- <xarray.DataArray 'max_speed' (index: 4)>
- array([389. , 24. , 80.5, nan])
- Coordinates:
- * index (index) int64 0 1 2 3
-
- >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
- ... '2018-01-02', '2018-01-02'])
- >>> df_multiindex = pd.DataFrame({'date': dates,
- ... 'animal': ['falcon', 'parrot',
- ... 'falcon', 'parrot'],
- ... 'speed': [350, 18, 361, 15]})
- >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
-
- >>> df_multiindex
- speed
- date animal
- 2018-01-01 falcon 350
- parrot 18
- 2018-01-02 falcon 361
- parrot 15
-
- >>> df_multiindex.to_xarray()
- <xarray.Dataset>
- Dimensions: (date: 2, animal: 2)
- Coordinates:
- * date (date) datetime64[ns] 2018-01-01 2018-01-02
- * animal (animal) object 'falcon' 'parrot'
- Data variables:
- speed (date, animal) int64 350 18 361 15
- """
- xarray = import_optional_dependency("xarray")
-
- if self.ndim == 1:
- return xarray.DataArray.from_series(self)
- else:
- return xarray.Dataset.from_dataframe(self)
-
- @overload
- def to_latex(
- self,
- buf: None = ...,
- columns: Sequence[Hashable] | None = ...,
- header: bool_t | Sequence[str] = ...,
- index: bool_t = ...,
- na_rep: str = ...,
- formatters: FormattersType | None = ...,
- float_format: FloatFormatType | None = ...,
- sparsify: bool_t | None = ...,
- index_names: bool_t = ...,
- bold_rows: bool_t = ...,
- column_format: str | None = ...,
- longtable: bool_t | None = ...,
- escape: bool_t | None = ...,
- encoding: str | None = ...,
- decimal: str = ...,
- multicolumn: bool_t | None = ...,
- multicolumn_format: str | None = ...,
- multirow: bool_t | None = ...,
- caption: str | tuple[str, str] | None = ...,
- label: str | None = ...,
- position: str | None = ...,
- ) -> str:
- ...
-
- @overload
- def to_latex(
- self,
- buf: FilePath | WriteBuffer[str],
- columns: Sequence[Hashable] | None = ...,
- header: bool_t | Sequence[str] = ...,
- index: bool_t = ...,
- na_rep: str = ...,
- formatters: FormattersType | None = ...,
- float_format: FloatFormatType | None = ...,
- sparsify: bool_t | None = ...,
- index_names: bool_t = ...,
- bold_rows: bool_t = ...,
- column_format: str | None = ...,
- longtable: bool_t | None = ...,
- escape: bool_t | None = ...,
- encoding: str | None = ...,
- decimal: str = ...,
- multicolumn: bool_t | None = ...,
- multicolumn_format: str | None = ...,
- multirow: bool_t | None = ...,
- caption: str | tuple[str, str] | None = ...,
- label: str | None = ...,
- position: str | None = ...,
- ) -> None:
- ...
-
- @final
- def to_latex(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- columns: Sequence[Hashable] | None = None,
- header: bool_t | Sequence[str] = True,
- index: bool_t = True,
- na_rep: str = "NaN",
- formatters: FormattersType | None = None,
- float_format: FloatFormatType | None = None,
- sparsify: bool_t | None = None,
- index_names: bool_t = True,
- bold_rows: bool_t = False,
- column_format: str | None = None,
- longtable: bool_t | None = None,
- escape: bool_t | None = None,
- encoding: str | None = None,
- decimal: str = ".",
- multicolumn: bool_t | None = None,
- multicolumn_format: str | None = None,
- multirow: bool_t | None = None,
- caption: str | tuple[str, str] | None = None,
- label: str | None = None,
- position: str | None = None,
- ) -> str | None:
- r"""
- Render object to a LaTeX tabular, longtable, or nested table.
-
- Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted
- into a main LaTeX document or read from an external file
- with ``\input{{table.tex}}``.
-
- .. versionchanged:: 1.2.0
- Added position argument, changed meaning of caption argument.
-
- .. versionchanged:: 2.0.0
- Refactored to use the Styler implementation via jinja2 templating.
-
- Parameters
- ----------
- buf : str, Path or StringIO-like, optional, default None
- Buffer to write to. If None, the output is returned as a string.
- columns : list of label, optional
- The subset of columns to write. Writes all columns by default.
- header : bool or list of str, default True
- Write out the column names. If a list of strings is given,
- it is assumed to be aliases for the column names.
- index : bool, default True
- Write row names (index).
- na_rep : str, default 'NaN'
- Missing data representation.
- formatters : list of functions or dict of {{str: function}}, optional
- Formatter functions to apply to columns' elements by position or
- name. The result of each function must be a unicode string.
- List must be of length equal to the number of columns.
- float_format : one-parameter function or str, optional, default None
- Formatter for floating point numbers. For example
- ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will
- both result in 0.1234 being formatted as 0.12.
- sparsify : bool, optional
- Set to False for a DataFrame with a hierarchical index to print
- every multiindex key at each row. By default, the value will be
- read from the config module.
- index_names : bool, default True
- Prints the names of the indexes.
- bold_rows : bool, default False
- Make the row labels bold in the output.
- column_format : str, optional
- The columns format as specified in `LaTeX table format
- <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
- columns. By default, 'l' will be used for all columns except
- columns of numbers, which default to 'r'.
- longtable : bool, optional
- Use a longtable environment instead of tabular. Requires
- adding a \usepackage{{longtable}} to your LaTeX preamble.
- By default, the value will be read from the pandas config
- module, and set to `True` if the option ``styler.latex.environment`` is
- `"longtable"`.
-
- .. versionchanged:: 2.0.0
- The pandas option affecting this argument has changed.
- escape : bool, optional
- By default, the value will be read from the pandas config
- module and set to `True` if the option ``styler.format.escape`` is
- `"latex"`. When set to False, LaTeX special characters in column
- names are not escaped.
-
- .. versionchanged:: 2.0.0
- The pandas option affecting this argument has changed, as has the
- default value to `False`.
- encoding : str, optional
- A string representing the encoding to use in the output file,
- defaults to 'utf-8'.
- decimal : str, default '.'
- Character recognized as decimal separator, e.g. ',' in Europe.
- multicolumn : bool, default True
- Use \multicolumn to enhance MultiIndex columns.
- The default will be read from the config module, and is set
- as the option ``styler.sparse.columns``.
-
- .. versionchanged:: 2.0.0
- The pandas option affecting this argument has changed.
- multicolumn_format : str, default 'r'
- The alignment for multicolumns, similar to `column_format`
- The default will be read from the config module, and is set as the option
- ``styler.latex.multicol_align``.
-
- .. versionchanged:: 2.0.0
- The pandas option affecting this argument has changed, as has the
- default value to "r".
- multirow : bool, default True
- Use \multirow to enhance MultiIndex rows. Requires adding a
- \usepackage{{multirow}} to your LaTeX preamble. Will print
- centered labels (instead of top-aligned) across the contained
- rows, separating groups via clines. The default will be read
- from the pandas config module, and is set as the option
- ``styler.sparse.index``.
-
- .. versionchanged:: 2.0.0
- The pandas option affecting this argument has changed, as has the
- default value to `True`.
- caption : str or tuple, optional
- Tuple (full_caption, short_caption),
- which results in ``\caption[short_caption]{{full_caption}}``;
- if a single string is passed, no short caption will be set.
-
- .. versionchanged:: 1.2.0
- Optionally allow caption to be a tuple ``(full_caption, short_caption)``.
-
- label : str, optional
- The LaTeX label to be placed inside ``\label{{}}`` in the output.
- This is used with ``\ref{{}}`` in the main ``.tex`` file.
-
- position : str, optional
- The LaTeX positional argument for tables, to be placed after
- ``\begin{{}}`` in the output.
-
- .. versionadded:: 1.2.0
-
- Returns
- -------
- str or None
- If buf is None, returns the result as a string. Otherwise returns None.
-
- See Also
- --------
- io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX
- with conditional formatting.
- DataFrame.to_string : Render a DataFrame to a console-friendly
- tabular output.
- DataFrame.to_html : Render a DataFrame as an HTML table.
-
- Notes
- -----
- As of v2.0.0 this method has changed to use the Styler implementation as
- part of :meth:`.Styler.to_latex` via ``jinja2`` templating. This means
- that ``jinja2`` is a requirement, and needs to be installed, for this method
- to function. It is advised that users switch to using Styler, since that
- implementation is more frequently updated and contains much more
- flexibility with the output.
-
- Examples
- --------
- Convert a general DataFrame to LaTeX with formatting:
-
- >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
- ... age=[26, 45],
- ... height=[181.23, 177.65]))
- >>> print(df.to_latex(index=False,
- ... formatters={"name": str.upper},
- ... float_format="{:.1f}".format,
- ... )) # doctest: +SKIP
- \begin{tabular}{lrr}
- \toprule
- name & age & height \\
- \midrule
- RAPHAEL & 26 & 181.2 \\
- DONATELLO & 45 & 177.7 \\
- \bottomrule
- \end{tabular}
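-
- A hedged sketch of adding a caption and label (the values shown are
- illustrative; the tuple form also produces a short caption):
-
- >>> print(df.to_latex(caption=("Turtle sizes", "Turtles"),
- ...                   label="tab:turtles")) # doctest: +SKIP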
- """
- # Get defaults from the pandas config
- if self.ndim == 1:
- self = self.to_frame()
- if longtable is None:
- longtable = config.get_option("styler.latex.environment") == "longtable"
- if escape is None:
- escape = config.get_option("styler.format.escape") == "latex"
- if multicolumn is None:
- multicolumn = config.get_option("styler.sparse.columns")
- if multicolumn_format is None:
- multicolumn_format = config.get_option("styler.latex.multicol_align")
- if multirow is None:
- multirow = config.get_option("styler.sparse.index")
-
- if column_format is not None and not isinstance(column_format, str):
- raise ValueError("`column_format` must be str or unicode")
- length = len(self.columns) if columns is None else len(columns)
- if isinstance(header, (list, tuple)) and len(header) != length:
- raise ValueError(f"Writing {length} cols but got {len(header)} aliases")
-
- # Refactor formatters/float_format/decimal/na_rep/escape to Styler structure
- base_format_ = {
- "na_rep": na_rep,
- "escape": "latex" if escape else None,
- "decimal": decimal,
- }
- index_format_: dict[str, Any] = {"axis": 0, **base_format_}
- column_format_: dict[str, Any] = {"axis": 1, **base_format_}
-
- if isinstance(float_format, str):
- float_format_: Callable | None = lambda x: float_format % x
- else:
- float_format_ = float_format
-
- def _wrap(x, alt_format_):
- if isinstance(x, (float, complex)) and float_format_ is not None:
- return float_format_(x)
- else:
- return alt_format_(x)
-
- formatters_: list | tuple | dict | Callable | None = None
- if isinstance(formatters, list):
- formatters_ = {
- c: partial(_wrap, alt_format_=formatters[i])
- for i, c in enumerate(self.columns)
- }
- elif isinstance(formatters, dict):
- index_formatter = formatters.pop("__index__", None)
- column_formatter = formatters.pop("__columns__", None)
- if index_formatter is not None:
- index_format_.update({"formatter": index_formatter})
- if column_formatter is not None:
- column_format_.update({"formatter": column_formatter})
-
- formatters_ = formatters
- float_columns = self.select_dtypes(include="float").columns
- for col in float_columns:
- if col not in formatters.keys():
- formatters_.update({col: float_format_})
- elif formatters is None and float_format is not None:
- formatters_ = partial(_wrap, alt_format_=lambda v: v)
- format_index_ = [index_format_, column_format_]
-
- # Deal with hiding indexes and relabelling column names
- hide_: list[dict] = []
- relabel_index_: list[dict] = []
- if columns:
- hide_.append(
- {
- "subset": [c for c in self.columns if c not in columns],
- "axis": "columns",
- }
- )
- if header is False:
- hide_.append({"axis": "columns"})
- elif isinstance(header, (list, tuple)):
- relabel_index_.append({"labels": header, "axis": "columns"})
- format_index_ = [index_format_] # column_format is overwritten
-
- if index is False:
- hide_.append({"axis": "index"})
- if index_names is False:
- hide_.append({"names": True, "axis": "index"})
-
- render_kwargs_ = {
- "hrules": True,
- "sparse_index": sparsify,
- "sparse_columns": sparsify,
- "environment": "longtable" if longtable else None,
- "multicol_align": multicolumn_format
- if multicolumn
- else f"naive-{multicolumn_format}",
- "multirow_align": "t" if multirow else "naive",
- "encoding": encoding,
- "caption": caption,
- "label": label,
- "position": position,
- "column_format": column_format,
- "clines": "skip-last;data"
- if (multirow and isinstance(self.index, MultiIndex))
- else None,
- "bold_rows": bold_rows,
- }
-
- return self._to_latex_via_styler(
- buf,
- hide=hide_,
- relabel_index=relabel_index_,
- format={"formatter": formatters_, **base_format_},
- format_index=format_index_,
- render_kwargs=render_kwargs_,
- )
-
- def _to_latex_via_styler(
- self,
- buf=None,
- *,
- hide: dict | list[dict] | None = None,
- relabel_index: dict | list[dict] | None = None,
- format: dict | list[dict] | None = None,
- format_index: dict | list[dict] | None = None,
- render_kwargs: dict | None = None,
- ):
- """
- Render object to a LaTeX tabular, longtable, or nested table.
-
- Uses the ``Styler`` implementation with the following, ordered, method chaining:
-
- .. code-block:: python
-
- styler = Styler(DataFrame)
- styler.hide(**hide)
- styler.relabel_index(**relabel_index)
- styler.format(**format)
- styler.format_index(**format_index)
- styler.to_latex(buf=buf, **render_kwargs)
-
- Parameters
- ----------
- buf : str, Path or StringIO-like, optional, default None
- Buffer to write to. If None, the output is returned as a string.
- hide : dict, list of dict
- Keyword args to pass to the method call of ``Styler.hide``. If a list will
- call the method numerous times.
- relabel_index : dict, list of dict
- Keyword args to pass to the method of ``Styler.relabel_index``. If a list
- will call the method numerous times.
- format : dict, list of dict
- Keyword args to pass to the method call of ``Styler.format``. If a list will
- call the method numerous times.
- format_index : dict, list of dict
- Keyword args to pass to the method call of ``Styler.format_index``. If a
- list will call the method numerous times.
- render_kwargs : dict
- Keyword args to pass to the method call of ``Styler.to_latex``.
-
- Returns
- -------
- str or None
- If buf is None, returns the result as a string. Otherwise returns None.
- """
- from pandas.io.formats.style import Styler
-
- self = cast("DataFrame", self)
- styler = Styler(self, uuid="")
-
- for kw_name in ["hide", "relabel_index", "format", "format_index"]:
- kw = vars()[kw_name]
- if isinstance(kw, dict):
- getattr(styler, kw_name)(**kw)
- elif isinstance(kw, list):
- for sub_kw in kw:
- getattr(styler, kw_name)(**sub_kw)
-
- # bold_rows is not a direct kwarg of Styler.to_latex
- render_kwargs = {} if render_kwargs is None else render_kwargs
- if render_kwargs.pop("bold_rows"):
- styler.applymap_index(lambda v: "textbf:--rwrap;")
-
- return styler.to_latex(buf=buf, **render_kwargs)
-
- @overload
- def to_csv(
- self,
- path_or_buf: None = ...,
- sep: str = ...,
- na_rep: str = ...,
- float_format: str | Callable | None = ...,
- columns: Sequence[Hashable] | None = ...,
- header: bool_t | list[str] = ...,
- index: bool_t = ...,
- index_label: IndexLabel | None = ...,
- mode: str = ...,
- encoding: str | None = ...,
- compression: CompressionOptions = ...,
- quoting: int | None = ...,
- quotechar: str = ...,
- lineterminator: str | None = ...,
- chunksize: int | None = ...,
- date_format: str | None = ...,
- doublequote: bool_t = ...,
- escapechar: str | None = ...,
- decimal: str = ...,
- errors: str = ...,
- storage_options: StorageOptions = ...,
- ) -> str:
- ...
-
- @overload
- def to_csv(
- self,
- path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
- sep: str = ...,
- na_rep: str = ...,
- float_format: str | Callable | None = ...,
- columns: Sequence[Hashable] | None = ...,
- header: bool_t | list[str] = ...,
- index: bool_t = ...,
- index_label: IndexLabel | None = ...,
- mode: str = ...,
- encoding: str | None = ...,
- compression: CompressionOptions = ...,
- quoting: int | None = ...,
- quotechar: str = ...,
- lineterminator: str | None = ...,
- chunksize: int | None = ...,
- date_format: str | None = ...,
- doublequote: bool_t = ...,
- escapechar: str | None = ...,
- decimal: str = ...,
- errors: str = ...,
- storage_options: StorageOptions = ...,
- ) -> None:
- ...
-
- @final
- @doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path_or_buf",
- )
- def to_csv(
- self,
- path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
- sep: str = ",",
- na_rep: str = "",
- float_format: str | Callable | None = None,
- columns: Sequence[Hashable] | None = None,
- header: bool_t | list[str] = True,
- index: bool_t = True,
- index_label: IndexLabel | None = None,
- mode: str = "w",
- encoding: str | None = None,
- compression: CompressionOptions = "infer",
- quoting: int | None = None,
- quotechar: str = '"',
- lineterminator: str | None = None,
- chunksize: int | None = None,
- date_format: str | None = None,
- doublequote: bool_t = True,
- escapechar: str | None = None,
- decimal: str = ".",
- errors: str = "strict",
- storage_options: StorageOptions = None,
- ) -> str | None:
- r"""
- Write object to a comma-separated values (csv) file.
-
- Parameters
- ----------
- path_or_buf : str, path object, file-like object, or None, default None
- String, path object (implementing os.PathLike[str]), or file-like
- object implementing a write() function. If None, the result is
- returned as a string. If a non-binary file object is passed, it should
- be opened with `newline=''`, disabling universal newlines. If a binary
- file object is passed, `mode` might need to contain a `'b'`.
-
- .. versionchanged:: 1.2.0
-
- Support for binary file objects was introduced.
-
- sep : str, default ','
- String of length 1. Field delimiter for the output file.
- na_rep : str, default ''
- Missing data representation.
- float_format : str, Callable, default None
- Format string for floating point numbers. If a Callable is given, it takes
- precedence over other numeric formatting parameters, like decimal.
- columns : sequence, optional
- Columns to write.
- header : bool or list of str, default True
- Write out the column names. If a list of strings is given it is
- assumed to be aliases for the column names.
- index : bool, default True
- Write row names (index).
- index_label : str or sequence, or False, default None
- Column label for index column(s) if desired. If None is given, and
- `header` and `index` are True, then the index names are used. A
- sequence should be given if the object uses MultiIndex. If
- False do not print fields for index names. Use index_label=False
- for easier importing in R.
- mode : str, default 'w'
- Python write mode. The available write modes are the same as
- :py:func:`open`.
- encoding : str, optional
- A string representing the encoding to use in the output file,
- defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
- is a non-binary file object.
- {compression_options}
-
- .. versionchanged:: 1.0.0
-
- May now be a dict with key 'method' as compression mode
- and other entries as additional compression options if
- compression mode is 'zip'.
-
- .. versionchanged:: 1.1.0
-
- Passing compression options as keys in dict is
- supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
-
- .. versionchanged:: 1.2.0
-
- Compression is supported for binary file objects.
-
- .. versionchanged:: 1.2.0
-
- Previous versions forwarded dict entries for 'gzip' to
- `gzip.open` instead of `gzip.GzipFile` which prevented
- setting `mtime`.
-
- quoting : optional constant from csv module
- Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
- then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
- will treat them as non-numeric.
- quotechar : str, default '\"'
- String of length 1. Character used to quote fields.
- lineterminator : str, optional
- The newline character or character sequence to use in the output
- file. Defaults to `os.linesep`, which depends on the OS in which
- this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
-
- .. versionchanged:: 1.5.0
-
- Previously was line_terminator, changed for consistency with
- read_csv and the standard library 'csv' module.
-
- chunksize : int or None
- Rows to write at a time.
- date_format : str, default None
- Format string for datetime objects.
- doublequote : bool, default True
- Control quoting of `quotechar` inside a field.
- escapechar : str, default None
- String of length 1. Character used to escape `sep` and `quotechar`
- when appropriate.
- decimal : str, default '.'
- Character recognized as decimal separator. E.g. use ',' for
- European data.
- errors : str, default 'strict'
- Specifies how encoding and decoding errors are to be handled.
- See the errors argument for :func:`open` for a full list
- of options.
-
- .. versionadded:: 1.1.0
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- Returns
- -------
- None or str
- If path_or_buf is None, returns the resulting csv format as a
- string. Otherwise returns None.
-
- See Also
- --------
- read_csv : Load a CSV file into a DataFrame.
- to_excel : Write DataFrame to an Excel file.
-
- Examples
- --------
- >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
- ... 'mask': ['red', 'purple'],
- ... 'weapon': ['sai', 'bo staff']}})
- >>> df.to_csv(index=False)
- 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
-
- Create 'out.zip' containing 'out.csv'
-
- >>> compression_opts = dict(method='zip',
- ... archive_name='out.csv') # doctest: +SKIP
- >>> df.to_csv('out.zip', index=False,
- ... compression=compression_opts) # doctest: +SKIP
-
- To write a csv file to a new folder or nested folder you will first
- need to create it using either Pathlib or os:
-
- >>> from pathlib import Path # doctest: +SKIP
- >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
- >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
- >>> df.to_csv(filepath) # doctest: +SKIP
-
- >>> import os # doctest: +SKIP
- >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
- >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
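-
- A minimal sketch of ``float_format`` (the frame is illustrative):
-
- >>> pd.DataFrame({{'x': [0.123456, 1.5]}}).to_csv(float_format='%.2f') # doctest: +SKIP
- ',x\n0,0.12\n1,1.50\n'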
- """
- df = self if isinstance(self, ABCDataFrame) else self.to_frame()
-
- formatter = DataFrameFormatter(
- frame=df,
- header=header,
- index=index,
- na_rep=na_rep,
- float_format=float_format,
- decimal=decimal,
- )
-
- return DataFrameRenderer(formatter).to_csv(
- path_or_buf,
- lineterminator=lineterminator,
- sep=sep,
- encoding=encoding,
- errors=errors,
- compression=compression,
- quoting=quoting,
- columns=columns,
- index_label=index_label,
- mode=mode,
- chunksize=chunksize,
- quotechar=quotechar,
- date_format=date_format,
- doublequote=doublequote,
- escapechar=escapechar,
- storage_options=storage_options,
- )
-
- # ----------------------------------------------------------------------
- # Lookup Caching
-
- def _reset_cacher(self) -> None:
- """
- Reset the cacher.
- """
- raise AbstractMethodError(self)
-
- def _maybe_update_cacher(
- self,
- clear: bool_t = False,
- verify_is_copy: bool_t = True,
- inplace: bool_t = False,
- ) -> None:
- """
- See if we need to update our parent cacher; if ``clear`` is True, also
- clear our own cache.
-
- Parameters
- ----------
- clear : bool, default False
- Clear the item cache.
- verify_is_copy : bool, default True
- Provide is_copy checks.
- """
- if using_copy_on_write():
- return
-
- if verify_is_copy:
- self._check_setitem_copy(t="referent")
-
- if clear:
- self._clear_item_cache()
-
- def _clear_item_cache(self) -> None:
- raise AbstractMethodError(self)
-
- # ----------------------------------------------------------------------
- # Indexing Methods
-
- def take(self: NDFrameT, indices, axis: Axis = 0, **kwargs) -> NDFrameT:
- """
- Return the elements in the given *positional* indices along an axis.
-
- This means that we are not indexing according to actual values in
- the index attribute of the object. We are indexing according to the
- actual position of the element in the object.
-
- Parameters
- ----------
- indices : array-like
- An array of ints indicating which positions to take.
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- The axis on which to select elements. ``0`` means that we are
- selecting rows, ``1`` means that we are selecting columns.
- For `Series` this parameter is unused and defaults to 0.
- **kwargs
- For compatibility with :meth:`numpy.take`. Has no effect on the
- output.
-
- Returns
- -------
- same type as caller
- An array-like containing the elements taken from the object.
-
- See Also
- --------
- DataFrame.loc : Select a subset of a DataFrame by labels.
- DataFrame.iloc : Select a subset of a DataFrame by positions.
- numpy.take : Take elements from an array along an axis.
-
- Examples
- --------
- >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
- ... ('parrot', 'bird', 24.0),
- ... ('lion', 'mammal', 80.5),
- ... ('monkey', 'mammal', np.nan)],
- ... columns=['name', 'class', 'max_speed'],
- ... index=[0, 2, 3, 1])
- >>> df
- name class max_speed
- 0 falcon bird 389.0
- 2 parrot bird 24.0
- 3 lion mammal 80.5
- 1 monkey mammal NaN
-
- Take elements at positions 0 and 3 along the axis 0 (default).
-
- Note how the actual indices selected (0 and 1) do not correspond to
- our selected indices 0 and 3. That's because we are selecting the 0th
- and 3rd rows, not rows whose indices equal 0 and 3.
-
- >>> df.take([0, 3])
- name class max_speed
- 0 falcon bird 389.0
- 1 monkey mammal NaN
-
- Take elements at indices 1 and 2 along the axis 1 (column selection).
-
- >>> df.take([1, 2], axis=1)
- class max_speed
- 0 bird 389.0
- 2 bird 24.0
- 3 mammal 80.5
- 1 mammal NaN
-
- We may take elements using negative integers for positive indices,
- starting from the end of the object, just like with Python lists.
-
- >>> df.take([-1, -2])
- name class max_speed
- 1 monkey mammal NaN
- 3 lion mammal 80.5
- """
-
- nv.validate_take((), kwargs)
-
- return self._take(indices, axis)
-
- def _take(
- self: NDFrameT,
- indices,
- axis: Axis = 0,
- convert_indices: bool_t = True,
- ) -> NDFrameT:
- """
- Internal version of the `take` method allowing specification of additional args.
-
- See the docstring of `take` for full explanation of the parameters.
- """
- if not isinstance(indices, slice):
- indices = np.asarray(indices, dtype=np.intp)
- if (
- axis == 0
- and indices.ndim == 1
- and using_copy_on_write()
- and is_range_indexer(indices, len(self))
- ):
- return self.copy(deep=None)
-
- new_data = self._mgr.take(
- indices,
- axis=self._get_block_manager_axis(axis),
- verify=True,
- convert_indices=convert_indices,
- )
- return self._constructor(new_data).__finalize__(self, method="take")
-
- def _take_with_is_copy(self: NDFrameT, indices, axis: Axis = 0) -> NDFrameT:
- """
- Internal version of the `take` method that sets the `_is_copy`
- attribute to keep track of the parent dataframe (used in indexing
- for the SettingWithCopyWarning).
-
- See the docstring of `take` for full explanation of the parameters.
- """
- result = self._take(indices=indices, axis=axis)
- # Maybe set copy if we didn't actually change the index.
- if not result._get_axis(axis).equals(self._get_axis(axis)):
- result._set_is_copy(self)
- return result
-
- @final
- def xs(
- self: NDFrameT,
- key: IndexLabel,
- axis: Axis = 0,
- level: IndexLabel = None,
- drop_level: bool_t = True,
- ) -> NDFrameT:
- """
- Return cross-section from the Series/DataFrame.
-
- This method takes a `key` argument to select data at a particular
- level of a MultiIndex.
-
- Parameters
- ----------
- key : label or tuple of label
- Label contained in the index, or partially in a MultiIndex.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Axis to retrieve cross-section on.
- level : object, defaults to first n levels (n=1 or len(key))
- In case of a key partially contained in a MultiIndex, indicate
- which levels are used. Levels can be referred by label or position.
- drop_level : bool, default True
- If False, returns object with same levels as self.
-
- Returns
- -------
- Series or DataFrame
- Cross-section from the original Series or DataFrame
- corresponding to the selected index levels.
-
- See Also
- --------
- DataFrame.loc : Access a group of rows and columns
- by label(s) or a boolean array.
- DataFrame.iloc : Purely integer-location based indexing
- for selection by position.
-
- Notes
- -----
- `xs` cannot be used to set values.
-
- MultiIndex Slicers is a generic way to get/set values on
- any level or levels.
- It is a superset of `xs` functionality, see
- :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
-
- Examples
- --------
- >>> d = {'num_legs': [4, 4, 2, 2],
- ... 'num_wings': [0, 0, 2, 2],
- ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
- ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
- ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
- >>> df = pd.DataFrame(data=d)
- >>> df = df.set_index(['class', 'animal', 'locomotion'])
- >>> df
- num_legs num_wings
- class animal locomotion
- mammal cat walks 4 0
- dog walks 4 0
- bat flies 2 2
- bird penguin walks 2 2
-
- Get values at specified index
-
- >>> df.xs('mammal')
- num_legs num_wings
- animal locomotion
- cat walks 4 0
- dog walks 4 0
- bat flies 2 2
-
- Get values at several indexes
-
- >>> df.xs(('mammal', 'dog', 'walks'))
- num_legs 4
- num_wings 0
- Name: (mammal, dog, walks), dtype: int64
-
- Get values at specified index and level
-
- >>> df.xs('cat', level=1)
- num_legs num_wings
- class locomotion
- mammal walks 4 0
-
- Get values at several indexes and levels
-
- >>> df.xs(('bird', 'walks'),
- ... level=[0, 'locomotion'])
- num_legs num_wings
- animal
- penguin 2 2
-
- Get values at specified column and axis
-
- >>> df.xs('num_wings', axis=1)
- class animal locomotion
- mammal cat walks 0
- dog walks 0
- bat flies 2
- bird penguin walks 2
- Name: num_wings, dtype: int64
- """
- axis = self._get_axis_number(axis)
- labels = self._get_axis(axis)
-
- if isinstance(key, list):
- raise TypeError("list keys are not supported in xs, pass a tuple instead")
-
- if level is not None:
- if not isinstance(labels, MultiIndex):
- raise TypeError("Index must be a MultiIndex")
- loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
-
- # create the tuple of the indexer
- _indexer = [slice(None)] * self.ndim
- _indexer[axis] = loc
- indexer = tuple(_indexer)
-
- result = self.iloc[indexer]
- setattr(result, result._get_axis_name(axis), new_ax)
- return result
-
- if axis == 1:
- if drop_level:
- return self[key]
- index = self.columns
- else:
- index = self.index
-
- if isinstance(index, MultiIndex):
- loc, new_index = index._get_loc_level(key, level=0)
- if not drop_level:
- if lib.is_integer(loc):
- new_index = index[loc : loc + 1]
- else:
- new_index = index[loc]
- else:
- loc = index.get_loc(key)
-
- if isinstance(loc, np.ndarray):
- if loc.dtype == np.bool_:
- (inds,) = loc.nonzero()
- return self._take_with_is_copy(inds, axis=axis)
- else:
- return self._take_with_is_copy(loc, axis=axis)
-
- if not is_scalar(loc):
- new_index = index[loc]
-
- if is_scalar(loc) and axis == 0:
- # In this case loc should be an integer
- if self.ndim == 1:
- # if we encounter an array-like and we only have 1 dim
- # that means that there are list/ndarrays inside the Series!
- # so just return them (GH 6394)
- return self._values[loc]
-
- new_mgr = self._mgr.fast_xs(loc)
-
- result = self._constructor_sliced(
- new_mgr, name=self.index[loc]
- ).__finalize__(self)
- elif is_scalar(loc):
- result = self.iloc[:, slice(loc, loc + 1)]
- elif axis == 1:
- result = self.iloc[:, loc]
- else:
- result = self.iloc[loc]
- result.index = new_index
-
- # this could be a view
- # but only in a single-dtyped view sliceable case
- result._set_is_copy(self, copy=not result._is_view)
- return result
-
- def __getitem__(self, item):
- raise AbstractMethodError(self)
-
- def _slice(self: NDFrameT, slobj: slice, axis: Axis = 0) -> NDFrameT:
- """
- Construct a slice of this container.
-
- Slicing with this method is *always* positional.
- """
- assert isinstance(slobj, slice), type(slobj)
- axis = self._get_block_manager_axis(axis)
- result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
- result = result.__finalize__(self)
-
- # this could be a view
- # but only in a single-dtyped view sliceable case
- is_copy = axis != 0 or result._is_view
- result._set_is_copy(self, copy=is_copy)
- return result
-
- @final
- def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
- if not copy:
- self._is_copy = None
- else:
- assert ref is not None
- self._is_copy = weakref.ref(ref)
-
- def _check_is_chained_assignment_possible(self) -> bool_t:
- """
- Check if we are a view, have a cacher, and are of mixed type.
- If so, then force a setitem_copy check.
-
- Should be called just prior to setting a value.
-
- Will return a boolean indicating whether we are a view and are
- cached but single-dtyped, meaning that the cacher should be
- updated following the setting.
- """
- if self._is_copy:
- self._check_setitem_copy(t="referent")
- return False
-
- @final
- def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
- """
- Validate whether we are doing a setitem on a chained copy.
-
- Parameters
- ----------
- t : str, the type of setting error
- force : bool, default False
- If True, then force showing an error.
-
- It is technically possible to figure out that we are setting on
- a copy even WITH a multi-dtyped pandas object. In other words, some
- blocks may be views while others are not. Currently _is_view will ALWAYS
- return False for multi-blocks to avoid having to handle this case.
-
- df = DataFrame(np.arange(0,9), columns=['count'])
- df['group'] = 'b'
-
- # This technically need not raise SettingWithCopy if both are views
- # (which is not generally guaranteed but is usually True). However,
- # this is in general not a good practice and we recommend using .loc.
- df.iloc[0:5]['group'] = 'a'
-
- """
- if using_copy_on_write():
- return
-
- # return early if the check is not needed
- if not (force or self._is_copy):
- return
-
- value = config.get_option("mode.chained_assignment")
- if value is None:
- return
-
- # see if the copy is not actually referred; if so, then dissolve
- # the copy weakref
- if self._is_copy is not None and not isinstance(self._is_copy, str):
- r = self._is_copy()
- if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
- self._is_copy = None
- return
-
- # a custom message
- if isinstance(self._is_copy, str):
- t = self._is_copy
-
- elif t == "referent":
- t = (
- "\n"
- "A value is trying to be set on a copy of a slice from a "
- "DataFrame\n\n"
- "See the caveats in the documentation: "
- "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
- "indexing.html#returning-a-view-versus-a-copy"
- )
-
- else:
- t = (
- "\n"
- "A value is trying to be set on a copy of a slice from a "
- "DataFrame.\n"
- "Try using .loc[row_indexer,col_indexer] = value "
- "instead\n\nSee the caveats in the documentation: "
- "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
- "indexing.html#returning-a-view-versus-a-copy"
- )
-
- if value == "raise":
- raise SettingWithCopyError(t)
- if value == "warn":
- warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
-
- def __delitem__(self, key) -> None:
- """
- Delete item
- """
- deleted = False
-
- maybe_shortcut = False
- if self.ndim == 2 and isinstance(self.columns, MultiIndex):
- try:
- # By using engine's __contains__ we effectively
- # restrict to same-length tuples
- maybe_shortcut = key not in self.columns._engine
- except TypeError:
- pass
-
- if maybe_shortcut:
- # Allow shorthand to delete all columns whose first len(key)
- # elements match key:
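- # (Illustrative sketch: with MultiIndex columns [("a", "x"), ("a", "y"),
- # ("b", "x")], ``del df["a"]`` removes both ("a", "x") and ("a", "y").)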
- if not isinstance(key, tuple):
- key = (key,)
- for col in self.columns:
- if isinstance(col, tuple) and col[: len(key)] == key:
- del self[col]
- deleted = True
- if not deleted:
- # If the above loop ran and didn't delete anything because
- # there was no match, this call should raise the appropriate
- # exception:
- loc = self.axes[-1].get_loc(key)
- self._mgr = self._mgr.idelete(loc)
-
- # delete from the caches
- try:
- del self._item_cache[key]
- except KeyError:
- pass
-
- # ----------------------------------------------------------------------
- # Unsorted
-
- @final
- def _check_inplace_and_allows_duplicate_labels(self, inplace):
- if inplace and not self.flags.allows_duplicate_labels:
- raise ValueError(
- "Cannot specify 'inplace=True' when "
- "'self.flags.allows_duplicate_labels' is False."
- )
-
- @final
- def get(self, key, default=None):
- """
- Get item from object for given key (ex: DataFrame column).
-
- Returns default value if not found.
-
- Parameters
- ----------
- key : object
-
- Returns
- -------
- same type as items contained in object
-
- Examples
- --------
- >>> df = pd.DataFrame(
- ... [
- ... [24.3, 75.7, "high"],
- ... [31, 87.8, "high"],
- ... [22, 71.6, "medium"],
- ... [35, 95, "medium"],
- ... ],
- ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
- ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
- ... )
-
- >>> df
- temp_celsius temp_fahrenheit windspeed
- 2014-02-12 24.3 75.7 high
- 2014-02-13 31.0 87.8 high
- 2014-02-14 22.0 71.6 medium
- 2014-02-15 35.0 95.0 medium
-
- >>> df.get(["temp_celsius", "windspeed"])
- temp_celsius windspeed
- 2014-02-12 24.3 high
- 2014-02-13 31.0 high
- 2014-02-14 22.0 medium
- 2014-02-15 35.0 medium
-
- >>> ser = df['windspeed']
- >>> ser.get('2014-02-13')
- 'high'
-
- If the key isn't found, the default value will be used.
-
- >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
- 'default_value'
-
- >>> ser.get('2014-02-10', '[unknown]')
- '[unknown]'
- """
- try:
- return self[key]
- except (KeyError, ValueError, IndexError):
- return default
-
- @final
- @property
- def _is_view(self) -> bool_t:
- """Return boolean indicating if self is view of another array"""
- return self._mgr.is_view
-
- @final
- def reindex_like(
- self: NDFrameT,
- other,
- method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None,
- copy: bool_t | None = None,
- limit=None,
- tolerance=None,
- ) -> NDFrameT:
- """
- Return an object with indices matching those of another object.
-
- Conform the object to the same index on all axes. Optional
- filling logic, placing NaN in locations having no value
- in the previous index. A new object is produced unless the
- new index is equivalent to the current one and copy=False.
-
- Parameters
- ----------
- other : Object of the same data type
- Its row and column indices are used to define the new indices
- of this object.
- method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
- Method to use for filling holes in reindexed DataFrame.
- Please note: this is only applicable to DataFrames/Series with a
- monotonically increasing/decreasing index.
-
- * None (default): don't fill gaps
- * pad / ffill: propagate last valid observation forward to next
- valid
- * backfill / bfill: use next valid observation to fill gap
- * nearest: use nearest valid observations to fill gap.
-
- copy : bool, default True
- Return a new object, even if the passed indexes are the same.
- limit : int, default None
- Maximum number of consecutive labels to fill for inexact matches.
- tolerance : optional
- Maximum distance between original and new labels for inexact
- matches. The values of the index at the matching locations must
- satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
-
- Tolerance may be a scalar value, which applies the same tolerance
- to all values, or list-like, which applies variable tolerance per
- element. List-like includes list, tuple, array, Series, and must be
- the same size as the index and its dtype must exactly match the
- index's type.
-
- Returns
- -------
- Series or DataFrame
- Same type as caller, but with changed indices on each axis.
-
- See Also
- --------
- DataFrame.set_index : Set row labels.
- DataFrame.reset_index : Remove row labels or move them to new columns.
- DataFrame.reindex : Change to new indices or expand indices.
-
- Notes
- -----
- Same as calling
- ``.reindex(index=other.index, columns=other.columns,...)``.
-
- Examples
- --------
- >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
- ... [31, 87.8, 'high'],
- ... [22, 71.6, 'medium'],
- ... [35, 95, 'medium']],
- ... columns=['temp_celsius', 'temp_fahrenheit',
- ... 'windspeed'],
- ... index=pd.date_range(start='2014-02-12',
- ... end='2014-02-15', freq='D'))
-
- >>> df1
- temp_celsius temp_fahrenheit windspeed
- 2014-02-12 24.3 75.7 high
- 2014-02-13 31.0 87.8 high
- 2014-02-14 22.0 71.6 medium
- 2014-02-15 35.0 95.0 medium
-
- >>> df2 = pd.DataFrame([[28, 'low'],
- ... [30, 'low'],
- ... [35.1, 'medium']],
- ... columns=['temp_celsius', 'windspeed'],
- ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
- ... '2014-02-15']))
-
- >>> df2
- temp_celsius windspeed
- 2014-02-12 28.0 low
- 2014-02-13 30.0 low
- 2014-02-15 35.1 medium
-
- >>> df2.reindex_like(df1)
- temp_celsius temp_fahrenheit windspeed
- 2014-02-12 28.0 NaN low
- 2014-02-13 30.0 NaN low
- 2014-02-14 NaN NaN NaN
- 2014-02-15 35.1 NaN medium
- """
- d = other._construct_axes_dict(
- axes=self._AXIS_ORDERS,
- method=method,
- copy=copy,
- limit=limit,
- tolerance=tolerance,
- )
-
- return self.reindex(**d)
-
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level | None = ...,
- inplace: Literal[True],
- errors: IgnoreRaise = ...,
- ) -> None:
- ...
-
- @overload
- def drop(
- self: NDFrameT,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level | None = ...,
- inplace: Literal[False] = ...,
- errors: IgnoreRaise = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def drop(
- self: NDFrameT,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level | None = ...,
- inplace: bool_t = ...,
- errors: IgnoreRaise = ...,
- ) -> NDFrameT | None:
- ...
-
- def drop(
- self: NDFrameT,
- labels: IndexLabel = None,
- *,
- axis: Axis = 0,
- index: IndexLabel = None,
- columns: IndexLabel = None,
- level: Level | None = None,
- inplace: bool_t = False,
- errors: IgnoreRaise = "raise",
- ) -> NDFrameT | None:
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- if labels is not None:
- if index is not None or columns is not None:
- raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
- axis_name = self._get_axis_name(axis)
- axes = {axis_name: labels}
- elif index is not None or columns is not None:
- axes = {"index": index}
- if self.ndim == 2:
- axes["columns"] = columns
- else:
- raise ValueError(
- "Need to specify at least one of 'labels', 'index' or 'columns'"
- )
-
- obj = self
-
- for axis, labels in axes.items():
- if labels is not None:
- obj = obj._drop_axis(labels, axis, level=level, errors=errors)
-
- if inplace:
- self._update_inplace(obj)
- return None
- else:
- return obj
-
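-    # A minimal doctest-style sketch of the two calling conventions handled
-    # above (assuming the usual ``import pandas as pd``): ``labels`` plus
-    # ``axis`` and the ``index``/``columns`` keywords are equivalent, but the
-    # two styles cannot be mixed.
-    #
-    # >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
-    # >>> df.drop(labels="b", axis=1).equals(df.drop(columns="b"))
-    # True
-    # >>> df.drop(labels="b", columns="b")
-    # Traceback (most recent call last):
-    # ...
-    # ValueError: Cannot specify both 'labels' and 'index'/'columns'
-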
- @final
- def _drop_axis(
- self: NDFrameT,
- labels,
- axis,
- level=None,
- errors: IgnoreRaise = "raise",
- only_slice: bool_t = False,
- ) -> NDFrameT:
- """
- Drop labels from specified axis. Used in the ``drop`` method
- internally.
-
- Parameters
- ----------
- labels : single label or list-like
- axis : int or axis name
- level : int or level name, default None
- For MultiIndex
- errors : {'ignore', 'raise'}, default 'raise'
-            If 'ignore', suppress the error; only existing labels are dropped.
- only_slice : bool, default False
- Whether indexing along columns should be view-only.
-
- """
- axis_num = self._get_axis_number(axis)
- axis = self._get_axis(axis)
-
- if axis.is_unique:
- if level is not None:
- if not isinstance(axis, MultiIndex):
- raise AssertionError("axis must be a MultiIndex")
- new_axis = axis.drop(labels, level=level, errors=errors)
- else:
- new_axis = axis.drop(labels, errors=errors)
- indexer = axis.get_indexer(new_axis)
-
- # Case for non-unique axis
- else:
- is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
- labels = ensure_object(common.index_labels_to_array(labels))
- if level is not None:
- if not isinstance(axis, MultiIndex):
- raise AssertionError("axis must be a MultiIndex")
- mask = ~axis.get_level_values(level).isin(labels)
-
- # GH 18561 MultiIndex.drop should raise if label is absent
- if errors == "raise" and mask.all():
- raise KeyError(f"{labels} not found in axis")
- elif (
- isinstance(axis, MultiIndex)
- and labels.dtype == "object"
- and not is_tuple_labels
- ):
- # Set level to zero in case of MultiIndex and label is string,
- # because isin can't handle strings for MultiIndexes GH#36293
- # In case of tuples we get dtype object but have to use isin GH#42771
- mask = ~axis.get_level_values(0).isin(labels)
- else:
- mask = ~axis.isin(labels)
- # Check if label doesn't exist along axis
- labels_missing = (axis.get_indexer_for(labels) == -1).any()
- if errors == "raise" and labels_missing:
- raise KeyError(f"{labels} not found in axis")
-
- if is_extension_array_dtype(mask.dtype):
- # GH#45860
- mask = mask.to_numpy(dtype=bool)
-
- indexer = mask.nonzero()[0]
- new_axis = axis.take(indexer)
-
- bm_axis = self.ndim - axis_num - 1
- new_mgr = self._mgr.reindex_indexer(
- new_axis,
- indexer,
- axis=bm_axis,
- allow_dups=True,
- copy=None,
- only_slice=only_slice,
- )
- result = self._constructor(new_mgr)
- if self.ndim == 1:
- result.name = self.name
-
- return result.__finalize__(self)
-
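-    # A short illustrative doctest (plain pandas assumed) of the non-unique
-    # axis branch above, exercised through the public ``drop``: duplicate
-    # labels are all removed, and ``errors="ignore"`` suppresses the KeyError
-    # raised for labels that are absent from the axis.
-    #
-    # >>> s = pd.Series([1, 2, 3], index=["a", "a", "b"])
-    # >>> s.drop("a")
-    # b    3
-    # dtype: int64
-    # >>> s.drop("z", errors="ignore").equals(s)
-    # True
-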
- @final
- def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
- """
- Replace self internals with result.
-
- Parameters
- ----------
- result : same type as self
- verify_is_copy : bool, default True
- Provide is_copy checks.
- """
- # NOTE: This does *not* call __finalize__ and that's an explicit
- # decision that we may revisit in the future.
- self._reset_cache()
- self._clear_item_cache()
- self._mgr = result._mgr
- self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True)
-
- @final
- def add_prefix(self: NDFrameT, prefix: str, axis: Axis | None = None) -> NDFrameT:
- """
- Prefix labels with string `prefix`.
-
- For Series, the row labels are prefixed.
- For DataFrame, the column labels are prefixed.
-
- Parameters
- ----------
- prefix : str
- The string to add before each label.
- axis : {{0 or 'index', 1 or 'columns', None}}, default None
-            Axis to add prefix on.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- Series or DataFrame
- New Series or DataFrame with updated labels.
-
- See Also
- --------
- Series.add_suffix: Suffix row labels with string `suffix`.
- DataFrame.add_suffix: Suffix column labels with string `suffix`.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4])
- >>> s
- 0 1
- 1 2
- 2 3
- 3 4
- dtype: int64
-
- >>> s.add_prefix('item_')
- item_0 1
- item_1 2
- item_2 3
- item_3 4
- dtype: int64
-
- >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
- >>> df
- A B
- 0 1 3
- 1 2 4
- 2 3 5
- 3 4 6
-
- >>> df.add_prefix('col_')
- col_A col_B
- 0 1 3
- 1 2 4
- 2 3 5
- 3 4 6
- """
- f = lambda x: f"{prefix}{x}"
-
- axis_name = self._info_axis_name
- if axis is not None:
- axis_name = self._get_axis_name(axis)
-
- mapper = {axis_name: f}
-
- # error: Incompatible return value type (got "Optional[NDFrameT]",
- # expected "NDFrameT")
- # error: Argument 1 to "rename" of "NDFrame" has incompatible type
- # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
- # error: Keywords must be strings
- return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
-
- @final
- def add_suffix(self: NDFrameT, suffix: str, axis: Axis | None = None) -> NDFrameT:
- """
- Suffix labels with string `suffix`.
-
- For Series, the row labels are suffixed.
- For DataFrame, the column labels are suffixed.
-
- Parameters
- ----------
- suffix : str
- The string to add after each label.
- axis : {{0 or 'index', 1 or 'columns', None}}, default None
-            Axis to add suffix on.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- Series or DataFrame
- New Series or DataFrame with updated labels.
-
- See Also
- --------
- Series.add_prefix: Prefix row labels with string `prefix`.
- DataFrame.add_prefix: Prefix column labels with string `prefix`.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4])
- >>> s
- 0 1
- 1 2
- 2 3
- 3 4
- dtype: int64
-
- >>> s.add_suffix('_item')
- 0_item 1
- 1_item 2
- 2_item 3
- 3_item 4
- dtype: int64
-
- >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
- >>> df
- A B
- 0 1 3
- 1 2 4
- 2 3 5
- 3 4 6
-
- >>> df.add_suffix('_col')
- A_col B_col
- 0 1 3
- 1 2 4
- 2 3 5
- 3 4 6
- """
- f = lambda x: f"{x}{suffix}"
-
- axis_name = self._info_axis_name
- if axis is not None:
- axis_name = self._get_axis_name(axis)
-
- mapper = {axis_name: f}
- # error: Incompatible return value type (got "Optional[NDFrameT]",
- # expected "NDFrameT")
- # error: Argument 1 to "rename" of "NDFrame" has incompatible type
- # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
- # error: Keywords must be strings
- return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
-
- @overload
- def sort_values(
- self: NDFrameT,
- *,
- axis: Axis = ...,
- ascending: bool_t | Sequence[bool_t] = ...,
- inplace: Literal[False] = ...,
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool_t = ...,
- key: ValueKeyFunc = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def sort_values(
- self,
- *,
- axis: Axis = ...,
- ascending: bool_t | Sequence[bool_t] = ...,
- inplace: Literal[True],
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool_t = ...,
- key: ValueKeyFunc = ...,
- ) -> None:
- ...
-
- @overload
- def sort_values(
- self: NDFrameT,
- *,
- axis: Axis = ...,
- ascending: bool_t | Sequence[bool_t] = ...,
- inplace: bool_t = ...,
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool_t = ...,
- key: ValueKeyFunc = ...,
- ) -> NDFrameT | None:
- ...
-
- def sort_values(
- self: NDFrameT,
- *,
- axis: Axis = 0,
- ascending: bool_t | Sequence[bool_t] = True,
- inplace: bool_t = False,
- kind: str = "quicksort",
- na_position: str = "last",
- ignore_index: bool_t = False,
- key: ValueKeyFunc = None,
- ) -> NDFrameT | None:
- """
- Sort by the values along either axis.
-
- Parameters
- ----------%(optional_by)s
- axis : %(axes_single_arg)s, default 0
- Axis to be sorted.
- ascending : bool or list of bool, default True
- Sort ascending vs. descending. Specify list for multiple sort
- orders. If this is a list of bools, must match the length of
- the by.
- inplace : bool, default False
- If True, perform operation in-place.
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
- Choice of sorting algorithm. See also :func:`numpy.sort` for more
- information. `mergesort` and `stable` are the only stable algorithms. For
- DataFrames, this option is only applied when sorting on a single
- column or label.
- na_position : {'first', 'last'}, default 'last'
- Puts NaNs at the beginning if `first`; `last` puts NaNs at the
- end.
- ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- key : callable, optional
- Apply the key function to the values
- before sorting. This is similar to the `key` argument in the
- builtin :meth:`sorted` function, with the notable difference that
- this `key` function should be *vectorized*. It should expect a
- ``Series`` and return a Series with the same shape as the input.
- It will be applied to each column in `by` independently.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- DataFrame or None
- DataFrame with sorted values or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.sort_index : Sort a DataFrame by the index.
- Series.sort_values : Similar method for a Series.
-
- Examples
- --------
- >>> df = pd.DataFrame({
- ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
- ... 'col2': [2, 1, 9, 8, 7, 4],
- ... 'col3': [0, 1, 9, 4, 2, 3],
- ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
- ... })
- >>> df
- col1 col2 col3 col4
- 0 A 2 0 a
- 1 A 1 1 B
- 2 B 9 9 c
- 3 NaN 8 4 D
- 4 D 7 2 e
- 5 C 4 3 F
-
- Sort by col1
-
- >>> df.sort_values(by=['col1'])
- col1 col2 col3 col4
- 0 A 2 0 a
- 1 A 1 1 B
- 2 B 9 9 c
- 5 C 4 3 F
- 4 D 7 2 e
- 3 NaN 8 4 D
-
- Sort by multiple columns
-
- >>> df.sort_values(by=['col1', 'col2'])
- col1 col2 col3 col4
- 1 A 1 1 B
- 0 A 2 0 a
- 2 B 9 9 c
- 5 C 4 3 F
- 4 D 7 2 e
- 3 NaN 8 4 D
-
- Sort Descending
-
- >>> df.sort_values(by='col1', ascending=False)
- col1 col2 col3 col4
- 4 D 7 2 e
- 5 C 4 3 F
- 2 B 9 9 c
- 0 A 2 0 a
- 1 A 1 1 B
- 3 NaN 8 4 D
-
- Putting NAs first
-
- >>> df.sort_values(by='col1', ascending=False, na_position='first')
- col1 col2 col3 col4
- 3 NaN 8 4 D
- 4 D 7 2 e
- 5 C 4 3 F
- 2 B 9 9 c
- 0 A 2 0 a
- 1 A 1 1 B
-
- Sorting with a key function
-
- >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
- col1 col2 col3 col4
- 0 A 2 0 a
- 1 A 1 1 B
- 2 B 9 9 c
- 3 NaN 8 4 D
- 4 D 7 2 e
- 5 C 4 3 F
-
- Natural sort with the key argument,
-        using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
-
- >>> df = pd.DataFrame({
- ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
- ... "value": [10, 20, 30, 40, 50]
- ... })
- >>> df
- time value
- 0 0hr 10
- 1 128hr 20
- 2 72hr 30
- 3 48hr 40
- 4 96hr 50
- >>> from natsort import index_natsorted
- >>> df.sort_values(
- ... by="time",
- ... key=lambda x: np.argsort(index_natsorted(df["time"]))
- ... )
- time value
- 0 0hr 10
- 3 48hr 40
- 2 72hr 30
- 4 96hr 50
- 1 128hr 20
- """
- raise AbstractMethodError(self)
-
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool_t | Sequence[bool_t] = ...,
- inplace: Literal[True],
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool_t = ...,
- ignore_index: bool_t = ...,
- key: IndexKeyFunc = ...,
- ) -> None:
- ...
-
- @overload
- def sort_index(
- self: NDFrameT,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool_t | Sequence[bool_t] = ...,
- inplace: Literal[False] = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool_t = ...,
- ignore_index: bool_t = ...,
- key: IndexKeyFunc = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def sort_index(
- self: NDFrameT,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool_t | Sequence[bool_t] = ...,
- inplace: bool_t = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool_t = ...,
- ignore_index: bool_t = ...,
- key: IndexKeyFunc = ...,
- ) -> NDFrameT | None:
- ...
-
- def sort_index(
- self: NDFrameT,
- *,
- axis: Axis = 0,
- level: IndexLabel = None,
- ascending: bool_t | Sequence[bool_t] = True,
- inplace: bool_t = False,
- kind: SortKind = "quicksort",
- na_position: NaPosition = "last",
- sort_remaining: bool_t = True,
- ignore_index: bool_t = False,
- key: IndexKeyFunc = None,
- ) -> NDFrameT | None:
- inplace = validate_bool_kwarg(inplace, "inplace")
- axis = self._get_axis_number(axis)
- ascending = validate_ascending(ascending)
-
- target = self._get_axis(axis)
-
- indexer = get_indexer_indexer(
- target, level, ascending, kind, na_position, sort_remaining, key
- )
-
- if indexer is None:
- if inplace:
- result = self
- else:
- result = self.copy(deep=None)
-
- if ignore_index:
- result.index = default_index(len(self))
- if inplace:
- return None
- else:
- return result
-
- baxis = self._get_block_manager_axis(axis)
- new_data = self._mgr.take(indexer, axis=baxis, verify=False)
-
- # reconstruct axis if needed
- new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic())
-
- if ignore_index:
- axis = 1 if isinstance(self, ABCDataFrame) else 0
- new_data.set_axis(axis, default_index(len(indexer)))
-
- result = self._constructor(new_data)
-
- if inplace:
- return self._update_inplace(result)
- else:
- return result.__finalize__(self, method="sort_index")
-
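-    # A brief doctest-style sketch (plain pandas assumed) of the behaviour
-    # implemented above: labels are reordered along the chosen axis, and
-    # ``ignore_index=True`` additionally relabels the result 0..n-1.
-    #
-    # >>> df = pd.DataFrame({"x": [3, 1, 2]}, index=["c", "a", "b"])
-    # >>> df.sort_index()
-    #    x
-    # a  1
-    # b  2
-    # c  3
-    # >>> df.sort_index(ignore_index=True)
-    #    x
-    # 0  1
-    # 1  2
-    # 2  3
-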
- @doc(
- klass=_shared_doc_kwargs["klass"],
- optional_reindex="",
- )
- def reindex(
- self: NDFrameT,
- labels=None,
- index=None,
- columns=None,
- axis: Axis | None = None,
- method: str | None = None,
- copy: bool_t | None = None,
- level: Level | None = None,
- fill_value: Scalar | None = np.nan,
- limit: int | None = None,
- tolerance=None,
- ) -> NDFrameT:
- """
- Conform {klass} to new index with optional filling logic.
-
- Places NA/NaN in locations having no value in the previous index. A new object
- is produced unless the new index is equivalent to the current one and
- ``copy=False``.
-
- Parameters
- ----------
- {optional_reindex}
- method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
- Method to use for filling holes in reindexed DataFrame.
- Please note: this is only applicable to DataFrames/Series with a
- monotonically increasing/decreasing index.
-
- * None (default): don't fill gaps
- * pad / ffill: Propagate last valid observation forward to next
- valid.
- * backfill / bfill: Use next valid observation to fill gap.
- * nearest: Use nearest valid observations to fill gap.
-
- copy : bool, default True
- Return a new object, even if the passed indexes are the same.
- level : int or name
- Broadcast across a level, matching Index values on the
- passed MultiIndex level.
- fill_value : scalar, default np.NaN
- Value to use for missing values. Defaults to NaN, but can be any
- "compatible" value.
- limit : int, default None
- Maximum number of consecutive elements to forward or backward fill.
- tolerance : optional
- Maximum distance between original and new labels for inexact
-            matches. The values of the index at the matching locations must
- satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
-
- Tolerance may be a scalar value, which applies the same tolerance
- to all values, or list-like, which applies variable tolerance per
- element. List-like includes list, tuple, array, Series, and must be
- the same size as the index and its dtype must exactly match the
- index's type.
-
- Returns
- -------
- {klass} with changed index.
-
- See Also
- --------
- DataFrame.set_index : Set row labels.
- DataFrame.reset_index : Remove row labels or move them to new columns.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
-
- Examples
- --------
- ``DataFrame.reindex`` supports two calling conventions
-
- * ``(index=index_labels, columns=column_labels, ...)``
- * ``(labels, axis={{'index', 'columns'}}, ...)``
-
- We *highly* recommend using keyword arguments to clarify your
- intent.
-
- Create a dataframe with some fictional data.
-
- >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
- >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301],
- ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}},
- ... index=index)
- >>> df
- http_status response_time
- Firefox 200 0.04
- Chrome 200 0.02
- Safari 404 0.07
- IE10 404 0.08
- Konqueror 301 1.00
-
- Create a new index and reindex the dataframe. By default
- values in the new index that do not have corresponding
- records in the dataframe are assigned ``NaN``.
-
- >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
- ... 'Chrome']
- >>> df.reindex(new_index)
- http_status response_time
- Safari 404.0 0.07
- Iceweasel NaN NaN
- Comodo Dragon NaN NaN
- IE10 404.0 0.08
- Chrome 200.0 0.02
-
- We can fill in the missing values by passing a value to
- the keyword ``fill_value``. Because the index is not monotonically
- increasing or decreasing, we cannot use arguments to the keyword
- ``method`` to fill the ``NaN`` values.
-
- >>> df.reindex(new_index, fill_value=0)
- http_status response_time
- Safari 404 0.07
- Iceweasel 0 0.00
- Comodo Dragon 0 0.00
- IE10 404 0.08
- Chrome 200 0.02
-
- >>> df.reindex(new_index, fill_value='missing')
- http_status response_time
- Safari 404 0.07
- Iceweasel missing missing
- Comodo Dragon missing missing
- IE10 404 0.08
- Chrome 200 0.02
-
- We can also reindex the columns.
-
- >>> df.reindex(columns=['http_status', 'user_agent'])
- http_status user_agent
- Firefox 200 NaN
- Chrome 200 NaN
- Safari 404 NaN
- IE10 404 NaN
- Konqueror 301 NaN
-
- Or we can use "axis-style" keyword arguments
-
- >>> df.reindex(['http_status', 'user_agent'], axis="columns")
- http_status user_agent
- Firefox 200 NaN
- Chrome 200 NaN
- Safari 404 NaN
- IE10 404 NaN
- Konqueror 301 NaN
-
- To further illustrate the filling functionality in
- ``reindex``, we will create a dataframe with a
- monotonically increasing index (for example, a sequence
- of dates).
-
- >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
- >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}},
- ... index=date_index)
- >>> df2
- prices
- 2010-01-01 100.0
- 2010-01-02 101.0
- 2010-01-03 NaN
- 2010-01-04 100.0
- 2010-01-05 89.0
- 2010-01-06 88.0
-
- Suppose we decide to expand the dataframe to cover a wider
- date range.
-
- >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
- >>> df2.reindex(date_index2)
- prices
- 2009-12-29 NaN
- 2009-12-30 NaN
- 2009-12-31 NaN
- 2010-01-01 100.0
- 2010-01-02 101.0
- 2010-01-03 NaN
- 2010-01-04 100.0
- 2010-01-05 89.0
- 2010-01-06 88.0
- 2010-01-07 NaN
-
- The index entries that did not have a value in the original data frame
- (for example, '2009-12-29') are by default filled with ``NaN``.
- If desired, we can fill in the missing values using one of several
- options.
-
-        For example, to propagate the next valid value backward to fill the
-        ``NaN`` values, pass ``bfill`` as an argument to the ``method`` keyword.
-
- >>> df2.reindex(date_index2, method='bfill')
- prices
- 2009-12-29 100.0
- 2009-12-30 100.0
- 2009-12-31 100.0
- 2010-01-01 100.0
- 2010-01-02 101.0
- 2010-01-03 NaN
- 2010-01-04 100.0
- 2010-01-05 89.0
- 2010-01-06 88.0
- 2010-01-07 NaN
-
- Please note that the ``NaN`` value present in the original dataframe
- (at index value 2010-01-03) will not be filled by any of the
- value propagation schemes. This is because filling while reindexing
- does not look at dataframe values, but only compares the original and
- desired indexes. If you do want to fill in the ``NaN`` values present
- in the original dataframe, use the ``fillna()`` method.
-
- See the :ref:`user guide <basics.reindexing>` for more.
- """
- # TODO: Decide if we care about having different examples for different
- # kinds
-
- if index is not None and columns is not None and labels is not None:
- raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.")
- elif index is not None or columns is not None:
- if axis is not None:
- raise TypeError(
- "Cannot specify both 'axis' and any of 'index' or 'columns'"
- )
- if labels is not None:
- if index is not None:
- columns = labels
- else:
- index = labels
- else:
- if axis and self._get_axis_number(axis) == 1:
- columns = labels
- else:
- index = labels
- axes: dict[Literal["index", "columns"], Any] = {
- "index": index,
- "columns": columns,
- }
- method = clean_reindex_fill_method(method)
-
-        # If all axes requested to reindex are already equal, only copy if
-        # indicated; "equal" here means identical index names as well as values
- if copy and using_copy_on_write():
- copy = False
- if all(
- self._get_axis(axis_name).identical(ax)
- for axis_name, ax in axes.items()
- if ax is not None
- ):
- return self.copy(deep=copy)
-
- # check if we are a multi reindex
- if self._needs_reindex_multi(axes, method, level):
- return self._reindex_multi(axes, copy, fill_value)
-
- # perform the reindex on the axes
- return self._reindex_axes(
- axes, level, limit, tolerance, method, fill_value, copy
- ).__finalize__(self, method="reindex")
-
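-    # A hedged sketch of the ``method``/``tolerance`` interaction documented
-    # above (assuming a plain pandas install): nearest-neighbour filling only
-    # matches an old label that lies within ``tolerance`` of the new label.
-    #
-    # >>> s = pd.Series([1.0, 2.0, 3.0], index=[0, 5, 10])
-    # >>> r = s.reindex([0, 1, 6, 20], method="nearest", tolerance=2)
-    # >>> r.isna().tolist()
-    # [False, False, False, True]
-    # >>> r.loc[6]
-    # 2.0
-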
- def _reindex_axes(
- self: NDFrameT, axes, level, limit, tolerance, method, fill_value, copy
- ) -> NDFrameT:
- """Perform the reindex for all the axes."""
- obj = self
- for a in self._AXIS_ORDERS:
- labels = axes[a]
- if labels is None:
- continue
-
- ax = self._get_axis(a)
- new_index, indexer = ax.reindex(
- labels, level=level, limit=limit, tolerance=tolerance, method=method
- )
-
- axis = self._get_axis_number(a)
- obj = obj._reindex_with_indexers(
- {axis: [new_index, indexer]},
- fill_value=fill_value,
- copy=copy,
- allow_dups=False,
- )
- # If we've made a copy once, no need to make another one
- copy = False
-
- return obj
-
- def _needs_reindex_multi(self, axes, method, level) -> bool_t:
- """Check if we do need a multi reindex."""
- return (
- (common.count_not_none(*axes.values()) == self._AXIS_LEN)
- and method is None
- and level is None
- and not self._is_mixed_type
- and not (
- self.ndim == 2
- and len(self.dtypes) == 1
- and is_extension_array_dtype(self.dtypes.iloc[0])
- )
- )
-
- def _reindex_multi(self, axes, copy, fill_value):
- raise AbstractMethodError(self)
-
- @final
- def _reindex_with_indexers(
- self: NDFrameT,
- reindexers,
- fill_value=None,
- copy: bool_t | None = False,
- allow_dups: bool_t = False,
- ) -> NDFrameT:
- """allow_dups indicates an internal call here"""
- # reindex doing multiple operations on different axes if indicated
- new_data = self._mgr
- for axis in sorted(reindexers.keys()):
- index, indexer = reindexers[axis]
- baxis = self._get_block_manager_axis(axis)
-
- if index is None:
- continue
-
- index = ensure_index(index)
- if indexer is not None:
- indexer = ensure_platform_int(indexer)
-
- # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
- new_data = new_data.reindex_indexer(
- index,
- indexer,
- axis=baxis,
- fill_value=fill_value,
- allow_dups=allow_dups,
- copy=copy,
- )
- # If we've made a copy once, no need to make another one
- copy = False
-
- if (
- (copy or copy is None)
- and new_data is self._mgr
- and not using_copy_on_write()
- ):
- new_data = new_data.copy(deep=copy)
- elif using_copy_on_write() and new_data is self._mgr:
- new_data = new_data.copy(deep=False)
-
- return self._constructor(new_data).__finalize__(self)
-
- def filter(
- self: NDFrameT,
- items=None,
- like: str | None = None,
- regex: str | None = None,
- axis: Axis | None = None,
- ) -> NDFrameT:
- """
- Subset the dataframe rows or columns according to the specified index labels.
-
- Note that this routine does not filter a dataframe on its
- contents. The filter is applied to the labels of the index.
-
- Parameters
- ----------
- items : list-like
- Keep labels from axis which are in items.
- like : str
- Keep labels from axis for which "like in label == True".
- regex : str (regular expression)
- Keep labels from axis for which re.search(regex, label) == True.
-        axis : {0 or 'index', 1 or 'columns', None}, default None
- The axis to filter on, expressed either as an index (int)
- or axis name (str). By default this is the info axis, 'columns' for
- DataFrame. For `Series` this parameter is unused and defaults to `None`.
-
- Returns
- -------
- same type as input object
-
- See Also
- --------
- DataFrame.loc : Access a group of rows and columns
- by label(s) or a boolean array.
-
- Notes
- -----
- The ``items``, ``like``, and ``regex`` parameters are
- enforced to be mutually exclusive.
-
- ``axis`` defaults to the info axis that is used when indexing
- with ``[]``.
-
- Examples
- --------
- >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
- ... index=['mouse', 'rabbit'],
- ... columns=['one', 'two', 'three'])
- >>> df
- one two three
- mouse 1 2 3
- rabbit 4 5 6
-
- >>> # select columns by name
- >>> df.filter(items=['one', 'three'])
- one three
- mouse 1 3
- rabbit 4 6
-
- >>> # select columns by regular expression
- >>> df.filter(regex='e$', axis=1)
- one three
- mouse 1 3
- rabbit 4 6
-
- >>> # select rows containing 'bbi'
- >>> df.filter(like='bbi', axis=0)
- one two three
- rabbit 4 5 6
- """
- nkw = common.count_not_none(items, like, regex)
- if nkw > 1:
- raise TypeError(
- "Keyword arguments `items`, `like`, or `regex` "
- "are mutually exclusive"
- )
-
- if axis is None:
- axis = self._info_axis_name
- labels = self._get_axis(axis)
-
- if items is not None:
- name = self._get_axis_name(axis)
- # error: Keywords must be strings
- return self.reindex( # type: ignore[misc]
- **{name: [r for r in items if r in labels]} # type: ignore[arg-type]
- )
- elif like:
-
- def f(x) -> bool_t:
- assert like is not None # needed for mypy
- return like in ensure_str(x)
-
- values = labels.map(f)
- return self.loc(axis=axis)[values]
- elif regex:
-
- def f(x) -> bool_t:
- return matcher.search(ensure_str(x)) is not None
-
- matcher = re.compile(regex)
- values = labels.map(f)
- return self.loc(axis=axis)[values]
- else:
- raise TypeError("Must pass either `items`, `like`, or `regex`")
-
- @final
- def head(self: NDFrameT, n: int = 5) -> NDFrameT:
- """
- Return the first `n` rows.
-
- This function returns the first `n` rows for the object based
- on position. It is useful for quickly testing if your object
- has the right type of data in it.
-
- For negative values of `n`, this function returns all rows except
- the last `|n|` rows, equivalent to ``df[:n]``.
-
- If n is larger than the number of rows, this function returns all rows.
-
- Parameters
- ----------
- n : int, default 5
- Number of rows to select.
-
- Returns
- -------
- same type as caller
- The first `n` rows of the caller object.
-
- See Also
- --------
- DataFrame.tail: Returns the last `n` rows.
-
- Examples
- --------
- >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
- ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
- >>> df
- animal
- 0 alligator
- 1 bee
- 2 falcon
- 3 lion
- 4 monkey
- 5 parrot
- 6 shark
- 7 whale
- 8 zebra
-
- Viewing the first 5 lines
-
- >>> df.head()
- animal
- 0 alligator
- 1 bee
- 2 falcon
- 3 lion
- 4 monkey
-
- Viewing the first `n` lines (three in this case)
-
- >>> df.head(3)
- animal
- 0 alligator
- 1 bee
- 2 falcon
-
- For negative values of `n`
-
- >>> df.head(-3)
- animal
- 0 alligator
- 1 bee
- 2 falcon
- 3 lion
- 4 monkey
- 5 parrot
- """
- return self.iloc[:n]
-
- @final
- def tail(self: NDFrameT, n: int = 5) -> NDFrameT:
- """
- Return the last `n` rows.
-
-        This function returns the last `n` rows from the object based on
- position. It is useful for quickly verifying data, for example,
- after sorting or appending rows.
-
- For negative values of `n`, this function returns all rows except
- the first `|n|` rows, equivalent to ``df[|n|:]``.
-
- If n is larger than the number of rows, this function returns all rows.
-
- Parameters
- ----------
- n : int, default 5
- Number of rows to select.
-
- Returns
- -------
- type of caller
- The last `n` rows of the caller object.
-
- See Also
- --------
- DataFrame.head : The first `n` rows of the caller object.
-
- Examples
- --------
- >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
- ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
- >>> df
- animal
- 0 alligator
- 1 bee
- 2 falcon
- 3 lion
- 4 monkey
- 5 parrot
- 6 shark
- 7 whale
- 8 zebra
-
- Viewing the last 5 lines
-
- >>> df.tail()
- animal
- 4 monkey
- 5 parrot
- 6 shark
- 7 whale
- 8 zebra
-
- Viewing the last `n` lines (three in this case)
-
- >>> df.tail(3)
- animal
- 6 shark
- 7 whale
- 8 zebra
-
- For negative values of `n`
-
- >>> df.tail(-3)
- animal
- 3 lion
- 4 monkey
- 5 parrot
- 6 shark
- 7 whale
- 8 zebra
- """
- if n == 0:
- return self.iloc[0:0]
- return self.iloc[-n:]
-
- @final
- def sample(
- self: NDFrameT,
- n: int | None = None,
- frac: float | None = None,
- replace: bool_t = False,
- weights=None,
- random_state: RandomState | None = None,
- axis: Axis | None = None,
- ignore_index: bool_t = False,
- ) -> NDFrameT:
- """
- Return a random sample of items from an axis of object.
-
- You can use `random_state` for reproducibility.
-
- Parameters
- ----------
- n : int, optional
- Number of items from axis to return. Cannot be used with `frac`.
- Default = 1 if `frac` = None.
- frac : float, optional
- Fraction of axis items to return. Cannot be used with `n`.
- replace : bool, default False
- Allow or disallow sampling of the same row more than once.
- weights : str or ndarray-like, optional
- Default 'None' results in equal probability weighting.
- If passed a Series, will align with target object on index. Index
- values in weights not found in sampled object will be ignored and
- index values in sampled object not in weights will be assigned
- weights of zero.
- If called on a DataFrame, will accept the name of a column
- when axis = 0.
- Unless weights are a Series, weights must be same length as axis
- being sampled.
- If weights do not sum to 1, they will be normalized to sum to 1.
- Missing values in the weights column will be treated as zero.
- Infinite values not allowed.
- random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
- If int, array-like, or BitGenerator, seed for random number generator.
- If np.random.RandomState or np.random.Generator, use as given.
-
- .. versionchanged:: 1.1.0
-
- array-like and BitGenerator object now passed to np.random.RandomState()
- as seed
-
- .. versionchanged:: 1.4.0
-
- np.random.Generator objects now accepted
-
-        axis : {0 or 'index', 1 or 'columns', None}, default None
- Axis to sample. Accepts axis number or name. Default is stat axis
- for given data type. For `Series` this parameter is unused and defaults to `None`.
- ignore_index : bool, default False
- If True, the resulting index will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- Series or DataFrame
- A new object of same type as caller containing `n` items randomly
- sampled from the caller object.
-
- See Also
- --------
- DataFrameGroupBy.sample: Generates random samples from each group of a
- DataFrame object.
- SeriesGroupBy.sample: Generates random samples from each group of a
- Series object.
- numpy.random.choice: Generates a random sample from a given 1-D numpy
- array.
-
- Notes
- -----
-        If `frac` > 1, `replace` should be set to `True`.
-
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
- ... 'num_wings': [2, 0, 0, 0],
- ... 'num_specimen_seen': [10, 2, 1, 8]},
- ... index=['falcon', 'dog', 'spider', 'fish'])
- >>> df
- num_legs num_wings num_specimen_seen
- falcon 2 2 10
- dog 4 0 2
- spider 8 0 1
- fish 0 0 8
-
- Extract 3 random elements from the ``Series`` ``df['num_legs']``:
- Note that we use `random_state` to ensure the reproducibility of
- the examples.
-
- >>> df['num_legs'].sample(n=3, random_state=1)
- fish 0
- spider 8
- falcon 2
- Name: num_legs, dtype: int64
-
- A random 50% sample of the ``DataFrame`` with replacement:
-
- >>> df.sample(frac=0.5, replace=True, random_state=1)
- num_legs num_wings num_specimen_seen
- dog 4 0 2
- fish 0 0 8
-
-        An upsampled sample of the ``DataFrame`` with replacement:
-        Note that the `replace` parameter has to be `True` when `frac` > 1.
-
- >>> df.sample(frac=2, replace=True, random_state=1)
- num_legs num_wings num_specimen_seen
- dog 4 0 2
- fish 0 0 8
- falcon 2 2 10
- falcon 2 2 10
- fish 0 0 8
- dog 4 0 2
- fish 0 0 8
- dog 4 0 2
-
- Using a DataFrame column as weights. Rows with larger value in the
- `num_specimen_seen` column are more likely to be sampled.
-
- >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
- num_legs num_wings num_specimen_seen
- falcon 2 2 10
- fish 0 0 8
- """ # noqa:E501
- if axis is None:
- axis = self._stat_axis_number
-
- axis = self._get_axis_number(axis)
- obj_len = self.shape[axis]
-
- # Process random_state argument
- rs = common.random_state(random_state)
-
- size = sample.process_sampling_size(n, frac, replace)
- if size is None:
- assert frac is not None
- size = round(frac * obj_len)
-
- if weights is not None:
- weights = sample.preprocess_weights(self, weights, axis)
-
- sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
- result = self.take(sampled_indices, axis=axis)
-
- if ignore_index:
- result.index = default_index(len(result))
-
- return result
-
- @final
- @doc(klass=_shared_doc_kwargs["klass"])
- def pipe(
- self,
- func: Callable[..., T] | tuple[Callable[..., T], str],
- *args,
- **kwargs,
- ) -> T:
- r"""
- Apply chainable functions that expect Series or DataFrames.
-
- Parameters
- ----------
- func : function
- Function to apply to the {klass}.
-            ``args`` and ``kwargs`` are passed into ``func``.
- Alternatively a ``(callable, data_keyword)`` tuple where
- ``data_keyword`` is a string indicating the keyword of
- ``callable`` that expects the {klass}.
- args : iterable, optional
- Positional arguments passed into ``func``.
- kwargs : mapping, optional
- A dictionary of keyword arguments passed into ``func``.
-
- Returns
- -------
- the return type of ``func``.
-
- See Also
- --------
- DataFrame.apply : Apply a function along input axis of DataFrame.
- DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
- Series.map : Apply a mapping correspondence on a
- :class:`~pandas.Series`.
-
- Notes
- -----
- Use ``.pipe`` when chaining together functions that expect
- Series, DataFrames or GroupBy objects. Instead of writing
-
- >>> func(g(h(df), arg1=a), arg2=b, arg3=c) # doctest: +SKIP
-
- You can write
-
- >>> (df.pipe(h)
- ... .pipe(g, arg1=a)
- ... .pipe(func, arg2=b, arg3=c)
- ... ) # doctest: +SKIP
-
- If you have a function that takes the data as (say) the second
- argument, pass a tuple indicating which keyword expects the
- data. For example, suppose ``func`` takes its data as ``arg2``:
-
- >>> (df.pipe(h)
- ... .pipe(g, arg1=a)
- ... .pipe((func, 'arg2'), arg1=a, arg3=c)
- ... ) # doctest: +SKIP
- """
- if using_copy_on_write():
- return common.pipe(self.copy(deep=None), func, *args, **kwargs)
- return common.pipe(self, func, *args, **kwargs)
-
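-    # A concrete, self-contained variant of the ``+SKIP`` examples above
-    # (``subtract`` is a hypothetical helper, not pandas API): a bare callable
-    # and a ``(callable, data_keyword)`` tuple are piped the same way.
-    #
-    # >>> def subtract(df, value):
-    # ...     return df - value
-    # >>> df = pd.DataFrame({"a": [1, 2, 3]})
-    # >>> df.pipe(subtract, value=1).pipe((subtract, "df"), value=1)["a"].tolist()
-    # [-1, 0, 1]
-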
- # ----------------------------------------------------------------------
- # Attribute access
-
- @final
- def __finalize__(
- self: NDFrameT, other, method: str | None = None, **kwargs
- ) -> NDFrameT:
- """
- Propagate metadata from other to self.
-
- Parameters
- ----------
- other : the object from which to get the attributes that we are going
- to propagate
- method : str, optional
- A passed method name providing context on where ``__finalize__``
- was called.
-
- .. warning::
-
-                The value passed as `method` is not currently considered
- stable across pandas releases.
- """
- if isinstance(other, NDFrame):
- for name in other.attrs:
- self.attrs[name] = other.attrs[name]
-
- self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
- # For subclasses using _metadata.
- for name in set(self._metadata) & set(other._metadata):
- assert isinstance(name, str)
- object.__setattr__(self, name, getattr(other, name, None))
-
- if method == "concat":
- attrs = other.objs[0].attrs
- check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
- if check_attrs:
- for name in attrs:
- self.attrs[name] = attrs[name]
-
- allows_duplicate_labels = all(
- x.flags.allows_duplicate_labels for x in other.objs
- )
- self.flags.allows_duplicate_labels = allows_duplicate_labels
-
- return self
-
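-    # A doctest-style sketch, grounded in the propagation logic above: ``attrs``
-    # set on one object are carried over to results whose construction goes
-    # through ``__finalize__`` (``copy`` is used here because it finalizes
-    # explicitly; coverage of other operations may vary).
-    #
-    # >>> df = pd.DataFrame({"a": [1, 2]})
-    # >>> df.attrs["source"] = "sensor-1"
-    # >>> df.copy().attrs
-    # {'source': 'sensor-1'}
-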
- def __getattr__(self, name: str):
- """
-        After regular attribute access, try looking up the name.
- This allows simpler access to columns for interactive use.
- """
- # Note: obj.x will always call obj.__getattribute__('x') prior to
- # calling obj.__getattr__('x').
- if (
- name not in self._internal_names_set
- and name not in self._metadata
- and name not in self._accessors
- and self._info_axis._can_hold_identifiers_and_holds_name(name)
- ):
- return self[name]
- return object.__getattribute__(self, name)
-
- def __setattr__(self, name: str, value) -> None:
- """
-        After regular attribute access, try setting the name.
- This allows simpler access to columns for interactive use.
- """
- # first try regular attribute access via __getattribute__, so that
- # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
- # the same attribute.
-
- try:
- object.__getattribute__(self, name)
- return object.__setattr__(self, name, value)
- except AttributeError:
- pass
-
- # if this fails, go on to more involved attribute setting
- # (note that this matches __getattr__, above).
- if name in self._internal_names_set:
- object.__setattr__(self, name, value)
- elif name in self._metadata:
- object.__setattr__(self, name, value)
- else:
- try:
- existing = getattr(self, name)
- if isinstance(existing, Index):
- object.__setattr__(self, name, value)
- elif name in self._info_axis:
- self[name] = value
- else:
- object.__setattr__(self, name, value)
- except (AttributeError, TypeError):
- if isinstance(self, ABCDataFrame) and (is_list_like(value)):
- warnings.warn(
- "Pandas doesn't allow columns to be "
- "created via a new attribute name - see "
- "https://pandas.pydata.org/pandas-docs/"
- "stable/indexing.html#attribute-access",
- stacklevel=find_stack_level(),
- )
- object.__setattr__(self, name, value)
-
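-    # A short doctest-style illustration (plain pandas assumed) of the
-    # attribute fallback above: existing columns can be read as attributes,
-    # but assigning to a brand-new attribute does not create a column, which
-    # is why the list-like case above only warns.
-    #
-    # >>> df = pd.DataFrame({"a": [1, 2]})
-    # >>> df.a.tolist()
-    # [1, 2]
-    # >>> df["b"] = [3, 4]  # column creation needs item assignment
-    # >>> list(df.columns)
-    # ['a', 'b']
-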
- @final
- def _dir_additions(self) -> set[str]:
- """
- add the string-like attributes from the info_axis.
- If info_axis is a MultiIndex, its first level values are used.
- """
- additions = super()._dir_additions()
- if self._info_axis._can_hold_strings:
- additions.update(self._info_axis._dir_additions_for_owner)
- return additions
-
- # ----------------------------------------------------------------------
- # Consolidation of internals
-
- @final
- def _protect_consolidate(self, f):
- """
- Consolidate _mgr -- if the blocks have changed, then clear the
- cache
- """
- if isinstance(self._mgr, (ArrayManager, SingleArrayManager)):
- return f()
- blocks_before = len(self._mgr.blocks)
- result = f()
- if len(self._mgr.blocks) != blocks_before:
- self._clear_item_cache()
- return result
-
- @final
- def _consolidate_inplace(self) -> None:
- """Consolidate data in place and return None"""
-
- def f() -> None:
- self._mgr = self._mgr.consolidate()
-
- self._protect_consolidate(f)
-
- @final
- def _consolidate(self):
- """
- Compute NDFrame with "consolidated" internals (data of each dtype
- grouped together in a single ndarray).
-
- Returns
- -------
- consolidated : same type as caller
- """
- f = lambda: self._mgr.consolidate()
- cons_data = self._protect_consolidate(f)
- return self._constructor(cons_data).__finalize__(self)
-
- @property
- def _is_mixed_type(self) -> bool_t:
- if self._mgr.is_single_block:
- return False
-
- if self._mgr.any_extension_types:
- # Even if they have the same dtype, we can't consolidate them,
-            # so we pretend this is "mixed"
- return True
-
- return self.dtypes.nunique() > 1
-
- @final
- def _check_inplace_setting(self, value) -> bool_t:
- """check whether we allow in-place setting with this type of value"""
- if self._is_mixed_type and not self._mgr.is_numeric_mixed_type:
- # allow an actual np.nan through
- if is_float(value) and np.isnan(value) or value is lib.no_default:
- return True
-
- raise TypeError(
- "Cannot do inplace boolean setting on "
- "mixed-types with a non np.nan value"
- )
-
- return True
-
- @final
- def _get_numeric_data(self: NDFrameT) -> NDFrameT:
- return self._constructor(self._mgr.get_numeric_data()).__finalize__(self)
-
- @final
- def _get_bool_data(self):
- return self._constructor(self._mgr.get_bool_data()).__finalize__(self)
-
- # ----------------------------------------------------------------------
- # Internal Interface Methods
-
- @property
- def values(self):
- raise AbstractMethodError(self)
-
- @property
- def _values(self) -> ArrayLike:
- """internal implementation"""
- raise AbstractMethodError(self)
-
- @property
- def dtypes(self):
- """
- Return the dtypes in the DataFrame.
-
- This returns a Series with the data type of each column.
- The result's index is the original DataFrame's columns. Columns
- with mixed types are stored with the ``object`` dtype. See
- :ref:`the User Guide <basics.dtypes>` for more.
-
- Returns
- -------
- pandas.Series
- The data type of each column.
-
- Examples
- --------
- >>> df = pd.DataFrame({'float': [1.0],
- ... 'int': [1],
- ... 'datetime': [pd.Timestamp('20180310')],
- ... 'string': ['foo']})
- >>> df.dtypes
- float float64
- int int64
- datetime datetime64[ns]
- string object
- dtype: object
- """
- data = self._mgr.get_dtypes()
- return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
-
- def astype(
- self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
- ) -> NDFrameT:
- """
- Cast a pandas object to a specified dtype ``dtype``.
-
- Parameters
- ----------
- dtype : str, data type, Series or Mapping of column name -> data type
- Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to
- cast entire pandas object to the same type. Alternatively, use a
- mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is
- a numpy.dtype or Python type to cast one or more of the DataFrame's
- columns to column-specific types.
- copy : bool, default True
- Return a copy when ``copy=True`` (be very careful setting
- ``copy=False`` as changes to values then may propagate to other
- pandas objects).
- errors : {'raise', 'ignore'}, default 'raise'
- Control raising of exceptions on invalid data for provided dtype.
-
- - ``raise`` : allow exceptions to be raised
- - ``ignore`` : suppress exceptions. On error return original object.
-
- Returns
- -------
- same type as caller
-
- See Also
- --------
- to_datetime : Convert argument to datetime.
- to_timedelta : Convert argument to timedelta.
- to_numeric : Convert argument to a numeric type.
- numpy.ndarray.astype : Cast a numpy array to a specified type.
-
- Notes
- -----
- .. versionchanged:: 2.0.0
-
- Using ``astype`` to convert from timezone-naive dtype to
- timezone-aware dtype will raise an exception.
- Use :meth:`Series.dt.tz_localize` instead.
-
- Examples
- --------
- Create a DataFrame:
-
- >>> d = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df = pd.DataFrame(data=d)
- >>> df.dtypes
- col1 int64
- col2 int64
- dtype: object
-
- Cast all columns to int32:
-
- >>> df.astype('int32').dtypes
- col1 int32
- col2 int32
- dtype: object
-
- Cast col1 to int32 using a dictionary:
-
- >>> df.astype({'col1': 'int32'}).dtypes
- col1 int32
- col2 int64
- dtype: object
-
- Create a series:
-
- >>> ser = pd.Series([1, 2], dtype='int32')
- >>> ser
- 0 1
- 1 2
- dtype: int32
- >>> ser.astype('int64')
- 0 1
- 1 2
- dtype: int64
-
- Convert to categorical type:
-
- >>> ser.astype('category')
- 0 1
- 1 2
- dtype: category
- Categories (2, int32): [1, 2]
-
- Convert to ordered categorical type with custom ordering:
-
- >>> from pandas.api.types import CategoricalDtype
- >>> cat_dtype = CategoricalDtype(
- ... categories=[2, 1], ordered=True)
- >>> ser.astype(cat_dtype)
- 0 1
- 1 2
- dtype: category
- Categories (2, int64): [2 < 1]
-
- Create a series of dates:
-
- >>> ser_date = pd.Series(pd.date_range('20200101', periods=3))
- >>> ser_date
- 0 2020-01-01
- 1 2020-01-02
- 2 2020-01-03
- dtype: datetime64[ns]
- """
- if copy and using_copy_on_write():
- copy = False
-
- if is_dict_like(dtype):
- if self.ndim == 1: # i.e. Series
- if len(dtype) > 1 or self.name not in dtype:
- raise KeyError(
- "Only the Series name can be used for "
- "the key in Series dtype mappings."
- )
- new_type = dtype[self.name]
- return self.astype(new_type, copy, errors)
-
- # GH#44417 cast to Series so we can use .iat below, which will be
- # robust in case we
- from pandas import Series
-
- dtype_ser = Series(dtype, dtype=object)
-
- for col_name in dtype_ser.index:
- if col_name not in self:
- raise KeyError(
- "Only a column name can be used for the "
- "key in a dtype mappings argument. "
- f"'{col_name}' not found in columns."
- )
-
- dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False)
-
- results = []
- for i, (col_name, col) in enumerate(self.items()):
- cdt = dtype_ser.iat[i]
- if isna(cdt):
- res_col = col.copy(deep=copy)
- else:
- try:
- res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
- except ValueError as ex:
- ex.args = (
- f"{ex}: Error while type casting for column '{col_name}'",
- )
- raise
- results.append(res_col)
-
- elif is_extension_array_dtype(dtype) and self.ndim > 1:
- # GH 18099/22869: columnwise conversion to extension dtype
- # GH 24704: use iloc to handle duplicate column names
- # TODO(EA2D): special case not needed with 2D EAs
- results = [
- self.iloc[:, i].astype(dtype, copy=copy)
- for i in range(len(self.columns))
- ]
-
- else:
- # else, only a single dtype is given
- new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
- return self._constructor(new_data).__finalize__(self, method="astype")
-
- # GH 33113: handle empty frame or series
- if not results:
- return self.copy(deep=None)
-
- # GH 19920: retain column metadata after concat
- result = concat(results, axis=1, copy=False)
- # GH#40810 retain subclass
- # error: Incompatible types in assignment
- # (expression has type "NDFrameT", variable has type "DataFrame")
- result = self._constructor(result) # type: ignore[assignment]
- result.columns = self.columns
- result = result.__finalize__(self, method="astype")
- # https://github.com/python/mypy/issues/8354
- return cast(NDFrameT, result)
-
- @final
- def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT:
- """
- Make a copy of this object's indices and data.
-
- When ``deep=True`` (default), a new object will be created with a
- copy of the calling object's data and indices. Modifications to
- the data or indices of the copy will not be reflected in the
- original object (see notes below).
-
- When ``deep=False``, a new object will be created without copying
- the calling object's data or index (only references to the data
- and index are copied). Any changes to the data of the original
- will be reflected in the shallow copy (and vice versa).
-
- Parameters
- ----------
- deep : bool, default True
- Make a deep copy, including a copy of the data and the indices.
- With ``deep=False`` neither the indices nor the data are copied.
-
- Returns
- -------
- Series or DataFrame
- Object type matches caller.
-
- Notes
- -----
- When ``deep=True``, data is copied but actual Python objects
- will not be copied recursively, only the reference to the object.
- This is in contrast to `copy.deepcopy` in the Standard Library,
- which recursively copies object data (see examples below).
-
- While ``Index`` objects are copied when ``deep=True``, the underlying
- numpy array is not copied for performance reasons. Since ``Index`` is
- immutable, the underlying data can be safely shared and a copy
- is not needed.
-
- Since pandas is not thread safe, see the
- :ref:`gotchas <gotchas.thread-safety>` when copying in a threading
- environment.
-
- Examples
- --------
- >>> s = pd.Series([1, 2], index=["a", "b"])
- >>> s
- a 1
- b 2
- dtype: int64
-
- >>> s_copy = s.copy()
- >>> s_copy
- a 1
- b 2
- dtype: int64
-
- **Shallow copy versus default (deep) copy:**
-
- >>> s = pd.Series([1, 2], index=["a", "b"])
- >>> deep = s.copy()
- >>> shallow = s.copy(deep=False)
-
- Shallow copy shares data and index with original.
-
- >>> s is shallow
- False
- >>> s.values is shallow.values and s.index is shallow.index
- True
-
- Deep copy has own copy of data and index.
-
- >>> s is deep
- False
- >>> s.values is deep.values or s.index is deep.index
- False
-
-        Updates to the data shared by shallow copy and original are reflected
- in both; deep copy remains unchanged.
-
- >>> s[0] = 3
- >>> shallow[1] = 4
- >>> s
- a 3
- b 4
- dtype: int64
- >>> shallow
- a 3
- b 4
- dtype: int64
- >>> deep
- a 1
- b 2
- dtype: int64
-
- Note that when copying an object containing Python objects, a deep copy
- will copy the data, but will not do so recursively. Updating a nested
- data object will be reflected in the deep copy.
-
- >>> s = pd.Series([[1, 2], [3, 4]])
- >>> deep = s.copy()
- >>> s[0][0] = 10
- >>> s
- 0 [10, 2]
- 1 [3, 4]
- dtype: object
- >>> deep
- 0 [10, 2]
- 1 [3, 4]
- dtype: object
- """
- data = self._mgr.copy(deep=deep)
- self._clear_item_cache()
- return self._constructor(data).__finalize__(self, method="copy")
-
- @final
- def __copy__(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
- return self.copy(deep=deep)
-
- @final
- def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
- """
- Parameters
- ----------
- memo, default None
- Standard signature. Unused
- """
- return self.copy(deep=True)
-
- @final
- def infer_objects(self: NDFrameT, copy: bool_t | None = None) -> NDFrameT:
- """
- Attempt to infer better dtypes for object columns.
-
- Attempts soft conversion of object-dtyped
- columns, leaving non-object and unconvertible
- columns unchanged. The inference rules are the
- same as during normal Series/DataFrame construction.
-
- Parameters
- ----------
- copy : bool, default True
- Whether to make a copy for non-object or non-inferrable columns
- or Series.
-
- Returns
- -------
- same type as input object
-
- See Also
- --------
- to_datetime : Convert argument to datetime.
- to_timedelta : Convert argument to timedelta.
- to_numeric : Convert argument to numeric type.
- convert_dtypes : Convert argument to best possible dtype.
-
- Examples
- --------
- >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
- >>> df = df.iloc[1:]
- >>> df
- A
- 1 1
- 2 2
- 3 3
-
- >>> df.dtypes
- A object
- dtype: object
-
- >>> df.infer_objects().dtypes
- A int64
- dtype: object
- """
- new_mgr = self._mgr.convert(copy=copy)
- return self._constructor(new_mgr).__finalize__(self, method="infer_objects")
-
- @final
- def convert_dtypes(
- self: NDFrameT,
- infer_objects: bool_t = True,
- convert_string: bool_t = True,
- convert_integer: bool_t = True,
- convert_boolean: bool_t = True,
- convert_floating: bool_t = True,
- dtype_backend: DtypeBackend = "numpy_nullable",
- ) -> NDFrameT:
- """
- Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``.
-
- Parameters
- ----------
- infer_objects : bool, default True
- Whether object dtypes should be converted to the best possible types.
- convert_string : bool, default True
- Whether object dtypes should be converted to ``StringDtype()``.
- convert_integer : bool, default True
- Whether, if possible, conversion can be done to integer extension types.
-        convert_boolean : bool, default True
-            Whether object dtypes should be converted to ``BooleanDtype()``.
-        convert_floating : bool, default True
- Whether, if possible, conversion can be done to floating extension types.
-            If `convert_integer` is also True, preference will be given to integer
-            dtypes if the floats can be faithfully cast to integers.
-
- .. versionadded:: 1.2.0
- dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable"
-            Which dtype backend to use: when "numpy_nullable" is set, nullable
-            dtypes are used for all dtypes that have a nullable implementation;
-            when "pyarrow" is set, pyarrow-backed dtypes are used for all dtypes.
-
-            The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- Series or DataFrame
- Copy of input object with new dtype.
-
- See Also
- --------
- infer_objects : Infer dtypes of objects.
- to_datetime : Convert argument to datetime.
- to_timedelta : Convert argument to timedelta.
- to_numeric : Convert argument to a numeric type.
-
- Notes
- -----
- By default, ``convert_dtypes`` will attempt to convert a Series (or each
- Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
- ``convert_string``, ``convert_integer``, ``convert_boolean`` and
- ``convert_floating``, it is possible to turn off individual conversions
- to ``StringDtype``, the integer extension types, ``BooleanDtype``
- or floating extension types, respectively.
-
- For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
- rules as during normal Series/DataFrame construction. Then, if possible,
- convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
- or floating extension type, otherwise leave as ``object``.
-
- If the dtype is integer, convert to an appropriate integer extension type.
-
- If the dtype is numeric, and consists of all integers, convert to an
- appropriate integer extension type. Otherwise, convert to an
- appropriate floating extension type.
-
- .. versionchanged:: 1.2
- Starting with pandas 1.2, this method also converts float columns
- to the nullable floating extension type.
-
- In the future, as new dtypes are added that support ``pd.NA``, the results
- of this method will change to support those new dtypes.
-
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
- ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
- ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
- ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
- ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
- ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
- ... }
- ... )
-
- Start with a DataFrame with default dtypes.
-
- >>> df
- a b c d e f
- 0 1 x True h 10.0 NaN
- 1 2 y False i NaN 100.5
- 2 3 z NaN NaN 20.0 200.0
-
- >>> df.dtypes
- a int32
- b object
- c object
- d object
- e float64
- f float64
- dtype: object
-
- Convert the DataFrame to use best possible dtypes.
-
- >>> dfn = df.convert_dtypes()
- >>> dfn
- a b c d e f
- 0 1 x True h 10 <NA>
- 1 2 y False i <NA> 100.5
- 2 3 z <NA> <NA> 20 200.0
-
- >>> dfn.dtypes
- a Int32
- b string[python]
- c boolean
- d string[python]
- e Int64
- f Float64
- dtype: object
-
- Start with a Series of strings and missing data represented by ``np.nan``.
-
- >>> s = pd.Series(["a", "b", np.nan])
- >>> s
- 0 a
- 1 b
- 2 NaN
- dtype: object
-
- Obtain a Series with dtype ``StringDtype``.
-
- >>> s.convert_dtypes()
- 0 a
- 1 b
- 2 <NA>
- dtype: string
- """
- check_dtype_backend(dtype_backend)
- if self.ndim == 1:
- return self._convert_dtypes(
- infer_objects,
- convert_string,
- convert_integer,
- convert_boolean,
- convert_floating,
- dtype_backend=dtype_backend,
- )
- else:
- results = [
- col._convert_dtypes(
- infer_objects,
- convert_string,
- convert_integer,
- convert_boolean,
- convert_floating,
- dtype_backend=dtype_backend,
- )
- for col_name, col in self.items()
- ]
- if len(results) > 0:
- result = concat(results, axis=1, copy=False, keys=self.columns)
- cons = cast(Type["DataFrame"], self._constructor)
- result = cons(result)
- result = result.__finalize__(self, method="convert_dtypes")
- # https://github.com/python/mypy/issues/8354
- return cast(NDFrameT, result)
- else:
- return self.copy(deep=None)
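# Reviewer note (annotation, not part of the removed pandas file): a minimal
# usage sketch of convert_dtypes() and the experimental dtype_backend keyword
# documented above; assumes pandas 2.0 with pyarrow installed.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": np.array([1, 2, 3], dtype="int32"), "b": ["x", None, "z"]})
print(df.convert_dtypes().dtypes)                         # Int32 / string (NumPy-nullable backend)
print(df.convert_dtypes(dtype_backend="pyarrow").dtypes)  # Arrow-backed dtypes for every column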
-
- # ----------------------------------------------------------------------
- # Filling NA's
-
- @overload
- def fillna(
- self: NDFrameT,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[True],
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def fillna(
- self: NDFrameT,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: bool_t = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> NDFrameT | None:
- ...
-
- @doc(**_shared_doc_kwargs)
- def fillna(
- self: NDFrameT,
- value: Hashable | Mapping | Series | DataFrame = None,
- *,
- method: FillnaOptions | None = None,
- axis: Axis | None = None,
- inplace: bool_t = False,
- limit: int | None = None,
- downcast: dict | None = None,
- ) -> NDFrameT | None:
- """
- Fill NA/NaN values using the specified method.
-
- Parameters
- ----------
- value : scalar, dict, Series, or DataFrame
- Value to use to fill holes (e.g. 0), alternately a
- dict/Series/DataFrame of values specifying which value to use for
- each index (for a Series) or column (for a DataFrame). Values not
- in the dict/Series/DataFrame will not be filled. This value cannot
- be a list.
- method : {{'backfill', 'bfill', 'ffill', None}}, default None
- Method to use for filling holes in reindexed Series:
-
- * ffill: propagate last valid observation forward to next valid.
- * backfill / bfill: use next valid observation to fill gap.
-
- axis : {axes_single_arg}
- Axis along which to fill missing values. For `Series`
- this parameter is unused and defaults to 0.
- inplace : bool, default False
- If True, fill in-place. Note: this will modify any
- other views on this object (e.g., a no-copy slice for a column in a
- DataFrame).
- limit : int, default None
- If method is specified, this is the maximum number of consecutive
- NaN values to forward/backward fill. In other words, if there is
- a gap with more than this number of consecutive NaNs, it will only
- be partially filled. If method is not specified, this is the
- maximum number of entries along the entire axis where NaNs will be
- filled. Must be greater than 0 if not None.
- downcast : dict, default is None
- A dict of item->dtype of what to downcast if possible,
- or the string 'infer' which will try to downcast to an appropriate
- equal type (e.g. float64 to int64 if possible).
-
- Returns
- -------
- {klass} or None
- Object with missing values filled or None if ``inplace=True``.
-
- See Also
- --------
- interpolate : Fill NaN values using interpolation.
- reindex : Conform object to new index.
- asfreq : Convert TimeSeries to specified frequency.
-
- Examples
- --------
- >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
- ... [3, 4, np.nan, 1],
- ... [np.nan, np.nan, np.nan, np.nan],
- ... [np.nan, 3, np.nan, 4]],
- ... columns=list("ABCD"))
- >>> df
- A B C D
- 0 NaN 2.0 NaN 0.0
- 1 3.0 4.0 NaN 1.0
- 2 NaN NaN NaN NaN
- 3 NaN 3.0 NaN 4.0
-
- Replace all NaN elements with 0s.
-
- >>> df.fillna(0)
- A B C D
- 0 0.0 2.0 0.0 0.0
- 1 3.0 4.0 0.0 1.0
- 2 0.0 0.0 0.0 0.0
- 3 0.0 3.0 0.0 4.0
-
- We can also propagate non-null values forward or backward.
-
- >>> df.fillna(method="ffill")
- A B C D
- 0 NaN 2.0 NaN 0.0
- 1 3.0 4.0 NaN 1.0
- 2 3.0 4.0 NaN 1.0
- 3 3.0 3.0 NaN 4.0
-
- Replace all NaN elements in columns 'A', 'B', 'C', and 'D' with 0, 1,
- 2, and 3 respectively.
-
- >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
- >>> df.fillna(value=values)
- A B C D
- 0 0.0 2.0 2.0 0.0
- 1 3.0 4.0 2.0 1.0
- 2 0.0 1.0 2.0 3.0
- 3 0.0 3.0 2.0 4.0
-
- Only replace the first NaN element.
-
- >>> df.fillna(value=values, limit=1)
- A B C D
- 0 0.0 2.0 2.0 0.0
- 1 3.0 4.0 NaN 1.0
- 2 NaN 1.0 NaN 3.0
- 3 NaN 3.0 NaN 4.0
-
- When filling using a DataFrame, replacement happens along
- the same column names and same indices.
-
- >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
- >>> df.fillna(df2)
- A B C D
- 0 0.0 2.0 0.0 0.0
- 1 3.0 4.0 0.0 1.0
- 2 0.0 0.0 0.0 NaN
- 3 0.0 3.0 0.0 4.0
-
- Note that column D is not affected since it is not present in df2.
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- value, method = validate_fillna_kwargs(value, method)
-
- # set the default here, so functions examining the signature
- # can detect if something was set (e.g. in groupby) (GH9221)
- if axis is None:
- axis = 0
- axis = self._get_axis_number(axis)
-
- if value is None:
- if not self._mgr.is_single_block and axis == 1:
- if inplace:
- raise NotImplementedError()
- result = self.T.fillna(method=method, limit=limit).T
-
- return result
-
- new_data = self._mgr.interpolate(
- method=method,
- axis=axis,
- limit=limit,
- inplace=inplace,
- downcast=downcast,
- )
- else:
- if self.ndim == 1:
- if isinstance(value, (dict, ABCSeries)):
- if not len(value):
- # test_fillna_nonscalar
- if inplace:
- return None
- return self.copy(deep=None)
- from pandas import Series
-
- value = Series(value)
- value = value.reindex(self.index, copy=False)
- value = value._values
- elif not is_list_like(value):
- pass
- else:
- raise TypeError(
- '"value" parameter must be a scalar, dict '
- "or Series, but you passed a "
- f'"{type(value).__name__}"'
- )
-
- new_data = self._mgr.fillna(
- value=value, limit=limit, inplace=inplace, downcast=downcast
- )
-
- elif isinstance(value, (dict, ABCSeries)):
- if axis == 1:
- raise NotImplementedError(
- "Currently only can fill "
- "with dict/Series column "
- "by column"
- )
- if using_copy_on_write():
- result = self.copy(deep=None)
- else:
- result = self if inplace else self.copy()
- is_dict = isinstance(downcast, dict)
- for k, v in value.items():
- if k not in result:
- continue
-
- # error: Item "None" of "Optional[Dict[Any, Any]]" has no
- # attribute "get"
- downcast_k = (
- downcast
- if not is_dict
- else downcast.get(k) # type: ignore[union-attr]
- )
-
- res_k = result[k].fillna(v, limit=limit, downcast=downcast_k)
-
- if not inplace:
- result[k] = res_k
- else:
- # We can write into our existing column(s) iff dtype
- # was preserved.
- if isinstance(res_k, ABCSeries):
- # i.e. 'k' only shows up once in self.columns
- if res_k.dtype == result[k].dtype:
- result.loc[:, k] = res_k
- else:
- # Different dtype -> no way to do inplace.
- result[k] = res_k
- else:
- # see test_fillna_dict_inplace_nonunique_columns
- locs = result.columns.get_loc(k)
- if isinstance(locs, slice):
- locs = np.arange(self.shape[1])[locs]
- elif (
- isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
- ):
- locs = locs.nonzero()[0]
- elif not (
- isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
- ):
- # Should never be reached, but let's cover our bases
- raise NotImplementedError(
- "Unexpected get_loc result, please report a bug at "
- "https://github.com/pandas-dev/pandas"
- )
-
- for i, loc in enumerate(locs):
- res_loc = res_k.iloc[:, i]
- target = self.iloc[:, loc]
-
- if res_loc.dtype == target.dtype:
- result.iloc[:, loc] = res_loc
- else:
- result.isetitem(loc, res_loc)
- if inplace:
- return self._update_inplace(result)
- else:
- return result
-
- elif not is_list_like(value):
- if axis == 1:
- result = self.T.fillna(value=value, limit=limit).T
-
- new_data = result
- else:
- new_data = self._mgr.fillna(
- value=value, limit=limit, inplace=inplace, downcast=downcast
- )
- elif isinstance(value, ABCDataFrame) and self.ndim == 2:
- new_data = self.where(self.notna(), value)._mgr
- else:
- raise ValueError(f"invalid fill value with a {type(value)}")
-
- result = self._constructor(new_data)
- if inplace:
- return self._update_inplace(result)
- else:
- return result.__finalize__(self, method="fillna")
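# Reviewer note (annotation, not part of the removed pandas file): a minimal
# sketch of the fillna() paths implemented above -- a per-column dict value,
# the limit keyword, and downcast='infer' (still accepted in pandas 2.0).
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [np.nan, np.nan, 3.0], "B": [np.nan, 2.0, np.nan]})
print(df.fillna({"A": 0, "B": 9}, limit=1))  # fills at most one NaN per column
print(pd.Series([1.0, np.nan]).fillna(0, downcast="infer").dtype)  # int64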
-
- @overload
- def ffill(
- self: NDFrameT,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def ffill(
- self: NDFrameT,
- *,
- axis: None | Axis = ...,
- inplace: bool_t = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> NDFrameT | None:
- ...
-
- @doc(klass=_shared_doc_kwargs["klass"])
- def ffill(
- self: NDFrameT,
- *,
- axis: None | Axis = None,
- inplace: bool_t = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> NDFrameT | None:
- """
- Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
-
- Returns
- -------
- {klass} or None
- Object with missing values filled or None if ``inplace=True``.
- """
- return self.fillna(
- method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
- )
-
- @doc(klass=_shared_doc_kwargs["klass"])
- def pad(
- self: NDFrameT,
- *,
- axis: None | Axis = None,
- inplace: bool_t = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> NDFrameT | None:
- """
- Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
-
- .. deprecated:: 2.0
-
- {klass}.pad is deprecated. Use {klass}.ffill instead.
-
- Returns
- -------
- {klass} or None
- Object with missing values filled or None if ``inplace=True``.
- """
- warnings.warn(
- "DataFrame.pad/Series.pad is deprecated. Use "
- "DataFrame.ffill/Series.ffill instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self.ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
-
- @overload
- def bfill(
- self: NDFrameT,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def bfill(
- self: NDFrameT,
- *,
- axis: None | Axis = ...,
- inplace: bool_t = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> NDFrameT | None:
- ...
-
- @doc(klass=_shared_doc_kwargs["klass"])
- def bfill(
- self: NDFrameT,
- *,
- axis: None | Axis = None,
- inplace: bool_t = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> NDFrameT | None:
- """
- Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
-
- Returns
- -------
- {klass} or None
- Object with missing values filled or None if ``inplace=True``.
- """
- return self.fillna(
- method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
- )
-
- @doc(klass=_shared_doc_kwargs["klass"])
- def backfill(
- self: NDFrameT,
- *,
- axis: None | Axis = None,
- inplace: bool_t = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> NDFrameT | None:
- """
- Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
-
- .. deprecated:: 2.0
-
- {klass}.backfill is deprecated. Use {klass}.bfill instead.
-
- Returns
- -------
- {klass} or None
- Object with missing values filled or None if ``inplace=True``.
- """
- warnings.warn(
- "DataFrame.backfill/Series.backfill is deprecated. Use "
- "DataFrame.bfill/Series.bfill instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self.bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
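# Reviewer note (annotation, not part of the removed pandas file): ffill() and
# bfill() above are the supported spellings of fillna(method=...); pad() and
# backfill() only forward to them and emit a FutureWarning in pandas 2.0.
import numpy as np
import pandas as pd

s = pd.Series([np.nan, 1.0, np.nan, 3.0])
print(s.ffill().tolist())  # [nan, 1.0, 1.0, 3.0]
print(s.bfill().tolist())  # [1.0, 1.0, 3.0, 3.0]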
-
- @overload
- def replace(
- self: NDFrameT,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- regex: bool_t = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def replace(
- self,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[True],
- limit: int | None = ...,
- regex: bool_t = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> None:
- ...
-
- @overload
- def replace(
- self: NDFrameT,
- to_replace=...,
- value=...,
- *,
- inplace: bool_t = ...,
- limit: int | None = ...,
- regex: bool_t = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> NDFrameT | None:
- ...
-
- @doc(
- _shared_docs["replace"],
- klass=_shared_doc_kwargs["klass"],
- inplace=_shared_doc_kwargs["inplace"],
- replace_iloc=_shared_doc_kwargs["replace_iloc"],
- )
- def replace(
- self: NDFrameT,
- to_replace=None,
- value=lib.no_default,
- *,
- inplace: bool_t = False,
- limit: int | None = None,
- regex: bool_t = False,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
- ) -> NDFrameT | None:
- if not (
- is_scalar(to_replace)
- or is_re_compilable(to_replace)
- or is_list_like(to_replace)
- ):
- raise TypeError(
- "Expecting 'to_replace' to be either a scalar, array-like, "
- "dict or None, got invalid type "
- f"{repr(type(to_replace).__name__)}"
- )
-
- inplace = validate_bool_kwarg(inplace, "inplace")
- if not is_bool(regex) and to_replace is not None:
- raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")
-
- if value is lib.no_default or method is not lib.no_default:
- # GH#36984 if the user explicitly passes value=None we want to
- # respect that. We have the corner case where the user explicitly
- # passes value=None *and* a method, which we interpret as meaning
- # they want the (documented) default behavior.
- if method is lib.no_default:
- # TODO: get this to show up as the default in the docs?
- method = "pad"
-
- # passing a single value that is scalar like
- # when value is None (GH5319), for compat
- if not is_dict_like(to_replace) and not is_dict_like(regex):
- to_replace = [to_replace]
-
- if isinstance(to_replace, (tuple, list)):
- # TODO: Consider copy-on-write for non-replaced columns here
- if isinstance(self, ABCDataFrame):
- from pandas import Series
-
- result = self.apply(
- Series._replace_single,
- args=(to_replace, method, inplace, limit),
- )
- if inplace:
- return None
- return result
- return self._replace_single(to_replace, method, inplace, limit)
-
- if not is_dict_like(to_replace):
- if not is_dict_like(regex):
- raise TypeError(
- 'If "to_replace" and "value" are both None '
- 'and "to_replace" is not a list, then '
- "regex must be a mapping"
- )
- to_replace = regex
- regex = True
-
- items = list(to_replace.items())
- if items:
- keys, values = zip(*items)
- else:
- keys, values = ([], [])
-
- are_mappings = [is_dict_like(v) for v in values]
-
- if any(are_mappings):
- if not all(are_mappings):
- raise TypeError(
- "If a nested mapping is passed, all values "
- "of the top level mapping must be mappings"
- )
- # passed a nested dict/Series
- to_rep_dict = {}
- value_dict = {}
-
- for k, v in items:
- keys, values = list(zip(*v.items())) or ([], [])
-
- to_rep_dict[k] = list(keys)
- value_dict[k] = list(values)
-
- to_replace, value = to_rep_dict, value_dict
- else:
- to_replace, value = keys, values
-
- return self.replace(
- to_replace, value, inplace=inplace, limit=limit, regex=regex
- )
- else:
- # need a non-zero len on all axes
- if not self.size:
- if inplace:
- return None
- return self.copy(deep=None)
-
- if is_dict_like(to_replace):
- if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
- # Note: Checking below for `in foo.keys()` instead of
- # `in foo` is needed for when we have a Series and not dict
- mapping = {
- col: (to_replace[col], value[col])
- for col in to_replace.keys()
- if col in value.keys() and col in self
- }
- return self._replace_columnwise(mapping, inplace, regex)
-
- # {'A': NA} -> 0
- elif not is_list_like(value):
- # Operate column-wise
- if self.ndim == 1:
- raise ValueError(
- "Series.replace cannot use dict-like to_replace "
- "and non-None value"
- )
- mapping = {
- col: (to_rep, value) for col, to_rep in to_replace.items()
- }
- return self._replace_columnwise(mapping, inplace, regex)
- else:
- raise TypeError("value argument must be scalar, dict, or Series")
-
- elif is_list_like(to_replace):
- if not is_list_like(value):
- # e.g. to_replace = [NA, ''] and value is 0,
- # so we replace NA with 0 and then replace '' with 0
- value = [value] * len(to_replace)
-
- # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
- if len(to_replace) != len(value):
- raise ValueError(
- f"Replacement lists must match in length. "
- f"Expecting {len(to_replace)} got {len(value)} "
- )
- new_data = self._mgr.replace_list(
- src_list=to_replace,
- dest_list=value,
- inplace=inplace,
- regex=regex,
- )
-
- elif to_replace is None:
- if not (
- is_re_compilable(regex)
- or is_list_like(regex)
- or is_dict_like(regex)
- ):
- raise TypeError(
- f"'regex' must be a string or a compiled regular expression "
- f"or a list or dict of strings or regular expressions, "
- f"you passed a {repr(type(regex).__name__)}"
- )
- return self.replace(
- regex, value, inplace=inplace, limit=limit, regex=True
- )
- else:
- # dest iterable dict-like
- if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
- # Operate column-wise
- if self.ndim == 1:
- raise ValueError(
- "Series.replace cannot use dict-value and "
- "non-None to_replace"
- )
- mapping = {col: (to_replace, val) for col, val in value.items()}
- return self._replace_columnwise(mapping, inplace, regex)
-
- elif not is_list_like(value): # NA -> 0
- regex = should_use_regex(regex, to_replace)
- if regex:
- new_data = self._mgr.replace_regex(
- to_replace=to_replace,
- value=value,
- inplace=inplace,
- )
- else:
- new_data = self._mgr.replace(
- to_replace=to_replace, value=value, inplace=inplace
- )
- else:
- raise TypeError(
- f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
- )
-
- result = self._constructor(new_data)
- if inplace:
- return self._update_inplace(result)
- else:
- return result.__finalize__(self, method="replace")
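# Reviewer note (annotation, not part of the removed pandas file): a small
# sketch of two dict forms handled by replace() above -- a nested
# {column: {old: new}} mapping and a regex-based replacement.
import pandas as pd

df = pd.DataFrame({"A": ["foo", "bar"], "B": ["bar", "baz"]})
print(df.replace({"A": {"foo": "qux"}}))        # nested mapping: column-wise old -> new
print(df.replace(r"^ba.$", "hit", regex=True))  # regex match across object columns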
-
- def interpolate(
- self: NDFrameT,
- method: str = "linear",
- *,
- axis: Axis = 0,
- limit: int | None = None,
- inplace: bool_t = False,
- limit_direction: str | None = None,
- limit_area: str | None = None,
- downcast: str | None = None,
- **kwargs,
- ) -> NDFrameT | None:
- """
- Fill NaN values using an interpolation method.
-
- Please note that only ``method='linear'`` is supported for
- DataFrame/Series with a MultiIndex.
-
- Parameters
- ----------
- method : str, default 'linear'
- Interpolation technique to use. One of:
-
- * 'linear': Ignore the index and treat the values as equally
- spaced. This is the only method supported on MultiIndexes.
- * 'time': Works on daily and higher resolution data to interpolate
- over a given length of interval.
- * 'index', 'values': use the actual numerical values of the index.
- * 'pad': Fill in NaNs using existing values.
- * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
- 'barycentric', 'polynomial': Passed to
- `scipy.interpolate.interp1d`, whereas 'spline' is passed to
- `scipy.interpolate.UnivariateSpline`. These methods use the numerical
- values of the index. Both 'polynomial' and 'spline' require that
- you also specify an `order` (int), e.g.
- ``df.interpolate(method='polynomial', order=5)``. Note that the
- `slinear` method in pandas refers to the SciPy first-order `spline`,
- which is not the same as pandas' `spline` method with ``order=1``.
- * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
- 'cubicspline': Wrappers around the SciPy interpolation methods of
- similar names. See `Notes`.
- * 'from_derivatives': Refers to
- `scipy.interpolate.BPoly.from_derivatives` which
- replaces 'piecewise_polynomial' interpolation method in
- scipy 0.18.
-
- axis : {{0 or 'index', 1 or 'columns', None}}, default None
- Axis to interpolate along. For `Series` this parameter is unused
- and defaults to 0.
- limit : int, optional
- Maximum number of consecutive NaNs to fill. Must be greater than
- 0.
- inplace : bool, default False
- Update the data in place if possible.
- limit_direction : {{'forward', 'backward', 'both'}}, optional
- Consecutive NaNs will be filled in this direction.
-
- If limit is specified:
- * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
- * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
- 'backward'.
-
- If 'limit' is not specified:
- * If 'method' is 'backfill' or 'bfill', the default is 'backward'
- * else the default is 'forward'
-
- .. versionchanged:: 1.1.0
- raises ValueError if `limit_direction` is 'forward' or 'both' and
- method is 'backfill' or 'bfill'.
- raises ValueError if `limit_direction` is 'backward' or 'both' and
- method is 'pad' or 'ffill'.
-
- limit_area : {{`None`, 'inside', 'outside'}}, default None
- If limit is specified, consecutive NaNs will be filled with this
- restriction.
-
- * ``None``: No fill restriction.
- * 'inside': Only fill NaNs surrounded by valid values
- (interpolate).
- * 'outside': Only fill NaNs outside valid values (extrapolate).
-
- downcast : optional, 'infer' or None, defaults to None
- Downcast dtypes if possible.
- ``**kwargs`` : optional
- Keyword arguments to pass on to the interpolating function.
-
- Returns
- -------
- Series or DataFrame or None
- Returns the same object type as the caller, interpolated at
- some or all ``NaN`` values or None if ``inplace=True``.
-
- See Also
- --------
- fillna : Fill missing values using different methods.
- scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
- (Akima interpolator).
- scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
- Bernstein basis.
- scipy.interpolate.interp1d : Interpolate a 1-D function.
- scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
- interpolator).
- scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
- interpolation.
- scipy.interpolate.CubicSpline : Cubic spline data interpolator.
-
- Notes
- -----
- The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
- methods are wrappers around the respective SciPy implementations of
- similar names. These use the actual numerical values of the index.
- For more information on their behavior, see the
- `SciPy documentation
- <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__.
-
- Examples
- --------
- Filling in ``NaN`` in a :class:`~pandas.Series` via linear
- interpolation.
-
- >>> s = pd.Series([0, 1, np.nan, 3])
- >>> s
- 0 0.0
- 1 1.0
- 2 NaN
- 3 3.0
- dtype: float64
- >>> s.interpolate()
- 0 0.0
- 1 1.0
- 2 2.0
- 3 3.0
- dtype: float64
-
- Filling in ``NaN`` in a Series by padding, but filling at most two
- consecutive ``NaN`` at a time.
-
- >>> s = pd.Series([np.nan, "single_one", np.nan,
- ... "fill_two_more", np.nan, np.nan, np.nan,
- ... 4.71, np.nan])
- >>> s
- 0 NaN
- 1 single_one
- 2 NaN
- 3 fill_two_more
- 4 NaN
- 5 NaN
- 6 NaN
- 7 4.71
- 8 NaN
- dtype: object
- >>> s.interpolate(method='pad', limit=2)
- 0 NaN
- 1 single_one
- 2 single_one
- 3 fill_two_more
- 4 fill_two_more
- 5 fill_two_more
- 6 NaN
- 7 4.71
- 8 4.71
- dtype: object
-
- Filling in ``NaN`` in a Series via polynomial interpolation or splines:
- Both 'polynomial' and 'spline' methods require that you also specify
- an ``order`` (int).
-
- >>> s = pd.Series([0, 2, np.nan, 8])
- >>> s.interpolate(method='polynomial', order=2)
- 0 0.000000
- 1 2.000000
- 2 4.666667
- 3 8.000000
- dtype: float64
-
- Fill the DataFrame forward (that is, going down) along each column
- using linear interpolation.
-
- Note how the last entry in column 'a' is interpolated differently,
- because there is no entry after it to use for interpolation.
- Note how the first entry in column 'b' remains ``NaN``, because there
- is no entry before it to use for interpolation.
-
- >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
- ... (np.nan, 2.0, np.nan, np.nan),
- ... (2.0, 3.0, np.nan, 9.0),
- ... (np.nan, 4.0, -4.0, 16.0)],
- ... columns=list('abcd'))
- >>> df
- a b c d
- 0 0.0 NaN -1.0 1.0
- 1 NaN 2.0 NaN NaN
- 2 2.0 3.0 NaN 9.0
- 3 NaN 4.0 -4.0 16.0
- >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
- a b c d
- 0 0.0 NaN -1.0 1.0
- 1 1.0 2.0 -2.0 5.0
- 2 2.0 3.0 -3.0 9.0
- 3 2.0 4.0 -4.0 16.0
-
- Using polynomial interpolation.
-
- >>> df['d'].interpolate(method='polynomial', order=2)
- 0 1.0
- 1 4.0
- 2 9.0
- 3 16.0
- Name: d, dtype: float64
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- axis = self._get_axis_number(axis)
-
- fillna_methods = ["ffill", "bfill", "pad", "backfill"]
- should_transpose = axis == 1 and method not in fillna_methods
-
- obj = self.T if should_transpose else self
-
- if obj.empty:
- return self.copy()
-
- if method not in fillna_methods:
- axis = self._info_axis_number
-
- if isinstance(obj.index, MultiIndex) and method != "linear":
- raise ValueError(
- "Only `method=linear` interpolation is supported on MultiIndexes."
- )
-
- # Set `limit_direction` depending on `method`
- if limit_direction is None:
- limit_direction = (
- "backward" if method in ("backfill", "bfill") else "forward"
- )
- else:
- if method in ("pad", "ffill") and limit_direction != "forward":
- raise ValueError(
- f"`limit_direction` must be 'forward' for method `{method}`"
- )
- if method in ("backfill", "bfill") and limit_direction != "backward":
- raise ValueError(
- f"`limit_direction` must be 'backward' for method `{method}`"
- )
-
- if obj.ndim == 2 and np.all(obj.dtypes == np.dtype("object")):
- raise TypeError(
- "Cannot interpolate with all object-dtype columns "
- "in the DataFrame. Try setting at least one "
- "column to a numeric dtype."
- )
-
- # create/use the index
- if method == "linear":
- # prior default
- index = Index(np.arange(len(obj.index)))
- else:
- index = obj.index
- methods = {"index", "values", "nearest", "time"}
- is_numeric_or_datetime = (
- is_numeric_dtype(index.dtype)
- or is_datetime64_any_dtype(index.dtype)
- or is_timedelta64_dtype(index.dtype)
- )
- if method not in methods and not is_numeric_or_datetime:
- raise ValueError(
- "Index column must be numeric or datetime type when "
- f"using {method} method other than linear. "
- "Try setting a numeric or datetime index column before "
- "interpolating."
- )
-
- if isna(index).any():
- raise NotImplementedError(
- "Interpolation with NaNs in the index "
- "has not been implemented. Try filling "
- "those NaNs before interpolating."
- )
- new_data = obj._mgr.interpolate(
- method=method,
- axis=axis,
- index=index,
- limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- inplace=inplace,
- downcast=downcast,
- **kwargs,
- )
-
- result = self._constructor(new_data)
- if should_transpose:
- result = result.T
- if inplace:
- return self._update_inplace(result)
- else:
- return result.__finalize__(self, method="interpolate")
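# Reviewer note (annotation, not part of the removed pandas file): a sketch of
# the limit_direction / limit_area handling validated above.
import numpy as np
import pandas as pd

s = pd.Series([np.nan, 1.0, np.nan, 3.0, np.nan])
print(s.interpolate().tolist())                        # [nan, 1.0, 2.0, 3.0, 3.0]
print(s.interpolate(limit_direction="both").tolist())  # [1.0, 1.0, 2.0, 3.0, 3.0]
print(s.interpolate(limit_area="inside").tolist())     # only the interior NaN is filled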
-
- # ----------------------------------------------------------------------
- # Timeseries Methods
-
- @final
- def asof(self, where, subset=None):
- """
- Return the last row(s) without any NaNs before `where`.
-
- The last row (for each element in `where`, if list) without any
- NaN is taken.
- In case of a :class:`~pandas.DataFrame`, the last row without NaN is
- taken, considering only the subset of columns (if not `None`).
-
- If there is no good value, NaN is returned for a Series, or
- a Series of NaN values for a DataFrame.
-
- Parameters
- ----------
- where : date or array-like of dates
- Date(s) before which the last row(s) are returned.
- subset : str or array-like of str, default `None`
- For DataFrame, if not `None`, only use these columns to
- check for NaNs.
-
- Returns
- -------
- scalar, Series, or DataFrame
-
- The return can be:
-
- * scalar : when `self` is a Series and `where` is a scalar
- * Series: when `self` is a Series and `where` is an array-like,
- or when `self` is a DataFrame and `where` is a scalar
- * DataFrame : when `self` is a DataFrame and `where` is an
- array-like
-
- Return scalar, Series, or DataFrame.
-
- See Also
- --------
- merge_asof : Perform an asof merge. Similar to left join.
-
- Notes
- -----
- Dates are assumed to be sorted. Raises if this is not the case.
-
- Examples
- --------
- A Series and a scalar `where`.
-
- >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
- >>> s
- 10 1.0
- 20 2.0
- 30 NaN
- 40 4.0
- dtype: float64
-
- >>> s.asof(20)
- 2.0
-
- For a sequence `where`, a Series is returned. The first value is
- NaN, because the first element of `where` is before the first
- index value.
-
- >>> s.asof([5, 20])
- 5 NaN
- 20 2.0
- dtype: float64
-
- Missing values are not considered. The following is ``2.0``, not
- NaN, even though NaN is at the index location for ``30``.
-
- >>> s.asof(30)
- 2.0
-
- Take all columns into consideration
-
- >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
- ... 'b': [None, None, None, None, 500]},
- ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
- ... '2018-02-27 09:02:00',
- ... '2018-02-27 09:03:00',
- ... '2018-02-27 09:04:00',
- ... '2018-02-27 09:05:00']))
- >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
- ... '2018-02-27 09:04:30']))
- a b
- 2018-02-27 09:03:30 NaN NaN
- 2018-02-27 09:04:30 NaN NaN
-
- Take a single column into consideration
-
- >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
- ... '2018-02-27 09:04:30']),
- ... subset=['a'])
- a b
- 2018-02-27 09:03:30 30 NaN
- 2018-02-27 09:04:30 40 NaN
- """
- if isinstance(where, str):
- where = Timestamp(where)
-
- if not self.index.is_monotonic_increasing:
- raise ValueError("asof requires a sorted index")
-
- is_series = isinstance(self, ABCSeries)
- if is_series:
- if subset is not None:
- raise ValueError("subset is not valid for Series")
- else:
- if subset is None:
- subset = self.columns
- if not is_list_like(subset):
- subset = [subset]
-
- is_list = is_list_like(where)
- if not is_list:
- start = self.index[0]
- if isinstance(self.index, PeriodIndex):
- where = Period(where, freq=self.index.freq)
-
- if where < start:
- if not is_series:
- return self._constructor_sliced(
- index=self.columns, name=where, dtype=np.float64
- )
- return np.nan
-
- # It's always much faster to use a *while* loop here for
- # Series than pre-computing all the NAs. However a
- # *while* loop is extremely expensive for DataFrame
- # so we later pre-compute all the NAs and use the same
- # code path whether *where* is a scalar or list.
- # See PR: https://github.com/pandas-dev/pandas/pull/14476
- if is_series:
- loc = self.index.searchsorted(where, side="right")
- if loc > 0:
- loc -= 1
-
- values = self._values
- while loc > 0 and isna(values[loc]):
- loc -= 1
- return values[loc]
-
- if not isinstance(where, Index):
- where = Index(where) if is_list else Index([where])
-
- nulls = self.isna() if is_series else self[subset].isna().any(axis=1)
- if nulls.all():
- if is_series:
- self = cast("Series", self)
- return self._constructor(np.nan, index=where, name=self.name)
- elif is_list:
- self = cast("DataFrame", self)
- return self._constructor(np.nan, index=where, columns=self.columns)
- else:
- self = cast("DataFrame", self)
- return self._constructor_sliced(
- np.nan, index=self.columns, name=where[0]
- )
-
- locs = self.index.asof_locs(where, ~(nulls._values))
-
- # mask the missing
- missing = locs == -1
- data = self.take(locs)
- data.index = where
- if missing.any():
- # GH#16063 only do this setting when necessary, otherwise
- # we'd cast e.g. bools to floats
- data.loc[missing] = np.nan
- return data if is_list else data.iloc[-1]
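# Reviewer note (annotation, not part of the removed pandas file): the return
# type of asof() depends on the shape of `where`, as described above --
# a scalar for Series + scalar `where`, a Series/DataFrame otherwise.
import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan, 4.0], index=[10, 20, 30, 40])
print(s.asof(25))       # 2.0 (scalar)
print(s.asof([5, 35]))  # Series: NaN for 5, 2.0 for 35 (the NaN at 30 is skipped)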
-
- # ----------------------------------------------------------------------
- # Action Methods
-
- @doc(klass=_shared_doc_kwargs["klass"])
- def isna(self: NDFrameT) -> NDFrameT:
- """
- Detect missing values.
-
- Return a boolean same-sized object indicating if the values are NA.
- NA values, such as None or :attr:`numpy.NaN`, get mapped to True
- values.
- Everything else gets mapped to False values. Characters such as empty
- strings ``''`` or :attr:`numpy.inf` are not considered NA values
- (unless you set ``pandas.options.mode.use_inf_as_na = True``).
-
- Returns
- -------
- {klass}
- Mask of bool values for each element in {klass} that
- indicates whether an element is an NA value.
-
- See Also
- --------
- {klass}.isnull : Alias of isna.
- {klass}.notna : Boolean inverse of isna.
- {klass}.dropna : Omit axes labels with missing values.
- isna : Top-level isna.
-
- Examples
- --------
- Show which entries in a DataFrame are NA.
-
- >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
- ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
- ... pd.Timestamp('1940-04-25')],
- ... name=['Alfred', 'Batman', ''],
- ... toy=[None, 'Batmobile', 'Joker']))
- >>> df
- age born name toy
- 0 5.0 NaT Alfred None
- 1 6.0 1939-05-27 Batman Batmobile
- 2 NaN 1940-04-25 Joker
-
- >>> df.isna()
- age born name toy
- 0 False True False True
- 1 False False False False
- 2 True False False False
-
- Show which entries in a Series are NA.
-
- >>> ser = pd.Series([5, 6, np.NaN])
- >>> ser
- 0 5.0
- 1 6.0
- 2 NaN
- dtype: float64
-
- >>> ser.isna()
- 0 False
- 1 False
- 2 True
- dtype: bool
- """
- return isna(self).__finalize__(self, method="isna")
-
- @doc(isna, klass=_shared_doc_kwargs["klass"])
- def isnull(self: NDFrameT) -> NDFrameT:
- return isna(self).__finalize__(self, method="isnull")
-
- @doc(klass=_shared_doc_kwargs["klass"])
- def notna(self: NDFrameT) -> NDFrameT:
- """
- Detect existing (non-missing) values.
-
- Return a boolean same-sized object indicating if the values are not NA.
- Non-missing values get mapped to True. Characters such as empty
- strings ``''`` or :attr:`numpy.inf` are not considered NA values
- (unless you set ``pandas.options.mode.use_inf_as_na = True``).
- NA values, such as None or :attr:`numpy.NaN`, get mapped to False
- values.
-
- Returns
- -------
- {klass}
- Mask of bool values for each element in {klass} that
- indicates whether an element is not an NA value.
-
- See Also
- --------
- {klass}.notnull : Alias of notna.
- {klass}.isna : Boolean inverse of notna.
- {klass}.dropna : Omit axes labels with missing values.
- notna : Top-level notna.
-
- Examples
- --------
- Show which entries in a DataFrame are not NA.
-
- >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
- ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
- ... pd.Timestamp('1940-04-25')],
- ... name=['Alfred', 'Batman', ''],
- ... toy=[None, 'Batmobile', 'Joker']))
- >>> df
- age born name toy
- 0 5.0 NaT Alfred None
- 1 6.0 1939-05-27 Batman Batmobile
- 2 NaN 1940-04-25 Joker
-
- >>> df.notna()
- age born name toy
- 0 True False True False
- 1 True True True True
- 2 False True True True
-
- Show which entries in a Series are not NA.
-
- >>> ser = pd.Series([5, 6, np.NaN])
- >>> ser
- 0 5.0
- 1 6.0
- 2 NaN
- dtype: float64
-
- >>> ser.notna()
- 0 True
- 1 True
- 2 False
- dtype: bool
- """
- return notna(self).__finalize__(self, method="notna")
-
- @doc(notna, klass=_shared_doc_kwargs["klass"])
- def notnull(self: NDFrameT) -> NDFrameT:
- return notna(self).__finalize__(self, method="notnull")
-
- @final
- def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
- if (lower is not None and np.any(isna(lower))) or (
- upper is not None and np.any(isna(upper))
- ):
- raise ValueError("Cannot use an NA value as a clip threshold")
-
- result = self
- mask = isna(self._values)
-
- with np.errstate(all="ignore"):
- if upper is not None:
- subset = self <= upper
- result = result.where(subset, upper, axis=None, inplace=False)
- if lower is not None:
- subset = self >= lower
- result = result.where(subset, lower, axis=None, inplace=False)
-
- if np.any(mask):
- result[mask] = np.nan
-
- if inplace:
- return self._update_inplace(result)
- else:
- return result
-
- @final
- def _clip_with_one_bound(self, threshold, method, axis, inplace):
- if axis is not None:
- axis = self._get_axis_number(axis)
-
- # method is self.le for upper bound and self.ge for lower bound
- if is_scalar(threshold) and is_number(threshold):
- if method.__name__ == "le":
- return self._clip_with_scalar(None, threshold, inplace=inplace)
- return self._clip_with_scalar(threshold, None, inplace=inplace)
-
- # GH #15390
- # In order for where method to work, the threshold must
- # be transformed to NDFrame from other array like structure.
- if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
- if isinstance(self, ABCSeries):
- threshold = self._constructor(threshold, index=self.index)
- else:
- threshold = align_method_FRAME(self, threshold, axis, flex=None)[1]
-
- # GH 40420
- # Treat missing thresholds as no bounds, not clipping the values
- if is_list_like(threshold):
- fill_value = np.inf if method.__name__ == "le" else -np.inf
- threshold_inf = threshold.fillna(fill_value)
- else:
- threshold_inf = threshold
-
- subset = method(threshold_inf, axis=axis) | isna(self)
-
- # GH 40420
- return self.where(subset, threshold, axis=axis, inplace=inplace)
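# Reviewer note (annotation, not part of the removed pandas file): per the
# GH 40420 handling above, missing entries in a list-like threshold are
# treated as "no bound" rather than clipping the value. A sketch:
import numpy as np
import pandas as pd

s = pd.Series([-1, -2, 3, -4])
lower = pd.Series([0, np.nan, 0, np.nan])
print(s.clip(lower=lower).tolist())  # [0.0, -2.0, 3.0, -4.0]; NaN thresholds leave values untouched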
-
- def clip(
- self: NDFrameT,
- lower=None,
- upper=None,
- *,
- axis: Axis | None = None,
- inplace: bool_t = False,
- **kwargs,
- ) -> NDFrameT | None:
- """
- Trim values at input threshold(s).
-
- Assigns values outside boundary to boundary values. Thresholds
- can be singular values or array like, and in the latter case
- the clipping is performed element-wise in the specified axis.
-
- Parameters
- ----------
- lower : float or array-like, default None
- Minimum threshold value. All values below this
- threshold will be set to it. A missing
- threshold (e.g. `NA`) will not clip the value.
- upper : float or array-like, default None
- Maximum threshold value. All values above this
- threshold will be set to it. A missing
- threshold (e.g. `NA`) will not clip the value.
- axis : {{0 or 'index', 1 or 'columns', None}}, default None
- Align object with lower and upper along the given axis.
- For `Series` this parameter is unused and defaults to `None`.
- inplace : bool, default False
- Whether to perform the operation in place on the data.
- *args, **kwargs
- Additional keywords have no effect but might be accepted
- for compatibility with numpy.
-
- Returns
- -------
- Series or DataFrame or None
- Same type as calling object with the values outside the
- clip boundaries replaced or None if ``inplace=True``.
-
- See Also
- --------
- Series.clip : Trim values at input threshold in series.
- DataFrame.clip : Trim values at input threshold in dataframe.
- numpy.clip : Clip (limit) the values in an array.
-
- Examples
- --------
- >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
- >>> df = pd.DataFrame(data)
- >>> df
- col_0 col_1
- 0 9 -2
- 1 -3 -7
- 2 0 6
- 3 -1 8
- 4 5 -5
-
- Clips per column using lower and upper thresholds:
-
- >>> df.clip(-4, 6)
- col_0 col_1
- 0 6 -2
- 1 -3 -4
- 2 0 6
- 3 -1 6
- 4 5 -4
-
- Clips using specific lower and upper thresholds per column element:
-
- >>> t = pd.Series([2, -4, -1, 6, 3])
- >>> t
- 0 2
- 1 -4
- 2 -1
- 3 6
- 4 3
- dtype: int64
-
- >>> df.clip(t, t + 4, axis=0)
- col_0 col_1
- 0 6 2
- 1 -3 -4
- 2 0 3
- 3 6 8
- 4 5 3
-
- Clips using specific lower threshold per column element, with missing values:
-
- >>> t = pd.Series([2, -4, np.NaN, 6, 3])
- >>> t
- 0 2.0
- 1 -4.0
- 2 NaN
- 3 6.0
- 4 3.0
- dtype: float64
-
- >>> df.clip(t, axis=0)
- col_0 col_1
- 0 9 2
- 1 -3 -4
- 2 0 6
- 3 6 8
- 4 5 3
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- axis = nv.validate_clip_with_axis(axis, (), kwargs)
- if axis is not None:
- axis = self._get_axis_number(axis)
-
- # GH 17276
- # numpy doesn't like NaN as a clip value
- # so ignore
- # GH 19992
- # numpy doesn't drop a list-like bound containing NaN
- isna_lower = isna(lower)
- if not is_list_like(lower):
- if np.any(isna_lower):
- lower = None
- elif np.all(isna_lower):
- lower = None
- isna_upper = isna(upper)
- if not is_list_like(upper):
- if np.any(isna_upper):
- upper = None
- elif np.all(isna_upper):
- upper = None
-
- # GH 2747 (arguments were reversed)
- if (
- lower is not None
- and upper is not None
- and is_scalar(lower)
- and is_scalar(upper)
- ):
- lower, upper = min(lower, upper), max(lower, upper)
-
- # fast-path for scalars
- if (lower is None or (is_scalar(lower) and is_number(lower))) and (
- upper is None or (is_scalar(upper) and is_number(upper))
- ):
- return self._clip_with_scalar(lower, upper, inplace=inplace)
-
- result = self
- if lower is not None:
- result = result._clip_with_one_bound(
- lower, method=self.ge, axis=axis, inplace=inplace
- )
- if upper is not None:
- if inplace:
- result = self
- result = result._clip_with_one_bound(
- upper, method=self.le, axis=axis, inplace=inplace
- )
-
- return result
-
- @doc(**_shared_doc_kwargs)
- def asfreq(
- self: NDFrameT,
- freq: Frequency,
- method: FillnaOptions | None = None,
- how: str | None = None,
- normalize: bool_t = False,
- fill_value: Hashable = None,
- ) -> NDFrameT:
- """
- Convert time series to specified frequency.
-
- Returns the original data conformed to a new index with the specified
- frequency.
-
- If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index
- is the result of transforming the original index with
- :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
- will map one-to-one to the new index).
-
- Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
- freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
- last entries in the original index (see :func:`pandas.date_range`). The
- values corresponding to any timesteps in the new index which were not present
- in the original index will be null (``NaN``), unless a method for filling
- such unknowns is provided (see the ``method`` parameter below).
-
- The :meth:`resample` method is more appropriate if an operation on each group of
- timesteps (such as an aggregate) is necessary to represent the data at the new
- frequency.
-
- Parameters
- ----------
- freq : DateOffset or str
- Frequency DateOffset or string.
- method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
- Method to use for filling holes in reindexed Series (note this
- does not fill NaNs that already were present):
-
- * 'pad' / 'ffill': propagate last valid observation forward to next
- valid
- * 'backfill' / 'bfill': use NEXT valid observation to fill.
- how : {{'start', 'end'}}, default 'end'
- For PeriodIndex only (see PeriodIndex.asfreq).
- normalize : bool, default False
- Whether to reset output index to midnight.
- fill_value : scalar, optional
- Value to use for missing values, applied during upsampling (note
- this does not fill NaNs that already were present).
-
- Returns
- -------
- {klass}
- {klass} object reindexed to the specified frequency.
-
- See Also
- --------
- reindex : Conform DataFrame to new index with optional filling logic.
-
- Notes
- -----
- To learn more about the frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- Examples
- --------
- Start by creating a series with 4 one minute timestamps.
-
- >>> index = pd.date_range('1/1/2000', periods=4, freq='T')
- >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
- >>> df = pd.DataFrame({{'s': series}})
- >>> df
- s
- 2000-01-01 00:00:00 0.0
- 2000-01-01 00:01:00 NaN
- 2000-01-01 00:02:00 2.0
- 2000-01-01 00:03:00 3.0
-
- Upsample the series into 30 second bins.
-
- >>> df.asfreq(freq='30S')
- s
- 2000-01-01 00:00:00 0.0
- 2000-01-01 00:00:30 NaN
- 2000-01-01 00:01:00 NaN
- 2000-01-01 00:01:30 NaN
- 2000-01-01 00:02:00 2.0
- 2000-01-01 00:02:30 NaN
- 2000-01-01 00:03:00 3.0
-
- Upsample again, providing a ``fill_value``.
-
- >>> df.asfreq(freq='30S', fill_value=9.0)
- s
- 2000-01-01 00:00:00 0.0
- 2000-01-01 00:00:30 9.0
- 2000-01-01 00:01:00 NaN
- 2000-01-01 00:01:30 9.0
- 2000-01-01 00:02:00 2.0
- 2000-01-01 00:02:30 9.0
- 2000-01-01 00:03:00 3.0
-
- Upsample again, providing a ``method``.
-
- >>> df.asfreq(freq='30S', method='bfill')
- s
- 2000-01-01 00:00:00 0.0
- 2000-01-01 00:00:30 NaN
- 2000-01-01 00:01:00 NaN
- 2000-01-01 00:01:30 2.0
- 2000-01-01 00:02:00 2.0
- 2000-01-01 00:02:30 3.0
- 2000-01-01 00:03:00 3.0
- """
- from pandas.core.resample import asfreq
-
- return asfreq(
- self,
- freq,
- method=method,
- how=how,
- normalize=normalize,
- fill_value=fill_value,
- )
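# Reviewer note (annotation, not part of the removed pandas file): for a
# PeriodIndex, asfreq() maps the existing index one-to-one through
# PeriodIndex.asfreq, as noted above; `how` picks the start or end anchor.
import pandas as pd

s = pd.Series([1, 2], index=pd.period_range("2023", periods=2, freq="A"))
print(s.asfreq("M"))               # index becomes ['2023-12', '2024-12'] (how defaults to 'end')
print(s.asfreq("M", how="start"))  # index becomes ['2023-01', '2024-01']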
-
- @final
- def at_time(
- self: NDFrameT, time, asof: bool_t = False, axis: Axis | None = None
- ) -> NDFrameT:
- """
- Select values at particular time of day (e.g., 9:30AM).
-
- Parameters
- ----------
- time : datetime.time or str
- The values to select.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- For `Series` this parameter is unused and defaults to 0.
-
- Returns
- -------
- Series or DataFrame
-
- Raises
- ------
- TypeError
- If the index is not a :class:`DatetimeIndex`
-
- See Also
- --------
- between_time : Select values between particular times of the day.
- first : Select initial periods of time series based on a date offset.
- last : Select final periods of time series based on a date offset.
- DatetimeIndex.indexer_at_time : Get just the index locations for
- values at particular time of the day.
-
- Examples
- --------
- >>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
- >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
- >>> ts
- A
- 2018-04-09 00:00:00 1
- 2018-04-09 12:00:00 2
- 2018-04-10 00:00:00 3
- 2018-04-10 12:00:00 4
-
- >>> ts.at_time('12:00')
- A
- 2018-04-09 12:00:00 2
- 2018-04-10 12:00:00 4
- """
- if axis is None:
- axis = self._stat_axis_number
- axis = self._get_axis_number(axis)
-
- index = self._get_axis(axis)
-
- if not isinstance(index, DatetimeIndex):
- raise TypeError("Index must be DatetimeIndex")
-
- indexer = index.indexer_at_time(time, asof=asof)
- return self._take_with_is_copy(indexer, axis=axis)
-
- @final
- def between_time(
- self: NDFrameT,
- start_time,
- end_time,
- inclusive: IntervalClosedType = "both",
- axis: Axis | None = None,
- ) -> NDFrameT:
- """
- Select values between particular times of the day (e.g., 9:00-9:30 AM).
-
- By setting ``start_time`` to be later than ``end_time``,
- you can get the times that are *not* between the two times.
-
- Parameters
- ----------
- start_time : datetime.time or str
- Initial time as a time filter limit.
- end_time : datetime.time or str
- End time as a time filter limit.
- inclusive : {"both", "neither", "left", "right"}, default "both"
- Include boundaries; whether to set each bound as closed or open.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine range time on index or columns value.
- For `Series` this parameter is unused and defaults to 0.
-
- Returns
- -------
- Series or DataFrame
- Data from the original object filtered to the specified dates range.
-
- Raises
- ------
- TypeError
- If the index is not a :class:`DatetimeIndex`
-
- See Also
- --------
- at_time : Select values at a particular time of the day.
- first : Select initial periods of time series based on a date offset.
- last : Select final periods of time series based on a date offset.
- DatetimeIndex.indexer_between_time : Get just the index locations for
- values between particular times of the day.
-
- Examples
- --------
- >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
- >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
- >>> ts
- A
- 2018-04-09 00:00:00 1
- 2018-04-10 00:20:00 2
- 2018-04-11 00:40:00 3
- 2018-04-12 01:00:00 4
-
- >>> ts.between_time('0:15', '0:45')
- A
- 2018-04-10 00:20:00 2
- 2018-04-11 00:40:00 3
-
- You get the times that are *not* between two times by setting
- ``start_time`` later than ``end_time``:
-
- >>> ts.between_time('0:45', '0:15')
- A
- 2018-04-09 00:00:00 1
- 2018-04-12 01:00:00 4
- """
- if axis is None:
- axis = self._stat_axis_number
- axis = self._get_axis_number(axis)
-
- index = self._get_axis(axis)
- if not isinstance(index, DatetimeIndex):
- raise TypeError("Index must be DatetimeIndex")
-
- left_inclusive, right_inclusive = validate_inclusive(inclusive)
- indexer = index.indexer_between_time(
- start_time,
- end_time,
- include_start=left_inclusive,
- include_end=right_inclusive,
- )
- return self._take_with_is_copy(indexer, axis=axis)
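# Reviewer note (annotation, not part of the removed pandas file): a sketch of
# the DatetimeIndex-based row selection performed by at_time() and
# between_time() above.
import pandas as pd

idx = pd.date_range("2023-01-01", periods=6, freq="4H")
df = pd.DataFrame({"v": range(6)}, index=idx)
print(df.at_time("08:00"))                # the single row whose time of day is 08:00
print(df.between_time("00:00", "08:00"))  # rows from midnight through 08:00, both ends inclusive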
-
- @doc(**_shared_doc_kwargs)
- def resample(
- self,
- rule,
- axis: Axis = 0,
- closed: str | None = None,
- label: str | None = None,
- convention: str = "start",
- kind: str | None = None,
- on: Level = None,
- level: Level = None,
- origin: str | TimestampConvertibleTypes = "start_day",
- offset: TimedeltaConvertibleTypes | None = None,
- group_keys: bool_t = False,
- ) -> Resampler:
- """
- Resample time-series data.
-
- Convenience method for frequency conversion and resampling of time series.
- The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
- or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
- series/index to the ``on``/``level`` keyword parameter.
-
- Parameters
- ----------
- rule : DateOffset, Timedelta or str
- The offset string or object representing target conversion.
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- Which axis to use for up- or down-sampling. For `Series` this parameter
- is unused and defaults to 0. The index along this axis must be a
- `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
- closed : {{'right', 'left'}}, default None
- Which side of bin interval is closed. The default is 'left'
- for all frequency offsets except for 'M', 'A', 'Q', 'BM',
- 'BA', 'BQ', and 'W' which all have a default of 'right'.
- label : {{'right', 'left'}}, default None
- Which bin edge label to label bucket with. The default is 'left'
- for all frequency offsets except for 'M', 'A', 'Q', 'BM',
- 'BA', 'BQ', and 'W' which all have a default of 'right'.
- convention : {{'start', 'end', 's', 'e'}}, default 'start'
- For `PeriodIndex` only, controls whether to use the start or
- end of `rule`.
- kind : {{'timestamp', 'period'}}, optional, default None
- Pass 'timestamp' to convert the resulting index to a
- `DatetimeIndex` or 'period' to convert it to a `PeriodIndex`.
- By default the input representation is retained.
-
- on : str, optional
- For a DataFrame, column to use instead of index for resampling.
- Column must be datetime-like.
- level : str or int, optional
- For a MultiIndex, level (name or number) to use for
- resampling. `level` must be datetime-like.
- origin : Timestamp or str, default 'start_day'
- The timestamp on which to adjust the grouping. The timezone of origin
- must match the timezone of the index.
- If string, must be one of the following:
-
- - 'epoch': `origin` is 1970-01-01
- - 'start': `origin` is the first value of the timeseries
- - 'start_day': `origin` is the first day at midnight of the timeseries
-
- .. versionadded:: 1.1.0
-
- - 'end': `origin` is the last value of the timeseries
- - 'end_day': `origin` is the ceiling midnight of the last day
-
- .. versionadded:: 1.3.0
-
- offset : Timedelta or str, default is None
- An offset timedelta added to the origin.
-
- .. versionadded:: 1.1.0
-
- group_keys : bool, default False
- Whether to include the group keys in the result index when using
- ``.apply()`` on the resampled object.
-
- .. versionadded:: 1.5.0
-
- Not specifying ``group_keys`` will retain values-dependent behavior
- from pandas 1.4 and earlier (see :ref:`pandas 1.5.0 Release notes
- <whatsnew_150.enhancements.resample_group_keys>` for examples).
-
- .. versionchanged:: 2.0.0
-
- ``group_keys`` now defaults to ``False``.
-
- Returns
- -------
- pandas.core.Resampler
- :class:`~pandas.core.Resampler` object.
-
- See Also
- --------
- Series.resample : Resample a Series.
- DataFrame.resample : Resample a DataFrame.
- groupby : Group {klass} by mapping, function, label, or list of labels.
- asfreq : Reindex a {klass} with the given frequency without grouping.
-
- Notes
- -----
- See the `user guide
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
- for more.
-
- To learn more about the offset strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
-
- Examples
- --------
- Start by creating a series with 9 one minute timestamps.
-
- >>> index = pd.date_range('1/1/2000', periods=9, freq='T')
- >>> series = pd.Series(range(9), index=index)
- >>> series
- 2000-01-01 00:00:00 0
- 2000-01-01 00:01:00 1
- 2000-01-01 00:02:00 2
- 2000-01-01 00:03:00 3
- 2000-01-01 00:04:00 4
- 2000-01-01 00:05:00 5
- 2000-01-01 00:06:00 6
- 2000-01-01 00:07:00 7
- 2000-01-01 00:08:00 8
- Freq: T, dtype: int64
-
- Downsample the series into 3 minute bins and sum the values
- of the timestamps falling into a bin.
-
- >>> series.resample('3T').sum()
- 2000-01-01 00:00:00 3
- 2000-01-01 00:03:00 12
- 2000-01-01 00:06:00 21
- Freq: 3T, dtype: int64
-
- Downsample the series into 3 minute bins as above, but label each
- bin using the right edge instead of the left. Please note that the
- value in the bucket used as the label is not included in the bucket,
- which it labels. For example, in the original series the
- bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
- value in the resampled bucket with the label ``2000-01-01 00:03:00``
- does not include 3 (if it did, the summed value would be 6, not 3).
- To include this value close the right side of the bin interval as
- illustrated in the example below this one.
-
- >>> series.resample('3T', label='right').sum()
- 2000-01-01 00:03:00 3
- 2000-01-01 00:06:00 12
- 2000-01-01 00:09:00 21
- Freq: 3T, dtype: int64
-
- Downsample the series into 3 minute bins as above, but close the right
- side of the bin interval.
-
- >>> series.resample('3T', label='right', closed='right').sum()
- 2000-01-01 00:00:00 0
- 2000-01-01 00:03:00 6
- 2000-01-01 00:06:00 15
- 2000-01-01 00:09:00 15
- Freq: 3T, dtype: int64
-
- Upsample the series into 30 second bins.
-
- >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows
- 2000-01-01 00:00:00 0.0
- 2000-01-01 00:00:30 NaN
- 2000-01-01 00:01:00 1.0
- 2000-01-01 00:01:30 NaN
- 2000-01-01 00:02:00 2.0
- Freq: 30S, dtype: float64
-
- Upsample the series into 30 second bins and fill the ``NaN``
- values using the ``ffill`` method.
-
- >>> series.resample('30S').ffill()[0:5]
- 2000-01-01 00:00:00 0
- 2000-01-01 00:00:30 0
- 2000-01-01 00:01:00 1
- 2000-01-01 00:01:30 1
- 2000-01-01 00:02:00 2
- Freq: 30S, dtype: int64
-
- Upsample the series into 30 second bins and fill the
- ``NaN`` values using the ``bfill`` method.
-
- >>> series.resample('30S').bfill()[0:5]
- 2000-01-01 00:00:00 0
- 2000-01-01 00:00:30 1
- 2000-01-01 00:01:00 1
- 2000-01-01 00:01:30 2
- 2000-01-01 00:02:00 2
- Freq: 30S, dtype: int64
-
- Pass a custom function via ``apply``
-
- >>> def custom_resampler(arraylike):
- ... return np.sum(arraylike) + 5
- ...
- >>> series.resample('3T').apply(custom_resampler)
- 2000-01-01 00:00:00 8
- 2000-01-01 00:03:00 17
- 2000-01-01 00:06:00 26
- Freq: 3T, dtype: int64
-
- For a Series with a PeriodIndex, the keyword `convention` can be
- used to control whether to use the start or end of `rule`.
-
- Resample a year by quarter using 'start' `convention`. Values are
- assigned to the first quarter of the period.
-
- >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',
- ... freq='A',
- ... periods=2))
- >>> s
- 2012 1
- 2013 2
- Freq: A-DEC, dtype: int64
- >>> s.resample('Q', convention='start').asfreq()
- 2012Q1 1.0
- 2012Q2 NaN
- 2012Q3 NaN
- 2012Q4 NaN
- 2013Q1 2.0
- 2013Q2 NaN
- 2013Q3 NaN
- 2013Q4 NaN
- Freq: Q-DEC, dtype: float64
-
- Resample quarters by month using 'end' `convention`. Values are
- assigned to the last month of the period.
-
- >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',
- ... freq='Q',
- ... periods=4))
- >>> q
- 2018Q1 1
- 2018Q2 2
- 2018Q3 3
- 2018Q4 4
- Freq: Q-DEC, dtype: int64
- >>> q.resample('M', convention='end').asfreq()
- 2018-03 1.0
- 2018-04 NaN
- 2018-05 NaN
- 2018-06 2.0
- 2018-07 NaN
- 2018-08 NaN
- 2018-09 3.0
- 2018-10 NaN
- 2018-11 NaN
- 2018-12 4.0
- Freq: M, dtype: float64
-
- For DataFrame objects, the keyword `on` can be used to specify the
- column instead of the index for resampling.
-
- >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
- ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
- >>> df = pd.DataFrame(d)
- >>> df['week_starting'] = pd.date_range('01/01/2018',
- ... periods=8,
- ... freq='W')
- >>> df
- price volume week_starting
- 0 10 50 2018-01-07
- 1 11 60 2018-01-14
- 2 9 40 2018-01-21
- 3 13 100 2018-01-28
- 4 14 50 2018-02-04
- 5 18 100 2018-02-11
- 6 17 40 2018-02-18
- 7 19 50 2018-02-25
- >>> df.resample('M', on='week_starting').mean()
- price volume
- week_starting
- 2018-01-31 10.75 62.5
- 2018-02-28 17.00 60.0
-
- For a DataFrame with MultiIndex, the keyword `level` can be used to
- specify on which level the resampling needs to take place.
-
- >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
- >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
- ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
- >>> df2 = pd.DataFrame(
- ... d2,
- ... index=pd.MultiIndex.from_product(
- ... [days, ['morning', 'afternoon']]
- ... )
- ... )
- >>> df2
- price volume
- 2000-01-01 morning 10 50
- afternoon 11 60
- 2000-01-02 morning 9 40
- afternoon 13 100
- 2000-01-03 morning 14 50
- afternoon 18 100
- 2000-01-04 morning 17 40
- afternoon 19 50
- >>> df2.resample('D', level=0).sum()
- price volume
- 2000-01-01 21 110
- 2000-01-02 22 140
- 2000-01-03 32 150
- 2000-01-04 36 90
-
- If you want to adjust the start of the bins based on a fixed timestamp:
-
- >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
- >>> rng = pd.date_range(start, end, freq='7min')
- >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
- >>> ts
- 2000-10-01 23:30:00 0
- 2000-10-01 23:37:00 3
- 2000-10-01 23:44:00 6
- 2000-10-01 23:51:00 9
- 2000-10-01 23:58:00 12
- 2000-10-02 00:05:00 15
- 2000-10-02 00:12:00 18
- 2000-10-02 00:19:00 21
- 2000-10-02 00:26:00 24
- Freq: 7T, dtype: int64
-
- >>> ts.resample('17min').sum()
- 2000-10-01 23:14:00 0
- 2000-10-01 23:31:00 9
- 2000-10-01 23:48:00 21
- 2000-10-02 00:05:00 54
- 2000-10-02 00:22:00 24
- Freq: 17T, dtype: int64
-
- >>> ts.resample('17min', origin='epoch').sum()
- 2000-10-01 23:18:00 0
- 2000-10-01 23:35:00 18
- 2000-10-01 23:52:00 27
- 2000-10-02 00:09:00 39
- 2000-10-02 00:26:00 24
- Freq: 17T, dtype: int64
-
- >>> ts.resample('17min', origin='2000-01-01').sum()
- 2000-10-01 23:24:00 3
- 2000-10-01 23:41:00 15
- 2000-10-01 23:58:00 45
- 2000-10-02 00:15:00 45
- Freq: 17T, dtype: int64
-
- If you want to adjust the start of the bins with an `offset` Timedelta, the two
- following lines are equivalent:
-
- >>> ts.resample('17min', origin='start').sum()
- 2000-10-01 23:30:00 9
- 2000-10-01 23:47:00 21
- 2000-10-02 00:04:00 54
- 2000-10-02 00:21:00 24
- Freq: 17T, dtype: int64
-
- >>> ts.resample('17min', offset='23h30min').sum()
- 2000-10-01 23:30:00 9
- 2000-10-01 23:47:00 21
- 2000-10-02 00:04:00 54
- 2000-10-02 00:21:00 24
- Freq: 17T, dtype: int64
-
- If you want to take the largest Timestamp as the end of the bins:
-
- >>> ts.resample('17min', origin='end').sum()
- 2000-10-01 23:35:00 0
- 2000-10-01 23:52:00 18
- 2000-10-02 00:09:00 27
- 2000-10-02 00:26:00 63
- Freq: 17T, dtype: int64
-
- In contrast with `start_day`, you can use `end_day` to take the ceiling
- midnight of the largest Timestamp as the end of the bins and drop the bins
- not containing data:
-
- >>> ts.resample('17min', origin='end_day').sum()
- 2000-10-01 23:38:00 3
- 2000-10-01 23:55:00 15
- 2000-10-02 00:12:00 45
- 2000-10-02 00:29:00 45
- Freq: 17T, dtype: int64
- """
- from pandas.core.resample import get_resampler
-
- axis = self._get_axis_number(axis)
- return get_resampler(
- cast("Series | DataFrame", self),
- freq=rule,
- label=label,
- closed=closed,
- axis=axis,
- kind=kind,
- convention=convention,
- key=on,
- level=level,
- origin=origin,
- offset=offset,
- group_keys=group_keys,
- )
-
- @final
- def first(self: NDFrameT, offset) -> NDFrameT:
- """
- Select initial periods of time series data based on a date offset.
-
- For a DataFrame with a sorted DatetimeIndex, this function can
- select the first few rows based on a date offset.
-
- Parameters
- ----------
- offset : str, DateOffset or dateutil.relativedelta
- The offset length of the data that will be selected. For instance,
- '1M' will display all the rows having their index within the first month.
-
- Returns
- -------
- Series or DataFrame
- A subset of the caller.
-
- Raises
- ------
- TypeError
- If the index is not a :class:`DatetimeIndex`
-
- See Also
- --------
- last : Select final periods of time series based on a date offset.
- at_time : Select values at a particular time of the day.
- between_time : Select values between particular times of the day.
-
- Examples
- --------
- >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
- >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
- >>> ts
- A
- 2018-04-09 1
- 2018-04-11 2
- 2018-04-13 3
- 2018-04-15 4
-
- Get the rows for the first 3 days:
-
- >>> ts.first('3D')
- A
- 2018-04-09 1
- 2018-04-11 2
-
- Notice that the data for the first 3 calendar days were returned, not the
- first 3 days observed in the dataset, and therefore data for 2018-04-13 was
- not returned.
- """
- if not isinstance(self.index, DatetimeIndex):
- raise TypeError("'first' only supports a DatetimeIndex index")
-
- if len(self.index) == 0:
- return self.copy(deep=False)
-
- offset = to_offset(offset)
- if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
- # GH#29623 if first value is end of period, remove offset with n = 1
- # before adding the real offset
- end_date = end = self.index[0] - offset.base + offset
- else:
- end_date = end = self.index[0] + offset
-
- # Tick-like, e.g. 3 weeks
- if isinstance(offset, Tick) and end_date in self.index:
- end = self.index.searchsorted(end_date, side="left")
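- # side="left" excludes end_date itself, so exactly the first `offset`
- # worth of data is returned even when the boundary timestamp is present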
- return self.iloc[:end]
-
- return self.loc[:end]
-
- @final
- def last(self: NDFrameT, offset) -> NDFrameT:
- """
- Select final periods of time series data based on a date offset.
-
- For a DataFrame with a sorted DatetimeIndex, this function
- selects the last few rows based on a date offset.
-
- Parameters
- ----------
- offset : str, DateOffset, dateutil.relativedelta
- The offset length of the data that will be selected. For instance,
- '3D' will display all the rows having their index within the last 3 days.
-
- Returns
- -------
- Series or DataFrame
- A subset of the caller.
-
- Raises
- ------
- TypeError
- If the index is not a :class:`DatetimeIndex`
-
- See Also
- --------
- first : Select initial periods of time series based on a date offset.
- at_time : Select values at a particular time of the day.
- between_time : Select values between particular times of the day.
-
- Examples
- --------
- >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
- >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
- >>> ts
- A
- 2018-04-09 1
- 2018-04-11 2
- 2018-04-13 3
- 2018-04-15 4
-
- Get the rows for the last 3 days:
-
- >>> ts.last('3D')
- A
- 2018-04-13 3
- 2018-04-15 4
-
- Notice that the data for the last 3 calendar days were returned, not the
- last 3 observed days in the dataset, and therefore data for 2018-04-11 was
- not returned.
- """
- if not isinstance(self.index, DatetimeIndex):
- raise TypeError("'last' only supports a DatetimeIndex index")
-
- if len(self.index) == 0:
- return self.copy(deep=False)
-
- offset = to_offset(offset)
-
- start_date = self.index[-1] - offset
- start = self.index.searchsorted(start_date, side="right")
- return self.iloc[start:]
-
- @final
- def rank(
- self: NDFrameT,
- axis: Axis = 0,
- method: str = "average",
- numeric_only: bool_t = False,
- na_option: str = "keep",
- ascending: bool_t = True,
- pct: bool_t = False,
- ) -> NDFrameT:
- """
- Compute numerical data ranks (1 through n) along axis.
-
- By default, equal values are assigned a rank that is the average of the
- ranks of those values.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Index to direct ranking.
- For `Series` this parameter is unused and defaults to 0.
- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
- How to rank the group of records that have the same value (i.e. ties):
-
- * average: average rank of the group
- * min: lowest rank in the group
- * max: highest rank in the group
- * first: ranks assigned in order they appear in the array
- * dense: like 'min', but rank always increases by 1 between groups.
-
- numeric_only : bool, default False
- For DataFrame objects, rank only numeric columns if set to True.
-
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
-
- na_option : {'keep', 'top', 'bottom'}, default 'keep'
- How to rank NaN values:
-
- * keep: assign NaN rank to NaN values
- * top: assign lowest rank to NaN values
- * bottom: assign highest rank to NaN values
-
- ascending : bool, default True
- Whether or not the elements should be ranked in ascending order.
- pct : bool, default False
- Whether or not to display the returned rankings in percentile
- form.
-
- Returns
- -------
- same type as caller
- Return a Series or DataFrame with data ranks as values.
-
- See Also
- --------
- core.groupby.DataFrameGroupBy.rank : Rank of values within each group.
- core.groupby.SeriesGroupBy.rank : Rank of values within each group.
-
- Examples
- --------
- >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
- ... 'spider', 'snake'],
- ... 'Number_legs': [4, 2, 4, 8, np.nan]})
- >>> df
- Animal Number_legs
- 0 cat 4.0
- 1 penguin 2.0
- 2 dog 4.0
- 3 spider 8.0
- 4 snake NaN
-
- Ties are assigned the mean of the ranks (by default) for the group.
-
- >>> s = pd.Series(range(5), index=list("abcde"))
- >>> s["d"] = s["b"]
- >>> s.rank()
- a 1.0
- b 2.5
- c 4.0
- d 2.5
- e 5.0
- dtype: float64
-
- The following example shows how the method behaves with the above
- parameters:
-
- * default_rank: this is the default behaviour obtained without using
- any parameter.
- * max_rank: setting ``method = 'max'``, the records that have the
- same values are ranked using the highest rank (e.g. since 'cat'
- and 'dog' are both in the 2nd and 3rd positions, rank 3 is assigned).
- * NA_bottom: with ``na_option = 'bottom'``, records with NaN values
- are placed at the bottom of the ranking.
- * pct_rank: when setting ``pct = True``, the ranking is expressed as
- a percentile rank.
-
- >>> df['default_rank'] = df['Number_legs'].rank()
- >>> df['max_rank'] = df['Number_legs'].rank(method='max')
- >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
- >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
- >>> df
- Animal Number_legs default_rank max_rank NA_bottom pct_rank
- 0 cat 4.0 2.5 3.0 2.5 0.625
- 1 penguin 2.0 1.0 1.0 1.0 0.250
- 2 dog 4.0 2.5 3.0 2.5 0.625
- 3 spider 8.0 4.0 4.0 4.0 1.000
- 4 snake NaN NaN NaN 5.0 NaN
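- 
- A further sketch: ``method='dense'`` gives tied values the same rank and
- increases the rank by exactly 1 between distinct values.
- 
- >>> df['Number_legs'].rank(method='dense')
- 0    2.0
- 1    1.0
- 2    2.0
- 3    3.0
- 4    NaN
- Name: Number_legs, dtype: float64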
- """
- axis_int = self._get_axis_number(axis)
-
- if na_option not in {"keep", "top", "bottom"}:
- msg = "na_option must be one of 'keep', 'top', or 'bottom'"
- raise ValueError(msg)
-
- def ranker(data):
- if data.ndim == 2:
- # i.e. DataFrame, we cast to ndarray
- values = data.values
- else:
- # i.e. Series, can dispatch to EA
- values = data._values
-
- if isinstance(values, ExtensionArray):
- ranks = values._rank(
- axis=axis_int,
- method=method,
- ascending=ascending,
- na_option=na_option,
- pct=pct,
- )
- else:
- ranks = algos.rank(
- values,
- axis=axis_int,
- method=method,
- ascending=ascending,
- na_option=na_option,
- pct=pct,
- )
-
- ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
- return ranks_obj.__finalize__(self, method="rank")
-
- if numeric_only:
- if self.ndim == 1 and not is_numeric_dtype(self.dtype):
- # GH#47500
- raise TypeError(
- "Series.rank does not allow numeric_only=True with "
- "non-numeric dtype."
- )
- data = self._get_numeric_data()
- else:
- data = self
-
- return ranker(data)
-
- @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"])
- def compare(
- self,
- other,
- align_axis: Axis = 1,
- keep_shape: bool_t = False,
- keep_equal: bool_t = False,
- result_names: Suffixes = ("self", "other"),
- ):
- if type(self) is not type(other):
- cls_self, cls_other = type(self).__name__, type(other).__name__
- raise TypeError(
- f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
- )
-
- mask = ~((self == other) | (self.isna() & other.isna()))
- mask.fillna(True, inplace=True)
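- # at this point ``mask`` is True exactly where self and other differ;
- # positions where both sides are NA count as equal, and ambiguous (NA)
- # comparison results are treated as differences by the fillna above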
-
- if not keep_equal:
- self = self.where(mask)
- other = other.where(mask)
-
- if not keep_shape:
- if isinstance(self, ABCDataFrame):
- cmask = mask.any()
- rmask = mask.any(axis=1)
- self = self.loc[rmask, cmask]
- other = other.loc[rmask, cmask]
- else:
- self = self[mask]
- other = other[mask]
- if not isinstance(result_names, tuple):
- raise TypeError(
- f"Passing 'result_names' as a {type(result_names)} is not "
- "supported. Provide 'result_names' as a tuple instead."
- )
-
- if align_axis in (1, "columns"): # This is needed for Series
- axis = 1
- else:
- axis = self._get_axis_number(align_axis)
-
- diff = concat([self, other], axis=axis, keys=result_names)
-
- if axis >= self.ndim:
- # No need to reorganize data if stacking on new axis
- # This currently applies for stacking two Series on columns
- return diff
-
- ax = diff._get_axis(axis)
- ax_names = np.array(ax.names)
-
- # set index names to positions to avoid confusion
- ax.names = np.arange(len(ax_names))
-
- # bring self-other to inner level
- order = list(range(1, ax.nlevels)) + [0]
- if isinstance(diff, ABCDataFrame):
- diff = diff.reorder_levels(order, axis=axis)
- else:
- diff = diff.reorder_levels(order)
-
- # restore the index names in order
- diff._get_axis(axis=axis).names = ax_names[order]
-
- # reorder axis to keep things organized
- indices = (
- np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
- )
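- # e.g. with 4 columns this yields [0, 2, 1, 3]:
- # np.arange(4).reshape([2, 2]) -> [[0, 1], [2, 3]], .T -> [[0, 2], [1, 3]],
- # .flatten() -> [0, 2, 1, 3], interleaving the self/other blocks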
- diff = diff.take(indices, axis=axis)
-
- return diff
-
- @doc(**_shared_doc_kwargs)
- def align(
- self: NDFrameT,
- other: NDFrameT,
- join: AlignJoin = "outer",
- axis: Axis | None = None,
- level: Level = None,
- copy: bool_t | None = None,
- fill_value: Hashable = None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- fill_axis: Axis = 0,
- broadcast_axis: Axis | None = None,
- ) -> NDFrameT:
- """
- Align two objects on their axes with the specified join method.
-
- Join method is specified for each axis Index.
-
- Parameters
- ----------
- other : DataFrame or Series
- join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
- axis : allowed axis of the other object, default None
- Align on index (0), columns (1), or both (None).
- level : int or level name, default None
- Broadcast across a level, matching Index values on the
- passed MultiIndex level.
- copy : bool, default True
- Always returns new objects. If copy=False and no reindexing is
- required then original objects are returned.
- fill_value : scalar, default np.NaN
- Value to use for missing values. Defaults to NaN, but can be any
- "compatible" value.
- method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
- Method to use for filling holes in reindexed Series:
-
- - pad / ffill: propagate last valid observation forward to next valid.
- - backfill / bfill: use NEXT valid observation to fill gap.
-
- limit : int, default None
- If method is specified, this is the maximum number of consecutive
- NaN values to forward/backward fill. In other words, if there is
- a gap with more than this number of consecutive NaNs, it will only
- be partially filled. If method is not specified, this is the
- maximum number of entries along the entire axis where NaNs will be
- filled. Must be greater than 0 if not None.
- fill_axis : {axes_single_arg}, default 0
- Filling axis, method and limit.
- broadcast_axis : {axes_single_arg}, default None
- Broadcast values along this axis, if aligning two objects of
- different dimensions.
-
- Returns
- -------
- tuple of ({klass}, type of other)
- Aligned objects.
-
- Examples
- --------
- >>> df = pd.DataFrame(
- ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
- ... )
- >>> other = pd.DataFrame(
- ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
- ... columns=["A", "B", "C", "D"],
- ... index=[2, 3, 4],
- ... )
- >>> df
- D B E A
- 1 1 2 3 4
- 2 6 7 8 9
- >>> other
- A B C D
- 2 10 20 30 40
- 3 60 70 80 90
- 4 600 700 800 900
-
- Align on columns:
-
- >>> left, right = df.align(other, join="outer", axis=1)
- >>> left
- A B C D E
- 1 4 2 NaN 1 3
- 2 9 7 NaN 6 8
- >>> right
- A B C D E
- 2 10 20 30 40 NaN
- 3 60 70 80 90 NaN
- 4 600 700 800 900 NaN
-
- We can also align on the index:
-
- >>> left, right = df.align(other, join="outer", axis=0)
- >>> left
- D B E A
- 1 1.0 2.0 3.0 4.0
- 2 6.0 7.0 8.0 9.0
- 3 NaN NaN NaN NaN
- 4 NaN NaN NaN NaN
- >>> right
- A B C D
- 1 NaN NaN NaN NaN
- 2 10.0 20.0 30.0 40.0
- 3 60.0 70.0 80.0 90.0
- 4 600.0 700.0 800.0 900.0
-
- Finally, the default `axis=None` will align on both index and columns:
-
- >>> left, right = df.align(other, join="outer", axis=None)
- >>> left
- A B C D E
- 1 4.0 2.0 NaN 1.0 3.0
- 2 9.0 7.0 NaN 6.0 8.0
- 3 NaN NaN NaN NaN NaN
- 4 NaN NaN NaN NaN NaN
- >>> right
- A B C D E
- 1 NaN NaN NaN NaN NaN
- 2 10.0 20.0 30.0 40.0 NaN
- 3 60.0 70.0 80.0 90.0 NaN
- 4 600.0 700.0 800.0 900.0 NaN
- """
-
- method = clean_fill_method(method)
-
- if broadcast_axis == 1 and self.ndim != other.ndim:
- if isinstance(self, ABCSeries):
- # this means other is a DataFrame, and we need to broadcast
- # self
- cons = self._constructor_expanddim
- df = cons(
- {c: self for c in other.columns}, **other._construct_axes_dict()
- )
- return df._align_frame(
- other,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- )
- elif isinstance(other, ABCSeries):
- # this means self is a DataFrame, and we need to broadcast
- # other
- cons = other._constructor_expanddim
- df = cons(
- {c: other for c in self.columns}, **self._construct_axes_dict()
- )
- return self._align_frame(
- df,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- )
-
- if axis is not None:
- axis = self._get_axis_number(axis)
- if isinstance(other, ABCDataFrame):
- return self._align_frame(
- other,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- )
- elif isinstance(other, ABCSeries):
- return self._align_series(
- other,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- )
- else: # pragma: no cover
- raise TypeError(f"unsupported type: {type(other)}")
-
- @final
- def _align_frame(
- self,
- other,
- join: AlignJoin = "outer",
- axis: Axis | None = None,
- level=None,
- copy: bool_t | None = None,
- fill_value=None,
- method=None,
- limit=None,
- fill_axis: Axis = 0,
- ):
- # defaults
- join_index, join_columns = None, None
- ilidx, iridx = None, None
- clidx, cridx = None, None
-
- is_series = isinstance(self, ABCSeries)
-
- if (axis is None or axis == 0) and not self.index.equals(other.index):
- join_index, ilidx, iridx = self.index.join(
- other.index, how=join, level=level, return_indexers=True
- )
-
- if (
- (axis is None or axis == 1)
- and not is_series
- and not self.columns.equals(other.columns)
- ):
- join_columns, clidx, cridx = self.columns.join(
- other.columns, how=join, level=level, return_indexers=True
- )
-
- if is_series:
- reindexers = {0: [join_index, ilidx]}
- else:
- reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
-
- left = self._reindex_with_indexers(
- reindexers, copy=copy, fill_value=fill_value, allow_dups=True
- )
- # other must be always DataFrame
- right = other._reindex_with_indexers(
- {0: [join_index, iridx], 1: [join_columns, cridx]},
- copy=copy,
- fill_value=fill_value,
- allow_dups=True,
- )
-
- if method is not None:
- _left = left.fillna(method=method, axis=fill_axis, limit=limit)
- assert _left is not None # needed for mypy
- left = _left
- right = right.fillna(method=method, axis=fill_axis, limit=limit)
-
- # if DatetimeIndex have different tz, convert to UTC
- left, right = _align_as_utc(left, right, join_index)
-
- return (
- left.__finalize__(self),
- right.__finalize__(other),
- )
-
- @final
- def _align_series(
- self,
- other,
- join: AlignJoin = "outer",
- axis: Axis | None = None,
- level=None,
- copy: bool_t | None = None,
- fill_value=None,
- method=None,
- limit=None,
- fill_axis: Axis = 0,
- ):
- is_series = isinstance(self, ABCSeries)
- if copy and using_copy_on_write():
- copy = False
-
- if (not is_series and axis is None) or axis not in [None, 0, 1]:
- raise ValueError("Must specify axis=0 or 1")
-
- if is_series and axis == 1:
- raise ValueError("cannot align series to a series other than axis 0")
-
- # series/series compat, other must always be a Series
- if not axis:
- # equal
- if self.index.equals(other.index):
- join_index, lidx, ridx = None, None, None
- else:
- join_index, lidx, ridx = self.index.join(
- other.index, how=join, level=level, return_indexers=True
- )
-
- if is_series:
- left = self._reindex_indexer(join_index, lidx, copy)
- elif lidx is None or join_index is None:
- left = self.copy(deep=copy)
- else:
- left = self._constructor(
- self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
- )
-
- right = other._reindex_indexer(join_index, ridx, copy)
-
- else:
- # one has > 1 ndim
- fdata = self._mgr
- join_index = self.axes[1]
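- # in this branch axis == 1: self is a DataFrame and other a Series,
- # so align other's index against self's columns (self.axes[1])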
- lidx, ridx = None, None
- if not join_index.equals(other.index):
- join_index, lidx, ridx = join_index.join(
- other.index, how=join, level=level, return_indexers=True
- )
-
- if lidx is not None:
- bm_axis = self._get_block_manager_axis(1)
- fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
-
- if copy and fdata is self._mgr:
- fdata = fdata.copy()
-
- left = self._constructor(fdata)
-
- if ridx is None:
- right = other.copy(deep=copy)
- else:
- right = other.reindex(join_index, level=level)
-
- # fill
- fill_na = notna(fill_value) or (method is not None)
- if fill_na:
- left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis)
- right = right.fillna(fill_value, method=method, limit=limit)
-
- # if DatetimeIndex have different tz, convert to UTC
- if is_series or (not is_series and axis == 0):
- left, right = _align_as_utc(left, right, join_index)
-
- return (
- left.__finalize__(self),
- right.__finalize__(other),
- )
-
- @final
- def _where(
- self,
- cond,
- other=lib.no_default,
- inplace: bool_t = False,
- axis: Axis | None = None,
- level=None,
- ):
- """
- Equivalent to public method `where`, except that `other` is not
- applied as a function even if callable. Used in __setitem__.
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- if axis is not None:
- axis = self._get_axis_number(axis)
-
- # align the cond to same shape as myself
- cond = common.apply_if_callable(cond, self)
- if isinstance(cond, NDFrame):
- # CoW: Make sure reference is not kept alive
- cond = cond.align(self, join="right", broadcast_axis=1, copy=False)[0]
- else:
- if not hasattr(cond, "shape"):
- cond = np.asanyarray(cond)
- if cond.shape != self.shape:
- raise ValueError("Array conditional must be same shape as self")
- cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)
-
- # make sure we are boolean
- fill_value = bool(inplace)
- cond = cond.fillna(fill_value)
-
- msg = "Boolean array expected for the condition, not {dtype}"
-
- if not cond.empty:
- if not isinstance(cond, ABCDataFrame):
- # This is a single-dimensional object.
- if not is_bool_dtype(cond):
- raise ValueError(msg.format(dtype=cond.dtype))
- else:
- for _dt in cond.dtypes:
- if not is_bool_dtype(_dt):
- raise ValueError(msg.format(dtype=_dt))
- else:
- # GH#21947 we have an empty DataFrame/Series, could be object-dtype
- cond = cond.astype(bool)
-
- cond = -cond if inplace else cond
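- # for the inplace path the condition is inverted: putmask below writes
- # ``other`` where the mask is True, i.e. exactly where the original
- # condition was False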
- cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False)
-
- # try to align with other
- if isinstance(other, NDFrame):
- # align with me
- if other.ndim <= self.ndim:
- # CoW: Make sure reference is not kept alive
- other = self.align(
- other,
- join="left",
- axis=axis,
- level=level,
- fill_value=None,
- copy=False,
- )[1]
-
- # if we are NOT aligned, raise as we cannot where index
- if axis is None and not other._indexed_same(self):
- raise InvalidIndexError
-
- if other.ndim < self.ndim:
- # TODO(EA2D): avoid object-dtype cast in EA case GH#38729
- other = other._values
- if axis == 0:
- other = np.reshape(other, (-1, 1))
- elif axis == 1:
- other = np.reshape(other, (1, -1))
-
- other = np.broadcast_to(other, self.shape)
-
- # slice me out of the other
- else:
- raise NotImplementedError(
- "cannot align with a higher dimensional NDFrame"
- )
-
- elif not isinstance(other, (MultiIndex, NDFrame)):
- # mainly just catching Index here
- other = extract_array(other, extract_numpy=True)
-
- if isinstance(other, (np.ndarray, ExtensionArray)):
- if other.shape != self.shape:
- if self.ndim != 1:
- # In the ndim == 1 case we may have
- # other length 1, which we treat as scalar (GH#2745, GH#4192)
- # or len(other) == icond.sum(), which we treat like
- # __setitem__ (GH#3235)
- raise ValueError(
- "other must be the same shape as self when an ndarray"
- )
-
- # we are the same shape, so create an actual object for alignment
- else:
- other = self._constructor(
- other, **self._construct_axes_dict(), copy=False
- )
-
- if axis is None:
- axis = 0
-
- if self.ndim == getattr(other, "ndim", 0):
- align = True
- else:
- align = self._get_axis_number(axis) == 1
-
- if inplace:
- # we may have different type blocks come out of putmask, so
- # reconstruct the block manager
-
- self._check_inplace_setting(other)
- new_data = self._mgr.putmask(mask=cond, new=other, align=align)
- result = self._constructor(new_data)
- return self._update_inplace(result)
-
- else:
- new_data = self._mgr.where(
- other=other,
- cond=cond,
- align=align,
- )
- result = self._constructor(new_data)
- return result.__finalize__(self)
-
- @overload
- def where(
- self: NDFrameT,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
-
- @overload
- def where(
- self: NDFrameT,
- cond,
- other=...,
- *,
- inplace: bool_t = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> NDFrameT | None:
- ...
-
- @doc(
- klass=_shared_doc_kwargs["klass"],
- cond="True",
- cond_rev="False",
- name="where",
- name_other="mask",
- )
- def where(
- self: NDFrameT,
- cond,
- other=np.nan,
- *,
- inplace: bool_t = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> NDFrameT | None:
- """
- Replace values where the condition is {cond_rev}.
-
- Parameters
- ----------
- cond : bool {klass}, array-like, or callable
- Where `cond` is {cond}, keep the original value. Where
- {cond_rev}, replace with corresponding value from `other`.
- If `cond` is callable, it is computed on the {klass} and
- should return boolean {klass} or array. The callable must
- not change input {klass} (though pandas doesn't check it).
- other : scalar, {klass}, or callable
- Entries where `cond` is {cond_rev} are replaced with
- corresponding value from `other`.
- If other is callable, it is computed on the {klass} and
- should return scalar or {klass}. The callable must not
- change input {klass} (though pandas doesn't check it).
- If not specified, entries will be filled with the corresponding
- NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension
- dtypes).
- inplace : bool, default False
- Whether to perform the operation in place on the data.
- axis : int, default None
- Alignment axis if needed. For `Series` this parameter is
- unused and defaults to 0.
- level : int, default None
- Alignment level if needed.
-
- Returns
- -------
- Same type as caller or None if ``inplace=True``.
-
- See Also
- --------
- :func:`DataFrame.{name_other}` : Return an object of same shape as
- self.
-
- Notes
- -----
- The {name} method is an application of the if-then idiom. For each
- element in the calling DataFrame, if ``cond`` is ``{cond}`` the
- element is used; otherwise the corresponding element from the DataFrame
- ``other`` is used. If the axis of ``other`` does not align with axis of
- ``cond`` {klass}, the misaligned index positions will be filled with
- {cond_rev}.
-
- The signature for :func:`DataFrame.where` differs from
- :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
- ``np.where(m, df1, df2)``.
-
- For further details and examples see the ``{name}`` documentation in
- :ref:`indexing <indexing.where_mask>`.
-
- The dtype of the object takes precedence. The fill value is cast to
- the object's dtype, if this can be done losslessly.
-
- Examples
- --------
- >>> s = pd.Series(range(5))
- >>> s.where(s > 0)
- 0 NaN
- 1 1.0
- 2 2.0
- 3 3.0
- 4 4.0
- dtype: float64
- >>> s.mask(s > 0)
- 0 0.0
- 1 NaN
- 2 NaN
- 3 NaN
- 4 NaN
- dtype: float64
-
- >>> s = pd.Series(range(5))
- >>> t = pd.Series([True, False])
- >>> s.where(t, 99)
- 0 0
- 1 99
- 2 99
- 3 99
- 4 99
- dtype: int64
- >>> s.mask(t, 99)
- 0 99
- 1 1
- 2 99
- 3 99
- 4 99
- dtype: int64
-
- >>> s.where(s > 1, 10)
- 0 10
- 1 10
- 2 2
- 3 3
- 4 4
- dtype: int64
- >>> s.mask(s > 1, 10)
- 0 0
- 1 1
- 2 10
- 3 10
- 4 10
- dtype: int64
-
- >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
- >>> df
- A B
- 0 0 1
- 1 2 3
- 2 4 5
- 3 6 7
- 4 8 9
- >>> m = df % 3 == 0
- >>> df.where(m, -df)
- A B
- 0 0 -1
- 1 -2 3
- 2 -4 -5
- 3 6 -7
- 4 -8 9
- >>> df.where(m, -df) == np.where(m, df, -df)
- A B
- 0 True True
- 1 True True
- 2 True True
- 3 True True
- 4 True True
- >>> df.where(m, -df) == df.mask(~m, -df)
- A B
- 0 True True
- 1 True True
- 2 True True
- 3 True True
- 4 True True
- """
- other = common.apply_if_callable(other, self)
- return self._where(cond, other, inplace, axis, level)
-
- @overload
- def mask(
- self: NDFrameT,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> NDFrameT:
- ...
-
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
-
- @overload
- def mask(
- self: NDFrameT,
- cond,
- other=...,
- *,
- inplace: bool_t = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> NDFrameT | None:
- ...
-
- @doc(
- where,
- klass=_shared_doc_kwargs["klass"],
- cond="False",
- cond_rev="True",
- name="mask",
- name_other="where",
- )
- def mask(
- self: NDFrameT,
- cond,
- other=lib.no_default,
- *,
- inplace: bool_t = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> NDFrameT | None:
- inplace = validate_bool_kwarg(inplace, "inplace")
- cond = common.apply_if_callable(cond, self)
-
- # see gh-21891
- if not hasattr(cond, "__invert__"):
- cond = np.array(cond)
-
- return self.where(
- ~cond,
- other=other,
- inplace=inplace,
- axis=axis,
- level=level,
- )
-
- @doc(klass=_shared_doc_kwargs["klass"])
- def shift(
- self: NDFrameT,
- periods: int = 1,
- freq=None,
- axis: Axis = 0,
- fill_value: Hashable = None,
- ) -> NDFrameT:
- """
- Shift index by desired number of periods with an optional time `freq`.
-
- When `freq` is not passed, shift the index without realigning the data.
- If `freq` is passed (in this case, the index must be date or datetime,
- or it will raise a `NotImplementedError`), the index will be
- increased using the periods and the `freq`. `freq` can be inferred
- when specified as "infer" as long as either freq or inferred_freq
- attribute is set in the index.
-
- Parameters
- ----------
- periods : int
- Number of periods to shift. Can be positive or negative.
- freq : DateOffset, tseries.offsets, timedelta, or str, optional
- Offset to use from the tseries module or time rule (e.g. 'EOM').
- If `freq` is specified then the index values are shifted but the
- data is not realigned. That is, use `freq` if you would like to
- extend the index when shifting and preserve the original data.
- If `freq` is specified as "infer" then it will be inferred from
- the freq or inferred_freq attributes of the index. If neither of
- those attributes exist, a ValueError is thrown.
- axis : {{0 or 'index', 1 or 'columns', None}}, default 0
- Shift direction. For `Series` this parameter is unused and defaults to 0.
- fill_value : object, optional
- The scalar value to use for newly introduced missing values.
- The default depends on the dtype of `self`.
- For numeric data, ``np.nan`` is used.
- For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
- For extension dtypes, ``self.dtype.na_value`` is used.
-
- .. versionchanged:: 1.1.0
-
- Returns
- -------
- {klass}
- Copy of input object, shifted.
-
- See Also
- --------
- Index.shift : Shift values of Index.
- DatetimeIndex.shift : Shift values of DatetimeIndex.
- PeriodIndex.shift : Shift values of PeriodIndex.
-
- Examples
- --------
- >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45],
- ... "Col2": [13, 23, 18, 33, 48],
- ... "Col3": [17, 27, 22, 37, 52]}},
- ... index=pd.date_range("2020-01-01", "2020-01-05"))
- >>> df
- Col1 Col2 Col3
- 2020-01-01 10 13 17
- 2020-01-02 20 23 27
- 2020-01-03 15 18 22
- 2020-01-04 30 33 37
- 2020-01-05 45 48 52
-
- >>> df.shift(periods=3)
- Col1 Col2 Col3
- 2020-01-01 NaN NaN NaN
- 2020-01-02 NaN NaN NaN
- 2020-01-03 NaN NaN NaN
- 2020-01-04 10.0 13.0 17.0
- 2020-01-05 20.0 23.0 27.0
-
- >>> df.shift(periods=1, axis="columns")
- Col1 Col2 Col3
- 2020-01-01 NaN 10 13
- 2020-01-02 NaN 20 23
- 2020-01-03 NaN 15 18
- 2020-01-04 NaN 30 33
- 2020-01-05 NaN 45 48
-
- >>> df.shift(periods=3, fill_value=0)
- Col1 Col2 Col3
- 2020-01-01 0 0 0
- 2020-01-02 0 0 0
- 2020-01-03 0 0 0
- 2020-01-04 10 13 17
- 2020-01-05 20 23 27
-
- >>> df.shift(periods=3, freq="D")
- Col1 Col2 Col3
- 2020-01-04 10 13 17
- 2020-01-05 20 23 27
- 2020-01-06 15 18 22
- 2020-01-07 30 33 37
- 2020-01-08 45 48 52
-
- >>> df.shift(periods=3, freq="infer")
- Col1 Col2 Col3
- 2020-01-04 10 13 17
- 2020-01-05 20 23 27
- 2020-01-06 15 18 22
- 2020-01-07 30 33 37
- 2020-01-08 45 48 52
- """
- if periods == 0:
- return self.copy(deep=None)
-
- if freq is None:
- # when freq is None, data is shifted, index is not
- axis = self._get_axis_number(axis)
- new_data = self._mgr.shift(
- periods=periods, axis=axis, fill_value=fill_value
- )
- return self._constructor(new_data).__finalize__(self, method="shift")
-
- # when freq is given, index is shifted, data is not
- index = self._get_axis(axis)
-
- if freq == "infer":
- freq = getattr(index, "freq", None)
-
- if freq is None:
- freq = getattr(index, "inferred_freq", None)
-
- if freq is None:
- msg = "Freq was not set in the index hence cannot be inferred"
- raise ValueError(msg)
-
- elif isinstance(freq, str):
- freq = to_offset(freq)
-
- if isinstance(index, PeriodIndex):
- orig_freq = to_offset(index.freq)
- if freq != orig_freq:
- assert orig_freq is not None # for mypy
- raise ValueError(
- f"Given freq {freq.rule_code} does not match "
- f"PeriodIndex freq {orig_freq.rule_code}"
- )
- new_ax = index.shift(periods)
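- # a PeriodIndex carries its own freq, so only the number of periods
- # is passed; the freq argument was validated against it above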
- else:
- new_ax = index.shift(periods, freq)
-
- result = self.set_axis(new_ax, axis=axis)
- return result.__finalize__(self, method="shift")
-
- def truncate(
- self: NDFrameT,
- before=None,
- after=None,
- axis: Axis | None = None,
- copy: bool_t | None = None,
- ) -> NDFrameT:
- """
- Truncate a Series or DataFrame before and after some index value.
-
- This is a useful shorthand for boolean indexing based on index
- values above or below certain thresholds.
-
- Parameters
- ----------
- before : date, str, int
- Truncate all rows before this index value.
- after : date, str, int
- Truncate all rows after this index value.
- axis : {0 or 'index', 1 or 'columns'}, optional
- Axis to truncate. Truncates the index (rows) by default.
- For `Series` this parameter is unused and defaults to 0.
- copy : bool, default True
- Return a copy of the truncated section.
-
- Returns
- -------
- type of caller
- The truncated Series or DataFrame.
-
- See Also
- --------
- DataFrame.loc : Select a subset of a DataFrame by label.
- DataFrame.iloc : Select a subset of a DataFrame by position.
-
- Notes
- -----
- If the index being truncated contains only datetime values,
- `before` and `after` may be specified as strings instead of
- Timestamps.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
- ... 'B': ['f', 'g', 'h', 'i', 'j'],
- ... 'C': ['k', 'l', 'm', 'n', 'o']},
- ... index=[1, 2, 3, 4, 5])
- >>> df
- A B C
- 1 a f k
- 2 b g l
- 3 c h m
- 4 d i n
- 5 e j o
-
- >>> df.truncate(before=2, after=4)
- A B C
- 2 b g l
- 3 c h m
- 4 d i n
-
- The columns of a DataFrame can be truncated.
-
- >>> df.truncate(before="A", after="B", axis="columns")
- A B
- 1 a f
- 2 b g
- 3 c h
- 4 d i
- 5 e j
-
- For Series, only rows can be truncated.
-
- >>> df['A'].truncate(before=2, after=4)
- 2 b
- 3 c
- 4 d
- Name: A, dtype: object
-
- The index values in ``truncate`` can be datetimes or string
- dates.
-
- >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
- >>> df = pd.DataFrame(index=dates, data={'A': 1})
- >>> df.tail()
- A
- 2016-01-31 23:59:56 1
- 2016-01-31 23:59:57 1
- 2016-01-31 23:59:58 1
- 2016-01-31 23:59:59 1
- 2016-02-01 00:00:00 1
-
- >>> df.truncate(before=pd.Timestamp('2016-01-05'),
- ... after=pd.Timestamp('2016-01-10')).tail()
- A
- 2016-01-09 23:59:56 1
- 2016-01-09 23:59:57 1
- 2016-01-09 23:59:58 1
- 2016-01-09 23:59:59 1
- 2016-01-10 00:00:00 1
-
- Because the index is a DatetimeIndex containing only dates, we can
- specify `before` and `after` as strings. They will be coerced to
- Timestamps before truncation.
-
- >>> df.truncate('2016-01-05', '2016-01-10').tail()
- A
- 2016-01-09 23:59:56 1
- 2016-01-09 23:59:57 1
- 2016-01-09 23:59:58 1
- 2016-01-09 23:59:59 1
- 2016-01-10 00:00:00 1
-
- Note that ``truncate`` assumes a 0 value for any unspecified time
- component (midnight). This differs from partial string slicing, which
- returns any partially matching dates.
-
- >>> df.loc['2016-01-05':'2016-01-10', :].tail()
- A
- 2016-01-10 23:59:55 1
- 2016-01-10 23:59:56 1
- 2016-01-10 23:59:57 1
- 2016-01-10 23:59:58 1
- 2016-01-10 23:59:59 1
- """
- if axis is None:
- axis = self._stat_axis_number
- axis = self._get_axis_number(axis)
- ax = self._get_axis(axis)
-
- # GH 17935
- # Check that index is sorted
- if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
- raise ValueError("truncate requires a sorted index")
-
- # if we have a date index, convert to dates, otherwise
- # treat like a slice
- if ax._is_all_dates:
- from pandas.core.tools.datetimes import to_datetime
-
- before = to_datetime(before)
- after = to_datetime(after)
-
- if before is not None and after is not None and before > after:
- raise ValueError(f"Truncate: {after} must be after {before}")
-
- if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
- before, after = after, before
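- # for a monotonically decreasing index the labels appear in reverse
- # order, so swap the bounds to keep the label slice below valid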
-
- slicer = [slice(None, None)] * self._AXIS_LEN
- slicer[axis] = slice(before, after)
- result = self.loc[tuple(slicer)]
-
- if isinstance(ax, MultiIndex):
- setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
-
- result = result.copy(deep=copy and not using_copy_on_write())
-
- return result
-
- @final
- @doc(klass=_shared_doc_kwargs["klass"])
- def tz_convert(
- self: NDFrameT, tz, axis: Axis = 0, level=None, copy: bool_t | None = None
- ) -> NDFrameT:
- """
- Convert tz-aware axis to target time zone.
-
- Parameters
- ----------
- tz : str or tzinfo object or None
- Target time zone. Passing ``None`` will convert to
- UTC and remove the timezone information.
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- The axis to convert.
- level : int, str, default None
- If axis is a MultiIndex, convert a specific level. Otherwise
- must be None.
- copy : bool, default True
- Also make a copy of the underlying data.
-
- Returns
- -------
- {klass}
- Object with time zone converted axis.
-
- Raises
- ------
- TypeError
- If the axis is tz-naive.
-
- Examples
- --------
- Change to another time zone:
-
- >>> s = pd.Series(
- ... [1],
- ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']),
- ... )
- >>> s.tz_convert('Asia/Shanghai')
- 2018-09-15 07:30:00+08:00 1
- dtype: int64
-
- Pass None to convert to UTC and get a tz-naive index:
-
- >>> s = pd.Series([1],
- ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
- >>> s.tz_convert(None)
- 2018-09-14 23:30:00 1
- dtype: int64
- """
- axis = self._get_axis_number(axis)
- ax = self._get_axis(axis)
-
- def _tz_convert(ax, tz):
- if not hasattr(ax, "tz_convert"):
- if len(ax) > 0:
- ax_name = self._get_axis_name(axis)
- raise TypeError(
- f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
- )
- ax = DatetimeIndex([], tz=tz)
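- # an empty axis has no tz information to convert, so replace it with
- # an empty tz-aware DatetimeIndex instead of raising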
- else:
- ax = ax.tz_convert(tz)
- return ax
-
- # if a level is given it must be a MultiIndex level or
- # equivalent to the axis name
- if isinstance(ax, MultiIndex):
- level = ax._get_level_number(level)
- new_level = _tz_convert(ax.levels[level], tz)
- ax = ax.set_levels(new_level, level=level)
- else:
- if level not in (None, 0, ax.name):
- raise ValueError(f"The level {level} is not valid")
- ax = _tz_convert(ax, tz)
-
- result = self.copy(deep=copy and not using_copy_on_write())
- result = result.set_axis(ax, axis=axis, copy=False)
- return result.__finalize__(self, method="tz_convert")
-
- @final
- @doc(klass=_shared_doc_kwargs["klass"])
- def tz_localize(
- self: NDFrameT,
- tz,
- axis: Axis = 0,
- level=None,
- copy: bool_t | None = None,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ) -> NDFrameT:
- """
- Localize tz-naive index of a Series or DataFrame to target time zone.
-
- This operation localizes the Index. To localize the values in a
- timezone-naive Series, use :meth:`Series.dt.tz_localize`.
-
- Parameters
- ----------
- tz : str or tzinfo or None
- Time zone to localize. Passing ``None`` will remove the
- time zone information and preserve local time.
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- The axis to localize.
- level : int, str, default None
- If axis is a MultiIndex, localize a specific level. Otherwise
- must be None.
- copy : bool, default True
- Also make a copy of the underlying data.
- ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
- When clocks moved backward due to DST, ambiguous times may arise.
- For example in Central European Time (UTC+01), when going from
- 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
- 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
- `ambiguous` parameter dictates how ambiguous times should be
- handled.
-
- - 'infer' will attempt to infer fall dst-transition hours based on
- order
- - bool-ndarray where True signifies a DST time, False designates
- a non-DST time (note that this flag is only applicable for
- ambiguous times)
- - 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous
- times.
- nonexistent : str, default 'raise'
- A nonexistent time does not exist in a particular timezone
- where clocks moved forward due to DST. Valid values are:
-
- - 'shift_forward' will shift the nonexistent time forward to the
- closest existing time
- - 'shift_backward' will shift the nonexistent time backward to the
- closest existing time
- - 'NaT' will return NaT where there are nonexistent times
- - timedelta objects will shift nonexistent times by the timedelta
- - 'raise' will raise a NonExistentTimeError if there are
- nonexistent times.
-
- Returns
- -------
- {klass}
- Same type as the input.
-
- Raises
- ------
- TypeError
- If the TimeSeries is tz-aware and tz is not None.
-
- Examples
- --------
- Localize local times:
-
- >>> s = pd.Series(
- ... [1],
- ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']),
- ... )
- >>> s.tz_localize('CET')
- 2018-09-15 01:30:00+02:00 1
- dtype: int64
-
- Pass None to convert to tz-naive index and preserve local time:
-
- >>> s = pd.Series([1],
- ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
- >>> s.tz_localize(None)
- 2018-09-15 01:30:00 1
- dtype: int64
-
- Be careful with DST changes. When there is sequential data, pandas
- can infer the DST time:
-
- >>> s = pd.Series(range(7),
- ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
- ... '2018-10-28 02:00:00',
- ... '2018-10-28 02:30:00',
- ... '2018-10-28 02:00:00',
- ... '2018-10-28 02:30:00',
- ... '2018-10-28 03:00:00',
- ... '2018-10-28 03:30:00']))
- >>> s.tz_localize('CET', ambiguous='infer')
- 2018-10-28 01:30:00+02:00 0
- 2018-10-28 02:00:00+02:00 1
- 2018-10-28 02:30:00+02:00 2
- 2018-10-28 02:00:00+01:00 3
- 2018-10-28 02:30:00+01:00 4
- 2018-10-28 03:00:00+01:00 5
- 2018-10-28 03:30:00+01:00 6
- dtype: int64
-
- In some cases, inferring the DST is impossible. In such cases, you can
- pass an ndarray to the ambiguous parameter to set the DST explicitly:
-
- >>> s = pd.Series(range(3),
- ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
- ... '2018-10-28 02:36:00',
- ... '2018-10-28 03:46:00']))
- >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
- 2018-10-28 01:20:00+02:00 0
- 2018-10-28 02:36:00+02:00 1
- 2018-10-28 03:46:00+01:00 2
- dtype: int64
-
- If the DST transition causes nonexistent times, you can shift these
- dates forward or backward with a timedelta object or `'shift_forward'`
- or `'shift_backward'`.
-
- >>> s = pd.Series(range(2),
- ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
- ... '2015-03-29 03:30:00']))
- >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
- 2015-03-29 03:00:00+02:00 0
- 2015-03-29 03:30:00+02:00 1
- dtype: int64
- >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
- 2015-03-29 01:59:59.999999999+01:00 0
- 2015-03-29 03:30:00+02:00 1
- dtype: int64
- >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
- 2015-03-29 03:30:00+02:00 0
- 2015-03-29 03:30:00+02:00 1
- dtype: int64
- """
- nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
- if nonexistent not in nonexistent_options and not isinstance(
- nonexistent, dt.timedelta
- ):
- raise ValueError(
- "The nonexistent argument must be one of 'raise', "
- "'NaT', 'shift_forward', 'shift_backward' or "
- "a timedelta object"
- )
-
- axis = self._get_axis_number(axis)
- ax = self._get_axis(axis)
-
- def _tz_localize(ax, tz, ambiguous, nonexistent):
- if not hasattr(ax, "tz_localize"):
- if len(ax) > 0:
- ax_name = self._get_axis_name(axis)
- raise TypeError(
- f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
- )
- ax = DatetimeIndex([], tz=tz)
- else:
- ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
- return ax
-
- # if a level is given it must be a MultiIndex level or
- # equivalent to the axis name
- if isinstance(ax, MultiIndex):
- level = ax._get_level_number(level)
- new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
- ax = ax.set_levels(new_level, level=level)
- else:
- if level not in (None, 0, ax.name):
- raise ValueError(f"The level {level} is not valid")
- ax = _tz_localize(ax, tz, ambiguous, nonexistent)
-
- result = self.copy(deep=copy and not using_copy_on_write())
- result = result.set_axis(ax, axis=axis, copy=False)
- return result.__finalize__(self, method="tz_localize")
-
- # ----------------------------------------------------------------------
- # Numeric Methods
-
- @final
- def describe(
- self: NDFrameT,
- percentiles=None,
- include=None,
- exclude=None,
- ) -> NDFrameT:
- """
- Generate descriptive statistics.
-
- Descriptive statistics include those that summarize the central
- tendency, dispersion and shape of a
- dataset's distribution, excluding ``NaN`` values.
-
- Analyzes both numeric and object series, as well
- as ``DataFrame`` column sets of mixed data types. The output
- will vary depending on what is provided. Refer to the notes
- below for more detail.
-
- Parameters
- ----------
- percentiles : list-like of numbers, optional
- The percentiles to include in the output. All should
- fall between 0 and 1. The default is
- ``[.25, .5, .75]``, which returns the 25th, 50th, and
- 75th percentiles.
- include : 'all', list-like of dtypes or None (default), optional
- A white list of data types to include in the result. Ignored
- for ``Series``. Here are the options:
-
- - 'all' : All columns of the input will be included in the output.
- - A list-like of dtypes : Limits the results to the
- provided data types.
- To limit the result to numeric types submit
- ``numpy.number``. To limit it instead to object columns submit
- the ``numpy.object`` data type. Strings
- can also be used in the style of
- ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
- select pandas categorical columns, use ``'category'``
- - None (default) : The result will include all numeric columns.
- exclude : list-like of dtypes or None (default), optional
- A black list of data types to omit from the result. Ignored
- for ``Series``. Here are the options:
-
- - A list-like of dtypes : Excludes the provided data types
- from the result. To exclude numeric types submit
- ``numpy.number``. To exclude object columns submit the data
- type ``numpy.object``. Strings can also be used in the style of
- ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
- exclude pandas categorical columns, use ``'category'``
- - None (default) : The result will exclude nothing.
-
- Returns
- -------
- Series or DataFrame
- Summary statistics of the Series or Dataframe provided.
-
- See Also
- --------
- DataFrame.count: Count number of non-NA/null observations.
- DataFrame.max: Maximum of the values in the object.
- DataFrame.min: Minimum of the values in the object.
- DataFrame.mean: Mean of the values.
- DataFrame.std: Standard deviation of the observations.
- DataFrame.select_dtypes: Subset of a DataFrame including/excluding
- columns based on their dtype.
-
- Notes
- -----
- For numeric data, the result's index will include ``count``,
- ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
- upper percentiles. By default the lower percentile is ``25`` and the
- upper percentile is ``75``. The ``50`` percentile is the
- same as the median.
-
- For object data (e.g. strings or timestamps), the result's index
- will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
- is the most common value. The ``freq`` is the most common value's
- frequency. Timestamps also include the ``first`` and ``last`` items.
-
- If multiple object values have the highest count, then the
- ``top`` result (and the corresponding ``freq``) will be arbitrarily
- chosen from among those with the highest count.
-
- For mixed data types provided via a ``DataFrame``, the default is to
- return only an analysis of numeric columns. If the dataframe consists
- only of object and categorical data without any numeric columns, the
- default is to return an analysis of both the object and categorical
- columns. If ``include='all'`` is provided as an option, the result
- will include a union of attributes of each type.
-
- The `include` and `exclude` parameters can be used to limit
- which columns in a ``DataFrame`` are analyzed for the output.
- The parameters are ignored when analyzing a ``Series``.
-
- Examples
- --------
- Describing a numeric ``Series``.
-
- >>> s = pd.Series([1, 2, 3])
- >>> s.describe()
- count 3.0
- mean 2.0
- std 1.0
- min 1.0
- 25% 1.5
- 50% 2.0
- 75% 2.5
- max 3.0
- dtype: float64
-
- Describing a categorical ``Series``.
-
- >>> s = pd.Series(['a', 'a', 'b', 'c'])
- >>> s.describe()
- count 4
- unique 3
- top a
- freq 2
- dtype: object
-
- Describing a timestamp ``Series``.
-
- >>> s = pd.Series([
- ... np.datetime64("2000-01-01"),
- ... np.datetime64("2010-01-01"),
- ... np.datetime64("2010-01-01")
- ... ])
- >>> s.describe()
- count 3
- mean 2006-09-01 08:00:00
- min 2000-01-01 00:00:00
- 25% 2004-12-31 12:00:00
- 50% 2010-01-01 00:00:00
- 75% 2010-01-01 00:00:00
- max 2010-01-01 00:00:00
- dtype: object
-
- Describing a ``DataFrame``. By default only numeric fields
- are returned.
-
- >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
- ... 'numeric': [1, 2, 3],
- ... 'object': ['a', 'b', 'c']
- ... })
- >>> df.describe()
- numeric
- count 3.0
- mean 2.0
- std 1.0
- min 1.0
- 25% 1.5
- 50% 2.0
- 75% 2.5
- max 3.0
-
- Describing all columns of a ``DataFrame`` regardless of data type.
-
- >>> df.describe(include='all') # doctest: +SKIP
- categorical numeric object
- count 3 3.0 3
- unique 3 NaN 3
- top f NaN a
- freq 1 NaN 1
- mean NaN 2.0 NaN
- std NaN 1.0 NaN
- min NaN 1.0 NaN
- 25% NaN 1.5 NaN
- 50% NaN 2.0 NaN
- 75% NaN 2.5 NaN
- max NaN 3.0 NaN
-
- Describing a column from a ``DataFrame`` by accessing it as
- an attribute.
-
- >>> df.numeric.describe()
- count 3.0
- mean 2.0
- std 1.0
- min 1.0
- 25% 1.5
- 50% 2.0
- 75% 2.5
- max 3.0
- Name: numeric, dtype: float64
-
- Including only numeric columns in a ``DataFrame`` description.
-
- >>> df.describe(include=[np.number])
- numeric
- count 3.0
- mean 2.0
- std 1.0
- min 1.0
- 25% 1.5
- 50% 2.0
- 75% 2.5
- max 3.0
-
- Including only string columns in a ``DataFrame`` description.
-
- >>> df.describe(include=[object]) # doctest: +SKIP
- object
- count 3
- unique 3
- top a
- freq 1
-
- Including only categorical columns from a ``DataFrame`` description.
-
- >>> df.describe(include=['category'])
- categorical
- count 3
- unique 3
- top d
- freq 1
-
- Excluding numeric columns from a ``DataFrame`` description.
-
- >>> df.describe(exclude=[np.number]) # doctest: +SKIP
- categorical object
- count 3 3
- unique 3 3
- top f a
- freq 1 1
-
- Excluding object columns from a ``DataFrame`` description.
-
- >>> df.describe(exclude=[object]) # doctest: +SKIP
- categorical numeric
- count 3 3.0
- unique 3 NaN
- top f NaN
- freq 1 NaN
- mean NaN 2.0
- std NaN 1.0
- min NaN 1.0
- 25% NaN 1.5
- 50% NaN 2.0
- 75% NaN 2.5
- max NaN 3.0
- """
- return describe_ndframe(
- obj=self,
- include=include,
- exclude=exclude,
- percentiles=percentiles,
- )
-
- @final
- def pct_change(
- self: NDFrameT,
- periods: int = 1,
- fill_method: Literal["backfill", "bfill", "pad", "ffill"] | None = "pad",
- limit=None,
- freq=None,
- **kwargs,
- ) -> NDFrameT:
- """
- Percentage change between the current and a prior element.
-
- Computes the percentage change from the immediately previous row by
- default. This is useful in comparing the percentage of change in a time
- series of elements.
-
- Parameters
- ----------
- periods : int, default 1
- Periods to shift for forming percent change.
- fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
- How to handle NAs **before** computing percent changes.
- limit : int, default None
- The number of consecutive NAs to fill before stopping.
- freq : DateOffset, timedelta, or str, optional
- Increment to use from time series API (e.g. 'M' or BDay()).
- **kwargs
- Additional keyword arguments are passed into
- `DataFrame.shift` or `Series.shift`.
-
- Returns
- -------
- Series or DataFrame
- The same type as the calling object.
-
- See Also
- --------
- Series.diff : Compute the difference of two elements in a Series.
- DataFrame.diff : Compute the difference of two elements in a DataFrame.
- Series.shift : Shift the index by some number of periods.
- DataFrame.shift : Shift the index by some number of periods.
-
- Examples
- --------
- **Series**
-
- >>> s = pd.Series([90, 91, 85])
- >>> s
- 0 90
- 1 91
- 2 85
- dtype: int64
-
- >>> s.pct_change()
- 0 NaN
- 1 0.011111
- 2 -0.065934
- dtype: float64
-
- >>> s.pct_change(periods=2)
- 0 NaN
- 1 NaN
- 2 -0.055556
- dtype: float64
-
- See the percentage change in a Series where NAs are filled with the
- last valid observation (forward fill).
-
- >>> s = pd.Series([90, 91, None, 85])
- >>> s
- 0 90.0
- 1 91.0
- 2 NaN
- 3 85.0
- dtype: float64
-
- >>> s.pct_change(fill_method='ffill')
- 0 NaN
- 1 0.011111
- 2 0.000000
- 3 -0.065934
- dtype: float64
-
- **DataFrame**
-
- Percentage change in French franc, Deutsche Mark, and Italian lira from
- 1980-01-01 to 1980-03-01.
-
- >>> df = pd.DataFrame({
- ... 'FR': [4.0405, 4.0963, 4.3149],
- ... 'GR': [1.7246, 1.7482, 1.8519],
- ... 'IT': [804.74, 810.01, 860.13]},
- ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
- >>> df
- FR GR IT
- 1980-01-01 4.0405 1.7246 804.74
- 1980-02-01 4.0963 1.7482 810.01
- 1980-03-01 4.3149 1.8519 860.13
-
- >>> df.pct_change()
- FR GR IT
- 1980-01-01 NaN NaN NaN
- 1980-02-01 0.013810 0.013684 0.006549
- 1980-03-01 0.053365 0.059318 0.061876
-
- Percentage change in GOOG and APPL stock volume. Shows computing
- the percentage change between columns.
-
- >>> df = pd.DataFrame({
- ... '2016': [1769950, 30586265],
- ... '2015': [1500923, 40912316],
- ... '2014': [1371819, 41403351]},
- ... index=['GOOG', 'APPL'])
- >>> df
- 2016 2015 2014
- GOOG 1769950 1500923 1371819
- APPL 30586265 40912316 41403351
-
- >>> df.pct_change(axis='columns', periods=-1)
- 2016 2015 2014
- GOOG 0.179241 0.094112 NaN
- APPL -0.252395 -0.011860 NaN
- """
- axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
- if fill_method is None:
- data = self
- else:
- _data = self.fillna(method=fill_method, axis=axis, limit=limit)
- assert _data is not None # needed for mypy
- data = _data
-
- shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs)
- # Unsupported left operand type for / ("NDFrameT")
- rs = data / shifted - 1 # type: ignore[operator]
- if freq is not None:
- # Shift method is implemented differently when freq is not None
- # We want to restore the original index
- rs = rs.loc[~rs.index.duplicated()]
- rs = rs.reindex_like(data)
- return rs.__finalize__(self, method="pct_change")
-
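The method above reduces to a fill, a shift, and an element-wise ratio. A minimal sketch of the equivalent computation using only public pandas calls (the frame below is illustrative, not taken from the docstring):

import pandas as pd

df = pd.DataFrame({"FR": [4.0405, 4.0963, 4.3149]})

# what pct_change(periods=1, fill_method="pad") effectively does:
filled = df.fillna(method="pad")              # handle NAs before the ratio
manual = filled / filled.shift(periods=1) - 1

assert manual.equals(df.pct_change())         # matches the built-in result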
- @final
- def _logical_func(
- self,
- name: str,
- func,
- axis: Axis = 0,
- bool_only: bool_t = False,
- skipna: bool_t = True,
- **kwargs,
- ) -> Series | bool_t:
- nv.validate_logical_func((), kwargs, fname=name)
- validate_bool_kwarg(skipna, "skipna", none_allowed=False)
-
- if self.ndim > 1 and axis is None:
- # Reduce along one dimension then the other, to simplify DataFrame._reduce
- res = self._logical_func(
- name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
- )
- return res._logical_func(name, func, skipna=skipna, **kwargs)
-
- if (
- self.ndim > 1
- and axis == 1
- and len(self._mgr.arrays) > 1
- # TODO(EA2D): special-case not needed
- and all(x.ndim == 2 for x in self._mgr.arrays)
- and not kwargs
- ):
- # Fastpath avoiding potentially expensive transpose
- obj = self
- if bool_only:
- obj = self._get_bool_data()
- return obj._reduce_axis1(name, func, skipna=skipna)
-
- return self._reduce(
- func,
- name=name,
- axis=axis,
- skipna=skipna,
- numeric_only=bool_only,
- filter_type="bool",
- )
-
- def any(
- self,
- axis: Axis = 0,
- bool_only: bool_t = False,
- skipna: bool_t = True,
- **kwargs,
- ) -> DataFrame | Series | bool_t:
- return self._logical_func(
- "any", nanops.nanany, axis, bool_only, skipna, **kwargs
- )
-
- def all(
- self,
- axis: Axis = 0,
- bool_only: bool_t = False,
- skipna: bool_t = True,
- **kwargs,
- ) -> Series | bool_t:
- return self._logical_func(
- "all", nanops.nanall, axis, bool_only, skipna, **kwargs
- )
-
- @final
- def _accum_func(
- self,
- name: str,
- func,
- axis: Axis | None = None,
- skipna: bool_t = True,
- *args,
- **kwargs,
- ):
- skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
- if axis is None:
- axis = self._stat_axis_number
- else:
- axis = self._get_axis_number(axis)
-
- if axis == 1:
- return self.T._accum_func(
- name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026
- ).T
-
- def block_accum_func(blk_values):
- values = blk_values.T if hasattr(blk_values, "T") else blk_values
-
- result: np.ndarray | ExtensionArray
- if isinstance(values, ExtensionArray):
- result = values._accumulate(name, skipna=skipna, **kwargs)
- else:
- result = nanops.na_accum_func(values, func, skipna=skipna)
-
- result = result.T if hasattr(result, "T") else result
- return result
-
- result = self._mgr.apply(block_accum_func)
-
- return self._constructor(result).__finalize__(self, method=name)
-
- def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
- return self._accum_func(
- "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
- )
-
- def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
- return self._accum_func(
- "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
- )
-
- def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
- return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
-
- def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
- return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
-
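A short sketch of the ``skipna`` behaviour these four wrappers share (the series is illustrative): with the default ``skipna=True`` the running result is carried past NaN positions, while ``skipna=False`` poisons everything from the first NaN onward.

import numpy as np
import pandas as pd

s = pd.Series([2.0, np.nan, 5.0, -1.0])

assert s.cummax().tolist()[2:] == [5.0, 5.0]   # NaNs skipped, running max carried on
assert s.cummax(skipna=False).isna().tolist() == [False, True, True, True]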
- @final
- def _stat_function_ddof(
- self,
- name: str,
- func,
- axis: Axis | None = None,
- skipna: bool_t = True,
- ddof: int = 1,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- nv.validate_stat_ddof_func((), kwargs, fname=name)
- validate_bool_kwarg(skipna, "skipna", none_allowed=False)
- if axis is None:
- axis = self._stat_axis_number
-
- return self._reduce(
- func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
- )
-
- def sem(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- ddof: int = 1,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- return self._stat_function_ddof(
- "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs
- )
-
- def var(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- ddof: int = 1,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- return self._stat_function_ddof(
- "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs
- )
-
- def std(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- ddof: int = 1,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- return self._stat_function_ddof(
- "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs
- )
-
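A small numeric check of how ``ddof`` ties these three reductions together: ``std`` is the square root of ``var``, and ``sem`` divides that by the square root of the number of observations (the series is illustrative).

import numpy as np
import pandas as pd

s = pd.Series([21.0, 25.0, 62.0, 43.0])

var = s.var(ddof=1)                  # sum((x - mean)**2) / (N - ddof)
assert np.isclose(s.std(ddof=1), np.sqrt(var))
assert np.isclose(s.sem(ddof=1), s.std(ddof=1) / np.sqrt(len(s)))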
- @final
- def _stat_function(
- self,
- name: str,
- func,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- if name == "median":
- nv.validate_median((), kwargs)
- else:
- nv.validate_stat_func((), kwargs, fname=name)
-
- validate_bool_kwarg(skipna, "skipna", none_allowed=False)
-
- return self._reduce(
- func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
- )
-
- def min(
- self,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return self._stat_function(
- "min",
- nanops.nanmin,
- axis,
- skipna,
- numeric_only,
- **kwargs,
- )
-
- def max(
- self,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return self._stat_function(
- "max",
- nanops.nanmax,
- axis,
- skipna,
- numeric_only,
- **kwargs,
- )
-
- def mean(
- self,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- return self._stat_function(
- "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
- )
-
- def median(
- self,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- return self._stat_function(
- "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
- )
-
- def skew(
- self,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- return self._stat_function(
- "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
- )
-
- def kurt(
- self,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ) -> Series | float:
- return self._stat_function(
- "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs
- )
-
- kurtosis = kurt
-
- @final
- def _min_count_stat_function(
- self,
- name: str,
- func,
- axis: Axis | None = None,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- min_count: int = 0,
- **kwargs,
- ):
- if name == "sum":
- nv.validate_sum((), kwargs)
- elif name == "prod":
- nv.validate_prod((), kwargs)
- else:
- nv.validate_stat_func((), kwargs, fname=name)
-
- validate_bool_kwarg(skipna, "skipna", none_allowed=False)
-
- if axis is None:
- axis = self._stat_axis_number
-
- return self._reduce(
- func,
- name=name,
- axis=axis,
- skipna=skipna,
- numeric_only=numeric_only,
- min_count=min_count,
- )
-
- def sum(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- min_count: int = 0,
- **kwargs,
- ):
- return self._min_count_stat_function(
- "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs
- )
-
- def prod(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- min_count: int = 0,
- **kwargs,
- ):
- return self._min_count_stat_function(
- "prod",
- nanops.nanprod,
- axis,
- skipna,
- numeric_only,
- min_count,
- **kwargs,
- )
-
- product = prod
-
- @classmethod
- def _add_numeric_operations(cls) -> None:
- """
- Add the operations to the cls; evaluate the doc strings again
- """
- axis_descr, name1, name2 = _doc_params(cls)
-
- @doc(
- _bool_doc,
- desc=_any_desc,
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- see_also=_any_see_also,
- examples=_any_examples,
- empty_value=False,
- )
- def any(
- self,
- *,
- axis: Axis = 0,
- bool_only=None,
- skipna: bool_t = True,
- **kwargs,
- ):
- return NDFrame.any(
- self,
- axis=axis,
- bool_only=bool_only,
- skipna=skipna,
- **kwargs,
- )
-
- setattr(cls, "any", any)
-
- @doc(
- _bool_doc,
- desc=_all_desc,
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- see_also=_all_see_also,
- examples=_all_examples,
- empty_value=True,
- )
- def all(
- self,
- axis: Axis = 0,
- bool_only=None,
- skipna: bool_t = True,
- **kwargs,
- ):
- return NDFrame.all(self, axis, bool_only, skipna, **kwargs)
-
- setattr(cls, "all", all)
-
- @doc(
- _num_ddof_doc,
- desc="Return unbiased standard error of the mean over requested "
- "axis.\n\nNormalized by N-1 by default. This can be changed "
- "using the ddof argument",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- notes="",
- examples="",
- )
- def sem(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- ddof: int = 1,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.sem(self, axis, skipna, ddof, numeric_only, **kwargs)
-
- setattr(cls, "sem", sem)
-
- @doc(
- _num_ddof_doc,
- desc="Return unbiased variance over requested axis.\n\nNormalized by "
- "N-1 by default. This can be changed using the ddof argument.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- notes="",
- examples=_var_examples,
- )
- def var(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- ddof: int = 1,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.var(self, axis, skipna, ddof, numeric_only, **kwargs)
-
- setattr(cls, "var", var)
-
- @doc(
- _num_ddof_doc,
- desc="Return sample standard deviation over requested axis."
- "\n\nNormalized by N-1 by default. This can be changed using the "
- "ddof argument.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- notes=_std_notes,
- examples=_std_examples,
- )
- def std(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- ddof: int = 1,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.std(self, axis, skipna, ddof, numeric_only, **kwargs)
-
- setattr(cls, "std", std)
-
- @doc(
- _cnum_doc,
- desc="minimum",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- accum_func_name="min",
- examples=_cummin_examples,
- )
- def cummin(
- self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
- ):
- return NDFrame.cummin(self, axis, skipna, *args, **kwargs)
-
- setattr(cls, "cummin", cummin)
-
- @doc(
- _cnum_doc,
- desc="maximum",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- accum_func_name="max",
- examples=_cummax_examples,
- )
- def cummax(
- self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
- ):
- return NDFrame.cummax(self, axis, skipna, *args, **kwargs)
-
- setattr(cls, "cummax", cummax)
-
- @doc(
- _cnum_doc,
- desc="sum",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- accum_func_name="sum",
- examples=_cumsum_examples,
- )
- def cumsum(
- self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
- ):
- return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)
-
- setattr(cls, "cumsum", cumsum)
-
- @doc(
- _cnum_doc,
- desc="product",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- accum_func_name="prod",
- examples=_cumprod_examples,
- )
- def cumprod(
- self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
- ):
- return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)
-
- setattr(cls, "cumprod", cumprod)
-
- # error: Untyped decorator makes function "sum" untyped
- @doc( # type: ignore[misc]
- _num_doc,
- desc="Return the sum of the values over the requested axis.\n\n"
- "This is equivalent to the method ``numpy.sum``.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count=_min_count_stub,
- see_also=_stat_func_see_also,
- examples=_sum_examples,
- )
- def sum(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- min_count: int = 0,
- **kwargs,
- ):
- return NDFrame.sum(self, axis, skipna, numeric_only, min_count, **kwargs)
-
- setattr(cls, "sum", sum)
-
- @doc(
- _num_doc,
- desc="Return the product of the values over the requested axis.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count=_min_count_stub,
- see_also=_stat_func_see_also,
- examples=_prod_examples,
- )
- def prod(
- self,
- axis: Axis | None = None,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- min_count: int = 0,
- **kwargs,
- ):
- return NDFrame.prod(self, axis, skipna, numeric_only, min_count, **kwargs)
-
- setattr(cls, "prod", prod)
- cls.product = prod
-
- @doc(
- _num_doc,
- desc="Return the mean of the values over the requested axis.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count="",
- see_also="",
- examples="",
- )
- def mean(
- self,
- axis: AxisInt | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
-
- setattr(cls, "mean", mean)
-
- @doc(
- _num_doc,
- desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count="",
- see_also="",
- examples="",
- )
- def skew(
- self,
- axis: AxisInt | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)
-
- setattr(cls, "skew", skew)
-
- @doc(
- _num_doc,
- desc="Return unbiased kurtosis over requested axis.\n\n"
- "Kurtosis obtained using Fisher's definition of\n"
- "kurtosis (kurtosis of normal == 0.0). Normalized "
- "by N-1.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count="",
- see_also="",
- examples="",
- )
- def kurt(
- self,
- axis: Axis | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.kurt(self, axis, skipna, numeric_only, **kwargs)
-
- setattr(cls, "kurt", kurt)
- cls.kurtosis = kurt
-
- @doc(
- _num_doc,
- desc="Return the median of the values over the requested axis.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count="",
- see_also="",
- examples="",
- )
- def median(
- self,
- axis: AxisInt | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)
-
- setattr(cls, "median", median)
-
- @doc(
- _num_doc,
- desc="Return the maximum of the values over the requested axis.\n\n"
- "If you want the *index* of the maximum, use ``idxmax``. This is "
- "the equivalent of the ``numpy.ndarray`` method ``argmax``.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count="",
- see_also=_stat_func_see_also,
- examples=_max_examples,
- )
- def max(
- self,
- axis: AxisInt | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.max(self, axis, skipna, numeric_only, **kwargs)
-
- setattr(cls, "max", max)
-
- @doc(
- _num_doc,
- desc="Return the minimum of the values over the requested axis.\n\n"
- "If you want the *index* of the minimum, use ``idxmin``. This is "
- "the equivalent of the ``numpy.ndarray`` method ``argmin``.",
- name1=name1,
- name2=name2,
- axis_descr=axis_descr,
- min_count="",
- see_also=_stat_func_see_also,
- examples=_min_examples,
- )
- def min(
- self,
- axis: AxisInt | None = 0,
- skipna: bool_t = True,
- numeric_only: bool_t = False,
- **kwargs,
- ):
- return NDFrame.min(self, axis, skipna, numeric_only, **kwargs)
-
- setattr(cls, "min", min)
-
- @final
- @doc(Rolling)
- def rolling(
- self,
- window: int | dt.timedelta | str | BaseOffset | BaseIndexer,
- min_periods: int | None = None,
- center: bool_t = False,
- win_type: str | None = None,
- on: str | None = None,
- axis: Axis = 0,
- closed: str | None = None,
- step: int | None = None,
- method: str = "single",
- ) -> Window | Rolling:
- axis = self._get_axis_number(axis)
-
- if win_type is not None:
- return Window(
- self,
- window=window,
- min_periods=min_periods,
- center=center,
- win_type=win_type,
- on=on,
- axis=axis,
- closed=closed,
- step=step,
- method=method,
- )
-
- return Rolling(
- self,
- window=window,
- min_periods=min_periods,
- center=center,
- win_type=win_type,
- on=on,
- axis=axis,
- closed=closed,
- step=step,
- method=method,
- )
-
- @final
- @doc(Expanding)
- def expanding(
- self,
- min_periods: int = 1,
- axis: Axis = 0,
- method: str = "single",
- ) -> Expanding:
- axis = self._get_axis_number(axis)
- return Expanding(self, min_periods=min_periods, axis=axis, method=method)
-
- @final
- @doc(ExponentialMovingWindow)
- def ewm(
- self,
- com: float | None = None,
- span: float | None = None,
- halflife: float | TimedeltaConvertibleTypes | None = None,
- alpha: float | None = None,
- min_periods: int | None = 0,
- adjust: bool_t = True,
- ignore_na: bool_t = False,
- axis: Axis = 0,
- times: np.ndarray | DataFrame | Series | None = None,
- method: str = "single",
- ) -> ExponentialMovingWindow:
- axis = self._get_axis_number(axis)
- return ExponentialMovingWindow(
- self,
- com=com,
- span=span,
- halflife=halflife,
- alpha=alpha,
- min_periods=min_periods,
- adjust=adjust,
- ignore_na=ignore_na,
- axis=axis,
- times=times,
- method=method,
- )
-
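These three entry points differ mainly in how the window is defined; a minimal usage sketch (values are illustrative):

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])

s.rolling(window=2).mean()         # fixed-size window:  NaN, 1.5, 2.5, 3.5
s.expanding(min_periods=1).mean()  # growing window:     1.0, 1.5, 2.0, 2.5
s.ewm(span=2).mean()               # exponentially weighted mean (adjust=True)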
- # ----------------------------------------------------------------------
- # Arithmetic Methods
-
- @final
- def _inplace_method(self, other, op):
- """
- Wrap arithmetic method to operate inplace.
- """
- result = op(self, other)
-
- if (
- self.ndim == 1
- and result._indexed_same(self)
- and is_dtype_equal(result.dtype, self.dtype)
- ):
- # GH#36498 this inplace op can _actually_ be inplace.
- # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager,
- # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace"
- self._mgr.setitem_inplace( # type: ignore[union-attr]
- slice(None), result._values
- )
- return self
-
- # Delete cacher
- self._reset_cacher()
-
- # this makes sure that we are aligned like the input
- # we are updating inplace so we want to ignore is_copy
- self._update_inplace(
- result.reindex_like(self, copy=False), verify_is_copy=False
- )
- return self
-
- def __iadd__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for + ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__add__) # type: ignore[operator]
-
- def __isub__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for - ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__sub__) # type: ignore[operator]
-
- def __imul__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for * ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__mul__) # type: ignore[operator]
-
- def __itruediv__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for / ("Type[NDFrame]")
- return self._inplace_method(
- other, type(self).__truediv__ # type: ignore[operator]
- )
-
- def __ifloordiv__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for // ("Type[NDFrame]")
- return self._inplace_method(
- other, type(self).__floordiv__ # type: ignore[operator]
- )
-
- def __imod__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for % ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__mod__) # type: ignore[operator]
-
- def __ipow__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for ** ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__pow__) # type: ignore[operator]
-
- def __iand__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for & ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__and__) # type: ignore[operator]
-
- def __ior__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for | ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__or__) # type: ignore[operator]
-
- def __ixor__(self: NDFrameT, other) -> NDFrameT:
- # error: Unsupported left operand type for ^ ("Type[NDFrame]")
- return self._inplace_method(other, type(self).__xor__) # type: ignore[operator]
-
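At the user level these dunders mean that, on the aligned, dtype-preserving fast path described above, an augmented assignment mutates the existing object instead of rebinding the name to a new one. A minimal sketch:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
original_id = id(s)

s += 1                    # dispatches through __iadd__ -> _inplace_method

assert id(s) == original_id           # same object, updated in place
assert s.tolist() == [2.0, 3.0, 4.0]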
- # ----------------------------------------------------------------------
- # Misc methods
-
- @final
- def _find_valid_index(self, *, how: str) -> Hashable | None:
- """
- Retrieves the index of the first valid value.
-
- Parameters
- ----------
- how : {'first', 'last'}
- Use this parameter to change between the first or last valid index.
-
- Returns
- -------
- idx_first_valid : type of index
- """
- idxpos = find_valid_index(self._values, how=how, is_valid=~isna(self._values))
- if idxpos is None:
- return None
- return self.index[idxpos]
-
- @final
- @doc(position="first", klass=_shared_doc_kwargs["klass"])
- def first_valid_index(self) -> Hashable | None:
- """
- Return index for {position} non-NA value or None, if no non-NA value is found.
-
- Returns
- -------
- type of index
-
- Notes
- -----
- If all elements are non-NA/null, returns None.
- Also returns None for empty {klass}.
- """
- return self._find_valid_index(how="first")
-
- @final
- @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"])
- def last_valid_index(self) -> Hashable | None:
- return self._find_valid_index(how="last")
-
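A quick sketch of the two public helpers built on ``_find_valid_index`` (the series is illustrative):

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 2.0, np.nan, 4.0, np.nan], index=list("abcde"))

assert s.first_valid_index() == "b"   # label of the first non-NA value
assert s.last_valid_index() == "d"    # label of the last non-NA value
assert pd.Series([np.nan, np.nan]).first_valid_index() is None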
-
-def _doc_params(cls):
- """Return a tuple of the doc params."""
- axis_descr = (
- f"{{{', '.join([f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS)])}}}"
- )
- name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
- name2 = cls.__name__
- return axis_descr, name, name2
-
-
-_num_doc = """
-{desc}
-
-Parameters
-----------
-axis : {axis_descr}
- Axis for the function to be applied on.
- For `Series` this parameter is unused and defaults to 0.
-
- For DataFrames, specifying ``axis=None`` will apply the aggregation
- across both axes.
-
- .. versionadded:: 2.0.0
-
-skipna : bool, default True
- Exclude NA/null values when computing the result.
-numeric_only : bool, default False
- Include only float, int, boolean columns. Not implemented for Series.
-
-{min_count}\
-**kwargs
- Additional keyword arguments to be passed to the function.
-
-Returns
--------
-{name1} or scalar\
-{see_also}\
-{examples}
-"""
-
-_num_ddof_doc = """
-{desc}
-
-Parameters
-----------
-axis : {axis_descr}
- For `Series` this parameter is unused and defaults to 0.
-skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
-ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
- where N represents the number of elements.
-numeric_only : bool, default False
- Include only float, int, boolean columns. Not implemented for Series.
-
-Returns
--------
-{name1} or {name2} (if level specified) \
-{notes}\
-{examples}
-"""
-
-_std_notes = """
-
-Notes
------
-To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
-default `ddof=1`)"""
-
-_std_examples = """
-
-Examples
---------
->>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
-... 'age': [21, 25, 62, 43],
-... 'height': [1.61, 1.87, 1.49, 2.01]}
-... ).set_index('person_id')
->>> df
- age height
-person_id
-0 21 1.61
-1 25 1.87
-2 62 1.49
-3 43 2.01
-
-The standard deviation of the columns can be found as follows:
-
->>> df.std()
-age 18.786076
-height 0.237417
-dtype: float64
-
-Alternatively, `ddof=0` can be set to normalize by N instead of N-1:
-
->>> df.std(ddof=0)
-age 16.269219
-height 0.205609
-dtype: float64"""
-
-_var_examples = """
-
-Examples
---------
->>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
-... 'age': [21, 25, 62, 43],
-... 'height': [1.61, 1.87, 1.49, 2.01]}
-... ).set_index('person_id')
->>> df
- age height
-person_id
-0 21 1.61
-1 25 1.87
-2 62 1.49
-3 43 2.01
-
->>> df.var()
-age 352.916667
-height 0.056367
-dtype: float64
-
-Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:
-
->>> df.var(ddof=0)
-age 264.687500
-height 0.042275
-dtype: float64"""
-
-_bool_doc = """
-{desc}
-
-Parameters
-----------
-axis : {{0 or 'index', 1 or 'columns', None}}, default 0
- Indicate which axis or axes should be reduced. For `Series` this parameter
- is unused and defaults to 0.
-
- * 0 / 'index' : reduce the index, return a Series whose index is the
- original column labels.
- * 1 / 'columns' : reduce the columns, return a Series whose index is the
- original index.
- * None : reduce all axes, return a scalar.
-
-bool_only : bool, default None
- Include only boolean columns. If None, will attempt to use everything,
- then use only boolean data. Not implemented for Series.
-skipna : bool, default True
- Exclude NA/null values. If the entire row/column is NA and skipna is
- True, then the result will be {empty_value}, as for an empty row/column.
- If skipna is False, then NA values are treated as True, because these
- are not equal to zero.
-**kwargs : any, default None
- Additional keywords have no effect but might be accepted for
- compatibility with NumPy.
-
-Returns
--------
-{name1} or {name2}
- If level is specified, then, {name2} is returned; otherwise, {name1}
- is returned.
-
-{see_also}
-{examples}"""
-
-_all_desc = """\
-Return whether all elements are True, potentially over an axis.
-
-Returns True unless there is at least one element within a series or
-along a DataFrame axis that is False or equivalent (e.g. zero or
-empty)."""
-
-_all_examples = """\
-Examples
---------
-**Series**
-
->>> pd.Series([True, True]).all()
-True
->>> pd.Series([True, False]).all()
-False
->>> pd.Series([], dtype="float64").all()
-True
->>> pd.Series([np.nan]).all()
-True
->>> pd.Series([np.nan]).all(skipna=False)
-True
-
-**DataFrames**
-
-Create a dataframe from a dictionary.
-
->>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
->>> df
- col1 col2
-0 True True
-1 True False
-
-Default behaviour checks if values in each column all return True.
-
->>> df.all()
-col1 True
-col2 False
-dtype: bool
-
-Specify ``axis='columns'`` to check if values in each row all return True.
-
->>> df.all(axis='columns')
-0 True
-1 False
-dtype: bool
-
-Or ``axis=None`` for whether every value is True.
-
->>> df.all(axis=None)
-False
-"""
-
-_all_see_also = """\
-See Also
---------
-Series.all : Return True if all elements are True.
-DataFrame.any : Return True if one (or more) elements are True.
-"""
-
-_cnum_doc = """
-Return cumulative {desc} over a DataFrame or Series axis.
-
-Returns a DataFrame or Series of the same size containing the cumulative
-{desc}.
-
-Parameters
-----------
-axis : {{0 or 'index', 1 or 'columns'}}, default 0
- The index or the name of the axis. 0 is equivalent to None or 'index'.
- For `Series` this parameter is unused and defaults to 0.
-skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
-*args, **kwargs
- Additional keywords have no effect but might be accepted for
- compatibility with NumPy.
-
-Returns
--------
-{name1} or {name2}
- Return cumulative {desc} of {name1} or {name2}.
-
-See Also
---------
-core.window.expanding.Expanding.{accum_func_name} : Similar functionality
- but ignores ``NaN`` values.
-{name2}.{accum_func_name} : Return the {desc} over
- {name2} axis.
-{name2}.cummax : Return cumulative maximum over {name2} axis.
-{name2}.cummin : Return cumulative minimum over {name2} axis.
-{name2}.cumsum : Return cumulative sum over {name2} axis.
-{name2}.cumprod : Return cumulative product over {name2} axis.
-
-{examples}"""
-
-_cummin_examples = """\
-Examples
---------
-**Series**
-
->>> s = pd.Series([2, np.nan, 5, -1, 0])
->>> s
-0 2.0
-1 NaN
-2 5.0
-3 -1.0
-4 0.0
-dtype: float64
-
-By default, NA values are ignored.
-
->>> s.cummin()
-0 2.0
-1 NaN
-2 2.0
-3 -1.0
-4 -1.0
-dtype: float64
-
-To include NA values in the operation, use ``skipna=False``
-
->>> s.cummin(skipna=False)
-0 2.0
-1 NaN
-2 NaN
-3 NaN
-4 NaN
-dtype: float64
-
-**DataFrame**
-
->>> df = pd.DataFrame([[2.0, 1.0],
-... [3.0, np.nan],
-... [1.0, 0.0]],
-... columns=list('AB'))
->>> df
- A B
-0 2.0 1.0
-1 3.0 NaN
-2 1.0 0.0
-
-By default, iterates over rows and finds the minimum
-in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
-
->>> df.cummin()
- A B
-0 2.0 1.0
-1 2.0 NaN
-2 1.0 0.0
-
-To iterate over columns and find the minimum in each row,
-use ``axis=1``
-
->>> df.cummin(axis=1)
- A B
-0 2.0 1.0
-1 3.0 NaN
-2 1.0 0.0
-"""
-
-_cumsum_examples = """\
-Examples
---------
-**Series**
-
->>> s = pd.Series([2, np.nan, 5, -1, 0])
->>> s
-0 2.0
-1 NaN
-2 5.0
-3 -1.0
-4 0.0
-dtype: float64
-
-By default, NA values are ignored.
-
->>> s.cumsum()
-0 2.0
-1 NaN
-2 7.0
-3 6.0
-4 6.0
-dtype: float64
-
-To include NA values in the operation, use ``skipna=False``
-
->>> s.cumsum(skipna=False)
-0 2.0
-1 NaN
-2 NaN
-3 NaN
-4 NaN
-dtype: float64
-
-**DataFrame**
-
->>> df = pd.DataFrame([[2.0, 1.0],
-... [3.0, np.nan],
-... [1.0, 0.0]],
-... columns=list('AB'))
->>> df
- A B
-0 2.0 1.0
-1 3.0 NaN
-2 1.0 0.0
-
-By default, iterates over rows and finds the sum
-in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
-
->>> df.cumsum()
- A B
-0 2.0 1.0
-1 5.0 NaN
-2 6.0 1.0
-
-To iterate over columns and find the sum in each row,
-use ``axis=1``
-
->>> df.cumsum(axis=1)
- A B
-0 2.0 3.0
-1 3.0 NaN
-2 1.0 1.0
-"""
-
-_cumprod_examples = """\
-Examples
---------
-**Series**
-
->>> s = pd.Series([2, np.nan, 5, -1, 0])
->>> s
-0 2.0
-1 NaN
-2 5.0
-3 -1.0
-4 0.0
-dtype: float64
-
-By default, NA values are ignored.
-
->>> s.cumprod()
-0 2.0
-1 NaN
-2 10.0
-3 -10.0
-4 -0.0
-dtype: float64
-
-To include NA values in the operation, use ``skipna=False``
-
->>> s.cumprod(skipna=False)
-0 2.0
-1 NaN
-2 NaN
-3 NaN
-4 NaN
-dtype: float64
-
-**DataFrame**
-
->>> df = pd.DataFrame([[2.0, 1.0],
-... [3.0, np.nan],
-... [1.0, 0.0]],
-... columns=list('AB'))
->>> df
- A B
-0 2.0 1.0
-1 3.0 NaN
-2 1.0 0.0
-
-By default, iterates over rows and finds the product
-in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
-
->>> df.cumprod()
- A B
-0 2.0 1.0
-1 6.0 NaN
-2 6.0 0.0
-
-To iterate over columns and find the product in each row,
-use ``axis=1``
-
->>> df.cumprod(axis=1)
- A B
-0 2.0 2.0
-1 3.0 NaN
-2 1.0 0.0
-"""
-
-_cummax_examples = """\
-Examples
---------
-**Series**
-
->>> s = pd.Series([2, np.nan, 5, -1, 0])
->>> s
-0 2.0
-1 NaN
-2 5.0
-3 -1.0
-4 0.0
-dtype: float64
-
-By default, NA values are ignored.
-
->>> s.cummax()
-0 2.0
-1 NaN
-2 5.0
-3 5.0
-4 5.0
-dtype: float64
-
-To include NA values in the operation, use ``skipna=False``
-
->>> s.cummax(skipna=False)
-0 2.0
-1 NaN
-2 NaN
-3 NaN
-4 NaN
-dtype: float64
-
-**DataFrame**
-
->>> df = pd.DataFrame([[2.0, 1.0],
-... [3.0, np.nan],
-... [1.0, 0.0]],
-... columns=list('AB'))
->>> df
- A B
-0 2.0 1.0
-1 3.0 NaN
-2 1.0 0.0
-
-By default, iterates over rows and finds the maximum
-in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
-
->>> df.cummax()
- A B
-0 2.0 1.0
-1 3.0 NaN
-2 3.0 1.0
-
-To iterate over columns and find the maximum in each row,
-use ``axis=1``
-
->>> df.cummax(axis=1)
- A B
-0 2.0 2.0
-1 3.0 NaN
-2 1.0 1.0
-"""
-
-_any_see_also = """\
-See Also
---------
-numpy.any : Numpy version of this method.
-Series.any : Return whether any element is True.
-Series.all : Return whether all elements are True.
-DataFrame.any : Return whether any element is True over requested axis.
-DataFrame.all : Return whether all elements are True over requested axis.
-"""
-
-_any_desc = """\
-Return whether any element is True, potentially over an axis.
-
-Returns False unless there is at least one element within a series or
-along a Dataframe axis that is True or equivalent (e.g. non-zero or
-non-empty)."""
-
-_any_examples = """\
-Examples
---------
-**Series**
-
-For Series input, the output is a scalar indicating whether any element
-is True.
-
->>> pd.Series([False, False]).any()
-False
->>> pd.Series([True, False]).any()
-True
->>> pd.Series([], dtype="float64").any()
-False
->>> pd.Series([np.nan]).any()
-False
->>> pd.Series([np.nan]).any(skipna=False)
-True
-
-**DataFrame**
-
-Whether each column contains at least one True element (the default).
-
->>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
->>> df
- A B C
-0 1 0 0
-1 2 2 0
-
->>> df.any()
-A True
-B True
-C False
-dtype: bool
-
-Aggregating over the columns.
-
->>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
->>> df
- A B
-0 True 1
-1 False 2
-
->>> df.any(axis='columns')
-0 True
-1 True
-dtype: bool
-
->>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
->>> df
- A B
-0 True 1
-1 False 0
-
->>> df.any(axis='columns')
-0 True
-1 False
-dtype: bool
-
-Aggregating over the entire DataFrame with ``axis=None``.
-
->>> df.any(axis=None)
-True
-
-`any` for an empty DataFrame is an empty Series.
-
->>> pd.DataFrame([]).any()
-Series([], dtype: bool)
-"""
-
-_shared_docs[
- "stat_func_example"
-] = """
-
-Examples
---------
->>> idx = pd.MultiIndex.from_arrays([
-... ['warm', 'warm', 'cold', 'cold'],
-... ['dog', 'falcon', 'fish', 'spider']],
-... names=['blooded', 'animal'])
->>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
->>> s
-blooded animal
-warm dog 4
- falcon 2
-cold fish 0
- spider 8
-Name: legs, dtype: int64
-
->>> s.{stat_func}()
-{default_output}"""
-
-_sum_examples = _shared_docs["stat_func_example"].format(
- stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
-)
-
-_sum_examples += """
-
-By default, the sum of an empty or all-NA Series is ``0``.
-
->>> pd.Series([], dtype="float64").sum() # min_count=0 is the default
-0.0
-
-This can be controlled with the ``min_count`` parameter. For example, if
-you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
-
->>> pd.Series([], dtype="float64").sum(min_count=1)
-nan
-
-Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
-empty series identically.
-
->>> pd.Series([np.nan]).sum()
-0.0
-
->>> pd.Series([np.nan]).sum(min_count=1)
-nan"""
-
-_max_examples: str = _shared_docs["stat_func_example"].format(
- stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
-)
-
-_min_examples: str = _shared_docs["stat_func_example"].format(
- stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
-)
-
-_stat_func_see_also = """
-
-See Also
---------
-Series.sum : Return the sum.
-Series.min : Return the minimum.
-Series.max : Return the maximum.
-Series.idxmin : Return the index of the minimum.
-Series.idxmax : Return the index of the maximum.
-DataFrame.sum : Return the sum over the requested axis.
-DataFrame.min : Return the minimum over the requested axis.
-DataFrame.max : Return the maximum over the requested axis.
-DataFrame.idxmin : Return the index of the minimum over the requested axis.
-DataFrame.idxmax : Return the index of the maximum over the requested axis."""
-
-_prod_examples = """
-
-Examples
---------
-By default, the product of an empty or all-NA Series is ``1``
-
->>> pd.Series([], dtype="float64").prod()
-1.0
-
-This can be controlled with the ``min_count`` parameter
-
->>> pd.Series([], dtype="float64").prod(min_count=1)
-nan
-
-Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
-empty series identically.
-
->>> pd.Series([np.nan]).prod()
-1.0
-
->>> pd.Series([np.nan]).prod(min_count=1)
-nan"""
-
-_min_count_stub = """\
-min_count : int, default 0
- The required number of valid values to perform the operation. If fewer than
- ``min_count`` non-NA values are present the result will be NA.
-"""
-
-
-def _align_as_utc(
- left: NDFrameT, right: NDFrameT, join_index: Index | None
-) -> tuple[NDFrameT, NDFrameT]:
- """
- If we are aligning timezone-aware DatetimeIndexes and the timezones
- do not match, convert both to UTC.
- """
- if is_datetime64tz_dtype(left.index.dtype):
- if left.index.tz != right.index.tz:
- if join_index is not None:
- # GH#33671 ensure we don't change the index on
- # our original Series (NB: by default deep=False)
- left = left.copy()
- right = right.copy()
- left.index = join_index
- right.index = join_index
-
- return left, right
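A small sketch of the user-visible effect of this helper: aligning two Series whose tz-aware indexes use different timezones leaves both aligned indexes in UTC (timestamps and values are illustrative).

import pandas as pd

left = pd.Series([1, 2], index=pd.date_range("2023-01-01", periods=2, freq="D", tz="US/Eastern"))
right = pd.Series([3, 4], index=pd.date_range("2023-01-01", periods=2, freq="D", tz="Europe/Berlin"))

aligned_left, aligned_right = left.align(right)   # outer join of the two indexes
print(aligned_left.index.tz)                      # both sides end up in UTC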
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/__init__.py b/contrib/python/pandas/py3/pandas/core/groupby/__init__.py
deleted file mode 100644
index 8248f378e2c..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pandas.core.groupby.generic import (
- DataFrameGroupBy,
- NamedAgg,
- SeriesGroupBy,
-)
-from pandas.core.groupby.groupby import GroupBy
-from pandas.core.groupby.grouper import Grouper
-
-__all__ = [
- "DataFrameGroupBy",
- "NamedAgg",
- "SeriesGroupBy",
- "GroupBy",
- "Grouper",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/base.py b/contrib/python/pandas/py3/pandas/core/groupby/base.py
deleted file mode 100644
index 0f6d39be7d3..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/base.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""
-Provide basic components for groupby.
-"""
-from __future__ import annotations
-
-import dataclasses
-from typing import Hashable
-
-
-@dataclasses.dataclass(order=True, frozen=True)
-class OutputKey:
- label: Hashable
- position: int
-
-
-# special case to prevent duplicate plots when catching exceptions when
-# forwarding methods from NDFrames
-plotting_methods = frozenset(["plot", "hist"])
-
-# cythonized transformations or canned "agg+broadcast", which do not
-# require postprocessing of the result by transform.
-cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"])
-
-# List of aggregation/reduction functions.
-# These map each group to a single numeric value
-reduction_kernels = frozenset(
- [
- "all",
- "any",
- "corrwith",
- "count",
- "first",
- "idxmax",
- "idxmin",
- "last",
- "max",
- "mean",
- "median",
- "min",
- "nunique",
- "prod",
- # as long as `quantile`'s signature accepts only
- # a single quantile value, it's a reduction.
- # GH#27526 might change that.
- "quantile",
- "sem",
- "size",
- "skew",
- "std",
- "sum",
- "var",
- ]
-)
-
-# List of transformation functions.
-# a transformation is a function that, for each group,
-# produces a result that has the same shape as the group.
-
-
-transformation_kernels = frozenset(
- [
- "bfill",
- "cumcount",
- "cummax",
- "cummin",
- "cumprod",
- "cumsum",
- "diff",
- "ffill",
- "fillna",
- "ngroup",
- "pct_change",
- "rank",
- "shift",
- ]
-)
-
-# these are all the public methods on Grouper which don't belong
-# in either of the above lists
-groupby_other_methods = frozenset(
- [
- "agg",
- "aggregate",
- "apply",
- "boxplot",
- # corr and cov return ngroups*ncolumns rows, so they
- # are neither a transformation nor a reduction
- "corr",
- "cov",
- "describe",
- "dtypes",
- "expanding",
- "ewm",
- "filter",
- "get_group",
- "groups",
- "head",
- "hist",
- "indices",
- "ndim",
- "ngroups",
- "nth",
- "ohlc",
- "pipe",
- "plot",
- "resample",
- "rolling",
- "tail",
- "take",
- "transform",
- "sample",
- "value_counts",
- ]
-)
-# Valid values of `name` for `groupby.transform(name)`
-# NOTE: do NOT edit this directly. New additions should be inserted
-# into the appropriate list above.
-transform_kernel_allowlist = reduction_kernels | transformation_kernels
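These sets feed the validation of string kernels passed to ``groupby.transform``; a minimal sketch of the two families in use (the frame is illustrative):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 3]})
gb = df.groupby("key")["x"]

gb.transform("cumsum")   # transformation kernel: 1, 3, 3 (same shape as input)
gb.transform("sum")      # reduction kernel, broadcast back to each row: 3, 3, 3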
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/categorical.py b/contrib/python/pandas/py3/pandas/core/groupby/categorical.py
deleted file mode 100644
index 20248cd69bf..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/categorical.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas.core.algorithms import unique1d
-from pandas.core.arrays.categorical import (
- Categorical,
- CategoricalDtype,
- recode_for_categories,
-)
-
-
-def recode_for_groupby(
- c: Categorical, sort: bool, observed: bool
-) -> tuple[Categorical, Categorical | None]:
- """
- Code the categories to ensure we can groupby for categoricals.
-
- If observed=True, we return a new Categorical with the observed
- categories only.
-
- If sort=False, return a copy of self, coded with categories as
- returned by .unique(), followed by any categories not appearing in
- the data. If sort=True, return self.
-
- This method is needed solely to ensure the categorical index of the
- GroupBy result has categories in the order of appearance in the data
- (GH-8868).
-
- Parameters
- ----------
- c : Categorical
- sort : bool
- The value of the sort parameter groupby was called with.
- observed : bool
- Account only for the observed values
-
- Returns
- -------
- Categorical
- If sort=False, the new categories are set to the order of
- appearance in codes (unless ordered=True, in which case the
- original order is preserved), followed by any unrepresented
- categories in the original order.
- Categorical or None
- If we are observed, return the original categorical, otherwise None
- """
- # we only care about observed values
- if observed:
- # In cases with c.ordered, this is equivalent to
- # return c.remove_unused_categories(), c
-
- unique_codes = unique1d(c.codes)
-
- take_codes = unique_codes[unique_codes != -1]
- if sort:
- take_codes = np.sort(take_codes)
-
- # we recode according to the uniques
- categories = c.categories.take(take_codes)
- codes = recode_for_categories(c.codes, c.categories, categories)
-
- # return a new categorical that maps our new codes
- # and categories
- dtype = CategoricalDtype(categories, ordered=c.ordered)
- return Categorical(codes, dtype=dtype, fastpath=True), c
-
- # Already sorted according to c.categories; all is fine
- if sort:
- return c, None
-
- # sort=False should order groups in as-encountered order (GH-8868)
-
- # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
- all_codes = np.arange(c.categories.nunique())
- # GH 38140: exclude nan from indexer for categories
- unique_notnan_codes = unique1d(c.codes[c.codes != -1])
- if sort:
- unique_notnan_codes = np.sort(unique_notnan_codes)
- if len(all_codes) > len(unique_notnan_codes):
- # GH 13179: All categories need to be present, even if missing from the data
- missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
- take_codes = np.concatenate((unique_notnan_codes, missing_codes))
- else:
- take_codes = unique_notnan_codes
-
- return Categorical(c, c.unique().categories.take(take_codes)), None
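A short sketch of the behaviour this helper enables at the groupby level: with ``observed=True`` unused categories are dropped from the result, while ``observed=False`` keeps them as empty groups (data is illustrative).

import pandas as pd

cat = pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"])
df = pd.DataFrame({"cat": cat, "x": [1, 2, 3]})

print(df.groupby("cat", observed=False)["x"].sum())   # includes 'c' (empty group -> 0)
print(df.groupby("cat", observed=True)["x"].sum())    # only the observed 'a' and 'b'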
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/generic.py b/contrib/python/pandas/py3/pandas/core/groupby/generic.py
deleted file mode 100644
index d11a00972b1..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/generic.py
+++ /dev/null
@@ -1,2651 +0,0 @@
-"""
-Define the SeriesGroupBy and DataFrameGroupBy
-classes that hold the groupby interfaces (and some implementations).
-
-These are user facing as the result of ``df.groupby(...)`` operations,
-which here return a DataFrameGroupBy object.
-"""
-from __future__ import annotations
-
-from collections import abc
-from functools import partial
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Iterable,
- Literal,
- Mapping,
- NamedTuple,
- Sequence,
- TypeVar,
- Union,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import (
- Interval,
- lib,
- reduction as libreduction,
-)
-from pandas._typing import (
- ArrayLike,
- Axis,
- AxisInt,
- CorrelationMethod,
- FillnaOptions,
- IndexLabel,
- Manager,
- Manager2D,
- SingleManager,
- TakeIndexer,
-)
-from pandas.errors import SpecificationError
-from pandas.util._decorators import (
- Appender,
- Substitution,
- doc,
-)
-
-from pandas.core.dtypes.common import (
- ensure_int64,
- is_bool,
- is_categorical_dtype,
- is_dict_like,
- is_integer_dtype,
- is_interval_dtype,
- is_numeric_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.missing import (
- isna,
- notna,
-)
-
-from pandas.core import algorithms
-from pandas.core.apply import (
- GroupByApply,
- maybe_mangle_lambdas,
- reconstruct_func,
- validate_func_kwargs,
-)
-import pandas.core.common as com
-from pandas.core.frame import DataFrame
-from pandas.core.groupby import base
-from pandas.core.groupby.groupby import (
- GroupBy,
- GroupByPlot,
- _agg_template,
- _apply_docs,
- _transform_template,
-)
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
- all_indexes_same,
- default_index,
-)
-from pandas.core.series import Series
-from pandas.core.util.numba_ import maybe_use_numba
-
-from pandas.plotting import boxplot_frame_groupby
-
-if TYPE_CHECKING:
- from pandas import Categorical
- from pandas.core.generic import NDFrame
-
-# TODO(typing) the return value on this callable should be any *scalar*.
-AggScalar = Union[str, Callable[..., Any]]
-# TODO: validate types on ScalarResult and move to _typing
-# Blocked from using by https://github.com/python/mypy/issues/1484
-# See note at _mangle_lambda_list
-ScalarResult = TypeVar("ScalarResult")
-
-
-class NamedAgg(NamedTuple):
- """
- Helper for column specific aggregation with control over output column names.
-
- Subclass of typing.NamedTuple.
-
- Parameters
- ----------
- column : Hashable
- Column label in the DataFrame to apply aggfunc.
- aggfunc : function or str
- Function to apply to the provided column. If string, the name of a built-in
- pandas function.
-
- Examples
- --------
- >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
- >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
- >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean)
- >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
- result_a result_1
- key
- 1 -1 10.5
- 2 1 12.0
- """
-
- column: Hashable
- aggfunc: AggScalar
-
-
-class SeriesGroupBy(GroupBy[Series]):
- def _wrap_agged_manager(self, mgr: Manager) -> Series:
- return self.obj._constructor(mgr, name=self.obj.name)
-
- def _get_data_to_aggregate(
- self, *, numeric_only: bool = False, name: str | None = None
- ) -> SingleManager:
- ser = self._selected_obj
- single = ser._mgr
- if numeric_only and not is_numeric_dtype(ser.dtype):
- # GH#41291 match Series behavior
- kwd_name = "numeric_only"
- raise TypeError(
- f"Cannot use {kwd_name}=True with "
- f"{type(self).__name__}.{name} and non-numeric dtypes."
- )
- return single
-
- def _iterate_slices(self) -> Iterable[Series]:
- yield self._selected_obj
-
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4])
-
- >>> s
- 0 1
- 1 2
- 2 3
- 3 4
- dtype: int64
-
- >>> s.groupby([1, 1, 2, 2]).min()
- 1 1
- 2 3
- dtype: int64
-
- >>> s.groupby([1, 1, 2, 2]).agg('min')
- 1 1
- 2 3
- dtype: int64
-
- >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
- min max
- 1 1 2
- 2 3 4
-
- The output column names can be controlled by passing
- the desired column names and aggregations as keyword arguments.
-
- >>> s.groupby([1, 1, 2, 2]).agg(
- ... minimum='min',
- ... maximum='max',
- ... )
- minimum maximum
- 1 1 2
- 2 3 4
-
- .. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the aggregating function.
-
- >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
- 1 1.0
- 2 3.0
- dtype: float64
- """
- )
-
- @Appender(
- _apply_docs["template"].format(
- input="series", examples=_apply_docs["series_examples"]
- )
- )
- def apply(self, func, *args, **kwargs) -> Series:
- return super().apply(func, *args, **kwargs)
-
- @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
- def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
- if maybe_use_numba(engine):
- return self._aggregate_with_numba(
- func, *args, engine_kwargs=engine_kwargs, **kwargs
- )
-
- relabeling = func is None
- columns = None
- if relabeling:
- columns, func = validate_func_kwargs(kwargs)
- kwargs = {}
-
- if isinstance(func, str):
- return getattr(self, func)(*args, **kwargs)
-
- elif isinstance(func, abc.Iterable):
- # Catch instances of lists / tuples
- # but not the class list / tuple itself.
- func = maybe_mangle_lambdas(func)
- ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
- if relabeling:
- # columns is not narrowed by mypy from relabeling flag
- assert columns is not None # for mypy
- ret.columns = columns
- if not self.as_index:
- ret = ret.reset_index()
- return ret
-
- else:
- cyfunc = com.get_cython_func(func)
- if cyfunc and not args and not kwargs:
- return getattr(self, cyfunc)()
-
- if self.ngroups == 0:
- # e.g. test_evaluate_with_empty_groups without any groups to
- # iterate over, we have no output on which to do dtype
- # inference. We default to using the existing dtype.
- # xref GH#51445
- obj = self._obj_with_exclusions
- return self.obj._constructor(
- [],
- name=self.obj.name,
- index=self.grouper.result_index,
- dtype=obj.dtype,
- )
-
- if self.grouper.nkeys > 1:
- return self._python_agg_general(func, *args, **kwargs)
-
- try:
- return self._python_agg_general(func, *args, **kwargs)
- except KeyError:
- # KeyError raised in test_groupby.test_basic is bc the func does
- # a dictionary lookup on group.name, but group name is not
- # pinned in _python_agg_general, only in _aggregate_named
- result = self._aggregate_named(func, *args, **kwargs)
-
- # result is a dict whose keys are the elements of result_index
- result = Series(result, index=self.grouper.result_index)
- result = self._wrap_aggregated_output(result)
- return result
-
- agg = aggregate
-
- def _python_agg_general(self, func, *args, **kwargs):
- func = com.is_builtin_func(func)
- f = lambda x: func(x, *args, **kwargs)
-
- obj = self._obj_with_exclusions
- result = self.grouper.agg_series(obj, f)
- res = obj._constructor(result, name=obj.name)
- return self._wrap_aggregated_output(res)
-
- def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
- if isinstance(arg, dict):
- if self.as_index:
- # GH 15931
- raise SpecificationError("nested renamer is not supported")
- else:
- # GH#50684 - This accidentally worked in 1.x
- arg = list(arg.items())
- elif any(isinstance(x, (tuple, list)) for x in arg):
- arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
- else:
- # list of functions / function names
- columns = []
- for f in arg:
- columns.append(com.get_callable_name(f) or f)
-
- arg = zip(columns, arg)
-
- results: dict[base.OutputKey, DataFrame | Series] = {}
- with com.temp_setattr(self, "as_index", True):
- # Combine results using the index, need to adjust index after
- # if as_index=False (GH#50724)
- for idx, (name, func) in enumerate(arg):
- key = base.OutputKey(label=name, position=idx)
- results[key] = self.aggregate(func, *args, **kwargs)
-
- if any(isinstance(x, DataFrame) for x in results.values()):
- from pandas import concat
-
- res_df = concat(
- results.values(), axis=1, keys=[key.label for key in results]
- )
- return res_df
-
- indexed_output = {key.position: val for key, val in results.items()}
- output = self.obj._constructor_expanddim(indexed_output, index=None)
- output.columns = Index(key.label for key in results)
-
- return output
-
- def _wrap_applied_output(
- self,
- data: Series,
- values: list[Any],
- not_indexed_same: bool = False,
- is_transform: bool = False,
- ) -> DataFrame | Series:
- """
- Wrap the output of SeriesGroupBy.apply into the expected result.
-
- Parameters
- ----------
- data : Series
- Input data for groupby operation.
- values : List[Any]
- Applied output for each group.
- not_indexed_same : bool, default False
- Whether the applied outputs are not indexed the same as the group axes.
-
- Returns
- -------
- DataFrame or Series
- """
- if len(values) == 0:
- # GH #6265
- if is_transform:
- # GH#47787 see test_group_on_empty_multiindex
- res_index = data.index
- else:
- res_index = self.grouper.result_index
-
- return self.obj._constructor(
- [],
- name=self.obj.name,
- index=res_index,
- dtype=data.dtype,
- )
- assert values is not None
-
- if isinstance(values[0], dict):
- # GH #823 #24880
- index = self.grouper.result_index
- res_df = self.obj._constructor_expanddim(values, index=index)
- res_df = self._reindex_output(res_df)
- # if self.observed is False,
- # keep all-NaN rows created while re-indexing
- res_ser = res_df.stack(dropna=self.observed)
- res_ser.name = self.obj.name
- return res_ser
- elif isinstance(values[0], (Series, DataFrame)):
- result = self._concat_objects(
- values,
- not_indexed_same=not_indexed_same,
- is_transform=is_transform,
- )
- if isinstance(result, Series):
- result.name = self.obj.name
- if not self.as_index and not_indexed_same:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
- return result
- else:
- # GH #6265 #24880
- result = self.obj._constructor(
- data=values, index=self.grouper.result_index, name=self.obj.name
- )
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
- return self._reindex_output(result)
-
- def _aggregate_named(self, func, *args, **kwargs):
- # Note: this is very similar to _aggregate_series_pure_python,
- # but that does not pin group.name
- result = {}
- initialized = False
-
- for name, group in self:
- object.__setattr__(group, "name", name)
-
- output = func(group, *args, **kwargs)
- output = libreduction.extract_result(output)
- if not initialized:
- # We only do this validation on the first iteration
- libreduction.check_result_array(output, group.dtype)
- initialized = True
- result[name] = output
-
- return result
-
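The fallback into ``_aggregate_named`` exists so that a UDF can read ``group.name`` (the group key), which is not pinned in ``_python_agg_general``. A minimal sketch of a function that takes that path (the data and lookup table are illustrative):

import pandas as pd

s = pd.Series([1, 2, 3, 4])
bonus = {"odd": 10, "even": 20}

# the lambda indexes into `bonus` with group.name, which only works once the
# group key has been pinned, i.e. on the _aggregate_named fallback above
result = s.groupby(["odd", "even", "odd", "even"]).agg(lambda g: g.sum() + bonus[g.name])
print(result)   # even -> 26, odd -> 14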
- __examples_series_doc = dedent(
- """
- >>> ser = pd.Series(
- ... [390.0, 350.0, 30.0, 20.0],
- ... index=["Falcon", "Falcon", "Parrot", "Parrot"],
- ... name="Max Speed")
- >>> grouped = ser.groupby([1, 1, 2, 2])
- >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
- Falcon 0.707107
- Falcon -0.707107
- Parrot 0.707107
- Parrot -0.707107
- Name: Max Speed, dtype: float64
-
- Broadcast result of the transformation
-
- >>> grouped.transform(lambda x: x.max() - x.min())
- Falcon 40.0
- Falcon 40.0
- Parrot 10.0
- Parrot 10.0
- Name: Max Speed, dtype: float64
-
- >>> grouped.transform("mean")
- Falcon 370.0
- Falcon 370.0
- Parrot 25.0
- Parrot 25.0
- Name: Max Speed, dtype: float64
-
- .. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the passed ``func``,
- for example:
-
- >>> grouped.transform(lambda x: x.astype(int).max())
- Falcon 390
- Falcon 390
- Parrot 30
- Parrot 30
- Name: Max Speed, dtype: int64
- """
- )
-
- @Substitution(klass="Series", example=__examples_series_doc)
- @Appender(_transform_template)
- def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
- return self._transform(
- func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
- )
-
- def _cython_transform(
- self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
- ):
- assert axis == 0 # handled by caller
-
- obj = self._selected_obj
-
- try:
- result = self.grouper._cython_operation(
- "transform", obj._values, how, axis, **kwargs
- )
- except NotImplementedError as err:
- # e.g. test_groupby_raises_string
- raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err
-
- return obj._constructor(result, index=self.obj.index, name=obj.name)
-
- def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
- """
- Transform with a callable ``func``.
- """
- assert callable(func)
- klass = type(self.obj)
-
- results = []
- for name, group in self.grouper.get_iterator(
- self._selected_obj, axis=self.axis
- ):
- # this setattr is needed for test_transform_lambda_with_datetimetz
- object.__setattr__(group, "name", name)
- res = func(group, *args, **kwargs)
-
- results.append(klass(res, index=group.index))
-
- # check for empty "results" to avoid concat ValueError
- if results:
- from pandas.core.reshape.concat import concat
-
- concatenated = concat(results)
- result = self._set_result_index_ordered(concatenated)
- else:
- result = self.obj._constructor(dtype=np.float64)
-
- result.name = self.obj.name
- return result
-
- def filter(self, func, dropna: bool = True, *args, **kwargs):
- """
- Filter elements from groups that don't satisfy a criterion.
-
- Elements from groups are filtered if they do not satisfy the
- boolean criterion specified by func.
-
- Parameters
- ----------
- func : function
- Criterion to apply to each group. Should return True or False.
- dropna : bool
- Drop groups that do not pass the filter. True by default; if False,
- groups that evaluate to False are filled with NaNs.
-
- Returns
- -------
- Series
-
- Notes
- -----
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
- ... 'foo', 'bar'],
- ... 'B' : [1, 2, 3, 4, 5, 6],
- ... 'C' : [2.0, 5., 8., 1., 2., 9.]})
- >>> grouped = df.groupby('A')
- >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
- 1 2
- 3 4
- 5 6
- Name: B, dtype: int64
- """
- if isinstance(func, str):
- wrapper = lambda x: getattr(x, func)(*args, **kwargs)
- else:
- wrapper = lambda x: func(x, *args, **kwargs)
-
- # Interpret np.nan as False.
- def true_and_notna(x) -> bool:
- b = wrapper(x)
- return notna(b) and b
-
- try:
- indices = [
- self._get_index(name) for name, group in self if true_and_notna(group)
- ]
- except (ValueError, TypeError) as err:
- raise TypeError("the filter must return a boolean result") from err
-
- filtered = self._apply_filter(indices, dropna)
- return filtered
-
- def nunique(self, dropna: bool = True) -> Series | DataFrame:
- """
- Return number of unique elements in the group.
-
- Returns
- -------
- Series
- Number of unique values within each group.
- """
- ids, _, _ = self.grouper.group_info
-
- val = self.obj._values
-
- codes, _ = algorithms.factorize(val, sort=False)
- sorter = np.lexsort((codes, ids))
- codes = codes[sorter]
- ids = ids[sorter]
-
- # group boundaries are where group ids change
- # unique observations are where sorted values change
- idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
- inc = np.r_[1, codes[1:] != codes[:-1]]
-
- # 1st item of each group is a new unique observation
- mask = codes == -1
- if dropna:
- inc[idx] = 1
- inc[mask] = 0
- else:
- inc[mask & np.r_[False, mask[:-1]]] = 0
- inc[idx] = 1
-
- out = np.add.reduceat(inc, idx).astype("int64", copy=False)
- if len(ids):
- # NaN/NaT group exists if the head of ids is -1,
- # so remove it from res and exclude its index from idx
- if ids[0] == -1:
- res = out[1:]
- idx = idx[np.flatnonzero(idx)]
- else:
- res = out
- else:
- res = out[1:]
- ri = self.grouper.result_index
-
- # we might have duplications among the bins
- if len(res) != len(ri):
- res, out = np.zeros(len(ri), dtype=out.dtype), res
- if len(ids) > 0:
- # GH#21334
- res[ids[idx]] = out
-
- result: Series | DataFrame = self.obj._constructor(
- res, index=ri, name=self.obj.name
- )
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
- return self._reindex_output(result, fill_value=0)
-
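The lexsort/reduceat bookkeeping in nunique above is easier to follow on plain arrays. A self-contained NumPy sketch of the same counting trick (illustrative only; the sample data is made up and the dropna/NaN handling is omitted):

import numpy as np

ids = np.array([0, 0, 0, 1, 1])      # group code per row
codes = np.array([7, 7, 3, 3, 3])    # factorized value per row

sorter = np.lexsort((codes, ids))    # sort by group, then by value
codes, ids = codes[sorter], ids[sorter]

# group boundaries are where the group ids change
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
# a value is "new" where the sorted codes change; the first row of each
# group is always new
inc = np.r_[1, (codes[1:] != codes[:-1]).astype(np.int64)]
inc[idx] = 1

out = np.add.reduceat(inc, idx)      # unique count per group -> [2, 1]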
- @doc(Series.describe)
- def describe(self, **kwargs):
- return super().describe(**kwargs)
-
- def value_counts(
- self,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- bins=None,
- dropna: bool = True,
- ) -> Series | DataFrame:
- name = "proportion" if normalize else "count"
-
- if bins is None:
- result = self._value_counts(
- normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
- )
- result.name = name
- return result
-
- from pandas.core.reshape.merge import get_join_indexers
- from pandas.core.reshape.tile import cut
-
- ids, _, _ = self.grouper.group_info
- val = self.obj._values
-
- index_names = self.grouper.names + [self.obj.name]
-
- if is_categorical_dtype(val.dtype) or (
- bins is not None and not np.iterable(bins)
- ):
- # scalar bins cannot be done at top level
- # in a backward compatible way
- # GH38672 relates to categorical dtype
- ser = self.apply(
- Series.value_counts,
- normalize=normalize,
- sort=sort,
- ascending=ascending,
- bins=bins,
- )
- ser.name = name
- ser.index.names = index_names
- return ser
-
- # groupby removes null keys from groupings
- mask = ids != -1
- ids, val = ids[mask], val[mask]
-
- if bins is None:
- lab, lev = algorithms.factorize(val, sort=True)
- llab = lambda lab, inc: lab[inc]
- else:
- # lab is a Categorical with categories an IntervalIndex
- cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
- cat_obj = cast("Categorical", cat_ser._values)
- lev = cat_obj.categories
- lab = lev.take(
- cat_obj.codes,
- allow_fill=True,
- fill_value=lev._na_value,
- )
- llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
-
- if is_interval_dtype(lab.dtype):
- # TODO: should we do this inside II?
- lab_interval = cast(Interval, lab)
-
- sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
- else:
- sorter = np.lexsort((lab, ids))
-
- ids, lab = ids[sorter], lab[sorter]
-
- # group boundaries are where group ids change
- idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
- idx = np.r_[0, idchanges]
- if not len(ids):
- idx = idchanges
-
- # new values are where sorted labels change
- lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
- inc = np.r_[True, lchanges]
- if not len(val):
- inc = lchanges
- inc[idx] = True # group boundaries are also new values
- out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
-
- # num. of times each group should be repeated
- rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
-
- # multi-index components
- codes = self.grouper.reconstructed_codes
- codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
- levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
-
- if dropna:
- mask = codes[-1] != -1
- if mask.all():
- dropna = False
- else:
- out, codes = out[mask], [level_codes[mask] for level_codes in codes]
-
- if normalize:
- out = out.astype("float")
- d = np.diff(np.r_[idx, len(ids)])
- if dropna:
- m = ids[lab == -1]
- np.add.at(d, m, -1)
- acc = rep(d)[mask]
- else:
- acc = rep(d)
- out /= acc
-
- if sort and bins is None:
- cat = ids[inc][mask] if dropna else ids[inc]
- sorter = np.lexsort((out if ascending else -out, cat))
- out, codes[-1] = out[sorter], codes[-1][sorter]
-
- if bins is not None:
- # for compat. with libgroupby.value_counts need to ensure every
- # bin is present at every index level, null filled with zeros
- diff = np.zeros(len(out), dtype="bool")
- for level_codes in codes[:-1]:
- diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]
-
- ncat, nbin = diff.sum(), len(levels[-1])
-
- left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]
-
- right = [diff.cumsum() - 1, codes[-1]]
-
- _, idx = get_join_indexers(left, right, sort=False, how="left")
- out = np.where(idx != -1, out[idx], 0)
-
- if sort:
- sorter = np.lexsort((out if ascending else -out, left[0]))
- out, left[-1] = out[sorter], left[-1][sorter]
-
- # build the multi-index w/ full levels
- def build_codes(lev_codes: np.ndarray) -> np.ndarray:
- return np.repeat(lev_codes[diff], nbin)
-
- codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
- codes.append(left[-1])
-
- mi = MultiIndex(
- levels=levels, codes=codes, names=index_names, verify_integrity=False
- )
-
- if is_integer_dtype(out.dtype):
- out = ensure_int64(out)
- result = self.obj._constructor(out, index=mi, name=name)
- if not self.as_index:
- result = result.reset_index()
- return result
-
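SeriesGroupBy.value_counts above carries no docstring of its own; a short usage sketch of the parameters it accepts (illustrative only, made-up data, no outputs shown):

import pandas as pd

ser = pd.Series([1, 1, 2, 3, 3, 3], index=["a", "a", "a", "b", "b", "b"])
gb = ser.groupby(level=0)

counts = gb.value_counts()               # count of each value within each group
props = gb.value_counts(normalize=True)  # proportions instead of counts
binned = gb.value_counts(bins=2)         # bin numeric values before counting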
- def fillna(
- self,
- value: object | ArrayLike | None = None,
- method: FillnaOptions | None = None,
- axis: Axis | None = None,
- inplace: bool = False,
- limit: int | None = None,
- downcast: dict | None = None,
- ) -> Series | None:
- """
- Fill NA/NaN values using the specified method within groups.
-
- Parameters
- ----------
- value : scalar, dict, Series, or DataFrame
- Value to use to fill holes (e.g. 0), alternately a
- dict/Series/DataFrame of values specifying which value to use for
- each index (for a Series) or column (for a DataFrame). Values not
- in the dict/Series/DataFrame will not be filled. This value cannot
- be a list. Users wanting to use the ``value`` argument and not ``method``
- should prefer :meth:`.Series.fillna` as this
- will produce the same result and be more performant.
- method : {{'bfill', 'ffill', None}}, default None
- Method to use for filling holes. ``'ffill'`` will propagate
- the last valid observation forward within a group.
- ``'bfill'`` will use next valid observation to fill the gap.
- axis : {0 or 'index', 1 or 'columns'}
- Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
- inplace : bool, default False
- Broken. Do not set to True.
- limit : int, default None
- If method is specified, this is the maximum number of consecutive
- NaN values to forward/backward fill within a group. In other words,
- if there is a gap with more than this number of consecutive NaNs,
- it will only be partially filled. If method is not specified, this is the
- maximum number of entries along the entire axis where NaNs will be
- filled. Must be greater than 0 if not None.
- downcast : dict, default is None
- A dict of item->dtype of what to downcast if possible,
- or the string 'infer' which will try to downcast to an appropriate
- equal type (e.g. float64 to int64 if possible).
-
- Returns
- -------
- Series
- Object with missing values filled within groups.
-
- See Also
- --------
- ffill : Forward fill values within a group.
- bfill : Backward fill values within a group.
-
- Examples
- --------
- >>> ser = pd.Series([np.nan, np.nan, 2, 3, np.nan, np.nan])
- >>> ser
- 0 NaN
- 1 NaN
- 2 2.0
- 3 3.0
- 4 NaN
- 5 NaN
- dtype: float64
-
- Propagate non-null values forward or backward within each group.
-
- >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill")
- 0 NaN
- 1 NaN
- 2 2.0
- 3 3.0
- 4 3.0
- 5 3.0
- dtype: float64
-
- >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="bfill")
- 0 2.0
- 1 2.0
- 2 2.0
- 3 3.0
- 4 NaN
- 5 NaN
- dtype: float64
-
- Only replace the first NaN element within a group.
-
- >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill", limit=1)
- 0 NaN
- 1 NaN
- 2 2.0
- 3 3.0
- 4 3.0
- 5 NaN
- dtype: float64
- """
- result = self._op_via_apply(
- "fillna",
- value=value,
- method=method,
- axis=axis,
- inplace=inplace,
- limit=limit,
- downcast=downcast,
- )
- return result
-
- def take(
- self,
- indices: TakeIndexer,
- axis: Axis = 0,
- **kwargs,
- ) -> Series:
- """
- Return the elements in the given *positional* indices in each group.
-
- This means that we are not indexing according to actual values in
- the index attribute of the object. We are indexing according to the
- actual position of the element in the object.
-
- If a requested index does not exist for some group, this method will raise.
- To get similar behavior that ignores indices that don't exist, see
- :meth:`.SeriesGroupBy.nth`.
-
- Parameters
- ----------
- indices : array-like
- An array of ints indicating which positions to take in each group.
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- The axis on which to select elements. ``0`` means that we are
- selecting rows, ``1`` means that we are selecting columns.
- For `SeriesGroupBy` this parameter is unused and defaults to 0.
- **kwargs
- For compatibility with :meth:`numpy.take`. Has no effect on the
- output.
-
- Returns
- -------
- Series
- A Series containing the elements taken from each group.
-
- See Also
- --------
- Series.take : Take elements from a Series along an axis.
- Series.loc : Select a subset of a DataFrame by labels.
- Series.iloc : Select a subset of a DataFrame by positions.
- numpy.take : Take elements from an array along an axis.
- SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.
-
- Examples
- --------
- >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
- ... ('parrot', 'bird', 24.0),
- ... ('lion', 'mammal', 80.5),
- ... ('monkey', 'mammal', np.nan),
- ... ('rabbit', 'mammal', 15.0)],
- ... columns=['name', 'class', 'max_speed'],
- ... index=[4, 3, 2, 1, 0])
- >>> df
- name class max_speed
- 4 falcon bird 389.0
- 3 parrot bird 24.0
- 2 lion mammal 80.5
- 1 monkey mammal NaN
- 0 rabbit mammal 15.0
- >>> gb = df["name"].groupby([1, 1, 2, 2, 2])
-
- Take elements at positions 0 and 1 along the axis 0 in each group (default).
-
- >>> gb.take([0, 1])
- 1 4 falcon
- 3 parrot
- 2 2 lion
- 1 monkey
- Name: name, dtype: object
-
- We may take elements using negative integers for positive indices,
- starting from the end of the object, just like with Python lists.
-
- >>> gb.take([-1, -2])
- 1 3 parrot
- 4 falcon
- 2 0 rabbit
- 1 monkey
- Name: name, dtype: object
- """
- result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
- return result
-
- def skew(
- self,
- axis: Axis | lib.NoDefault = lib.no_default,
- skipna: bool = True,
- numeric_only: bool = False,
- **kwargs,
- ) -> Series:
- """
- Return unbiased skew within groups.
-
- Normalized by N-1.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Axis for the function to be applied on.
- This parameter is only for compatibility with DataFrame and is unused.
-
- skipna : bool, default True
- Exclude NA/null values when computing the result.
-
- numeric_only : bool, default False
- Include only float, int, boolean columns. Not implemented for Series.
-
- **kwargs
- Additional keyword arguments to be passed to the function.
-
- Returns
- -------
- Series
-
- See Also
- --------
- Series.skew : Return unbiased skew over requested axis.
-
- Examples
- --------
- >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
- ... index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
- ... 'Parrot', 'Parrot', 'Parrot'],
- ... name="Max Speed")
- >>> ser
- Falcon 390.0
- Falcon 350.0
- Falcon 357.0
- Falcon NaN
- Parrot 22.0
- Parrot 20.0
- Parrot 30.0
- Name: Max Speed, dtype: float64
- >>> ser.groupby(level=0).skew()
- Falcon 1.525174
- Parrot 1.457863
- Name: Max Speed, dtype: float64
- >>> ser.groupby(level=0).skew(skipna=False)
- Falcon NaN
- Parrot 1.457863
- Name: Max Speed, dtype: float64
- """
- result = self._op_via_apply(
- "skew",
- axis=axis,
- skipna=skipna,
- numeric_only=numeric_only,
- **kwargs,
- )
- return result
-
- @property
- @doc(Series.plot.__doc__)
- def plot(self):
- result = GroupByPlot(self)
- return result
-
- @doc(Series.nlargest.__doc__)
- def nlargest(
- self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
- ) -> Series:
- f = partial(Series.nlargest, n=n, keep=keep)
- data = self._selected_obj
- # Don't change behavior if result index happens to be the same, i.e.
- # already ordered and n >= all group sizes.
- result = self._python_apply_general(f, data, not_indexed_same=True)
- return result
-
- @doc(Series.nsmallest.__doc__)
- def nsmallest(
- self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
- ) -> Series:
- f = partial(Series.nsmallest, n=n, keep=keep)
- data = self._selected_obj
- # Don't change behavior if result index happens to be the same, i.e.
- # already ordered and n >= all group sizes.
- result = self._python_apply_general(f, data, not_indexed_same=True)
- return result
-
- @doc(Series.idxmin.__doc__)
- def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
- result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
- return result
-
- @doc(Series.idxmax.__doc__)
- def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
- result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
- return result
-
- @doc(Series.corr.__doc__)
- def corr(
- self,
- other: Series,
- method: CorrelationMethod = "pearson",
- min_periods: int | None = None,
- ) -> Series:
- result = self._op_via_apply(
- "corr", other=other, method=method, min_periods=min_periods
- )
- return result
-
- @doc(Series.cov.__doc__)
- def cov(
- self, other: Series, min_periods: int | None = None, ddof: int | None = 1
- ) -> Series:
- result = self._op_via_apply(
- "cov", other=other, min_periods=min_periods, ddof=ddof
- )
- return result
-
- @property
- @doc(Series.is_monotonic_increasing.__doc__)
- def is_monotonic_increasing(self) -> Series:
- return self.apply(lambda ser: ser.is_monotonic_increasing)
-
- @property
- @doc(Series.is_monotonic_decreasing.__doc__)
- def is_monotonic_decreasing(self) -> Series:
- return self.apply(lambda ser: ser.is_monotonic_decreasing)
-
- @doc(Series.hist.__doc__)
- def hist(
- self,
- by=None,
- ax=None,
- grid: bool = True,
- xlabelsize: int | None = None,
- xrot: float | None = None,
- ylabelsize: int | None = None,
- yrot: float | None = None,
- figsize: tuple[int, int] | None = None,
- bins: int | Sequence[int] = 10,
- backend: str | None = None,
- legend: bool = False,
- **kwargs,
- ):
- result = self._op_via_apply(
- "hist",
- by=by,
- ax=ax,
- grid=grid,
- xlabelsize=xlabelsize,
- xrot=xrot,
- ylabelsize=ylabelsize,
- yrot=yrot,
- figsize=figsize,
- bins=bins,
- backend=backend,
- legend=legend,
- **kwargs,
- )
- return result
-
- @property
- @doc(Series.dtype.__doc__)
- def dtype(self) -> Series:
- return self.apply(lambda ser: ser.dtype)
-
- @doc(Series.unique.__doc__)
- def unique(self) -> Series:
- result = self._op_via_apply("unique")
- return result
-
-
-class DataFrameGroupBy(GroupBy[DataFrame]):
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "A": [1, 1, 2, 2],
- ... "B": [1, 2, 3, 4],
- ... "C": [0.362838, 0.227877, 1.267767, -0.562860],
- ... }
- ... )
-
- >>> df
- A B C
- 0 1 1 0.362838
- 1 1 2 0.227877
- 2 2 3 1.267767
- 3 2 4 -0.562860
-
- The aggregation is for each column.
-
- >>> df.groupby('A').agg('min')
- B C
- A
- 1 1 0.227877
- 2 3 -0.562860
-
- Multiple aggregations
-
- >>> df.groupby('A').agg(['min', 'max'])
- B C
- min max min max
- A
- 1 1 2 0.227877 0.362838
- 2 3 4 -0.562860 1.267767
-
- Select a column for aggregation
-
- >>> df.groupby('A').B.agg(['min', 'max'])
- min max
- A
- 1 1 2
- 2 3 4
-
- User-defined function for aggregation
-
- >>> df.groupby('A').agg(lambda x: sum(x) + 2)
- B C
- A
- 1 5 2.590715
- 2 9 2.704907
-
- Different aggregations per column
-
- >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
- B C
- min max sum
- A
- 1 1 2 0.590715
- 2 3 4 0.704907
-
- To control the output names with different aggregations per column,
- pandas supports "named aggregation"
-
- >>> df.groupby("A").agg(
- ... b_min=pd.NamedAgg(column="B", aggfunc="min"),
- ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
- b_min c_sum
- A
- 1 1 0.590715
- 2 3 0.704907
-
- - The keywords are the *output* column names
- - The values are tuples whose first element is the column to select
- and the second element is the aggregation to apply to that column.
- Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
- ``['column', 'aggfunc']`` to make it clearer what the arguments are.
- As usual, the aggregation can be a callable or a string alias.
-
- See :ref:`groupby.aggregate.named` for more.
-
- .. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the aggregating function.
-
- >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
- B
- A
- 1 1.0
- 2 3.0
- """
- )
-
- @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
- def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
- if maybe_use_numba(engine):
- return self._aggregate_with_numba(
- func, *args, engine_kwargs=engine_kwargs, **kwargs
- )
-
- relabeling, func, columns, order = reconstruct_func(func, **kwargs)
- func = maybe_mangle_lambdas(func)
-
- op = GroupByApply(self, func, args, kwargs)
- result = op.agg()
- if not is_dict_like(func) and result is not None:
- return result
- elif relabeling:
- # this should be the only (non-raising) case with relabeling
- # used reordered index of columns
- result = cast(DataFrame, result)
- result = result.iloc[:, order]
- result = cast(DataFrame, result)
- # error: Incompatible types in assignment (expression has type
- # "Optional[List[str]]", variable has type
- # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
- # Index, Series], Sequence[Any]]")
- result.columns = columns # type: ignore[assignment]
-
- if result is None:
- # grouper specific aggregations
- if self.grouper.nkeys > 1:
- # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
- return self._python_agg_general(func, *args, **kwargs)
- elif args or kwargs:
- # test_pass_args_kwargs gets here (with and without as_index)
- # can't return early
- result = self._aggregate_frame(func, *args, **kwargs)
-
- elif self.axis == 1:
- # _aggregate_multiple_funcs does not allow self.axis == 1
- # Note: axis == 1 precludes 'not self.as_index', see __init__
- result = self._aggregate_frame(func)
- return result
-
- else:
- # try to treat as if we are passing a list
- gba = GroupByApply(self, [func], args=(), kwargs={})
- try:
- result = gba.agg()
-
- except ValueError as err:
- if "No objects to concatenate" not in str(err):
- raise
- # _aggregate_frame can fail with e.g. func=Series.mode,
- # where it expects 1D values but would be getting 2D values
- # In other tests, using aggregate_frame instead of GroupByApply
- # would give correct values but incorrect dtypes
- # object vs float64 in test_cython_agg_empty_buckets
- # float64 vs int64 in test_category_order_apply
- result = self._aggregate_frame(func)
-
- else:
- # GH#32040, GH#35246
- # e.g. test_groupby_as_index_select_column_sum_empty_df
- result = cast(DataFrame, result)
- result.columns = self._obj_with_exclusions.columns.copy()
-
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
-
- return result
-
- agg = aggregate
-
- def _python_agg_general(self, func, *args, **kwargs):
- func = com.is_builtin_func(func)
- f = lambda x: func(x, *args, **kwargs)
-
- # iterate through "columns" (excluding exclusions) to populate output dict
- output: dict[base.OutputKey, ArrayLike] = {}
-
- if self.ngroups == 0:
- # e.g. test_evaluate_with_empty_groups different path gets different
- # result dtype in empty case.
- return self._python_apply_general(f, self._selected_obj, is_agg=True)
-
- for idx, obj in enumerate(self._iterate_slices()):
- name = obj.name
- result = self.grouper.agg_series(obj, f)
- key = base.OutputKey(label=name, position=idx)
- output[key] = result
-
- if not output:
- # e.g. test_margins_no_values_no_cols
- return self._python_apply_general(f, self._selected_obj)
-
- res = self._indexed_output_to_ndframe(output)
- return self._wrap_aggregated_output(res)
-
- def _iterate_slices(self) -> Iterable[Series]:
- obj = self._selected_obj
- if self.axis == 1:
- obj = obj.T
-
- if isinstance(obj, Series) and obj.name not in self.exclusions:
- # Occurs when doing DataFrameGroupBy(...)["X"]
- yield obj
- else:
- for label, values in obj.items():
- if label in self.exclusions:
- # Note: if we tried to just iterate over _obj_with_exclusions,
- # we would break test_wrap_agg_out by yielding a column
- # that is skipped here but not dropped from obj_with_exclusions
- continue
-
- yield values
-
- def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
- if self.grouper.nkeys != 1:
- raise AssertionError("Number of keys must be 1")
-
- obj = self._obj_with_exclusions
-
- result: dict[Hashable, NDFrame | np.ndarray] = {}
- for name, grp_df in self.grouper.get_iterator(obj, self.axis):
- fres = func(grp_df, *args, **kwargs)
- result[name] = fres
-
- result_index = self.grouper.result_index
- other_ax = obj.axes[1 - self.axis]
- out = self.obj._constructor(result, index=other_ax, columns=result_index)
- if self.axis == 0:
- out = out.T
-
- return out
-
- def _wrap_applied_output(
- self,
- data: DataFrame,
- values: list,
- not_indexed_same: bool = False,
- is_transform: bool = False,
- ):
- if len(values) == 0:
- if is_transform:
- # GH#47787 see test_group_on_empty_multiindex
- res_index = data.index
- else:
- res_index = self.grouper.result_index
-
- result = self.obj._constructor(index=res_index, columns=data.columns)
- result = result.astype(data.dtypes, copy=False)
- return result
-
- # GH12824
- # using values[0] here breaks test_groupby_apply_none_first
- first_not_none = next(com.not_none(*values), None)
-
- if first_not_none is None:
- # GH9684 - All values are None, return an empty frame.
- return self.obj._constructor()
- elif isinstance(first_not_none, DataFrame):
- return self._concat_objects(
- values,
- not_indexed_same=not_indexed_same,
- is_transform=is_transform,
- )
-
- key_index = self.grouper.result_index if self.as_index else None
-
- if isinstance(first_not_none, (np.ndarray, Index)):
- # GH#1738: values is list of arrays of unequal lengths
- # fall through to the outer else clause
- # TODO: sure this is right? we used to do this
- # after raising AttributeError above
- return self.obj._constructor_sliced(
- values, index=key_index, name=self._selection
- )
- elif not isinstance(first_not_none, Series):
- # values are not series or array-like but scalars
- # self._selection not passed through to Series as the
- # result should not take the name of original selection
- # of columns
- if self.as_index:
- return self.obj._constructor_sliced(values, index=key_index)
- else:
- result = self.obj._constructor(values, columns=[self._selection])
- result = self._insert_inaxis_grouper(result)
- return result
- else:
- # values are Series
- return self._wrap_applied_output_series(
- values,
- not_indexed_same,
- first_not_none,
- key_index,
- is_transform,
- )
-
- def _wrap_applied_output_series(
- self,
- values: list[Series],
- not_indexed_same: bool,
- first_not_none,
- key_index: Index | None,
- is_transform: bool,
- ) -> DataFrame | Series:
- kwargs = first_not_none._construct_axes_dict()
- backup = Series(**kwargs)
- values = [x if (x is not None) else backup for x in values]
-
- all_indexed_same = all_indexes_same(x.index for x in values)
-
- if not all_indexed_same:
- # GH 8467
- return self._concat_objects(
- values,
- not_indexed_same=True,
- is_transform=is_transform,
- )
-
- # Combine values
- # vstack+constructor is faster than concat and handles MI-columns
- stacked_values = np.vstack([np.asarray(v) for v in values])
-
- if self.axis == 0:
- index = key_index
- columns = first_not_none.index.copy()
- if columns.name is None:
- # GH6124 - propagate name of Series when it's consistent
- names = {v.name for v in values}
- if len(names) == 1:
- columns.name = list(names)[0]
- else:
- index = first_not_none.index
- columns = key_index
- stacked_values = stacked_values.T
-
- if stacked_values.dtype == object:
- # We'll have the DataFrame constructor do inference
- stacked_values = stacked_values.tolist()
- result = self.obj._constructor(stacked_values, index=index, columns=columns)
-
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
-
- return self._reindex_output(result)
-
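The vstack-plus-constructor combination used above (noted as faster than concat) reduces to the following on plain objects; a sketch with made-up data, not taken from the removed file:

import numpy as np
import pandas as pd

# one Series result per group, all sharing the same index (the future columns)
values = [
    pd.Series([1.0, 2.0], index=["x", "y"]),
    pd.Series([3.0, 4.0], index=["x", "y"]),
]
key_index = pd.Index(["g1", "g2"], name="key")

stacked = np.vstack([np.asarray(v) for v in values])
result = pd.DataFrame(stacked, index=key_index, columns=values[0].index)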
- def _cython_transform(
- self,
- how: str,
- numeric_only: bool = False,
- axis: AxisInt = 0,
- **kwargs,
- ) -> DataFrame:
- assert axis == 0 # handled by caller
-
- # With self.axis == 0, we have multi-block tests
- # e.g. test_rank_min_int, test_cython_transform_frame
- # test_transform_numeric_ret
- # With self.axis == 1, _get_data_to_aggregate does a transpose
- # so we always have a single block.
- mgr: Manager2D = self._get_data_to_aggregate(
- numeric_only=numeric_only, name=how
- )
-
- def arr_func(bvalues: ArrayLike) -> ArrayLike:
- return self.grouper._cython_operation(
- "transform", bvalues, how, 1, **kwargs
- )
-
- # We could use `mgr.apply` here and not have to set_axis, but
- # we would have to do shape gymnastics for ArrayManager compat
- res_mgr = mgr.grouped_reduce(arr_func)
- res_mgr.set_axis(1, mgr.axes[1])
-
- res_df = self.obj._constructor(res_mgr)
- res_df = self._maybe_transpose_result(res_df)
- return res_df
-
- def _transform_general(self, func, *args, **kwargs):
- from pandas.core.reshape.concat import concat
-
- applied = []
- obj = self._obj_with_exclusions
- gen = self.grouper.get_iterator(obj, axis=self.axis)
- fast_path, slow_path = self._define_paths(func, *args, **kwargs)
-
- # Determine whether to use slow or fast path by evaluating on the first group.
- # Need to handle the case of an empty generator and process the result so that
- # it does not need to be computed again.
- try:
- name, group = next(gen)
- except StopIteration:
- pass
- else:
- object.__setattr__(group, "name", name)
- try:
- path, res = self._choose_path(fast_path, slow_path, group)
- except ValueError as err:
- # e.g. test_transform_with_non_scalar_group
- msg = "transform must return a scalar value for each group"
- raise ValueError(msg) from err
- if group.size > 0:
- res = _wrap_transform_general_frame(self.obj, group, res)
- applied.append(res)
-
- # Compute and process with the remaining groups
- for name, group in gen:
- if group.size == 0:
- continue
- object.__setattr__(group, "name", name)
- res = path(group)
-
- res = _wrap_transform_general_frame(self.obj, group, res)
- applied.append(res)
-
- concat_index = obj.columns if self.axis == 0 else obj.index
- other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1
- concatenated = concat(applied, axis=self.axis, verify_integrity=False)
- concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
- return self._set_result_index_ordered(concatenated)
-
- __examples_dataframe_doc = dedent(
- """
- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
- ... 'foo', 'bar'],
- ... 'B' : ['one', 'one', 'two', 'three',
- ... 'two', 'two'],
- ... 'C' : [1, 5, 5, 2, 5, 5],
- ... 'D' : [2.0, 5., 8., 1., 2., 9.]})
- >>> grouped = df.groupby('A')[['C', 'D']]
- >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
- C D
- 0 -1.154701 -0.577350
- 1 0.577350 0.000000
- 2 0.577350 1.154701
- 3 -1.154701 -1.000000
- 4 0.577350 -0.577350
- 5 0.577350 1.000000
-
- Broadcast result of the transformation
-
- >>> grouped.transform(lambda x: x.max() - x.min())
- C D
- 0 4.0 6.0
- 1 3.0 8.0
- 2 4.0 6.0
- 3 3.0 8.0
- 4 4.0 6.0
- 5 3.0 8.0
-
- >>> grouped.transform("mean")
- C D
- 0 3.666667 4.0
- 1 4.000000 5.0
- 2 3.666667 4.0
- 3 4.000000 5.0
- 4 3.666667 4.0
- 5 4.000000 5.0
-
- .. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the passed ``func``,
- for example:
-
- >>> grouped.transform(lambda x: x.astype(int).max())
- C D
- 0 5 8
- 1 5 9
- 2 5 8
- 3 5 9
- 4 5 8
- 5 5 9
- """
- )
-
- @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
- @Appender(_transform_template)
- def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
- return self._transform(
- func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
- )
-
- def _define_paths(self, func, *args, **kwargs):
- if isinstance(func, str):
- fast_path = lambda group: getattr(group, func)(*args, **kwargs)
- slow_path = lambda group: group.apply(
- lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
- )
- else:
- fast_path = lambda group: func(group, *args, **kwargs)
- slow_path = lambda group: group.apply(
- lambda x: func(x, *args, **kwargs), axis=self.axis
- )
- return fast_path, slow_path
-
- def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
- path = slow_path
- res = slow_path(group)
-
- if self.ngroups == 1:
- # no need to evaluate multiple paths when only
- # a single group exists
- return path, res
-
- # if we make it here, test if we can use the fast path
- try:
- res_fast = fast_path(group)
- except AssertionError:
- raise # pragma: no cover
- except Exception:
- # GH#29631 For user-defined function, we can't predict what may be
- # raised; see test_transform.test_transform_fastpath_raises
- return path, res
-
- # verify fast path returns either:
- # a DataFrame with columns equal to group.columns
- # OR a Series with index equal to group.columns
- if isinstance(res_fast, DataFrame):
- if not res_fast.columns.equals(group.columns):
- return path, res
- elif isinstance(res_fast, Series):
- if not res_fast.index.equals(group.columns):
- return path, res
- else:
- return path, res
-
- if res_fast.equals(res):
- path = fast_path
-
- return path, res
-
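_define_paths/_choose_path above implement a try-the-fast-path-once pattern: evaluate both paths on the first group, and keep the fast one only if it raises nothing and reproduces the slow result. A reduced sketch of the same idea (illustrative; `fast`, `slow` and the sample frame are hypothetical stand-ins, not pandas internals):

import pandas as pd

def choose_path(fast, slow, first_group: pd.DataFrame):
    res = slow(first_group)              # the slow path is always trusted
    try:
        res_fast = fast(first_group)     # a UDF on the fast path may raise anything
    except Exception:
        return slow, res
    # keep the fast path only when it matches the slow result exactly
    if (
        isinstance(res_fast, pd.DataFrame)
        and res_fast.columns.equals(first_group.columns)
        and res_fast.equals(res)
    ):
        return fast, res
    return slow, res

df = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
fast = lambda g: g.cumsum()
slow = lambda g: g.apply(lambda col: col.cumsum())
path, first_res = choose_path(fast, slow, df)   # picks `fast` here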
- def filter(self, func, dropna: bool = True, *args, **kwargs):
- """
- Filter elements from groups that don't satisfy a criterion.
-
- Elements from groups are filtered if they do not satisfy the
- boolean criterion specified by func.
-
- Parameters
- ----------
- func : function
- Criterion to apply to each group. Should return True or False.
- dropna : bool
- Drop groups that do not pass the filter. True by default; if False,
- groups that evaluate to False are filled with NaNs.
-
- Returns
- -------
- DataFrame
-
- Notes
- -----
- Each subframe is endowed with the attribute 'name' in case you need to know
- which group you are working on.
-
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
- ... 'foo', 'bar'],
- ... 'B' : [1, 2, 3, 4, 5, 6],
- ... 'C' : [2.0, 5., 8., 1., 2., 9.]})
- >>> grouped = df.groupby('A')
- >>> grouped.filter(lambda x: x['B'].mean() > 3.)
- A B C
- 1 bar 2 5.0
- 3 bar 4 1.0
- 5 bar 6 9.0
- """
- indices = []
-
- obj = self._selected_obj
- gen = self.grouper.get_iterator(obj, axis=self.axis)
-
- for name, group in gen:
- object.__setattr__(group, "name", name)
-
- res = func(group, *args, **kwargs)
-
- try:
- res = res.squeeze()
- except AttributeError: # allow e.g., scalars and frames to pass
- pass
-
- # interpret the result of the filter
- if is_bool(res) or (is_scalar(res) and isna(res)):
- if notna(res) and res:
- indices.append(self._get_index(name))
- else:
- # non scalars aren't allowed
- raise TypeError(
- f"filter function returned a {type(res).__name__}, "
- "but expected a scalar bool"
- )
-
- return self._apply_filter(indices, dropna)
-
- def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
- if self.axis == 1:
- # GH 37725
- raise ValueError("Cannot subset columns when using axis=1")
- # per GH 23566
- if isinstance(key, tuple) and len(key) > 1:
- # if len == 1, then it becomes a SeriesGroupBy and this is actually
- # valid syntax, so don't raise
- raise ValueError(
- "Cannot subset columns with a tuple with more than one element. "
- "Use a list instead."
- )
- return super().__getitem__(key)
-
- def _gotitem(self, key, ndim: int, subset=None):
- """
- Sub-classes to define; return a sliced object.
-
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- if ndim == 2:
- if subset is None:
- subset = self.obj
- return DataFrameGroupBy(
- subset,
- self.grouper,
- axis=self.axis,
- level=self.level,
- grouper=self.grouper,
- exclusions=self.exclusions,
- selection=key,
- as_index=self.as_index,
- sort=self.sort,
- group_keys=self.group_keys,
- observed=self.observed,
- dropna=self.dropna,
- )
- elif ndim == 1:
- if subset is None:
- subset = self.obj[key]
- return SeriesGroupBy(
- subset,
- level=self.level,
- grouper=self.grouper,
- exclusions=self.exclusions,
- selection=key,
- as_index=self.as_index,
- sort=self.sort,
- group_keys=self.group_keys,
- observed=self.observed,
- dropna=self.dropna,
- )
-
- raise AssertionError("invalid ndim for _gotitem")
-
- def _get_data_to_aggregate(
- self, *, numeric_only: bool = False, name: str | None = None
- ) -> Manager2D:
- obj = self._obj_with_exclusions
- if self.axis == 1:
- mgr = obj.T._mgr
- else:
- mgr = obj._mgr
-
- if numeric_only:
- mgr = mgr.get_numeric_data(copy=False)
- return mgr
-
- def _indexed_output_to_ndframe(
- self, output: Mapping[base.OutputKey, ArrayLike]
- ) -> DataFrame:
- """
- Wrap the dict result of a GroupBy aggregation into a DataFrame.
- """
- indexed_output = {key.position: val for key, val in output.items()}
- columns = Index([key.label for key in output])
- columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
-
- result = self.obj._constructor(indexed_output)
- result.columns = columns
- return result
-
- def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
- return self.obj._constructor(mgr)
-
- def _iterate_column_groupbys(self, obj: DataFrame):
- for i, colname in enumerate(obj.columns):
- yield colname, SeriesGroupBy(
- obj.iloc[:, i],
- selection=colname,
- grouper=self.grouper,
- exclusions=self.exclusions,
- observed=self.observed,
- )
-
- def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame:
- from pandas.core.reshape.concat import concat
-
- columns = obj.columns
- results = [
- func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
- ]
-
- if not len(results):
- # concat would raise
- return DataFrame([], columns=columns, index=self.grouper.result_index)
- else:
- return concat(results, keys=columns, axis=1)
-
- def nunique(self, dropna: bool = True) -> DataFrame:
- """
- Return DataFrame with counts of unique elements in each position.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't include NaN in the counts.
-
- Returns
- -------
- nunique: DataFrame
-
- Examples
- --------
- >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
- ... 'ham', 'ham'],
- ... 'value1': [1, 5, 5, 2, 5, 5],
- ... 'value2': list('abbaxy')})
- >>> df
- id value1 value2
- 0 spam 1 a
- 1 egg 5 b
- 2 egg 5 b
- 3 spam 2 a
- 4 ham 5 x
- 5 ham 5 y
-
- >>> df.groupby('id').nunique()
- value1 value2
- id
- egg 1 1
- ham 1 2
- spam 2 1
-
- Check for rows with the same id but conflicting values:
-
- >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
- id value1 value2
- 0 spam 1 a
- 3 spam 2 a
- 4 ham 5 x
- 5 ham 5 y
- """
-
- if self.axis != 0:
- # see test_groupby_crash_on_nunique
- return self._python_apply_general(
- lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
- )
-
- obj = self._obj_with_exclusions
- results = self._apply_to_column_groupbys(
- lambda sgb: sgb.nunique(dropna), obj=obj
- )
-
- if not self.as_index:
- results.index = default_index(len(results))
- results = self._insert_inaxis_grouper(results)
-
- return results
-
- def idxmax(
- self,
- axis: Axis | None = None,
- skipna: bool = True,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Return index of first occurrence of maximum over requested axis.
-
- NA/null values are excluded.
-
- Parameters
- ----------
- axis : {{0 or 'index', 1 or 'columns'}}, default None
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- If axis is not provided, grouper's axis is used.
-
- .. versionchanged:: 2.0.0
-
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- Series
- Indexes of maxima along the specified axis.
-
- Raises
- ------
- ValueError
- * If the row/column is empty
-
- See Also
- --------
- Series.idxmax : Return index of the maximum element.
-
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmax``.
-
- Examples
- --------
- Consider a dataset containing food consumption in Argentina.
-
- >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
- ... 'co2_emissions': [37.2, 19.66, 1712]},
- ... index=['Pork', 'Wheat Products', 'Beef'])
-
- >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
-
- By default, it returns the index for the maximum value in each column.
-
- >>> df.idxmax()
- consumption Wheat Products
- co2_emissions Beef
- dtype: object
-
- To return the index for the maximum value in each row, use ``axis="columns"``.
-
- >>> df.idxmax(axis="columns")
- Pork co2_emissions
- Wheat Products consumption
- Beef co2_emissions
- dtype: object
- """
- if axis is None:
- axis = self.axis
-
- def func(df):
- return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)
-
- func.__name__ = "idxmax"
- result = self._python_apply_general(
- func, self._obj_with_exclusions, not_indexed_same=True
- )
- return result
-
- def idxmin(
- self,
- axis: Axis | None = None,
- skipna: bool = True,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Return index of first occurrence of minimum over requested axis.
-
- NA/null values are excluded.
-
- Parameters
- ----------
- axis : {{0 or 'index', 1 or 'columns'}}, default None
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- If axis is not provided, grouper's axis is used.
-
- .. versionchanged:: 2.0.0
-
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- Series
- Indexes of minima along the specified axis.
-
- Raises
- ------
- ValueError
- * If the row/column is empty
-
- See Also
- --------
- Series.idxmin : Return index of the minimum element.
-
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmin``.
-
- Examples
- --------
- Consider a dataset containing food consumption in Argentina.
-
- >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
- ... 'co2_emissions': [37.2, 19.66, 1712]},
- ... index=['Pork', 'Wheat Products', 'Beef'])
-
- >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
-
- By default, it returns the index for the minimum value in each column.
-
- >>> df.idxmin()
- consumption Pork
- co2_emissions Wheat Products
- dtype: object
-
- To return the index for the minimum value in each row, use ``axis="columns"``.
-
- >>> df.idxmin(axis="columns")
- Pork consumption
- Wheat Products co2_emissions
- Beef consumption
- dtype: object
- """
- if axis is None:
- axis = self.axis
-
- def func(df):
- return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)
-
- func.__name__ = "idxmin"
- result = self._python_apply_general(
- func, self._obj_with_exclusions, not_indexed_same=True
- )
- return result
-
- boxplot = boxplot_frame_groupby
-
- def value_counts(
- self,
- subset: Sequence[Hashable] | None = None,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- dropna: bool = True,
- ) -> DataFrame | Series:
- """
- Return a Series or DataFrame containing counts of unique rows.
-
- .. versionadded:: 1.4.0
-
- Parameters
- ----------
- subset : list-like, optional
- Columns to use when counting unique combinations.
- normalize : bool, default False
- Return proportions rather than frequencies.
- sort : bool, default True
- Sort by frequencies.
- ascending : bool, default False
- Sort in ascending order.
- dropna : bool, default True
- Don’t include counts of rows that contain NA values.
-
- Returns
- -------
- Series or DataFrame
- Series if the groupby as_index is True, otherwise DataFrame.
-
- See Also
- --------
- Series.value_counts: Equivalent method on Series.
- DataFrame.value_counts: Equivalent method on DataFrame.
- SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.
-
- Notes
- -----
- - If the groupby as_index is True then the returned Series will have a
- MultiIndex with one level per input column.
- - If the groupby as_index is False then the returned DataFrame will have an
- additional column with the value_counts. The column is labelled 'count' or
- 'proportion', depending on the ``normalize`` parameter.
-
- By default, rows that contain any NA values are omitted from
- the result.
-
- By default, the result will be in descending order so that the
- first element of each group is the most frequently-occurring row.
-
- Examples
- --------
- >>> df = pd.DataFrame({
- ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
- ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
- ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
- ... })
-
- >>> df
- gender education country
- 0 male low US
- 1 male medium FR
- 2 female high US
- 3 male low FR
- 4 female high FR
- 5 male low FR
-
- >>> df.groupby('gender').value_counts()
- gender education country
- female high FR 1
- US 1
- male low FR 2
- US 1
- medium FR 1
- Name: count, dtype: int64
-
- >>> df.groupby('gender').value_counts(ascending=True)
- gender education country
- female high FR 1
- US 1
- male low US 1
- medium FR 1
- low FR 2
- Name: count, dtype: int64
-
- >>> df.groupby('gender').value_counts(normalize=True)
- gender education country
- female high FR 0.50
- US 0.50
- male low FR 0.50
- US 0.25
- medium FR 0.25
- Name: proportion, dtype: float64
-
- >>> df.groupby('gender', as_index=False).value_counts()
- gender education country count
- 0 female high FR 1
- 1 female high US 1
- 2 male low FR 2
- 3 male low US 1
- 4 male medium FR 1
-
- >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
- gender education country proportion
- 0 female high FR 0.50
- 1 female high US 0.50
- 2 male low FR 0.50
- 3 male low US 0.25
- 4 male medium FR 0.25
- """
- return self._value_counts(subset, normalize, sort, ascending, dropna)
-
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = None,
- method: FillnaOptions | None = None,
- axis: Axis | None = None,
- inplace: bool = False,
- limit=None,
- downcast=None,
- ) -> DataFrame | None:
- """
- Fill NA/NaN values using the specified method within groups.
-
- Parameters
- ----------
- value : scalar, dict, Series, or DataFrame
- Value to use to fill holes (e.g. 0), alternately a
- dict/Series/DataFrame of values specifying which value to use for
- each index (for a Series) or column (for a DataFrame). Values not
- in the dict/Series/DataFrame will not be filled. This value cannot
- be a list. Users wanting to use the ``value`` argument and not ``method``
- should prefer :meth:`.DataFrame.fillna` as this
- will produce the same result and be more performant.
- method : {{'bfill', 'ffill', None}}, default None
- Method to use for filling holes. ``'ffill'`` will propagate
- the last valid observation forward within a group.
- ``'bfill'`` will use next valid observation to fill the gap.
- axis : {0 or 'index', 1 or 'columns'}
- Axis along which to fill missing values. When the :class:`DataFrameGroupBy`
- ``axis`` argument is ``0``, using ``axis=1`` here will produce
- the same results as :meth:`.DataFrame.fillna`. When the
- :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
- or ``axis=1`` here will produce the same results.
- inplace : bool, default False
- Broken. Do not set to True.
- limit : int, default None
- If method is specified, this is the maximum number of consecutive
- NaN values to forward/backward fill within a group. In other words,
- if there is a gap with more than this number of consecutive NaNs,
- it will only be partially filled. If method is not specified, this is the
- maximum number of entries along the entire axis where NaNs will be
- filled. Must be greater than 0 if not None.
- downcast : dict, default is None
- A dict of item->dtype of what to downcast if possible,
- or the string 'infer' which will try to downcast to an appropriate
- equal type (e.g. float64 to int64 if possible).
-
- Returns
- -------
- DataFrame
- Object with missing values filled.
-
- See Also
- --------
- ffill : Forward fill values within a group.
- bfill : Backward fill values within a group.
-
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "key": [0, 0, 1, 1, 1],
- ... "A": [np.nan, 2, np.nan, 3, np.nan],
- ... "B": [2, 3, np.nan, np.nan, np.nan],
- ... "C": [np.nan, np.nan, 2, np.nan, np.nan],
- ... }
- ... )
- >>> df
- key A B C
- 0 0 NaN 2.0 NaN
- 1 0 2.0 3.0 NaN
- 2 1 NaN NaN 2.0
- 3 1 3.0 NaN NaN
- 4 1 NaN NaN NaN
-
- Propagate non-null values forward or backward within each group along columns.
-
- >>> df.groupby("key").fillna(method="ffill")
- A B C
- 0 NaN 2.0 NaN
- 1 2.0 3.0 NaN
- 2 NaN NaN 2.0
- 3 3.0 NaN 2.0
- 4 3.0 NaN 2.0
-
- >>> df.groupby("key").fillna(method="bfill")
- A B C
- 0 2.0 2.0 NaN
- 1 2.0 3.0 NaN
- 2 3.0 NaN 2.0
- 3 3.0 NaN NaN
- 4 NaN NaN NaN
-
- Propagate non-null values forward or backward within each group along rows.
-
- >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill")
- key A B C
- 0 0.0 0.0 2.0 2.0
- 1 0.0 2.0 3.0 3.0
- 2 1.0 1.0 NaN 2.0
- 3 1.0 3.0 NaN NaN
- 4 1.0 1.0 NaN NaN
-
- >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill")
- key A B C
- 0 0.0 NaN 2.0 NaN
- 1 0.0 2.0 3.0 NaN
- 2 1.0 NaN 2.0 2.0
- 3 1.0 3.0 NaN NaN
- 4 1.0 NaN NaN NaN
-
- Only replace the first NaN element within a group along rows.
-
- >>> df.groupby("key").fillna(method="ffill", limit=1)
- A B C
- 0 NaN 2.0 NaN
- 1 2.0 3.0 NaN
- 2 NaN NaN 2.0
- 3 3.0 NaN 2.0
- 4 3.0 NaN NaN
- """
- result = self._op_via_apply(
- "fillna",
- value=value,
- method=method,
- axis=axis,
- inplace=inplace,
- limit=limit,
- downcast=downcast,
- )
- return result
-
- def take(
- self,
- indices: TakeIndexer,
- axis: Axis | None = 0,
- **kwargs,
- ) -> DataFrame:
- """
- Return the elements in the given *positional* indices in each group.
-
- This means that we are not indexing according to actual values in
- the index attribute of the object. We are indexing according to the
- actual position of the element in the object.
-
- If a requested index does not exist for some group, this method will raise.
- To get similar behavior that ignores indices that don't exist, see
- :meth:`.DataFrameGroupBy.nth`.
-
- Parameters
- ----------
- indices : array-like
- An array of ints indicating which positions to take.
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- The axis on which to select elements. ``0`` means that we are
- selecting rows, ``1`` means that we are selecting columns.
- **kwargs
- For compatibility with :meth:`numpy.take`. Has no effect on the
- output.
-
- Returns
- -------
- DataFrame
- A DataFrame containing the elements taken from each group.
-
- See Also
- --------
- DataFrame.take : Take elements from a DataFrame along an axis.
- DataFrame.loc : Select a subset of a DataFrame by labels.
- DataFrame.iloc : Select a subset of a DataFrame by positions.
- numpy.take : Take elements from an array along an axis.
-
- Examples
- --------
- >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
- ... ('parrot', 'bird', 24.0),
- ... ('lion', 'mammal', 80.5),
- ... ('monkey', 'mammal', np.nan),
- ... ('rabbit', 'mammal', 15.0)],
- ... columns=['name', 'class', 'max_speed'],
- ... index=[4, 3, 2, 1, 0])
- >>> df
- name class max_speed
- 4 falcon bird 389.0
- 3 parrot bird 24.0
- 2 lion mammal 80.5
- 1 monkey mammal NaN
- 0 rabbit mammal 15.0
- >>> gb = df.groupby([1, 1, 2, 2, 2])
-
- Take elements at positions 0 and 1 along the axis 0 (default).
-
- Note how the indices selected in the result do not correspond to
- our input indices 0 and 1. That's because we are selecting the 0th
- and 1st rows, not rows whose indices equal 0 and 1.
-
- >>> gb.take([0, 1])
- name class max_speed
- 1 4 falcon bird 389.0
- 3 parrot bird 24.0
- 2 2 lion mammal 80.5
- 1 monkey mammal NaN
-
- The order of the specified indices influences the order in the result.
- Here, the order is swapped from the previous example.
-
- >>> gb.take([1, 0])
- name class max_speed
- 1 3 parrot bird 24.0
- 4 falcon bird 389.0
- 2 1 monkey mammal NaN
- 2 lion mammal 80.5
-
- We may take elements using negative integers for positive indices,
- starting from the end of the object, just like with Python lists.
-
- >>> gb.take([-1, -2])
- name class max_speed
- 1 3 parrot bird 24.0
- 4 falcon bird 389.0
- 2 0 rabbit mammal 15.0
- 1 monkey mammal NaN
- """
- result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
- return result
-
- def skew(
- self,
- axis: Axis | None | lib.NoDefault = lib.no_default,
- skipna: bool = True,
- numeric_only: bool = False,
- **kwargs,
- ) -> DataFrame:
- """
- Return unbiased skew within groups.
-
- Normalized by N-1.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Axis for the function to be applied on.
-
- Specifying ``axis=None`` will apply the aggregation across both axes.
-
- .. versionadded:: 2.0.0
-
- skipna : bool, default True
- Exclude NA/null values when computing the result.
-
- numeric_only : bool, default False
- Include only float, int, boolean columns.
-
- **kwargs
- Additional keyword arguments to be passed to the function.
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- DataFrame.skew : Return unbiased skew over requested axis.
-
- Examples
- --------
- >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
- ... 'lion', 'monkey', 'rabbit'],
- ... ['bird', 'bird', 'bird', 'bird',
- ... 'mammal', 'mammal', 'mammal']]
- >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
- >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
- ... 80.5, 21.5, 15.0]},
- ... index=index)
- >>> df
- max_speed
- name class
- falcon bird 389.0
- parrot bird 24.0
- cockatoo bird 70.0
- kiwi bird NaN
- lion mammal 80.5
- monkey mammal 21.5
- rabbit mammal 15.0
- >>> gb = df.groupby(["class"])
- >>> gb.skew()
- max_speed
- class
- bird 1.628296
- mammal 1.669046
- >>> gb.skew(skipna=False)
- max_speed
- class
- bird NaN
- mammal 1.669046
- """
- result = self._op_via_apply(
- "skew",
- axis=axis,
- skipna=skipna,
- numeric_only=numeric_only,
- **kwargs,
- )
- return result
-
- @property
- @doc(DataFrame.plot.__doc__)
- def plot(self) -> GroupByPlot:
- result = GroupByPlot(self)
- return result
-
- @doc(DataFrame.corr.__doc__)
- def corr(
- self,
- method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
- min_periods: int = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- result = self._op_via_apply(
- "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
- )
- return result
-
- @doc(DataFrame.cov.__doc__)
- def cov(
- self,
- min_periods: int | None = None,
- ddof: int | None = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- result = self._op_via_apply(
- "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
- )
- return result
-
- @doc(DataFrame.hist.__doc__)
- def hist(
- self,
- column: IndexLabel = None,
- by=None,
- grid: bool = True,
- xlabelsize: int | None = None,
- xrot: float | None = None,
- ylabelsize: int | None = None,
- yrot: float | None = None,
- ax=None,
- sharex: bool = False,
- sharey: bool = False,
- figsize: tuple[int, int] | None = None,
- layout: tuple[int, int] | None = None,
- bins: int | Sequence[int] = 10,
- backend: str | None = None,
- legend: bool = False,
- **kwargs,
- ):
- result = self._op_via_apply(
- "hist",
- column=column,
- by=by,
- grid=grid,
- xlabelsize=xlabelsize,
- xrot=xrot,
- ylabelsize=ylabelsize,
- yrot=yrot,
- ax=ax,
- sharex=sharex,
- sharey=sharey,
- figsize=figsize,
- layout=layout,
- bins=bins,
- backend=backend,
- legend=legend,
- **kwargs,
- )
- return result
-
- @property
- @doc(DataFrame.dtypes.__doc__)
- def dtypes(self) -> Series:
- # error: Incompatible return value type (got "DataFrame", expected "Series")
- return self.apply(lambda df: df.dtypes) # type: ignore[return-value]
-
- @doc(DataFrame.corrwith.__doc__)
- def corrwith(
- self,
- other: DataFrame | Series,
- axis: Axis = 0,
- drop: bool = False,
- method: CorrelationMethod = "pearson",
- numeric_only: bool = False,
- ) -> DataFrame:
- result = self._op_via_apply(
- "corrwith",
- other=other,
- axis=axis,
- drop=drop,
- method=method,
- numeric_only=numeric_only,
- )
- return result
-
-
-def _wrap_transform_general_frame(
- obj: DataFrame, group: DataFrame, res: DataFrame | Series
-) -> DataFrame:
- from pandas import concat
-
- if isinstance(res, Series):
- # we need to broadcast across the
- # other dimension; this will preserve dtypes
- # GH14457
- if res.index.is_(obj.index):
- res_frame = concat([res] * len(group.columns), axis=1)
- res_frame.columns = group.columns
- res_frame.index = group.index
- else:
- res_frame = obj._constructor(
- np.tile(res.values, (len(group.index), 1)),
- columns=group.columns,
- index=group.index,
- )
- assert isinstance(res_frame, DataFrame)
- return res_frame
- elif isinstance(res, DataFrame) and not res.index.is_(group.index):
- return res._align_frame(group)[0]
- else:
- return res
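As a quick illustration of the broadcasting branch above (a hedged sketch; the frame and its labels are invented for this example): when the transform UDF returns a per-group Series that is not indexed like the group, the values are tiled across the group's rows and relabelled with the group's columns and index.

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["x", "x", "y"], "B": [1, 2, 3], "C": [4, 5, 6]})
    >>> df.groupby("A")[["B", "C"]].transform(lambda g: g.sum())  # Series per group
       B  C
    0  3  9
    1  3  9
    2  3  6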
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/groupby.py b/contrib/python/pandas/py3/pandas/core/groupby/groupby.py
deleted file mode 100644
index 42b7fd9b635..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/groupby.py
+++ /dev/null
@@ -1,4292 +0,0 @@
-"""
-Provide the groupby split-apply-combine paradigm. Define the GroupBy
-class providing the base-class of operations.
-
-The SeriesGroupBy and DataFrameGroupBy sub-class
-(defined in pandas.core.groupby.generic)
-expose these user-facing objects to provide specific functionality.
-"""
-from __future__ import annotations
-
-import datetime
-from functools import (
- partial,
- wraps,
-)
-import inspect
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- Iterable,
- Iterator,
- List,
- Literal,
- Mapping,
- Sequence,
- TypeVar,
- Union,
- cast,
- final,
-)
-import warnings
-
-import numpy as np
-
-from pandas._config.config import option_context
-
-from pandas._libs import (
- Timestamp,
- lib,
-)
-from pandas._libs.algos import rank_1d
-import pandas._libs.groupby as libgroupby
-from pandas._libs.missing import NA
-from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- Axis,
- AxisInt,
- DtypeObj,
- FillnaOptions,
- IndexLabel,
- NDFrameT,
- PositionalIndexer,
- RandomState,
- Scalar,
- T,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import (
- AbstractMethodError,
- DataError,
-)
-from pandas.util._decorators import (
- Appender,
- Substitution,
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
- is_hashable,
- is_integer,
- is_integer_dtype,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.missing import (
- isna,
- notna,
-)
-
-from pandas.core import (
- algorithms,
- sample,
-)
-from pandas.core._numba import executor
-from pandas.core.arrays import (
- BaseMaskedArray,
- BooleanArray,
- Categorical,
- DatetimeArray,
- ExtensionArray,
- FloatingArray,
- TimedeltaArray,
-)
-from pandas.core.base import (
- PandasObject,
- SelectionMixin,
-)
-import pandas.core.common as com
-from pandas.core.frame import DataFrame
-from pandas.core.generic import NDFrame
-from pandas.core.groupby import (
- base,
- numba_,
- ops,
-)
-from pandas.core.groupby.grouper import get_grouper
-from pandas.core.groupby.indexing import (
- GroupByIndexingMixin,
- GroupByNthSelector,
-)
-from pandas.core.indexes.api import (
- CategoricalIndex,
- Index,
- MultiIndex,
- RangeIndex,
- default_index,
-)
-from pandas.core.internals.blocks import ensure_block_shape
-from pandas.core.series import Series
-from pandas.core.sorting import get_group_index_sorter
-from pandas.core.util.numba_ import (
- get_jit_arguments,
- maybe_use_numba,
-)
-
-if TYPE_CHECKING:
- from pandas.core.window import (
- ExpandingGroupby,
- ExponentialMovingWindowGroupby,
- RollingGroupby,
- )
-
-_common_see_also = """
- See Also
- --------
- Series.%(name)s : Apply a function %(name)s to a Series.
- DataFrame.%(name)s : Apply a function %(name)s
- to each row or column of a DataFrame.
-"""
-
-_apply_docs = {
- "template": """
- Apply function ``func`` group-wise and combine the results together.
-
- The function passed to ``apply`` must take a {input} as its first
- argument and return a DataFrame, Series or scalar. ``apply`` will
- then take care of combining the results back together into a single
- dataframe or series. ``apply`` is therefore a highly flexible
- grouping method.
-
- While ``apply`` is a very flexible method, its downside is that
- using it can be quite a bit slower than using more specific methods
- like ``agg`` or ``transform``. Pandas offers a wide range of methods that will
- be much faster than using ``apply`` for their specific purposes, so try to
- use them before reaching for ``apply``.
-
- Parameters
- ----------
- func : callable
- A callable that takes a {input} as its first argument, and
- returns a dataframe, a series or a scalar. In addition the
- callable may take positional and keyword arguments.
- args, kwargs : tuple and dict
- Optional positional and keyword arguments to pass to ``func``.
-
- Returns
- -------
- Series or DataFrame
-
- See Also
- --------
- pipe : Apply function to the full GroupBy object instead of to each
- group.
- aggregate : Apply aggregate function to the GroupBy object.
- transform : Apply function column-by-column to the GroupBy object.
- Series.apply : Apply a function to a Series.
- DataFrame.apply : Apply a function to each row or column of a DataFrame.
-
- Notes
- -----
-
- .. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the passed ``func``,
- see the examples below.
-
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
-
- Examples
- --------
- {examples}
- """,
- "dataframe_examples": """
- >>> df = pd.DataFrame({'A': 'a a b'.split(),
- ... 'B': [1,2,3],
- ... 'C': [4,6,5]})
- >>> g1 = df.groupby('A', group_keys=False)
- >>> g2 = df.groupby('A', group_keys=True)
-
- Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
- differ in their ``group_keys`` argument. Calling `apply` in various ways,
- we can get different grouping results:
-
- Example 1: below the function passed to `apply` takes a DataFrame as
- its argument and returns a DataFrame. `apply` combines the result for
- each group together into a new DataFrame:
-
- >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
- B C
- 0 0.333333 0.4
- 1 0.666667 0.6
- 2 1.000000 1.0
-
- In the above, the groups are not part of the index. We can have them included
- by using ``g2`` where ``group_keys=True``:
-
- >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
- B C
- A
- a 0 0.333333 0.4
- 1 0.666667 0.6
- b 2 1.000000 1.0
-
- Example 2: The function passed to `apply` takes a DataFrame as
- its argument and returns a Series. `apply` combines the result for
- each group together into a new DataFrame.
-
- .. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the passed ``func``.
-
- >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
- B C
- A
- a 1.0 2.0
- b 0.0 0.0
-
- >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
- B C
- A
- a 1.0 2.0
- b 0.0 0.0
-
- The ``group_keys`` argument has no effect here because the result is not
- like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
- to the input.
-
- Example 3: The function passed to `apply` takes a DataFrame as
- its argument and returns a scalar. `apply` combines the result for
- each group together into a Series, including setting the index as
- appropriate:
-
- >>> g1.apply(lambda x: x.C.max() - x.B.min())
- A
- a 5
- b 2
- dtype: int64""",
- "series_examples": """
- >>> s = pd.Series([0, 1, 2], index='a a b'.split())
- >>> g1 = s.groupby(s.index, group_keys=False)
- >>> g2 = s.groupby(s.index, group_keys=True)
-
- From ``s`` above we can see that ``g1`` and ``g2`` have two groups,
- ``a`` and ``b``, and only
- differ in their ``group_keys`` argument. Calling `apply` in various ways,
- we can get different grouping results:
-
- Example 1: The function passed to `apply` takes a Series as
- its argument and returns a Series. `apply` combines the result for
- each group together into a new Series.
-
- .. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the passed ``func``.
-
- >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
- a 0.0
- a 2.0
- b 1.0
- dtype: float64
-
- In the above, the groups are not part of the index. We can have them included
- by using ``g2`` where ``group_keys=True``:
-
- >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
- a a 0.0
- a 2.0
- b b 1.0
- dtype: float64
-
- Example 2: The function passed to `apply` takes a Series as
- its argument and returns a scalar. `apply` combines the result for
- each group together into a Series, including setting the index as
- appropriate:
-
- >>> g1.apply(lambda x: x.max() - x.min())
- a 1
- b 0
- dtype: int64
-
- The ``group_keys`` argument has no effect here because the result is not
- like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
- to the input.
-
- >>> g2.apply(lambda x: x.max() - x.min())
- a 1
- b 0
- dtype: int64""",
-}
-
-_groupby_agg_method_template = """
-Compute {fname} of group values.
-
-Parameters
-----------
-numeric_only : bool, default {no}
- Include only float, int, boolean columns.
-
- .. versionchanged:: 2.0.0
-
- numeric_only no longer accepts ``None``.
-
-min_count : int, default {mc}
- The required number of valid values to perform the operation. If fewer
- than ``min_count`` non-NA values are present the result will be NA.
-
-Returns
--------
-Series or DataFrame
- Computed {fname} of values within each group.
-"""
-
-_pipe_template = """
-Apply a ``func`` with arguments to this %(klass)s object and return its result.
-
-Use `.pipe` when you want to improve readability by chaining together
-functions that expect Series, DataFrames, GroupBy or Resampler objects.
-Instead of writing
-
->>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP
-
-You can write
-
->>> (df.groupby('group')
-... .pipe(f)
-... .pipe(g, arg1=a)
-... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP
-
-which is much more readable.
-
-Parameters
-----------
-func : callable or tuple of (callable, str)
- Function to apply to this %(klass)s object or, alternatively,
- a `(callable, data_keyword)` tuple where `data_keyword` is a
- string indicating the keyword of `callable` that expects the
- %(klass)s object.
-args : iterable, optional
- Positional arguments passed into `func`.
-kwargs : dict, optional
- A dictionary of keyword arguments passed into `func`.
-
-Returns
--------
-the return type of `func`.
-
-See Also
---------
-Series.pipe : Apply a function with arguments to a series.
-DataFrame.pipe: Apply a function with arguments to a dataframe.
-apply : Apply function to each group instead of to the
- full %(klass)s object.
-
-Notes
------
-See more `here
-<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_
-
-Examples
---------
-%(examples)s
-"""
-
-_transform_template = """
-Call function producing a same-indexed %(klass)s on each group.
-
-Returns a %(klass)s having the same indexes as the original object
-filled with the transformed values.
-
-Parameters
-----------
-f : function, str
- Function to apply to each group. See the Notes section below for requirements.
-
- Accepted inputs are:
-
- - String
- - Python function
- - Numba JIT function with ``engine='numba'`` specified.
-
- Only passing a single function is supported with this engine.
- If the ``'numba'`` engine is chosen, the function must be
- a user defined function with ``values`` and ``index`` as the
- first and second arguments respectively in the function signature.
- Each group's index will be passed to the user defined function
- and optionally available for use.
-
- If a string is chosen, then it needs to be the name
- of the groupby method you want to use.
-
- .. versionchanged:: 1.1.0
-*args
- Positional arguments to pass to func.
-engine : str, default None
- * ``'cython'`` : Runs the function through C-extensions from cython.
- * ``'numba'`` : Runs the function through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``
-
- .. versionadded:: 1.1.0
-engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
- applied to the function
-
- .. versionadded:: 1.1.0
-**kwargs
- Keyword arguments to be passed into func.
-
-Returns
--------
-%(klass)s
-
-See Also
---------
-%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
- the results together.
-%(klass)s.groupby.aggregate : Aggregate using one or more
- operations over the specified axis.
-%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
- same axis shape as self.
-
-Notes
------
-Each group is endowed with the attribute 'name' in case you need to know
-which group you are working on.
-
-The current implementation imposes three requirements on f:
-
-* f must return a value that either has the same shape as the input
- subframe or can be broadcast to the shape of the input subframe.
- For example, if `f` returns a scalar it will be broadcast to have the
- same shape as the input subframe.
-* if this is a DataFrame, f must support application column-by-column
- in the subframe. If f also supports application to the entire subframe,
- then a fast path is used starting from the second chunk.
-* f must not mutate groups. Mutation is not supported and may
- produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.
-
-When using ``engine='numba'``, there will be no "fall back" behavior internally.
-The group data and group index will be passed as numpy arrays to the JITed
-user defined function, and no alternative execution attempts will be tried.
-
-.. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the passed ``func``,
- see the examples below.
-
-.. versionchanged:: 2.0.0
-
- When using ``.transform`` on a grouped DataFrame and the transformation function
- returns a DataFrame, pandas now aligns the result's index
- with the input's index. You can call ``.to_numpy()`` on the
- result of the transformation function to avoid alignment.
-
-Examples
---------
-%(example)s"""
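For reference, a minimal sketch of the reduction-broadcast behaviour described above (the column names are illustrative only): a string reduction such as ``"mean"`` is computed per group and broadcast back to the original index.

    >>> import pandas as pd
    >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 4]})
    >>> df.groupby("key")["val"].transform("mean")
    0    1.5
    1    1.5
    2    4.0
    Name: val, dtype: float64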
-
-_agg_template = """
-Aggregate using one or more operations over the specified axis.
-
-Parameters
-----------
-func : function, str, list, dict or None
- Function to use for aggregating the data. If a function, must either
- work when passed a {klass} or when passed to {klass}.apply.
-
- Accepted combinations are:
-
- - function
- - string function name
- - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
- - dict of axis labels -> functions, function names or list of such.
- - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
- output has one column for each element in ``**kwargs``. The name of the
- column is the keyword, whereas the value determines the aggregation used to compute
- the values in the column.
-
- Can also accept a Numba JIT function with
- ``engine='numba'`` specified. Only passing a single function is supported
- with this engine.
-
- If the ``'numba'`` engine is chosen, the function must be
- a user defined function with ``values`` and ``index`` as the
- first and second arguments respectively in the function signature.
- Each group's index will be passed to the user defined function
- and optionally available for use.
-
- .. versionchanged:: 1.1.0
-*args
- Positional arguments to pass to func.
-engine : str, default None
- * ``'cython'`` : Runs the function through C-extensions from cython.
- * ``'numba'`` : Runs the function through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``
-
- .. versionadded:: 1.1.0
-engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
- applied to the function
-
- .. versionadded:: 1.1.0
-**kwargs
- * If ``func`` is None, ``**kwargs`` are used to define the output names and
- aggregations via Named Aggregation. See ``func`` entry.
- * Otherwise, keyword arguments to be passed into func.
-
-Returns
--------
-{klass}
-
-See Also
---------
-{klass}.groupby.apply : Apply function func group-wise
- and combine the results together.
-{klass}.groupby.transform : Transforms the Series on each group
- based on the given function.
-{klass}.aggregate : Aggregate using one or more
- operations over the specified axis.
-
-Notes
------
-When using ``engine='numba'``, there will be no "fall back" behavior internally.
-The group data and group index will be passed as numpy arrays to the JITed
-user defined function, and no alternative execution attempts will be tried.
-
-Functions that mutate the passed object can produce unexpected
-behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
-for more details.
-
-.. versionchanged:: 1.3.0
-
- The resulting dtype will reflect the return value of the passed ``func``,
- see the examples below.
-{examples}"""
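A brief sketch of the ``func=None`` / Named Aggregation path described in the template above (the frame and the output column names are made up for illustration):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 5, 6]})
    >>> df.groupby("A").agg(b_min=("B", "min"), c_sum=("C", "sum"))
       b_min  c_sum
    A
    a      1      9
    b      3      6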
-
-
-@final
-class GroupByPlot(PandasObject):
- """
- Class implementing the .plot attribute for groupby objects.
- """
-
- def __init__(self, groupby: GroupBy) -> None:
- self._groupby = groupby
-
- def __call__(self, *args, **kwargs):
- def f(self):
- return self.plot(*args, **kwargs)
-
- f.__name__ = "plot"
- return self._groupby.apply(f)
-
- def __getattr__(self, name: str):
- def attr(*args, **kwargs):
- def f(self):
- return getattr(self.plot, name)(*args, **kwargs)
-
- return self._groupby.apply(f)
-
- return attr
-
-
-_KeysArgType = Union[
- Hashable,
- List[Hashable],
- Callable[[Hashable], Hashable],
- List[Callable[[Hashable], Hashable]],
- Mapping[Hashable, Hashable],
-]
-
-
-class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
- _hidden_attrs = PandasObject._hidden_attrs | {
- "as_index",
- "axis",
- "dropna",
- "exclusions",
- "grouper",
- "group_keys",
- "keys",
- "level",
- "obj",
- "observed",
- "sort",
- }
-
- axis: AxisInt
- grouper: ops.BaseGrouper
- keys: _KeysArgType | None = None
- level: IndexLabel | None = None
- group_keys: bool
-
- @final
- def __len__(self) -> int:
- return len(self.groups)
-
- @final
- def __repr__(self) -> str:
- # TODO: Better repr for GroupBy object
- return object.__repr__(self)
-
- @final
- @property
- def groups(self) -> dict[Hashable, np.ndarray]:
- """
- Dict {group name -> group labels}.
- """
- return self.grouper.groups
-
- @final
- @property
- def ngroups(self) -> int:
- return self.grouper.ngroups
-
- @final
- @property
- def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
- """
- Dict {group name -> group indices}.
- """
- return self.grouper.indices
-
- @final
- def _get_indices(self, names):
- """
- Safely get multiple indices, translating datelike keys
- to the underlying repr.
- """
-
- def get_converter(s):
- # possibly convert to the actual key types
- # in the indices, could be a Timestamp or a np.datetime64
- if isinstance(s, datetime.datetime):
- return lambda key: Timestamp(key)
- elif isinstance(s, np.datetime64):
- return lambda key: Timestamp(key).asm8
- else:
- return lambda key: key
-
- if len(names) == 0:
- return []
-
- if len(self.indices) > 0:
- index_sample = next(iter(self.indices))
- else:
- index_sample = None # Dummy sample
-
- name_sample = names[0]
- if isinstance(index_sample, tuple):
- if not isinstance(name_sample, tuple):
- msg = "must supply a tuple to get_group with multiple grouping keys"
- raise ValueError(msg)
- if not len(name_sample) == len(index_sample):
- try:
- # If the original grouper was a tuple
- return [self.indices[name] for name in names]
- except KeyError as err:
- # turns out it wasn't a tuple
- msg = (
- "must supply a same-length tuple to get_group "
- "with multiple grouping keys"
- )
- raise ValueError(msg) from err
-
- converters = [get_converter(s) for s in index_sample]
- names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
-
- else:
- converter = get_converter(index_sample)
- names = (converter(name) for name in names)
-
- return [self.indices.get(name, []) for name in names]
-
- @final
- def _get_index(self, name):
- """
- Safely get an index, translating datelike keys to the underlying repr.
- """
- return self._get_indices([name])[0]
-
- @final
- @cache_readonly
- def _selected_obj(self):
- # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
- if isinstance(self.obj, Series):
- return self.obj
-
- if self._selection is not None:
- if is_hashable(self._selection):
- # i.e. a single key, so selecting it will return a Series.
- # In this case, _obj_with_exclusions would wrap the key
- # in a list and return a single-column DataFrame.
- return self.obj[self._selection]
-
- # Otherwise _selection is equivalent to _selection_list, so
- # _selected_obj matches _obj_with_exclusions, so we can re-use
- # that and avoid making a copy.
- return self._obj_with_exclusions
-
- return self.obj
-
- @final
- def _dir_additions(self) -> set[str]:
- return self.obj._dir_additions()
-
- @Substitution(
- klass="GroupBy",
- examples=dedent(
- """\
- >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
- >>> df
- A B
- 0 a 1
- 1 b 2
- 2 a 3
- 3 b 4
-
- To get the difference between each group's maximum and minimum value in one
- pass, you can do
-
- >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
- B
- A
- a 2
- b 2"""
- ),
- )
- @Appender(_pipe_template)
- def pipe(
- self,
- func: Callable[..., T] | tuple[Callable[..., T], str],
- *args,
- **kwargs,
- ) -> T:
- return com.pipe(self, func, *args, **kwargs)
-
- @final
- def get_group(self, name, obj=None) -> DataFrame | Series:
- """
- Construct DataFrame from group with provided name.
-
- Parameters
- ----------
- name : object
- The name of the group to get as a DataFrame.
- obj : DataFrame, default None
- The DataFrame to take the DataFrame out of. If
- it is None, the object groupby was called on will
- be used.
-
- Returns
- -------
- same type as obj
- """
- if obj is None:
- obj = self._selected_obj
-
- inds = self._get_index(name)
- if not len(inds):
- raise KeyError(name)
-
- return obj._take_with_is_copy(inds, axis=self.axis)
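A hedged usage sketch of ``get_group`` (the data is invented for illustration):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
    >>> df.groupby("A").get_group("a")
       A  B
    0  a  1
    1  a  2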
-
- @final
- def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
- """
- Groupby iterator.
-
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- keys = self.keys
- result = self.grouper.get_iterator(self._selected_obj, axis=self.axis)
- if isinstance(keys, list) and len(keys) == 1:
- # GH#42795 - when keys is a list, return tuples even when length is 1
- result = (((key,), group) for key, group in result)
- return result
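A small sketch of the iteration contract above, including the GH#42795 rule that a length-1 list of keys still yields tuple group names (example data is illustrative):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["x", "y"], "B": [1, 2]})
    >>> [name for name, group in df.groupby("A")]
    ['x', 'y']
    >>> [name for name, group in df.groupby(["A"])]
    [('x',), ('y',)]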
-
-
-# To track operations that expand dimensions, like ohlc
-OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)
-
-
-class GroupBy(BaseGroupBy[NDFrameT]):
- """
- Class for grouping and aggregating relational data.
-
- See aggregate, transform, and apply functions on this object.
-
- It's easiest to use obj.groupby(...) to construct GroupBy objects, but you can also do:
-
- ::
-
- grouped = groupby(obj, ...)
-
- Parameters
- ----------
- obj : pandas object
- axis : int, default 0
- level : int, default None
- Level of MultiIndex
- groupings : list of Grouping objects
- Most users should ignore this
- exclusions : array-like, optional
- List of columns to exclude
- name : str
- Most users should ignore this
-
- Returns
- -------
- **Attributes**
- groups : dict
- {group name -> group labels}
- len(grouped) : int
- Number of groups
-
- Notes
- -----
- After grouping, see aggregate, apply, and transform functions. Here are
- some other brief notes about usage. When grouping by multiple groups, the
- result index will be a MultiIndex (hierarchical) by default.
-
- Iteration produces (key, group) tuples, i.e. chunking the data by group. So
- you can write code like:
-
- ::
-
- grouped = obj.groupby(keys, axis=axis)
- for key, group in grouped:
- # do something with the data
-
- Function calls on GroupBy, if not specially implemented, "dispatch" to the
- grouped data. So if you group a DataFrame and wish to invoke the std()
- method on each group, you can simply do:
-
- ::
-
- df.groupby(mapper).std()
-
- rather than
-
- ::
-
- df.groupby(mapper).aggregate(np.std)
-
- You can pass arguments to these "wrapped" functions, too.
-
- See the online documentation for a full exposition on these topics and much
- more.
- """
-
- grouper: ops.BaseGrouper
- as_index: bool
-
- @final
- def __init__(
- self,
- obj: NDFrameT,
- keys: _KeysArgType | None = None,
- axis: Axis = 0,
- level: IndexLabel | None = None,
- grouper: ops.BaseGrouper | None = None,
- exclusions: frozenset[Hashable] | None = None,
- selection: IndexLabel | None = None,
- as_index: bool = True,
- sort: bool = True,
- group_keys: bool = True,
- observed: bool = False,
- dropna: bool = True,
- ) -> None:
- self._selection = selection
-
- assert isinstance(obj, NDFrame), type(obj)
-
- self.level = level
-
- if not as_index:
- if axis != 0:
- raise ValueError("as_index=False only valid for axis=0")
-
- self.as_index = as_index
- self.keys = keys
- self.sort = sort
- self.group_keys = group_keys
- self.observed = observed
- self.dropna = dropna
-
- if grouper is None:
- grouper, exclusions, obj = get_grouper(
- obj,
- keys,
- axis=axis,
- level=level,
- sort=sort,
- observed=observed,
- dropna=self.dropna,
- )
-
- self.obj = obj
- self.axis = obj._get_axis_number(axis)
- self.grouper = grouper
- self.exclusions = frozenset(exclusions) if exclusions else frozenset()
-
- def __getattr__(self, attr: str):
- if attr in self._internal_names_set:
- return object.__getattribute__(self, attr)
- if attr in self.obj:
- return self[attr]
-
- raise AttributeError(
- f"'{type(self).__name__}' object has no attribute '{attr}'"
- )
-
- @final
- def _op_via_apply(self, name: str, *args, **kwargs):
- """Compute the result of an operation by using GroupBy's apply."""
- f = getattr(type(self._obj_with_exclusions), name)
- sig = inspect.signature(f)
-
- # a little trickery for aggregation functions that need an axis
- # argument
- if "axis" in sig.parameters:
- if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
- kwargs["axis"] = self.axis
-
- def curried(x):
- return f(x, *args, **kwargs)
-
- # preserve the name so we can detect it when calling plot methods,
- # to avoid duplicates
- curried.__name__ = name
-
- # special case otherwise extra plots are created when catching the
- # exception below
- if name in base.plotting_methods:
- return self.apply(curried)
-
- is_transform = name in base.transformation_kernels
- result = self._python_apply_general(
- curried,
- self._obj_with_exclusions,
- is_transform=is_transform,
- not_indexed_same=not is_transform,
- )
-
- if self.grouper.has_dropped_na and is_transform:
- # result will have dropped rows due to nans, fill with null
- # and ensure index is ordered same as the input
- result = self._set_result_index_ordered(result)
- return result
-
- # -----------------------------------------------------------------
- # Selection
-
- def _iterate_slices(self) -> Iterable[Series]:
- raise AbstractMethodError(self)
-
- # -----------------------------------------------------------------
- # Dispatch/Wrapping
-
- @final
- def _concat_objects(
- self,
- values,
- not_indexed_same: bool = False,
- is_transform: bool = False,
- ):
- from pandas.core.reshape.concat import concat
-
- if self.group_keys and not is_transform:
- if self.as_index:
- # possible MI return case
- group_keys = self.grouper.result_index
- group_levels = self.grouper.levels
- group_names = self.grouper.names
-
- result = concat(
- values,
- axis=self.axis,
- keys=group_keys,
- levels=group_levels,
- names=group_names,
- sort=False,
- )
- else:
- # GH5610, returns a MI, with the first level being a
- # range index
- keys = list(range(len(values)))
- result = concat(values, axis=self.axis, keys=keys)
-
- elif not not_indexed_same:
- result = concat(values, axis=self.axis)
-
- ax = self._selected_obj._get_axis(self.axis)
- if self.dropna:
- labels = self.grouper.group_info[0]
- mask = labels != -1
- ax = ax[mask]
-
- # this is a very unfortunate situation
- # we can't use reindex to restore the original order
- # when the ax has duplicates
- # so we resort to this
- # GH 14776, 30667
- # TODO: can we re-use e.g. _reindex_non_unique?
- if ax.has_duplicates and not result.axes[self.axis].equals(ax):
- # e.g. test_category_order_transformer
- target = algorithms.unique1d(ax._values)
- indexer, _ = result.index.get_indexer_non_unique(target)
- result = result.take(indexer, axis=self.axis)
- else:
- result = result.reindex(ax, axis=self.axis, copy=False)
-
- else:
- result = concat(values, axis=self.axis)
-
- name = self.obj.name if self.obj.ndim == 1 else self._selection
- if isinstance(result, Series) and name is not None:
- result.name = name
-
- return result
-
- @final
- def _set_result_index_ordered(
- self, result: OutputFrameOrSeries
- ) -> OutputFrameOrSeries:
- # set the result index on the passed values object and
- # return the new object, xref 8046
-
- obj_axis = self.obj._get_axis(self.axis)
-
- if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
- # shortcut if we have an already ordered grouper
- result = result.set_axis(obj_axis, axis=self.axis, copy=False)
- return result
-
- # row order is scrambled => sort the rows by position in original index
- original_positions = Index(self.grouper.result_ilocs())
- result = result.set_axis(original_positions, axis=self.axis, copy=False)
- result = result.sort_index(axis=self.axis)
- if self.grouper.has_dropped_na:
- # Add back in any missing rows due to dropna - index here is integral
- # with values referring to the row of the input so can use RangeIndex
- result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
- result = result.set_axis(obj_axis, axis=self.axis, copy=False)
-
- return result
-
- @final
- def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
- if isinstance(result, Series):
- result = result.to_frame()
-
- # zip in reverse so we can always insert at loc 0
- columns = result.columns
- for name, lev, in_axis in zip(
- reversed(self.grouper.names),
- reversed(self.grouper.get_group_levels()),
- reversed([grp.in_axis for grp in self.grouper.groupings]),
- ):
- # GH #28549
- # When using .apply(-), name will be in columns already
- if in_axis and name not in columns:
- result.insert(0, name, lev)
-
- return result
-
- def _indexed_output_to_ndframe(
- self, result: Mapping[base.OutputKey, ArrayLike]
- ) -> Series | DataFrame:
- raise AbstractMethodError(self)
-
- @final
- def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
- if self.axis == 1:
- # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
- result = result.T
- if result.index.equals(self.obj.index):
- # Retain e.g. DatetimeIndex/TimedeltaIndex freq
- # e.g. test_groupby_crash_on_nunique
- result.index = self.obj.index.copy()
- return result
-
- @final
- def _wrap_aggregated_output(
- self,
- result: Series | DataFrame,
- qs: npt.NDArray[np.float64] | None = None,
- ):
- """
- Wraps the output of GroupBy aggregations into the expected result.
-
- Parameters
- ----------
- result : Series, DataFrame
-
- Returns
- -------
- Series or DataFrame
- """
- # ATM we do not get here for SeriesGroupBy; when we do, we will
- # need to require that result.name already match self.obj.name
-
- if not self.as_index:
- # `not self.as_index` is only relevant for DataFrameGroupBy,
- # enforced in __init__
- result = self._insert_inaxis_grouper(result)
- result = result._consolidate()
- index = Index(range(self.grouper.ngroups))
-
- else:
- index = self.grouper.result_index
-
- if qs is not None:
- # We get here with len(qs) != 1 and not self.as_index
- # in test_pass_args_kwargs
- index = _insert_quantile_level(index, qs)
-
- result.index = index
-
- # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
- # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
- res = self._maybe_transpose_result(result) # type: ignore[arg-type]
- return self._reindex_output(res, qs=qs)
-
- def _wrap_applied_output(
- self,
- data,
- values: list,
- not_indexed_same: bool = False,
- is_transform: bool = False,
- ):
- raise AbstractMethodError(self)
-
- # -----------------------------------------------------------------
- # numba
-
- @final
- def _numba_prep(self, data: DataFrame):
- ids, _, ngroups = self.grouper.group_info
- sorted_index = get_group_index_sorter(ids, ngroups)
- sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)
-
- sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
- if len(self.grouper.groupings) > 1:
- raise NotImplementedError(
- "More than 1 grouping labels are not supported with engine='numba'"
- )
- # GH 46867
- index_data = data.index
- if isinstance(index_data, MultiIndex):
- group_key = self.grouper.groupings[0].name
- index_data = index_data.get_level_values(group_key)
- sorted_index_data = index_data.take(sorted_index).to_numpy()
-
- starts, ends = lib.generate_slices(sorted_ids, ngroups)
- return (
- starts,
- ends,
- sorted_index_data,
- sorted_data,
- )
-
- def _numba_agg_general(
- self,
- func: Callable,
- engine_kwargs: dict[str, bool] | None,
- *aggregator_args,
- ):
- """
- Perform groupby with a standard numerical aggregation function (e.g. mean)
- with Numba.
- """
- if not self.as_index:
- raise NotImplementedError(
- "as_index=False is not supported. Use .reset_index() instead."
- )
- if self.axis == 1:
- raise NotImplementedError("axis=1 is not supported.")
-
- data = self._obj_with_exclusions
- df = data if data.ndim == 2 else data.to_frame()
- starts, ends, sorted_index, sorted_data = self._numba_prep(df)
- aggregator = executor.generate_shared_aggregator(
- func, **get_jit_arguments(engine_kwargs)
- )
- result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)
-
- index = self.grouper.result_index
- if data.ndim == 1:
- result_kwargs = {"name": data.name}
- result = result.ravel()
- else:
- result_kwargs = {"columns": data.columns}
- return data._constructor(result, index=index, **result_kwargs)
-
- @final
- def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
- """
- Perform groupby transform routine with the numba engine.
-
- This routine mimics the data splitting routine of the DataSplitter class
- to generate the indices of each group in the sorted data and then passes the
- data and indices into a Numba jitted function.
- """
- data = self._obj_with_exclusions
- df = data if data.ndim == 2 else data.to_frame()
-
- starts, ends, sorted_index, sorted_data = self._numba_prep(df)
- numba_.validate_udf(func)
- numba_transform_func = numba_.generate_numba_transform_func(
- func, **get_jit_arguments(engine_kwargs, kwargs)
- )
- result = numba_transform_func(
- sorted_data,
- sorted_index,
- starts,
- ends,
- len(df.columns),
- *args,
- )
- # result values needs to be resorted to their original positions since we
- # evaluated the data sorted by group
- result = result.take(np.argsort(sorted_index), axis=0)
- index = data.index
- if data.ndim == 1:
- result_kwargs = {"name": data.name}
- result = result.ravel()
- else:
- result_kwargs = {"columns": data.columns}
- return data._constructor(result, index=index, **result_kwargs)
-
- @final
- def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
- """
- Perform groupby aggregation routine with the numba engine.
-
- This routine mimics the data splitting routine of the DataSplitter class
- to generate the indices of each group in the sorted data and then passes the
- data and indices into a Numba jitted function.
- """
- data = self._obj_with_exclusions
- df = data if data.ndim == 2 else data.to_frame()
-
- starts, ends, sorted_index, sorted_data = self._numba_prep(df)
- numba_.validate_udf(func)
- numba_agg_func = numba_.generate_numba_agg_func(
- func, **get_jit_arguments(engine_kwargs, kwargs)
- )
- result = numba_agg_func(
- sorted_data,
- sorted_index,
- starts,
- ends,
- len(df.columns),
- *args,
- )
- index = self.grouper.result_index
- if data.ndim == 1:
- result_kwargs = {"name": data.name}
- result = result.ravel()
- else:
- result_kwargs = {"columns": data.columns}
- res = data._constructor(result, index=index, **result_kwargs)
- if not self.as_index:
- res = self._insert_inaxis_grouper(res)
- res.index = default_index(len(res))
- return res
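A hedged sketch of this numba aggregation path (requires an installed ``numba``; the UDF name and data are invented, and the call is marked skip like the other optional-dependency examples in this file):

    >>> import pandas as pd
    >>> def grouped_mean(values, index):  # signature required for engine="numba"
    ...     return values.mean()
    >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1.0, 2.0, 3.0]})
    >>> df.groupby("A")["B"].agg(grouped_mean, engine="numba")  # doctest: +SKIP
    A
    a    1.5
    b    3.0
    Name: B, dtype: float64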
-
- # -----------------------------------------------------------------
- # apply/agg/transform
-
- @Appender(
- _apply_docs["template"].format(
- input="dataframe", examples=_apply_docs["dataframe_examples"]
- )
- )
- def apply(self, func, *args, **kwargs) -> NDFrameT:
- func = com.is_builtin_func(func)
-
- if isinstance(func, str):
- if hasattr(self, func):
- res = getattr(self, func)
- if callable(res):
- return res(*args, **kwargs)
- elif args or kwargs:
- raise ValueError(f"Cannot pass arguments to property {func}")
- return res
-
- else:
- raise TypeError(f"apply func should be callable, not '{func}'")
-
- elif args or kwargs:
- if callable(func):
-
- @wraps(func)
- def f(g):
- with np.errstate(all="ignore"):
- return func(g, *args, **kwargs)
-
- else:
- raise ValueError(
- "func must be a callable if args or kwargs are supplied"
- )
- else:
- f = func
-
- # ignore SettingWithCopy here in case the user mutates
- with option_context("mode.chained_assignment", None):
- try:
- result = self._python_apply_general(f, self._selected_obj)
- except TypeError:
- # gh-20949
- # try again, with .apply acting as a filtering
- # operation, by excluding the grouping column
- # This would normally not be triggered
- # except if the udf is trying an operation that
- # fails on *some* columns, e.g. a numeric operation
- # on a string grouper column
-
- return self._python_apply_general(f, self._obj_with_exclusions)
-
- return result
-
- @final
- def _python_apply_general(
- self,
- f: Callable,
- data: DataFrame | Series,
- not_indexed_same: bool | None = None,
- is_transform: bool = False,
- is_agg: bool = False,
- ) -> NDFrameT:
- """
- Apply function f in python space
-
- Parameters
- ----------
- f : callable
- Function to apply
- data : Series or DataFrame
- Data to apply f to
- not_indexed_same: bool, optional
- When specified, overrides the value of not_indexed_same. Apply behaves
- differently when the result index is equal to the input index, but
- this can be coincidental leading to value-dependent behavior.
- is_transform : bool, default False
- Indicator for whether the function is actually a transform
- and should not have group keys prepended.
- is_agg : bool, default False
- Indicator for whether the function is an aggregation. When the
- result is empty, we don't want to warn for this case.
- See _GroupBy._python_agg_general.
-
- Returns
- -------
- Series or DataFrame
- data after applying f
- """
- values, mutated = self.grouper.apply(f, data, self.axis)
- if not_indexed_same is None:
- not_indexed_same = mutated
-
- return self._wrap_applied_output(
- data,
- values,
- not_indexed_same,
- is_transform,
- )
-
- @final
- def _agg_general(
- self,
- numeric_only: bool = False,
- min_count: int = -1,
- *,
- alias: str,
- npfunc: Callable,
- ):
- result = self._cython_agg_general(
- how=alias,
- alt=npfunc,
- numeric_only=numeric_only,
- min_count=min_count,
- )
- return result.__finalize__(self.obj, method="groupby")
-
- def _agg_py_fallback(
- self, values: ArrayLike, ndim: int, alt: Callable
- ) -> ArrayLike:
- """
- Fallback to pure-python aggregation if _cython_operation raises
- NotImplementedError.
- """
- # We get here with a) EADtypes and b) object dtype
- assert alt is not None
-
- if values.ndim == 1:
- # For DataFrameGroupBy we only get here with ExtensionArray
- ser = Series(values, copy=False)
- else:
- # We only get here with values.dtype == object
- # TODO: special case not needed with ArrayManager
- df = DataFrame(values.T)
- # bc we split object blocks in grouped_reduce, we have only 1 col
- # otherwise we'd have to worry about block-splitting GH#39329
- assert df.shape[1] == 1
- # Avoid call to self.values that can occur in DataFrame
- # reductions; see GH#28949
- ser = df.iloc[:, 0]
-
- # We do not get here with UDFs, so we know that our dtype
- # should always be preserved by the implemented aggregations
- # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
- res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
-
- if isinstance(values, Categorical):
- # Because we only get here with known dtype-preserving
- # reductions, we cast back to Categorical.
- # TODO: if we ever get "rank" working, exclude it here.
- res_values = type(values)._from_sequence(res_values, dtype=values.dtype)
-
- elif ser.dtype == object:
- res_values = res_values.astype(object, copy=False)
-
- # If we are DataFrameGroupBy and went through a SeriesGroupByPath
- # then we need to reshape
- # GH#32223 includes case with IntegerArray values, ndarray res_values
- # test_groupby_duplicate_columns with object dtype values
- return ensure_block_shape(res_values, ndim=ndim)
-
- @final
- def _cython_agg_general(
- self,
- how: str,
- alt: Callable,
- numeric_only: bool = False,
- min_count: int = -1,
- **kwargs,
- ):
- # Note: we never get here with how="ohlc" for DataFrameGroupBy;
- # that goes through SeriesGroupBy
-
- data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
-
- def array_func(values: ArrayLike) -> ArrayLike:
- try:
- result = self.grouper._cython_operation(
- "aggregate",
- values,
- how,
- axis=data.ndim - 1,
- min_count=min_count,
- **kwargs,
- )
- except NotImplementedError:
- # generally if we have numeric_only=False
- # and non-applicable functions
- # try to python agg
- # TODO: shouldn't min_count matter?
- result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
-
- return result
-
- new_mgr = data.grouped_reduce(array_func)
- res = self._wrap_agged_manager(new_mgr)
- out = self._wrap_aggregated_output(res)
- if self.axis == 1:
- out = out.infer_objects(copy=False)
- return out
-
- def _cython_transform(
- self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
- ):
- raise AbstractMethodError(self)
-
- @final
- def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
- if maybe_use_numba(engine):
- return self._transform_with_numba(
- func, *args, engine_kwargs=engine_kwargs, **kwargs
- )
-
- # optimized transforms
- func = com.get_cython_func(func) or func
-
- if not isinstance(func, str):
- return self._transform_general(func, *args, **kwargs)
-
- elif func not in base.transform_kernel_allowlist:
- msg = f"'{func}' is not a valid function name for transform(name)"
- raise ValueError(msg)
- elif func in base.cythonized_kernels or func in base.transformation_kernels:
- # cythonized transform or canned "agg+broadcast"
- return getattr(self, func)(*args, **kwargs)
-
- else:
- # i.e. func in base.reduction_kernels
-
- # GH#30918 Use _transform_fast only when we know func is an aggregation
- # If func is a reduction, we need to broadcast the
- # result to the whole group. Compute func result
- # and deal with possible broadcasting below.
- # Temporarily set observed for dealing with categoricals.
- with com.temp_setattr(self, "observed", True):
- with com.temp_setattr(self, "as_index", True):
- # GH#49834 - result needs groups in the index for
- # _wrap_transform_fast_result
- result = getattr(self, func)(*args, **kwargs)
-
- return self._wrap_transform_fast_result(result)
-
- @final
- def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
- """
- Fast transform path for aggregations.
- """
- obj = self._obj_with_exclusions
-
- # for each col, reshape to size of original frame by take operation
- ids, _, _ = self.grouper.group_info
- result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)
-
- if self.obj.ndim == 1:
- # i.e. SeriesGroupBy
- out = algorithms.take_nd(result._values, ids)
- output = obj._constructor(out, index=obj.index, name=obj.name)
- else:
- # `.size()` gives Series output on DataFrame input, need axis 0
- axis = 0 if result.ndim == 1 else self.axis
- # GH#46209
- # Don't convert indices: negative indices need to give rise
- # to null values in the result
- output = result._take(ids, axis=axis, convert_indices=False)
- output = output.set_axis(obj._get_axis(self.axis), axis=axis)
- return output
-
- # -----------------------------------------------------------------
- # Utilities
-
- @final
- def _apply_filter(self, indices, dropna):
- if len(indices) == 0:
- indices = np.array([], dtype="int64")
- else:
- indices = np.sort(np.concatenate(indices))
- if dropna:
- filtered = self._selected_obj.take(indices, axis=self.axis)
- else:
- mask = np.empty(len(self._selected_obj.index), dtype=bool)
- mask.fill(False)
- mask[indices.astype(int)] = True
- # mask fails to broadcast when passed to where; broadcast manually.
- mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
- filtered = self._selected_obj.where(mask) # Fill with NaNs.
- return filtered
-
- @final
- def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
- """
- Parameters
- ----------
- ascending : bool, default True
- If False, number in reverse, from length of group - 1 to 0.
-
- Notes
- -----
- this is currently implementing sort=False
- (though the default is sort=True) for groupby in general
- """
- ids, _, ngroups = self.grouper.group_info
- sorter = get_group_index_sorter(ids, ngroups)
- ids, count = ids[sorter], len(ids)
-
- if count == 0:
- return np.empty(0, dtype=np.int64)
-
- run = np.r_[True, ids[:-1] != ids[1:]]
- rep = np.diff(np.r_[np.nonzero(run)[0], count])
- out = (~run).cumsum()
-
- if ascending:
- out -= np.repeat(out[run], rep)
- else:
- out = np.repeat(out[np.r_[run[1:], True]], rep) - out
-
- if self.grouper.has_dropped_na:
- out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
- else:
- out = out.astype(np.int64, copy=False)
-
- rev = np.empty(count, dtype=np.intp)
- rev[sorter] = np.arange(count, dtype=np.intp)
- return out[rev]
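The ascending/descending numbering computed above is what ``cumcount`` ultimately exposes; a brief sketch with invented data:

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["a", "a", "b", "a"]})
    >>> df.groupby("A").cumcount()
    0    0
    1    1
    2    0
    3    2
    dtype: int64
    >>> df.groupby("A").cumcount(ascending=False)
    0    2
    1    1
    2    0
    3    0
    dtype: int64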
-
- # -----------------------------------------------------------------
-
- @final
- @property
- def _obj_1d_constructor(self) -> Callable:
- # GH28330 preserve subclassed Series/DataFrames
- if isinstance(self.obj, DataFrame):
- return self.obj._constructor_sliced
- assert isinstance(self.obj, Series)
- return self.obj._constructor
-
- @final
- def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
- """
- Shared func to call any / all Cython GroupBy implementations.
- """
-
- def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
- if is_object_dtype(vals.dtype) and skipna:
- # GH#37501: don't raise on pd.NA when skipna=True
- mask = isna(vals)
- if mask.any():
- # mask on original values computed separately
- vals = vals.copy()
- vals[mask] = True
- elif isinstance(vals, BaseMaskedArray):
- vals = vals._data
- vals = vals.astype(bool, copy=False)
- return vals.view(np.int8), bool
-
- def result_to_bool(
- result: np.ndarray,
- inference: type,
- nullable: bool = False,
- ) -> ArrayLike:
- if nullable:
- return BooleanArray(result.astype(bool, copy=False), result == -1)
- else:
- return result.astype(inference, copy=False)
-
- return self._get_cythonized_result(
- libgroupby.group_any_all,
- numeric_only=False,
- cython_dtype=np.dtype(np.int8),
- pre_processing=objs_to_bool,
- post_processing=result_to_bool,
- val_test=val_test,
- skipna=skipna,
- )
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def any(self, skipna: bool = True):
- """
- Return True if any value in the group is truthy, else False.
-
- Parameters
- ----------
- skipna : bool, default True
- Flag to ignore nan values during truth testing.
-
- Returns
- -------
- Series or DataFrame
- DataFrame or Series of boolean values, where a value is True if any element
- is True within its respective group, False otherwise.
- """
- return self._bool_agg("any", skipna)
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def all(self, skipna: bool = True):
- """
- Return True if all values in the group are truthy, else False.
-
- Parameters
- ----------
- skipna : bool, default True
- Flag to ignore nan values during truth testing.
-
- Returns
- -------
- Series or DataFrame
- DataFrame or Series of boolean values, where a value is True if all elements
- are True within its respective group, False otherwise.
- """
- return self._bool_agg("all", skipna)
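A combined sketch of ``any``/``all`` on an invented frame:

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [0, 1, 0]})
    >>> df.groupby("A")["B"].any()
    A
    a     True
    b    False
    Name: B, dtype: bool
    >>> df.groupby("A")["B"].all()
    A
    a    False
    b    False
    Name: B, dtype: bool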
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def count(self) -> NDFrameT:
- """
- Compute count of group, excluding missing values.
-
- Returns
- -------
- Series or DataFrame
- Count of values within each group.
- """
- data = self._get_data_to_aggregate()
- ids, _, ngroups = self.grouper.group_info
- mask = ids != -1
-
- is_series = data.ndim == 1
-
- def hfunc(bvalues: ArrayLike) -> ArrayLike:
- # TODO(EA2D): reshape would not be necessary with 2D EAs
- if bvalues.ndim == 1:
- # EA
- masked = mask & ~isna(bvalues).reshape(1, -1)
- else:
- masked = mask & ~isna(bvalues)
-
- counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
- if is_series:
- assert counted.ndim == 2
- assert counted.shape[0] == 1
- return counted[0]
- return counted
-
- new_mgr = data.grouped_reduce(hfunc)
- new_obj = self._wrap_agged_manager(new_mgr)
-
- # If we are grouping on categoricals we want unobserved categories to
- # return zero, rather than the default of NaN which the reindexing in
- # _wrap_aggregated_output() returns. GH 35028
- # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
- with com.temp_setattr(self, "observed", True):
- result = self._wrap_aggregated_output(new_obj)
-
- return self._reindex_output(result, fill_value=0)
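A short sketch showing that ``count`` excludes missing values (data invented):

    >>> import numpy as np
    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1.0, np.nan, 3.0]})
    >>> df.groupby("A")["B"].count()
    A
    a    1
    b    1
    Name: B, dtype: int64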
-
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def mean(
- self,
- numeric_only: bool = False,
- engine: str = "cython",
- engine_kwargs: dict[str, bool] | None = None,
- ):
- """
- Compute mean of groups, excluding missing values.
-
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
-
- .. versionchanged:: 2.0.0
-
- numeric_only no longer accepts ``None`` and defaults to ``False``.
-
- engine : str, default None
- * ``'cython'`` : Runs the operation through C-extensions from cython.
- * ``'numba'`` : Runs the operation through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or the global setting
- ``compute.use_numba``
-
- .. versionadded:: 1.4.0
-
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
-
- .. versionadded:: 1.4.0
-
- Returns
- -------
- pandas.Series or pandas.DataFrame
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
- ... 'B': [np.nan, 2, 3, 4, 5],
- ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])
-
- Groupby one column and return the mean of the remaining columns in
- each group.
-
- >>> df.groupby('A').mean()
- B C
- A
- 1 3.0 1.333333
- 2 4.0 1.500000
-
- Groupby two columns and return the mean of the remaining column.
-
- >>> df.groupby(['A', 'B']).mean()
- C
- A B
- 1 2.0 2.0
- 4.0 1.0
- 2 3.0 1.0
- 5.0 2.0
-
- Groupby one column and return the mean of only particular column in
- the group.
-
- >>> df.groupby('A')['B'].mean()
- A
- 1 3.0
- 2 4.0
- Name: B, dtype: float64
- """
-
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_mean
-
- return self._numba_agg_general(sliding_mean, engine_kwargs)
- else:
- result = self._cython_agg_general(
- "mean",
- alt=lambda x: Series(x).mean(numeric_only=numeric_only),
- numeric_only=numeric_only,
- )
- return result.__finalize__(self.obj, method="groupby")
-
- @final
- def median(self, numeric_only: bool = False):
- """
- Compute median of groups, excluding missing values.
-
- For multiple groupings, the result index will be a MultiIndex.
-
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
-
- .. versionchanged:: 2.0.0
-
- numeric_only no longer accepts ``None`` and defaults to False.
-
- Returns
- -------
- Series or DataFrame
- Median of values within each group.
- """
- result = self._cython_agg_general(
- "median",
- alt=lambda x: Series(x).median(numeric_only=numeric_only),
- numeric_only=numeric_only,
- )
- return result.__finalize__(self.obj, method="groupby")
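Since the docstring above carries no example, here is a minimal hedged one (data invented):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": ["a", "a", "a", "b"], "B": [1.0, 2.0, 10.0, 4.0]})
    >>> df.groupby("A")["B"].median()
    A
    a    2.0
    b    4.0
    Name: B, dtype: float64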
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def std(
- self,
- ddof: int = 1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- numeric_only: bool = False,
- ):
- """
- Compute standard deviation of groups, excluding missing values.
-
- For multiple groupings, the result index will be a MultiIndex.
-
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
-
- engine : str, default None
- * ``'cython'`` : Runs the operation through C-extensions from cython.
- * ``'numba'`` : Runs the operation through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or the global setting
- ``compute.use_numba``
-
- .. versionadded:: 1.4.0
-
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
-
- .. versionadded:: 1.4.0
-
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
-
- numeric_only now defaults to ``False``.
-
- Returns
- -------
- Series or DataFrame
- Standard deviation of values within each group.
- """
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_var
-
- return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
- else:
-
- def _preprocessing(values):
- if isinstance(values, BaseMaskedArray):
- return values._data, None
- return values, None
-
- def _postprocessing(
- vals, inference, nullable: bool = False, result_mask=None
- ) -> ArrayLike:
- if nullable:
- if result_mask.ndim == 2:
- result_mask = result_mask[:, 0]
- return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_))
- return np.sqrt(vals)
-
- result = self._get_cythonized_result(
- libgroupby.group_var,
- cython_dtype=np.dtype(np.float64),
- numeric_only=numeric_only,
- needs_counts=True,
- pre_processing=_preprocessing,
- post_processing=_postprocessing,
- ddof=ddof,
- how="std",
- )
- return result
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def var(
- self,
- ddof: int = 1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- numeric_only: bool = False,
- ):
- """
- Compute variance of groups, excluding missing values.
-
- For multiple groupings, the result index will be a MultiIndex.
-
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
-
- engine : str, default None
- * ``'cython'`` : Runs the operation through C-extensions from cython.
- * ``'numba'`` : Runs the operation through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or the global setting
- ``compute.use_numba``
-
- .. versionadded:: 1.4.0
-
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{'nopython': True, 'nogil': False, 'parallel': False}``
-
- .. versionadded:: 1.4.0
-
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
-
- numeric_only now defaults to ``False``.
-
- Returns
- -------
- Series or DataFrame
- Variance of values within each group.
- """
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_var
-
- return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
- else:
- return self._cython_agg_general(
- "var",
- alt=lambda x: Series(x).var(ddof=ddof),
- numeric_only=numeric_only,
- ddof=ddof,
- )
-
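As with ``std``, the removed ``var`` docstring carries no example; a small sketch with invented data:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, 3.0, 2.0, 6.0]})

# Sample variance per group (ddof=1); pass ddof=0 for the population variance
print(df.groupby("key")["val"].var())
print(df.groupby("key")["val"].var(ddof=0))
```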
- @final
- def _value_counts(
- self,
- subset: Sequence[Hashable] | None = None,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- dropna: bool = True,
- ) -> DataFrame | Series:
- """
- Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
-
- SeriesGroupBy additionally supports a bins argument. See the docstring of
- DataFrameGroupBy.value_counts for a description of arguments.
- """
- if self.axis == 1:
- raise NotImplementedError(
- "DataFrameGroupBy.value_counts only handles axis=0"
- )
- name = "proportion" if normalize else "count"
-
- df = self.obj
- obj = self._obj_with_exclusions
-
- in_axis_names = {
- grouping.name for grouping in self.grouper.groupings if grouping.in_axis
- }
- if isinstance(obj, Series):
- _name = obj.name
- keys = [] if _name in in_axis_names else [obj]
- else:
- unique_cols = set(obj.columns)
- if subset is not None:
- subsetted = set(subset)
- clashing = subsetted & set(in_axis_names)
- if clashing:
- raise ValueError(
- f"Keys {clashing} in subset cannot be in "
- "the groupby column keys."
- )
- doesnt_exist = subsetted - unique_cols
- if doesnt_exist:
- raise ValueError(
- f"Keys {doesnt_exist} in subset do not "
- f"exist in the DataFrame."
- )
- else:
- subsetted = unique_cols
-
- keys = [
- # Can't use .values because the column label needs to be preserved
- obj.iloc[:, idx]
- for idx, _name in enumerate(obj.columns)
- if _name not in in_axis_names and _name in subsetted
- ]
-
- groupings = list(self.grouper.groupings)
- for key in keys:
- grouper, _, _ = get_grouper(
- df,
- key=key,
- axis=self.axis,
- sort=self.sort,
- observed=False,
- dropna=dropna,
- )
- groupings += list(grouper.groupings)
-
- # Take the size of the overall columns
- gb = df.groupby(
- groupings,
- sort=self.sort,
- observed=self.observed,
- dropna=self.dropna,
- )
- result_series = cast(Series, gb.size())
- result_series.name = name
-
- # GH-46357 Include non-observed categories
- # of non-grouping columns regardless of `observed`
- if any(
- isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
- and not grouping._observed
- for grouping in groupings
- ):
- levels_list = [ping.result_index for ping in groupings]
- multi_index, _ = MultiIndex.from_product(
- levels_list, names=[ping.name for ping in groupings]
- ).sortlevel()
- result_series = result_series.reindex(multi_index, fill_value=0)
-
- if normalize:
- # Normalize the results by dividing by the original group sizes.
- # We are guaranteed to have the first N levels be the
- # user-requested grouping.
- levels = list(
- range(len(self.grouper.groupings), result_series.index.nlevels)
- )
- indexed_group_size = result_series.groupby(
- result_series.index.droplevel(levels),
- sort=self.sort,
- dropna=self.dropna,
- ).transform("sum")
- result_series /= indexed_group_size
-
- # Handle groups of non-observed categories
- result_series = result_series.fillna(0.0)
-
- if sort:
- # Sort the values and then resort by the main grouping
- index_level = range(len(self.grouper.groupings))
- result_series = result_series.sort_values(ascending=ascending).sort_index(
- level=index_level, sort_remaining=False
- )
-
- result: Series | DataFrame
- if self.as_index:
- result = result_series
- else:
- # Convert to frame
- index = result_series.index
- columns = com.fill_missing_names(index.names)
- if name in columns:
- raise ValueError(f"Column label '{name}' is duplicate of result column")
- result_series.name = name
- result_series.index = index.set_names(range(len(columns)))
- result_frame = result_series.reset_index()
- result_frame.columns = columns + [name]
- result = result_frame
- return result.__finalize__(self.obj, method="value_counts")
-
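``_value_counts`` is the shared backend for the public ``value_counts`` methods referenced in its docstring; a short sketch of the public API it serves (data invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({"g": ["x", "x", "x", "y"], "v": ["a", "a", "b", "a"]})

# Counts of each (g, v) combination, sorted within each group
print(df.groupby("g").value_counts())

# normalize=True returns proportions instead of counts
print(df.groupby("g").value_counts(normalize=True))
```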
- @final
- def sem(self, ddof: int = 1, numeric_only: bool = False):
- """
- Compute standard error of the mean of groups, excluding missing values.
-
- For multiple groupings, the result index will be a MultiIndex.
-
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
-
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
-
- numeric_only now defaults to ``False``.
-
- Returns
- -------
- Series or DataFrame
- Standard error of the mean of values within each group.
- """
- if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
- raise TypeError(
- f"{type(self).__name__}.sem called with "
- f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
- )
- result = self.std(ddof=ddof, numeric_only=numeric_only)
-
- if result.ndim == 1:
- result /= np.sqrt(self.count())
- else:
- cols = result.columns.difference(self.exclusions).unique()
- counts = self.count()
- result_ilocs = result.columns.get_indexer_for(cols)
- count_ilocs = counts.columns.get_indexer_for(cols)
-
- result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
- return result
-
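The ``sem`` docstring above has no example; a minimal sketch (invented data) showing the call, which per the implementation above is ``std(ddof=1)`` divided by the square root of the group counts:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
                   "val": [1.0, 2.0, 3.0, 4.0, 6.0]})

# Standard error of the mean within each group
print(df.groupby("key")["val"].sem())
```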
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def size(self) -> DataFrame | Series:
- """
- Compute group sizes.
-
- Returns
- -------
- DataFrame or Series
- Number of rows in each group as a Series if as_index is True
- or a DataFrame if as_index is False.
- """
- result = self.grouper.size()
-
- # GH28330 preserve subclassed Series/DataFrames through calls
- if isinstance(self.obj, Series):
- result = self._obj_1d_constructor(result, name=self.obj.name)
- else:
- result = self._obj_1d_constructor(result)
-
- with com.temp_setattr(self, "as_index", True):
- # size already has the desired behavior in GH#49519, but this makes the
- # as_index=False path of _reindex_output fail on categorical groupers.
- result = self._reindex_output(result, fill_value=0)
- if not self.as_index:
- # error: Incompatible types in assignment (expression has
- # type "DataFrame", variable has type "Series")
- result = result.rename("size").reset_index() # type: ignore[assignment]
- return result
-
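A short illustration of the two return shapes described in the ``size`` docstring above (data invented):

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

# With as_index=True (the default) the result is a Series of group sizes
print(df.groupby("key").size())

# With as_index=False it becomes a DataFrame with a "size" column,
# matching the rename("size").reset_index() path in the code above
print(df.groupby("key", as_index=False).size())
```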
- @final
- @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
- def sum(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_sum
-
- return self._numba_agg_general(
- sliding_sum,
- engine_kwargs,
- )
- else:
- # If we are grouping on categoricals we want unobserved categories to
- # return zero, rather than the default of NaN which the reindexing in
- # _agg_general() returns. GH #31422
- with com.temp_setattr(self, "observed", True):
- result = self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="sum",
- npfunc=np.sum,
- )
-
- return self._reindex_output(result, fill_value=0)
-
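The comment in ``sum`` above notes that unobserved categories should return zero rather than NaN (GH #31422); a sketch of the observable effect, using made-up data:

```python
import pandas as pd

cat = pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])
df = pd.DataFrame({"key": cat, "val": [1, 2, 3]})

# With observed=False the unobserved category "c" still appears in the
# result, and its sum is 0 rather than NaN, per the reindexing done above
print(df.groupby("key", observed=False)["val"].sum())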
- @final
- @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
- def prod(self, numeric_only: bool = False, min_count: int = 0):
- return self._agg_general(
- numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
- )
-
- @final
- @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
- def min(
- self,
- numeric_only: bool = False,
- min_count: int = -1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_min_max
-
- return self._numba_agg_general(sliding_min_max, engine_kwargs, False)
- else:
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="min",
- npfunc=np.min,
- )
-
- @final
- @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
- def max(
- self,
- numeric_only: bool = False,
- min_count: int = -1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_min_max
-
- return self._numba_agg_general(sliding_min_max, engine_kwargs, True)
- else:
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="max",
- npfunc=np.max,
- )
-
- @final
- def first(self, numeric_only: bool = False, min_count: int = -1):
- """
- Compute the first non-null entry of each column.
-
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- min_count : int, default -1
- The required number of valid values to perform the operation. If fewer
- than ``min_count`` non-NA values are present the result will be NA.
-
- Returns
- -------
- Series or DataFrame
- First non-null value within each group.
-
- See Also
- --------
- DataFrame.groupby : Apply a function groupby to each row or column of a
- DataFrame.
- pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry
- of each column.
- pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
-
- Examples
- --------
- >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
- ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
- >>> df['D'] = pd.to_datetime(df['D'])
- >>> df.groupby("A").first()
- B C D
- A
- 1 5.0 1 2000-03-11
- 3 6.0 3 2000-03-13
- >>> df.groupby("A").first(min_count=2)
- B C D
- A
- 1 NaN 1.0 2000-03-11
- 3 NaN NaN NaT
- >>> df.groupby("A").first(numeric_only=True)
- B C
- A
- 1 5.0 1
- 3 6.0 3
- """
-
- def first_compat(obj: NDFrameT, axis: AxisInt = 0):
- def first(x: Series):
- """Helper function for first item that isn't NA."""
- arr = x.array[notna(x.array)]
- if not len(arr):
- return np.nan
- return arr[0]
-
- if isinstance(obj, DataFrame):
- return obj.apply(first, axis=axis)
- elif isinstance(obj, Series):
- return first(obj)
- else: # pragma: no cover
- raise TypeError(type(obj))
-
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="first",
- npfunc=first_compat,
- )
-
- @final
- def last(self, numeric_only: bool = False, min_count: int = -1):
- """
- Compute the last non-null entry of each column.
-
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- min_count : int, default -1
- The required number of valid values to perform the operation. If fewer
- than ``min_count`` non-NA values are present the result will be NA.
-
- Returns
- -------
- Series or DataFrame
- Last non-null value within each group.
-
- See Also
- --------
- DataFrame.groupby : Apply a function groupby to each row or column of a
- DataFrame.
- pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry
- of each column.
- pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
-
- Examples
- --------
- >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
- >>> df.groupby("A").last()
- B C
- A
- 1 5.0 2
- 3 6.0 3
- """
-
- def last_compat(obj: NDFrameT, axis: AxisInt = 0):
- def last(x: Series):
- """Helper function for last item that isn't NA."""
- arr = x.array[notna(x.array)]
- if not len(arr):
- return np.nan
- return arr[-1]
-
- if isinstance(obj, DataFrame):
- return obj.apply(last, axis=axis)
- elif isinstance(obj, Series):
- return last(obj)
- else: # pragma: no cover
- raise TypeError(type(obj))
-
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="last",
- npfunc=last_compat,
- )
-
- @final
- def ohlc(self) -> DataFrame:
- """
- Compute open, high, low and close values of a group, excluding missing values.
-
- For multiple groupings, the result index will be a MultiIndex.
-
- Returns
- -------
- DataFrame
- Open, high, low and close values within each group.
- """
- if self.obj.ndim == 1:
- # self._iterate_slices() yields only self._selected_obj
- obj = self._selected_obj
-
- is_numeric = is_numeric_dtype(obj.dtype)
- if not is_numeric:
- raise DataError("No numeric types to aggregate")
-
- res_values = self.grouper._cython_operation(
- "aggregate", obj._values, "ohlc", axis=0, min_count=-1
- )
-
- agg_names = ["open", "high", "low", "close"]
- result = self.obj._constructor_expanddim(
- res_values, index=self.grouper.result_index, columns=agg_names
- )
- return self._reindex_output(result)
-
- result = self._apply_to_column_groupbys(
- lambda x: x.ohlc(), self._obj_with_exclusions
- )
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
- return result
-
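The ``ohlc`` docstring has no example; a minimal sketch with invented data, producing one row per group with ``open``/``high``/``low``/``close`` columns:

```python
import pandas as pd

df = pd.DataFrame({"ticker": ["x", "x", "x", "y", "y"],
                   "price": [10.0, 12.0, 9.0, 5.0, 7.0]})

# First, highest, lowest and last price within each ticker group
print(df.groupby("ticker")["price"].ohlc())
```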
- @doc(DataFrame.describe)
- def describe(
- self,
- percentiles=None,
- include=None,
- exclude=None,
- ) -> NDFrameT:
- obj = self._obj_with_exclusions
-
- if len(obj) == 0:
- described = obj.describe(
- percentiles=percentiles, include=include, exclude=exclude
- )
- if obj.ndim == 1:
- result = described
- else:
- result = described.unstack()
- return result.to_frame().T.iloc[:0]
-
- with com.temp_setattr(self, "as_index", True):
- result = self._python_apply_general(
- lambda x: x.describe(
- percentiles=percentiles, include=include, exclude=exclude
- ),
- obj,
- not_indexed_same=True,
- )
- if self.axis == 1:
- return result.T
-
- # GH#49256 - properly handle the grouping column(s)
- result = result.unstack()
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
-
- return result
-
- @final
- def resample(self, rule, *args, **kwargs):
- """
- Provide resampling when using a TimeGrouper.
-
- Given a grouper, the function resamples it according to a frequency
- string (e.g. "3T" for 3-minute bins).
-
- See the :ref:`frequency aliases <timeseries.offset_aliases>`
- documentation for more details.
-
- Parameters
- ----------
- rule : str or DateOffset
- The offset string or object representing target grouper conversion.
- *args, **kwargs
- Possible arguments are `how`, `fill_method`, `limit`, `kind` and
- `on`, and other arguments of `TimeGrouper`.
-
- Returns
- -------
- Grouper
- Return a new grouper with our resampler appended.
-
- See Also
- --------
- Grouper : Specify a frequency to resample with when
- grouping by a key.
- DatetimeIndex.resample : Frequency conversion and resampling of
- time series.
-
- Examples
- --------
- >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
- >>> df = pd.DataFrame(data=4 * [range(2)],
- ... index=idx,
- ... columns=['a', 'b'])
- >>> df.iloc[2, 0] = 5
- >>> df
- a b
- 2000-01-01 00:00:00 0 1
- 2000-01-01 00:01:00 0 1
- 2000-01-01 00:02:00 5 1
- 2000-01-01 00:03:00 0 1
-
- Downsample the DataFrame into 3 minute bins and sum the values of
- the timestamps falling into a bin.
-
- >>> df.groupby('a').resample('3T').sum()
- a b
- a
- 0 2000-01-01 00:00:00 0 2
- 2000-01-01 00:03:00 0 1
- 5 2000-01-01 00:00:00 5 1
-
- Upsample the series into 30 second bins.
-
- >>> df.groupby('a').resample('30S').sum()
- a b
- a
- 0 2000-01-01 00:00:00 0 1
- 2000-01-01 00:00:30 0 0
- 2000-01-01 00:01:00 0 1
- 2000-01-01 00:01:30 0 0
- 2000-01-01 00:02:00 0 0
- 2000-01-01 00:02:30 0 0
- 2000-01-01 00:03:00 0 1
- 5 2000-01-01 00:02:00 5 1
-
- Resample by month. Values are assigned to the month of the period.
-
- >>> df.groupby('a').resample('M').sum()
- a b
- a
- 0 2000-01-31 0 3
- 5 2000-01-31 5 1
-
- Downsample the series into 3 minute bins as above, but close the right
- side of the bin interval.
-
- >>> df.groupby('a').resample('3T', closed='right').sum()
- a b
- a
- 0 1999-12-31 23:57:00 0 1
- 2000-01-01 00:00:00 0 2
- 5 2000-01-01 00:00:00 5 1
-
- Downsample the series into 3 minute bins and close the right side of
- the bin interval, but label each bin using the right edge instead of
- the left.
-
- >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
- a b
- a
- 0 2000-01-01 00:00:00 0 1
- 2000-01-01 00:03:00 0 2
- 5 2000-01-01 00:03:00 5 1
- """
- from pandas.core.resample import get_resampler_for_grouping
-
- return get_resampler_for_grouping(self, rule, *args, **kwargs)
-
- @final
- def rolling(self, *args, **kwargs) -> RollingGroupby:
- """
- Return a rolling grouper, providing rolling functionality per group.
-
- Parameters
- ----------
- window : int, timedelta, str, offset, or BaseIndexer subclass
- Size of the moving window.
-
- If an integer, the fixed number of observations used for
- each window.
-
- If a timedelta, str, or offset, the time period of each window. Each
- window will be of variable size based on the observations included in
- the time-period. This is only valid for datetimelike indexes.
- To learn more about the offsets & frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- If a BaseIndexer subclass, the window boundaries
- based on the defined ``get_window_bounds`` method. Additional rolling
- keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
- ``step`` will be passed to ``get_window_bounds``.
-
- min_periods : int, default None
- Minimum number of observations in window required to have a value;
- otherwise, result is ``np.nan``.
-
- For a window that is specified by an offset,
- ``min_periods`` will default to 1.
-
- For a window that is specified by an integer, ``min_periods`` will default
- to the size of the window.
-
- center : bool, default False
- If False, set the window labels as the right edge of the window index.
-
- If True, set the window labels as the center of the window index.
-
- win_type : str, default None
- If ``None``, all points are evenly weighted.
-
- If a string, it must be a valid `scipy.signal window function
- <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
-
- Certain Scipy window types require additional parameters to be passed
- in the aggregation function. The additional parameters must match
- the keywords specified in the Scipy window type method signature.
-
- on : str, optional
- For a DataFrame, a column label or Index level on which
- to calculate the rolling window, rather than the DataFrame's index.
-
- A provided integer column is ignored and excluded from the result, since
- an integer index is not used to calculate the rolling window.
-
- axis : int or str, default 0
- If ``0`` or ``'index'``, roll across the rows.
-
- If ``1`` or ``'columns'``, roll across the columns.
-
- For `Series` this parameter is unused and defaults to 0.
-
- closed : str, default None
- If ``'right'``, the first point in the window is excluded from calculations.
-
- If ``'left'``, the last point in the window is excluded from calculations.
-
- If ``'both'``, no points in the window are excluded from calculations.
-
- If ``'neither'``, the first and last points in the window are excluded
- from calculations.
-
- Default ``None`` (``'right'``).
-
- method : str {'single', 'table'}, default 'single'
- Execute the rolling operation per single column or row (``'single'``)
- or over the entire object (``'table'``).
-
- This argument is only implemented when specifying ``engine='numba'``
- in the method call.
-
- Returns
- -------
- RollingGroupby
- Return a new grouper with our rolling appended.
-
- See Also
- --------
- Series.rolling : Calling object with Series data.
- DataFrame.rolling : Calling object with DataFrames.
- Series.groupby : Apply a function groupby to a Series.
- DataFrame.groupby : Apply a function groupby.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
- ... 'B': [1, 2, 3, 4],
- ... 'C': [0.362, 0.227, 1.267, -0.562]})
- >>> df
- A B C
- 0 1 1 0.362
- 1 1 2 0.227
- 2 2 3 1.267
- 3 2 4 -0.562
-
- >>> df.groupby('A').rolling(2).sum()
- B C
- A
- 1 0 NaN NaN
- 1 3.0 0.589
- 2 2 NaN NaN
- 3 7.0 0.705
-
- >>> df.groupby('A').rolling(2, min_periods=1).sum()
- B C
- A
- 1 0 1.0 0.362
- 1 3.0 0.589
- 2 2 3.0 1.267
- 3 7.0 0.705
-
- >>> df.groupby('A').rolling(2, on='B').sum()
- B C
- A
- 1 0 1 NaN
- 1 2 0.589
- 2 2 3 NaN
- 3 4 0.705
- """
- from pandas.core.window import RollingGroupby
-
- return RollingGroupby(
- self._selected_obj,
- *args,
- _grouper=self.grouper,
- _as_index=self.as_index,
- **kwargs,
- )
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def expanding(self, *args, **kwargs) -> ExpandingGroupby:
- """
- Return an expanding grouper, providing expanding
- functionality per group.
- """
- from pandas.core.window import ExpandingGroupby
-
- return ExpandingGroupby(
- self._selected_obj,
- *args,
- _grouper=self.grouper,
- **kwargs,
- )
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
- """
- Return an ewm grouper, providing ewm functionality per group.
- """
- from pandas.core.window import ExponentialMovingWindowGroupby
-
- return ExponentialMovingWindowGroupby(
- self._selected_obj,
- *args,
- _grouper=self.grouper,
- **kwargs,
- )
-
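Neither the ``expanding`` nor the ``ewm`` docstring above carries an example; a small sketch of both, with invented data (the ``span`` value is arbitrary):

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
                   "val": [1.0, 2.0, 3.0, 4.0, 5.0]})

# Expanding (cumulative) mean computed independently within each group
print(df.groupby("key")["val"].expanding().mean())

# Exponentially weighted mean per group
print(df.groupby("key")["val"].ewm(span=2).mean())
```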
- @final
- def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
- """
- Shared function for `pad` and `backfill` to call Cython method.
-
- Parameters
- ----------
- direction : {'ffill', 'bfill'}
- Direction passed to underlying Cython function. `bfill` will cause
- values to be filled backwards. `ffill` and any other values will
- default to a forward fill.
- limit : int, default None
- Maximum number of consecutive values to fill. If `None`, this
- method will convert to -1 prior to passing to Cython.
-
- Returns
- -------
- `Series` or `DataFrame` with filled values
-
- See Also
- --------
- pad : Forward fill the missing values in the dataset.
- backfill : Backward fill the missing values in the dataset.
- """
- # Need int value for Cython
- if limit is None:
- limit = -1
-
- ids, _, _ = self.grouper.group_info
- sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
- if direction == "bfill":
- sorted_labels = sorted_labels[::-1]
-
- col_func = partial(
- libgroupby.group_fillna_indexer,
- labels=ids,
- sorted_labels=sorted_labels,
- direction=direction,
- limit=limit,
- dropna=self.dropna,
- )
-
- def blk_func(values: ArrayLike) -> ArrayLike:
- mask = isna(values)
- if values.ndim == 1:
- indexer = np.empty(values.shape, dtype=np.intp)
- col_func(out=indexer, mask=mask)
- return algorithms.take_nd(values, indexer)
-
- else:
- # We broadcast algorithms.take_nd analogous to
- # np.take_along_axis
-
- # Note: we only get here with backfill/pad,
- # so if we have a dtype that cannot hold NAs,
- # then there will be no -1s in indexer, so we can use
- # the original dtype (no need to ensure_dtype_can_hold_na)
- if isinstance(values, np.ndarray):
- dtype = values.dtype
- if self.grouper.has_dropped_na:
- # dropped null groups give rise to nan in the result
- dtype = ensure_dtype_can_hold_na(values.dtype)
- out = np.empty(values.shape, dtype=dtype)
- else:
- out = type(values)._empty(values.shape, dtype=values.dtype)
-
- for i, value_element in enumerate(values):
- # call group_fillna_indexer column-wise
- indexer = np.empty(values.shape[1], dtype=np.intp)
- col_func(out=indexer, mask=mask[i])
- out[i, :] = algorithms.take_nd(value_element, indexer)
- return out
-
- mgr = self._get_data_to_aggregate()
- res_mgr = mgr.apply(blk_func)
-
- new_obj = self._wrap_agged_manager(res_mgr)
-
- if self.axis == 1:
- # Only relevant for DataFrameGroupBy
- new_obj = new_obj.T
- new_obj.columns = self.obj.columns
-
- new_obj.index = self.obj.index
- return new_obj
-
- @final
- @Substitution(name="groupby")
- def ffill(self, limit=None):
- """
- Forward fill the values.
-
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
-
- Returns
- -------
- Series or DataFrame
- Object with missing values filled.
-
- See Also
- --------
- Series.ffill: Forward fill the missing values in the dataset.
- DataFrame.ffill: Object with missing values filled or None if inplace=True.
- Series.fillna: Fill NaN values of a Series.
- DataFrame.fillna: Fill NaN values of a DataFrame.
- """
- return self._fill("ffill", limit=limit)
-
- @final
- @Substitution(name="groupby")
- def bfill(self, limit=None):
- """
- Backward fill the values.
-
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
-
- Returns
- -------
- Series or DataFrame
- Object with missing values filled.
-
- See Also
- --------
- Series.bfill : Backward fill the missing values in the dataset.
- DataFrame.bfill: Backward fill the missing values in the dataset.
- Series.fillna: Fill NaN values of a Series.
- DataFrame.fillna: Fill NaN values of a DataFrame.
- """
- return self._fill("bfill", limit=limit)
-
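The ``ffill``/``bfill`` docstrings above have no examples; a sketch with invented data showing that fills never cross group boundaries:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
                   "val": [1.0, np.nan, 3.0, np.nan, 5.0]})
g = df.groupby("key")["val"]

# The leading NaN in group "b" has no earlier value in its own group,
# so ffill leaves it as NaN; bfill fills it from the later value
print(g.ffill())
print(g.bfill())
```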
- @final
- @property
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def nth(self) -> GroupByNthSelector:
- """
- Take the nth row from each group if n is an int, otherwise a subset of rows.
-
- Can be used either as a method call or via index notation; ``dropna`` is not
- available with index notation. Index notation accepts a comma-separated list
- of integers and slices.
-
- If ``dropna`` is given, the nth non-null row is taken; ``dropna`` is either
- 'all' or 'any' and is equivalent to calling ``dropna(how=dropna)``
- before the groupby.
-
- Parameters
- ----------
- n : int, slice or list of ints and slices
- A single nth value for the row or a list of nth values or slices.
-
- .. versionchanged:: 1.4.0
- Added slice and lists containing slices.
- Added index notation.
-
- dropna : {'any', 'all', None}, default None
- Apply the specified dropna operation before counting which row is
- the nth row. Only supported if n is an int.
-
- Returns
- -------
- Series or DataFrame
- N-th value within each group.
- %(see_also)s
- Examples
- --------
-
- >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
- ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
- >>> g = df.groupby('A')
- >>> g.nth(0)
- A B
- 0 1 NaN
- 2 2 3.0
- >>> g.nth(1)
- A B
- 1 1 2.0
- 4 2 5.0
- >>> g.nth(-1)
- A B
- 3 1 4.0
- 4 2 5.0
- >>> g.nth([0, 1])
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
- 4 2 5.0
- >>> g.nth(slice(None, -1))
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
-
- Index notation may also be used
-
- >>> g.nth[0, 1]
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
- 4 2 5.0
- >>> g.nth[:-1]
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
-
- Specifying `dropna` allows ignoring ``NaN`` values
-
- >>> g.nth(0, dropna='any')
- A B
- 1 1 2.0
- 2 2 3.0
-
- When the specified ``n`` is larger than any of the groups, an
- empty DataFrame is returned
-
- >>> g.nth(3, dropna='any')
- Empty DataFrame
- Columns: [A, B]
- Index: []
- """
- return GroupByNthSelector(self)
-
- def _nth(
- self,
- n: PositionalIndexer | tuple,
- dropna: Literal["any", "all", None] = None,
- ) -> NDFrameT:
- if not dropna:
- mask = self._make_mask_from_positional_indexer(n)
-
- ids, _, _ = self.grouper.group_info
-
- # Drop NA values in grouping
- mask = mask & (ids != -1)
-
- out = self._mask_selected_obj(mask)
- return out
-
- # dropna is truthy
- if not is_integer(n):
- raise ValueError("dropna option only supported for an integer argument")
-
- if dropna not in ["any", "all"]:
- # Note: when agg-ing picker doesn't raise this, just returns NaN
- raise ValueError(
- "For a DataFrame or Series groupby.nth, dropna must be "
- "either None, 'any' or 'all', "
- f"(was passed {dropna})."
- )
-
- # old behaviour, but with all and any support for DataFrames.
- # modified in GH 7559 to have better perf
- n = cast(int, n)
- dropped = self.obj.dropna(how=dropna, axis=self.axis)
-
- # get a new grouper for our dropped obj
- if self.keys is None and self.level is None:
- # we don't have the grouper info available
- # (e.g. we have selected out
- # a column that is not in the current object)
- axis = self.grouper.axis
- grouper = self.grouper.codes_info[axis.isin(dropped.index)]
- if self.grouper.has_dropped_na:
- # Null groups need to still be encoded as -1 when passed to groupby
- nulls = grouper == -1
- # error: No overload variant of "where" matches argument types
- # "Any", "NAType", "Any"
- values = np.where(nulls, NA, grouper) # type: ignore[call-overload]
- grouper = Index(values, dtype="Int64") # type: ignore[assignment]
-
- else:
- # create a grouper with the original parameters, but on dropped
- # object
- grouper, _, _ = get_grouper( # type: ignore[assignment]
- dropped,
- key=self.keys,
- axis=self.axis,
- level=self.level,
- sort=self.sort,
- )
-
- grb = dropped.groupby(
- grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
- )
- return grb.nth(n)
-
- @final
- def quantile(
- self,
- q: float | AnyArrayLike = 0.5,
- interpolation: str = "linear",
- numeric_only: bool = False,
- ):
- """
- Return group values at the given quantile, a la numpy.percentile.
-
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- Value(s) between 0 and 1 providing the quantile(s) to compute.
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- Method to use when the desired quantile falls between two points.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
-
- numeric_only now defaults to ``False``.
-
- Returns
- -------
- Series or DataFrame
- Return type determined by caller of GroupBy object.
-
- See Also
- --------
- Series.quantile : Similar method for Series.
- DataFrame.quantile : Similar method for DataFrame.
- numpy.percentile : NumPy method to compute qth percentile.
-
- Examples
- --------
- >>> df = pd.DataFrame([
- ... ['a', 1], ['a', 2], ['a', 3],
- ... ['b', 1], ['b', 3], ['b', 5]
- ... ], columns=['key', 'val'])
- >>> df.groupby('key').quantile()
- val
- key
- a 2.0
- b 3.0
- """
-
- def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
- if is_object_dtype(vals):
- raise TypeError(
- "'quantile' cannot be performed against 'object' dtypes!"
- )
-
- inference: DtypeObj | None = None
- if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- inference = vals.dtype
- elif is_integer_dtype(vals.dtype):
- if isinstance(vals, ExtensionArray):
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- else:
- out = vals
- inference = np.dtype(np.int64)
- elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- elif needs_i8_conversion(vals.dtype):
- inference = vals.dtype
- # In this case we need to delay the casting until after the
- # np.lexsort below.
- # error: Incompatible return value type (got
- # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
- # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
- # Optional[Union[dtype[Any], ExtensionDtype]]]")
- return vals, inference # type: ignore[return-value]
- elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
- inference = np.dtype(np.float64)
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- else:
- out = np.asarray(vals)
-
- return out, inference
-
- def post_processor(
- vals: np.ndarray,
- inference: DtypeObj | None,
- result_mask: np.ndarray | None,
- orig_vals: ArrayLike,
- ) -> ArrayLike:
- if inference:
- # Check for edge case
- if isinstance(orig_vals, BaseMaskedArray):
- assert result_mask is not None # for mypy
-
- if interpolation in {"linear", "midpoint"} and not is_float_dtype(
- orig_vals
- ):
- return FloatingArray(vals, result_mask)
- else:
- # Item "ExtensionDtype" of "Union[ExtensionDtype, str,
- # dtype[Any], Type[object]]" has no attribute "numpy_dtype"
- # [union-attr]
- return type(orig_vals)(
- vals.astype(
- inference.numpy_dtype # type: ignore[union-attr]
- ),
- result_mask,
- )
-
- elif not (
- is_integer_dtype(inference)
- and interpolation in {"linear", "midpoint"}
- ):
- if needs_i8_conversion(inference):
- # error: Item "ExtensionArray" of "Union[ExtensionArray,
- # ndarray[Any, Any]]" has no attribute "_ndarray"
- vals = vals.astype("i8").view(
- orig_vals._ndarray.dtype # type: ignore[union-attr]
- )
- # error: Item "ExtensionArray" of "Union[ExtensionArray,
- # ndarray[Any, Any]]" has no attribute "_from_backing_data"
- return orig_vals._from_backing_data( # type: ignore[union-attr]
- vals
- )
-
- assert isinstance(inference, np.dtype) # for mypy
- return vals.astype(inference)
-
- return vals
-
- orig_scalar = is_scalar(q)
- if orig_scalar:
- # error: Incompatible types in assignment (expression has type "List[
- # Union[float, ExtensionArray, ndarray[Any, Any], Index, Series]]",
- # variable has type "Union[float, Union[Union[ExtensionArray, ndarray[
- # Any, Any]], Index, Series]]")
- q = [q] # type: ignore[assignment]
-
- qs = np.array(q, dtype=np.float64)
- ids, _, ngroups = self.grouper.group_info
- nqs = len(qs)
-
- func = partial(
- libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation
- )
-
- # Put '-1' (NaN) labels as the last group so it does not interfere
- # with the calculations. Note: length check avoids failure on empty
- # labels. In that case, the value doesn't matter
- na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0
- labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids)
-
- def blk_func(values: ArrayLike) -> ArrayLike:
- orig_vals = values
- if isinstance(values, BaseMaskedArray):
- mask = values._mask
- result_mask = np.zeros((ngroups, nqs), dtype=np.bool_)
- else:
- mask = isna(values)
- result_mask = None
-
- is_datetimelike = needs_i8_conversion(values.dtype)
-
- vals, inference = pre_processor(values)
-
- ncols = 1
- if vals.ndim == 2:
- ncols = vals.shape[0]
- shaped_labels = np.broadcast_to(
- labels_for_lexsort, (ncols, len(labels_for_lexsort))
- )
- else:
- shaped_labels = labels_for_lexsort
-
- out = np.empty((ncols, ngroups, nqs), dtype=np.float64)
-
- # Get an index of values sorted by values and then labels
- order = (vals, shaped_labels)
- sort_arr = np.lexsort(order).astype(np.intp, copy=False)
-
- if is_datetimelike:
- # This casting needs to happen after the lexsort in order
- # to ensure that NaTs are placed at the end and not the front
- vals = vals.view("i8").astype(np.float64)
-
- if vals.ndim == 1:
- # EAs are always 1d
- func(
- out[0],
- values=vals,
- mask=mask,
- sort_indexer=sort_arr,
- result_mask=result_mask,
- )
- else:
- for i in range(ncols):
- func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i])
-
- if vals.ndim == 1:
- out = out.ravel("K")
- if result_mask is not None:
- result_mask = result_mask.ravel("K")
- else:
- out = out.reshape(ncols, ngroups * nqs)
- return post_processor(out, inference, result_mask, orig_vals)
-
- data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
- res_mgr = data.grouped_reduce(blk_func)
-
- res = self._wrap_agged_manager(res_mgr)
-
- if orig_scalar:
- # Avoid expensive MultiIndex construction
- return self._wrap_aggregated_output(res)
- return self._wrap_aggregated_output(res, qs=qs)
-
- @final
- @Substitution(name="groupby")
- def ngroup(self, ascending: bool = True):
- """
- Number each group from 0 to the number of groups - 1.
-
- This is the enumerative complement of cumcount. Note that the
- numbers given to the groups match the order in which the groups
- would be seen when iterating over the groupby object, not the
- order they are first observed.
-
- Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN`
- and will be excluded from the count.
-
- Parameters
- ----------
- ascending : bool, default True
- If False, number in reverse, from number of group - 1 to 0.
-
- Returns
- -------
- Series
- Unique numbers for each group.
-
- See Also
- --------
- .cumcount : Number the rows in each group.
-
- Examples
- --------
- >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]})
- >>> df
- color
- 0 red
- 1 None
- 2 red
- 3 blue
- 4 blue
- 5 red
- >>> df.groupby("color").ngroup()
- 0 1.0
- 1 NaN
- 2 1.0
- 3 0.0
- 4 0.0
- 5 1.0
- dtype: float64
- >>> df.groupby("color", dropna=False).ngroup()
- 0 1
- 1 2
- 2 1
- 3 0
- 4 0
- 5 1
- dtype: int64
- >>> df.groupby("color", dropna=False).ngroup(ascending=False)
- 0 1
- 1 0
- 2 1
- 3 2
- 4 2
- 5 1
- dtype: int64
- """
- obj = self._obj_with_exclusions
- index = obj._get_axis(self.axis)
- comp_ids = self.grouper.group_info[0]
-
- dtype: type
- if self.grouper.has_dropped_na:
- comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
- dtype = np.float64
- else:
- dtype = np.int64
-
- if any(ping._passed_categorical for ping in self.grouper.groupings):
- # comp_ids reflect non-observed groups, we need only observed
- comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
-
- result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
- if not ascending:
- result = self.ngroups - 1 - result
- return result
-
- @final
- @Substitution(name="groupby")
- def cumcount(self, ascending: bool = True):
- """
- Number each item in each group from 0 to the length of that group - 1.
-
- Essentially this is equivalent to
-
- .. code-block:: python
-
- self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
-
- Parameters
- ----------
- ascending : bool, default True
- If False, number in reverse, from length of group - 1 to 0.
-
- Returns
- -------
- Series
- Sequence number of each element within each group.
-
- See Also
- --------
- .ngroup : Number the groups themselves.
-
- Examples
- --------
- >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
- ... columns=['A'])
- >>> df
- A
- 0 a
- 1 a
- 2 a
- 3 b
- 4 b
- 5 a
- >>> df.groupby('A').cumcount()
- 0 0
- 1 1
- 2 2
- 3 0
- 4 1
- 5 3
- dtype: int64
- >>> df.groupby('A').cumcount(ascending=False)
- 0 3
- 1 2
- 2 1
- 3 1
- 4 0
- 5 0
- dtype: int64
- """
- index = self._obj_with_exclusions._get_axis(self.axis)
- cumcounts = self._cumcount_array(ascending=ascending)
- return self._obj_1d_constructor(cumcounts, index)
-
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def rank(
- self,
- method: str = "average",
- ascending: bool = True,
- na_option: str = "keep",
- pct: bool = False,
- axis: AxisInt = 0,
- ) -> NDFrameT:
- """
- Provide the rank of values within each group.
-
- Parameters
- ----------
- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
- * average: average rank of group.
- * min: lowest rank in group.
- * max: highest rank in group.
- * first: ranks assigned in order they appear in the array.
- * dense: like 'min', but rank always increases by 1 between groups.
- ascending : bool, default True
- False for ranks by high (1) to low (N).
- na_option : {'keep', 'top', 'bottom'}, default 'keep'
- * keep: leave NA values where they are.
- * top: assign the smallest rank to NA values.
- * bottom: assign the largest rank to NA values.
- pct : bool, default False
- Compute percentage rank of data within each group.
- axis : int, default 0
- The axis of the object over which to compute the rank.
-
- Returns
- -------
- DataFrame with ranking of values within each group
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
- ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
- ... }
- ... )
- >>> df
- group value
- 0 a 2
- 1 a 4
- 2 a 2
- 3 a 3
- 4 a 5
- 5 b 1
- 6 b 2
- 7 b 4
- 8 b 1
- 9 b 5
- >>> for method in ['average', 'min', 'max', 'dense', 'first']:
- ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
- >>> df
- group value average_rank min_rank max_rank dense_rank first_rank
- 0 a 2 1.5 1.0 2.0 1.0 1.0
- 1 a 4 4.0 4.0 4.0 3.0 4.0
- 2 a 2 1.5 1.0 2.0 1.0 2.0
- 3 a 3 3.0 3.0 3.0 2.0 3.0
- 4 a 5 5.0 5.0 5.0 4.0 5.0
- 5 b 1 1.5 1.0 2.0 1.0 1.0
- 6 b 2 3.0 3.0 3.0 2.0 3.0
- 7 b 4 4.0 4.0 4.0 3.0 4.0
- 8 b 1 1.5 1.0 2.0 1.0 2.0
- 9 b 5 5.0 5.0 5.0 4.0 5.0
- """
- if na_option not in {"keep", "top", "bottom"}:
- msg = "na_option must be one of 'keep', 'top', or 'bottom'"
- raise ValueError(msg)
-
- kwargs = {
- "ties_method": method,
- "ascending": ascending,
- "na_option": na_option,
- "pct": pct,
- }
- if axis != 0:
- # DataFrame uses different keyword name
- kwargs["method"] = kwargs.pop("ties_method")
- f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
- result = self._python_apply_general(
- f, self._selected_obj, is_transform=True
- )
- return result
-
- return self._cython_transform(
- "rank",
- numeric_only=False,
- axis=axis,
- **kwargs,
- )
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cumprod(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
- """
- Cumulative product for each group.
-
- Returns
- -------
- Series or DataFrame
- """
- nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
- if axis != 0:
- f = lambda x: x.cumprod(axis=axis, **kwargs)
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
-
- return self._cython_transform("cumprod", **kwargs)
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cumsum(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
- """
- Cumulative sum for each group.
-
- Returns
- -------
- Series or DataFrame
- """
- nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
- if axis != 0:
- f = lambda x: x.cumsum(axis=axis, **kwargs)
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
-
- return self._cython_transform("cumsum", **kwargs)
-
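The ``cumprod``/``cumsum`` docstrings above have no examples; a short sketch (invented data) showing that running totals restart at each group boundary:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [1, 2, 3, 4, 5]})

print(df.groupby("key")["val"].cumsum())   # 1, 3, 6, 4, 9
print(df.groupby("key")["val"].cumprod())  # 1, 2, 6, 4, 20
```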
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cummin(
- self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
- ) -> NDFrameT:
- """
- Cumulative min for each group.
-
- Returns
- -------
- Series or DataFrame
- """
- skipna = kwargs.get("skipna", True)
- if axis != 0:
- f = lambda x: np.minimum.accumulate(x, axis)
- obj = self._selected_obj
- if numeric_only:
- obj = obj._get_numeric_data()
- return self._python_apply_general(f, obj, is_transform=True)
-
- return self._cython_transform(
- "cummin", numeric_only=numeric_only, skipna=skipna
- )
-
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cummax(
- self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
- ) -> NDFrameT:
- """
- Cumulative max for each group.
-
- Returns
- -------
- Series or DataFrame
- """
- skipna = kwargs.get("skipna", True)
- if axis != 0:
- f = lambda x: np.maximum.accumulate(x, axis)
- obj = self._selected_obj
- if numeric_only:
- obj = obj._get_numeric_data()
- return self._python_apply_general(f, obj, is_transform=True)
-
- return self._cython_transform(
- "cummax", numeric_only=numeric_only, skipna=skipna
- )
-
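Likewise for ``cummin``/``cummax``: the running extrema are tracked per group, not across the whole column. A sketch with invented data:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [3, 1, 2, 5, 7]})

print(df.groupby("key")["val"].cummin())  # 3, 1, 1, 5, 5
print(df.groupby("key")["val"].cummax())  # 3, 3, 3, 5, 7
```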
- @final
- def _get_cythonized_result(
- self,
- base_func: Callable,
- cython_dtype: np.dtype,
- numeric_only: bool = False,
- needs_counts: bool = False,
- pre_processing=None,
- post_processing=None,
- how: str = "any_all",
- **kwargs,
- ):
- """
- Get result for Cythonized functions.
-
- Parameters
- ----------
- base_func : callable, Cythonized function to be called
- cython_dtype : np.dtype
- Type of the array that will be modified by the Cython call.
- numeric_only : bool, default False
- Whether only numeric datatypes should be computed
- needs_counts : bool, default False
- Whether the counts should be a part of the Cython call
- pre_processing : function, default None
- Function to be applied to `values` prior to passing to Cython.
- Function should return a tuple where the first element is the
- values to be passed to Cython and the second element is an optional
- type which the values should be converted to after being returned
- by the Cython operation. This function is also responsible for
- raising a TypeError if the values have an invalid type.
- post_processing : function, default None
- Function to be applied to result of Cython function. Should accept
- an array of values as the first argument and type inferences as its
- second argument, i.e. the signature should be (ndarray, Type). A
- ``nullable`` keyword argument is always passed, and for ``how='std'``
- with nullable values a ``result_mask`` is passed as well, to allow
- for processing specific to nullable values.
- how : str, default any_all
- Determines whether the any/all Cython interface or the std interface
- is used.
- **kwargs : dict
- Extra arguments to be passed back to Cython funcs
-
- Returns
- -------
- `Series` or `DataFrame` with the computed values
- """
- if post_processing and not callable(post_processing):
- raise ValueError("'post_processing' must be a callable!")
- if pre_processing and not callable(pre_processing):
- raise ValueError("'pre_processing' must be a callable!")
-
- grouper = self.grouper
-
- ids, _, ngroups = grouper.group_info
-
- base_func = partial(base_func, labels=ids)
-
- def blk_func(values: ArrayLike) -> ArrayLike:
- values = values.T
- ncols = 1 if values.ndim == 1 else values.shape[1]
-
- result: ArrayLike
- result = np.zeros(ngroups * ncols, dtype=cython_dtype)
- result = result.reshape((ngroups, ncols))
-
- func = partial(base_func, out=result)
-
- inferences = None
-
- if needs_counts:
- counts = np.zeros(ngroups, dtype=np.int64)
- func = partial(func, counts=counts)
-
- is_datetimelike = values.dtype.kind in ["m", "M"]
- vals = values
- if is_datetimelike and how == "std":
- vals = vals.view("i8")
- if pre_processing:
- vals, inferences = pre_processing(vals)
-
- vals = vals.astype(cython_dtype, copy=False)
- if vals.ndim == 1:
- vals = vals.reshape((-1, 1))
- func = partial(func, values=vals)
-
- if how != "std" or isinstance(values, BaseMaskedArray):
- mask = isna(values).view(np.uint8)
- if mask.ndim == 1:
- mask = mask.reshape(-1, 1)
- func = partial(func, mask=mask)
-
- if how != "std":
- is_nullable = isinstance(values, BaseMaskedArray)
- func = partial(func, nullable=is_nullable)
-
- elif isinstance(values, BaseMaskedArray):
- result_mask = np.zeros(result.shape, dtype=np.bool_)
- func = partial(func, result_mask=result_mask)
-
- # Call func to modify result in place
- if how == "std":
- func(**kwargs, is_datetimelike=is_datetimelike)
- else:
- func(**kwargs)
-
- if values.ndim == 1:
- assert result.shape[1] == 1, result.shape
- result = result[:, 0]
-
- if post_processing:
- pp_kwargs: dict[str, bool | np.ndarray] = {}
- pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)
- if how == "std" and pp_kwargs["nullable"]:
- pp_kwargs["result_mask"] = result_mask
-
- result = post_processing(result, inferences, **pp_kwargs)
-
- if how == "std" and is_datetimelike:
- values = cast("DatetimeArray | TimedeltaArray", values)
- unit = values.unit
- with warnings.catch_warnings():
- # suppress "RuntimeWarning: invalid value encountered in cast"
- warnings.filterwarnings("ignore")
- result = result.astype(np.int64, copy=False)
- result = result.view(f"m8[{unit}]")
-
- return result.T
-
- # Operate block-wise instead of column-by-column
- mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
-
- res_mgr = mgr.grouped_reduce(blk_func)
-
- out = self._wrap_agged_manager(res_mgr)
- return self._wrap_aggregated_output(out)
-
- @final
- @Substitution(name="groupby")
- def shift(self, periods: int = 1, freq=None, axis: Axis = 0, fill_value=None):
- """
- Shift each group by periods observations.
-
- If freq is passed, the index will be increased using the periods and the freq.
-
- Parameters
- ----------
- periods : int, default 1
- Number of periods to shift.
- freq : str, optional
- Frequency string.
- axis : int or str, default 0
- The axis to shift along.
- fill_value : optional
- The scalar value to use for newly introduced missing values.
-
- Returns
- -------
- Series or DataFrame
- Object shifted within each group.
-
- See Also
- --------
- Index.shift : Shift values of Index.
- """
- if freq is not None or axis != 0:
- f = lambda x: x.shift(periods, freq, axis, fill_value)
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
-
- ids, _, ngroups = self.grouper.group_info
- res_indexer = np.zeros(len(ids), dtype=np.int64)
-
- libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)
-
- obj = self._obj_with_exclusions
-
- res = obj._reindex_with_indexers(
- {self.axis: (obj.axes[self.axis], res_indexer)},
- fill_value=fill_value,
- allow_dups=True,
- )
- return res
-
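The ``shift`` docstring above has no example; a minimal sketch with invented data showing that shifting happens within each group:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [1, 2, 3, 4, 5]})

# Positions with no prior row in their group become NaN
# unless fill_value is given
print(df.groupby("key")["val"].shift(1))
print(df.groupby("key")["val"].shift(1, fill_value=0))
```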
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def diff(self, periods: int = 1, axis: AxisInt = 0) -> NDFrameT:
- """
- First discrete difference of element.
-
- Calculates the difference of each element compared with another
- element in the group (default is element in previous row).
-
- Parameters
- ----------
- periods : int, default 1
- Periods to shift for calculating difference, accepts negative values.
- axis : int, default 0
- Take difference over rows (0) or columns (1).
-
- Returns
- -------
- Series or DataFrame
- First differences.
- """
- if axis != 0:
- return self.apply(lambda x: x.diff(periods=periods, axis=axis))
-
- obj = self._obj_with_exclusions
- shifted = self.shift(periods=periods, axis=axis)
-
- # GH45562 - to retain existing behavior and match behavior of Series.diff(),
- # int8 and int16 are coerced to float32 rather than float64.
- dtypes_to_f32 = ["int8", "int16"]
- if obj.ndim == 1:
- if obj.dtype in dtypes_to_f32:
- shifted = shifted.astype("float32")
- else:
- to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
- if len(to_coerce):
- shifted = shifted.astype({c: "float32" for c in to_coerce})
-
- return obj - shifted
-
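A short illustration of ``diff`` with invented data: differences are taken against the previous row of the same group, so the first row of each group is NaN.

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"], "val": [1, 3, 6, 10, 15]})

print(df.groupby("key")["val"].diff())  # NaN, 2, 3, NaN, 5
```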
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def pct_change(
- self,
- periods: int = 1,
- fill_method: FillnaOptions = "ffill",
- limit=None,
- freq=None,
- axis: Axis = 0,
- ):
- """
- Calculate pct_change of each value to previous entry in group.
-
- Returns
- -------
- Series or DataFrame
- Percentage changes within each group.
- """
- # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
- # GH#23918 is fixed
- if freq is not None or axis != 0:
- f = lambda x: x.pct_change(
- periods=periods,
- fill_method=fill_method,
- limit=limit,
- freq=freq,
- axis=axis,
- )
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
-
- if fill_method is None: # GH30463
- fill_method = "ffill"
- limit = 0
- filled = getattr(self, fill_method)(limit=limit)
- fill_grp = filled.groupby(
- self.grouper.codes, axis=self.axis, group_keys=self.group_keys
- )
- shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
- return (filled / shifted) - 1
-
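The ``pct_change`` docstring above has no example; a minimal sketch (invented data) of the per-group percentage change relative to the previous value:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "a", "b", "b"],
                   "val": [1.0, 2.0, 3.0, 4.0, 6.0]})

print(df.groupby("key")["val"].pct_change())  # NaN, 1.0, 0.5, NaN, 0.5
```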
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def head(self, n: int = 5) -> NDFrameT:
- """
- Return first n rows of each group.
-
- Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
- from the original DataFrame with original index and order preserved
- (``as_index`` flag is ignored).
-
- Parameters
- ----------
- n : int
- If positive: number of entries to include from start of each group.
- If negative: number of entries to exclude from end of each group.
-
- Returns
- -------
- Series or DataFrame
- Subset of original Series or DataFrame as determined by n.
- %(see_also)s
- Examples
- --------
-
- >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
- ... columns=['A', 'B'])
- >>> df.groupby('A').head(1)
- A B
- 0 1 2
- 2 5 6
- >>> df.groupby('A').head(-1)
- A B
- 0 1 2
- """
- mask = self._make_mask_from_positional_indexer(slice(None, n))
- return self._mask_selected_obj(mask)
-
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def tail(self, n: int = 5) -> NDFrameT:
- """
- Return last n rows of each group.
-
- Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
- from the original DataFrame with original index and order preserved
- (``as_index`` flag is ignored).
-
- Parameters
- ----------
- n : int
- If positive: number of entries to include from end of each group.
- If negative: number of entries to exclude from start of each group.
-
- Returns
- -------
- Series or DataFrame
- Subset of original Series or DataFrame as determined by n.
- %(see_also)s
- Examples
- --------
-
- >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
- ... columns=['A', 'B'])
- >>> df.groupby('A').tail(1)
- A B
- 1 a 2
- 3 b 2
- >>> df.groupby('A').tail(-1)
- A B
- 1 a 2
- 3 b 2
- """
- if n:
- mask = self._make_mask_from_positional_indexer(slice(-n, None))
- else:
- mask = self._make_mask_from_positional_indexer([])
-
- return self._mask_selected_obj(mask)
-
- @final
- def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
- """
- Return _selected_obj with mask applied to the correct axis.
-
- Parameters
- ----------
- mask : np.ndarray[bool]
- Boolean mask to apply.
-
- Returns
- -------
- Series or DataFrame
- Filtered _selected_obj.
- """
- ids = self.grouper.group_info[0]
- mask = mask & (ids != -1)
-
- if self.axis == 0:
- return self._selected_obj[mask]
- else:
- return self._selected_obj.iloc[:, mask]
-
- @final
- def _reindex_output(
- self,
- output: OutputFrameOrSeries,
- fill_value: Scalar = np.NaN,
- qs: npt.NDArray[np.float64] | None = None,
- ) -> OutputFrameOrSeries:
- """
- If we have categorical groupers, then we might want to make sure that
- we have a fully re-indexed output to the levels. This means expanding
- the output space to accommodate all values in the cartesian product of
- our groups, regardless of whether they were observed in the data or
- not. This will expand the output space if there are missing groups.
-
- The method returns early without modifying the input if the number of
- groupings is less than 2, ``self.observed`` is True, or none of the
- groupers are categorical.
-
- Parameters
- ----------
- output : Series or DataFrame
- Object resulting from grouping and applying an operation.
- fill_value : scalar, default np.NaN
- Value to use for unobserved categories if self.observed is False.
- qs : np.ndarray[float64] or None, default None
- quantile values, only relevant for quantile.
-
- Returns
- -------
- Series or DataFrame
- Object (potentially) re-indexed to include all possible groups.
- """
- groupings = self.grouper.groupings
- if len(groupings) == 1:
- return output
-
- # if we only care about the observed values
- # we are done
- elif self.observed:
- return output
-
- # reindexing only applies to a Categorical grouper
- elif not any(
- isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
- for ping in groupings
- ):
- return output
-
- levels_list = [ping.group_index for ping in groupings]
- names = self.grouper.names
- if qs is not None:
- # error: Argument 1 to "append" of "list" has incompatible type
- # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
- levels_list.append(qs) # type: ignore[arg-type]
- names = names + [None]
- index = MultiIndex.from_product(levels_list, names=names)
- if self.sort:
- index = index.sort_values()
-
- if self.as_index:
- # Always holds for SeriesGroupBy unless GH#36507 is implemented
- d = {
- self.obj._get_axis_name(self.axis): index,
- "copy": False,
- "fill_value": fill_value,
- }
- return output.reindex(**d) # type: ignore[arg-type]
-
- # GH 13204
- # Here, the categorical in-axis groupers, which need to be fully
- # expanded, are columns in `output`. An idea is to do:
- # output = output.set_index(self.grouper.names)
- # .reindex(index).reset_index()
- # but special care has to be taken because of possible not-in-axis
- # groupers.
- # So, we manually select and drop the in-axis grouper columns,
- # reindex `output`, and then reset the in-axis grouper columns.
-
- # Select in-axis groupers
- in_axis_grps = list(
- (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
- )
- if len(in_axis_grps) > 0:
- g_nums, g_names = zip(*in_axis_grps)
- output = output.drop(labels=list(g_names), axis=1)
-
- # Set a temp index and reindex (possibly expanding)
- output = output.set_index(self.grouper.result_index).reindex(
- index, copy=False, fill_value=fill_value
- )
-
- # Reset in-axis grouper columns
- # (using level numbers `g_nums` because level names may not be unique)
- if len(in_axis_grps) > 0:
- output = output.reset_index(level=g_nums)
-
- return output.reset_index(drop=True)
-
- @final
- def sample(
- self,
- n: int | None = None,
- frac: float | None = None,
- replace: bool = False,
- weights: Sequence | Series | None = None,
- random_state: RandomState | None = None,
- ):
- """
- Return a random sample of items from each group.
-
- You can use `random_state` for reproducibility.
-
- .. versionadded:: 1.1.0
-
- Parameters
- ----------
- n : int, optional
- Number of items to return for each group. Cannot be used with
- `frac` and must be no larger than the smallest group unless
- `replace` is True. Default is one if `frac` is None.
- frac : float, optional
- Fraction of items to return. Cannot be used with `n`.
- replace : bool, default False
- Allow or disallow sampling of the same row more than once.
- weights : list-like, optional
- Default None results in equal probability weighting.
- If passed a list-like then values must have the same length as
- the underlying DataFrame or Series object and will be used as
- sampling probabilities after normalization within each group.
- Values must be non-negative with at least one positive element
- within each group.
- random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
- If int, array-like, or BitGenerator, seed for random number generator.
- If np.random.RandomState or np.random.Generator, use as given.
-
- .. versionchanged:: 1.4.0
-
- np.random.Generator objects now accepted
-
- Returns
- -------
- Series or DataFrame
- A new object of same type as caller containing items randomly
- sampled within each group from the caller object.
-
- See Also
- --------
- DataFrame.sample: Generate random samples from a DataFrame object.
- numpy.random.choice: Generate a random sample from a given 1-D numpy
- array.
-
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
- ... )
- >>> df
- a b
- 0 red 0
- 1 red 1
- 2 blue 2
- 3 blue 3
- 4 black 4
- 5 black 5
-
- Select one row at random for each distinct value in column a. The
- `random_state` argument can be used to guarantee reproducibility:
-
- >>> df.groupby("a").sample(n=1, random_state=1)
- a b
- 4 black 4
- 2 blue 2
- 1 red 1
-
- Set `frac` to sample fixed proportions rather than counts:
-
- >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
- 5 5
- 2 2
- 0 0
- Name: b, dtype: int64
-
- Control sample probabilities within groups by setting weights:
-
- >>> df.groupby("a").sample(
- ... n=1,
- ... weights=[1, 1, 1, 0, 0, 1],
- ... random_state=1,
- ... )
- a b
- 5 black 5
- 2 blue 2
- 0 red 0
- """ # noqa:E501
- if self._selected_obj.empty:
- # GH48459 prevent ValueError when object is empty
- return self._selected_obj
- size = sample.process_sampling_size(n, frac, replace)
- if weights is not None:
- weights_arr = sample.preprocess_weights(
- self._selected_obj, weights, axis=self.axis
- )
-
- random_state = com.random_state(random_state)
-
- group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
-
- sampled_indices = []
- for labels, obj in group_iterator:
- grp_indices = self.indices[labels]
- group_size = len(grp_indices)
- if size is not None:
- sample_size = size
- else:
- assert frac is not None
- sample_size = round(frac * group_size)
-
- grp_sample = sample.sample(
- group_size,
- size=sample_size,
- replace=replace,
- weights=None if weights is None else weights_arr[grp_indices],
- random_state=random_state,
- )
- sampled_indices.append(grp_indices[grp_sample])
-
- sampled_indices = np.concatenate(sampled_indices)
- return self._selected_obj.take(sampled_indices, axis=self.axis)
-
-
-@doc(GroupBy)
-def get_groupby(
- obj: NDFrame,
- by: _KeysArgType | None = None,
- axis: AxisInt = 0,
- grouper: ops.BaseGrouper | None = None,
- group_keys: bool = True,
-) -> GroupBy:
- klass: type[GroupBy]
- if isinstance(obj, Series):
- from pandas.core.groupby.generic import SeriesGroupBy
-
- klass = SeriesGroupBy
- elif isinstance(obj, DataFrame):
- from pandas.core.groupby.generic import DataFrameGroupBy
-
- klass = DataFrameGroupBy
- else: # pragma: no cover
- raise TypeError(f"invalid type: {obj}")
-
- return klass(
- obj=obj,
- keys=by,
- axis=axis,
- grouper=grouper,
- group_keys=group_keys,
- )
-
-
-def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
- """
- Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.
-
- The quantile level in the MultiIndex is a repeated copy of 'qs'.
-
- Parameters
- ----------
- idx : Index
- qs : np.ndarray[float64]
-
- Returns
- -------
- MultiIndex
- """
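-    # Illustrative sketch (for exposition only): for a flat index and two
-    # quantiles, e.g.
-    #   _insert_quantile_level(Index(["a", "b"]), np.array([0.25, 0.75]))
-    # the result is the product-order MultiIndex
-    #   [("a", 0.25), ("a", 0.75), ("b", 0.25), ("b", 0.75)]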
- nqs = len(qs)
-
- if idx._is_multi:
- idx = cast(MultiIndex, idx)
- lev_codes, lev = Index(qs).factorize()
- levels = list(idx.levels) + [lev]
- codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
- mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
- else:
- mi = MultiIndex.from_product([idx, qs])
- return mi
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/grouper.py b/contrib/python/pandas/py3/pandas/core/groupby/grouper.py
deleted file mode 100644
index f735ce682fc..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/grouper.py
+++ /dev/null
@@ -1,1044 +0,0 @@
-"""
-Provide user facing operators for doing the split part of the
-split-apply-combine paradigm.
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Iterator,
- final,
-)
-import warnings
-
-import numpy as np
-
-from pandas._config import using_copy_on_write
-
-from pandas._typing import (
- ArrayLike,
- Axis,
- NDFrameT,
- npt,
-)
-from pandas.errors import InvalidIndexError
-from pandas.util._decorators import cache_readonly
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_list_like,
- is_scalar,
-)
-
-from pandas.core import algorithms
-from pandas.core.arrays import (
- Categorical,
- ExtensionArray,
-)
-import pandas.core.common as com
-from pandas.core.frame import DataFrame
-from pandas.core.groupby import ops
-from pandas.core.groupby.categorical import recode_for_groupby
-from pandas.core.indexes.api import (
- CategoricalIndex,
- Index,
- MultiIndex,
-)
-from pandas.core.series import Series
-
-from pandas.io.formats.printing import pprint_thing
-
-if TYPE_CHECKING:
- from pandas.core.generic import NDFrame
-
-
-class Grouper:
- """
- A Grouper allows the user to specify a groupby instruction for an object.
-
- This specification will select a column via the key parameter, or if the
- level and/or axis parameters are given, a level of the index of the target
- object.
-
- If `axis` and/or `level` are passed as keywords to both `Grouper` and
- `groupby`, the values passed to `Grouper` take precedence.
-
- Parameters
- ----------
- key : str, defaults to None
- Groupby key, which selects the grouping column of the target.
- level : name/number, defaults to None
- The level for the target index.
- freq : str / frequency object, defaults to None
-        This will group by the specified frequency if the target selection
- (via key or level) is a datetime-like object. For full specification
- of available frequencies, please see `here
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
- axis : str, int, defaults to 0
- Number/name of the axis.
-    sort : bool, default False
- Whether to sort the resulting labels.
- closed : {'left' or 'right'}
- Closed end of interval. Only when `freq` parameter is passed.
- label : {'left' or 'right'}
- Interval boundary to use for labeling.
- Only when `freq` parameter is passed.
- convention : {'start', 'end', 'e', 's'}
- If grouper is PeriodIndex and `freq` parameter is passed.
-
- origin : Timestamp or str, default 'start_day'
- The timestamp on which to adjust the grouping. The timezone of origin must
- match the timezone of the index.
- If string, must be one of the following:
-
- - 'epoch': `origin` is 1970-01-01
- - 'start': `origin` is the first value of the timeseries
- - 'start_day': `origin` is the first day at midnight of the timeseries
-
- .. versionadded:: 1.1.0
-
- - 'end': `origin` is the last value of the timeseries
- - 'end_day': `origin` is the ceiling midnight of the last day
-
- .. versionadded:: 1.3.0
-
- offset : Timedelta or str, default is None
- An offset timedelta added to the origin.
-
- .. versionadded:: 1.1.0
-
- dropna : bool, default True
- If True, and if group keys contain NA values, NA values together with
- row/column will be dropped. If False, NA values will also be treated as
- the key in groups.
-
- .. versionadded:: 1.2.0
-
- Returns
- -------
- A specification for a groupby instruction
-
- Examples
- --------
- Syntactic sugar for ``df.groupby('A')``
-
- >>> df = pd.DataFrame(
- ... {
- ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
- ... "Speed": [100, 5, 200, 300, 15],
- ... }
- ... )
- >>> df
- Animal Speed
- 0 Falcon 100
- 1 Parrot 5
- 2 Falcon 200
- 3 Falcon 300
- 4 Parrot 15
- >>> df.groupby(pd.Grouper(key="Animal")).mean()
- Speed
- Animal
- Falcon 200.0
- Parrot 10.0
-
- Specify a resample operation on the column 'Publish date'
-
- >>> df = pd.DataFrame(
- ... {
- ... "Publish date": [
- ... pd.Timestamp("2000-01-02"),
- ... pd.Timestamp("2000-01-02"),
- ... pd.Timestamp("2000-01-09"),
- ... pd.Timestamp("2000-01-16")
- ... ],
- ... "ID": [0, 1, 2, 3],
- ... "Price": [10, 20, 30, 40]
- ... }
- ... )
- >>> df
- Publish date ID Price
- 0 2000-01-02 0 10
- 1 2000-01-02 1 20
- 2 2000-01-09 2 30
- 3 2000-01-16 3 40
- >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
- ID Price
- Publish date
- 2000-01-02 0.5 15.0
- 2000-01-09 2.0 30.0
- 2000-01-16 3.0 40.0
-
- If you want to adjust the start of the bins based on a fixed timestamp:
-
- >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
- >>> rng = pd.date_range(start, end, freq='7min')
- >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
- >>> ts
- 2000-10-01 23:30:00 0
- 2000-10-01 23:37:00 3
- 2000-10-01 23:44:00 6
- 2000-10-01 23:51:00 9
- 2000-10-01 23:58:00 12
- 2000-10-02 00:05:00 15
- 2000-10-02 00:12:00 18
- 2000-10-02 00:19:00 21
- 2000-10-02 00:26:00 24
- Freq: 7T, dtype: int64
-
- >>> ts.groupby(pd.Grouper(freq='17min')).sum()
- 2000-10-01 23:14:00 0
- 2000-10-01 23:31:00 9
- 2000-10-01 23:48:00 21
- 2000-10-02 00:05:00 54
- 2000-10-02 00:22:00 24
- Freq: 17T, dtype: int64
-
- >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
- 2000-10-01 23:18:00 0
- 2000-10-01 23:35:00 18
- 2000-10-01 23:52:00 27
- 2000-10-02 00:09:00 39
- 2000-10-02 00:26:00 24
- Freq: 17T, dtype: int64
-
- >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
- 2000-10-01 23:24:00 3
- 2000-10-01 23:41:00 15
- 2000-10-01 23:58:00 45
- 2000-10-02 00:15:00 45
- Freq: 17T, dtype: int64
-
- If you want to adjust the start of the bins with an `offset` Timedelta, the two
- following lines are equivalent:
-
- >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
- 2000-10-01 23:30:00 9
- 2000-10-01 23:47:00 21
- 2000-10-02 00:04:00 54
- 2000-10-02 00:21:00 24
- Freq: 17T, dtype: int64
-
- >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
- 2000-10-01 23:30:00 9
- 2000-10-01 23:47:00 21
- 2000-10-02 00:04:00 54
- 2000-10-02 00:21:00 24
- Freq: 17T, dtype: int64
-
-    To replace the use of the deprecated `base` argument, you can now use
-    `offset`; in this example it is equivalent to `base=2`:
-
- >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
- 2000-10-01 23:16:00 0
- 2000-10-01 23:33:00 9
- 2000-10-01 23:50:00 36
- 2000-10-02 00:07:00 39
- 2000-10-02 00:24:00 24
- Freq: 17T, dtype: int64
- """
-
- sort: bool
- dropna: bool
- _gpr_index: Index | None
- _grouper: Index | None
-
- _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")
-
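-    # Note (illustrative): when ``freq`` is passed, ``__new__`` below swaps the
-    # class for pandas.core.resample.TimeGrouper, so e.g. ``pd.Grouper(freq="1D")``
-    # constructs a TimeGrouper rather than a plain Grouper.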
- def __new__(cls, *args, **kwargs):
- if kwargs.get("freq") is not None:
- from pandas.core.resample import TimeGrouper
-
- cls = TimeGrouper
- return super().__new__(cls)
-
- def __init__(
- self,
- key=None,
- level=None,
- freq=None,
- axis: Axis = 0,
- sort: bool = False,
- dropna: bool = True,
- ) -> None:
- self.key = key
- self.level = level
- self.freq = freq
- self.axis = axis
- self.sort = sort
- self.dropna = dropna
-
- self._grouper_deprecated = None
- self._indexer_deprecated = None
- self._obj_deprecated = None
- self._gpr_index = None
- self.binner = None
- self._grouper = None
- self._indexer = None
-
- def _get_grouper(
- self, obj: NDFrameT, validate: bool = True
- ) -> tuple[ops.BaseGrouper, NDFrameT]:
- """
- Parameters
- ----------
- obj : Series or DataFrame
- validate : bool, default True
- if True, validate the grouper
-
- Returns
- -------
- a tuple of grouper, obj (possibly sorted)
- """
- obj, _, _ = self._set_grouper(obj)
- grouper, _, obj = get_grouper(
- obj,
- [self.key],
- axis=self.axis,
- level=self.level,
- sort=self.sort,
- validate=validate,
- dropna=self.dropna,
- )
- # Without setting this, subsequent lookups to .groups raise
- # error: Incompatible types in assignment (expression has type "BaseGrouper",
- # variable has type "None")
- self._grouper_deprecated = grouper # type: ignore[assignment]
-
- return grouper, obj
-
- @final
- def _set_grouper(
- self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None
- ):
- """
-        Given an object and the specifications, set up the internal grouper
-        for this particular specification.
-
- Parameters
- ----------
- obj : Series or DataFrame
- sort : bool, default False
- whether the resulting grouper should be sorted
- gpr_index : Index or None, default None
-
- Returns
- -------
- NDFrame
- Index
- np.ndarray[np.intp] | None
- """
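-        # Illustrative sketch (for exposition only): for a Grouper(key="B") on a
-        # DataFrame, the returned axis is Index(obj["B"], name="B"); if sorting
-        # is requested and that axis is not monotonic, both ``obj`` and the axis
-        # are re-ordered and the sort indexer is returned as the third element.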
- assert obj is not None
-
- indexer = None
-
- if self.key is not None and self.level is not None:
- raise ValueError("The Grouper cannot specify both a key and a level!")
-
- # Keep self._grouper value before overriding
- if self._grouper is None:
- # TODO: What are we assuming about subsequent calls?
- self._grouper = gpr_index
- self._indexer = self._indexer_deprecated
-
- # the key must be a valid info item
- if self.key is not None:
- key = self.key
- # The 'on' is already defined
- if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
- # Sometimes self._grouper will have been resorted while
- # obj has not. In this case there is a mismatch when we
- # call self._grouper.take(obj.index) so we need to undo the sorting
- # before we call _grouper.take.
- assert self._grouper is not None
- if self._indexer is not None:
- reverse_indexer = self._indexer.argsort()
- unsorted_ax = self._grouper.take(reverse_indexer)
- ax = unsorted_ax.take(obj.index)
- else:
- ax = self._grouper.take(obj.index)
- else:
- if key not in obj._info_axis:
- raise KeyError(f"The grouper name {key} is not found")
- ax = Index(obj[key], name=key)
-
- else:
- ax = obj._get_axis(self.axis)
- if self.level is not None:
- level = self.level
-
- # if a level is given it must be a mi level or
- # equivalent to the axis name
- if isinstance(ax, MultiIndex):
- level = ax._get_level_number(level)
- ax = Index(ax._get_level_values(level), name=ax.names[level])
-
- else:
- if level not in (0, ax.name):
- raise ValueError(f"The level {level} is not valid")
-
- # possibly sort
- if (self.sort or sort) and not ax.is_monotonic_increasing:
- # use stable sort to support first, last, nth
- # TODO: why does putting na_position="first" fix datetimelike cases?
- indexer = self._indexer_deprecated = ax.array.argsort(
- kind="mergesort", na_position="first"
- )
- ax = ax.take(indexer)
- obj = obj.take(indexer, axis=self.axis)
-
- # error: Incompatible types in assignment (expression has type
- # "NDFrameT", variable has type "None")
- self._obj_deprecated = obj # type: ignore[assignment]
- self._gpr_index = ax
- return obj, ax, indexer
-
- @final
- @property
- def ax(self) -> Index:
- warnings.warn(
- f"{type(self).__name__}.ax is deprecated and will be removed in a "
- "future version. Use Resampler.ax instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- index = self._gpr_index
- if index is None:
- raise ValueError("_set_grouper must be called before ax is accessed")
- return index
-
- @final
- @property
- def indexer(self):
- warnings.warn(
- f"{type(self).__name__}.indexer is deprecated and will be removed "
- "in a future version. Use Resampler.indexer instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self._indexer_deprecated
-
- @final
- @property
- def obj(self):
- warnings.warn(
- f"{type(self).__name__}.obj is deprecated and will be removed "
- "in a future version. Use GroupBy.indexer instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self._obj_deprecated
-
- @final
- @property
- def grouper(self):
- warnings.warn(
- f"{type(self).__name__}.grouper is deprecated and will be removed "
- "in a future version. Use GroupBy.grouper instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self._grouper_deprecated
-
- @final
- @property
- def groups(self):
- warnings.warn(
- f"{type(self).__name__}.groups is deprecated and will be removed "
- "in a future version. Use GroupBy.groups instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- # error: "None" has no attribute "groups"
- return self._grouper_deprecated.groups # type: ignore[attr-defined]
-
- @final
- def __repr__(self) -> str:
- attrs_list = (
- f"{attr_name}={repr(getattr(self, attr_name))}"
- for attr_name in self._attributes
- if getattr(self, attr_name) is not None
- )
- attrs = ", ".join(attrs_list)
- cls_name = type(self).__name__
- return f"{cls_name}({attrs})"
-
-
-@final
-class Grouping:
- """
- Holds the grouping information for a single key
-
- Parameters
- ----------
- index : Index
- grouper :
- obj : DataFrame or Series
- name : Label
- level :
- observed : bool, default False
- If we are a Categorical, use the observed values
-    in_axis : bool
-        Whether the Grouping is a column in self.obj and hence in the
-        GroupBy exclusions list.
- dropna : bool, default True
- Whether to drop NA groups.
- uniques : Array-like, optional
- When specified, will be used for unique values. Enables including empty groups
- in the result for a BinGrouper. Must not contain duplicates.
-
-    Attributes
-    ----------
- indices : dict
- Mapping of {group -> index_list}
- codes : ndarray
- Group codes
- group_index : Index or None
- unique groups
- groups : dict
- Mapping of {group -> label_list}
- """
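-    # Illustrative sketch (for exposition only): for a grouping vector like
-    # ["b", "a", "b"] with sort=True, ``codes`` is [1, 0, 1] and ``group_index``
-    # is Index(["a", "b"]); ``indices`` maps each label to the positions it
-    # occupies, e.g. {"a": [1], "b": [0, 2]}.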
-
- _codes: npt.NDArray[np.signedinteger] | None = None
- _group_index: Index | None = None
- _all_grouper: Categorical | None
- _orig_cats: Index | None
- _index: Index
-
- def __init__(
- self,
- index: Index,
- grouper=None,
- obj: NDFrame | None = None,
- level=None,
- sort: bool = True,
- observed: bool = False,
- in_axis: bool = False,
- dropna: bool = True,
- uniques: ArrayLike | None = None,
- ) -> None:
- self.level = level
- self._orig_grouper = grouper
- grouping_vector = _convert_grouper(index, grouper)
- self._all_grouper = None
- self._orig_cats = None
- self._index = index
- self._sort = sort
- self.obj = obj
- self._observed = observed
- self.in_axis = in_axis
- self._dropna = dropna
- self._uniques = uniques
-
-        # we have a single grouper which may be any of several things,
-        # some of which depend on the passed-in level
-
- ilevel = self._ilevel
- if ilevel is not None:
- # In extant tests, the new self.grouping_vector matches
- # `index.get_level_values(ilevel)` whenever
- # mapper is None and isinstance(index, MultiIndex)
- if isinstance(index, MultiIndex):
- index_level = index.get_level_values(ilevel)
- else:
- index_level = index
-
- if grouping_vector is None:
- grouping_vector = index_level
- else:
- mapper = grouping_vector
- grouping_vector = index_level.map(mapper)
-
- # a passed Grouper like, directly get the grouper in the same way
- # as single grouper groupby, use the group_info to get codes
- elif isinstance(grouping_vector, Grouper):
- # get the new grouper; we already have disambiguated
- # what key/level refer to exactly, don't need to
- # check again as we have by this point converted these
- # to an actual value (rather than a pd.Grouper)
- assert self.obj is not None # for mypy
- newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
- self.obj = newobj
-
- if isinstance(newgrouper, ops.BinGrouper):
- # TODO: can we unwrap this and get a tighter typing
- # for self.grouping_vector?
- grouping_vector = newgrouper
- else:
- # ops.BaseGrouper
- # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
- # If that were to occur, would we be throwing out information?
- # error: Cannot determine type of "grouping_vector" [has-type]
- ng = newgrouper.groupings[0].grouping_vector # type: ignore[has-type]
- # use Index instead of ndarray so we can recover the name
- grouping_vector = Index(ng, name=newgrouper.result_index.name)
-
- elif not isinstance(
- grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
- ):
- # no level passed
- if getattr(grouping_vector, "ndim", 1) != 1:
- t = str(type(grouping_vector))
- raise ValueError(f"Grouper for '{t}' not 1-dimensional")
-
- grouping_vector = index.map(grouping_vector)
-
- if not (
- hasattr(grouping_vector, "__len__")
- and len(grouping_vector) == len(index)
- ):
- grper = pprint_thing(grouping_vector)
- errmsg = (
- "Grouper result violates len(labels) == "
- f"len(data)\nresult: {grper}"
- )
- raise AssertionError(errmsg)
-
- if isinstance(grouping_vector, np.ndarray):
- if grouping_vector.dtype.kind in ["m", "M"]:
- # if we have a date/time-like grouper, make sure that we have
- # Timestamps like
- # TODO 2022-10-08 we only have one test that gets here and
- # values are already in nanoseconds in that case.
- grouping_vector = Series(grouping_vector).to_numpy()
- elif is_categorical_dtype(grouping_vector):
- # a passed Categorical
- self._orig_cats = grouping_vector.categories
- grouping_vector, self._all_grouper = recode_for_groupby(
- grouping_vector, sort, observed
- )
-
- self.grouping_vector = grouping_vector
-
- def __repr__(self) -> str:
- return f"Grouping({self.name})"
-
- def __iter__(self) -> Iterator:
- return iter(self.indices)
-
- @cache_readonly
- def _passed_categorical(self) -> bool:
- return is_categorical_dtype(self.grouping_vector)
-
- @cache_readonly
- def name(self) -> Hashable:
- ilevel = self._ilevel
- if ilevel is not None:
- return self._index.names[ilevel]
-
- if isinstance(self._orig_grouper, (Index, Series)):
- return self._orig_grouper.name
-
- elif isinstance(self.grouping_vector, ops.BaseGrouper):
- return self.grouping_vector.result_index.name
-
- elif isinstance(self.grouping_vector, Index):
- return self.grouping_vector.name
-
- # otherwise we have ndarray or ExtensionArray -> no name
- return None
-
- @cache_readonly
- def _ilevel(self) -> int | None:
- """
-        If necessary, convert the index level name to an index level position.
- """
- level = self.level
- if level is None:
- return None
- if not isinstance(level, int):
- index = self._index
- if level not in index.names:
- raise AssertionError(f"Level {level} not in index")
- return index.names.index(level)
- return level
-
- @property
- def ngroups(self) -> int:
- return len(self.group_index)
-
- @cache_readonly
- def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
- # we have a list of groupers
- if isinstance(self.grouping_vector, ops.BaseGrouper):
- return self.grouping_vector.indices
-
- values = Categorical(self.grouping_vector)
- return values._reverse_indexer()
-
- @property
- def codes(self) -> npt.NDArray[np.signedinteger]:
- return self._codes_and_uniques[0]
-
- @cache_readonly
- def group_arraylike(self) -> ArrayLike:
- """
- Analogous to result_index, but holding an ArrayLike to ensure
- we can retain ExtensionDtypes.
- """
- if self._all_grouper is not None:
- # retain dtype for categories, including unobserved ones
- return self.result_index._values
-
- elif self._passed_categorical:
- return self.group_index._values
-
- return self._codes_and_uniques[1]
-
- @cache_readonly
- def result_index(self) -> Index:
- # result_index retains dtype for categories, including unobserved ones,
- # which group_index does not
- if self._all_grouper is not None:
- group_idx = self.group_index
- assert isinstance(group_idx, CategoricalIndex)
- cats = self._orig_cats
- # set_categories is dynamically added
- return group_idx.set_categories(cats) # type: ignore[attr-defined]
- return self.group_index
-
- @cache_readonly
- def group_index(self) -> Index:
- codes, uniques = self._codes_and_uniques
- if not self._dropna and self._passed_categorical:
- assert isinstance(uniques, Categorical)
- if self._sort and (codes == len(uniques)).any():
- # Add NA value on the end when sorting
- uniques = Categorical.from_codes(
- np.append(uniques.codes, [-1]), uniques.categories
- )
- elif len(codes) > 0:
- # Need to determine proper placement of NA value when not sorting
- cat = self.grouping_vector
- na_idx = (cat.codes < 0).argmax()
- if cat.codes[na_idx] < 0:
- # count number of unique codes that comes before the nan value
- na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
- uniques = Categorical.from_codes(
- np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
- )
- return Index._with_infer(uniques, name=self.name)
-
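-    # Illustrative note on the NA handling below (for exposition only): for a
-    # Categorical grouping vector with categories ["a", "b"], dropna=False and
-    # sort=True, NA values are assigned the code len(categories) == 2 by
-    # _codes_and_uniques, and group_index above appends a trailing NA entry so
-    # the resulting group index becomes ["a", "b", NaN].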
- @cache_readonly
- def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
- uniques: ArrayLike
- if self._passed_categorical:
- # we make a CategoricalIndex out of the cat grouper
- # preserving the categories / ordered attributes;
- # doesn't (yet - GH#46909) handle dropna=False
- cat = self.grouping_vector
- categories = cat.categories
-
- if self._observed:
- ucodes = algorithms.unique1d(cat.codes)
- ucodes = ucodes[ucodes != -1]
- if self._sort:
- ucodes = np.sort(ucodes)
- else:
- ucodes = np.arange(len(categories))
-
- uniques = Categorical.from_codes(
- codes=ucodes, categories=categories, ordered=cat.ordered
- )
-
- codes = cat.codes
- if not self._dropna:
- na_mask = codes < 0
- if np.any(na_mask):
- if self._sort:
- # Replace NA codes with `largest code + 1`
- na_code = len(categories)
- codes = np.where(na_mask, na_code, codes)
- else:
- # Insert NA code into the codes based on first appearance
- # A negative code must exist, no need to check codes[na_idx] < 0
- na_idx = na_mask.argmax()
- # count number of unique codes that comes before the nan value
- na_code = algorithms.nunique_ints(codes[:na_idx])
- codes = np.where(codes >= na_code, codes + 1, codes)
- codes = np.where(na_mask, na_code, codes)
-
- if not self._observed:
- uniques = uniques.reorder_categories(self._orig_cats)
-
- return codes, uniques
-
- elif isinstance(self.grouping_vector, ops.BaseGrouper):
- # we have a list of groupers
- codes = self.grouping_vector.codes_info
- uniques = self.grouping_vector.result_index._values
- elif self._uniques is not None:
- # GH#50486 Code grouping_vector using _uniques; allows
- # including uniques that are not present in grouping_vector.
- cat = Categorical(self.grouping_vector, categories=self._uniques)
- codes = cat.codes
- uniques = self._uniques
- else:
- # GH35667, replace dropna=False with use_na_sentinel=False
- # error: Incompatible types in assignment (expression has type "Union[
- # ndarray[Any, Any], Index]", variable has type "Categorical")
- codes, uniques = algorithms.factorize( # type: ignore[assignment]
- self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
- )
- return codes, uniques
-
- @cache_readonly
- def groups(self) -> dict[Hashable, np.ndarray]:
- return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
-
-
-def get_grouper(
- obj: NDFrameT,
- key=None,
- axis: Axis = 0,
- level=None,
- sort: bool = True,
- observed: bool = False,
- validate: bool = True,
- dropna: bool = True,
-) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
- """
- Create and return a BaseGrouper, which is an internal
- mapping of how to create the grouper indexers.
- This may be composed of multiple Grouping objects, indicating
- multiple groupers
-
-    Groupers are ultimately index mappings. They can originate as
-    index mappings, keys to columns, functions, or Groupers.
-
-    Groupers enable local references to axis, level, and sort, while
-    the passed-in axis, level, and sort are 'global'.
-
-    This routine tries to figure out what the passed-in references
-    are and then creates a Grouping for each one, combined into
-    a BaseGrouper.
-
-    If observed is True and we have a categorical grouper, only show the
-    observed values.
-
- If validate, then check for key/level overlaps.
-
- """
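-    # Illustrative sketch (for exposition only): for df.groupby("A") this is
-    # called with key="A"; the column is wrapped in a single Grouping, "A" is
-    # recorded in the returned exclusions frozenset, and the Groupings are
-    # combined into one BaseGrouper.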
- group_axis = obj._get_axis(axis)
-
- # validate that the passed single level is compatible with the passed
- # axis of the object
- if level is not None:
- # TODO: These if-block and else-block are almost same.
- # MultiIndex instance check is removable, but it seems that there are
- # some processes only for non-MultiIndex in else-block,
- # eg. `obj.index.name != level`. We have to consider carefully whether
- # these are applicable for MultiIndex. Even if these are applicable,
- # we need to check if it makes no side effect to subsequent processes
- # on the outside of this condition.
- # (GH 17621)
- if isinstance(group_axis, MultiIndex):
- if is_list_like(level) and len(level) == 1:
- level = level[0]
-
- if key is None and is_scalar(level):
- # Get the level values from group_axis
- key = group_axis.get_level_values(level)
- level = None
-
- else:
- # allow level to be a length-one list-like object
- # (e.g., level=[0])
- # GH 13901
- if is_list_like(level):
- nlevels = len(level)
- if nlevels == 1:
- level = level[0]
- elif nlevels == 0:
- raise ValueError("No group keys passed!")
- else:
- raise ValueError("multiple levels only valid with MultiIndex")
-
- if isinstance(level, str):
- if obj._get_axis(axis).name != level:
- raise ValueError(
- f"level name {level} is not the name "
- f"of the {obj._get_axis_name(axis)}"
- )
- elif level > 0 or level < -1:
- raise ValueError("level > 0 or level < -1 only valid with MultiIndex")
-
- # NOTE: `group_axis` and `group_axis.get_level_values(level)`
- # are same in this section.
- level = None
- key = group_axis
-
- # a passed-in Grouper, directly convert
- if isinstance(key, Grouper):
- grouper, obj = key._get_grouper(obj, validate=False)
- if key.key is None:
- return grouper, frozenset(), obj
- else:
- return grouper, frozenset({key.key}), obj
-
- # already have a BaseGrouper, just return it
- elif isinstance(key, ops.BaseGrouper):
- return key, frozenset(), obj
-
- if not isinstance(key, list):
- keys = [key]
- match_axis_length = False
- else:
- keys = key
- match_axis_length = len(keys) == len(group_axis)
-
- # what are we after, exactly?
- any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
- any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
- any_arraylike = any(
- isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
- )
-
- # is this an index replacement?
- if (
- not any_callable
- and not any_arraylike
- and not any_groupers
- and match_axis_length
- and level is None
- ):
- if isinstance(obj, DataFrame):
- all_in_columns_index = all(
- g in obj.columns or g in obj.index.names for g in keys
- )
- else:
- assert isinstance(obj, Series)
- all_in_columns_index = all(g in obj.index.names for g in keys)
-
- if not all_in_columns_index:
- keys = [com.asarray_tuplesafe(keys)]
-
- if isinstance(level, (tuple, list)):
- if key is None:
- keys = [None] * len(level)
- levels = level
- else:
- levels = [level] * len(keys)
-
- groupings: list[Grouping] = []
- exclusions: set[Hashable] = set()
-
- # if the actual grouper should be obj[key]
- def is_in_axis(key) -> bool:
- if not _is_label_like(key):
- if obj.ndim == 1:
- return False
-
- # items -> .columns for DataFrame, .index for Series
- items = obj.axes[-1]
- try:
- items.get_loc(key)
- except (KeyError, TypeError, InvalidIndexError):
- # TypeError shows up here if we pass e.g. an Index
- return False
-
- return True
-
- # if the grouper is obj[name]
- def is_in_obj(gpr) -> bool:
- if not hasattr(gpr, "name"):
- return False
- if using_copy_on_write():
- # For the CoW case, we check the references to determine if the
- # series is part of the object
- try:
- obj_gpr_column = obj[gpr.name]
- except (KeyError, IndexError, InvalidIndexError):
- return False
- if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
- return gpr._mgr.references_same_values( # type: ignore[union-attr]
- obj_gpr_column._mgr, 0 # type: ignore[arg-type]
- )
- return False
- try:
- return gpr is obj[gpr.name]
- except (KeyError, IndexError, InvalidIndexError):
- # IndexError reached in e.g. test_skip_group_keys when we pass
- # lambda here
- # InvalidIndexError raised on key-types inappropriate for index,
- # e.g. DatetimeIndex.get_loc(tuple())
- return False
-
- for gpr, level in zip(keys, levels):
- if is_in_obj(gpr): # df.groupby(df['name'])
- in_axis = True
- exclusions.add(gpr.name)
-
- elif is_in_axis(gpr): # df.groupby('name')
- if obj.ndim != 1 and gpr in obj:
- if validate:
- obj._check_label_or_level_ambiguity(gpr, axis=axis)
- in_axis, name, gpr = True, gpr, obj[gpr]
- if gpr.ndim != 1:
- # non-unique columns; raise here to get the name in the
- # exception message
- raise ValueError(f"Grouper for '{name}' not 1-dimensional")
- exclusions.add(name)
- elif obj._is_level_reference(gpr, axis=axis):
- in_axis, level, gpr = False, gpr, None
- else:
- raise KeyError(gpr)
- elif isinstance(gpr, Grouper) and gpr.key is not None:
- # Add key to exclusions
- exclusions.add(gpr.key)
- in_axis = True
- else:
- in_axis = False
-
- # create the Grouping
-        # allow passing the actual Grouping as the gpr
- ping = (
- Grouping(
- group_axis,
- gpr,
- obj=obj,
- level=level,
- sort=sort,
- observed=observed,
- in_axis=in_axis,
- dropna=dropna,
- )
- if not isinstance(gpr, Grouping)
- else gpr
- )
-
- groupings.append(ping)
-
- if len(groupings) == 0 and len(obj):
- raise ValueError("No group keys passed!")
- if len(groupings) == 0:
- groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))
-
- # create the internals grouper
- grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
- return grouper, frozenset(exclusions), obj
-
-
-def _is_label_like(val) -> bool:
- return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
-
-
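-# Illustrative sketch of _convert_grouper (for exposition only): a dict grouper
-# becomes its ``.get`` method, a Series aligned with ``axis`` contributes its
-# values (reindexed first if the indexes differ), and a list/tuple of the same
-# length as ``axis`` is converted with com.asarray_tuplesafe; a length mismatch
-# raises ValueError.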
-def _convert_grouper(axis: Index, grouper):
- if isinstance(grouper, dict):
- return grouper.get
- elif isinstance(grouper, Series):
- if grouper.index.equals(axis):
- return grouper._values
- else:
- return grouper.reindex(axis)._values
- elif isinstance(grouper, MultiIndex):
- return grouper._values
- elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
- if len(grouper) != len(axis):
- raise ValueError("Grouper and axis must be same length")
-
- if isinstance(grouper, (list, tuple)):
- grouper = com.asarray_tuplesafe(grouper)
- return grouper
- else:
- return grouper
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/indexing.py b/contrib/python/pandas/py3/pandas/core/groupby/indexing.py
deleted file mode 100644
index 911ee0e8e47..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/indexing.py
+++ /dev/null
@@ -1,303 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Iterable,
- Literal,
- cast,
-)
-
-import numpy as np
-
-from pandas._typing import PositionalIndexer
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.common import (
- is_integer,
- is_list_like,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
- from pandas.core.groupby import groupby
-
-
-class GroupByIndexingMixin:
- """
- Mixin for adding ._positional_selector to GroupBy.
- """
-
- @cache_readonly
- def _positional_selector(self) -> GroupByPositionalSelector:
- """
- Return positional selection for each group.
-
- ``groupby._positional_selector[i:j]`` is similar to
- ``groupby.apply(lambda x: x.iloc[i:j])``
- but much faster and preserves the original index and order.
-
- ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
- and :meth:`~GroupBy.tail`. For example:
-
- - ``head(5)``
- - ``_positional_selector[5:-5]``
- - ``tail(5)``
-
- together return all the rows.
-
- Allowed inputs for the index are:
-
- - An integer valued iterable, e.g. ``range(2, 4)``.
- - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.
-
- The output format is the same as :meth:`~GroupBy.head` and
- :meth:`~GroupBy.tail`, namely
- a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.
-
- Returns
- -------
- Series
- The filtered subset of the original Series.
- DataFrame
- The filtered subset of the original DataFrame.
-
- See Also
- --------
- DataFrame.iloc : Purely integer-location based indexing for selection by
- position.
- GroupBy.head : Return first n rows of each group.
- GroupBy.tail : Return last n rows of each group.
- GroupBy.nth : Take the nth row from each group if n is an int, or a
- subset of rows, if n is a list of ints.
-
- Notes
- -----
- - The slice step cannot be negative.
- - If the index specification results in overlaps, the item is not duplicated.
- - If the index specification changes the order of items, then
- they are returned in their original order.
- By contrast, ``DataFrame.iloc`` can change the row order.
- - ``groupby()`` parameters such as as_index and dropna are ignored.
-
- The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
- with ``as_index=False`` are:
-
- - Input to ``_positional_selector`` can include
- one or more slices whereas ``nth``
- just handles an integer or a list of integers.
- - ``_positional_selector`` can accept a slice relative to the
- last row of each group.
- - ``_positional_selector`` does not have an equivalent to the
- ``nth()`` ``dropna`` parameter.
-
- Examples
- --------
- >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
- ... columns=["A", "B"])
- >>> df.groupby("A")._positional_selector[1:2]
- A B
- 1 a 2
- 4 b 5
-
- >>> df.groupby("A")._positional_selector[1, -1]
- A B
- 1 a 2
- 2 a 3
- 4 b 5
- """
- if TYPE_CHECKING:
- # pylint: disable-next=used-before-assignment
- groupby_self = cast(groupby.GroupBy, self)
- else:
- groupby_self = self
-
- return GroupByPositionalSelector(groupby_self)
-
- def _make_mask_from_positional_indexer(
- self,
- arg: PositionalIndexer | tuple,
- ) -> np.ndarray:
- if is_list_like(arg):
- if all(is_integer(i) for i in cast(Iterable, arg)):
- mask = self._make_mask_from_list(cast(Iterable[int], arg))
- else:
- mask = self._make_mask_from_tuple(cast(tuple, arg))
-
- elif isinstance(arg, slice):
- mask = self._make_mask_from_slice(arg)
- elif is_integer(arg):
- mask = self._make_mask_from_int(cast(int, arg))
- else:
- raise TypeError(
- f"Invalid index {type(arg)}. "
- "Must be integer, list-like, slice or a tuple of "
- "integers and slices"
- )
-
- if isinstance(mask, bool):
- if mask:
- mask = self._ascending_count >= 0
- else:
- mask = self._ascending_count < 0
-
- return cast(np.ndarray, mask)
-
- def _make_mask_from_int(self, arg: int) -> np.ndarray:
- if arg >= 0:
- return self._ascending_count == arg
- else:
- return self._descending_count == (-arg - 1)
-
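-    # Illustrative sketch (for exposition only): for args=[0, -1] the mask below
-    # selects rows whose within-group ascending count is 0 or whose descending
-    # count is 0, i.e. the first and last row of every group.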
- def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
- positive = [arg for arg in args if arg >= 0]
- negative = [-arg - 1 for arg in args if arg < 0]
-
- mask: bool | np.ndarray = False
-
- if positive:
- mask |= np.isin(self._ascending_count, positive)
-
- if negative:
- mask |= np.isin(self._descending_count, negative)
-
- return mask
-
- def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
- mask: bool | np.ndarray = False
-
- for arg in args:
- if is_integer(arg):
- mask |= self._make_mask_from_int(cast(int, arg))
- elif isinstance(arg, slice):
- mask |= self._make_mask_from_slice(arg)
- else:
- raise ValueError(
- f"Invalid argument {type(arg)}. Should be int or slice."
- )
-
- return mask
-
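-    # Illustrative sketch (for exposition only): slice(None, 3) yields the mask
-    # _ascending_count < 3 (the first three rows of each group, like head(3)),
-    # while slice(1, None) yields _ascending_count >= 1 (everything but the
-    # first row of each group).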
- def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
- start = arg.start
- stop = arg.stop
- step = arg.step
-
- if step is not None and step < 0:
- raise ValueError(f"Invalid step {step}. Must be non-negative")
-
- mask: bool | np.ndarray = True
-
- if step is None:
- step = 1
-
- if start is None:
- if step > 1:
- mask &= self._ascending_count % step == 0
-
- elif start >= 0:
- mask &= self._ascending_count >= start
-
- if step > 1:
- mask &= (self._ascending_count - start) % step == 0
-
- else:
- mask &= self._descending_count < -start
-
- offset_array = self._descending_count + start + 1
- limit_array = (
- self._ascending_count + self._descending_count + (start + 1)
- ) < 0
- offset_array = np.where(limit_array, self._ascending_count, offset_array)
-
- mask &= offset_array % step == 0
-
- if stop is not None:
- if stop >= 0:
- mask &= self._ascending_count < stop
- else:
- mask &= self._descending_count >= -stop
-
- return mask
-
- @cache_readonly
- def _ascending_count(self) -> np.ndarray:
- if TYPE_CHECKING:
- groupby_self = cast(groupby.GroupBy, self)
- else:
- groupby_self = self
-
- return groupby_self._cumcount_array()
-
- @cache_readonly
- def _descending_count(self) -> np.ndarray:
- if TYPE_CHECKING:
- groupby_self = cast(groupby.GroupBy, self)
- else:
- groupby_self = self
-
- return groupby_self._cumcount_array(ascending=False)
-
-
-@doc(GroupByIndexingMixin._positional_selector)
-class GroupByPositionalSelector:
- def __init__(self, groupby_object: groupby.GroupBy) -> None:
- self.groupby_object = groupby_object
-
- def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
- """
- Select by positional index per group.
-
- Implements GroupBy._positional_selector
-
- Parameters
- ----------
- arg : PositionalIndexer | tuple
- Allowed values are:
- - int
- - int valued iterable such as list or range
- - slice with step either None or positive
- - tuple of integers and slices
-
- Returns
- -------
- Series
- The filtered subset of the original groupby Series.
- DataFrame
- The filtered subset of the original groupby DataFrame.
-
- See Also
- --------
- DataFrame.iloc : Integer-location based indexing for selection by position.
- GroupBy.head : Return first n rows of each group.
- GroupBy.tail : Return last n rows of each group.
- GroupBy._positional_selector : Return positional selection for each group.
- GroupBy.nth : Take the nth row from each group if n is an int, or a
- subset of rows, if n is a list of ints.
- """
- mask = self.groupby_object._make_mask_from_positional_indexer(arg)
- return self.groupby_object._mask_selected_obj(mask)
-
-
-class GroupByNthSelector:
- """
-    Dynamically substituted for GroupBy.nth to enable both calling and indexing.
- """
-
- def __init__(self, groupby_object: groupby.GroupBy) -> None:
- self.groupby_object = groupby_object
-
- def __call__(
- self,
- n: PositionalIndexer | tuple,
- dropna: Literal["any", "all", None] = None,
- ) -> DataFrame | Series:
- return self.groupby_object._nth(n, dropna)
-
- def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
- return self.groupby_object._nth(n)
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/numba_.py b/contrib/python/pandas/py3/pandas/core/groupby/numba_.py
deleted file mode 100644
index 282cb81e743..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/numba_.py
+++ /dev/null
@@ -1,179 +0,0 @@
-"""Common utilities for Numba operations with groupby ops"""
-from __future__ import annotations
-
-import functools
-import inspect
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
-)
-
-import numpy as np
-
-from pandas._typing import Scalar
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.util.numba_ import (
- NumbaUtilError,
- jit_user_function,
-)
-
-
-def validate_udf(func: Callable) -> None:
- """
- Validate user defined function for ops when using Numba with groupby ops.
-
-    The signature of the user defined function must begin with:
-
- def f(values, index, ...):
- ...
-
- Parameters
- ----------
-    func : function
-        user defined function to validate
-
- Returns
- -------
- None
-
- Raises
- ------
- NumbaUtilError
- """
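-    # Illustrative sketch (for exposition only):
-    #   def f(values, index): return values.sum()
-    # passes validation, while
-    #   def f(group): ...
-    # raises NumbaUtilError, because the first two parameters must be named
-    # exactly "values" and "index".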
- if not callable(func):
- raise NotImplementedError(
- "Numba engine can only be used with a single function."
- )
- udf_signature = list(inspect.signature(func).parameters.keys())
- expected_args = ["values", "index"]
- min_number_args = len(expected_args)
- if (
- len(udf_signature) < min_number_args
- or udf_signature[:min_number_args] != expected_args
- ):
- raise NumbaUtilError(
- f"The first {min_number_args} arguments to {func.__name__} must be "
- f"{expected_args}"
- )
-
-
-@functools.lru_cache(maxsize=None)
-def generate_numba_agg_func(
- func: Callable[..., Scalar],
- nopython: bool,
- nogil: bool,
- parallel: bool,
-) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
- """
- Generate a numba jitted agg function specified by values from engine_kwargs.
-
- 1. jit the user's function
- 2. Return a groupby agg function with the jitted function inline
-
- Configurations specified in engine_kwargs apply to both the user's
- function _AND_ the groupby evaluation loop.
-
- Parameters
- ----------
- func : function
- function to be applied to each group and will be JITed
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
-
- Returns
- -------
- Numba function
- """
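-    # Illustrative note (for exposition only): the generated ``group_agg`` fills
-    # a (num_groups, num_columns) array by applying the jitted UDF to each
-    # group's slice values[begin[i]:end[i], j], column by column.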
- numba_func = jit_user_function(func, nopython, nogil, parallel)
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def group_agg(
- values: np.ndarray,
- index: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- num_columns: int,
- *args: Any,
- ) -> np.ndarray:
- assert len(begin) == len(end)
- num_groups = len(begin)
-
- result = np.empty((num_groups, num_columns))
- for i in numba.prange(num_groups):
- group_index = index[begin[i] : end[i]]
- for j in numba.prange(num_columns):
- group = values[begin[i] : end[i], j]
- result[i, j] = numba_func(group, group_index, *args)
- return result
-
- return group_agg
-
-
-@functools.lru_cache(maxsize=None)
-def generate_numba_transform_func(
- func: Callable[..., np.ndarray],
- nopython: bool,
- nogil: bool,
- parallel: bool,
-) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, Any], np.ndarray]:
- """
- Generate a numba jitted transform function specified by values from engine_kwargs.
-
- 1. jit the user's function
- 2. Return a groupby transform function with the jitted function inline
-
- Configurations specified in engine_kwargs apply to both the user's
- function _AND_ the groupby evaluation loop.
-
- Parameters
- ----------
- func : function
- function to be applied to each window and will be JITed
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
-
- Returns
- -------
- Numba function
- """
- numba_func = jit_user_function(func, nopython, nogil, parallel)
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def group_transform(
- values: np.ndarray,
- index: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- num_columns: int,
- *args: Any,
- ) -> np.ndarray:
- assert len(begin) == len(end)
- num_groups = len(begin)
-
- result = np.empty((len(values), num_columns))
- for i in numba.prange(num_groups):
- group_index = index[begin[i] : end[i]]
- for j in numba.prange(num_columns):
- group = values[begin[i] : end[i], j]
- result[begin[i] : end[i], j] = numba_func(group, group_index, *args)
- return result
-
- return group_transform
diff --git a/contrib/python/pandas/py3/pandas/core/groupby/ops.py b/contrib/python/pandas/py3/pandas/core/groupby/ops.py
deleted file mode 100644
index 52b8301554c..00000000000
--- a/contrib/python/pandas/py3/pandas/core/groupby/ops.py
+++ /dev/null
@@ -1,1278 +0,0 @@
-"""
-Provide classes to perform the groupby aggregate operations.
-
-These are not exposed to the user and provide implementations of the grouping
-operations, primarily in cython. These classes (BaseGrouper and BinGrouper)
-are contained *in* the SeriesGroupBy and DataFrameGroupBy objects.
-"""
-from __future__ import annotations
-
-import collections
-import functools
-from typing import (
- TYPE_CHECKING,
- Callable,
- Generic,
- Hashable,
- Iterator,
- Sequence,
- final,
-)
-
-import numpy as np
-
-from pandas._libs import (
- NaT,
- lib,
-)
-import pandas._libs.groupby as libgroupby
-import pandas._libs.reduction as libreduction
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- DtypeObj,
- NDFrameT,
- Shape,
- npt,
-)
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.cast import (
- maybe_cast_pointwise_result,
- maybe_downcast_to_dtype,
-)
-from pandas.core.dtypes.common import (
- ensure_float64,
- ensure_int64,
- ensure_platform_int,
- ensure_uint64,
- is_1d_only_ea_dtype,
- is_bool_dtype,
- is_complex_dtype,
- is_datetime64_any_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_numeric_dtype,
- is_period_dtype,
- is_sparse,
- is_timedelta64_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas.core.dtypes.missing import (
- isna,
- maybe_fill,
-)
-
-from pandas.core.arrays import (
- Categorical,
- DatetimeArray,
- ExtensionArray,
- PeriodArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.masked import (
- BaseMaskedArray,
- BaseMaskedDtype,
-)
-from pandas.core.arrays.string_ import StringDtype
-from pandas.core.frame import DataFrame
-from pandas.core.groupby import grouper
-from pandas.core.indexes.api import (
- CategoricalIndex,
- Index,
- MultiIndex,
- ensure_index,
-)
-from pandas.core.series import Series
-from pandas.core.sorting import (
- compress_group_index,
- decons_obs_group_ids,
- get_flattened_list,
- get_group_index,
- get_group_index_sorter,
- get_indexer_dict,
-)
-
-if TYPE_CHECKING:
- from pandas.core.generic import NDFrame
-
-
-class WrappedCythonOp:
- """
- Dispatch logic for functions defined in _libs.groupby
-
- Parameters
- ----------
-    kind : str
-        Whether the operation is an aggregate or transform.
-    how : str
-        Operation name, e.g. "mean".
-    has_dropped_na : bool
-        True precisely when dropna=True and the grouper contains a null value.
- """
-
- # Functions for which we do _not_ attempt to cast the cython result
- # back to the original dtype.
- cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"])
-
- def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
- self.kind = kind
- self.how = how
- self.has_dropped_na = has_dropped_na
-
- _CYTHON_FUNCTIONS = {
- "aggregate": {
- "sum": "group_sum",
- "prod": "group_prod",
- "min": "group_min",
- "max": "group_max",
- "mean": "group_mean",
- "median": "group_median_float64",
- "var": "group_var",
- "first": "group_nth",
- "last": "group_last",
- "ohlc": "group_ohlc",
- },
- "transform": {
- "cumprod": "group_cumprod",
- "cumsum": "group_cumsum",
- "cummin": "group_cummin",
- "cummax": "group_cummax",
- "rank": "group_rank",
- },
- }
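-    # Illustrative dispatch examples (for exposition only): kind="aggregate",
-    # how="mean" resolves to libgroupby.group_mean, and kind="transform",
-    # how="rank" resolves to libgroupby.group_rank via _get_cython_function below.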
-
- _cython_arity = {"ohlc": 4} # OHLC
-
- # Note: we make this a classmethod and pass kind+how so that caching
- # works at the class level and not the instance level
- @classmethod
- @functools.lru_cache(maxsize=None)
- def _get_cython_function(
- cls, kind: str, how: str, dtype: np.dtype, is_numeric: bool
- ):
- dtype_str = dtype.name
- ftype = cls._CYTHON_FUNCTIONS[kind][how]
-
- # see if there is a fused-type version of function
- # only valid for numeric
- f = getattr(libgroupby, ftype)
- if is_numeric:
- return f
- elif dtype == np.dtype(object):
- if how in ["median", "cumprod"]:
- # no fused types -> no __signatures__
- raise NotImplementedError(
- f"function is not implemented for this dtype: "
- f"[how->{how},dtype->{dtype_str}]"
- )
- if "object" not in f.__signatures__:
- # raise NotImplementedError here rather than TypeError later
- raise NotImplementedError(
- f"function is not implemented for this dtype: "
- f"[how->{how},dtype->{dtype_str}]"
- )
- return f
- else:
- raise NotImplementedError(
- "This should not be reached. Please report a bug at "
- "github.com/pandas-dev/pandas/",
- dtype,
- )
-
- def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
- """
- Cast numeric dtypes to float64 for functions that only support that.
-
- Parameters
- ----------
- values : np.ndarray
-
- Returns
- -------
- values : np.ndarray
- """
- how = self.how
-
- if how == "median":
- # median only has a float64 implementation
- # We should only get here with is_numeric, as non-numeric cases
- # should raise in _get_cython_function
- values = ensure_float64(values)
-
- elif values.dtype.kind in ["i", "u"]:
- if how in ["var", "mean"] or (
- self.kind == "transform" and self.has_dropped_na
- ):
- # has_dropped_na check need for test_null_group_str_transformer
- # result may still include NaN, so we have to cast
- values = ensure_float64(values)
-
- elif how in ["sum", "ohlc", "prod", "cumsum", "cumprod"]:
- # Avoid overflow during group op
- if values.dtype.kind == "i":
- values = ensure_int64(values)
- else:
- values = ensure_uint64(values)
-
- return values
-
- # TODO: general case implementation overridable by EAs.
- def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
- """
- Check if we can do this operation with our cython functions.
-
- Raises
- ------
- TypeError
- This is not a valid operation for this dtype.
- NotImplementedError
- This may be a valid operation, but does not have a cython implementation.
- """
- how = self.how
-
- if is_numeric:
- # never an invalid op for those dtypes, so return early as fastpath
- return
-
- if isinstance(dtype, CategoricalDtype):
- if how in ["sum", "prod", "cumsum", "cumprod"]:
- raise TypeError(f"{dtype} type does not support {how} operations")
- if how in ["min", "max", "rank"] and not dtype.ordered:
- # raise TypeError instead of NotImplementedError to ensure we
- # don't go down a group-by-group path, since in the empty-groups
- # case that would fail to raise
- raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
- if how not in ["rank"]:
- # only "rank" is implemented in cython
- raise NotImplementedError(f"{dtype} dtype not supported")
-
- elif is_sparse(dtype):
- raise NotImplementedError(f"{dtype} dtype not supported")
- elif is_datetime64_any_dtype(dtype):
- # Adding/multiplying datetimes is not valid
- if how in ["sum", "prod", "cumsum", "cumprod"]:
- raise TypeError(f"datetime64 type does not support {how} operations")
- elif is_period_dtype(dtype):
- # Adding/multiplying Periods is not valid
- if how in ["sum", "prod", "cumsum", "cumprod"]:
- raise TypeError(f"Period type does not support {how} operations")
- elif is_timedelta64_dtype(dtype):
- # timedeltas we can add but not multiply
- if how in ["prod", "cumprod"]:
- raise TypeError(f"timedelta64 type does not support {how} operations")
-
- def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
- how = self.how
- kind = self.kind
-
- arity = self._cython_arity.get(how, 1)
-
- out_shape: Shape
- if how == "ohlc":
- out_shape = (ngroups, arity)
- elif arity > 1:
- raise NotImplementedError(
- "arity of more than 1 is not supported for the 'how' argument"
- )
- elif kind == "transform":
- out_shape = values.shape
- else:
- out_shape = (ngroups,) + values.shape[1:]
- return out_shape
-
- def _get_out_dtype(self, dtype: np.dtype) -> np.dtype:
- how = self.how
-
- if how == "rank":
- out_dtype = "float64"
- else:
- if is_numeric_dtype(dtype):
- out_dtype = f"{dtype.kind}{dtype.itemsize}"
- else:
- out_dtype = "object"
- return np.dtype(out_dtype)
-
- def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
- """
- Get the desired dtype of a result based on the
- input dtype and how it was computed.
-
- Parameters
- ----------
- dtype : np.dtype
-
- Returns
- -------
- np.dtype
- The desired dtype of the result.
- """
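-        # Illustrative sketch (for exposition only): a bool input with how="sum"
-        # yields int64, an integer input with how="mean" yields float64, and a
-        # float input keeps its dtype.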
- how = self.how
-
-        if how in ["sum", "cumsum", "prod", "cumprod"]:
- if dtype == np.dtype(bool):
- return np.dtype(np.int64)
- elif how in ["mean", "median", "var"]:
- if is_float_dtype(dtype) or is_complex_dtype(dtype):
- return dtype
- elif is_numeric_dtype(dtype):
- return np.dtype(np.float64)
- return dtype
-
- @final
- def _ea_wrap_cython_operation(
- self,
- values: ExtensionArray,
- min_count: int,
- ngroups: int,
- comp_ids: np.ndarray,
- **kwargs,
- ) -> ArrayLike:
- """
- If we have an ExtensionArray, unwrap, call _cython_operation, and
- re-wrap if appropriate.
- """
- if isinstance(values, BaseMaskedArray):
- return self._masked_ea_wrap_cython_operation(
- values,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- **kwargs,
- )
-
- elif isinstance(values, Categorical):
- assert self.how == "rank" # the only one implemented ATM
- assert values.ordered # checked earlier
- mask = values.isna()
- npvalues = values._ndarray
-
- res_values = self._cython_op_ndim_compat(
- npvalues,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- mask=mask,
- **kwargs,
- )
-
- # If we ever have more than just "rank" here, we'll need to do
- # `if self.how in self.cast_blocklist` like we do for other dtypes.
- return res_values
-
- npvalues = self._ea_to_cython_values(values)
-
- res_values = self._cython_op_ndim_compat(
- npvalues,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- mask=None,
- **kwargs,
- )
-
- if self.how in self.cast_blocklist:
- # i.e. how in ["rank"], since other cast_blocklist methods don't go
- # through cython_operation
- return res_values
-
- return self._reconstruct_ea_result(values, res_values)
-
- # TODO: general case implementation overridable by EAs.
- def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray:
- # GH#43682
- if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)):
- # All of the functions implemented here are ordinal, so we can
- # operate on the tz-naive equivalents
- npvalues = values._ndarray.view("M8[ns]")
- elif isinstance(values.dtype, StringDtype):
- # StringArray
- npvalues = values.to_numpy(object, na_value=np.nan)
- else:
- raise NotImplementedError(
- f"function is not implemented for this dtype: {values.dtype}"
- )
- return npvalues
-
- # TODO: general case implementation overridable by EAs.
- def _reconstruct_ea_result(
- self, values: ExtensionArray, res_values: np.ndarray
- ) -> ExtensionArray:
- """
- Construct an ExtensionArray result from an ndarray result.
- """
- dtype: BaseMaskedDtype | StringDtype
-
- if isinstance(values.dtype, StringDtype):
- dtype = values.dtype
- string_array_cls = dtype.construct_array_type()
- return string_array_cls._from_sequence(res_values, dtype=dtype)
-
- elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)):
- # In to_cython_values we took a view as M8[ns]
- assert res_values.dtype == "M8[ns]"
- res_values = res_values.view(values._ndarray.dtype)
- return values._from_backing_data(res_values)
-
- raise NotImplementedError
-
- @final
- def _masked_ea_wrap_cython_operation(
- self,
- values: BaseMaskedArray,
- min_count: int,
- ngroups: int,
- comp_ids: np.ndarray,
- **kwargs,
- ) -> BaseMaskedArray:
- """
- Equivalent of `_ea_wrap_cython_operation`, but optimized for masked EA's
- and cython algorithms which accept a mask.
- """
- orig_values = values
-
- # libgroupby functions are responsible for NOT altering mask
- mask = values._mask
- if self.kind != "aggregate":
- result_mask = mask.copy()
- else:
- result_mask = np.zeros(ngroups, dtype=bool)
-
- arr = values._data
-
- res_values = self._cython_op_ndim_compat(
- arr,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- mask=mask,
- result_mask=result_mask,
- **kwargs,
- )
-
- if self.how == "ohlc":
- arity = self._cython_arity.get(self.how, 1)
- result_mask = np.tile(result_mask, (arity, 1)).T
-
- # res_values should already have the correct dtype, we just need to
- # wrap in a MaskedArray
- return orig_values._maybe_mask_result(res_values, result_mask)
-
- @final
- def _cython_op_ndim_compat(
- self,
- values: np.ndarray,
- *,
- min_count: int,
- ngroups: int,
- comp_ids: np.ndarray,
- mask: npt.NDArray[np.bool_] | None = None,
- result_mask: npt.NDArray[np.bool_] | None = None,
- **kwargs,
- ) -> np.ndarray:
- if values.ndim == 1:
- # expand to 2d, dispatch, then squeeze if appropriate
- values2d = values[None, :]
- if mask is not None:
- mask = mask[None, :]
- if result_mask is not None:
- result_mask = result_mask[None, :]
- res = self._call_cython_op(
- values2d,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- mask=mask,
- result_mask=result_mask,
- **kwargs,
- )
- if res.shape[0] == 1:
- return res[0]
-
- # otherwise we have OHLC
- return res.T
-
- return self._call_cython_op(
- values,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- mask=mask,
- result_mask=result_mask,
- **kwargs,
- )
-
- @final
- def _call_cython_op(
- self,
- values: np.ndarray, # np.ndarray[ndim=2]
- *,
- min_count: int,
- ngroups: int,
- comp_ids: np.ndarray,
- mask: npt.NDArray[np.bool_] | None,
- result_mask: npt.NDArray[np.bool_] | None,
- **kwargs,
- ) -> np.ndarray: # np.ndarray[ndim=2]
- orig_values = values
-
- dtype = values.dtype
- is_numeric = is_numeric_dtype(dtype)
-
- is_datetimelike = needs_i8_conversion(dtype)
-
- if is_datetimelike:
- values = values.view("int64")
- is_numeric = True
- elif is_bool_dtype(dtype):
- values = values.view("uint8")
- if values.dtype == "float16":
- values = values.astype(np.float32)
-
- values = values.T
- if mask is not None:
- mask = mask.T
- if result_mask is not None:
- result_mask = result_mask.T
-
- out_shape = self._get_output_shape(ngroups, values)
- func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
- values = self._get_cython_vals(values)
- out_dtype = self._get_out_dtype(values.dtype)
-
- result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
- if self.kind == "aggregate":
- counts = np.zeros(ngroups, dtype=np.int64)
- if self.how in ["min", "max", "mean", "last", "first", "sum"]:
- func(
- out=result,
- counts=counts,
- values=values,
- labels=comp_ids,
- min_count=min_count,
- mask=mask,
- result_mask=result_mask,
- is_datetimelike=is_datetimelike,
- )
- elif self.how in ["var", "ohlc", "prod", "median"]:
- func(
- result,
- counts,
- values,
- comp_ids,
- min_count=min_count,
- mask=mask,
- result_mask=result_mask,
- **kwargs,
- )
- else:
- raise NotImplementedError(f"{self.how} is not implemented")
- else:
- # TODO: min_count
- if self.how != "rank":
- # TODO: should rank take result_mask?
- kwargs["result_mask"] = result_mask
- func(
- out=result,
- values=values,
- labels=comp_ids,
- ngroups=ngroups,
- is_datetimelike=is_datetimelike,
- mask=mask,
- **kwargs,
- )
-
- if self.kind == "aggregate":
- # i.e. counts is defined. Locations where count<min_count
- # need to have the result set to np.nan, which may require casting,
- # see GH#40767
- if is_integer_dtype(result.dtype) and not is_datetimelike:
- # if the op keeps the int dtypes, we have to use 0
- cutoff = max(0 if self.how in ["sum", "prod"] else 1, min_count)
- empty_groups = counts < cutoff
- if empty_groups.any():
- if result_mask is not None:
- assert result_mask[empty_groups].all()
- else:
- # Note: this conversion could be lossy, see GH#40767
- result = result.astype("float64")
- result[empty_groups] = np.nan
-
- result = result.T
-
- if self.how not in self.cast_blocklist:
- # e.g. if we are int64 and need to restore to datetime64/timedelta64
- # "rank" is the only member of cast_blocklist we get here
- # Casting only needed for float16, bool, datetimelike,
- # and self.how in ["sum", "prod", "ohlc", "cumprod"]
- res_dtype = self._get_result_dtype(orig_values.dtype)
- op_result = maybe_downcast_to_dtype(result, res_dtype)
- else:
- op_result = result
-
- return op_result
-
- @final
- def cython_operation(
- self,
- *,
- values: ArrayLike,
- axis: AxisInt,
- min_count: int = -1,
- comp_ids: np.ndarray,
- ngroups: int,
- **kwargs,
- ) -> ArrayLike:
- """
- Call our cython function, with appropriate pre- and post- processing.
- """
- if values.ndim > 2:
- raise NotImplementedError("number of dimensions is currently limited to 2")
- if values.ndim == 2:
- assert axis == 1, axis
- elif not is_1d_only_ea_dtype(values.dtype):
- # Note: it is *not* the case that axis is always 0 for 1-dim values,
- # as we can have 1D ExtensionArrays that we need to treat as 2D
- assert axis == 0
-
- dtype = values.dtype
- is_numeric = is_numeric_dtype(dtype)
-
- # can we do this operation with our cython functions
- # if not raise NotImplementedError
- self._disallow_invalid_ops(dtype, is_numeric)
-
- if not isinstance(values, np.ndarray):
- # i.e. ExtensionArray
- return self._ea_wrap_cython_operation(
- values,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- **kwargs,
- )
-
- return self._cython_op_ndim_compat(
- values,
- min_count=min_count,
- ngroups=ngroups,
- comp_ids=comp_ids,
- mask=None,
- **kwargs,
- )
-
-
-class BaseGrouper:
- """
- This is an internal Grouper class, which actually holds
- the generated groups
-
- Parameters
- ----------
- axis : Index
- groupings : Sequence[Grouping]
- all the grouping instances to handle in this grouper
- for example, when grouping by a list of groupers, pass that list here
- sort : bool, default True
- whether this grouper will produce a sorted result
-
- """
-
- axis: Index
-
- def __init__(
- self,
- axis: Index,
- groupings: Sequence[grouper.Grouping],
- sort: bool = True,
- dropna: bool = True,
- ) -> None:
- assert isinstance(axis, Index), axis
-
- self.axis = axis
- self._groupings: list[grouper.Grouping] = list(groupings)
- self._sort = sort
- self.dropna = dropna
-
- @property
- def groupings(self) -> list[grouper.Grouping]:
- return self._groupings
-
- @property
- def shape(self) -> Shape:
- return tuple(ping.ngroups for ping in self.groupings)
-
- def __iter__(self) -> Iterator[Hashable]:
- return iter(self.indices)
-
- @property
- def nkeys(self) -> int:
- return len(self.groupings)
-
- def get_iterator(
- self, data: NDFrameT, axis: AxisInt = 0
- ) -> Iterator[tuple[Hashable, NDFrameT]]:
- """
- Groupby iterator
-
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- splitter = self._get_splitter(data, axis=axis)
- keys = self.group_keys_seq
- yield from zip(keys, splitter)
-
- @final
- def _get_splitter(self, data: NDFrame, axis: AxisInt = 0) -> DataSplitter:
- """
- Returns
- -------
- Generator yielding subsetted objects
- """
- ids, _, ngroups = self.group_info
- return _get_splitter(data, ids, ngroups, axis=axis)
-
- @final
- @cache_readonly
- def group_keys_seq(self):
- if len(self.groupings) == 1:
- return self.levels[0]
- else:
- ids, _, ngroups = self.group_info
-
- # provide "flattened" iterator for multi-group setting
- return get_flattened_list(ids, ngroups, self.levels, self.codes)
-
- @final
- def apply(
- self, f: Callable, data: DataFrame | Series, axis: AxisInt = 0
- ) -> tuple[list, bool]:
- mutated = False
- splitter = self._get_splitter(data, axis=axis)
- group_keys = self.group_keys_seq
- result_values = []
-
- # This calls DataSplitter.__iter__
- zipped = zip(group_keys, splitter)
-
- for key, group in zipped:
- object.__setattr__(group, "name", key)
-
- # group might be modified
- group_axes = group.axes
- res = f(group)
- if not mutated and not _is_indexed_like(res, group_axes, axis):
- mutated = True
- result_values.append(res)
- # getattr pattern for __name__ is needed for functools.partial objects
- if len(group_keys) == 0 and getattr(f, "__name__", None) in [
- "skew",
- "sum",
- "prod",
- ]:
- # If group_keys is empty, then no function calls have been made,
- # so we will not have raised even if this is an invalid dtype.
- # So do one dummy call here to raise appropriate TypeError.
- f(data.iloc[:0])
-
- return result_values, mutated
-
- @cache_readonly
- def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
- """dict {group name -> group indices}"""
- if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex):
- # This shows unused categories in indices GH#38642
- return self.groupings[0].indices
- codes_list = [ping.codes for ping in self.groupings]
- keys = [ping.group_index for ping in self.groupings]
- return get_indexer_dict(codes_list, keys)
-
- @final
- def result_ilocs(self) -> npt.NDArray[np.intp]:
- """
- Get the original integer locations of result_index in the input.
- """
- # Original indices are where group_index would go via sorting.
- # But when dropna is true, we need to remove null values while accounting for
- # any gaps that then occur because of them.
- group_index = get_group_index(
- self.codes, self.shape, sort=self._sort, xnull=True
- )
- group_index, _ = compress_group_index(group_index, sort=self._sort)
-
- if self.has_dropped_na:
- mask = np.where(group_index >= 0)
- # Count how many gaps are caused by previous null values for each position
- null_gaps = np.cumsum(group_index == -1)[mask]
- group_index = group_index[mask]
-
- result = get_group_index_sorter(group_index, self.ngroups)
-
- if self.has_dropped_na:
- # Shift by the number of prior null gaps
- result += np.take(null_gaps, result)
-
- return result
-
- @final
- @property
- def codes(self) -> list[npt.NDArray[np.signedinteger]]:
- return [ping.codes for ping in self.groupings]
-
- @property
- def levels(self) -> list[Index]:
- return [ping.group_index for ping in self.groupings]
-
- @property
- def names(self) -> list[Hashable]:
- return [ping.name for ping in self.groupings]
-
- @final
- def size(self) -> Series:
- """
- Compute group sizes.
- """
- ids, _, ngroups = self.group_info
- out: np.ndarray | list
- if ngroups:
- out = np.bincount(ids[ids != -1], minlength=ngroups)
- else:
- out = []
- return Series(out, index=self.result_index, dtype="int64")
-
- @cache_readonly
- def groups(self) -> dict[Hashable, np.ndarray]:
- """dict {group name -> group labels}"""
- if len(self.groupings) == 1:
- return self.groupings[0].groups
- else:
- to_groupby = zip(*(ping.grouping_vector for ping in self.groupings))
- index = Index(to_groupby)
- return self.axis.groupby(index)
-
- @final
- @cache_readonly
- def is_monotonic(self) -> bool:
- # return if my group orderings are monotonic
- return Index(self.group_info[0]).is_monotonic_increasing
-
- @final
- @cache_readonly
- def has_dropped_na(self) -> bool:
- """
- Whether grouper has null value(s) that are dropped.
- """
- return bool((self.group_info[0] < 0).any())
-
- @cache_readonly
- def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
- comp_ids, obs_group_ids = self._get_compressed_codes()
-
- ngroups = len(obs_group_ids)
- comp_ids = ensure_platform_int(comp_ids)
-
- return comp_ids, obs_group_ids, ngroups
-
- @cache_readonly
- def codes_info(self) -> npt.NDArray[np.intp]:
- # return the codes of items in original grouped axis
- ids, _, _ = self.group_info
- return ids
-
- @final
- def _get_compressed_codes(
- self,
- ) -> tuple[npt.NDArray[np.signedinteger], npt.NDArray[np.intp]]:
- # The first returned ndarray may have any signed integer dtype
- if len(self.groupings) > 1:
- group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
- return compress_group_index(group_index, sort=self._sort)
- # FIXME: compress_group_index's second return value is int64, not intp
-
- ping = self.groupings[0]
- return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
-
- @final
- @cache_readonly
- def ngroups(self) -> int:
- return len(self.result_index)
-
- @property
- def reconstructed_codes(self) -> list[npt.NDArray[np.intp]]:
- codes = self.codes
- ids, obs_ids, _ = self.group_info
- return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
-
- @cache_readonly
- def result_index(self) -> Index:
- if len(self.groupings) == 1:
- return self.groupings[0].result_index.rename(self.names[0])
-
- codes = self.reconstructed_codes
- levels = [ping.result_index for ping in self.groupings]
- return MultiIndex(
- levels=levels, codes=codes, verify_integrity=False, names=self.names
- )
-
- @final
- def get_group_levels(self) -> list[ArrayLike]:
- # Note: only called from _insert_inaxis_grouper, which
- # is only called for BaseGrouper, never for BinGrouper
- if len(self.groupings) == 1:
- return [self.groupings[0].group_arraylike]
-
- name_list = []
- for ping, codes in zip(self.groupings, self.reconstructed_codes):
- codes = ensure_platform_int(codes)
- levels = ping.group_arraylike.take(codes)
-
- name_list.append(levels)
-
- return name_list
-
- # ------------------------------------------------------------
- # Aggregation functions
-
- @final
- def _cython_operation(
- self,
- kind: str,
- values,
- how: str,
- axis: AxisInt,
- min_count: int = -1,
- **kwargs,
- ) -> ArrayLike:
- """
- Returns the values of a cython operation.
- """
- assert kind in ["transform", "aggregate"]
-
- cy_op = WrappedCythonOp(kind=kind, how=how, has_dropped_na=self.has_dropped_na)
-
- ids, _, _ = self.group_info
- ngroups = self.ngroups
- return cy_op.cython_operation(
- values=values,
- axis=axis,
- min_count=min_count,
- comp_ids=ids,
- ngroups=ngroups,
- **kwargs,
- )
-
- @final
- def agg_series(
- self, obj: Series, func: Callable, preserve_dtype: bool = False
- ) -> ArrayLike:
- """
- Parameters
- ----------
- obj : Series
- func : function taking a Series and returning a scalar-like
- preserve_dtype : bool
- Whether the aggregation is known to be dtype-preserving.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- # test_groupby_empty_with_category gets here with self.ngroups == 0
- # and len(obj) > 0
-
- if len(obj) > 0 and not isinstance(obj._values, np.ndarray):
- # we can preserve a little bit more aggressively with EA dtype
- # because maybe_cast_pointwise_result will do a try/except
- # with _from_sequence. NB we are assuming here that _from_sequence
- # is sufficiently strict that it casts appropriately.
- preserve_dtype = True
-
- result = self._aggregate_series_pure_python(obj, func)
-
- npvalues = lib.maybe_convert_objects(result, try_float=False)
- if preserve_dtype:
- out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
- else:
- out = npvalues
- return out
-
- @final
- def _aggregate_series_pure_python(
- self, obj: Series, func: Callable
- ) -> npt.NDArray[np.object_]:
- _, _, ngroups = self.group_info
-
- result = np.empty(ngroups, dtype="O")
- initialized = False
-
- splitter = self._get_splitter(obj, axis=0)
-
- for i, group in enumerate(splitter):
- res = func(group)
- res = libreduction.extract_result(res)
-
- if not initialized:
- # We only do this validation on the first iteration
- libreduction.check_result_array(res, group.dtype)
- initialized = True
-
- result[i] = res
-
- return result
-
-
-class BinGrouper(BaseGrouper):
- """
- This is an internal Grouper class
-
- Parameters
- ----------
- bins : the split positions of binlabels used to group the items of the axis
- binlabels : the label list
- indexer : np.ndarray[np.intp], optional
- the indexer created by Grouper
- some groupers (e.g. TimeGrouper) sort their axis, and their
- group_info is sorted accordingly, so the indexer is needed to reorder
-
- Examples
- --------
- bins: [2, 4, 6, 8, 10]
- binlabels: DatetimeIndex(['2005-01-01', '2005-01-03',
- '2005-01-05', '2005-01-07', '2005-01-09'],
- dtype='datetime64[ns]', freq='2D')
-
- the group_info, which contains the label code of each item in the grouped
- axis, the index of each label in the label list, and the group count, is
-
- (array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]), array([0, 1, 2, 3, 4]), 5)
-
- meaning that the grouped axis has 10 items that can be grouped into 5
- labels: the first and second items belong to the first label, the
- third and fourth items belong to the second label, and so on
-
- """
-
- bins: npt.NDArray[np.int64]
- binlabels: Index
-
- def __init__(
- self,
- bins,
- binlabels,
- indexer=None,
- ) -> None:
- self.bins = ensure_int64(bins)
- self.binlabels = ensure_index(binlabels)
- self.indexer = indexer
-
- # These lengths must match, otherwise we could call agg_series
- # with empty self.bins, which would raise in libreduction.
- assert len(self.binlabels) == len(self.bins)
-
- @cache_readonly
- def groups(self):
- """dict {group name -> group labels}"""
- # this is mainly for compat
- # GH 3881
- result = {
- key: value
- for key, value in zip(self.binlabels, self.bins)
- if key is not NaT
- }
- return result
-
- @property
- def nkeys(self) -> int:
- # still matches len(self.groupings), but we can hard-code
- return 1
-
- @cache_readonly
- def codes_info(self) -> npt.NDArray[np.intp]:
- # return the codes of items in original grouped axis
- ids, _, _ = self.group_info
- if self.indexer is not None:
- sorter = np.lexsort((ids, self.indexer))
- ids = ids[sorter]
- return ids
-
- def get_iterator(self, data: NDFrame, axis: AxisInt = 0):
- """
- Groupby iterator
-
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- if axis == 0:
- slicer = lambda start, edge: data.iloc[start:edge]
- else:
- slicer = lambda start, edge: data.iloc[:, start:edge]
-
- length = len(data.axes[axis])
-
- start = 0
- for edge, label in zip(self.bins, self.binlabels):
- if label is not NaT:
- yield label, slicer(start, edge)
- start = edge
-
- if start < length:
- yield self.binlabels[-1], slicer(start, None)
-
- @cache_readonly
- def indices(self):
- indices = collections.defaultdict(list)
-
- i = 0
- for label, bin in zip(self.binlabels, self.bins):
- if i < bin:
- if label is not NaT:
- indices[label] = list(range(i, bin))
- i = bin
- return indices
-
- @cache_readonly
- def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
- ngroups = self.ngroups
- obs_group_ids = np.arange(ngroups, dtype=np.intp)
- rep = np.diff(np.r_[0, self.bins])
-
- rep = ensure_platform_int(rep)
- if ngroups == len(self.bins):
- comp_ids = np.repeat(np.arange(ngroups), rep)
- else:
- comp_ids = np.repeat(np.r_[-1, np.arange(ngroups)], rep)
-
- return (
- ensure_platform_int(comp_ids),
- obs_group_ids,
- ngroups,
- )
-
- @cache_readonly
- def reconstructed_codes(self) -> list[np.ndarray]:
- # get unique result indices, and prepend 0 as groupby starts from the first
- return [np.r_[0, np.flatnonzero(self.bins[1:] != self.bins[:-1]) + 1]]
-
- @cache_readonly
- def result_index(self) -> Index:
- if len(self.binlabels) != 0 and isna(self.binlabels[0]):
- return self.binlabels[1:]
-
- return self.binlabels
-
- @property
- def levels(self) -> list[Index]:
- return [self.binlabels]
-
- @property
- def names(self) -> list[Hashable]:
- return [self.binlabels.name]
-
- @property
- def groupings(self) -> list[grouper.Grouping]:
- lev = self.binlabels
- codes = self.group_info[0]
- labels = lev.take(codes)
- ping = grouper.Grouping(
- labels, labels, in_axis=False, level=None, uniques=lev._values
- )
- return [ping]
-
-
-def _is_indexed_like(obj, axes, axis: AxisInt) -> bool:
- if isinstance(obj, Series):
- if len(axes) > 1:
- return False
- return obj.axes[axis].equals(axes[axis])
- elif isinstance(obj, DataFrame):
- return obj.axes[axis].equals(axes[axis])
-
- return False
-
-
-# ----------------------------------------------------------------------
-# Splitting / application
-
-
-class DataSplitter(Generic[NDFrameT]):
- def __init__(
- self,
- data: NDFrameT,
- labels: npt.NDArray[np.intp],
- ngroups: int,
- axis: AxisInt = 0,
- ) -> None:
- self.data = data
- self.labels = ensure_platform_int(labels) # _should_ already be np.intp
- self.ngroups = ngroups
-
- self.axis = axis
- assert isinstance(axis, int), axis
-
- @cache_readonly
- def _slabels(self) -> npt.NDArray[np.intp]:
- # Sorted labels
- return self.labels.take(self._sort_idx)
-
- @cache_readonly
- def _sort_idx(self) -> npt.NDArray[np.intp]:
- # Counting sort indexer
- return get_group_index_sorter(self.labels, self.ngroups)
-
- def __iter__(self) -> Iterator:
- sdata = self._sorted_data
-
- if self.ngroups == 0:
- # we are inside a generator; rather than raise StopIteration
- # we simply return to signal the end
- return
-
- starts, ends = lib.generate_slices(self._slabels, self.ngroups)
-
- for start, end in zip(starts, ends):
- yield self._chop(sdata, slice(start, end))
-
- @cache_readonly
- def _sorted_data(self) -> NDFrameT:
- return self.data.take(self._sort_idx, axis=self.axis)
-
- def _chop(self, sdata, slice_obj: slice) -> NDFrame:
- raise AbstractMethodError(self)
-
-
-class SeriesSplitter(DataSplitter):
- def _chop(self, sdata: Series, slice_obj: slice) -> Series:
- # fastpath equivalent to `sdata.iloc[slice_obj]`
- mgr = sdata._mgr.get_slice(slice_obj)
- ser = sdata._constructor(mgr, name=sdata.name, fastpath=True)
- return ser.__finalize__(sdata, method="groupby")
-
-
-class FrameSplitter(DataSplitter):
- def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
- # Fastpath equivalent to:
- # if self.axis == 0:
- # return sdata.iloc[slice_obj]
- # else:
- # return sdata.iloc[:, slice_obj]
- mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis)
- df = sdata._constructor(mgr)
- return df.__finalize__(sdata, method="groupby")
-
-
-def _get_splitter(
- data: NDFrame, labels: np.ndarray, ngroups: int, axis: AxisInt = 0
-) -> DataSplitter:
- if isinstance(data, Series):
- klass: type[DataSplitter] = SeriesSplitter
- else:
- # i.e. DataFrame
- klass = FrameSplitter
-
- return klass(data, labels, ngroups, axis)
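
The machinery removed above (WrappedCythonOp, BaseGrouper, BinGrouper and the DataSplitter classes) is the backend that the public groupby API dispatches to for cythonized aggregations. A minimal sketch of that public entry point, assuming a pandas build that still ships this module; the internal call path noted in the comments follows the deleted code:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, 2.0, 3.0]})
    # .sum() reaches BaseGrouper._cython_operation(kind="aggregate", how="sum"),
    # which builds a WrappedCythonOp and calls cython_operation on the values.
    print(df.groupby("key")["val"].sum())
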
diff --git a/contrib/python/pandas/py3/pandas/core/indexers/__init__.py b/contrib/python/pandas/py3/pandas/core/indexers/__init__.py
deleted file mode 100644
index ba8a4f1d0ee..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexers/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from pandas.core.indexers.utils import (
- check_array_indexer,
- check_key_length,
- check_setitem_lengths,
- disallow_ndim_indexing,
- is_empty_indexer,
- is_list_like_indexer,
- is_scalar_indexer,
- is_valid_positional_slice,
- length_of_indexer,
- maybe_convert_indices,
- unpack_1tuple,
- unpack_tuple_and_ellipses,
- validate_indices,
-)
-
-__all__ = [
- "is_valid_positional_slice",
- "is_list_like_indexer",
- "is_scalar_indexer",
- "is_empty_indexer",
- "check_setitem_lengths",
- "validate_indices",
- "maybe_convert_indices",
- "length_of_indexer",
- "disallow_ndim_indexing",
- "unpack_1tuple",
- "check_key_length",
- "check_array_indexer",
- "unpack_tuple_and_ellipses",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/indexers/objects.py b/contrib/python/pandas/py3/pandas/core/indexers/objects.py
deleted file mode 100644
index 714fe92301a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexers/objects.py
+++ /dev/null
@@ -1,390 +0,0 @@
-"""Indexer objects for computing start/end window bounds for rolling operations"""
-from __future__ import annotations
-
-from datetime import timedelta
-
-import numpy as np
-
-from pandas._libs.window.indexers import calculate_variable_window_bounds
-from pandas.util._decorators import Appender
-
-from pandas.core.dtypes.common import ensure_platform_int
-
-from pandas.tseries.offsets import Nano
-
-get_window_bounds_doc = """
-Computes the bounds of a window.
-
-Parameters
-----------
-num_values : int, default 0
- number of values that will be aggregated over
-window_size : int, default 0
- the number of rows in a window
-min_periods : int, default None
- min_periods passed from the top level rolling API
-center : bool, default None
- center passed from the top level rolling API
-closed : str, default None
- closed passed from the top level rolling API
-step : int, default None
- step passed from the top level rolling API
- .. versionadded:: 1.5
-win_type : str, default None
- win_type passed from the top level rolling API
-
-Returns
--------
-A tuple of ndarray[int64]s, indicating the boundaries of each
-window
-"""
-
-
-class BaseIndexer:
- """Base class for window bounds calculations."""
-
- def __init__(
- self, index_array: np.ndarray | None = None, window_size: int = 0, **kwargs
- ) -> None:
- """
- Parameters
- ----------
- **kwargs :
- keyword arguments that will be available when get_window_bounds is called
- """
- self.index_array = index_array
- self.window_size = window_size
- # Set user defined kwargs as attributes that can be used in get_window_bounds
- for key, value in kwargs.items():
- setattr(self, key, value)
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- raise NotImplementedError
-
-
-class FixedWindowIndexer(BaseIndexer):
- """Creates window boundaries that are of fixed length."""
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- if center:
- offset = (self.window_size - 1) // 2
- else:
- offset = 0
-
- end = np.arange(1 + offset, num_values + 1 + offset, step, dtype="int64")
- start = end - self.window_size
- if closed in ["left", "both"]:
- start -= 1
- if closed in ["left", "neither"]:
- end -= 1
-
- end = np.clip(end, 0, num_values)
- start = np.clip(start, 0, num_values)
-
- return start, end
-
-
-class VariableWindowIndexer(BaseIndexer):
- """Creates window boundaries that are of variable length, namely for time series."""
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- # error: Argument 4 to "calculate_variable_window_bounds" has incompatible
- # type "Optional[bool]"; expected "bool"
- # error: Argument 6 to "calculate_variable_window_bounds" has incompatible
- # type "Optional[ndarray]"; expected "ndarray"
- return calculate_variable_window_bounds(
- num_values,
- self.window_size,
- min_periods,
- center, # type: ignore[arg-type]
- closed,
- self.index_array, # type: ignore[arg-type]
- )
-
-
-class VariableOffsetWindowIndexer(BaseIndexer):
- """Calculate window boundaries based on a non-fixed offset such as a BusinessDay."""
-
- def __init__(
- self,
- index_array: np.ndarray | None = None,
- window_size: int = 0,
- index=None,
- offset=None,
- **kwargs,
- ) -> None:
- super().__init__(index_array, window_size, **kwargs)
- self.index = index
- self.offset = offset
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- if step is not None:
- raise NotImplementedError("step not implemented for variable offset window")
- if num_values <= 0:
- return np.empty(0, dtype="int64"), np.empty(0, dtype="int64")
-
- # if the window is variable, the default is 'right', otherwise it is 'both'
- if closed is None:
- closed = "right" if self.index is not None else "both"
-
- right_closed = closed in ["right", "both"]
- left_closed = closed in ["left", "both"]
-
- if self.index[num_values - 1] < self.index[0]:
- index_growth_sign = -1
- else:
- index_growth_sign = 1
-
- start = np.empty(num_values, dtype="int64")
- start.fill(-1)
- end = np.empty(num_values, dtype="int64")
- end.fill(-1)
-
- start[0] = 0
-
- # right endpoint is closed
- if right_closed:
- end[0] = 1
- # right endpoint is open
- else:
- end[0] = 0
-
- # start is start of slice interval (including)
- # end is end of slice interval (not including)
- for i in range(1, num_values):
- end_bound = self.index[i]
- start_bound = self.index[i] - index_growth_sign * self.offset
-
- # left endpoint is closed
- if left_closed:
- start_bound -= Nano(1)
-
- # advance the start bound until we are
- # within the constraint
- start[i] = i
- for j in range(start[i - 1], i):
- if (self.index[j] - start_bound) * index_growth_sign > timedelta(0):
- start[i] = j
- break
-
- # end bound is previous end
- # or current index
- if (self.index[end[i - 1]] - end_bound) * index_growth_sign <= timedelta(0):
- end[i] = i + 1
- else:
- end[i] = end[i - 1]
-
- # right endpoint is open
- if not right_closed:
- end[i] -= 1
-
- return start, end
-
-
-class ExpandingIndexer(BaseIndexer):
- """Calculate expanding window bounds, mimicking df.expanding()"""
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- return (
- np.zeros(num_values, dtype=np.int64),
- np.arange(1, num_values + 1, dtype=np.int64),
- )
-
-
-class FixedForwardWindowIndexer(BaseIndexer):
- """
- Creates window boundaries for fixed-length windows that include the current row.
-
- Examples
- --------
- >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
- >>> df
- B
- 0 0.0
- 1 1.0
- 2 2.0
- 3 NaN
- 4 4.0
-
- >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
- >>> df.rolling(window=indexer, min_periods=1).sum()
- B
- 0 1.0
- 1 3.0
- 2 2.0
- 3 4.0
- 4 4.0
- """
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- if center:
- raise ValueError("Forward-looking windows can't have center=True")
- if closed is not None:
- raise ValueError(
- "Forward-looking windows don't support setting the closed argument"
- )
- if step is None:
- step = 1
-
- start = np.arange(0, num_values, step, dtype="int64")
- end = start + self.window_size
- if self.window_size:
- end = np.clip(end, 0, num_values)
-
- return start, end
-
-
-class GroupbyIndexer(BaseIndexer):
- """Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()"""
-
- def __init__(
- self,
- index_array: np.ndarray | None = None,
- window_size: int | BaseIndexer = 0,
- groupby_indices: dict | None = None,
- window_indexer: type[BaseIndexer] = BaseIndexer,
- indexer_kwargs: dict | None = None,
- **kwargs,
- ) -> None:
- """
- Parameters
- ----------
- index_array : np.ndarray or None
- np.ndarray of the index of the original object that we are performing
- a chained groupby operation over. This index has been pre-sorted relative to
- the groups
- window_size : int or BaseIndexer
- window size during the windowing operation
- groupby_indices : dict or None
- dict of {group label: [positional index of rows belonging to the group]}
- window_indexer : BaseIndexer
- BaseIndexer class determining the start and end bounds of each group
- indexer_kwargs : dict or None
- Custom kwargs to be passed to window_indexer
- **kwargs :
- keyword arguments that will be available when get_window_bounds is called
- """
- self.groupby_indices = groupby_indices or {}
- self.window_indexer = window_indexer
- self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {}
- super().__init__(
- index_array=index_array,
- window_size=self.indexer_kwargs.pop("window_size", window_size),
- **kwargs,
- )
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- # 1) For each group, get the indices that belong to the group
- # 2) Use the indices to calculate the start & end bounds of the window
- # 3) Append the window bounds in group order
- start_arrays = []
- end_arrays = []
- window_indices_start = 0
- for key, indices in self.groupby_indices.items():
- index_array: np.ndarray | None
-
- if self.index_array is not None:
- index_array = self.index_array.take(ensure_platform_int(indices))
- else:
- index_array = self.index_array
- indexer = self.window_indexer(
- index_array=index_array,
- window_size=self.window_size,
- **self.indexer_kwargs,
- )
- start, end = indexer.get_window_bounds(
- len(indices), min_periods, center, closed, step
- )
- start = start.astype(np.int64)
- end = end.astype(np.int64)
- assert len(start) == len(
- end
- ), "these should be equal in length from get_window_bounds"
- # Cannot use groupby_indices as they might not be monotonic with the object
- # we're rolling over
- window_indices = np.arange(
- window_indices_start, window_indices_start + len(indices)
- )
- window_indices_start += len(indices)
- # Extend as we'll be slicing window like [start, end)
- window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
- np.int64, copy=False
- )
- start_arrays.append(window_indices.take(ensure_platform_int(start)))
- end_arrays.append(window_indices.take(ensure_platform_int(end)))
- if len(start_arrays) == 0:
- return np.array([], dtype=np.int64), np.array([], dtype=np.int64)
- start = np.concatenate(start_arrays)
- end = np.concatenate(end_arrays)
- return start, end
-
-
-class ExponentialMovingWindowIndexer(BaseIndexer):
- """Calculate ewm window bounds (the entire window)"""
-
- @Appender(get_window_bounds_doc)
- def get_window_bounds(
- self,
- num_values: int = 0,
- min_periods: int | None = None,
- center: bool | None = None,
- closed: str | None = None,
- step: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray]:
- return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64)
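
These indexer classes back the public pd.api.indexers extension point: any BaseIndexer subclass whose get_window_bounds returns start/end arrays can be passed to .rolling(). A minimal sketch under that assumption; the subclass name is illustrative and not part of pandas, and it mirrors what FixedForwardWindowIndexer(window_size=2) computes above:

    import numpy as np
    import pandas as pd
    from pandas.api.indexers import BaseIndexer

    class TwoAheadIndexer(BaseIndexer):
        # Hypothetical example: each window covers the current row and the next one.
        def get_window_bounds(self, num_values=0, min_periods=None,
                              center=None, closed=None, step=None):
            start = np.arange(num_values, dtype=np.int64)
            end = np.minimum(start + self.window_size, num_values)
            return start, end

    ser = pd.Series([0.0, 1.0, 2.0, 3.0])
    print(ser.rolling(TwoAheadIndexer(window_size=2), min_periods=1).sum())
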
diff --git a/contrib/python/pandas/py3/pandas/core/indexers/utils.py b/contrib/python/pandas/py3/pandas/core/indexers/utils.py
deleted file mode 100644
index 0674831aaa6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexers/utils.py
+++ /dev/null
@@ -1,555 +0,0 @@
-"""
-Low-dependency indexing utilities.
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Any,
-)
-
-import numpy as np
-
-from pandas._typing import AnyArrayLike
-
-from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_extension_array_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
-)
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
-)
-
-if TYPE_CHECKING:
- from pandas.core.frame import DataFrame
- from pandas.core.indexes.base import Index
-
-# -----------------------------------------------------------
-# Indexer Identification
-
-
-def is_valid_positional_slice(slc: slice) -> bool:
- """
- Check if a slice object can be interpreted as a positional indexer.
-
- Parameters
- ----------
- slc : slice
-
- Returns
- -------
- bool
-
- Notes
- -----
- A valid positional slice may also be interpreted as a label-based slice
- depending on the index being sliced.
- """
-
- def is_int_or_none(val):
- return val is None or is_integer(val)
-
- return (
- is_int_or_none(slc.start)
- and is_int_or_none(slc.stop)
- and is_int_or_none(slc.step)
- )
-
-
-def is_list_like_indexer(key) -> bool:
- """
- Check if we have a list-like indexer that is *not* a NamedTuple.
-
- Parameters
- ----------
- key : object
-
- Returns
- -------
- bool
- """
- # allow a list_like, but exclude NamedTuples which can be indexers
- return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple)
-
-
-def is_scalar_indexer(indexer, ndim: int) -> bool:
- """
- Return True if we are all scalar indexers.
-
- Parameters
- ----------
- indexer : object
- ndim : int
- Number of dimensions in the object being indexed.
-
- Returns
- -------
- bool
- """
- if ndim == 1 and is_integer(indexer):
- # GH37748: allow indexer to be an integer for Series
- return True
- if isinstance(indexer, tuple) and len(indexer) == ndim:
- return all(is_integer(x) for x in indexer)
- return False
-
-
-def is_empty_indexer(indexer) -> bool:
- """
- Check if we have an empty indexer.
-
- Parameters
- ----------
- indexer : object
-
- Returns
- -------
- bool
- """
- if is_list_like(indexer) and not len(indexer):
- return True
- if not isinstance(indexer, tuple):
- indexer = (indexer,)
- return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer)
-
-
-# -----------------------------------------------------------
-# Indexer Validation
-
-
-def check_setitem_lengths(indexer, value, values) -> bool:
- """
- Validate that value and indexer are the same length.
-
- A special case is allowed when the indexer is a boolean array
- and the number of true values equals the length of ``value``. In
- this case, no exception is raised.
-
- Parameters
- ----------
- indexer : sequence
- Key for the setitem.
- value : array-like
- Value for the setitem.
- values : array-like
- Values being set into.
-
- Returns
- -------
- bool
- Whether this is an empty listlike setting which is a no-op.
-
- Raises
- ------
- ValueError
- When the indexer is an ndarray or list and the lengths don't match.
- """
- no_op = False
-
- if isinstance(indexer, (np.ndarray, list)):
- # We can ignore other listlikes because they are either
- # a) not necessarily 1-D indexers, e.g. tuple
- # b) boolean indexers e.g. BoolArray
- if is_list_like(value):
- if len(indexer) != len(value) and values.ndim == 1:
- # boolean with truth values == len of the value is ok too
- if isinstance(indexer, list):
- indexer = np.array(indexer)
- if not (
- isinstance(indexer, np.ndarray)
- and indexer.dtype == np.bool_
- and indexer.sum() == len(value)
- ):
- raise ValueError(
- "cannot set using a list-like indexer "
- "with a different length than the value"
- )
- if not len(indexer):
- no_op = True
-
- elif isinstance(indexer, slice):
- if is_list_like(value):
- if len(value) != length_of_indexer(indexer, values) and values.ndim == 1:
- # In case of two dimensional value is used row-wise and broadcasted
- raise ValueError(
- "cannot set using a slice indexer with a "
- "different length than the value"
- )
- if not len(value):
- no_op = True
-
- return no_op
-
-
-def validate_indices(indices: np.ndarray, n: int) -> None:
- """
- Perform bounds-checking for an indexer.
-
- -1 is allowed for indicating missing values.
-
- Parameters
- ----------
- indices : ndarray
- n : int
- Length of the array being indexed.
-
- Raises
- ------
- ValueError
-
- Examples
- --------
- >>> validate_indices(np.array([1, 2]), 3) # OK
-
- >>> validate_indices(np.array([1, -2]), 3)
- Traceback (most recent call last):
- ...
- ValueError: negative dimensions are not allowed
-
- >>> validate_indices(np.array([1, 2, 3]), 3)
- Traceback (most recent call last):
- ...
- IndexError: indices are out-of-bounds
-
- >>> validate_indices(np.array([-1, -1]), 0) # OK
-
- >>> validate_indices(np.array([0, 1]), 0)
- Traceback (most recent call last):
- ...
- IndexError: indices are out-of-bounds
- """
- if len(indices):
- min_idx = indices.min()
- if min_idx < -1:
- msg = f"'indices' contains values less than allowed ({min_idx} < -1)"
- raise ValueError(msg)
-
- max_idx = indices.max()
- if max_idx >= n:
- raise IndexError("indices are out-of-bounds")
-
-
-# -----------------------------------------------------------
-# Indexer Conversion
-
-
-def maybe_convert_indices(indices, n: int, verify: bool = True) -> np.ndarray:
- """
- Attempt to convert indices into valid, positive indices.
-
- If we have negative indices, translate to positive here.
- If we have indices that are out-of-bounds, raise an IndexError.
-
- Parameters
- ----------
- indices : array-like
- Array of indices that we are to convert.
- n : int
- Number of elements in the array that we are indexing.
- verify : bool, default True
- Check that all entries are between 0 and n - 1, inclusive.
-
- Returns
- -------
- array-like
- An array-like of positive indices that correspond to the ones
- that were passed in initially to this function.
-
- Raises
- ------
- IndexError
- One of the converted indices either exceeded the number of
- elements (specified by `n`), or was still negative.
- """
- if isinstance(indices, list):
- indices = np.array(indices)
- if len(indices) == 0:
- # If `indices` is empty, np.array will return a float array,
- # which would cause indexing errors.
- return np.empty(0, dtype=np.intp)
-
- mask = indices < 0
- if mask.any():
- indices = indices.copy()
- indices[mask] += n
-
- if verify:
- mask = (indices >= n) | (indices < 0)
- if mask.any():
- raise IndexError("indices are out-of-bounds")
- return indices
-
-
-# -----------------------------------------------------------
-# Unsorted
-
-
-def length_of_indexer(indexer, target=None) -> int:
- """
- Return the expected length of target[indexer]
-
- Returns
- -------
- int
- """
- if target is not None and isinstance(indexer, slice):
- target_len = len(target)
- start = indexer.start
- stop = indexer.stop
- step = indexer.step
- if start is None:
- start = 0
- elif start < 0:
- start += target_len
- if stop is None or stop > target_len:
- stop = target_len
- elif stop < 0:
- stop += target_len
- if step is None:
- step = 1
- elif step < 0:
- start, stop = stop + 1, start + 1
- step = -step
- return (stop - start + step - 1) // step
- elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list)):
- if isinstance(indexer, list):
- indexer = np.array(indexer)
-
- if indexer.dtype == bool:
- # GH#25774
- return indexer.sum()
- return len(indexer)
- elif isinstance(indexer, range):
- return (indexer.stop - indexer.start) // indexer.step
- elif not is_list_like_indexer(indexer):
- return 1
- raise AssertionError("cannot find the length of the indexer")
-
-
-def disallow_ndim_indexing(result) -> None:
- """
- Helper function to disallow multi-dimensional indexing on 1D Series/Index.
-
- GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
- and keep an index, so we used to return ndarray, which was deprecated
- in GH#30588.
- """
- if np.ndim(result) > 1:
- raise ValueError(
- "Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer "
- "supported. Convert to a numpy array before indexing instead."
- )
-
-
-def unpack_1tuple(tup):
- """
- If we have a length-1 tuple/list that contains a slice, unpack to just
- the slice.
-
- Notes
- -----
- The list case is deprecated.
- """
- if len(tup) == 1 and isinstance(tup[0], slice):
- # if we don't have a MultiIndex, we may still be able to handle
- # a 1-tuple. see test_1tuple_without_multiindex
-
- if isinstance(tup, list):
- # GH#31299
- raise ValueError(
- "Indexing with a single-item list containing a "
- "slice is not allowed. Pass a tuple instead.",
- )
-
- return tup[0]
- return tup
-
-
-def check_key_length(columns: Index, key, value: DataFrame) -> None:
- """
- Checks if a key used as indexer has the same length as the columns it is
- associated with.
-
- Parameters
- ----------
- columns : Index
- The columns of the DataFrame to index.
- key : list-like
- The keys to index with.
- value : DataFrame
- The value to set for the keys.
-
- Raises
- ------
- ValueError
- If the length of key is not equal to the number of columns in value,
- or if the number of columns referenced by key does not equal the
- number of columns in value.
- """
- if columns.is_unique:
- if len(value.columns) != len(key):
- raise ValueError("Columns must be same length as key")
- else:
- # Missing keys in columns are represented as -1
- if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns):
- raise ValueError("Columns must be same length as key")
-
-
-def unpack_tuple_and_ellipses(item: tuple):
- """
- Possibly unpack arr[..., n] to arr[n]
- """
- if len(item) > 1:
- # Note: we are assuming this indexing is being done on a 1D arraylike
- if item[0] is Ellipsis:
- item = item[1:]
- elif item[-1] is Ellipsis:
- item = item[:-1]
-
- if len(item) > 1:
- raise IndexError("too many indices for array.")
-
- item = item[0]
- return item
-
-
-# -----------------------------------------------------------
-# Public indexer validation
-
-
-def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
- """
- Check if `indexer` is a valid array indexer for `array`.
-
- For a boolean mask, `array` and `indexer` are checked to have the same
- length. The dtype is validated, and if it is an integer or boolean
- ExtensionArray, it is checked if there are missing values present, and
- it is converted to the appropriate numpy array. Other dtypes will raise
- an error.
-
- Non-array indexers (integer, slice, Ellipsis, tuples, ...) are passed
- through as is.
-
- Parameters
- ----------
- array : array-like
- The array that is being indexed (only used for the length).
- indexer : array-like or list-like
- The array-like that's used to index. List-like input that is not yet
- a numpy array or an ExtensionArray is converted to one. Other input
- types are passed through as is.
-
- Returns
- -------
- numpy.ndarray
- The validated indexer as a numpy array that can be used to index.
-
- Raises
- ------
- IndexError
- When the lengths don't match.
- ValueError
- When `indexer` cannot be converted to a numpy ndarray to index
- (e.g. presence of missing values).
-
- See Also
- --------
- api.types.is_bool_dtype : Check if `key` is of boolean dtype.
-
- Examples
- --------
- When checking a boolean mask, a boolean ndarray is returned when the
- arguments are all valid.
-
- >>> mask = pd.array([True, False])
- >>> arr = pd.array([1, 2])
- >>> pd.api.indexers.check_array_indexer(arr, mask)
- array([ True, False])
-
- An IndexError is raised when the lengths don't match.
-
- >>> mask = pd.array([True, False, True])
- >>> pd.api.indexers.check_array_indexer(arr, mask)
- Traceback (most recent call last):
- ...
- IndexError: Boolean index has wrong length: 3 instead of 2.
-
- NA values in a boolean array are treated as False.
-
- >>> mask = pd.array([True, pd.NA])
- >>> pd.api.indexers.check_array_indexer(arr, mask)
- array([ True, False])
-
- A numpy boolean mask will get passed through (if the length is correct):
-
- >>> mask = np.array([True, False])
- >>> pd.api.indexers.check_array_indexer(arr, mask)
- array([ True, False])
-
- Similarly for integer indexers, an integer ndarray is returned when it is
- a valid indexer, otherwise an error is raised (for integer indexers, a
- matching length is not required):
-
- >>> indexer = pd.array([0, 2], dtype="Int64")
- >>> arr = pd.array([1, 2, 3])
- >>> pd.api.indexers.check_array_indexer(arr, indexer)
- array([0, 2])
-
- >>> indexer = pd.array([0, pd.NA], dtype="Int64")
- >>> pd.api.indexers.check_array_indexer(arr, indexer)
- Traceback (most recent call last):
- ...
- ValueError: Cannot index with an integer indexer containing NA values
-
- For non-integer/boolean dtypes, an appropriate error is raised:
-
- >>> indexer = np.array([0., 2.], dtype="float64")
- >>> pd.api.indexers.check_array_indexer(arr, indexer)
- Traceback (most recent call last):
- ...
- IndexError: arrays used as indices must be of integer or boolean type
- """
- from pandas.core.construction import array as pd_array
-
- # whatever is not an array-like is returned as-is (possible valid array
- # indexers that are not array-like: integer, slice, Ellipsis, None)
- # In this context, tuples are not considered as array-like, as they have
- # a specific meaning in indexing (multi-dimensional indexing)
- if is_list_like(indexer):
- if isinstance(indexer, tuple):
- return indexer
- else:
- return indexer
-
- # convert list-likes to array
- if not is_array_like(indexer):
- indexer = pd_array(indexer)
- if len(indexer) == 0:
- # empty list is converted to float array by pd.array
- indexer = np.array([], dtype=np.intp)
-
- dtype = indexer.dtype
- if is_bool_dtype(dtype):
- if is_extension_array_dtype(dtype):
- indexer = indexer.to_numpy(dtype=bool, na_value=False)
- else:
- indexer = np.asarray(indexer, dtype=bool)
-
- # GH26658
- if len(indexer) != len(array):
- raise IndexError(
- f"Boolean index has wrong length: "
- f"{len(indexer)} instead of {len(array)}"
- )
- elif is_integer_dtype(dtype):
- try:
- indexer = np.asarray(indexer, dtype=np.intp)
- except ValueError as err:
- raise ValueError(
- "Cannot index with an integer indexer containing NA values"
- ) from err
- else:
- raise IndexError("arrays used as indices must be of integer or boolean type")
-
- return indexer
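
Of the helpers deleted above, check_array_indexer is the one with a public alias (pd.api.indexers.check_array_indexer, as its doctests show); the others are internal. A short sketch of maybe_convert_indices, again runnable only while this private module still exists, illustrating the negative-index translation documented above:

    import numpy as np
    from pandas.core.indexers.utils import maybe_convert_indices

    # -1 refers to the last of n=3 elements and is translated to 2;
    # out-of-bounds values would raise IndexError because verify=True.
    print(maybe_convert_indices(np.array([0, -1]), n=3))   # array([0, 2])
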
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/__init__.py b/contrib/python/pandas/py3/pandas/core/indexes/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/accessors.py b/contrib/python/pandas/py3/pandas/core/indexes/accessors.py
deleted file mode 100644
index b1ee176c7f3..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/accessors.py
+++ /dev/null
@@ -1,580 +0,0 @@
-"""
-datetimelike delegation
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- cast,
-)
-
-import numpy as np
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_integer_dtype,
- is_list_like,
- is_period_dtype,
- is_timedelta64_dtype,
-)
-from pandas.core.dtypes.generic import ABCSeries
-
-from pandas.core.accessor import (
- PandasDelegate,
- delegate_names,
-)
-from pandas.core.arrays import (
- DatetimeArray,
- PeriodArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.arrow.array import ArrowExtensionArray
-from pandas.core.arrays.arrow.dtype import ArrowDtype
-from pandas.core.base import (
- NoNewAttributesMixin,
- PandasObject,
-)
-from pandas.core.indexes.datetimes import DatetimeIndex
-from pandas.core.indexes.timedeltas import TimedeltaIndex
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-
-class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin):
- _hidden_attrs = PandasObject._hidden_attrs | {
- "orig",
- "name",
- }
-
- def __init__(self, data: Series, orig) -> None:
- if not isinstance(data, ABCSeries):
- raise TypeError(
- f"cannot convert an object of type {type(data)} to a datetimelike index"
- )
-
- self._parent = data
- self.orig = orig
- self.name = getattr(data, "name", None)
- self._freeze()
-
- def _get_values(self):
- data = self._parent
- if is_datetime64_dtype(data.dtype):
- return DatetimeIndex(data, copy=False, name=self.name)
-
- elif is_datetime64tz_dtype(data.dtype):
- return DatetimeIndex(data, copy=False, name=self.name)
-
- elif is_timedelta64_dtype(data.dtype):
- return TimedeltaIndex(data, copy=False, name=self.name)
-
- elif is_period_dtype(data.dtype):
- return PeriodArray(data, copy=False)
-
- raise TypeError(
- f"cannot convert an object of type {type(data)} to a datetimelike index"
- )
-
- def _delegate_property_get(self, name):
- from pandas import Series
-
- values = self._get_values()
-
- result = getattr(values, name)
-
- # maybe need to upcast (ints)
- if isinstance(result, np.ndarray):
- if is_integer_dtype(result):
- result = result.astype("int64")
- elif not is_list_like(result):
- return result
-
- result = np.asarray(result)
-
- if self.orig is not None:
- index = self.orig.index
- else:
- index = self._parent.index
- # return the result as a Series
- result = Series(result, index=index, name=self.name).__finalize__(self._parent)
-
- # setting this object will show a SettingWithCopyWarning/Error
- result._is_copy = (
- "modifications to a property of a datetimelike "
- "object are not supported and are discarded. "
- "Change values on the original."
- )
-
- return result
-
- def _delegate_property_set(self, name, value, *args, **kwargs):
- raise ValueError(
- "modifications to a property of a datetimelike object are not supported. "
- "Change values on the original."
- )
-
- def _delegate_method(self, name, *args, **kwargs):
- from pandas import Series
-
- values = self._get_values()
-
- method = getattr(values, name)
- result = method(*args, **kwargs)
-
- if not is_list_like(result):
- return result
-
- result = Series(result, index=self._parent.index, name=self.name).__finalize__(
- self._parent
- )
-
- # setting this object will show a SettingWithCopyWarning/Error
- result._is_copy = (
- "modifications to a method of a datetimelike "
- "object are not supported and are discarded. "
- "Change values on the original."
- )
-
- return result
-
-
-@delegate_names(
- delegate=ArrowExtensionArray,
- accessors=DatetimeArray._datetimelike_ops,
- typ="property",
- accessor_mapping=lambda x: f"_dt_{x}",
- raise_on_missing=False,
-)
-@delegate_names(
- delegate=ArrowExtensionArray,
- accessors=DatetimeArray._datetimelike_methods,
- typ="method",
- accessor_mapping=lambda x: f"_dt_{x}",
- raise_on_missing=False,
-)
-class ArrowTemporalProperties(PandasDelegate, PandasObject, NoNewAttributesMixin):
- def __init__(self, data: Series, orig) -> None:
- if not isinstance(data, ABCSeries):
- raise TypeError(
- f"cannot convert an object of type {type(data)} to a datetimelike index"
- )
-
- self._parent = data
- self._orig = orig
- self._freeze()
-
- def _delegate_property_get(self, name: str): # type: ignore[override]
- if not hasattr(self._parent.array, f"_dt_{name}"):
- raise NotImplementedError(
- f"dt.{name} is not supported for {self._parent.dtype}"
- )
- result = getattr(self._parent.array, f"_dt_{name}")
-
- if not is_list_like(result):
- return result
-
- if self._orig is not None:
- index = self._orig.index
- else:
- index = self._parent.index
- # return the result as a Series, which is by definition a copy
- result = type(self._parent)(
- result, index=index, name=self._parent.name
- ).__finalize__(self._parent)
-
- return result
-
- def _delegate_method(self, name: str, *args, **kwargs):
- if not hasattr(self._parent.array, f"_dt_{name}"):
- raise NotImplementedError(
- f"dt.{name} is not supported for {self._parent.dtype}"
- )
-
- result = getattr(self._parent.array, f"_dt_{name}")(*args, **kwargs)
-
- if self._orig is not None:
- index = self._orig.index
- else:
- index = self._parent.index
- # return the result as a Series, which is by definition a copy
- result = type(self._parent)(
- result, index=index, name=self._parent.name
- ).__finalize__(self._parent)
-
- return result
-
- def to_pydatetime(self):
- return cast(ArrowExtensionArray, self._parent.array)._dt_to_pydatetime()
-
- def isocalendar(self):
- from pandas import DataFrame
-
- result = (
- cast(ArrowExtensionArray, self._parent.array)
- ._dt_isocalendar()
- ._data.combine_chunks()
- )
- iso_calendar_df = DataFrame(
- {
- col: type(self._parent.array)(result.field(i)) # type: ignore[call-arg]
- for i, col in enumerate(["year", "week", "day"])
- }
- )
- return iso_calendar_df
-
-
-@delegate_names(
- delegate=DatetimeArray,
- accessors=DatetimeArray._datetimelike_ops + ["unit"],
- typ="property",
-)
-@delegate_names(
- delegate=DatetimeArray,
- accessors=DatetimeArray._datetimelike_methods + ["as_unit"],
- typ="method",
-)
-class DatetimeProperties(Properties):
- """
- Accessor object for datetimelike properties of the Series values.
-
- Examples
- --------
- >>> seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s"))
- >>> seconds_series
- 0 2000-01-01 00:00:00
- 1 2000-01-01 00:00:01
- 2 2000-01-01 00:00:02
- dtype: datetime64[ns]
- >>> seconds_series.dt.second
- 0 0
- 1 1
- 2 2
- dtype: int32
-
- >>> hours_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="h"))
- >>> hours_series
- 0 2000-01-01 00:00:00
- 1 2000-01-01 01:00:00
- 2 2000-01-01 02:00:00
- dtype: datetime64[ns]
- >>> hours_series.dt.hour
- 0 0
- 1 1
- 2 2
- dtype: int32
-
- >>> quarters_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="q"))
- >>> quarters_series
- 0 2000-03-31
- 1 2000-06-30
- 2 2000-09-30
- dtype: datetime64[ns]
- >>> quarters_series.dt.quarter
- 0 1
- 1 2
- 2 3
- dtype: int32
-
- Returns a Series indexed like the original Series.
- Raises TypeError if the Series does not contain datetimelike values.
- """
-
- def to_pydatetime(self) -> np.ndarray:
- """
- Return the data as an array of :class:`datetime.datetime` objects.
-
- Timezone information is retained if present.
-
- .. warning::
-
- Python's datetime uses microsecond resolution, which is lower than
- pandas (nanosecond). The values are truncated.
-
- Returns
- -------
- numpy.ndarray
- Object dtype array containing native Python datetime objects.
-
- See Also
- --------
- datetime.datetime : Standard library value for a datetime.
-
- Examples
- --------
- >>> s = pd.Series(pd.date_range('20180310', periods=2))
- >>> s
- 0 2018-03-10
- 1 2018-03-11
- dtype: datetime64[ns]
-
- >>> s.dt.to_pydatetime()
- array([datetime.datetime(2018, 3, 10, 0, 0),
- datetime.datetime(2018, 3, 11, 0, 0)], dtype=object)
-
- pandas' nanosecond precision is truncated to microseconds.
-
- >>> s = pd.Series(pd.date_range('20180310', periods=2, freq='ns'))
- >>> s
- 0 2018-03-10 00:00:00.000000000
- 1 2018-03-10 00:00:00.000000001
- dtype: datetime64[ns]
-
- >>> s.dt.to_pydatetime()
- array([datetime.datetime(2018, 3, 10, 0, 0),
- datetime.datetime(2018, 3, 10, 0, 0)], dtype=object)
- """
- return self._get_values().to_pydatetime()
-
- @property
- def freq(self):
- return self._get_values().inferred_freq
-
- def isocalendar(self) -> DataFrame:
- """
- Calculate year, week, and day according to the ISO 8601 standard.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- DataFrame
- With columns year, week and day.
-
- See Also
- --------
-        Timestamp.isocalendar : Return a 3-tuple containing ISO year,
- week number, and weekday for the given Timestamp object.
- datetime.date.isocalendar : Return a named tuple object with
- three components: year, week and weekday.
-
- Examples
- --------
- >>> ser = pd.to_datetime(pd.Series(["2010-01-01", pd.NaT]))
- >>> ser.dt.isocalendar()
- year week day
- 0 2009 53 5
- 1 <NA> <NA> <NA>
- >>> ser.dt.isocalendar().week
- 0 53
- 1 <NA>
- Name: week, dtype: UInt32
- """
- return self._get_values().isocalendar().set_index(self._parent.index)
-
-
-@delegate_names(
- delegate=TimedeltaArray, accessors=TimedeltaArray._datetimelike_ops, typ="property"
-)
-@delegate_names(
- delegate=TimedeltaArray,
- accessors=TimedeltaArray._datetimelike_methods,
- typ="method",
-)
-class TimedeltaProperties(Properties):
- """
- Accessor object for datetimelike properties of the Series values.
-
- Returns a Series indexed like the original Series.
- Raises TypeError if the Series does not contain datetimelike values.
-
- Examples
- --------
- >>> seconds_series = pd.Series(
- ... pd.timedelta_range(start="1 second", periods=3, freq="S")
- ... )
- >>> seconds_series
- 0 0 days 00:00:01
- 1 0 days 00:00:02
- 2 0 days 00:00:03
- dtype: timedelta64[ns]
- >>> seconds_series.dt.seconds
- 0 1
- 1 2
- 2 3
- dtype: int32
- """
-
- def to_pytimedelta(self) -> np.ndarray:
- """
- Return an array of native :class:`datetime.timedelta` objects.
-
-        Python's standard `datetime` library uses a different representation
-        for timedeltas. This method converts a Series of pandas Timedeltas
-        into an array of `datetime.timedelta` objects with the same length
-        as the original Series.
-
- Returns
- -------
-        numpy.ndarray
-            1D array containing data with `datetime.timedelta` type.
-
- See Also
- --------
-        datetime.timedelta : A duration expressing the difference
-            between two date, time, or datetime instances.
-
- Examples
- --------
- >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit="d"))
- >>> s
- 0 0 days
- 1 1 days
- 2 2 days
- 3 3 days
- 4 4 days
- dtype: timedelta64[ns]
-
- >>> s.dt.to_pytimedelta()
- array([datetime.timedelta(0), datetime.timedelta(days=1),
- datetime.timedelta(days=2), datetime.timedelta(days=3),
- datetime.timedelta(days=4)], dtype=object)
- """
- return self._get_values().to_pytimedelta()
-
- @property
- def components(self):
- """
-        Return a DataFrame of the components of the Timedeltas.
-
- Returns
- -------
- DataFrame
-
- Examples
- --------
- >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s'))
- >>> s
- 0 0 days 00:00:00
- 1 0 days 00:00:01
- 2 0 days 00:00:02
- 3 0 days 00:00:03
- 4 0 days 00:00:04
- dtype: timedelta64[ns]
- >>> s.dt.components
- days hours minutes seconds milliseconds microseconds nanoseconds
- 0 0 0 0 0 0 0 0
- 1 0 0 0 1 0 0 0
- 2 0 0 0 2 0 0 0
- 3 0 0 0 3 0 0 0
- 4 0 0 0 4 0 0 0
- """
- return (
- self._get_values()
- .components.set_index(self._parent.index)
- .__finalize__(self._parent)
- )
-
- @property
- def freq(self):
- return self._get_values().inferred_freq
-
-
-@delegate_names(
- delegate=PeriodArray, accessors=PeriodArray._datetimelike_ops, typ="property"
-)
-@delegate_names(
- delegate=PeriodArray, accessors=PeriodArray._datetimelike_methods, typ="method"
-)
-class PeriodProperties(Properties):
- """
- Accessor object for datetimelike properties of the Series values.
-
- Returns a Series indexed like the original Series.
- Raises TypeError if the Series does not contain datetimelike values.
-
- Examples
- --------
- >>> seconds_series = pd.Series(
- ... pd.period_range(
- ... start="2000-01-01 00:00:00", end="2000-01-01 00:00:03", freq="s"
- ... )
- ... )
- >>> seconds_series
- 0 2000-01-01 00:00:00
- 1 2000-01-01 00:00:01
- 2 2000-01-01 00:00:02
- 3 2000-01-01 00:00:03
- dtype: period[S]
- >>> seconds_series.dt.second
- 0 0
- 1 1
- 2 2
- 3 3
- dtype: int64
-
- >>> hours_series = pd.Series(
- ... pd.period_range(start="2000-01-01 00:00", end="2000-01-01 03:00", freq="h")
- ... )
- >>> hours_series
- 0 2000-01-01 00:00
- 1 2000-01-01 01:00
- 2 2000-01-01 02:00
- 3 2000-01-01 03:00
- dtype: period[H]
- >>> hours_series.dt.hour
- 0 0
- 1 1
- 2 2
- 3 3
- dtype: int64
-
- >>> quarters_series = pd.Series(
- ... pd.period_range(start="2000-01-01", end="2000-12-31", freq="Q-DEC")
- ... )
- >>> quarters_series
- 0 2000Q1
- 1 2000Q2
- 2 2000Q3
- 3 2000Q4
- dtype: period[Q-DEC]
- >>> quarters_series.dt.quarter
- 0 1
- 1 2
- 2 3
- 3 4
- dtype: int64
- """
-
-
-class CombinedDatetimelikeProperties(
- DatetimeProperties, TimedeltaProperties, PeriodProperties
-):
- def __new__(cls, data: Series):
- # CombinedDatetimelikeProperties isn't really instantiated. Instead
- # we need to choose which parent (datetime or timedelta) is
- # appropriate. Since we're checking the dtypes anyway, we'll just
- # do all the validation here.
-
- if not isinstance(data, ABCSeries):
- raise TypeError(
- f"cannot convert an object of type {type(data)} to a datetimelike index"
- )
-
- orig = data if is_categorical_dtype(data.dtype) else None
- if orig is not None:
- data = data._constructor(
- orig.array,
- name=orig.name,
- copy=False,
- dtype=orig._values.categories.dtype,
- index=orig.index,
- )
-
- if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M":
- return ArrowTemporalProperties(data, orig)
- if is_datetime64_dtype(data.dtype):
- return DatetimeProperties(data, orig)
- elif is_datetime64tz_dtype(data.dtype):
- return DatetimeProperties(data, orig)
- elif is_timedelta64_dtype(data.dtype):
- return TimedeltaProperties(data, orig)
- elif is_period_dtype(data.dtype):
- return PeriodProperties(data, orig)
-
- raise AttributeError("Can only use .dt accessor with datetimelike values")
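To make the dtype dispatch above concrete, a short sketch using only public pandas API; `.dt` resolves to a different accessor class depending on the Series dtype, and raises for non-datetimelike data (outputs in comments are indicative):

import pandas as pd

dt_ser = pd.Series(pd.date_range("2024-01-01", periods=2))                # datetime64
td_ser = pd.Series(pd.to_timedelta(["1 day", "2 days"]))                  # timedelta64
per_ser = pd.Series(pd.period_range("2024-01", periods=2, freq="M"))      # period

print(type(dt_ser.dt).__name__)    # DatetimeProperties
print(type(td_ser.dt).__name__)    # TimedeltaProperties
print(type(per_ser.dt).__name__)   # PeriodProperties

# pd.Series([1, 2]).dt  -> AttributeError: Can only use .dt accessor with datetimelike values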
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/api.py b/contrib/python/pandas/py3/pandas/core/indexes/api.py
deleted file mode 100644
index fcf529f5be9..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/api.py
+++ /dev/null
@@ -1,369 +0,0 @@
-from __future__ import annotations
-
-import textwrap
-from typing import cast
-
-import numpy as np
-
-from pandas._libs import (
- NaT,
- lib,
-)
-from pandas._typing import Axis
-from pandas.errors import InvalidIndexError
-
-from pandas.core.dtypes.cast import find_common_type
-
-from pandas.core.algorithms import safe_sort
-from pandas.core.indexes.base import (
- Index,
- _new_Index,
- ensure_index,
- ensure_index_from_sequences,
- get_unanimous_names,
-)
-from pandas.core.indexes.category import CategoricalIndex
-from pandas.core.indexes.datetimes import DatetimeIndex
-from pandas.core.indexes.interval import IntervalIndex
-from pandas.core.indexes.multi import MultiIndex
-from pandas.core.indexes.period import PeriodIndex
-from pandas.core.indexes.range import RangeIndex
-from pandas.core.indexes.timedeltas import TimedeltaIndex
-
-_sort_msg = textwrap.dedent(
- """\
-Sorting because non-concatenation axis is not aligned. A future version
-of pandas will change to not sort by default.
-
-To accept the future behavior, pass 'sort=False'.
-
-To retain the current behavior and silence the warning, pass 'sort=True'.
-"""
-)
-
-
-__all__ = [
- "Index",
- "MultiIndex",
- "CategoricalIndex",
- "IntervalIndex",
- "RangeIndex",
- "InvalidIndexError",
- "TimedeltaIndex",
- "PeriodIndex",
- "DatetimeIndex",
- "_new_Index",
- "NaT",
- "ensure_index",
- "ensure_index_from_sequences",
- "get_objs_combined_axis",
- "union_indexes",
- "get_unanimous_names",
- "all_indexes_same",
- "default_index",
- "safe_sort_index",
-]
-
-
-def get_objs_combined_axis(
- objs, intersect: bool = False, axis: Axis = 0, sort: bool = True, copy: bool = False
-) -> Index:
- """
- Extract combined index: return intersection or union (depending on the
- value of "intersect") of indexes on given axis, or None if all objects
- lack indexes (e.g. they are numpy arrays).
-
- Parameters
- ----------
- objs : list
-        Series or DataFrame objects, may be a mix of the two.
- intersect : bool, default False
- If True, calculate the intersection between indexes. Otherwise,
- calculate the union.
-    axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to extract indexes from.
- sort : bool, default True
- Whether the result index should come out sorted or not.
- copy : bool, default False
- If True, return a copy of the combined index.
-
- Returns
- -------
- Index
- """
- obs_idxes = [obj._get_axis(axis) for obj in objs]
- return _get_combined_index(obs_idxes, intersect=intersect, sort=sort, copy=copy)
-
-
-def _get_distinct_objs(objs: list[Index]) -> list[Index]:
- """
- Return a list with distinct elements of "objs" (different ids).
- Preserves order.
- """
- ids: set[int] = set()
- res = []
- for obj in objs:
- if id(obj) not in ids:
- ids.add(id(obj))
- res.append(obj)
- return res
-
-
-def _get_combined_index(
- indexes: list[Index],
- intersect: bool = False,
- sort: bool = False,
- copy: bool = False,
-) -> Index:
- """
- Return the union or intersection of indexes.
-
- Parameters
- ----------
- indexes : list of Index or list objects
- When intersect=True, do not accept list of lists.
- intersect : bool, default False
- If True, calculate the intersection between indexes. Otherwise,
- calculate the union.
- sort : bool, default False
- Whether the result index should come out sorted or not.
- copy : bool, default False
- If True, return a copy of the combined index.
-
- Returns
- -------
- Index
- """
- # TODO: handle index names!
- indexes = _get_distinct_objs(indexes)
- if len(indexes) == 0:
- index = Index([])
- elif len(indexes) == 1:
- index = indexes[0]
- elif intersect:
- index = indexes[0]
- for other in indexes[1:]:
- index = index.intersection(other)
- else:
- index = union_indexes(indexes, sort=False)
- index = ensure_index(index)
-
- if sort:
- index = safe_sort_index(index)
- # GH 29879
- if copy:
- index = index.copy()
-
- return index
-
-
-def safe_sort_index(index: Index) -> Index:
- """
-    Return the sorted index.
-
- We keep the dtypes and the name attributes.
-
- Parameters
- ----------
- index : an Index
-
- Returns
- -------
- Index
- """
- if index.is_monotonic_increasing:
- return index
-
- try:
- array_sorted = safe_sort(index)
- except TypeError:
- pass
- else:
- if isinstance(array_sorted, Index):
- return array_sorted
-
- array_sorted = cast(np.ndarray, array_sorted)
- if isinstance(index, MultiIndex):
- index = MultiIndex.from_tuples(array_sorted, names=index.names)
- else:
- index = Index(array_sorted, name=index.name, dtype=index.dtype)
-
- return index
-
-
-def union_indexes(indexes, sort: bool | None = True) -> Index:
- """
- Return the union of indexes.
-
- The behavior of sort and names is not consistent.
-
- Parameters
- ----------
- indexes : list of Index or list objects
- sort : bool, default True
- Whether the result index should come out sorted or not.
-
- Returns
- -------
- Index
- """
- if len(indexes) == 0:
- raise AssertionError("Must have at least 1 Index to union")
- if len(indexes) == 1:
- result = indexes[0]
- if isinstance(result, list):
- result = Index(sorted(result))
- return result
-
- indexes, kind = _sanitize_and_check(indexes)
-
- def _unique_indices(inds, dtype) -> Index:
- """
- Convert indexes to lists and concatenate them, removing duplicates.
-
- The final dtype is inferred.
-
- Parameters
- ----------
- inds : list of Index or list objects
- dtype : dtype to set for the resulting Index
-
- Returns
- -------
- Index
- """
-
- def conv(i):
- if isinstance(i, Index):
- i = i.tolist()
- return i
-
- return Index(
- lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort),
- dtype=dtype,
- )
-
- def _find_common_index_dtype(inds):
- """
- Finds a common type for the indexes to pass through to resulting index.
-
- Parameters
- ----------
-        inds : list of Index or list objects
-
- Returns
- -------
- The common type or None if no indexes were given
- """
-        dtypes = [idx.dtype for idx in inds if isinstance(idx, Index)]
- if dtypes:
- dtype = find_common_type(dtypes)
- else:
- dtype = None
-
- return dtype
-
- if kind == "special":
- result = indexes[0]
-
- dtis = [x for x in indexes if isinstance(x, DatetimeIndex)]
- dti_tzs = [x for x in dtis if x.tz is not None]
- if len(dti_tzs) not in [0, len(dtis)]:
- # TODO: this behavior is not tested (so may not be desired),
- # but is kept in order to keep behavior the same when
- # deprecating union_many
- # test_frame_from_dict_with_mixed_indexes
- raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
-
- if len(dtis) == len(indexes):
- sort = True
- result = indexes[0]
-
- elif len(dtis) > 1:
- # If we have mixed timezones, our casting behavior may depend on
- # the order of indexes, which we don't want.
- sort = False
-
- # TODO: what about Categorical[dt64]?
- # test_frame_from_dict_with_mixed_indexes
- indexes = [x.astype(object, copy=False) for x in indexes]
- result = indexes[0]
-
- for other in indexes[1:]:
- result = result.union(other, sort=None if sort else False)
- return result
-
- elif kind == "array":
- dtype = _find_common_index_dtype(indexes)
- index = indexes[0]
- if not all(index.equals(other) for other in indexes[1:]):
- index = _unique_indices(indexes, dtype)
-
- name = get_unanimous_names(*indexes)[0]
- if name != index.name:
- index = index.rename(name)
- return index
- else: # kind='list'
- dtype = _find_common_index_dtype(indexes)
- return _unique_indices(indexes, dtype)
-
-
-def _sanitize_and_check(indexes):
- """
- Verify the type of indexes and convert lists to Index.
-
- Cases:
-
- - [list, list, ...]: Return ([list, list, ...], 'list')
- - [list, Index, ...]: Return _sanitize_and_check([Index, Index, ...])
- Lists are sorted and converted to Index.
- - [Index, Index, ...]: Return ([Index, Index, ...], TYPE)
- TYPE = 'special' if at least one special type, 'array' otherwise.
-
- Parameters
- ----------
- indexes : list of Index or list objects
-
- Returns
- -------
- sanitized_indexes : list of Index or list objects
- type : {'list', 'array', 'special'}
- """
- kinds = list({type(index) for index in indexes})
-
- if list in kinds:
- if len(kinds) > 1:
- indexes = [
- Index(list(x)) if not isinstance(x, Index) else x for x in indexes
- ]
- kinds.remove(list)
- else:
- return indexes, "list"
-
- if len(kinds) > 1 or Index not in kinds:
- return indexes, "special"
- else:
- return indexes, "array"
-
-
-def all_indexes_same(indexes) -> bool:
- """
- Determine if all indexes contain the same elements.
-
- Parameters
- ----------
- indexes : iterable of Index objects
-
- Returns
- -------
- bool
- True if all indexes contain the same elements, False otherwise.
- """
- itr = iter(indexes)
- first = next(itr)
- return all(first.equals(index) for index in itr)
-
-
-def default_index(n: int) -> RangeIndex:
- rng = range(0, n)
- return RangeIndex._simple_new(rng, name=None)
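The combining helpers above ultimately reduce to pairwise `Index.union` / `Index.intersection` calls. A minimal illustration with the public API (outputs in comments are indicative):

import pandas as pd

left = pd.Index([1, 2, 3])
right = pd.Index([2, 3, 4])

print(left.union(right))          # Index([1, 2, 3, 4], dtype='int64')
print(left.intersection(right))   # Index([2, 3], dtype='int64')

# _get_combined_index(..., intersect=True) folds intersection() over the list of
# distinct indexes, then optionally sorts the result via safe_sort_index.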
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/base.py b/contrib/python/pandas/py3/pandas/core/indexes/base.py
deleted file mode 100644
index 6df553fd57e..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/base.py
+++ /dev/null
@@ -1,7243 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime
-import functools
-from itertools import zip_longest
-import operator
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- ClassVar,
- Hashable,
- Iterable,
- Literal,
- NoReturn,
- Sequence,
- TypeVar,
- cast,
- final,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import (
- NaT,
- algos as libalgos,
- index as libindex,
- lib,
-)
-from pandas._libs.internals import BlockValuesRefs
-import pandas._libs.join as libjoin
-from pandas._libs.lib import (
- is_datetime_array,
- no_default,
-)
-from pandas._libs.missing import is_float_nan
-from pandas._libs.tslibs import (
- IncompatibleFrequency,
- OutOfBoundsDatetime,
- Timestamp,
- tz_compare,
-)
-from pandas._typing import (
- AnyAll,
- ArrayLike,
- Axes,
- Axis,
- DropKeep,
- DtypeObj,
- F,
- IgnoreRaise,
- IndexLabel,
- JoinHow,
- Level,
- Shape,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import (
- DuplicateLabelError,
- InvalidIndexError,
-)
-from pandas.util._decorators import (
- Appender,
- cache_readonly,
- doc,
-)
-from pandas.util._exceptions import (
- find_stack_level,
- rewrite_exception,
-)
-
-from pandas.core.dtypes.astype import (
- astype_array,
- astype_is_view,
-)
-from pandas.core.dtypes.cast import (
- LossySetitemError,
- can_hold_element,
- common_dtype_categorical_compat,
- find_result_type,
- infer_dtype_from,
- maybe_cast_pointwise_result,
- np_can_hold_element,
-)
-from pandas.core.dtypes.common import (
- ensure_int64,
- ensure_object,
- ensure_platform_int,
- is_any_real_numeric_dtype,
- is_bool_dtype,
- is_categorical_dtype,
- is_dtype_equal,
- is_ea_or_datetimelike_dtype,
- is_extension_array_dtype,
- is_float,
- is_float_dtype,
- is_hashable,
- is_integer,
- is_integer_dtype,
- is_interval_dtype,
- is_iterator,
- is_list_like,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- is_signed_integer_dtype,
- is_string_dtype,
- needs_i8_conversion,
- pandas_dtype,
- validate_all_hashable,
-)
-from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- DatetimeTZDtype,
- ExtensionDtype,
- IntervalDtype,
- PeriodDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCDatetimeIndex,
- ABCMultiIndex,
- ABCPeriodIndex,
- ABCSeries,
- ABCTimedeltaIndex,
-)
-from pandas.core.dtypes.inference import is_dict_like
-from pandas.core.dtypes.missing import (
- array_equivalent,
- is_valid_na_for_dtype,
- isna,
-)
-
-from pandas.core import (
- arraylike,
- ops,
-)
-from pandas.core.accessor import CachedAccessor
-import pandas.core.algorithms as algos
-from pandas.core.array_algos.putmask import (
- setitem_datetimelike_compat,
- validate_putmask,
-)
-from pandas.core.arrays import (
- ArrowExtensionArray,
- BaseMaskedArray,
- Categorical,
- ExtensionArray,
-)
-from pandas.core.arrays.string_ import StringArray
-from pandas.core.base import (
- IndexOpsMixin,
- PandasObject,
-)
-import pandas.core.common as com
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
- sanitize_array,
-)
-from pandas.core.indexers import disallow_ndim_indexing
-from pandas.core.indexes.frozen import FrozenList
-from pandas.core.missing import clean_reindex_fill_method
-from pandas.core.ops import get_op_result_name
-from pandas.core.ops.invalid import make_invalid_op
-from pandas.core.sorting import (
- ensure_key_mapped,
- get_group_index_sorter,
- nargsort,
-)
-from pandas.core.strings.accessor import StringMethods
-
-from pandas.io.formats.printing import (
- PrettyDict,
- default_pprint,
- format_object_summary,
- pprint_thing,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- CategoricalIndex,
- DataFrame,
- MultiIndex,
- Series,
- )
- from pandas.core.arrays import PeriodArray
-
-
-__all__ = ["Index"]
-
-_unsortable_types = frozenset(("mixed", "mixed-integer"))
-
-_index_doc_kwargs: dict[str, str] = {
- "klass": "Index",
- "inplace": "",
- "target_klass": "Index",
- "raises_section": "",
- "unique": "Index",
- "duplicated": "np.ndarray",
-}
-_index_shared_docs: dict[str, str] = {}
-str_t = str
-
-
-_dtype_obj = np.dtype("object")
-
-_masked_engines = {
- "Complex128": libindex.MaskedComplex128Engine,
- "Complex64": libindex.MaskedComplex64Engine,
- "Float64": libindex.MaskedFloat64Engine,
- "Float32": libindex.MaskedFloat32Engine,
- "UInt64": libindex.MaskedUInt64Engine,
- "UInt32": libindex.MaskedUInt32Engine,
- "UInt16": libindex.MaskedUInt16Engine,
- "UInt8": libindex.MaskedUInt8Engine,
- "Int64": libindex.MaskedInt64Engine,
- "Int32": libindex.MaskedInt32Engine,
- "Int16": libindex.MaskedInt16Engine,
- "Int8": libindex.MaskedInt8Engine,
- "boolean": libindex.MaskedBoolEngine,
- "double[pyarrow]": libindex.MaskedFloat64Engine,
- "float64[pyarrow]": libindex.MaskedFloat64Engine,
- "float32[pyarrow]": libindex.MaskedFloat32Engine,
- "float[pyarrow]": libindex.MaskedFloat32Engine,
- "uint64[pyarrow]": libindex.MaskedUInt64Engine,
- "uint32[pyarrow]": libindex.MaskedUInt32Engine,
- "uint16[pyarrow]": libindex.MaskedUInt16Engine,
- "uint8[pyarrow]": libindex.MaskedUInt8Engine,
- "int64[pyarrow]": libindex.MaskedInt64Engine,
- "int32[pyarrow]": libindex.MaskedInt32Engine,
- "int16[pyarrow]": libindex.MaskedInt16Engine,
- "int8[pyarrow]": libindex.MaskedInt8Engine,
- "bool[pyarrow]": libindex.MaskedBoolEngine,
-}
-
-
-def _maybe_return_indexers(meth: F) -> F:
- """
- Decorator to simplify 'return_indexers' checks in Index.join.
- """
-
- @functools.wraps(meth)
- def join(
- self,
- other: Index,
- *,
- how: JoinHow = "left",
- level=None,
- return_indexers: bool = False,
- sort: bool = False,
- ):
- join_index, lidx, ridx = meth(self, other, how=how, level=level, sort=sort)
- if not return_indexers:
- return join_index
-
- if lidx is not None:
- lidx = ensure_platform_int(lidx)
- if ridx is not None:
- ridx = ensure_platform_int(ridx)
- return join_index, lidx, ridx
-
- return cast(F, join)
-
-
-def _new_Index(cls, d):
- """
- This is called upon unpickling, rather than the default which doesn't
- have arguments and breaks __new__.
- """
- # required for backward compat, because PI can't be instantiated with
- # ordinals through __new__ GH #13277
- if issubclass(cls, ABCPeriodIndex):
- from pandas.core.indexes.period import _new_PeriodIndex
-
- return _new_PeriodIndex(cls, **d)
-
- if issubclass(cls, ABCMultiIndex):
- if "labels" in d and "codes" not in d:
- # GH#23752 "labels" kwarg has been replaced with "codes"
- d["codes"] = d.pop("labels")
-
- # Since this was a valid MultiIndex at pickle-time, we don't need to
-        # check validity at un-pickle time.
- d["verify_integrity"] = False
-
- elif "dtype" not in d and "data" in d:
- # Prevent Index.__new__ from conducting inference;
- # "data" key not in RangeIndex
- d["dtype"] = d["data"].dtype
- return cls.__new__(cls, **d)
-
-
-_IndexT = TypeVar("_IndexT", bound="Index")
-
-
-class Index(IndexOpsMixin, PandasObject):
- """
- Immutable sequence used for indexing and alignment.
-
- The basic object storing axis labels for all pandas objects.
-
- .. versionchanged:: 2.0.0
-
- Index can hold all numpy numeric dtypes (except float16). Previously only
- int64/uint64/float64 dtypes were accepted.
-
- Parameters
- ----------
- data : array-like (1-dimensional)
- dtype : NumPy dtype (default: object)
- If dtype is None, we find the dtype that best fits the data.
- If an actual dtype is provided, we coerce to that dtype if it's safe.
- Otherwise, an error will be raised.
- copy : bool
- Make a copy of input ndarray.
- name : object
- Name to be stored in the index.
- tupleize_cols : bool (default: True)
- When True, attempt to create a MultiIndex if possible.
-
- See Also
- --------
- RangeIndex : Index implementing a monotonic integer range.
- CategoricalIndex : Index of :class:`Categorical` s.
- MultiIndex : A multi-level, or hierarchical Index.
- IntervalIndex : An Index of :class:`Interval` s.
- DatetimeIndex : Index of datetime64 data.
- TimedeltaIndex : Index of timedelta64 data.
- PeriodIndex : Index of Period data.
-
- Notes
- -----
- An Index instance can **only** contain hashable objects.
- An Index instance *can not* hold numpy float16 dtype.
-
- Examples
- --------
- >>> pd.Index([1, 2, 3])
- Index([1, 2, 3], dtype='int64')
-
- >>> pd.Index(list('abc'))
- Index(['a', 'b', 'c'], dtype='object')
-
- >>> pd.Index([1, 2, 3], dtype="uint8")
- Index([1, 2, 3], dtype='uint8')
- """
-
- # To hand over control to subclasses
- _join_precedence = 1
-
- # Cython methods; see github.com/cython/cython/issues/2647
- # for why we need to wrap these instead of making them class attributes
- # Moreover, cython will choose the appropriate-dtyped sub-function
- # given the dtypes of the passed arguments
-
- @final
- def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]:
- # Caller is responsible for ensuring other.dtype == self.dtype
- sv = self._get_join_target()
- ov = other._get_join_target()
- # can_use_libjoin assures sv and ov are ndarrays
- sv = cast(np.ndarray, sv)
- ov = cast(np.ndarray, ov)
- # similar but not identical to ov.searchsorted(sv)
- return libjoin.left_join_indexer_unique(sv, ov)
-
- @final
- def _left_indexer(
- self: _IndexT, other: _IndexT
- ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- # Caller is responsible for ensuring other.dtype == self.dtype
- sv = self._get_join_target()
- ov = other._get_join_target()
- # can_use_libjoin assures sv and ov are ndarrays
- sv = cast(np.ndarray, sv)
- ov = cast(np.ndarray, ov)
- joined_ndarray, lidx, ridx = libjoin.left_join_indexer(sv, ov)
- joined = self._from_join_target(joined_ndarray)
- return joined, lidx, ridx
-
- @final
- def _inner_indexer(
- self: _IndexT, other: _IndexT
- ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- # Caller is responsible for ensuring other.dtype == self.dtype
- sv = self._get_join_target()
- ov = other._get_join_target()
- # can_use_libjoin assures sv and ov are ndarrays
- sv = cast(np.ndarray, sv)
- ov = cast(np.ndarray, ov)
- joined_ndarray, lidx, ridx = libjoin.inner_join_indexer(sv, ov)
- joined = self._from_join_target(joined_ndarray)
- return joined, lidx, ridx
-
- @final
- def _outer_indexer(
- self: _IndexT, other: _IndexT
- ) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- # Caller is responsible for ensuring other.dtype == self.dtype
- sv = self._get_join_target()
- ov = other._get_join_target()
- # can_use_libjoin assures sv and ov are ndarrays
- sv = cast(np.ndarray, sv)
- ov = cast(np.ndarray, ov)
- joined_ndarray, lidx, ridx = libjoin.outer_join_indexer(sv, ov)
- joined = self._from_join_target(joined_ndarray)
- return joined, lidx, ridx
-
- _typ: str = "index"
- _data: ExtensionArray | np.ndarray
- _data_cls: type[ExtensionArray] | tuple[type[np.ndarray], type[ExtensionArray]] = (
- np.ndarray,
- ExtensionArray,
- )
- _id: object | None = None
- _name: Hashable = None
- # MultiIndex.levels previously allowed setting the index name. We
- # don't allow this anymore, and raise if it happens rather than
- # failing silently.
- _no_setting_name: bool = False
- _comparables: list[str] = ["name"]
- _attributes: list[str] = ["name"]
-
- @cache_readonly
- def _can_hold_strings(self) -> bool:
- return not is_numeric_dtype(self)
-
- _engine_types: dict[np.dtype | ExtensionDtype, type[libindex.IndexEngine]] = {
- np.dtype(np.int8): libindex.Int8Engine,
- np.dtype(np.int16): libindex.Int16Engine,
- np.dtype(np.int32): libindex.Int32Engine,
- np.dtype(np.int64): libindex.Int64Engine,
- np.dtype(np.uint8): libindex.UInt8Engine,
- np.dtype(np.uint16): libindex.UInt16Engine,
- np.dtype(np.uint32): libindex.UInt32Engine,
- np.dtype(np.uint64): libindex.UInt64Engine,
- np.dtype(np.float32): libindex.Float32Engine,
- np.dtype(np.float64): libindex.Float64Engine,
- np.dtype(np.complex64): libindex.Complex64Engine,
- np.dtype(np.complex128): libindex.Complex128Engine,
- }
-
- @property
- def _engine_type(
- self,
- ) -> type[libindex.IndexEngine] | type[libindex.ExtensionEngine]:
- return self._engine_types.get(self.dtype, libindex.ObjectEngine)
-
- # whether we support partial string indexing. Overridden
- # in DatetimeIndex and PeriodIndex
- _supports_partial_string_indexing = False
-
- _accessors = {"str"}
-
- str = CachedAccessor("str", StringMethods)
-
- _references = None
-
- # --------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls,
- data=None,
- dtype=None,
- copy: bool = False,
- name=None,
- tupleize_cols: bool = True,
- ) -> Index:
- from pandas.core.indexes.range import RangeIndex
-
- name = maybe_extract_name(name, data, cls)
-
- if dtype is not None:
- dtype = pandas_dtype(dtype)
-
- data_dtype = getattr(data, "dtype", None)
-
- refs = None
- if not copy and isinstance(data, (ABCSeries, Index)):
- refs = data._references
-
- # range
- if isinstance(data, (range, RangeIndex)):
- result = RangeIndex(start=data, copy=copy, name=name)
- if dtype is not None:
- return result.astype(dtype, copy=False)
- return result
-
- elif is_ea_or_datetimelike_dtype(dtype):
- # non-EA dtype indexes have special casting logic, so we punt here
- pass
-
- elif is_ea_or_datetimelike_dtype(data_dtype):
- pass
-
- elif isinstance(data, (np.ndarray, Index, ABCSeries)):
- if isinstance(data, ABCMultiIndex):
- data = data._values
-
- if data.dtype.kind not in ["i", "u", "f", "b", "c", "m", "M"]:
- # GH#11836 we need to avoid having numpy coerce
- # things that look like ints/floats to ints unless
- # they are actually ints, e.g. '0' and 0.0
- # should not be coerced
- data = com.asarray_tuplesafe(data, dtype=_dtype_obj)
-
- elif is_scalar(data):
- raise cls._raise_scalar_data_error(data)
- elif hasattr(data, "__array__"):
- return Index(np.asarray(data), dtype=dtype, copy=copy, name=name)
- elif not is_list_like(data) and not isinstance(data, memoryview):
- # 2022-11-16 the memoryview check is only necessary on some CI
- # builds, not clear why
- raise cls._raise_scalar_data_error(data)
-
- else:
- if tupleize_cols:
- # GH21470: convert iterable to list before determining if empty
- if is_iterator(data):
- data = list(data)
-
- if data and all(isinstance(e, tuple) for e in data):
- # we must be all tuples, otherwise don't construct
- # 10697
- from pandas.core.indexes.multi import MultiIndex
-
- return MultiIndex.from_tuples(data, names=name)
- # other iterable of some kind
-
- if not isinstance(data, (list, tuple)):
- # we allow set/frozenset, which Series/sanitize_array does not, so
- # cast to list here
- data = list(data)
- if len(data) == 0:
- # unlike Series, we default to object dtype:
- data = np.array(data, dtype=object)
-
- if len(data) and isinstance(data[0], tuple):
- # Ensure we get 1-D array of tuples instead of 2D array.
- data = com.asarray_tuplesafe(data, dtype=_dtype_obj)
-
- try:
- arr = sanitize_array(data, None, dtype=dtype, copy=copy)
- except ValueError as err:
- if "index must be specified when data is not list-like" in str(err):
- raise cls._raise_scalar_data_error(data) from err
- if "Data must be 1-dimensional" in str(err):
- raise ValueError("Index data must be 1-dimensional") from err
- raise
- arr = ensure_wrapped_if_datetimelike(arr)
-
- klass = cls._dtype_to_subclass(arr.dtype)
-
- arr = klass._ensure_array(arr, arr.dtype, copy=False)
- return klass._simple_new(arr, name, refs=refs)
-
- @classmethod
- def _ensure_array(cls, data, dtype, copy: bool):
- """
- Ensure we have a valid array to pass to _simple_new.
- """
- if data.ndim > 1:
- # GH#13601, GH#20285, GH#27125
- raise ValueError("Index data must be 1-dimensional")
- elif dtype == np.float16:
- # float16 not supported (no indexing engine)
- raise NotImplementedError("float16 indexes are not supported")
-
- if copy:
- # asarray_tuplesafe does not always copy underlying data,
- # so need to make sure that this happens
- data = data.copy()
- return data
-
- @final
- @classmethod
- def _dtype_to_subclass(cls, dtype: DtypeObj):
- # Delay import for perf. https://github.com/pandas-dev/pandas/pull/31423
-
- if isinstance(dtype, ExtensionDtype):
- if isinstance(dtype, DatetimeTZDtype):
- from pandas import DatetimeIndex
-
- return DatetimeIndex
- elif isinstance(dtype, CategoricalDtype):
- from pandas import CategoricalIndex
-
- return CategoricalIndex
- elif isinstance(dtype, IntervalDtype):
- from pandas import IntervalIndex
-
- return IntervalIndex
- elif isinstance(dtype, PeriodDtype):
- from pandas import PeriodIndex
-
- return PeriodIndex
-
- return Index
-
- if dtype.kind == "M":
- from pandas import DatetimeIndex
-
- return DatetimeIndex
-
- elif dtype.kind == "m":
- from pandas import TimedeltaIndex
-
- return TimedeltaIndex
-
- elif dtype.kind == "O":
- # NB: assuming away MultiIndex
- return Index
-
- elif issubclass(dtype.type, str) or is_numeric_dtype(dtype):
- return Index
-
- raise NotImplementedError(dtype)
-
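The effect of `_dtype_to_subclass` is visible through the public constructor: `pd.Index(...)` returns the specialized subclass matching the inferred dtype. A hedged sketch (outputs in comments are indicative):

import pandas as pd

print(type(pd.Index(pd.date_range("2024-01-01", periods=2))).__name__)   # DatetimeIndex
print(type(pd.Index(pd.timedelta_range("1 day", periods=2))).__name__)   # TimedeltaIndex
print(type(pd.Index(pd.Categorical(["a", "b"]))).__name__)               # CategoricalIndex
print(type(pd.Index(range(3))).__name__)                                 # RangeIndex (handled before dispatch)
print(type(pd.Index([1.5, 2.5])).__name__)                               # Index (plain numeric dtype)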
- # NOTE for new Index creation:
-
- # - _simple_new: It returns new Index with the same type as the caller.
- # All metadata (such as name) must be provided by caller's responsibility.
- # Using _shallow_copy is recommended because it fills these metadata
- # otherwise specified.
-
- # - _shallow_copy: It returns new Index with the same type (using
- # _simple_new), but fills caller's metadata otherwise specified. Passed
- # kwargs will overwrite corresponding metadata.
-
- # See each method's docstring.
-
- @classmethod
- def _simple_new(
- cls: type[_IndexT], values: ArrayLike, name: Hashable = None, refs=None
- ) -> _IndexT:
- """
-        We require values with a dtype compatible with this class. If we are
-        passed values with an incompatible dtype, coerce using the constructor.
-
- Must be careful not to recurse.
- """
- assert isinstance(values, cls._data_cls), type(values)
-
- result = object.__new__(cls)
- result._data = values
- result._name = name
- result._cache = {}
- result._reset_identity()
- if refs is not None:
- result._references = refs
- else:
- result._references = BlockValuesRefs()
- result._references.add_index_reference(result)
-
- return result
-
- @classmethod
- def _with_infer(cls, *args, **kwargs):
- """
- Constructor that uses the 1.0.x behavior inferring numeric dtypes
- for ndarray[object] inputs.
- """
- result = cls(*args, **kwargs)
-
- if result.dtype == _dtype_obj and not result._is_multi:
- # error: Argument 1 to "maybe_convert_objects" has incompatible type
- # "Union[ExtensionArray, ndarray[Any, Any]]"; expected
- # "ndarray[Any, Any]"
- values = lib.maybe_convert_objects(result._values) # type: ignore[arg-type]
- if values.dtype.kind in ["i", "u", "f", "b"]:
- return Index(values, name=result.name)
-
- return result
-
- @cache_readonly
- def _constructor(self: _IndexT) -> type[_IndexT]:
- return type(self)
-
- @final
- def _maybe_check_unique(self) -> None:
- """
- Check that an Index has no duplicates.
-
- This is typically only called via
- `NDFrame.flags.allows_duplicate_labels.setter` when it's set to
- True (duplicates aren't allowed).
-
- Raises
- ------
- DuplicateLabelError
- When the index is not unique.
- """
- if not self.is_unique:
- msg = """Index has duplicates."""
- duplicates = self._format_duplicate_message()
- msg += f"\n{duplicates}"
-
- raise DuplicateLabelError(msg)
-
- @final
- def _format_duplicate_message(self) -> DataFrame:
- """
- Construct the DataFrame for a DuplicateLabelError.
-
- This returns a DataFrame indicating the labels and positions
- of duplicates in an index. This should only be called when it's
- already known that duplicates are present.
-
- Examples
- --------
- >>> idx = pd.Index(['a', 'b', 'a'])
- >>> idx._format_duplicate_message()
- positions
- label
- a [0, 2]
- """
- from pandas import Series
-
- duplicates = self[self.duplicated(keep="first")].unique()
- assert len(duplicates)
-
- out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates]
- if self._is_multi:
- # test_format_duplicate_labels_message_multi
- # error: "Type[Index]" has no attribute "from_tuples" [attr-defined]
- out.index = type(self).from_tuples(out.index) # type: ignore[attr-defined]
-
- if self.nlevels == 1:
- out = out.rename_axis("label")
- return out.to_frame(name="positions")
-
- # --------------------------------------------------------------------
- # Index Internals Methods
-
- def _shallow_copy(self: _IndexT, values, name: Hashable = no_default) -> _IndexT:
- """
- Create a new Index with the same class as the caller, don't copy the
- data, use the same object attributes with passed in attributes taking
- precedence.
-
- *this is an internal non-public method*
-
- Parameters
- ----------
- values : the values to create the new Index, optional
- name : Label, defaults to self.name
- """
- name = self._name if name is no_default else name
-
- return self._simple_new(values, name=name, refs=self._references)
-
- def _view(self: _IndexT) -> _IndexT:
- """
- fastpath to make a shallow copy, i.e. new object with same data.
- """
- result = self._simple_new(self._values, name=self._name, refs=self._references)
-
- result._cache = self._cache
- return result
-
- @final
- def _rename(self: _IndexT, name: Hashable) -> _IndexT:
- """
- fastpath for rename if new name is already validated.
- """
- result = self._view()
- result._name = name
- return result
-
- @final
- def is_(self, other) -> bool:
- """
- More flexible, faster check like ``is`` but that works through views.
-
- Note: this is *not* the same as ``Index.identical()``, which checks
- that metadata is also the same.
-
- Parameters
- ----------
- other : object
- Other object to compare against.
-
- Returns
- -------
- bool
- True if both have same underlying data, False otherwise.
-
- See Also
- --------
- Index.identical : Works like ``Index.is_`` but also checks metadata.
- """
- if self is other:
- return True
- elif not hasattr(other, "_id"):
- return False
- elif self._id is None or other._id is None:
- return False
- else:
- return self._id is other._id
-
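A small sketch of how `is_` differs from value equality: views created through `Index.view()` share the identity object, while `copy()` gets a fresh one (outputs in comments are indicative):

import pandas as pd

idx = pd.Index(["a", "b", "c"])

print(idx.is_(idx))            # True  -- same object
print(idx.is_(idx.view()))     # True  -- a view shares the underlying identity
print(idx.is_(idx.copy()))     # False -- a copy gets a new identity
print(idx.equals(idx.copy()))  # True  -- element-wise equality is a separate notion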
- @final
- def _reset_identity(self) -> None:
- """
- Initializes or resets ``_id`` attribute with new object.
- """
- self._id = object()
-
- @final
- def _cleanup(self) -> None:
- self._engine.clear_mapping()
-
- @cache_readonly
- def _engine(
- self,
- ) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedIndexEngine:
- # For base class (object dtype) we get ObjectEngine
- target_values = self._get_engine_target()
- if isinstance(target_values, ExtensionArray):
- if isinstance(target_values, (BaseMaskedArray, ArrowExtensionArray)):
- try:
- return _masked_engines[target_values.dtype.name](target_values)
- except KeyError:
- # Not supported yet e.g. decimal
- pass
- elif self._engine_type is libindex.ObjectEngine:
- return libindex.ExtensionEngine(target_values)
-
- target_values = cast(np.ndarray, target_values)
- # to avoid a reference cycle, bind `target_values` to a local variable, so
- # `self` is not passed into the lambda.
- if target_values.dtype == bool:
- return libindex.BoolEngine(target_values)
- elif target_values.dtype == np.complex64:
- return libindex.Complex64Engine(target_values)
- elif target_values.dtype == np.complex128:
- return libindex.Complex128Engine(target_values)
- elif needs_i8_conversion(self.dtype):
- # We need to keep M8/m8 dtype when initializing the Engine,
- # but don't want to change _get_engine_target bc it is used
- # elsewhere
- # error: Item "ExtensionArray" of "Union[ExtensionArray,
- # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr]
- target_values = self._data._ndarray # type: ignore[union-attr]
-
- # error: Argument 1 to "ExtensionEngine" has incompatible type
- # "ndarray[Any, Any]"; expected "ExtensionArray"
- return self._engine_type(target_values) # type: ignore[arg-type]
-
- @final
- @cache_readonly
- def _dir_additions_for_owner(self) -> set[str_t]:
- """
- Add the string-like labels to the owner dataframe/series dir output.
-
-        If this is a MultiIndex, its first-level values are used.
- """
- return {
- c
- for c in self.unique(level=0)[: get_option("display.max_dir_items")]
- if isinstance(c, str) and c.isidentifier()
- }
-
- # --------------------------------------------------------------------
- # Array-Like Methods
-
- # ndarray compat
- def __len__(self) -> int:
- """
- Return the length of the Index.
- """
- return len(self._data)
-
- def __array__(self, dtype=None) -> np.ndarray:
- """
- The array interface, return my values.
- """
- return np.asarray(self._data, dtype=dtype)
-
- def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
- if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs):
- return NotImplemented
-
- result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- if "out" in kwargs:
- # e.g. test_dti_isub_tdi
- return arraylike.dispatch_ufunc_with_out(
- self, ufunc, method, *inputs, **kwargs
- )
-
- if method == "reduce":
- result = arraylike.dispatch_reduction_ufunc(
- self, ufunc, method, *inputs, **kwargs
- )
- if result is not NotImplemented:
- return result
-
- new_inputs = [x if x is not self else x._values for x in inputs]
- result = getattr(ufunc, method)(*new_inputs, **kwargs)
- if ufunc.nout == 2:
- # i.e. np.divmod, np.modf, np.frexp
- return tuple(self.__array_wrap__(x) for x in result)
-
- if result.dtype == np.float16:
- result = result.astype(np.float32)
-
- return self.__array_wrap__(result)
-
- def __array_wrap__(self, result, context=None):
- """
- Gets called after a ufunc and other functions e.g. np.split.
- """
- result = lib.item_from_zerodim(result)
- if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1:
- return result
-
- return Index(result, name=self.name)
-
- @cache_readonly
- def dtype(self) -> DtypeObj:
- """
- Return the dtype object of the underlying data.
- """
- return self._data.dtype
-
- @final
- def ravel(self, order: str_t = "C") -> Index:
- """
- Return a view on self.
-
- Returns
- -------
- Index
-
- See Also
- --------
- numpy.ndarray.ravel : Return a flattened array.
- """
- return self[:]
-
- def view(self, cls=None):
- # we need to see if we are subclassing an
- # index type here
- if cls is not None and not hasattr(cls, "_typ"):
- dtype = cls
- if isinstance(cls, str):
- dtype = pandas_dtype(cls)
-
- if isinstance(dtype, (np.dtype, ExtensionDtype)) and needs_i8_conversion(
- dtype
- ):
- if dtype.kind == "m" and dtype != "m8[ns]":
- # e.g. m8[s]
- return self._data.view(cls)
-
- idx_cls = self._dtype_to_subclass(dtype)
- # NB: we only get here for subclasses that override
- # _data_cls such that it is a type and not a tuple
- # of types.
- arr_cls = idx_cls._data_cls
- arr = arr_cls(self._data.view("i8"), dtype=dtype)
- return idx_cls._simple_new(arr, name=self.name, refs=self._references)
-
- result = self._data.view(cls)
- else:
- result = self._view()
- if isinstance(result, Index):
- result._id = self._id
- return result
-
- def astype(self, dtype, copy: bool = True):
- """
- Create an Index with values cast to dtypes.
-
- The class of a new Index is determined by dtype. When conversion is
- impossible, a TypeError exception is raised.
-
- Parameters
- ----------
- dtype : numpy dtype or pandas type
- Note that any signed integer `dtype` is treated as ``'int64'``,
- and any unsigned integer `dtype` is treated as ``'uint64'``,
- regardless of the size.
- copy : bool, default True
- By default, astype always returns a newly allocated object.
- If copy is set to False and internal requirements on dtype are
- satisfied, the original data is used to create a new Index
- or the original Index is returned.
-
- Returns
- -------
- Index
- Index with values cast to specified dtype.
- """
- if dtype is not None:
- dtype = pandas_dtype(dtype)
-
- if is_dtype_equal(self.dtype, dtype):
- # Ensure that self.astype(self.dtype) is self
- return self.copy() if copy else self
-
- values = self._data
- if isinstance(values, ExtensionArray):
- with rewrite_exception(type(values).__name__, type(self).__name__):
- new_values = values.astype(dtype, copy=copy)
-
- elif isinstance(dtype, ExtensionDtype):
- cls = dtype.construct_array_type()
- # Note: for RangeIndex and CategoricalDtype self vs self._values
- # behaves differently here.
- new_values = cls._from_sequence(self, dtype=dtype, copy=copy)
-
- else:
- # GH#13149 specifically use astype_array instead of astype
- new_values = astype_array(values, dtype=dtype, copy=copy)
-
- # pass copy=False because any copying will be done in the astype above
- result = Index(new_values, name=self.name, dtype=new_values.dtype, copy=False)
- if (
- not copy
- and self._references is not None
- and astype_is_view(self.dtype, dtype)
- ):
- result._references = self._references
- result._references.add_index_reference(result)
- return result
-
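A quick sketch of the `astype` paths above: casting to a new dtype builds a new Index (possibly of a different subclass), while casting to the same dtype short-circuits and returns `self` only when `copy=False` (outputs in comments are indicative):

import pandas as pd

idx = pd.Index([1, 2, 3])

print(idx.astype("float64"))                  # Index([1.0, 2.0, 3.0], dtype='float64')
print(type(idx.astype("category")).__name__)  # CategoricalIndex
print(idx.astype("int64") is idx)             # False -- same dtype, but copy=True by default
print(idx.astype("int64", copy=False) is idx) # True  -- same-dtype fast path returns self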
- _index_shared_docs[
- "take"
- ] = """
- Return a new %(klass)s of the values selected by the indices.
-
- For internal compatibility with numpy arrays.
-
- Parameters
- ----------
- indices : array-like
- Indices to be taken.
- axis : int, optional
- The axis over which to select values, always 0.
- allow_fill : bool, default True
- fill_value : scalar, default None
- If allow_fill=True and fill_value is not None, indices specified by
- -1 are regarded as NA. If Index doesn't hold NA, raise ValueError.
-
- Returns
- -------
- Index
- An index formed of elements at the given indices. Will be the same
- type as self, except for RangeIndex.
-
- See Also
- --------
- numpy.ndarray.take: Return an array formed from the
- elements of a at the given indices.
- """
-
- @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
- def take(
- self,
- indices,
- axis: Axis = 0,
- allow_fill: bool = True,
- fill_value=None,
- **kwargs,
- ):
- if kwargs:
- nv.validate_take((), kwargs)
- if is_scalar(indices):
- raise TypeError("Expected indices to be array-like")
- indices = ensure_platform_int(indices)
- allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices)
-
- # Note: we discard fill_value and use self._na_value, only relevant
- # in the case where allow_fill is True and fill_value is not None
- values = self._values
- if isinstance(values, np.ndarray):
- taken = algos.take(
- values, indices, allow_fill=allow_fill, fill_value=self._na_value
- )
- else:
- # algos.take passes 'axis' keyword which not all EAs accept
- taken = values.take(
- indices, allow_fill=allow_fill, fill_value=self._na_value
- )
- # _constructor so RangeIndex-> Index with an int64 dtype
- return self._constructor._simple_new(taken, name=self.name)
-
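A short sketch of `take`: by default `-1` indexes from the end, and it is only treated as a missing-value marker when `allow_fill=True` and a non-None `fill_value` is given (the actual fill is the index's own NA value; outputs in comments are indicative):

import numpy as np
import pandas as pd

idx = pd.Index([10.0, 20.0, 30.0])

print(idx.take([2, 0]))                                       # Index([30.0, 10.0], dtype='float64')
print(idx.take([0, -1]))                                      # Index([10.0, 30.0], dtype='float64')
print(idx.take([0, -1], allow_fill=True, fill_value=np.nan))  # Index([10.0, nan], dtype='float64')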
- @final
- def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool:
- """
- We only use pandas-style take when allow_fill is True _and_
- fill_value is not None.
- """
- if allow_fill and fill_value is not None:
- # only fill if we are passing a non-None fill_value
- if self._can_hold_na:
- if (indices < -1).any():
- raise ValueError(
- "When allow_fill=True and fill_value is not None, "
- "all indices must be >= -1"
- )
- else:
- cls_name = type(self).__name__
- raise ValueError(
- f"Unable to fill values because {cls_name} cannot contain NA"
- )
- else:
- allow_fill = False
- return allow_fill
-
- _index_shared_docs[
- "repeat"
- ] = """
- Repeat elements of a %(klass)s.
-
- Returns a new %(klass)s where each element of the current %(klass)s
- is repeated consecutively a given number of times.
-
- Parameters
- ----------
- repeats : int or array of ints
- The number of repetitions for each element. This should be a
- non-negative integer. Repeating 0 times will return an empty
- %(klass)s.
- axis : None
- Must be ``None``. Has no effect but is accepted for compatibility
- with numpy.
-
- Returns
- -------
- %(klass)s
- Newly created %(klass)s with repeated elements.
-
- See Also
- --------
- Series.repeat : Equivalent function for Series.
- numpy.repeat : Similar method for :class:`numpy.ndarray`.
-
- Examples
- --------
- >>> idx = pd.Index(['a', 'b', 'c'])
- >>> idx
- Index(['a', 'b', 'c'], dtype='object')
- >>> idx.repeat(2)
- Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')
- >>> idx.repeat([1, 2, 3])
- Index(['a', 'b', 'b', 'c', 'c', 'c'], dtype='object')
- """
-
- @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
- def repeat(self, repeats, axis=None):
- repeats = ensure_platform_int(repeats)
- nv.validate_repeat((), {"axis": axis})
- res_values = self._values.repeat(repeats)
-
- # _constructor so RangeIndex-> Index with an int64 dtype
- return self._constructor._simple_new(res_values, name=self.name)
-
- # --------------------------------------------------------------------
- # Copying Methods
-
- def copy(
- self: _IndexT,
- name: Hashable | None = None,
- deep: bool = False,
- ) -> _IndexT:
- """
- Make a copy of this object.
-
- Name is set on the new object.
-
- Parameters
- ----------
- name : Label, optional
- Set name for new object.
- deep : bool, default False
-
- Returns
- -------
- Index
-            Index referring to a new object which is a copy of this object.
-
- Notes
- -----
- In most cases, there should be no functional difference from using
- ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
- """
-
- name = self._validate_names(name=name, deep=deep)[0]
- if deep:
- new_data = self._data.copy()
- new_index = type(self)._simple_new(new_data, name=name)
- else:
- new_index = self._rename(name=name)
- return new_index
-
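A minimal sketch of `copy`: the result is always a new Index object (deep or not), optionally renamed, and equal to the original by value (outputs in comments are indicative):

import pandas as pd

idx = pd.Index(["a", "b", "c"], name="letters")

shallow = idx.copy()
deep = idx.copy(deep=True, name="renamed")

print(shallow is idx)        # False -- always a new Index object
print(shallow.equals(idx))   # True
print(deep.name)             # renamed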
- @final
- def __copy__(self: _IndexT, **kwargs) -> _IndexT:
- return self.copy(**kwargs)
-
- @final
- def __deepcopy__(self: _IndexT, memo=None) -> _IndexT:
- """
- Parameters
- ----------
- memo, default None
- Standard signature. Unused
- """
- return self.copy(deep=True)
-
- # --------------------------------------------------------------------
- # Rendering Methods
-
- @final
- def __repr__(self) -> str_t:
- """
- Return a string representation for this object.
- """
- klass_name = type(self).__name__
- data = self._format_data()
- attrs = self._format_attrs()
- space = self._format_space()
- attrs_str = [f"{k}={v}" for k, v in attrs]
- prepr = f",{space}".join(attrs_str)
-
- # no data provided, just attributes
- if data is None:
- data = ""
-
- return f"{klass_name}({data}{prepr})"
-
- def _format_space(self) -> str_t:
- # using space here controls if the attributes
- # are line separated or not (the default)
-
- # max_seq_items = get_option('display.max_seq_items')
- # if len(self) > max_seq_items:
- # space = "\n%s" % (' ' * (len(klass) + 1))
- return " "
-
- @property
- def _formatter_func(self):
- """
- Return the formatter function.
- """
- return default_pprint
-
- def _format_data(self, name=None) -> str_t:
- """
- Return the formatted data as a unicode string.
- """
- # do we want to justify (only do so for non-objects)
- is_justify = True
-
- if self.inferred_type == "string":
- is_justify = False
- elif self.inferred_type == "categorical":
- self = cast("CategoricalIndex", self)
- if is_object_dtype(self.categories):
- is_justify = False
-
- return format_object_summary(
- self,
- self._formatter_func,
- is_justify=is_justify,
- name=name,
- line_break_each_value=self._is_multi,
- )
-
- def _format_attrs(self) -> list[tuple[str_t, str_t | int | bool | None]]:
- """
- Return a list of tuples of the (attr,formatted_value).
- """
- attrs: list[tuple[str_t, str_t | int | bool | None]] = []
-
- if not self._is_multi:
- attrs.append(("dtype", f"'{self.dtype}'"))
-
- if self.name is not None:
- attrs.append(("name", default_pprint(self.name)))
- elif self._is_multi and any(x is not None for x in self.names):
- attrs.append(("names", default_pprint(self.names)))
-
- max_seq_items = get_option("display.max_seq_items") or len(self)
- if len(self) > max_seq_items:
- attrs.append(("length", len(self)))
- return attrs
-
- @final
- def _get_level_names(self) -> Hashable | Sequence[Hashable]:
- """
- Return a name or list of names with None replaced by the level number.
- """
- if self._is_multi:
- return [
- level if name is None else name for level, name in enumerate(self.names)
- ]
- else:
- return 0 if self.name is None else self.name
-
- @final
- def _mpl_repr(self) -> np.ndarray:
- # how to represent ourselves to matplotlib
- if isinstance(self.dtype, np.dtype) and self.dtype.kind != "M":
- return cast(np.ndarray, self.values)
- return self.astype(object, copy=False)._values
-
- def format(
- self,
- name: bool = False,
- formatter: Callable | None = None,
- na_rep: str_t = "NaN",
- ) -> list[str_t]:
- """
- Render a string representation of the Index.
- """
- header = []
- if name:
- header.append(
- pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
- if self.name is not None
- else ""
- )
-
- if formatter is not None:
- return header + list(self.map(formatter))
-
- return self._format_with_header(header, na_rep=na_rep)
-
- def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t]:
- from pandas.io.formats.format import format_array
-
- values = self._values
-
- if is_object_dtype(values.dtype):
- values = cast(np.ndarray, values)
- values = lib.maybe_convert_objects(values, safe=True)
-
- result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values]
-
- # could have nans
- mask = is_float_nan(values)
- if mask.any():
- result_arr = np.array(result)
- result_arr[mask] = na_rep
- result = result_arr.tolist()
- else:
- result = trim_front(format_array(values, None, justify="left"))
- return header + result
-
- def _format_native_types(
- self,
- *,
- na_rep: str_t = "",
- decimal: str_t = ".",
- float_format=None,
- date_format=None,
- quoting=None,
- ) -> npt.NDArray[np.object_]:
- """
- Actually format specific types of the index.
- """
- from pandas.io.formats.format import FloatArrayFormatter
-
- if is_float_dtype(self.dtype) and not is_extension_array_dtype(self.dtype):
- formatter = FloatArrayFormatter(
- self._values,
- na_rep=na_rep,
- float_format=float_format,
- decimal=decimal,
- quoting=quoting,
- fixed_width=False,
- )
- return formatter.get_result_as_array()
-
- mask = isna(self)
- if not is_object_dtype(self) and not quoting:
- values = np.asarray(self).astype(str)
- else:
- values = np.array(self, dtype=object, copy=True)
-
- values[mask] = na_rep
- return values
-
- def _summary(self, name=None) -> str_t:
- """
- Return a summarized representation.
-
- Parameters
- ----------
- name : str
- name to use in the summary representation
-
- Returns
- -------
- String with a summarized representation of the index
- """
- if len(self) > 0:
- head = self[0]
- if hasattr(head, "format") and not isinstance(head, str):
- head = head.format()
- elif needs_i8_conversion(self.dtype):
- # e.g. Timedelta, display as values, not quoted
- head = self._formatter_func(head).replace("'", "")
- tail = self[-1]
- if hasattr(tail, "format") and not isinstance(tail, str):
- tail = tail.format()
- elif needs_i8_conversion(self.dtype):
- # e.g. Timedelta, display as values, not quoted
- tail = self._formatter_func(tail).replace("'", "")
-
- index_summary = f", {head} to {tail}"
- else:
- index_summary = ""
-
- if name is None:
- name = type(self).__name__
- return f"{name}: {len(self)} entries{index_summary}"
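-
-     # Illustrative output of ``_summary`` (editorial example, not part of the
-     # original source), shown for a plain integer Index:
-     # >>> pd.Index([1, 2, 3])._summary()
-     # 'Index: 3 entries, 1 to 3'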
-
- # --------------------------------------------------------------------
- # Conversion Methods
-
- def to_flat_index(self: _IndexT) -> _IndexT:
- """
- Identity method.
-
- This is implemented for compatibility with subclass implementations
- when chaining.
-
- Returns
- -------
- pd.Index
- Caller.
-
- See Also
- --------
- MultiIndex.to_flat_index : Subclass implementation.
- """
- return self
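-
-     # Illustrative check (editorial example, not part of the original source):
-     # ``to_flat_index`` is a no-op for a flat Index and returns the same object.
-     # >>> idx = pd.Index([1, 2, 3])
-     # >>> idx.to_flat_index() is idx
-     # True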
-
- @final
- def to_series(self, index=None, name: Hashable = None) -> Series:
- """
- Create a Series with both index and values equal to the index keys.
-
- Useful with map for returning an indexer based on an index.
-
- Parameters
- ----------
- index : Index, optional
- Index of resulting Series. If None, defaults to original index.
- name : str, optional
- Name of resulting Series. If None, defaults to name of original
- index.
-
- Returns
- -------
- Series
- The dtype will be based on the type of the Index values.
-
- See Also
- --------
- Index.to_frame : Convert an Index to a DataFrame.
- Series.to_frame : Convert Series to DataFrame.
-
- Examples
- --------
- >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')
-
- By default, the original Index and original name is reused.
-
- >>> idx.to_series()
- animal
- Ant Ant
- Bear Bear
- Cow Cow
- Name: animal, dtype: object
-
- To enforce a new Index, specify new labels to ``index``:
-
- >>> idx.to_series(index=[0, 1, 2])
- 0 Ant
- 1 Bear
- 2 Cow
- Name: animal, dtype: object
-
- To override the name of the resulting column, specify `name`:
-
- >>> idx.to_series(name='zoo')
- animal
- Ant Ant
- Bear Bear
- Cow Cow
- Name: zoo, dtype: object
- """
- from pandas import Series
-
- if index is None:
- index = self._view()
- if name is None:
- name = self.name
-
- return Series(self._values.copy(), index=index, name=name)
-
- def to_frame(
- self, index: bool = True, name: Hashable = lib.no_default
- ) -> DataFrame:
- """
- Create a DataFrame with a column containing the Index.
-
- Parameters
- ----------
- index : bool, default True
- Set the index of the returned DataFrame as the original Index.
-
- name : object, defaults to index.name
- The passed name should substitute for the index name (if it has
- one).
-
- Returns
- -------
- DataFrame
- DataFrame containing the original Index data.
-
- See Also
- --------
- Index.to_series : Convert an Index to a Series.
- Series.to_frame : Convert Series to DataFrame.
-
- Examples
- --------
- >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')
- >>> idx.to_frame()
- animal
- animal
- Ant Ant
- Bear Bear
- Cow Cow
-
- By default, the original Index is reused. To enforce a new Index:
-
- >>> idx.to_frame(index=False)
- animal
- 0 Ant
- 1 Bear
- 2 Cow
-
- To override the name of the resulting column, specify `name`:
-
- >>> idx.to_frame(index=False, name='zoo')
- zoo
- 0 Ant
- 1 Bear
- 2 Cow
- """
- from pandas import DataFrame
-
- if name is lib.no_default:
- name = self._get_level_names()
- result = DataFrame({name: self._values.copy()})
-
- if index:
- result.index = self
- return result
-
- # --------------------------------------------------------------------
- # Name-Centric Methods
-
- @property
- def name(self) -> Hashable:
- """
- Return Index or MultiIndex name.
- """
- return self._name
-
- @name.setter
- def name(self, value: Hashable) -> None:
- if self._no_setting_name:
- # Used in MultiIndex.levels to avoid silently ignoring name updates.
- raise RuntimeError(
- "Cannot set name on a level of a MultiIndex. Use "
- "'MultiIndex.set_names' instead."
- )
- maybe_extract_name(value, None, type(self))
- self._name = value
-
- @final
- def _validate_names(
- self, name=None, names=None, deep: bool = False
- ) -> list[Hashable]:
- """
- Handles the quirks of having a singular 'name' parameter for general
- Index and plural 'names' parameter for MultiIndex.
- """
- from copy import deepcopy
-
- if names is not None and name is not None:
- raise TypeError("Can only provide one of `names` and `name`")
- if names is None and name is None:
- new_names = deepcopy(self.names) if deep else self.names
- elif names is not None:
- if not is_list_like(names):
- raise TypeError("Must pass list-like as `names`.")
- new_names = names
- elif not is_list_like(name):
- new_names = [name]
- else:
- new_names = name
-
- if len(new_names) != len(self.names):
- raise ValueError(
- f"Length of new names must be {len(self.names)}, got {len(new_names)}"
- )
-
- # All items in 'new_names' need to be hashable
- validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name")
-
- return new_names
-
- def _get_default_index_names(
- self, names: Hashable | Sequence[Hashable] | None = None, default=None
- ) -> list[Hashable]:
- """
- Get names of index.
-
- Parameters
- ----------
- names : int, str or 1-dimensional list, default None
- Index names to set.
- default : str
- Default name of index.
-
- Raises
- ------
- TypeError
- if names not str or list-like
- """
- from pandas.core.indexes.multi import MultiIndex
-
- if names is not None:
- if isinstance(names, (int, str)):
- names = [names]
-
- if not isinstance(names, list) and names is not None:
- raise ValueError("Index names must be str or 1-dimensional list")
-
- if not names:
- if isinstance(self, MultiIndex):
- names = com.fill_missing_names(self.names)
- else:
- names = [default] if self.name is None else [self.name]
-
- return names
-
- def _get_names(self) -> FrozenList:
- return FrozenList((self.name,))
-
- def _set_names(self, values, *, level=None) -> None:
- """
- Set new names on index. Each name has to be a hashable type.
-
- Parameters
- ----------
- values : str or sequence
- name(s) to set
- level : int, level name, or sequence of int/level names (default None)
- If the index is a MultiIndex (hierarchical), level(s) to set (None
- for all levels). Otherwise level must be None
-
- Raises
- ------
- TypeError if each name is not hashable.
- """
- if not is_list_like(values):
- raise ValueError("Names must be a list-like")
- if len(values) != 1:
- raise ValueError(f"Length of new names must be 1, got {len(values)}")
-
- # GH 20527
- # All items in 'name' need to be hashable:
- validate_all_hashable(*values, error_name=f"{type(self).__name__}.name")
-
- self._name = values[0]
-
- names = property(fset=_set_names, fget=_get_names)
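-
-     # Illustrative usage (editorial example, not part of the original source):
-     # for a flat Index, ``names`` wraps the single ``name`` in a FrozenList.
-     # >>> pd.Index([1, 2, 3], name="x").names
-     # FrozenList(['x'])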
-
- @overload
- def set_names(
- self: _IndexT, names, *, level=..., inplace: Literal[False] = ...
- ) -> _IndexT:
- ...
-
- @overload
- def set_names(self, names, *, level=..., inplace: Literal[True]) -> None:
- ...
-
- @overload
- def set_names(
- self: _IndexT, names, *, level=..., inplace: bool = ...
- ) -> _IndexT | None:
- ...
-
- def set_names(
- self: _IndexT, names, *, level=None, inplace: bool = False
- ) -> _IndexT | None:
- """
- Set Index or MultiIndex name.
-
- Able to set new names partially and by level.
-
- Parameters
- ----------
-
- names : label or list of label or dict-like for MultiIndex
- Name(s) to set.
-
- .. versionchanged:: 1.3.0
-
- level : int, label or list of int or label, optional
- If the index is a MultiIndex and names is not dict-like, level(s) to set
- (None for all levels). Otherwise level must be None.
-
- .. versionchanged:: 1.3.0
-
- inplace : bool, default False
- Modifies the object directly, instead of creating a new Index or
- MultiIndex.
-
- Returns
- -------
- Index or None
- The same type as the caller or None if ``inplace=True``.
-
- See Also
- --------
- Index.rename : Able to set new names without level.
-
- Examples
- --------
- >>> idx = pd.Index([1, 2, 3, 4])
- >>> idx
- Index([1, 2, 3, 4], dtype='int64')
- >>> idx.set_names('quarter')
- Index([1, 2, 3, 4], dtype='int64', name='quarter')
-
- >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
- ... [2018, 2019]])
- >>> idx
- MultiIndex([('python', 2018),
- ('python', 2019),
- ( 'cobra', 2018),
- ( 'cobra', 2019)],
- )
- >>> idx = idx.set_names(['kind', 'year'])
- >>> idx.set_names('species', level=0)
- MultiIndex([('python', 2018),
- ('python', 2019),
- ( 'cobra', 2018),
- ( 'cobra', 2019)],
- names=['species', 'year'])
-
- When renaming levels with a dict, levels can not be passed.
-
- >>> idx.set_names({'kind': 'snake'})
- MultiIndex([('python', 2018),
- ('python', 2019),
- ( 'cobra', 2018),
- ( 'cobra', 2019)],
- names=['snake', 'year'])
- """
- if level is not None and not isinstance(self, ABCMultiIndex):
- raise ValueError("Level must be None for non-MultiIndex")
-
- if level is not None and not is_list_like(level) and is_list_like(names):
- raise TypeError("Names must be a string when a single level is provided.")
-
- if not is_list_like(names) and level is None and self.nlevels > 1:
- raise TypeError("Must pass list-like as `names`.")
-
- if is_dict_like(names) and not isinstance(self, ABCMultiIndex):
- raise TypeError("Can only pass dict-like as `names` for MultiIndex.")
-
- if is_dict_like(names) and level is not None:
- raise TypeError("Can not pass level for dictlike `names`.")
-
- if isinstance(self, ABCMultiIndex) and is_dict_like(names) and level is None:
- # Transform dict to list of new names and corresponding levels
- level, names_adjusted = [], []
- for i, name in enumerate(self.names):
- if name in names.keys():
- level.append(i)
- names_adjusted.append(names[name])
- names = names_adjusted
-
- if not is_list_like(names):
- names = [names]
- if level is not None and not is_list_like(level):
- level = [level]
-
- if inplace:
- idx = self
- else:
- idx = self._view()
-
- idx._set_names(names, level=level)
- if not inplace:
- return idx
- return None
-
- def rename(self, name, inplace: bool = False):
- """
- Alter Index or MultiIndex name.
-
- Able to set new names without level. Defaults to returning new index.
- Length of names must match number of levels in MultiIndex.
-
- Parameters
- ----------
- name : label or list of labels
- Name(s) to set.
- inplace : bool, default False
- Modifies the object directly, instead of creating a new Index or
- MultiIndex.
-
- Returns
- -------
- Index or None
- The same type as the caller or None if ``inplace=True``.
-
- See Also
- --------
- Index.set_names : Able to set new names partially and by level.
-
- Examples
- --------
- >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score')
- >>> idx.rename('grade')
- Index(['A', 'C', 'A', 'B'], dtype='object', name='grade')
-
- >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
- ... [2018, 2019]],
- ... names=['kind', 'year'])
- >>> idx
- MultiIndex([('python', 2018),
- ('python', 2019),
- ( 'cobra', 2018),
- ( 'cobra', 2019)],
- names=['kind', 'year'])
- >>> idx.rename(['species', 'year'])
- MultiIndex([('python', 2018),
- ('python', 2019),
- ( 'cobra', 2018),
- ( 'cobra', 2019)],
- names=['species', 'year'])
- >>> idx.rename('species')
- Traceback (most recent call last):
- TypeError: Must pass list-like as `names`.
- """
- return self.set_names([name], inplace=inplace)
-
- # --------------------------------------------------------------------
- # Level-Centric Methods
-
- @property
- def nlevels(self) -> int:
- """
- Number of levels.
- """
- return 1
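-
-     # Illustrative values (editorial example, not part of the original source):
-     # >>> pd.Index([1, 2, 3]).nlevels
-     # 1
-     # >>> pd.MultiIndex.from_arrays([[1, 2], [3, 4]]).nlevels
-     # 2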
-
- def _sort_levels_monotonic(self: _IndexT) -> _IndexT:
- """
- Compat with MultiIndex.
- """
- return self
-
- @final
- def _validate_index_level(self, level) -> None:
- """
- Validate index level.
-
- For single-level Index getting level number is a no-op, but some
- verification must be done like in MultiIndex.
-
- """
- if isinstance(level, int):
- if level < 0 and level != -1:
- raise IndexError(
- "Too many levels: Index has only 1 level, "
- f"{level} is not a valid level number"
- )
- if level > 0:
- raise IndexError(
- f"Too many levels: Index has only 1 level, not {level + 1}"
- )
- elif level != self.name:
- raise KeyError(
- f"Requested level ({level}) does not match index name ({self.name})"
- )
-
- def _get_level_number(self, level) -> int:
- self._validate_index_level(level)
- return 0
-
- def sortlevel(
- self, level=None, ascending: bool | list[bool] = True, sort_remaining=None
- ):
- """
- For internal compatibility with the Index API.
-
- Sort the Index. This is for compat with MultiIndex
-
- Parameters
- ----------
- ascending : bool, default True
- False to sort in descending order
-
- level, sort_remaining are compat parameters
-
- Returns
- -------
- Index
- """
- if not isinstance(ascending, (list, bool)):
- raise TypeError(
-                 "ascending must be a single bool value or "
- "a list of bool values of length 1"
- )
-
- if isinstance(ascending, list):
- if len(ascending) != 1:
- raise TypeError("ascending must be a list of bool values of length 1")
- ascending = ascending[0]
-
- if not isinstance(ascending, bool):
- raise TypeError("ascending must be a bool value")
-
- return self.sort_values(return_indexer=True, ascending=ascending)
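-
-     # Illustrative usage (editorial sketch, not part of the original source);
-     # a (sorted index, indexer) tuple is returned, exact array repr may vary:
-     # >>> pd.Index([3, 1, 2]).sortlevel(ascending=True)
-     # (Index([1, 2, 3], dtype='int64'), array([1, 2, 0]))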
-
- def _get_level_values(self, level) -> Index:
- """
- Return an Index of values for requested level.
-
- This is primarily useful to get an individual level of values from a
- MultiIndex, but is provided on Index as well for compatibility.
-
- Parameters
- ----------
- level : int or str
- It is either the integer position or the name of the level.
-
- Returns
- -------
- Index
- Calling object, as there is only one level in the Index.
-
- See Also
- --------
- MultiIndex.get_level_values : Get values for a level of a MultiIndex.
-
- Notes
- -----
- For Index, level should be 0, since there are no multiple levels.
-
- Examples
- --------
- >>> idx = pd.Index(list('abc'))
- >>> idx
- Index(['a', 'b', 'c'], dtype='object')
-
- Get level values by supplying `level` as integer:
-
- >>> idx.get_level_values(0)
- Index(['a', 'b', 'c'], dtype='object')
- """
- self._validate_index_level(level)
- return self
-
- get_level_values = _get_level_values
-
- @final
- def droplevel(self, level: IndexLabel = 0):
- """
- Return index with requested level(s) removed.
-
- If resulting index has only 1 level left, the result will be
- of Index type, not MultiIndex. The original index is not modified inplace.
-
- Parameters
- ----------
- level : int, str, or list-like, default 0
- If a string is given, must be the name of a level
- If list-like, elements must be names or indexes of levels.
-
- Returns
- -------
- Index or MultiIndex
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays(
- ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z'])
- >>> mi
- MultiIndex([(1, 3, 5),
- (2, 4, 6)],
- names=['x', 'y', 'z'])
-
- >>> mi.droplevel()
- MultiIndex([(3, 5),
- (4, 6)],
- names=['y', 'z'])
-
- >>> mi.droplevel(2)
- MultiIndex([(1, 3),
- (2, 4)],
- names=['x', 'y'])
-
- >>> mi.droplevel('z')
- MultiIndex([(1, 3),
- (2, 4)],
- names=['x', 'y'])
-
- >>> mi.droplevel(['x', 'y'])
- Index([5, 6], dtype='int64', name='z')
- """
- if not isinstance(level, (tuple, list)):
- level = [level]
-
- levnums = sorted(self._get_level_number(lev) for lev in level)[::-1]
-
- return self._drop_level_numbers(levnums)
-
- @final
- def _drop_level_numbers(self, levnums: list[int]):
- """
- Drop MultiIndex levels by level _number_, not name.
- """
-
- if not levnums and not isinstance(self, ABCMultiIndex):
- return self
- if len(levnums) >= self.nlevels:
- raise ValueError(
- f"Cannot remove {len(levnums)} levels from an index with "
- f"{self.nlevels} levels: at least one level must be left."
- )
- # The two checks above guarantee that here self is a MultiIndex
- self = cast("MultiIndex", self)
-
- new_levels = list(self.levels)
- new_codes = list(self.codes)
- new_names = list(self.names)
-
- for i in levnums:
- new_levels.pop(i)
- new_codes.pop(i)
- new_names.pop(i)
-
- if len(new_levels) == 1:
- lev = new_levels[0]
-
- if len(lev) == 0:
- # If lev is empty, lev.take will fail GH#42055
- if len(new_codes[0]) == 0:
- # GH#45230 preserve RangeIndex here
- # see test_reset_index_empty_rangeindex
- result = lev[:0]
- else:
- res_values = algos.take(lev._values, new_codes[0], allow_fill=True)
- # _constructor instead of type(lev) for RangeIndex compat GH#35230
- result = lev._constructor._simple_new(res_values, name=new_names[0])
- else:
- # set nan if needed
- mask = new_codes[0] == -1
- result = new_levels[0].take(new_codes[0])
- if mask.any():
- result = result.putmask(mask, np.nan)
-
- result._name = new_names[0]
-
- return result
- else:
- from pandas.core.indexes.multi import MultiIndex
-
- return MultiIndex(
- levels=new_levels,
- codes=new_codes,
- names=new_names,
- verify_integrity=False,
- )
-
- # --------------------------------------------------------------------
- # Introspection Methods
-
- @cache_readonly
- @final
- def _can_hold_na(self) -> bool:
- if isinstance(self.dtype, ExtensionDtype):
- if isinstance(self.dtype, IntervalDtype):
- # FIXME(GH#45720): this is inaccurate for integer-backed
- # IntervalArray, but without it other.categories.take raises
- # in IntervalArray._cmp_method
- return True
- return self.dtype._can_hold_na
- if self.dtype.kind in ["i", "u", "b"]:
- return False
- return True
-
- @property
- def is_monotonic_increasing(self) -> bool:
- """
- Return a boolean if the values are equal or increasing.
-
- Returns
- -------
- bool
-
- See Also
- --------
- Index.is_monotonic_decreasing : Check if the values are equal or decreasing.
-
- Examples
- --------
- >>> pd.Index([1, 2, 3]).is_monotonic_increasing
- True
- >>> pd.Index([1, 2, 2]).is_monotonic_increasing
- True
- >>> pd.Index([1, 3, 2]).is_monotonic_increasing
- False
- """
- return self._engine.is_monotonic_increasing
-
- @property
- def is_monotonic_decreasing(self) -> bool:
- """
- Return a boolean if the values are equal or decreasing.
-
- Returns
- -------
- bool
-
- See Also
- --------
- Index.is_monotonic_increasing : Check if the values are equal or increasing.
-
- Examples
- --------
- >>> pd.Index([3, 2, 1]).is_monotonic_decreasing
- True
- >>> pd.Index([3, 2, 2]).is_monotonic_decreasing
- True
- >>> pd.Index([3, 1, 2]).is_monotonic_decreasing
- False
- """
- return self._engine.is_monotonic_decreasing
-
- @final
- @property
- def _is_strictly_monotonic_increasing(self) -> bool:
- """
- Return if the index is strictly monotonic increasing
- (only increasing) values.
-
- Examples
- --------
- >>> Index([1, 2, 3])._is_strictly_monotonic_increasing
- True
- >>> Index([1, 2, 2])._is_strictly_monotonic_increasing
- False
- >>> Index([1, 3, 2])._is_strictly_monotonic_increasing
- False
- """
- return self.is_unique and self.is_monotonic_increasing
-
- @final
- @property
- def _is_strictly_monotonic_decreasing(self) -> bool:
- """
- Return if the index is strictly monotonic decreasing
- (only decreasing) values.
-
- Examples
- --------
- >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing
- True
- >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing
- False
- >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing
- False
- """
- return self.is_unique and self.is_monotonic_decreasing
-
- @cache_readonly
- def is_unique(self) -> bool:
- """
- Return if the index has unique values.
-
- Returns
- -------
- bool
-
- See Also
- --------
- Index.has_duplicates : Inverse method that checks if it has duplicate values.
-
- Examples
- --------
- >>> idx = pd.Index([1, 5, 7, 7])
- >>> idx.is_unique
- False
-
- >>> idx = pd.Index([1, 5, 7])
- >>> idx.is_unique
- True
-
- >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
- ... "Watermelon"]).astype("category")
- >>> idx.is_unique
- False
-
- >>> idx = pd.Index(["Orange", "Apple",
- ... "Watermelon"]).astype("category")
- >>> idx.is_unique
- True
- """
- return self._engine.is_unique
-
- @final
- @property
- def has_duplicates(self) -> bool:
- """
- Check if the Index has duplicate values.
-
- Returns
- -------
- bool
- Whether or not the Index has duplicate values.
-
- See Also
- --------
- Index.is_unique : Inverse method that checks if it has unique values.
-
- Examples
- --------
- >>> idx = pd.Index([1, 5, 7, 7])
- >>> idx.has_duplicates
- True
-
- >>> idx = pd.Index([1, 5, 7])
- >>> idx.has_duplicates
- False
-
- >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
- ... "Watermelon"]).astype("category")
- >>> idx.has_duplicates
- True
-
- >>> idx = pd.Index(["Orange", "Apple",
- ... "Watermelon"]).astype("category")
- >>> idx.has_duplicates
- False
- """
- return not self.is_unique
-
- @final
- def is_boolean(self) -> bool:
- """
- Check if the Index only consists of booleans.
-
- .. deprecated:: 2.0.0
- Use `pandas.api.types.is_bool_dtype` instead.
-
- Returns
- -------
- bool
- Whether or not the Index only consists of booleans.
-
- See Also
- --------
- is_integer : Check if the Index only consists of integers (deprecated).
- is_floating : Check if the Index is a floating type (deprecated).
- is_numeric : Check if the Index only consists of numeric data (deprecated).
- is_object : Check if the Index is of the object dtype (deprecated).
- is_categorical : Check if the Index holds categorical data.
- is_interval : Check if the Index holds Interval objects (deprecated).
-
- Examples
- --------
- >>> idx = pd.Index([True, False, True])
- >>> idx.is_boolean() # doctest: +SKIP
- True
-
- >>> idx = pd.Index(["True", "False", "True"])
- >>> idx.is_boolean() # doctest: +SKIP
- False
-
- >>> idx = pd.Index([True, False, "True"])
- >>> idx.is_boolean() # doctest: +SKIP
- False
- """
- warnings.warn(
- f"{type(self).__name__}.is_boolean is deprecated. "
-             "Use pandas.api.types.is_bool_dtype instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self.inferred_type in ["boolean"]
-
- @final
- def is_integer(self) -> bool:
- """
- Check if the Index only consists of integers.
-
- .. deprecated:: 2.0.0
- Use `pandas.api.types.is_integer_dtype` instead.
-
- Returns
- -------
- bool
- Whether or not the Index only consists of integers.
-
- See Also
- --------
- is_boolean : Check if the Index only consists of booleans (deprecated).
- is_floating : Check if the Index is a floating type (deprecated).
- is_numeric : Check if the Index only consists of numeric data (deprecated).
- is_object : Check if the Index is of the object dtype. (deprecated).
- is_categorical : Check if the Index holds categorical data (deprecated).
- is_interval : Check if the Index holds Interval objects (deprecated).
-
- Examples
- --------
- >>> idx = pd.Index([1, 2, 3, 4])
- >>> idx.is_integer() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
- >>> idx.is_integer() # doctest: +SKIP
- False
-
- >>> idx = pd.Index(["Apple", "Mango", "Watermelon"])
- >>> idx.is_integer() # doctest: +SKIP
- False
- """
- warnings.warn(
- f"{type(self).__name__}.is_integer is deprecated. "
- "Use pandas.api.types.is_integer_dtype instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self.inferred_type in ["integer"]
-
- @final
- def is_floating(self) -> bool:
- """
- Check if the Index is a floating type.
-
- .. deprecated:: 2.0.0
- Use `pandas.api.types.is_float_dtype` instead
-
- The Index may consist of only floats, NaNs, or a mix of floats,
- integers, or NaNs.
-
- Returns
- -------
- bool
-             Whether or not the Index only consists of floats, NaNs, or
-             a mix of floats, integers, or NaNs.
-
- See Also
- --------
- is_boolean : Check if the Index only consists of booleans (deprecated).
- is_integer : Check if the Index only consists of integers (deprecated).
- is_numeric : Check if the Index only consists of numeric data (deprecated).
- is_object : Check if the Index is of the object dtype. (deprecated).
- is_categorical : Check if the Index holds categorical data (deprecated).
- is_interval : Check if the Index holds Interval objects (deprecated).
-
- Examples
- --------
- >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
- >>> idx.is_floating() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1.0, 2.0, np.nan, 4.0])
- >>> idx.is_floating() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 2, 3, 4, np.nan])
- >>> idx.is_floating() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 2, 3, 4])
- >>> idx.is_floating() # doctest: +SKIP
- False
- """
- warnings.warn(
- f"{type(self).__name__}.is_floating is deprecated. "
- "Use pandas.api.types.is_float_dtype instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"]
-
- @final
- def is_numeric(self) -> bool:
- """
- Check if the Index only consists of numeric data.
-
- .. deprecated:: 2.0.0
- Use `pandas.api.types.is_numeric_dtype` instead.
-
- Returns
- -------
- bool
- Whether or not the Index only consists of numeric data.
-
- See Also
- --------
- is_boolean : Check if the Index only consists of booleans (deprecated).
- is_integer : Check if the Index only consists of integers (deprecated).
- is_floating : Check if the Index is a floating type (deprecated).
- is_object : Check if the Index is of the object dtype. (deprecated).
- is_categorical : Check if the Index holds categorical data (deprecated).
- is_interval : Check if the Index holds Interval objects (deprecated).
-
- Examples
- --------
- >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
- >>> idx.is_numeric() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 2, 3, 4.0])
- >>> idx.is_numeric() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 2, 3, 4])
- >>> idx.is_numeric() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 2, 3, 4.0, np.nan])
- >>> idx.is_numeric() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 2, 3, 4.0, np.nan, "Apple"])
- >>> idx.is_numeric() # doctest: +SKIP
- False
- """
- warnings.warn(
- f"{type(self).__name__}.is_numeric is deprecated. "
- "Use pandas.api.types.is_any_real_numeric_dtype instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self.inferred_type in ["integer", "floating"]
-
- @final
- def is_object(self) -> bool:
- """
- Check if the Index is of the object dtype.
-
- .. deprecated:: 2.0.0
- Use `pandas.api.types.is_object_dtype` instead.
-
- Returns
- -------
- bool
- Whether or not the Index is of the object dtype.
-
- See Also
- --------
- is_boolean : Check if the Index only consists of booleans (deprecated).
- is_integer : Check if the Index only consists of integers (deprecated).
- is_floating : Check if the Index is a floating type (deprecated).
- is_numeric : Check if the Index only consists of numeric data (deprecated).
- is_categorical : Check if the Index holds categorical data (deprecated).
- is_interval : Check if the Index holds Interval objects (deprecated).
-
- Examples
- --------
- >>> idx = pd.Index(["Apple", "Mango", "Watermelon"])
- >>> idx.is_object() # doctest: +SKIP
- True
-
- >>> idx = pd.Index(["Apple", "Mango", 2.0])
- >>> idx.is_object() # doctest: +SKIP
- True
-
- >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
- ... "Watermelon"]).astype("category")
- >>> idx.is_object() # doctest: +SKIP
- False
-
- >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0])
- >>> idx.is_object() # doctest: +SKIP
- False
- """
- warnings.warn(
-             f"{type(self).__name__}.is_object is deprecated. "
- "Use pandas.api.types.is_object_dtype instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return is_object_dtype(self.dtype)
-
- @final
- def is_categorical(self) -> bool:
- """
- Check if the Index holds categorical data.
-
- .. deprecated:: 2.0.0
- Use `isinstance(index.dtype, pd.CategoricalDtype)` instead.
-
- Returns
- -------
- bool
- True if the Index is categorical.
-
- See Also
- --------
- CategoricalIndex : Index for categorical data.
- is_boolean : Check if the Index only consists of booleans (deprecated).
- is_integer : Check if the Index only consists of integers (deprecated).
- is_floating : Check if the Index is a floating type (deprecated).
- is_numeric : Check if the Index only consists of numeric data (deprecated).
- is_object : Check if the Index is of the object dtype. (deprecated).
- is_interval : Check if the Index holds Interval objects (deprecated).
-
- Examples
- --------
- >>> idx = pd.Index(["Watermelon", "Orange", "Apple",
- ... "Watermelon"]).astype("category")
- >>> idx.is_categorical() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 3, 5, 7])
- >>> idx.is_categorical() # doctest: +SKIP
- False
-
- >>> s = pd.Series(["Peter", "Victor", "Elisabeth", "Mar"])
- >>> s
- 0 Peter
- 1 Victor
- 2 Elisabeth
- 3 Mar
- dtype: object
- >>> s.index.is_categorical() # doctest: +SKIP
- False
- """
- warnings.warn(
-             f"{type(self).__name__}.is_categorical is deprecated. "
- "Use pandas.api.types.is_categorical_dtype instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
- return self.inferred_type in ["categorical"]
-
- @final
- def is_interval(self) -> bool:
- """
- Check if the Index holds Interval objects.
-
- .. deprecated:: 2.0.0
- Use `isinstance(index.dtype, pd.IntervalDtype)` instead.
-
- Returns
- -------
- bool
- Whether or not the Index holds Interval objects.
-
- See Also
- --------
- IntervalIndex : Index for Interval objects.
- is_boolean : Check if the Index only consists of booleans (deprecated).
- is_integer : Check if the Index only consists of integers (deprecated).
- is_floating : Check if the Index is a floating type (deprecated).
- is_numeric : Check if the Index only consists of numeric data (deprecated).
- is_object : Check if the Index is of the object dtype. (deprecated).
- is_categorical : Check if the Index holds categorical data (deprecated).
-
- Examples
- --------
- >>> idx = pd.Index([pd.Interval(left=0, right=5),
- ... pd.Interval(left=5, right=10)])
- >>> idx.is_interval() # doctest: +SKIP
- True
-
- >>> idx = pd.Index([1, 3, 5, 7])
- >>> idx.is_interval() # doctest: +SKIP
- False
- """
- warnings.warn(
-             f"{type(self).__name__}.is_interval is deprecated. "
- "Use pandas.api.types.is_interval_dtype instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self.inferred_type in ["interval"]
-
- @final
- def _holds_integer(self) -> bool:
- """
- Whether the type is an integer type.
- """
- return self.inferred_type in ["integer", "mixed-integer"]
-
- @final
- def holds_integer(self) -> bool:
- """
- Whether the type is an integer type.
-
- .. deprecated:: 2.0.0
- Use `pandas.api.types.infer_dtype` instead
- """
- warnings.warn(
- f"{type(self).__name__}.holds_integer is deprecated. "
- "Use pandas.api.types.infer_dtype instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self._holds_integer()
-
- @cache_readonly
- def inferred_type(self) -> str_t:
- """
- Return a string of the type inferred from the values.
- """
- return lib.infer_dtype(self._values, skipna=False)
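-
-     # Illustrative values (editorial example, not part of the original source):
-     # >>> pd.Index([1, 2, 3]).inferred_type
-     # 'integer'
-     # >>> pd.Index(['a', 'b']).inferred_type
-     # 'string'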
-
- @cache_readonly
- @final
- def _is_all_dates(self) -> bool:
- """
- Whether or not the index values only consist of dates.
- """
- if needs_i8_conversion(self.dtype):
- return True
- elif self.dtype != _dtype_obj:
- # TODO(ExtensionIndex): 3rd party EA might override?
- # Note: this includes IntervalIndex, even when the left/right
- # contain datetime-like objects.
- return False
- elif self._is_multi:
- return False
- return is_datetime_array(ensure_object(self._values))
-
- @final
- @cache_readonly
- def _is_multi(self) -> bool:
- """
- Cached check equivalent to isinstance(self, MultiIndex)
- """
- return isinstance(self, ABCMultiIndex)
-
- # --------------------------------------------------------------------
- # Pickle Methods
-
- def __reduce__(self):
- d = {"data": self._data, "name": self.name}
- return _new_Index, (type(self), d), None
-
- # --------------------------------------------------------------------
- # Null Handling Methods
-
- @cache_readonly
- def _na_value(self):
- """The expected NA value to use with this index."""
- dtype = self.dtype
- if isinstance(dtype, np.dtype):
- if dtype.kind in ["m", "M"]:
- return NaT
- return np.nan
- return dtype.na_value
-
- @cache_readonly
- def _isnan(self) -> npt.NDArray[np.bool_]:
- """
- Return if each value is NaN.
- """
- if self._can_hold_na:
- return isna(self)
- else:
- # shouldn't reach to this condition by checking hasnans beforehand
- values = np.empty(len(self), dtype=np.bool_)
- values.fill(False)
- return values
-
- @cache_readonly
- def hasnans(self) -> bool:
- """
- Return True if there are any NaNs.
-
- Enables various performance speedups.
-
- Returns
- -------
- bool
- """
- if self._can_hold_na:
- return bool(self._isnan.any())
- else:
- return False
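-
-     # Illustrative usage (editorial example, not part of the original source):
-     # >>> pd.Index([1.0, np.nan]).hasnans
-     # True
-     # >>> pd.Index([1, 2]).hasnans
-     # False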
-
- @final
- def isna(self) -> npt.NDArray[np.bool_]:
- """
- Detect missing values.
-
- Return a boolean same-sized object indicating if the values are NA.
- NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get
- mapped to ``True`` values.
-         Everything else gets mapped to ``False`` values. Characters such as
- empty strings `''` or :attr:`numpy.inf` are not considered NA values
- (unless you set ``pandas.options.mode.use_inf_as_na = True``).
-
- Returns
- -------
- numpy.ndarray[bool]
- A boolean array of whether my values are NA.
-
- See Also
- --------
- Index.notna : Boolean inverse of isna.
- Index.dropna : Omit entries with missing values.
- isna : Top-level isna.
- Series.isna : Detect missing values in Series object.
-
- Examples
- --------
- Show which entries in a pandas.Index are NA. The result is an
- array.
-
- >>> idx = pd.Index([5.2, 6.0, np.NaN])
- >>> idx
- Index([5.2, 6.0, nan], dtype='float64')
- >>> idx.isna()
- array([False, False, True])
-
- Empty strings are not considered NA values. None is considered an NA
- value.
-
- >>> idx = pd.Index(['black', '', 'red', None])
- >>> idx
- Index(['black', '', 'red', None], dtype='object')
- >>> idx.isna()
- array([False, False, False, True])
-
- For datetimes, `NaT` (Not a Time) is considered as an NA value.
-
- >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'),
- ... pd.Timestamp(''), None, pd.NaT])
- >>> idx
- DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'],
- dtype='datetime64[ns]', freq=None)
- >>> idx.isna()
- array([False, True, True, True])
- """
- return self._isnan
-
- isnull = isna
-
- @final
- def notna(self) -> npt.NDArray[np.bool_]:
- """
- Detect existing (non-missing) values.
-
- Return a boolean same-sized object indicating if the values are not NA.
- Non-missing values get mapped to ``True``. Characters such as empty
- strings ``''`` or :attr:`numpy.inf` are not considered NA values
- (unless you set ``pandas.options.mode.use_inf_as_na = True``).
- NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False``
- values.
-
- Returns
- -------
- numpy.ndarray[bool]
- Boolean array to indicate which entries are not NA.
-
- See Also
- --------
- Index.notnull : Alias of notna.
- Index.isna: Inverse of notna.
- notna : Top-level notna.
-
- Examples
- --------
- Show which entries in an Index are not NA. The result is an
- array.
-
- >>> idx = pd.Index([5.2, 6.0, np.NaN])
- >>> idx
- Index([5.2, 6.0, nan], dtype='float64')
- >>> idx.notna()
- array([ True, True, False])
-
-         Empty strings are not considered NA values. None is considered an NA
- value.
-
- >>> idx = pd.Index(['black', '', 'red', None])
- >>> idx
- Index(['black', '', 'red', None], dtype='object')
- >>> idx.notna()
- array([ True, True, True, False])
- """
- return ~self.isna()
-
- notnull = notna
-
- def fillna(self, value=None, downcast=None):
- """
- Fill NA/NaN values with the specified value.
-
- Parameters
- ----------
- value : scalar
- Scalar value to use to fill holes (e.g. 0).
-             This value cannot be a list-like.
- downcast : dict, default is None
- A dict of item->dtype of what to downcast if possible,
- or the string 'infer' which will try to downcast to an appropriate
- equal type (e.g. float64 to int64 if possible).
-
- Returns
- -------
- Index
-
- See Also
- --------
- DataFrame.fillna : Fill NaN values of a DataFrame.
- Series.fillna : Fill NaN Values of a Series.
- """
-
- value = self._require_scalar(value)
- if self.hasnans:
- result = self.putmask(self._isnan, value)
- if downcast is None:
- # no need to care metadata other than name
- # because it can't have freq if it has NaTs
- # _with_infer needed for test_fillna_categorical
- return Index._with_infer(result, name=self.name)
- raise NotImplementedError(
- f"{type(self).__name__}.fillna does not support 'downcast' "
- "argument values other than 'None'."
- )
- return self._view()
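-
-     # Illustrative usage of ``fillna`` (editorial example, not part of the
-     # original source; expected repr as of pandas 2.x):
-     # >>> pd.Index([np.nan, 1.0, 2.0]).fillna(0)
-     # Index([0.0, 1.0, 2.0], dtype='float64')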
-
- def dropna(self: _IndexT, how: AnyAll = "any") -> _IndexT:
- """
- Return Index without NA/NaN values.
-
- Parameters
- ----------
- how : {'any', 'all'}, default 'any'
- If the Index is a MultiIndex, drop the value when any or all levels
- are NaN.
-
- Returns
- -------
- Index
- """
- if how not in ("any", "all"):
- raise ValueError(f"invalid how option: {how}")
-
- if self.hasnans:
- res_values = self._values[~self._isnan]
- return type(self)._simple_new(res_values, name=self.name)
- return self._view()
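-
-     # Illustrative usage (editorial example, not part of the original source;
-     # expected repr as of pandas 2.x):
-     # >>> pd.Index([1.0, np.nan, 3.0]).dropna()
-     # Index([1.0, 3.0], dtype='float64')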
-
- # --------------------------------------------------------------------
- # Uniqueness Methods
-
- def unique(self: _IndexT, level: Hashable | None = None) -> _IndexT:
- """
- Return unique values in the index.
-
-         Unique values are returned in order of appearance; this does NOT sort.
-
- Parameters
- ----------
- level : int or hashable, optional
- Only return values from specified level (for MultiIndex).
- If int, gets the level by integer position, else by level name.
-
- Returns
- -------
- Index
-
- See Also
- --------
- unique : Numpy array of unique values in that column.
- Series.unique : Return unique values of Series object.
- """
- if level is not None:
- self._validate_index_level(level)
-
- if self.is_unique:
- return self._view()
-
- result = super().unique()
- return self._shallow_copy(result)
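-
-     # Illustrative usage (editorial example, not part of the original source);
-     # order of first appearance is preserved:
-     # >>> pd.Index([3, 3, 1, 2, 1]).unique()
-     # Index([3, 1, 2], dtype='int64')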
-
- def drop_duplicates(self: _IndexT, *, keep: DropKeep = "first") -> _IndexT:
- """
- Return Index with duplicate values removed.
-
- Parameters
- ----------
- keep : {'first', 'last', ``False``}, default 'first'
- - 'first' : Drop duplicates except for the first occurrence.
- - 'last' : Drop duplicates except for the last occurrence.
- - ``False`` : Drop all duplicates.
-
- Returns
- -------
- Index
-
- See Also
- --------
- Series.drop_duplicates : Equivalent method on Series.
- DataFrame.drop_duplicates : Equivalent method on DataFrame.
- Index.duplicated : Related method on Index, indicating duplicate
- Index values.
-
- Examples
- --------
-         Generate a pandas.Index with duplicate values.
-
- >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])
-
- The `keep` parameter controls which duplicate values are removed.
- The value 'first' keeps the first occurrence for each
- set of duplicated entries. The default value of keep is 'first'.
-
- >>> idx.drop_duplicates(keep='first')
- Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
-
- The value 'last' keeps the last occurrence for each set of duplicated
- entries.
-
- >>> idx.drop_duplicates(keep='last')
- Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object')
-
- The value ``False`` discards all sets of duplicated entries.
-
- >>> idx.drop_duplicates(keep=False)
- Index(['cow', 'beetle', 'hippo'], dtype='object')
- """
- if self.is_unique:
- return self._view()
-
- return super().drop_duplicates(keep=keep)
-
- def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
- """
- Indicate duplicate index values.
-
- Duplicated values are indicated as ``True`` values in the resulting
- array. Either all duplicates, all except the first, or all except the
- last occurrence of duplicates can be indicated.
-
- Parameters
- ----------
- keep : {'first', 'last', False}, default 'first'
- The value or values in a set of duplicates to mark as missing.
-
- - 'first' : Mark duplicates as ``True`` except for the first
- occurrence.
- - 'last' : Mark duplicates as ``True`` except for the last
- occurrence.
- - ``False`` : Mark all duplicates as ``True``.
-
- Returns
- -------
- np.ndarray[bool]
-
- See Also
- --------
- Series.duplicated : Equivalent method on pandas.Series.
- DataFrame.duplicated : Equivalent method on pandas.DataFrame.
- Index.drop_duplicates : Remove duplicate values from Index.
-
- Examples
- --------
- By default, for each set of duplicated values, the first occurrence is
- set to False and all others to True:
-
- >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama'])
- >>> idx.duplicated()
- array([False, False, True, False, True])
-
- which is equivalent to
-
- >>> idx.duplicated(keep='first')
- array([False, False, True, False, True])
-
- By using 'last', the last occurrence of each set of duplicated values
-         is set to False and all others to True:
-
- >>> idx.duplicated(keep='last')
- array([ True, False, True, False, False])
-
-         By setting keep to ``False``, all duplicates are True:
-
- >>> idx.duplicated(keep=False)
- array([ True, False, True, False, True])
- """
- if self.is_unique:
- # fastpath available bc we are immutable
- return np.zeros(len(self), dtype=bool)
- return self._duplicated(keep=keep)
-
- # --------------------------------------------------------------------
- # Arithmetic & Logical Methods
-
- def __iadd__(self, other):
- # alias for __add__
- return self + other
-
- @final
- def __nonzero__(self) -> NoReturn:
- raise ValueError(
- f"The truth value of a {type(self).__name__} is ambiguous. "
- "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
- )
-
- __bool__ = __nonzero__
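-
-     # Illustrative behaviour (editorial example, not part of the original
-     # source): truth-testing an Index always raises.
-     # >>> bool(pd.Index([1]))
-     # Traceback (most recent call last):
-     # ValueError: The truth value of a Index is ambiguous. Use a.empty,
-     # a.bool(), a.item(), a.any() or a.all().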
-
- # --------------------------------------------------------------------
- # Set Operation Methods
-
- def _get_reconciled_name_object(self, other):
- """
- If the result of a set operation will be self,
- return self, unless the name changes, in which
- case make a shallow copy of self.
- """
- name = get_op_result_name(self, other)
- if self.name is not name:
- return self.rename(name)
- return self
-
- @final
- def _validate_sort_keyword(self, sort):
- if sort not in [None, False, True]:
- raise ValueError(
- "The 'sort' keyword only takes the values of "
- f"None, True, or False; {sort} was passed."
- )
-
- @final
- def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index]:
- """
- With mismatched timezones, cast both to UTC.
- """
-         # Caller is responsible for checking
- # `not is_dtype_equal(self.dtype, other.dtype)`
- if (
- isinstance(self, ABCDatetimeIndex)
- and isinstance(other, ABCDatetimeIndex)
- and self.tz is not None
- and other.tz is not None
- ):
- # GH#39328, GH#45357
- left = self.tz_convert("UTC")
- right = other.tz_convert("UTC")
- return left, right
- return self, other
-
- @final
- def union(self, other, sort=None):
- """
- Form the union of two Index objects.
-
- If the Index objects are incompatible, both Index objects will be
- cast to dtype('object') first.
-
- Parameters
- ----------
- other : Index or array-like
- sort : bool or None, default None
- Whether to sort the resulting Index.
-
- * None : Sort the result, except when
-
- 1. `self` and `other` are equal.
- 2. `self` or `other` has length 0.
- 3. Some values in `self` or `other` cannot be compared.
- A RuntimeWarning is issued in this case.
-
- * False : do not sort the result.
- * True : Sort the result (which may raise TypeError).
-
- Returns
- -------
- Index
-
- Examples
- --------
- Union matching dtypes
-
- >>> idx1 = pd.Index([1, 2, 3, 4])
- >>> idx2 = pd.Index([3, 4, 5, 6])
- >>> idx1.union(idx2)
- Index([1, 2, 3, 4, 5, 6], dtype='int64')
-
- Union mismatched dtypes
-
- >>> idx1 = pd.Index(['a', 'b', 'c', 'd'])
- >>> idx2 = pd.Index([1, 2, 3, 4])
- >>> idx1.union(idx2)
- Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object')
-
- MultiIndex case
-
- >>> idx1 = pd.MultiIndex.from_arrays(
- ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]]
- ... )
- >>> idx1
- MultiIndex([(1, 'Red'),
- (1, 'Blue'),
- (2, 'Red'),
- (2, 'Blue')],
- )
- >>> idx2 = pd.MultiIndex.from_arrays(
- ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]]
- ... )
- >>> idx2
- MultiIndex([(3, 'Red'),
- (3, 'Green'),
- (2, 'Red'),
- (2, 'Green')],
- )
- >>> idx1.union(idx2)
- MultiIndex([(1, 'Blue'),
- (1, 'Red'),
- (2, 'Blue'),
- (2, 'Green'),
- (2, 'Red'),
- (3, 'Green'),
- (3, 'Red')],
- )
- >>> idx1.union(idx2, sort=False)
- MultiIndex([(1, 'Red'),
- (1, 'Blue'),
- (2, 'Red'),
- (2, 'Blue'),
- (3, 'Red'),
- (3, 'Green'),
- (2, 'Green')],
- )
- """
- self._validate_sort_keyword(sort)
- self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
-
- if not is_dtype_equal(self.dtype, other.dtype):
- if (
- isinstance(self, ABCMultiIndex)
- and not is_object_dtype(_unpack_nested_dtype(other))
- and len(other) > 0
- ):
- raise NotImplementedError(
- "Can only union MultiIndex with MultiIndex or Index of tuples, "
- "try mi.to_flat_index().union(other) instead."
- )
- self, other = self._dti_setop_align_tzs(other, "union")
-
- dtype = self._find_common_type_compat(other)
- left = self.astype(dtype, copy=False)
- right = other.astype(dtype, copy=False)
- return left.union(right, sort=sort)
-
- elif not len(other) or self.equals(other):
- # NB: whether this (and the `if not len(self)` check below) come before
- # or after the is_dtype_equal check above affects the returned dtype
- result = self._get_reconciled_name_object(other)
- if sort is True:
- return result.sort_values()
- return result
-
- elif not len(self):
- result = other._get_reconciled_name_object(self)
- if sort is True:
- return result.sort_values()
- return result
-
- result = self._union(other, sort=sort)
-
- return self._wrap_setop_result(other, result)
-
- def _union(self, other: Index, sort: bool | None):
- """
- Specific union logic should go here. In subclasses, union behavior
- should be overwritten here rather than in `self.union`.
-
- Parameters
- ----------
- other : Index or array-like
- sort : False or None, default False
- Whether to sort the resulting index.
-
- * True : sort the result
- * False : do not sort the result.
- * None : sort the result, except when `self` and `other` are equal
- or when the values cannot be compared.
-
- Returns
- -------
- Index
- """
- lvals = self._values
- rvals = other._values
-
- if (
- sort in (None, True)
- and self.is_monotonic_increasing
- and other.is_monotonic_increasing
- and not (self.has_duplicates and other.has_duplicates)
- and self._can_use_libjoin
- ):
- # Both are monotonic and at least one is unique, so can use outer join
- # (actually don't need either unique, but without this restriction
- # test_union_same_value_duplicated_in_both fails)
- try:
- return self._outer_indexer(other)[0]
- except (TypeError, IncompatibleFrequency):
- # incomparable objects; should only be for object dtype
- value_list = list(lvals)
-
- # worth making this faster? a very unusual case
- value_set = set(lvals)
- value_list.extend([x for x in rvals if x not in value_set])
- # If objects are unorderable, we must have object dtype.
- return np.array(value_list, dtype=object)
-
- elif not other.is_unique:
- # other has duplicates
- result_dups = algos.union_with_duplicates(self, other)
- return _maybe_try_sort(result_dups, sort)
-
- # The rest of this method is analogous to Index._intersection_via_get_indexer
-
- # Self may have duplicates; other already checked as unique
- # find indexes of things in "other" that are not in "self"
- if self._index_as_unique:
- indexer = self.get_indexer(other)
- missing = (indexer == -1).nonzero()[0]
- else:
- missing = algos.unique1d(self.get_indexer_non_unique(other)[1])
-
- result: Index | MultiIndex | ArrayLike
- if self._is_multi:
- # Preserve MultiIndex to avoid losing dtypes
- result = self.append(other.take(missing))
-
- else:
- if len(missing) > 0:
- other_diff = rvals.take(missing)
- result = concat_compat((lvals, other_diff))
- else:
- result = lvals
-
- if not self.is_monotonic_increasing or not other.is_monotonic_increasing:
- # if both are monotonic then result should already be sorted
- result = _maybe_try_sort(result, sort)
-
- return result
-
- @final
- def _wrap_setop_result(self, other: Index, result) -> Index:
- name = get_op_result_name(self, other)
- if isinstance(result, Index):
- if result.name != name:
- result = result.rename(name)
- else:
- result = self._shallow_copy(result, name=name)
- return result
-
- @final
- def intersection(self, other, sort: bool = False):
- """
- Form the intersection of two Index objects.
-
- This returns a new Index with elements common to the index and `other`.
-
- Parameters
- ----------
- other : Index or array-like
- sort : True, False or None, default False
- Whether to sort the resulting index.
-
- * None : sort the result, except when `self` and `other` are equal
- or when the values cannot be compared.
- * False : do not sort the result.
- * True : Sort the result (which may raise TypeError).
-
- Returns
- -------
- Index
-
- Examples
- --------
- >>> idx1 = pd.Index([1, 2, 3, 4])
- >>> idx2 = pd.Index([3, 4, 5, 6])
- >>> idx1.intersection(idx2)
- Index([3, 4], dtype='int64')
- """
- self._validate_sort_keyword(sort)
- self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
-
- if not is_dtype_equal(self.dtype, other.dtype):
- self, other = self._dti_setop_align_tzs(other, "intersection")
-
- if self.equals(other):
- if self.has_duplicates:
- result = self.unique()._get_reconciled_name_object(other)
- else:
- result = self._get_reconciled_name_object(other)
- if sort is True:
- result = result.sort_values()
- return result
-
- if len(self) == 0 or len(other) == 0:
- # fastpath; we need to be careful about having commutativity
-
- if self._is_multi or other._is_multi:
- # _convert_can_do_setop ensures that we have both or neither
- # We retain self.levels
- return self[:0].rename(result_name)
-
- dtype = self._find_common_type_compat(other)
- if is_dtype_equal(self.dtype, dtype):
- # Slicing allows us to retain DTI/TDI.freq, RangeIndex
-
- # Note: self[:0] vs other[:0] affects
- # 1) which index's `freq` we get in DTI/TDI cases
- # This may be a historical artifact, i.e. no documented
- # reason for this choice.
- # 2) The `step` we get in RangeIndex cases
- if len(self) == 0:
- return self[:0].rename(result_name)
- else:
- return other[:0].rename(result_name)
-
- return Index([], dtype=dtype, name=result_name)
-
- elif not self._should_compare(other):
- # We can infer that the intersection is empty.
- if isinstance(self, ABCMultiIndex):
- return self[:0].rename(result_name)
- return Index([], name=result_name)
-
- elif not is_dtype_equal(self.dtype, other.dtype):
- dtype = self._find_common_type_compat(other)
- this = self.astype(dtype, copy=False)
- other = other.astype(dtype, copy=False)
- return this.intersection(other, sort=sort)
-
- result = self._intersection(other, sort=sort)
- return self._wrap_intersection_result(other, result)
-
- def _intersection(self, other: Index, sort: bool = False):
- """
- intersection specialized to the case with matching dtypes.
- """
- if (
- self.is_monotonic_increasing
- and other.is_monotonic_increasing
- and self._can_use_libjoin
- and not isinstance(self, ABCMultiIndex)
- ):
- try:
- res_indexer, indexer, _ = self._inner_indexer(other)
- except TypeError:
- # non-comparable; should only be for object dtype
- pass
- else:
- # TODO: algos.unique1d should preserve DTA/TDA
- if is_numeric_dtype(self):
- # This is faster, because Index.unique() checks for uniqueness
- # before calculating the unique values.
- res = algos.unique1d(res_indexer)
- else:
- result = self.take(indexer)
- res = result.drop_duplicates()
- return ensure_wrapped_if_datetimelike(res)
-
- res_values = self._intersection_via_get_indexer(other, sort=sort)
- res_values = _maybe_try_sort(res_values, sort)
- return res_values
-
- def _wrap_intersection_result(self, other, result):
- # We will override for MultiIndex to handle empty results
- return self._wrap_setop_result(other, result)
-
- @final
- def _intersection_via_get_indexer(
- self, other: Index | MultiIndex, sort
- ) -> ArrayLike | MultiIndex:
- """
- Find the intersection of two Indexes using get_indexer.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- The returned array will be unique.
- """
- left_unique = self.unique()
- right_unique = other.unique()
-
- # even though we are unique, we need get_indexer_for for IntervalIndex
- indexer = left_unique.get_indexer_for(right_unique)
-
- mask = indexer != -1
-
- taker = indexer.take(mask.nonzero()[0])
- if sort is False:
- # sort bc we want the elements in the same order they are in self
- # unnecessary in the case with sort=None bc we will sort later
- taker = np.sort(taker)
-
- if isinstance(left_unique, ABCMultiIndex):
- result = left_unique.take(taker)
- else:
- result = left_unique.take(taker)._values
- return result
-
- @final
- def difference(self, other, sort=None):
- """
- Return a new Index with elements of index not in `other`.
-
- This is the set difference of two Index objects.
-
- Parameters
- ----------
- other : Index or array-like
- sort : bool or None, default None
- Whether to sort the resulting index. By default, the
- values are attempted to be sorted, but any TypeError from
- incomparable elements is caught by pandas.
-
- * None : Attempt to sort the result, but catch any TypeErrors
- from comparing incomparable elements.
- * False : Do not sort the result.
- * True : Sort the result (which may raise TypeError).
-
- Returns
- -------
- Index
-
- Examples
- --------
- >>> idx1 = pd.Index([2, 1, 3, 4])
- >>> idx2 = pd.Index([3, 4, 5, 6])
- >>> idx1.difference(idx2)
- Index([1, 2], dtype='int64')
- >>> idx1.difference(idx2, sort=False)
- Index([2, 1], dtype='int64')
- """
- self._validate_sort_keyword(sort)
- self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
-
- # Note: we do NOT call _dti_setop_align_tzs here, as there
- # is no requirement that .difference be commutative, so it does
- # not cast to object.
-
- if self.equals(other):
- # Note: we do not (yet) sort even if sort=None GH#24959
- return self[:0].rename(result_name)
-
- if len(other) == 0:
- # Note: we do not (yet) sort even if sort=None GH#24959
- result = self.rename(result_name)
- if sort is True:
- return result.sort_values()
- return result
-
- if not self._should_compare(other):
- # Nothing matches -> difference is everything
- result = self.rename(result_name)
- if sort is True:
- return result.sort_values()
- return result
-
- result = self._difference(other, sort=sort)
- return self._wrap_difference_result(other, result)
-
- def _difference(self, other, sort):
- # overridden by RangeIndex
-
- this = self.unique()
-
- indexer = this.get_indexer_for(other)
- indexer = indexer.take((indexer != -1).nonzero()[0])
-
- label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True)
-
- the_diff: MultiIndex | ArrayLike
- if isinstance(this, ABCMultiIndex):
- the_diff = this.take(label_diff)
- else:
- the_diff = this._values.take(label_diff)
- the_diff = _maybe_try_sort(the_diff, sort)
-
- return the_diff
-
- def _wrap_difference_result(self, other, result):
- # We will override for MultiIndex to handle empty results
- return self._wrap_setop_result(other, result)
-
- def symmetric_difference(self, other, result_name=None, sort=None):
- """
- Compute the symmetric difference of two Index objects.
-
- Parameters
- ----------
- other : Index or array-like
- result_name : str
- sort : bool or None, default None
- Whether to sort the resulting index. By default, the
- values are attempted to be sorted, but any TypeError from
- incomparable elements is caught by pandas.
-
- * None : Attempt to sort the result, but catch any TypeErrors
- from comparing incomparable elements.
- * False : Do not sort the result.
- * True : Sort the result (which may raise TypeError).
-
- Returns
- -------
- Index
-
- Notes
- -----
- ``symmetric_difference`` contains elements that appear in either
- ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
- ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
- dropped.
-
- Examples
- --------
- >>> idx1 = pd.Index([1, 2, 3, 4])
- >>> idx2 = pd.Index([2, 3, 4, 5])
- >>> idx1.symmetric_difference(idx2)
- Index([1, 5], dtype='int64')
- """
- self._validate_sort_keyword(sort)
- self._assert_can_do_setop(other)
- other, result_name_update = self._convert_can_do_setop(other)
- if result_name is None:
- result_name = result_name_update
-
- if not is_dtype_equal(self.dtype, other.dtype):
- self, other = self._dti_setop_align_tzs(other, "symmetric_difference")
-
- if not self._should_compare(other):
- return self.union(other, sort=sort).rename(result_name)
-
- elif not is_dtype_equal(self.dtype, other.dtype):
- dtype = self._find_common_type_compat(other)
- this = self.astype(dtype, copy=False)
- that = other.astype(dtype, copy=False)
- return this.symmetric_difference(that, sort=sort).rename(result_name)
-
- this = self.unique()
- other = other.unique()
- indexer = this.get_indexer_for(other)
-
- # {this} minus {other}
- common_indexer = indexer.take((indexer != -1).nonzero()[0])
- left_indexer = np.setdiff1d(
- np.arange(this.size), common_indexer, assume_unique=True
- )
- left_diff = this.take(left_indexer)
-
- # {other} minus {this}
- right_indexer = (indexer == -1).nonzero()[0]
- right_diff = other.take(right_indexer)
-
- res_values = left_diff.append(right_diff)
- result = _maybe_try_sort(res_values, sort)
-
- if not self._is_multi:
- return Index(result, name=result_name, dtype=res_values.dtype)
- else:
- left_diff = cast("MultiIndex", left_diff)
- if len(result) == 0:
- # result might be an Index, if other was an Index
- return left_diff.remove_unused_levels().set_names(result_name)
- return result.set_names(result_name)
-
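
For reference, a minimal usage sketch of ``symmetric_difference`` as removed above (illustrative only, not part of the vendored file; behaviour assumed for the pandas 2.x line this copy tracks):

import pandas as pd

idx1 = pd.Index([1, 2, 3, 4], name="a")
idx2 = pd.Index([2, 3, 4, 5], name="b")

idx1.symmetric_difference(idx2)
# Index([1, 5], dtype='int64'); unnamed because the input names differ
idx1.symmetric_difference(idx2, result_name="ab")
# same values, but the result carries the explicit name 'ab'
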
- @final
- def _assert_can_do_setop(self, other) -> bool:
- if not is_list_like(other):
- raise TypeError("Input must be Index or array-like")
- return True
-
- def _convert_can_do_setop(self, other) -> tuple[Index, Hashable]:
- if not isinstance(other, Index):
- other = Index(other, name=self.name)
- result_name = self.name
- else:
- result_name = get_op_result_name(self, other)
- return other, result_name
-
- # --------------------------------------------------------------------
- # Indexing Methods
-
- def get_loc(self, key):
- """
- Get integer location, slice or boolean mask for requested label.
-
- Parameters
- ----------
- key : label
-
- Returns
- -------
- int if unique index, slice if monotonic index, else mask
-
- Examples
- --------
- >>> unique_index = pd.Index(list('abc'))
- >>> unique_index.get_loc('b')
- 1
-
- >>> monotonic_index = pd.Index(list('abbc'))
- >>> monotonic_index.get_loc('b')
- slice(1, 3, None)
-
- >>> non_monotonic_index = pd.Index(list('abcb'))
- >>> non_monotonic_index.get_loc('b')
- array([False, True, False, True])
- """
- casted_key = self._maybe_cast_indexer(key)
- try:
- return self._engine.get_loc(casted_key)
- except KeyError as err:
- raise KeyError(key) from err
- except TypeError:
- # If we have a listlike key, _check_indexing_error will raise
- # InvalidIndexError. Otherwise we fall through and re-raise
- # the TypeError.
- self._check_indexing_error(key)
- raise
-
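
A small sketch of the error handling in ``get_loc`` above: a missing label re-raises ``KeyError`` with the original key, while a list-like key is rejected via ``_check_indexing_error`` (illustrative, not part of the removed source):

import pandas as pd
from pandas.errors import InvalidIndexError

idx = pd.Index(list("abc"))
try:
    idx.get_loc("z")
except KeyError as exc:
    print(exc)            # the original key ('z') is reported, not the casted one

try:
    idx.get_loc(["a", "b"])
except InvalidIndexError:
    print("list-like keys are not valid for get_loc")
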
- _index_shared_docs[
- "get_indexer"
- ] = """
- Compute indexer and mask for new index given the current index.
-
- The indexer should then be used as an input to ndarray.take to align the
- current data to the new index.
-
- Parameters
- ----------
- target : %(target_klass)s
- method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
- * default: exact matches only.
- * pad / ffill: find the PREVIOUS index value if no exact match.
- * backfill / bfill: use NEXT index value if no exact match
- * nearest: use the NEAREST index value if no exact match. Tied
- distances are broken by preferring the larger index value.
- limit : int, optional
- Maximum number of consecutive labels in ``target`` to match for
- inexact matches.
- tolerance : optional
- Maximum distance between original and new labels for inexact
- matches. The values of the index at the matching locations must
- satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
-
- Tolerance may be a scalar value, which applies the same tolerance
- to all values, or list-like, which applies variable tolerance per
- element. List-like includes list, tuple, array, and Series; it must be
- the same size as the index and its dtype must exactly match the
- index's dtype.
-
- Returns
- -------
- np.ndarray[np.intp]
- Integers from 0 to n - 1 indicating that the index at these
- positions matches the corresponding target values. Missing values
- in the target are marked by -1.
- %(raises_section)s
- Notes
- -----
- Returns -1 for unmatched values, for further explanation see the
- example below.
-
- Examples
- --------
- >>> index = pd.Index(['c', 'a', 'b'])
- >>> index.get_indexer(['a', 'b', 'x'])
- array([ 1, 2, -1])
-
- Notice that the return value is an array of locations in ``index``
- and ``x`` is marked by -1, as it is not in ``index``.
- """
-
- @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
- @final
- def get_indexer(
- self,
- target,
- method: str_t | None = None,
- limit: int | None = None,
- tolerance=None,
- ) -> npt.NDArray[np.intp]:
- method = clean_reindex_fill_method(method)
- orig_target = target
- target = self._maybe_cast_listlike_indexer(target)
-
- self._check_indexing_method(method, limit, tolerance)
-
- if not self._index_as_unique:
- raise InvalidIndexError(self._requires_unique_msg)
-
- if len(target) == 0:
- return np.array([], dtype=np.intp)
-
- if not self._should_compare(target) and not self._should_partial_index(target):
- # IntervalIndex get special treatment bc numeric scalars can be
- # matched to Interval scalars
- return self._get_indexer_non_comparable(target, method=method, unique=True)
-
- if is_categorical_dtype(self.dtype):
- # _maybe_cast_listlike_indexer ensures target has our dtype
- # (could improve perf by doing _should_compare check earlier?)
- assert is_dtype_equal(self.dtype, target.dtype)
-
- indexer = self._engine.get_indexer(target.codes)
- if self.hasnans and target.hasnans:
- # After _maybe_cast_listlike_indexer, target elements which do not
- # belong to some category are changed to NaNs
- # Mask to track actual NaN values compared to inserted NaN values
- # GH#45361
- target_nans = isna(orig_target)
- loc = self.get_loc(np.nan)
- mask = target.isna()
- indexer[target_nans] = loc
- indexer[mask & ~target_nans] = -1
- return indexer
-
- if is_categorical_dtype(target.dtype):
- # potential fastpath
- # get an indexer for unique categories then propagate to codes via take_nd
- # get_indexer instead of _get_indexer needed for MultiIndex cases
- # e.g. test_append_different_columns_types
- categories_indexer = self.get_indexer(target.categories)
-
- indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1)
-
- if (not self._is_multi and self.hasnans) and target.hasnans:
- # Exclude MultiIndex because hasnans raises NotImplementedError
- # we should only get here if we are unique, so loc is an integer
- # GH#41934
- loc = self.get_loc(np.nan)
- mask = target.isna()
- indexer[mask] = loc
-
- return ensure_platform_int(indexer)
-
- pself, ptarget = self._maybe_promote(target)
- if pself is not self or ptarget is not target:
- return pself.get_indexer(
- ptarget, method=method, limit=limit, tolerance=tolerance
- )
-
- if is_dtype_equal(self.dtype, target.dtype) and self.equals(target):
- # Only call equals if we have same dtype to avoid inference/casting
- return np.arange(len(target), dtype=np.intp)
-
- if not is_dtype_equal(
- self.dtype, target.dtype
- ) and not self._should_partial_index(target):
- # _should_partial_index e.g. IntervalIndex with numeric scalars
- # that can be matched to Interval scalars.
- dtype = self._find_common_type_compat(target)
-
- this = self.astype(dtype, copy=False)
- target = target.astype(dtype, copy=False)
- return this._get_indexer(
- target, method=method, limit=limit, tolerance=tolerance
- )
-
- return self._get_indexer(target, method, limit, tolerance)
-
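
The shared docstring above only shows exact matching; a short sketch of the fill methods and ``tolerance`` handling (illustrative, outputs assumed for a recent pandas 2.x):

import pandas as pd

idx = pd.Index([10, 20, 30])

idx.get_indexer([20, 25])                              # array([ 1, -1])  exact matches only
idx.get_indexer([25], method="pad")                    # array([1])   previous label (20)
idx.get_indexer([25], method="nearest")                # array([2])   tie broken toward the larger label (30)
idx.get_indexer([25], method="nearest", tolerance=3)   # array([-1])  both labels are 5 away, beyond tolerance
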
- def _get_indexer(
- self,
- target: Index,
- method: str_t | None = None,
- limit: int | None = None,
- tolerance=None,
- ) -> npt.NDArray[np.intp]:
- if tolerance is not None:
- tolerance = self._convert_tolerance(tolerance, target)
-
- if method in ["pad", "backfill"]:
- indexer = self._get_fill_indexer(target, method, limit, tolerance)
- elif method == "nearest":
- indexer = self._get_nearest_indexer(target, limit, tolerance)
- else:
- if target._is_multi and self._is_multi:
- engine = self._engine
- # error: Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]"
- # has no attribute "_extract_level_codes"
- tgt_values = engine._extract_level_codes( # type: ignore[union-attr]
- target
- )
- else:
- tgt_values = target._get_engine_target()
-
- indexer = self._engine.get_indexer(tgt_values)
-
- return ensure_platform_int(indexer)
-
- @final
- def _should_partial_index(self, target: Index) -> bool:
- """
- Should we attempt partial-matching indexing?
- """
- if is_interval_dtype(self.dtype):
- if is_interval_dtype(target.dtype):
- return False
- # See https://github.com/pandas-dev/pandas/issues/47772 the commented
- # out code can be restored (instead of hardcoding `return True`)
- # once that issue is fixed
- # "Index" has no attribute "left"
- # return self.left._should_compare(target) # type: ignore[attr-defined]
- return True
- return False
-
- @final
- def _check_indexing_method(
- self,
- method: str_t | None,
- limit: int | None = None,
- tolerance=None,
- ) -> None:
- """
- Raise if we have a get_indexer `method` that is not supported or valid.
- """
- if method not in [None, "bfill", "backfill", "pad", "ffill", "nearest"]:
- # in practice the clean_reindex_fill_method call would raise
- # before we get here
- raise ValueError("Invalid fill method") # pragma: no cover
-
- if self._is_multi:
- if method == "nearest":
- raise NotImplementedError(
- "method='nearest' not implemented yet "
- "for MultiIndex; see GitHub issue 9365"
- )
- if method in ("pad", "backfill"):
- if tolerance is not None:
- raise NotImplementedError(
- "tolerance not implemented yet for MultiIndex"
- )
-
- if is_interval_dtype(self.dtype) or is_categorical_dtype(self.dtype):
- # GH#37871 for now this is only for IntervalIndex and CategoricalIndex
- if method is not None:
- raise NotImplementedError(
- f"method {method} not yet implemented for {type(self).__name__}"
- )
-
- if method is None:
- if tolerance is not None:
- raise ValueError(
- "tolerance argument only valid if doing pad, "
- "backfill or nearest reindexing"
- )
- if limit is not None:
- raise ValueError(
- "limit argument only valid if doing pad, "
- "backfill or nearest reindexing"
- )
-
- def _convert_tolerance(self, tolerance, target: np.ndarray | Index) -> np.ndarray:
- # override this method on subclasses
- tolerance = np.asarray(tolerance)
- if target.size != tolerance.size and tolerance.size > 1:
- raise ValueError("list-like tolerance size must match target index size")
- elif is_numeric_dtype(self) and not np.issubdtype(tolerance.dtype, np.number):
- if tolerance.ndim > 0:
- raise ValueError(
- f"tolerance argument for {type(self).__name__} with dtype "
- f"{self.dtype} must contain numeric elements if it is list type"
- )
-
- raise ValueError(
- f"tolerance argument for {type(self).__name__} with dtype {self.dtype} "
- f"must be numeric if it is a scalar: {repr(tolerance)}"
- )
- return tolerance
-
- @final
- def _get_fill_indexer(
- self, target: Index, method: str_t, limit: int | None = None, tolerance=None
- ) -> npt.NDArray[np.intp]:
- if self._is_multi:
- # TODO: get_indexer_with_fill docstring says values must be _sorted_
- # but that doesn't appear to be enforced
- # error: "IndexEngine" has no attribute "get_indexer_with_fill"
- engine = self._engine
- with warnings.catch_warnings():
- # TODO: We need to fix this. Casting to int64 in cython
- warnings.filterwarnings("ignore", category=RuntimeWarning)
- return engine.get_indexer_with_fill( # type: ignore[union-attr]
- target=target._values,
- values=self._values,
- method=method,
- limit=limit,
- )
-
- if self.is_monotonic_increasing and target.is_monotonic_increasing:
- target_values = target._get_engine_target()
- own_values = self._get_engine_target()
- if not isinstance(target_values, np.ndarray) or not isinstance(
- own_values, np.ndarray
- ):
- raise NotImplementedError
-
- if method == "pad":
- indexer = libalgos.pad(own_values, target_values, limit=limit)
- else:
- # i.e. "backfill"
- indexer = libalgos.backfill(own_values, target_values, limit=limit)
- else:
- indexer = self._get_fill_indexer_searchsorted(target, method, limit)
- if tolerance is not None and len(self):
- indexer = self._filter_indexer_tolerance(target, indexer, tolerance)
- return indexer
-
- @final
- def _get_fill_indexer_searchsorted(
- self, target: Index, method: str_t, limit: int | None = None
- ) -> npt.NDArray[np.intp]:
- """
- Fallback pad/backfill get_indexer that works for monotonic decreasing
- indexes and non-monotonic targets.
- """
- if limit is not None:
- raise ValueError(
- f"limit argument for {repr(method)} method only well-defined "
- "if index and target are monotonic"
- )
-
- side: Literal["left", "right"] = "left" if method == "pad" else "right"
-
- # find exact matches first (this simplifies the algorithm)
- indexer = self.get_indexer(target)
- nonexact = indexer == -1
- indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side)
- if side == "left":
- # searchsorted returns "indices into a sorted array such that,
- # if the corresponding elements in v were inserted before the
- # indices, the order of a would be preserved".
- # Thus, we need to subtract 1 to find values to the left.
- indexer[nonexact] -= 1
- # This also mapped not found values (values of 0 from
- # np.searchsorted) to -1, which conveniently is also our
- # sentinel for missing values
- else:
- # Mark indices to the right of the largest value as not found
- indexer[indexer == len(self)] = -1
- return indexer
-
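
A NumPy-only sketch of the searchsorted trick the comments above describe for the 'pad' side: positions of 0 become -1 after the subtraction, which doubles as the missing-value sentinel (illustrative, not the actual helper):

import numpy as np

own_values = np.array([10, 20, 30])     # monotonic index values
target = np.array([5, 25])              # no exact matches

pos = np.searchsorted(own_values, target, side="left")   # array([0, 2])
pad_indexer = pos - 1                                     # array([-1, 1])
# 5 has no previous label (-1); 25 pads back to 20 (position 1)
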
- @final
- def _get_nearest_indexer(
- self, target: Index, limit: int | None, tolerance
- ) -> npt.NDArray[np.intp]:
- """
- Get the indexer for the nearest index labels; requires an index with
- values that can be subtracted from each other (e.g., not strings or
- tuples).
- """
- if not len(self):
- return self._get_fill_indexer(target, "pad")
-
- left_indexer = self.get_indexer(target, "pad", limit=limit)
- right_indexer = self.get_indexer(target, "backfill", limit=limit)
-
- left_distances = self._difference_compat(target, left_indexer)
- right_distances = self._difference_compat(target, right_indexer)
-
- op = operator.lt if self.is_monotonic_increasing else operator.le
- indexer = np.where(
- # error: Argument 1&2 has incompatible type "Union[ExtensionArray,
- # ndarray[Any, Any]]"; expected "Union[SupportsDunderLE,
- # SupportsDunderGE, SupportsDunderGT, SupportsDunderLT]"
- op(left_distances, right_distances) # type: ignore[arg-type]
- | (right_indexer == -1),
- left_indexer,
- right_indexer,
- )
- if tolerance is not None:
- indexer = self._filter_indexer_tolerance(target, indexer, tolerance)
- return indexer
-
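
The combination step in ``_get_nearest_indexer`` can be illustrated with plain NumPy; the strict ``<`` used for an increasing index is what sends ties to the right-hand (larger) label (illustrative sketch, values hypothetical):

import numpy as np

left_indexer = np.array([0, 1])
right_indexer = np.array([1, 2])
left_distances = np.array([3, 5])
right_distances = np.array([7, 5])

np.where(
    (left_distances < right_distances) | (right_indexer == -1),
    left_indexer,
    right_indexer,
)
# array([0, 2]) -- the tie in the second position goes to the right label
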
- @final
- def _filter_indexer_tolerance(
- self,
- target: Index,
- indexer: npt.NDArray[np.intp],
- tolerance,
- ) -> npt.NDArray[np.intp]:
- distance = self._difference_compat(target, indexer)
-
- return np.where(distance <= tolerance, indexer, -1)
-
- @final
- def _difference_compat(
- self, target: Index, indexer: npt.NDArray[np.intp]
- ) -> ArrayLike:
- # Compatibility for PeriodArray, for which __sub__ returns an ndarray[object]
- # of DateOffset objects, which do not support __abs__ (and would be slow
- # if they did)
-
- if isinstance(self.dtype, PeriodDtype):
- # Note: we only get here with matching dtypes
- own_values = cast("PeriodArray", self._data)._ndarray
- target_values = cast("PeriodArray", target._data)._ndarray
- diff = own_values[indexer] - target_values
- else:
- # error: Unsupported left operand type for - ("ExtensionArray")
- diff = self._values[indexer] - target._values # type: ignore[operator]
- return abs(diff)
-
- # --------------------------------------------------------------------
- # Indexer Conversion Methods
-
- @final
- def _validate_positional_slice(self, key: slice) -> None:
- """
- For positional indexing, a slice must have either int or None
- for each of start, stop, and step.
- """
- self._validate_indexer("positional", key.start, "iloc")
- self._validate_indexer("positional", key.stop, "iloc")
- self._validate_indexer("positional", key.step, "iloc")
-
- def _convert_slice_indexer(self, key: slice, kind: str_t):
- """
- Convert a slice indexer.
-
- By definition, these are labels unless 'iloc' is passed in.
- Floats are not allowed as the start, step, or stop of the slice.
-
- Parameters
- ----------
- key : label of the slice bound
- kind : {'loc', 'getitem'}
- """
- assert kind in ["loc", "getitem"], kind
-
- # potentially cast the bounds to integers
- start, stop, step = key.start, key.stop, key.step
-
- # TODO(GH#50617): once Series.__[gs]etitem__ is removed we should be able
- # to simplify this.
- if isinstance(self.dtype, np.dtype) and is_float_dtype(self.dtype):
- # We always treat __getitem__ slicing as label-based
- # translate to locations
- return self.slice_indexer(start, stop, step)
-
- # figure out if this is a positional indexer
- def is_int(v):
- return v is None or is_integer(v)
-
- is_index_slice = is_int(start) and is_int(stop) and is_int(step)
-
- # special case for interval_dtype bc we do not do partial-indexing
- # on integer Intervals when slicing
- # TODO: write this in terms of e.g. should_partial_index?
- ints_are_positional = self._should_fallback_to_positional or is_interval_dtype(
- self.dtype
- )
- is_positional = is_index_slice and ints_are_positional
-
- if kind == "getitem":
- # called from the getitem slicers, validate that we are in fact integers
- if is_integer_dtype(self.dtype) or is_index_slice:
- # Note: these checks are redundant if we know is_index_slice
- self._validate_indexer("slice", key.start, "getitem")
- self._validate_indexer("slice", key.stop, "getitem")
- self._validate_indexer("slice", key.step, "getitem")
- return key
-
- # convert the slice to an indexer here
-
- # if we are mixed and have integers
- if is_positional:
- try:
- # Validate start & stop
- if start is not None:
- self.get_loc(start)
- if stop is not None:
- self.get_loc(stop)
- is_positional = False
- except KeyError:
- pass
-
- if com.is_null_slice(key):
- # It doesn't matter if we are positional or label based
- indexer = key
- elif is_positional:
- if kind == "loc":
- # GH#16121, GH#24612, GH#31810
- raise TypeError(
- "Slicing a positional slice with .loc is not allowed, "
- "Use .loc with labels or .iloc with positions instead.",
- )
- indexer = key
- else:
- indexer = self.slice_indexer(start, stop, step)
-
- return indexer
-
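
A rough illustration of the label-versus-positional distinction this method resolves; exact error messages aside, integer slices are positional for plain ``__getitem__``/``.iloc`` but not for ``.loc`` on a non-integer index (illustrative, not part of the removed source):

import pandas as pd

s = pd.Series([1, 2, 3], index=list("abc"))

s.loc["a":"b"]   # label slice: both endpoints included -> values 1, 2
s.iloc[0:2]      # positional slice: stop excluded -> values 1, 2
s[0:2]           # plain __getitem__ treats integer slices positionally here
# s.loc[0:2] raises a TypeError on this string index: integers are not labels
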
- @final
- def _raise_invalid_indexer(
- self,
- form: str_t,
- key,
- reraise: lib.NoDefault | None | Exception = lib.no_default,
- ) -> None:
- """
- Raise consistent invalid indexer message.
- """
- msg = (
- f"cannot do {form} indexing on {type(self).__name__} with these "
- f"indexers [{key}] of type {type(key).__name__}"
- )
- if reraise is not lib.no_default:
- raise TypeError(msg) from reraise
- raise TypeError(msg)
-
- # --------------------------------------------------------------------
- # Reindex Methods
-
- @final
- def _validate_can_reindex(self, indexer: np.ndarray) -> None:
- """
- Check if we are allowing reindexing with this particular indexer.
-
- Parameters
- ----------
- indexer : an integer ndarray
-
- Raises
- ------
- ValueError if it is a duplicate axis
- """
- # trying to reindex on an axis with duplicates
- if not self._index_as_unique and len(indexer):
- raise ValueError("cannot reindex on an axis with duplicate labels")
-
- def reindex(
- self, target, method=None, level=None, limit=None, tolerance=None
- ) -> tuple[Index, npt.NDArray[np.intp] | None]:
- """
- Create index with target's values.
-
- Parameters
- ----------
- target : an iterable
- method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional
- * default: exact matches only.
- * pad / ffill: find the PREVIOUS index value if no exact match.
- * backfill / bfill: use NEXT index value if no exact match
- * nearest: use the NEAREST index value if no exact match. Tied
- distances are broken by preferring the larger index value.
- level : int, optional
- Level of multiindex.
- limit : int, optional
- Maximum number of consecutive labels in ``target`` to match for
- inexact matches.
- tolerance : int or float, optional
- Maximum distance between original and new labels for inexact
- matches. The values of the index at the matching locations must
- satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
-
- Tolerance may be a scalar value, which applies the same tolerance
- to all values, or list-like, which applies variable tolerance per
- element. List-like includes list, tuple, array, and Series; it must be
- the same size as the index and its dtype must exactly match the
- index's dtype.
-
- Returns
- -------
- new_index : pd.Index
- Resulting index.
- indexer : np.ndarray[np.intp] or None
- Indices of output values in original index.
-
- Raises
- ------
- TypeError
- If ``method`` passed along with ``level``.
- ValueError
- If non-unique multi-index
- ValueError
- If non-unique index and ``method`` or ``limit`` passed.
-
- See Also
- --------
- Series.reindex : Conform Series to new index with optional filling logic.
- DataFrame.reindex : Conform DataFrame to new index with optional filling logic.
-
- Examples
- --------
- >>> idx = pd.Index(['car', 'bike', 'train', 'tractor'])
- >>> idx
- Index(['car', 'bike', 'train', 'tractor'], dtype='object')
- >>> idx.reindex(['car', 'bike'])
- (Index(['car', 'bike'], dtype='object'), array([0, 1]))
- """
- # GH6552: preserve names when reindexing to non-named target
- # (i.e. neither Index nor Series).
- preserve_names = not hasattr(target, "name")
-
- # GH7774: preserve dtype/tz if target is empty and not an Index.
- target = ensure_has_len(target) # target may be an iterator
-
- if not isinstance(target, Index) and len(target) == 0:
- if level is not None and self._is_multi:
- # "Index" has no attribute "levels"; maybe "nlevels"?
- idx = self.levels[level] # type: ignore[attr-defined]
- else:
- idx = self
- target = idx[:0]
- else:
- target = ensure_index(target)
-
- if level is not None and (
- isinstance(self, ABCMultiIndex) or isinstance(target, ABCMultiIndex)
- ):
- if method is not None:
- raise TypeError("Fill method not supported if level passed")
-
- # TODO: tests where passing `keep_order=not self._is_multi`
- # makes a difference for non-MultiIndex case
- target, indexer, _ = self._join_level(
- target, level, how="right", keep_order=not self._is_multi
- )
-
- else:
- if self.equals(target):
- indexer = None
- else:
- if self._index_as_unique:
- indexer = self.get_indexer(
- target, method=method, limit=limit, tolerance=tolerance
- )
- elif self._is_multi:
- raise ValueError("cannot handle a non-unique multi-index!")
- elif not self.is_unique:
- # GH#42568
- raise ValueError("cannot reindex on an axis with duplicate labels")
- else:
- indexer, _ = self.get_indexer_non_unique(target)
-
- target = self._wrap_reindex_result(target, indexer, preserve_names)
- return target, indexer
-
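
A minimal sketch of the two return values of ``reindex`` (illustrative only):

import pandas as pd

idx = pd.Index(["car", "bike", "train"])
new_idx, indexer = idx.reindex(["bike", "plane"])
# new_idx: Index(['bike', 'plane'], dtype='object')
# indexer: array([ 1, -1]) -- 'plane' is absent from the original index
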
- def _wrap_reindex_result(self, target, indexer, preserve_names: bool):
- target = self._maybe_preserve_names(target, preserve_names)
- return target
-
- def _maybe_preserve_names(self, target: Index, preserve_names: bool):
- if preserve_names and target.nlevels == 1 and target.name != self.name:
- target = target.copy(deep=False)
- target.name = self.name
- return target
-
- @final
- def _reindex_non_unique(
- self, target: Index
- ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp] | None]:
- """
- Create a new index with target's values (move/add/delete values as
- necessary); use with a non-unique Index and a possibly non-unique target.
-
- Parameters
- ----------
- target : an iterable
-
- Returns
- -------
- new_index : pd.Index
- Resulting index.
- indexer : np.ndarray[np.intp]
- Indices of output values in original index.
- new_indexer : np.ndarray[np.intp] or None
-
- """
- target = ensure_index(target)
- if len(target) == 0:
- # GH#13691
- return self[:0], np.array([], dtype=np.intp), None
-
- indexer, missing = self.get_indexer_non_unique(target)
- check = indexer != -1
- new_labels = self.take(indexer[check])
- new_indexer = None
-
- if len(missing):
- length = np.arange(len(indexer), dtype=np.intp)
-
- missing = ensure_platform_int(missing)
- missing_labels = target.take(missing)
- missing_indexer = length[~check]
- cur_labels = self.take(indexer[check]).values
- cur_indexer = length[check]
-
- # Index constructor below will do inference
- new_labels = np.empty((len(indexer),), dtype=object)
- new_labels[cur_indexer] = cur_labels
- new_labels[missing_indexer] = missing_labels
-
- # GH#38906
- if not len(self):
- new_indexer = np.arange(0, dtype=np.intp)
-
- # a unique indexer
- elif target.is_unique:
- # see GH5553, make sure we use the right indexer
- new_indexer = np.arange(len(indexer), dtype=np.intp)
- new_indexer[cur_indexer] = np.arange(len(cur_labels))
- new_indexer[missing_indexer] = -1
-
- # we have a non_unique selector, need to use the original
- # indexer here
- else:
- # need to retake to have the same size as the indexer
- indexer[~check] = -1
-
- # reset the new indexer to account for the new size
- new_indexer = np.arange(len(self.take(indexer)), dtype=np.intp)
- new_indexer[~check] = -1
-
- if not isinstance(self, ABCMultiIndex):
- new_index = Index(new_labels, name=self.name)
- else:
- new_index = type(self).from_tuples(new_labels, names=self.names)
- return new_index, indexer, new_indexer
-
- # --------------------------------------------------------------------
- # Join Methods
-
- @overload
- def join(
- self,
- other: Index,
- *,
- how: JoinHow = ...,
- level: Level = ...,
- return_indexers: Literal[True],
- sort: bool = ...,
- ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
- ...
-
- @overload
- def join(
- self,
- other: Index,
- *,
- how: JoinHow = ...,
- level: Level = ...,
- return_indexers: Literal[False] = ...,
- sort: bool = ...,
- ) -> Index:
- ...
-
- @overload
- def join(
- self,
- other: Index,
- *,
- how: JoinHow = ...,
- level: Level = ...,
- return_indexers: bool = ...,
- sort: bool = ...,
- ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
- ...
-
- @final
- @_maybe_return_indexers
- def join(
- self,
- other: Index,
- *,
- how: JoinHow = "left",
- level: Level = None,
- return_indexers: bool = False,
- sort: bool = False,
- ) -> Index | tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
- """
- Compute join_index and indexers to conform data structures to the new index.
-
- Parameters
- ----------
- other : Index
- how : {'left', 'right', 'inner', 'outer'}
- level : int or level name, default None
- return_indexers : bool, default False
- sort : bool, default False
- Sort the join keys lexicographically in the result Index. If False,
- the order of the join keys depends on the join type (how keyword).
-
- Returns
- -------
- join_index, (left_indexer, right_indexer)
- """
- other = ensure_index(other)
-
- if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex):
- if (self.tz is None) ^ (other.tz is None):
- # Raise instead of casting to object below.
- raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
-
- if not self._is_multi and not other._is_multi:
- # We have specific handling for MultiIndex below
- pself, pother = self._maybe_promote(other)
- if pself is not self or pother is not other:
- return pself.join(
- pother, how=how, level=level, return_indexers=True, sort=sort
- )
-
- lindexer: np.ndarray | None
- rindexer: np.ndarray | None
-
- # try to figure out the join level
- # GH3662
- if level is None and (self._is_multi or other._is_multi):
- # have the same levels/names so a simple join
- if self.names == other.names:
- pass
- else:
- return self._join_multi(other, how=how)
-
- # join on the level
- if level is not None and (self._is_multi or other._is_multi):
- return self._join_level(other, level, how=how)
-
- if len(other) == 0:
- if how in ("left", "outer"):
- join_index = self._view()
- rindexer = np.broadcast_to(np.intp(-1), len(join_index))
- return join_index, None, rindexer
- elif how in ("right", "inner", "cross"):
- join_index = other._view()
- lindexer = np.array([])
- return join_index, lindexer, None
-
- if len(self) == 0:
- if how in ("right", "outer"):
- join_index = other._view()
- lindexer = np.broadcast_to(np.intp(-1), len(join_index))
- return join_index, lindexer, None
- elif how in ("left", "inner", "cross"):
- join_index = self._view()
- rindexer = np.array([])
- return join_index, None, rindexer
-
- if self._join_precedence < other._join_precedence:
- flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"}
- how = flip.get(how, how)
- join_index, lidx, ridx = other.join(
- self, how=how, level=level, return_indexers=True
- )
- lidx, ridx = ridx, lidx
- return join_index, lidx, ridx
-
- if not is_dtype_equal(self.dtype, other.dtype):
- dtype = self._find_common_type_compat(other)
- this = self.astype(dtype, copy=False)
- other = other.astype(dtype, copy=False)
- return this.join(other, how=how, return_indexers=True)
-
- _validate_join_method(how)
-
- if not self.is_unique and not other.is_unique:
- return self._join_non_unique(other, how=how)
- elif not self.is_unique or not other.is_unique:
- if self.is_monotonic_increasing and other.is_monotonic_increasing:
- if not is_interval_dtype(self.dtype):
- # otherwise we will fall through to _join_via_get_indexer
- # GH#39133
- # go through object dtype for ea till engine is supported properly
- return self._join_monotonic(other, how=how)
- else:
- return self._join_non_unique(other, how=how)
- elif (
- # GH48504: exclude MultiIndex to avoid going through MultiIndex._values
- self.is_monotonic_increasing
- and other.is_monotonic_increasing
- and self._can_use_libjoin
- and not isinstance(self, ABCMultiIndex)
- and not is_categorical_dtype(self.dtype)
- ):
- # Categorical is monotonic if data are ordered as categories, but join can
- # not handle this in case of not lexicographically monotonic GH#38502
- try:
- return self._join_monotonic(other, how=how)
- except TypeError:
- # object dtype; non-comparable objects
- pass
-
- return self._join_via_get_indexer(other, how, sort)
-
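
A small usage sketch of ``join`` with ``return_indexers=True`` on two unique, monotonic indexes, the case routed through ``_join_monotonic`` above (illustrative only):

import pandas as pd

left = pd.Index([1, 2, 3])
right = pd.Index([2, 3, 4])

joined, lidx, ridx = left.join(right, how="inner", return_indexers=True)
# joined: Index([2, 3], dtype='int64')
# lidx:   array([1, 2]) -- positions of 2 and 3 in `left`
# ridx:   array([0, 1]) -- positions of 2 and 3 in `right`
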
- @final
- def _join_via_get_indexer(
- self, other: Index, how: JoinHow, sort: bool
- ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
- # Fallback if we do not have any fastpaths available based on
- # uniqueness/monotonicity
-
- # Note: at this point we have checked matching dtypes
-
- if how == "left":
- join_index = self
- elif how == "right":
- join_index = other
- elif how == "inner":
- # TODO: sort=False here for backwards compat. It may
- # be better to use the sort parameter passed into join
- join_index = self.intersection(other, sort=False)
- elif how == "outer":
- # TODO: sort=True here for backwards compat. It may
- # be better to use the sort parameter passed into join
- join_index = self.union(other)
-
- if sort:
- join_index = join_index.sort_values()
-
- if join_index is self:
- lindexer = None
- else:
- lindexer = self.get_indexer_for(join_index)
- if join_index is other:
- rindexer = None
- else:
- rindexer = other.get_indexer_for(join_index)
- return join_index, lindexer, rindexer
-
- @final
- def _join_multi(self, other: Index, how: JoinHow):
- from pandas.core.indexes.multi import MultiIndex
- from pandas.core.reshape.merge import restore_dropped_levels_multijoin
-
- # figure out join names
- self_names_list = list(com.not_none(*self.names))
- other_names_list = list(com.not_none(*other.names))
- self_names_order = self_names_list.index
- other_names_order = other_names_list.index
- self_names = set(self_names_list)
- other_names = set(other_names_list)
- overlap = self_names & other_names
-
- # need at least 1 in common
- if not overlap:
- raise ValueError("cannot join with no overlapping index names")
-
- if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
- # Drop the non-matching levels from left and right respectively
- ldrop_names = sorted(self_names - overlap, key=self_names_order)
- rdrop_names = sorted(other_names - overlap, key=other_names_order)
-
- # if only the order differs
- if not len(ldrop_names + rdrop_names):
- self_jnlevels = self
- other_jnlevels = other.reorder_levels(self.names)
- else:
- self_jnlevels = self.droplevel(ldrop_names)
- other_jnlevels = other.droplevel(rdrop_names)
-
- # Join left and right
- # Join on same leveled multi-index frames is supported
- join_idx, lidx, ridx = self_jnlevels.join(
- other_jnlevels, how=how, return_indexers=True
- )
-
- # Restore the dropped levels
- # Returned index level order is
- # common levels, ldrop_names, rdrop_names
- dropped_names = ldrop_names + rdrop_names
-
- # error: Argument 5/6 to "restore_dropped_levels_multijoin" has
- # incompatible type "Optional[ndarray[Any, dtype[signedinteger[Any
- # ]]]]"; expected "ndarray[Any, dtype[signedinteger[Any]]]"
- levels, codes, names = restore_dropped_levels_multijoin(
- self,
- other,
- dropped_names,
- join_idx,
- lidx, # type: ignore[arg-type]
- ridx, # type: ignore[arg-type]
- )
-
- # Re-create the multi-index
- multi_join_idx = MultiIndex(
- levels=levels, codes=codes, names=names, verify_integrity=False
- )
-
- multi_join_idx = multi_join_idx.remove_unused_levels()
-
- return multi_join_idx, lidx, ridx
-
- jl = list(overlap)[0]
-
- # Case where only one index is multi
- # make the indices into mi's that match
- flip_order = False
- if isinstance(self, MultiIndex):
- self, other = other, self
- flip_order = True
- # flip if join method is right or left
- flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"}
- how = flip.get(how, how)
-
- level = other.names.index(jl)
- result = self._join_level(other, level, how=how)
-
- if flip_order:
- return result[0], result[2], result[1]
- return result
-
- @final
- def _join_non_unique(
- self, other: Index, how: JoinHow = "left"
- ) -> tuple[Index, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- from pandas.core.reshape.merge import get_join_indexers
-
- # We only get here if dtypes match
- assert self.dtype == other.dtype
-
- left_idx, right_idx = get_join_indexers(
- [self._values], [other._values], how=how, sort=True
- )
- mask = left_idx == -1
-
- join_idx = self.take(left_idx)
- right = other.take(right_idx)
- join_index = join_idx.putmask(mask, right)
- return join_index, left_idx, right_idx
-
- @final
- def _join_level(
- self, other: Index, level, how: JoinHow = "left", keep_order: bool = True
- ) -> tuple[MultiIndex, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
- """
- The join method *only* affects the level of the resulting
- MultiIndex. Otherwise it simply aligns the Index data exactly to the
- labels of the level in the MultiIndex.
-
- If ``keep_order == True``, the order of the data indexed by the
- MultiIndex will not be changed; otherwise, it will tie out
- with `other`.
- """
- from pandas.core.indexes.multi import MultiIndex
-
- def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]:
- """
- Returns a sorter for the innermost level while preserving the
- order of higher levels.
-
- Parameters
- ----------
- labels : list[np.ndarray]
- Each ndarray has signed integer dtype, not necessarily identical.
-
- Returns
- -------
- np.ndarray[np.intp]
- """
- if labels[0].size == 0:
- return np.empty(0, dtype=np.intp)
-
- if len(labels) == 1:
- return get_group_index_sorter(ensure_platform_int(labels[0]))
-
- # find indexers of beginning of each set of
- # same-key labels w.r.t all but last level
- tic = labels[0][:-1] != labels[0][1:]
- for lab in labels[1:-1]:
- tic |= lab[:-1] != lab[1:]
-
- starts = np.hstack(([True], tic, [True])).nonzero()[0]
- lab = ensure_int64(labels[-1])
- return lib.get_level_sorter(lab, ensure_platform_int(starts))
-
- if isinstance(self, MultiIndex) and isinstance(other, MultiIndex):
- raise TypeError("Join on level between two MultiIndex objects is ambiguous")
-
- left, right = self, other
-
- flip_order = not isinstance(self, MultiIndex)
- if flip_order:
- left, right = right, left
- flip: dict[JoinHow, JoinHow] = {"right": "left", "left": "right"}
- how = flip.get(how, how)
-
- assert isinstance(left, MultiIndex)
-
- level = left._get_level_number(level)
- old_level = left.levels[level]
-
- if not right.is_unique:
- raise NotImplementedError(
- "Index._join_level on non-unique index is not implemented"
- )
-
- new_level, left_lev_indexer, right_lev_indexer = old_level.join(
- right, how=how, return_indexers=True
- )
-
- if left_lev_indexer is None:
- if keep_order or len(left) == 0:
- left_indexer = None
- join_index = left
- else: # sort the leaves
- left_indexer = _get_leaf_sorter(left.codes[: level + 1])
- join_index = left[left_indexer]
-
- else:
- left_lev_indexer = ensure_platform_int(left_lev_indexer)
- rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level))
- old_codes = left.codes[level]
-
- taker = old_codes[old_codes != -1]
- new_lev_codes = rev_indexer.take(taker)
-
- new_codes = list(left.codes)
- new_codes[level] = new_lev_codes
-
- new_levels = list(left.levels)
- new_levels[level] = new_level
-
- if keep_order: # just drop missing values. o.w. keep order
- left_indexer = np.arange(len(left), dtype=np.intp)
- left_indexer = cast(np.ndarray, left_indexer)
- mask = new_lev_codes != -1
- if not mask.all():
- new_codes = [lab[mask] for lab in new_codes]
- left_indexer = left_indexer[mask]
-
- else: # tie out the order with other
- if level == 0: # outer most level, take the fast route
- max_new_lev = 0 if len(new_lev_codes) == 0 else new_lev_codes.max()
- ngroups = 1 + max_new_lev
- left_indexer, counts = libalgos.groupsort_indexer(
- new_lev_codes, ngroups
- )
-
- # missing values are placed first; drop them!
- left_indexer = left_indexer[counts[0] :]
- new_codes = [lab[left_indexer] for lab in new_codes]
-
- else: # sort the leaves
- mask = new_lev_codes != -1
- mask_all = mask.all()
- if not mask_all:
- new_codes = [lab[mask] for lab in new_codes]
-
- left_indexer = _get_leaf_sorter(new_codes[: level + 1])
- new_codes = [lab[left_indexer] for lab in new_codes]
-
- # left_indexers are w.r.t masked frame.
- # reverse to original frame!
- if not mask_all:
- left_indexer = mask.nonzero()[0][left_indexer]
-
- join_index = MultiIndex(
- levels=new_levels,
- codes=new_codes,
- names=left.names,
- verify_integrity=False,
- )
-
- if right_lev_indexer is not None:
- right_indexer = right_lev_indexer.take(join_index.codes[level])
- else:
- right_indexer = join_index.codes[level]
-
- if flip_order:
- left_indexer, right_indexer = right_indexer, left_indexer
-
- left_indexer = (
- None if left_indexer is None else ensure_platform_int(left_indexer)
- )
- right_indexer = (
- None if right_indexer is None else ensure_platform_int(right_indexer)
- )
- return join_index, left_indexer, right_indexer
-
- @final
- def _join_monotonic(
- self, other: Index, how: JoinHow = "left"
- ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
- # We only get here with matching dtypes and both monotonic increasing
- assert other.dtype == self.dtype
-
- if self.equals(other):
- # This is a convenient place for this check, but its correctness
- # does not depend on monotonicity, so it could go earlier
- # in the calling method.
- ret_index = other if how == "right" else self
- return ret_index, None, None
-
- ridx: npt.NDArray[np.intp] | None
- lidx: npt.NDArray[np.intp] | None
-
- if self.is_unique and other.is_unique:
- # We can perform much better than the general case
- if how == "left":
- join_index = self
- lidx = None
- ridx = self._left_indexer_unique(other)
- elif how == "right":
- join_index = other
- lidx = other._left_indexer_unique(self)
- ridx = None
- elif how == "inner":
- join_array, lidx, ridx = self._inner_indexer(other)
- join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
- elif how == "outer":
- join_array, lidx, ridx = self._outer_indexer(other)
- join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
- else:
- if how == "left":
- join_array, lidx, ridx = self._left_indexer(other)
- elif how == "right":
- join_array, ridx, lidx = other._left_indexer(self)
- elif how == "inner":
- join_array, lidx, ridx = self._inner_indexer(other)
- elif how == "outer":
- join_array, lidx, ridx = self._outer_indexer(other)
-
- assert lidx is not None
- assert ridx is not None
-
- join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
-
- lidx = None if lidx is None else ensure_platform_int(lidx)
- ridx = None if ridx is None else ensure_platform_int(ridx)
- return join_index, lidx, ridx
-
- def _wrap_joined_index(
- self: _IndexT,
- joined: ArrayLike,
- other: _IndexT,
- lidx: npt.NDArray[np.intp],
- ridx: npt.NDArray[np.intp],
- ) -> _IndexT:
- assert other.dtype == self.dtype
-
- if isinstance(self, ABCMultiIndex):
- name = self.names if self.names == other.names else None
- # error: Incompatible return value type (got "MultiIndex",
- # expected "_IndexT")
- mask = lidx == -1
- join_idx = self.take(lidx)
- right = other.take(ridx)
- join_index = join_idx.putmask(mask, right)._sort_levels_monotonic()
- return join_index.set_names(name) # type: ignore[return-value]
- else:
- name = get_op_result_name(self, other)
- return self._constructor._with_infer(joined, name=name, dtype=self.dtype)
-
- @cache_readonly
- def _can_use_libjoin(self) -> bool:
- """
- Whether we can use the fastpaths implemented in _libs.join
- """
- if type(self) is Index:
- # excludes most EAs, but includes masked and Arrow-backed arrays;
- # we get here with monotonic values only, meaning no NA
- return (
- isinstance(self.dtype, np.dtype)
- or isinstance(self.values, BaseMaskedArray)
- or isinstance(self._values, ArrowExtensionArray)
- )
- return not is_interval_dtype(self.dtype)
-
- # --------------------------------------------------------------------
- # Uncategorized Methods
-
- @property
- def values(self) -> ArrayLike:
- """
- Return an array representing the data in the Index.
-
- .. warning::
-
- We recommend using :attr:`Index.array` or
- :meth:`Index.to_numpy`, depending on whether you need
- a reference to the underlying data or a NumPy array.
-
- Returns
- -------
- array: numpy.ndarray or ExtensionArray
-
- See Also
- --------
- Index.array : Reference to the underlying data.
- Index.to_numpy : A NumPy array representing the underlying data.
- """
- return self._data
-
- @cache_readonly
- @doc(IndexOpsMixin.array)
- def array(self) -> ExtensionArray:
- array = self._data
- if isinstance(array, np.ndarray):
- from pandas.core.arrays.numpy_ import PandasArray
-
- array = PandasArray(array)
- return array
-
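
For a NumPy-backed Index the accessors above differ only in the wrapper they return; a quick sketch (illustrative, class names as in this vendored pandas 2.0 copy):

import pandas as pd

idx = pd.Index([1, 2, 3])
idx.values      # the backing numpy.ndarray
idx.array       # the same data wrapped in a PandasArray (an ExtensionArray)
idx.to_numpy()  # a NumPy array intended for end-user consumption
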
- @property
- def _values(self) -> ExtensionArray | np.ndarray:
- """
- The best array representation.
-
- This is an ndarray or ExtensionArray.
-
- ``_values`` are consistent between ``Series`` and ``Index``.
-
- It may differ from the public '.values' method.
-
- index | values | _values |
- ----------------- | --------------- | ------------- |
- Index | ndarray | ndarray |
- CategoricalIndex | Categorical | Categorical |
- DatetimeIndex | ndarray[M8ns] | DatetimeArray |
- DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray |
- PeriodIndex | ndarray[object] | PeriodArray |
- IntervalIndex | IntervalArray | IntervalArray |
-
- See Also
- --------
- values : Values
- """
- return self._data
-
- def _get_engine_target(self) -> ArrayLike:
- """
- Get the ndarray or ExtensionArray that we can pass to the IndexEngine
- constructor.
- """
- vals = self._values
- if isinstance(vals, StringArray):
- # GH#45652 much more performant than ExtensionEngine
- return vals._ndarray
- if (
- type(self) is Index
- and isinstance(self._values, ExtensionArray)
- and not isinstance(self._values, BaseMaskedArray)
- and not (
- isinstance(self._values, ArrowExtensionArray)
- and is_numeric_dtype(self.dtype)
- # Exclude decimal
- and self.dtype.kind != "O"
- )
- ):
- # TODO(ExtensionIndex): remove special-case, just use self._values
- return self._values.astype(object)
- return vals
-
- def _get_join_target(self) -> ArrayLike:
- """
- Get the ndarray or ExtensionArray that we can pass to the join
- functions.
- """
- if isinstance(self._values, BaseMaskedArray):
- # This is only used if our array is monotonic, so no NAs present
- return self._values._data
- elif isinstance(self._values, ArrowExtensionArray):
- # This is only used if our array is monotonic, so no missing values
- # present
- return self._values.to_numpy()
- return self._get_engine_target()
-
- def _from_join_target(self, result: np.ndarray) -> ArrayLike:
- """
- Cast the ndarray returned from one of the libjoin.foo_indexer functions
- back to type(self)._data.
- """
- if isinstance(self.values, BaseMaskedArray):
- return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
- elif isinstance(self.values, ArrowExtensionArray):
- return type(self.values)._from_sequence(result)
- return result
-
- @doc(IndexOpsMixin._memory_usage)
- def memory_usage(self, deep: bool = False) -> int:
- result = self._memory_usage(deep=deep)
-
- # include our engine hashtable
- result += self._engine.sizeof(deep=deep)
- return result
-
- @final
- def where(self, cond, other=None) -> Index:
- """
- Replace values where the condition is False.
-
- The replacement is taken from other.
-
- Parameters
- ----------
- cond : bool array-like with the same length as self
- Condition to select the values on.
- other : scalar, or array-like, default None
- Replacement if the condition is False.
-
- Returns
- -------
- pandas.Index
- A copy of self with values replaced from other
- where the condition is False.
-
- See Also
- --------
- Series.where : Same method for Series.
- DataFrame.where : Same method for DataFrame.
-
- Examples
- --------
- >>> idx = pd.Index(['car', 'bike', 'train', 'tractor'])
- >>> idx
- Index(['car', 'bike', 'train', 'tractor'], dtype='object')
- >>> idx.where(idx.isin(['car', 'train']), 'other')
- Index(['car', 'other', 'train', 'other'], dtype='object')
- """
- if isinstance(self, ABCMultiIndex):
- raise NotImplementedError(
- ".where is not supported for MultiIndex operations"
- )
- cond = np.asarray(cond, dtype=bool)
- return self.putmask(~cond, other)
-
- # construction helpers
- @final
- @classmethod
- def _raise_scalar_data_error(cls, data):
- # We return the TypeError so that we can raise it from the constructor
- # in order to keep mypy happy
- raise TypeError(
- f"{cls.__name__}(...) must be called with a collection of some "
- f"kind, {repr(data)} was passed"
- )
-
- def _validate_fill_value(self, value):
- """
- Check if the value can be inserted into our array without casting,
- and convert it to an appropriate native type if necessary.
-
- Raises
- ------
- TypeError
- If the value cannot be inserted into an array of this dtype.
- """
- dtype = self.dtype
- if isinstance(dtype, np.dtype) and dtype.kind not in ["m", "M"]:
- # return np_can_hold_element(dtype, value)
- try:
- return np_can_hold_element(dtype, value)
- except LossySetitemError as err:
- # re-raise as TypeError for consistency
- raise TypeError from err
- elif not can_hold_element(self._values, value):
- raise TypeError
- return value
-
- @final
- def _require_scalar(self, value):
- """
- Check that this is a scalar value that we can use for setitem-like
- operations without changing dtype.
- """
- if not is_scalar(value):
- raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
- return value
-
- def _is_memory_usage_qualified(self) -> bool:
- """
- Return True if we need a qualified .info display.
- """
- return is_object_dtype(self.dtype)
-
- def __contains__(self, key: Any) -> bool:
- """
- Return a boolean indicating whether the provided key is in the index.
-
- Parameters
- ----------
- key : label
- The key to check if it is present in the index.
-
- Returns
- -------
- bool
- Whether the key is present in the index.
-
- Raises
- ------
- TypeError
- If the key is not hashable.
-
- See Also
- --------
- Index.isin : Returns an ndarray of boolean dtype indicating whether the
- list-like key is in the index.
-
- Examples
- --------
- >>> idx = pd.Index([1, 2, 3, 4])
- >>> idx
- Index([1, 2, 3, 4], dtype='int64')
-
- >>> 2 in idx
- True
- >>> 6 in idx
- False
- """
- hash(key)
- try:
- return key in self._engine
- except (OverflowError, TypeError, ValueError):
- return False
-
- # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
- # Incompatible types in assignment (expression has type "None", base class
- # "object" defined the type as "Callable[[object], int]")
- __hash__: ClassVar[None] # type: ignore[assignment]
-
- @final
- def __setitem__(self, key, value):
- raise TypeError("Index does not support mutable operations")
-
- def __getitem__(self, key):
- """
- Override numpy.ndarray's __getitem__ method to work as desired.
-
- This function adds lists and Series as valid boolean indexers
- (ndarray.__getitem__ only supports ndarrays with dtype=bool).
-
- If the resulting ndim != 1, a plain ndarray is returned instead of
- the corresponding `Index` subclass.
-
- """
- getitem = self._data.__getitem__
-
- if is_integer(key) or is_float(key):
- # GH#44051 exclude bool, which would return a 2d ndarray
- key = com.cast_scalar_indexer(key)
- return getitem(key)
-
- if isinstance(key, slice):
- # This case is separated from the conditional above to avoid
- # pessimization com.is_bool_indexer and ndim checks.
- result = getitem(key)
- # Going through simple_new for performance.
- return type(self)._simple_new(
- result, name=self._name, refs=self._references
- )
-
- if com.is_bool_indexer(key):
- # if we have list[bools, length=1e5] then doing this check+convert
- # takes 166 µs + 2.1 ms and cuts the ndarray.__getitem__
- # time below from 3.8 ms to 496 µs
- # if we already have ndarray[bool], the overhead is 1.4 µs or .25%
- if is_extension_array_dtype(getattr(key, "dtype", None)):
- key = key.to_numpy(dtype=bool, na_value=False)
- else:
- key = np.asarray(key, dtype=bool)
-
- result = getitem(key)
- # Because we ruled out integer above, we always get an arraylike here
- if result.ndim > 1:
- disallow_ndim_indexing(result)
-
- # NB: Using _constructor._simple_new would break if MultiIndex
- # didn't override __getitem__
- return self._constructor._simple_new(result, name=self._name)
-
- def _getitem_slice(self: _IndexT, slobj: slice) -> _IndexT:
- """
- Fastpath for __getitem__ when we know we have a slice.
- """
- res = self._data[slobj]
- return type(self)._simple_new(res, name=self._name, refs=self._references)
-
- @final
- def _can_hold_identifiers_and_holds_name(self, name) -> bool:
- """
- Faster check for ``name in self`` when we know `name` is a Python
- identifier (e.g. in NDFrame.__getattr__, which hits this to support
- . key lookup). For indexes that can't hold identifiers (everything
- but object & categorical) we just return False.
-
- https://github.com/pandas-dev/pandas/issues/19764
- """
- if (
- is_object_dtype(self.dtype)
- or is_string_dtype(self.dtype)
- or is_categorical_dtype(self.dtype)
- ):
- return name in self
- return False
-
- def append(self, other: Index | Sequence[Index]) -> Index:
- """
- Append a collection of Index objects together.
-
- Parameters
- ----------
- other : Index or list/tuple of indices
-
- Returns
- -------
- Index
- """
- to_concat = [self]
-
- if isinstance(other, (list, tuple)):
- to_concat += list(other)
- else:
- # error: Argument 1 to "append" of "list" has incompatible type
- # "Union[Index, Sequence[Index]]"; expected "Index"
- to_concat.append(other) # type: ignore[arg-type]
-
- for obj in to_concat:
- if not isinstance(obj, Index):
- raise TypeError("all inputs must be Index")
-
- names = {obj.name for obj in to_concat}
- name = None if len(names) > 1 else self.name
-
- return self._concat(to_concat, name)
-
- def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
- """
- Concatenate multiple Index objects.
- """
- to_concat_vals = [x._values for x in to_concat]
-
- result = concat_compat(to_concat_vals)
-
- return Index._with_infer(result, name=name)
-
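
A short sketch of ``append`` with a list of indexes; the name is kept only when all inputs share it (illustrative only):

import pandas as pd

pd.Index([1, 2]).append([pd.Index([3]), pd.Index([4, 5])])
# Index([1, 2, 3, 4, 5], dtype='int64')

pd.Index([1], name="a").append(pd.Index([2], name="b"))
# Index([1, 2], dtype='int64') -- mixed names, so the result is unnamed
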
- def putmask(self, mask, value) -> Index:
- """
- Return a new Index where values selected by ``mask`` are replaced with ``value``.
-
- Returns
- -------
- Index
-
- See Also
- --------
- numpy.ndarray.putmask : Changes elements of an array
- based on conditional and input values.
- """
- mask, noop = validate_putmask(self._values, mask)
- if noop:
- return self.copy()
-
- if self.dtype != object and is_valid_na_for_dtype(value, self.dtype):
- # e.g. None -> np.nan, see also Block._standardize_fill_value
- value = self._na_value
-
- try:
- converted = self._validate_fill_value(value)
- except (LossySetitemError, ValueError, TypeError) as err:
- if is_object_dtype(self): # pragma: no cover
- raise err
-
- # See also: Block.coerce_to_target_dtype
- dtype = self._find_common_type_compat(value)
- return self.astype(dtype).putmask(mask, value)
-
- values = self._values.copy()
-
- if isinstance(values, np.ndarray):
- converted = setitem_datetimelike_compat(values, mask.sum(), converted)
- np.putmask(values, mask, converted)
-
- else:
- # Note: we use the original value here, not converted, as
- # _validate_fill_value is not idempotent
- values._putmask(mask, value)
-
- return self._shallow_copy(values)
-
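
A minimal sketch of ``putmask``: values are replaced where the mask is True, and a value that cannot be held losslessly (e.g. a float into int64) takes the astype fallback seen in the except branch above (illustrative only):

import pandas as pd

idx = pd.Index([1, 2, 3, 4])
idx.putmask([True, False, True, False], 0)
# Index([0, 2, 0, 4], dtype='int64')

idx.putmask([True, False, False, False], 1.5)
# Index([1.5, 2.0, 3.0, 4.0], dtype='float64') -- upcast via the fallback path
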
- def equals(self, other: Any) -> bool:
- """
- Determine if two Index objects are equal.
-
- The things that are being compared are:
-
- * The elements inside the Index object.
- * The order of the elements inside the Index object.
-
- Parameters
- ----------
- other : Any
- The other object to compare against.
-
- Returns
- -------
- bool
- True if "other" is an Index and it has the same elements and order
- as the calling index; False otherwise.
-
- Examples
- --------
- >>> idx1 = pd.Index([1, 2, 3])
- >>> idx1
- Index([1, 2, 3], dtype='int64')
- >>> idx1.equals(pd.Index([1, 2, 3]))
- True
-
- The elements inside are compared
-
- >>> idx2 = pd.Index(["1", "2", "3"])
- >>> idx2
- Index(['1', '2', '3'], dtype='object')
-
- >>> idx1.equals(idx2)
- False
-
- The order is compared
-
- >>> ascending_idx = pd.Index([1, 2, 3])
- >>> ascending_idx
- Index([1, 2, 3], dtype='int64')
- >>> descending_idx = pd.Index([3, 2, 1])
- >>> descending_idx
- Index([3, 2, 1], dtype='int64')
- >>> ascending_idx.equals(descending_idx)
- False
-
- The dtype is *not* compared
-
- >>> int64_idx = pd.Index([1, 2, 3], dtype='int64')
- >>> int64_idx
- Index([1, 2, 3], dtype='int64')
- >>> uint64_idx = pd.Index([1, 2, 3], dtype='uint64')
- >>> uint64_idx
- Index([1, 2, 3], dtype='uint64')
- >>> int64_idx.equals(uint64_idx)
- True
- """
- if self.is_(other):
- return True
-
- if not isinstance(other, Index):
- return False
-
- if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype):
- # if other is not object, use other's logic for coercion
- return other.equals(self)
-
- if isinstance(other, ABCMultiIndex):
- # d-level MultiIndex can equal d-tuple Index
- return other.equals(self)
-
- if isinstance(self._values, ExtensionArray):
- # Dispatch to the ExtensionArray's .equals method.
- if not isinstance(other, type(self)):
- return False
-
- earr = cast(ExtensionArray, self._data)
- return earr.equals(other._data)
-
- if is_extension_array_dtype(other.dtype):
- # All EA-backed Index subclasses override equals
- return other.equals(self)
-
- return array_equivalent(self._values, other._values)
-
- @final
- def identical(self, other) -> bool:
- """
- Similar to equals, but checks that object attributes and types are also equal.
-
- Returns
- -------
- bool
- True if the two Index objects have equal elements and the same type,
- otherwise False.
- """
- return (
- self.equals(other)
- and all(
- getattr(self, c, None) == getattr(other, c, None)
- for c in self._comparables
- )
- and type(self) == type(other)
- and self.dtype == other.dtype
- )
-
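
The contrast between ``equals`` and ``identical`` in one sketch, echoing the dtype note in the ``equals`` examples above (illustrative only):

import pandas as pd

int64_idx = pd.Index([1, 2, 3], dtype="int64")
uint64_idx = pd.Index([1, 2, 3], dtype="uint64")

int64_idx.equals(uint64_idx)      # True  -- dtype is not compared
int64_idx.identical(uint64_idx)   # False -- identical also requires matching dtype
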
- @final
- def asof(self, label):
- """
- Return the label from the index, or, if not present, the previous one.
-
- Assuming that the index is sorted, return the passed index label if it
- is in the index, or return the previous index label if the passed one
- is not in the index.
-
- Parameters
- ----------
- label : object
- The label up to which the method returns the latest index label.
-
- Returns
- -------
- object
- The passed label if it is in the index. The previous label if the
- passed label is not in the sorted index or `NaN` if there is no
- such label.
-
- See Also
- --------
- Series.asof : Return the latest value in a Series up to the
- passed index.
- merge_asof : Perform an asof merge (similar to left join but it
- matches on nearest key rather than equal key).
- Index.get_loc : An `asof` is a thin wrapper around `get_loc`
- with method='pad'.
-
- Examples
- --------
- `Index.asof` returns the latest index label up to the passed label.
-
- >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03'])
- >>> idx.asof('2014-01-01')
- '2013-12-31'
-
- If the label is in the index, the method returns the passed label.
-
- >>> idx.asof('2014-01-02')
- '2014-01-02'
-
- If all of the labels in the index are later than the passed label,
- NaN is returned.
-
- >>> idx.asof('1999-01-02')
- nan
-
- If the index is not sorted, an error is raised.
-
- >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02',
- ... '2014-01-03'])
- >>> idx_not_sorted.asof('2013-12-31')
- Traceback (most recent call last):
- ValueError: index must be monotonic increasing or decreasing
- """
- self._searchsorted_monotonic(label) # validate sortedness
- try:
- loc = self.get_loc(label)
- except (KeyError, TypeError):
- # KeyError -> No exact match, try for padded
- # TypeError -> passed e.g. non-hashable, fall through to get
- # the tested exception message
- indexer = self.get_indexer([label], method="pad")
- if indexer.ndim > 1 or indexer.size > 1:
- raise TypeError("asof requires scalar valued input")
- loc = indexer.item()
- if loc == -1:
- return self._na_value
- else:
- if isinstance(loc, slice):
- loc = loc.indices(len(self))[-1]
-
- return self[loc]
-
- def asof_locs(
- self, where: Index, mask: npt.NDArray[np.bool_]
- ) -> npt.NDArray[np.intp]:
- """
- Return the locations (indices) of labels in the index.
-
- As in the `asof` function, if the label (a particular entry in
- `where`) is not in the index, the latest index label up to the
- passed label is chosen and its index returned.
-
- If all of the labels in the index are later than a label in `where`,
- -1 is returned.
-
- `mask` is used to ignore NA values in the index during calculation.
-
- Parameters
- ----------
- where : Index
- An Index consisting of an array of timestamps.
- mask : np.ndarray[bool]
- Array of booleans denoting where values in the original
- data are not NA.
-
- Returns
- -------
- np.ndarray[np.intp]
- An array of locations (indices) of the labels from the Index
- which correspond to the return values of the `asof` function
- for every element in `where`.
- """
- # error: No overload variant of "searchsorted" of "ndarray" matches argument
- # types "Union[ExtensionArray, ndarray[Any, Any]]", "str"
- # TODO: will be fixed when ExtensionArray.searchsorted() is fixed
- locs = self._values[mask].searchsorted(
- where._values, side="right" # type: ignore[call-overload]
- )
- locs = np.where(locs > 0, locs - 1, 0)
-
- result = np.arange(len(self), dtype=np.intp)[mask].take(locs)
-
- first_value = self._values[mask.argmax()]
- result[(locs == 0) & (where._values < first_value)] = -1
-
- return result
-
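A small sketch of what `asof_locs` computes, assuming an index with no NA values (so `mask` is all True; the values are illustrative):

    import numpy as np
    import pandas as pd

    idx = pd.Index([10, 20, 30])
    mask = np.ones(len(idx), dtype=bool)  # no NA values to ignore

    # 15 falls back to position 0 (label 10), 5 precedes every label so -1,
    # and 30 is an exact match at position 2.
    print(idx.asof_locs(pd.Index([15, 5, 30]), mask))  # [ 0 -1  2]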
- def sort_values(
- self,
- return_indexer: bool = False,
- ascending: bool = True,
- na_position: str_t = "last",
- key: Callable | None = None,
- ):
- """
- Return a sorted copy of the index.
-
- Return a sorted copy of the index, and optionally return the indices
- that sorted the index itself.
-
- Parameters
- ----------
- return_indexer : bool, default False
- Should the indices that would sort the index be returned.
- ascending : bool, default True
- Should the index values be sorted in an ascending order.
- na_position : {'first' or 'last'}, default 'last'
- Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
- the end.
-
- .. versionadded:: 1.2.0
-
- key : callable, optional
- If not None, apply the key function to the index values
- before sorting. This is similar to the `key` argument in the
- builtin :meth:`sorted` function, with the notable difference that
- this `key` function should be *vectorized*. It should expect an
- ``Index`` and return an ``Index`` of the same shape.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- sorted_index : pandas.Index
- Sorted copy of the index.
- indexer : numpy.ndarray, optional
- The indices that the index itself was sorted by.
-
- See Also
- --------
- Series.sort_values : Sort values of a Series.
- DataFrame.sort_values : Sort values in a DataFrame.
-
- Examples
- --------
- >>> idx = pd.Index([10, 100, 1, 1000])
- >>> idx
- Index([10, 100, 1, 1000], dtype='int64')
-
- Sort values in ascending order (default behavior).
-
- >>> idx.sort_values()
- Index([1, 10, 100, 1000], dtype='int64')
-
- Sort values in descending order, and also get the indices `idx` was
- sorted by.
-
- >>> idx.sort_values(ascending=False, return_indexer=True)
- (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2]))
- """
- idx = ensure_key_mapped(self, key)
-
- # GH 35584. Sort missing values according to na_position kwarg
- # ignore na_position for MultiIndex
- if not isinstance(self, ABCMultiIndex):
- _as = nargsort(
- items=idx, ascending=ascending, na_position=na_position, key=key
- )
- else:
- _as = idx.argsort()
- if not ascending:
- _as = _as[::-1]
-
- sorted_index = self.take(_as)
-
- if return_indexer:
- return sorted_index, _as
- else:
- return sorted_index
-
- @final
- def sort(self, *args, **kwargs):
- """
- Use sort_values instead.
- """
- raise TypeError("cannot sort an Index object in-place, use sort_values instead")
-
- def shift(self, periods: int = 1, freq=None):
- """
- Shift index by desired number of time frequency increments.
-
- This method is for shifting the values of datetime-like indexes
- by a specified time increment a given number of times.
-
- Parameters
- ----------
- periods : int, default 1
- Number of periods (or increments) to shift by,
- can be positive or negative.
- freq : pandas.DateOffset, pandas.Timedelta or str, optional
- Frequency increment to shift by.
- If None, the index is shifted by its own `freq` attribute.
- Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc.
-
- Returns
- -------
- pandas.Index
- Shifted index.
-
- See Also
- --------
- Series.shift : Shift values of Series.
-
- Notes
- -----
- This method is only implemented for datetime-like index classes,
- i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex.
-
- Examples
- --------
- Put the first 5 month starts of 2011 into an index.
-
- >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS')
- >>> month_starts
- DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01',
- '2011-05-01'],
- dtype='datetime64[ns]', freq='MS')
-
- Shift the index by 10 days.
-
- >>> month_starts.shift(10, freq='D')
- DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11',
- '2011-05-11'],
- dtype='datetime64[ns]', freq=None)
-
- The default value of `freq` is the `freq` attribute of the index,
- which is 'MS' (month start) in this example.
-
- >>> month_starts.shift(10)
- DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01',
- '2012-03-01'],
- dtype='datetime64[ns]', freq='MS')
- """
- raise NotImplementedError(
- f"This method is only implemented for DatetimeIndex, PeriodIndex and "
- f"TimedeltaIndex; Got type {type(self).__name__}"
- )
-
- def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
- """
- Return the integer indices that would sort the index.
-
- Parameters
- ----------
- *args
- Passed to `numpy.ndarray.argsort`.
- **kwargs
- Passed to `numpy.ndarray.argsort`.
-
- Returns
- -------
- np.ndarray[np.intp]
- Integer indices that would sort the index if used as
- an indexer.
-
- See Also
- --------
- numpy.argsort : Similar method for NumPy arrays.
- Index.sort_values : Return sorted copy of Index.
-
- Examples
- --------
- >>> idx = pd.Index(['b', 'a', 'd', 'c'])
- >>> idx
- Index(['b', 'a', 'd', 'c'], dtype='object')
-
- >>> order = idx.argsort()
- >>> order
- array([1, 0, 3, 2])
-
- >>> idx[order]
- Index(['a', 'b', 'c', 'd'], dtype='object')
- """
- # This works for either ndarray or EA, is overridden
- # by RangeIndex, MultiIndex
- return self._data.argsort(*args, **kwargs)
-
- def _check_indexing_error(self, key):
- if not is_scalar(key):
- # if key is not a scalar, directly raise an error (the code below
- # would convert to numpy arrays and raise later any way) - GH29926
- raise InvalidIndexError(key)
-
- @cache_readonly
- def _should_fallback_to_positional(self) -> bool:
- """
- Should an integer key be treated as positional?
- """
- return self.inferred_type not in {
- "integer",
- "mixed-integer",
- "floating",
- "complex",
- }
-
- _index_shared_docs[
- "get_indexer_non_unique"
- ] = """
- Compute indexer and mask for new index given the current index.
-
- The indexer should then be used as an input to ndarray.take to align the
- current data to the new index.
-
- Parameters
- ----------
- target : %(target_klass)s
-
- Returns
- -------
- indexer : np.ndarray[np.intp]
- Integers from 0 to n - 1 indicating that the index at these
- positions matches the corresponding target values. Missing values
- in the target are marked by -1.
- missing : np.ndarray[np.intp]
- An indexer into the target of the values not found.
- These correspond to the -1 in the indexer array.
-
- Examples
- --------
- >>> index = pd.Index(['c', 'b', 'a', 'b', 'b'])
- >>> index.get_indexer_non_unique(['b', 'b'])
- (array([1, 3, 4, 1, 3, 4]), array([], dtype=int64))
-
- In the example below there are no matched values.
-
- >>> index = pd.Index(['c', 'b', 'a', 'b', 'b'])
- >>> index.get_indexer_non_unique(['q', 'r', 't'])
- (array([-1, -1, -1]), array([0, 1, 2]))
-
- Because there are no matches here, the returned ``indexer`` contains only -1,
- showing that none of the ``target`` values were found in the index. The
- second array, [0, 1, 2], indicates that the first, second, and third
- target elements are missing.
-
- Notice that the return value is a tuple containing two items. In the example
- below, the first item is an array of locations in ``index``. The second
- item indicates that the first and third target elements are missing.
-
- >>> index = pd.Index(['c', 'b', 'a', 'b', 'b'])
- >>> index.get_indexer_non_unique(['f', 'b', 's'])
- (array([-1, 1, 3, 4, -1]), array([0, 2]))
- """
-
- @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
- def get_indexer_non_unique(
- self, target
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- target = ensure_index(target)
- target = self._maybe_cast_listlike_indexer(target)
-
- if not self._should_compare(target) and not self._should_partial_index(target):
- # _should_partial_index e.g. IntervalIndex with numeric scalars
- # that can be matched to Interval scalars.
- return self._get_indexer_non_comparable(target, method=None, unique=False)
-
- pself, ptarget = self._maybe_promote(target)
- if pself is not self or ptarget is not target:
- return pself.get_indexer_non_unique(ptarget)
-
- if not is_dtype_equal(self.dtype, target.dtype):
- # TODO: if object, could use infer_dtype to preempt costly
- # conversion if still non-comparable?
- dtype = self._find_common_type_compat(target)
-
- this = self.astype(dtype, copy=False)
- that = target.astype(dtype, copy=False)
- return this.get_indexer_non_unique(that)
-
- # TODO: get_indexer has fastpaths for both Categorical-self and
- # Categorical-target. Can we do something similar here?
-
- # Note: _maybe_promote ensures we never get here with MultiIndex
- # self and non-Multi target
- tgt_values = target._get_engine_target()
- if self._is_multi and target._is_multi:
- engine = self._engine
- # Item "IndexEngine" of "Union[IndexEngine, ExtensionEngine]" has
- # no attribute "_extract_level_codes"
- tgt_values = engine._extract_level_codes(target) # type: ignore[union-attr]
-
- indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
- return ensure_platform_int(indexer), ensure_platform_int(missing)
-
- @final
- def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
- """
- Guaranteed return of an indexer even when non-unique.
-
- This dispatches to get_indexer or get_indexer_non_unique
- as appropriate.
-
- Returns
- -------
- np.ndarray[np.intp]
- List of indices.
-
- Examples
- --------
- >>> idx = pd.Index([np.nan, 'var1', np.nan])
- >>> idx.get_indexer_for([np.nan])
- array([0, 2])
- """
- if self._index_as_unique:
- return self.get_indexer(target)
- indexer, _ = self.get_indexer_non_unique(target)
- return indexer
-
- def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]:
- """
- Analogue to get_indexer that raises if any elements are missing.
- """
- keyarr = key
- if not isinstance(keyarr, Index):
- keyarr = com.asarray_tuplesafe(keyarr)
-
- if self._index_as_unique:
- indexer = self.get_indexer_for(keyarr)
- keyarr = self.reindex(keyarr)[0]
- else:
- keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-
- self._raise_if_missing(keyarr, indexer, axis_name)
-
- keyarr = self.take(indexer)
- if isinstance(key, Index):
- # GH 42790 - Preserve name from an Index
- keyarr.name = key.name
- if (
- isinstance(keyarr.dtype, np.dtype) and keyarr.dtype.kind in ["m", "M"]
- ) or isinstance(keyarr.dtype, DatetimeTZDtype):
- # DTI/TDI.take can infer a freq in some cases when we dont want one
- if isinstance(key, list) or (
- isinstance(key, type(self))
- # "Index" has no attribute "freq"
- and key.freq is None # type: ignore[attr-defined]
- ):
- keyarr = keyarr._with_freq(None)
-
- return keyarr, indexer
-
- def _raise_if_missing(self, key, indexer, axis_name: str_t) -> None:
- """
- Check that the indexer can be used to return a result.
-
- That is, at least one element must have been found,
- unless the list of keys was empty.
-
- Parameters
- ----------
- key : list-like
- Targeted labels (only used to show correct error message).
- indexer : array-like of ints
- Indices corresponding to the key,
- (with -1 indicating not found).
- axis_name : str
-
- Raises
- ------
- KeyError
- If at least one key was requested but none was found.
- """
- if len(key) == 0:
- return
-
- # Count missing values
- missing_mask = indexer < 0
- nmissing = missing_mask.sum()
-
- if nmissing:
- # TODO: remove special-case; this is just to keep exception
- # message tests from raising while debugging
- use_interval_msg = is_interval_dtype(self.dtype) or (
- is_categorical_dtype(self.dtype)
- # "Index" has no attribute "categories" [attr-defined]
- and is_interval_dtype(
- self.categories.dtype # type: ignore[attr-defined]
- )
- )
-
- if nmissing == len(indexer):
- if use_interval_msg:
- key = list(key)
- raise KeyError(f"None of [{key}] are in the [{axis_name}]")
-
- not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
- raise KeyError(f"{not_found} not in index")
-
- @overload
- def _get_indexer_non_comparable(
- self, target: Index, method, unique: Literal[True] = ...
- ) -> npt.NDArray[np.intp]:
- ...
-
- @overload
- def _get_indexer_non_comparable(
- self, target: Index, method, unique: Literal[False]
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- ...
-
- @overload
- def _get_indexer_non_comparable(
- self, target: Index, method, unique: bool = True
- ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- ...
-
- @final
- def _get_indexer_non_comparable(
- self, target: Index, method, unique: bool = True
- ) -> npt.NDArray[np.intp] | tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- """
- Called from get_indexer or get_indexer_non_unique when the target
- is of a non-comparable dtype.
-
- For get_indexer lookups with method=None, get_indexer is an _equality_
- check, so non-comparable dtypes mean we will always have no matches.
-
- For get_indexer lookups with a method, get_indexer is an _inequality_
- check, so non-comparable dtypes mean we will always raise TypeError.
-
- Parameters
- ----------
- target : Index
- method : str or None
- unique : bool, default True
- * True if called from get_indexer.
- * False if called from get_indexer_non_unique.
-
- Raises
- ------
- TypeError
- If doing an inequality check, i.e. method is not None.
- """
- if method is not None:
- other = _unpack_nested_dtype(target)
- raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}")
-
- no_matches = -1 * np.ones(target.shape, dtype=np.intp)
- if unique:
- # This is for get_indexer
- return no_matches
- else:
- # This is for get_indexer_non_unique
- missing = np.arange(len(target), dtype=np.intp)
- return no_matches, missing
-
- @property
- def _index_as_unique(self) -> bool:
- """
- Whether we should treat this as unique for the sake of
- get_indexer vs get_indexer_non_unique.
-
- For IntervalIndex compat.
- """
- return self.is_unique
-
- _requires_unique_msg = "Reindexing only valid with uniquely valued Index objects"
-
- @final
- def _maybe_promote(self, other: Index) -> tuple[Index, Index]:
- """
- When dealing with an object-dtype Index and a non-object Index, see
- if we can upcast the object-dtype one to improve performance.
- """
-
- if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex):
- if (
- self.tz is not None
- and other.tz is not None
- and not tz_compare(self.tz, other.tz)
- ):
- # standardize on UTC
- return self.tz_convert("UTC"), other.tz_convert("UTC")
-
- elif self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex):
- try:
- return type(other)(self), other
- except OutOfBoundsDatetime:
- return self, other
- elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex):
- # TODO: we dont have tests that get here
- return type(other)(self), other
-
- elif self.dtype.kind == "u" and other.dtype.kind == "i":
- # GH#41873
- if other.min() >= 0:
- # lookup min as it may be cached
- # TODO: may need itemsize check if we have non-64-bit Indexes
- return self, other.astype(self.dtype)
-
- elif self._is_multi and not other._is_multi:
- try:
- # "Type[Index]" has no attribute "from_tuples"
- other = type(self).from_tuples(other) # type: ignore[attr-defined]
- except (TypeError, ValueError):
- # let's instead try with a straight Index
- self = Index(self._values)
-
- if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype):
- # Reverse op so we dont need to re-implement on the subclasses
- other, self = other._maybe_promote(self)
-
- return self, other
-
- @final
- def _find_common_type_compat(self, target) -> DtypeObj:
- """
- Implementation of find_common_type that adjusts for Index-specific
- special cases.
- """
- target_dtype, _ = infer_dtype_from(target, pandas_dtype=True)
-
- # special case: if one dtype is uint64 and the other a signed int, return object
- # See https://github.com/pandas-dev/pandas/issues/26778 for discussion
- # Now it's:
- # * float | [u]int -> float
- # * uint64 | signed int -> object
- # We may change union(float | [u]int) to go to object.
- if self.dtype == "uint64" or target_dtype == "uint64":
- if is_signed_integer_dtype(self.dtype) or is_signed_integer_dtype(
- target_dtype
- ):
- return _dtype_obj
-
- dtype = find_result_type(self._values, target)
- dtype = common_dtype_categorical_compat([self, target], dtype)
- return dtype
-
- @final
- def _should_compare(self, other: Index) -> bool:
- """
- Check if `self == other` can ever have non-False entries.
- """
-
- if (is_bool_dtype(other) and is_any_real_numeric_dtype(self)) or (
- is_bool_dtype(self) and is_any_real_numeric_dtype(other)
- ):
- # GH#16877 Treat boolean labels passed to a numeric index as not
- # found. Without this fix False and True would be treated as 0 and 1
- # respectively.
- return False
-
- other = _unpack_nested_dtype(other)
- dtype = other.dtype
- return self._is_comparable_dtype(dtype) or is_object_dtype(dtype)
-
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- """
- Can we compare values of the given dtype to our own?
- """
- if self.dtype.kind == "b":
- return dtype.kind == "b"
- elif is_numeric_dtype(self.dtype):
- return is_numeric_dtype(dtype)
- # TODO: this was written assuming we only get here with object-dtype,
- # which is no longer correct. Can we specialize for EA?
- return True
-
- @final
- def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]:
- """
- Group the index labels by a given array of values.
-
- Parameters
- ----------
- values : array
- Values used to determine the groups.
-
- Returns
- -------
- dict
- {group name -> group labels}
- """
- # TODO: if we are a MultiIndex, we can do better
- # than converting to tuples
- if isinstance(values, ABCMultiIndex):
- values = values._values
- values = Categorical(values)
- result = values._reverse_indexer()
-
- # map to the label
- result = {k: self.take(v) for k, v in result.items()}
-
- return PrettyDict(result)
-
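For illustration, a minimal sketch of the grouping this performs (labels and values are arbitrary; output shown roughly):

    import numpy as np
    import pandas as pd

    idx = pd.Index(["a", "b", "c", "d"])
    groups = idx.groupby(np.array([1, 1, 2, 2]))
    # roughly: {1: Index(['a', 'b'], dtype='object'),
    #           2: Index(['c', 'd'], dtype='object')}
    print(groups)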
- def map(self, mapper, na_action=None):
- """
- Map values using an input mapping or function.
-
- Parameters
- ----------
- mapper : function, dict, or Series
- Mapping correspondence.
- na_action : {None, 'ignore'}
- If 'ignore', propagate NA values, without passing them to the
- mapping correspondence.
-
- Returns
- -------
- Union[Index, MultiIndex]
- The output of the mapping function applied to the index.
- If the function returns a tuple with more than one element
- a MultiIndex will be returned.
- """
- from pandas.core.indexes.multi import MultiIndex
-
- new_values = self._map_values(mapper, na_action=na_action)
-
- # we can return a MultiIndex
- if new_values.size and isinstance(new_values[0], tuple):
- if isinstance(self, MultiIndex):
- names = self.names
- elif self.name:
- names = [self.name] * len(new_values[0])
- else:
- names = None
- return MultiIndex.from_tuples(new_values, names=names)
-
- dtype = None
- if not new_values.size:
- # empty
- dtype = self.dtype
-
- # e.g. if we are floating and new_values is all ints, then we
- # don't want to cast back to floating. But if we are UInt64
- # and new_values is all ints, we want to try.
- same_dtype = lib.infer_dtype(new_values, skipna=False) == self.inferred_type
- if same_dtype:
- new_values = maybe_cast_pointwise_result(
- new_values, self.dtype, same_dtype=same_dtype
- )
-
- return Index._with_infer(new_values, dtype=dtype, copy=False, name=self.name)
-
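A minimal usage sketch of `map` with a function and with a dict (illustrative values):

    import pandas as pd

    idx = pd.Index([1, 2, 3])
    print(idx.map(lambda x: x * 10))  # Index([10, 20, 30], dtype='int64')
    print(idx.map({1: "a", 2: "b"}))  # Index(['a', 'b', nan], dtype='object')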
- # TODO: De-duplicate with map, xref GH#32349
- @final
- def _transform_index(self, func, *, level=None) -> Index:
- """
- Apply function to all values found in index.
-
- This includes transforming multiindex entries separately.
- Only apply function to one level of the MultiIndex if level is specified.
- """
- if isinstance(self, ABCMultiIndex):
- values = [
- self.get_level_values(i).map(func)
- if i == level or level is None
- else self.get_level_values(i)
- for i in range(self.nlevels)
- ]
- return type(self).from_arrays(values)
- else:
- items = [func(x) for x in self]
- return Index(items, name=self.name, tupleize_cols=False)
-
- def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
- """
- Return a boolean array where the index values are in `values`.
-
- Compute boolean array of whether each index value is found in the
- passed set of values. The length of the returned boolean array matches
- the length of the index.
-
- Parameters
- ----------
- values : set or list-like
- Sought values.
- level : str or int, optional
- Name or position of the index level to use (if the index is a
- `MultiIndex`).
-
- Returns
- -------
- np.ndarray[bool]
- NumPy array of boolean values.
-
- See Also
- --------
- Series.isin : Same for Series.
- DataFrame.isin : Same method for DataFrames.
-
- Notes
- -----
- In the case of `MultiIndex` you must either specify `values` as a
- list-like object containing tuples that are the same length as the
- number of levels, or specify `level`. Otherwise it will raise a
- ``ValueError``.
-
- If `level` is specified:
-
- - if it is the name of one *and only one* index level, use that level;
- - otherwise it should be a number indicating level position.
-
- Examples
- --------
- >>> idx = pd.Index([1,2,3])
- >>> idx
- Index([1, 2, 3], dtype='int64')
-
- Check whether each index value is in a list of values.
-
- >>> idx.isin([1, 4])
- array([ True, False, False])
-
- >>> midx = pd.MultiIndex.from_arrays([[1,2,3],
- ... ['red', 'blue', 'green']],
- ... names=('number', 'color'))
- >>> midx
- MultiIndex([(1, 'red'),
- (2, 'blue'),
- (3, 'green')],
- names=['number', 'color'])
-
- Check whether the strings in the 'color' level of the MultiIndex
- are in a list of colors.
-
- >>> midx.isin(['red', 'orange', 'yellow'], level='color')
- array([ True, False, False])
-
- To check across the levels of a MultiIndex, pass a list of tuples:
-
- >>> midx.isin([(1, 'red'), (3, 'red')])
- array([ True, False, False])
-
- For a DatetimeIndex, string values in `values` are converted to
- Timestamps.
-
- >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13']
- >>> dti = pd.to_datetime(dates)
- >>> dti
- DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'],
- dtype='datetime64[ns]', freq=None)
-
- >>> dti.isin(['2000-03-11'])
- array([ True, False, False])
- """
- if level is not None:
- self._validate_index_level(level)
- return algos.isin(self._values, values)
-
- def _get_string_slice(self, key: str_t):
- # this is for partial string indexing,
- # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex
- raise NotImplementedError
-
- def slice_indexer(
- self,
- start: Hashable | None = None,
- end: Hashable | None = None,
- step: int | None = None,
- ) -> slice:
- """
- Compute the slice indexer for input labels and step.
-
- Index needs to be ordered and unique.
-
- Parameters
- ----------
- start : label, default None
- If None, defaults to the beginning.
- end : label, default None
- If None, defaults to the end.
- step : int, default None
-
- Returns
- -------
- slice
-
- Raises
- ------
- KeyError : If key does not exist, or key is not unique and index is
- not ordered.
-
- Notes
- -----
- This function assumes that the data is sorted, so use at your own peril
-
- Examples
- --------
- This is a method on all index types. For example you can do:
-
- >>> idx = pd.Index(list('abcd'))
- >>> idx.slice_indexer(start='b', end='c')
- slice(1, 3, None)
-
- >>> idx = pd.MultiIndex.from_arrays([list('abcd'), list('efgh')])
- >>> idx.slice_indexer(start='b', end=('c', 'g'))
- slice(1, 3, None)
- """
- start_slice, end_slice = self.slice_locs(start, end, step=step)
-
- # return a slice
- if not is_scalar(start_slice):
- raise AssertionError("Start slice bound is non-scalar")
- if not is_scalar(end_slice):
- raise AssertionError("End slice bound is non-scalar")
-
- return slice(start_slice, end_slice, step)
-
- def _maybe_cast_indexer(self, key):
- """
- If we have a float key and are not a floating index, then try to cast
- to an int if equivalent.
- """
- return key
-
- def _maybe_cast_listlike_indexer(self, target) -> Index:
- """
- Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
- """
- return ensure_index(target)
-
- @final
- def _validate_indexer(self, form: str_t, key, kind: str_t) -> None:
- """
- If this is a positional indexer, validate that the bound is
- appropriately typed, i.e. an integer (or None).
- """
- assert kind in ["getitem", "iloc"]
-
- if key is not None and not is_integer(key):
- self._raise_invalid_indexer(form, key)
-
- def _maybe_cast_slice_bound(self, label, side: str_t):
- """
- This function should be overloaded in subclasses that allow non-trivial
- casting on label-slice bounds, e.g. datetime-like indices allowing
- strings containing formatted datetimes.
-
- Parameters
- ----------
- label : object
- side : {'left', 'right'}
-
- Returns
- -------
- label : object
-
- Notes
- -----
- Value of `side` parameter should be validated in caller.
- """
-
- # We are a plain index here (sub-class override this method if they
- # wish to have special treatment for floats/ints, e.g. datetimelike Indexes
-
- if is_numeric_dtype(self.dtype):
- return self._maybe_cast_indexer(label)
-
- # reject them, if index does not contain label
- if (is_float(label) or is_integer(label)) and label not in self:
- self._raise_invalid_indexer("slice", label)
-
- return label
-
- def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"):
- if self.is_monotonic_increasing:
- return self.searchsorted(label, side=side)
- elif self.is_monotonic_decreasing:
- # np.searchsorted expects ascending sort order, have to reverse
- # everything for it to work (element ordering, search side and
- # resulting value).
- pos = self[::-1].searchsorted(
- label, side="right" if side == "left" else "left"
- )
- return len(self) - pos
-
- raise ValueError("index must be monotonic increasing or decreasing")
-
- def get_slice_bound(self, label, side: Literal["left", "right"]) -> int:
- """
- Calculate slice bound that corresponds to given label.
-
- Returns leftmost (one-past-the-rightmost if ``side=='right'``) position
- of given label.
-
- Parameters
- ----------
- label : object
- side : {'left', 'right'}
-
- Returns
- -------
- int
- Index of label.
- """
-
- if side not in ("left", "right"):
- raise ValueError(
- "Invalid value for side kwarg, must be either "
- f"'left' or 'right': {side}"
- )
-
- original_label = label
-
- # For datetime indices label may be a string that has to be converted
- # to datetime boundary according to its resolution.
- label = self._maybe_cast_slice_bound(label, side)
-
- # we need to look up the label
- try:
- slc = self.get_loc(label)
- except KeyError as err:
- try:
- return self._searchsorted_monotonic(label, side)
- except ValueError:
- # raise the original KeyError
- raise err
-
- if isinstance(slc, np.ndarray):
- # get_loc may return a boolean array, which
- # is OK as long as they are representable by a slice.
- assert is_bool_dtype(slc.dtype)
- slc = lib.maybe_booleans_to_slice(slc.view("u1"))
- if isinstance(slc, np.ndarray):
- raise KeyError(
- f"Cannot get {side} slice bound for non-unique "
- f"label: {repr(original_label)}"
- )
-
- if isinstance(slc, slice):
- if side == "left":
- return slc.start
- else:
- return slc.stop
- else:
- if side == "right":
- return slc + 1
- else:
- return slc
-
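A short sketch of the positions returned for each side (illustrative labels):

    import pandas as pd

    idx = pd.Index(list("abcde"))
    print(idx.get_slice_bound("c", side="left"))   # 2
    print(idx.get_slice_bound("c", side="right"))  # 3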
- def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]:
- """
- Compute slice locations for input labels.
-
- Parameters
- ----------
- start : label, default None
- If None, defaults to the beginning.
- end : label, default None
- If None, defaults to the end.
- step : int, default None
- If None, defaults to 1.
-
- Returns
- -------
- tuple[int, int]
-
- See Also
- --------
- Index.get_loc : Get location for a single label.
-
- Notes
- -----
- This method only works if the index is monotonic or unique.
-
- Examples
- --------
- >>> idx = pd.Index(list('abcd'))
- >>> idx.slice_locs(start='b', end='c')
- (1, 3)
- """
- inc = step is None or step >= 0
-
- if not inc:
- # If it's a reverse slice, temporarily swap bounds.
- start, end = end, start
-
- # GH 16785: If start and end happen to be date strings with UTC offsets
- # attempt to parse and check that the offsets are the same
- if isinstance(start, (str, datetime)) and isinstance(end, (str, datetime)):
- try:
- ts_start = Timestamp(start)
- ts_end = Timestamp(end)
- except (ValueError, TypeError):
- pass
- else:
- if not tz_compare(ts_start.tzinfo, ts_end.tzinfo):
- raise ValueError("Both dates must have the same UTC offset")
-
- start_slice = None
- if start is not None:
- start_slice = self.get_slice_bound(start, "left")
- if start_slice is None:
- start_slice = 0
-
- end_slice = None
- if end is not None:
- end_slice = self.get_slice_bound(end, "right")
- if end_slice is None:
- end_slice = len(self)
-
- if not inc:
- # Bounds at this moment are swapped, swap them back and shift by 1.
- #
- # slice_locs('B', 'A', step=-1): s='B', e='A'
- #
- # s='A' e='B'
- # AFTER SWAP: | |
- # v ------------------> V
- # -----------------------------------
- # | | |A|A|A|A| | | | | |B|B| | | | |
- # -----------------------------------
- # ^ <------------------ ^
- # SHOULD BE: | |
- # end=s-1 start=e-1
- #
- end_slice, start_slice = start_slice - 1, end_slice - 1
-
- # i == -1 triggers ``len(self) + i`` selection that points to the
- # last element, not before-the-first one, subtracting len(self)
- # compensates that.
- if end_slice == -1:
- end_slice -= len(self)
- if start_slice == -1:
- start_slice -= len(self)
-
- return start_slice, end_slice
-
- def delete(self: _IndexT, loc) -> _IndexT:
- """
- Make new Index with passed location(-s) deleted.
-
- Parameters
- ----------
- loc : int or list of int
- Location of item(-s) which will be deleted.
- Use a list of locations to delete more than one value at the same time.
-
- Returns
- -------
- Index
- Will be same type as self, except for RangeIndex.
-
- See Also
- --------
- numpy.delete : Delete rows or columns from a NumPy array (ndarray).
-
- Examples
- --------
- >>> idx = pd.Index(['a', 'b', 'c'])
- >>> idx.delete(1)
- Index(['a', 'c'], dtype='object')
-
- >>> idx = pd.Index(['a', 'b', 'c'])
- >>> idx.delete([0, 2])
- Index(['b'], dtype='object')
- """
- values = self._values
- res_values: ArrayLike
- if isinstance(values, np.ndarray):
- # TODO(__array_function__): special casing will be unnecessary
- res_values = np.delete(values, loc)
- else:
- res_values = values.delete(loc)
-
- # _constructor so RangeIndex-> Index with an int64 dtype
- return self._constructor._simple_new(res_values, name=self.name)
-
- def insert(self, loc: int, item) -> Index:
- """
- Make new Index inserting new item at location.
-
- Follows ``numpy.insert`` semantics for negative values.
-
- Parameters
- ----------
- loc : int
- item : object
-
- Returns
- -------
- Index
- """
- item = lib.item_from_zerodim(item)
- if is_valid_na_for_dtype(item, self.dtype) and self.dtype != object:
- item = self._na_value
-
- arr = self._values
-
- try:
- if isinstance(arr, ExtensionArray):
- res_values = arr.insert(loc, item)
- return type(self)._simple_new(res_values, name=self.name)
- else:
- item = self._validate_fill_value(item)
- except (TypeError, ValueError, LossySetitemError):
- # e.g. trying to insert an integer into a DatetimeIndex
- # We cannot keep the same dtype, so cast to the (often object)
- # minimal shared dtype before doing the insert.
- dtype = self._find_common_type_compat(item)
- return self.astype(dtype).insert(loc, item)
-
- if arr.dtype != object or not isinstance(
- item, (tuple, np.datetime64, np.timedelta64)
- ):
- # with object-dtype we need to worry about numpy incorrectly casting
- # dt64/td64 to integer, also about treating tuples as sequences
- # special-casing dt64/td64 https://github.com/numpy/numpy/issues/12550
- casted = arr.dtype.type(item)
- new_values = np.insert(arr, loc, casted)
-
- else:
- # error: No overload variant of "insert" matches argument types
- # "ndarray[Any, Any]", "int", "None"
- new_values = np.insert(arr, loc, None) # type: ignore[call-overload]
- loc = loc if loc >= 0 else loc - 1
- new_values[loc] = item
-
- return Index._with_infer(new_values, name=self.name)
-
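A minimal sketch of the insert semantics, including a negative location (illustrative values):

    import pandas as pd

    idx = pd.Index(["a", "b", "c"])
    print(idx.insert(1, "x"))   # Index(['a', 'x', 'b', 'c'], dtype='object')
    # A negative loc inserts before that position, as numpy.insert does.
    print(idx.insert(-1, "y"))  # Index(['a', 'b', 'y', 'c'], dtype='object')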
- def drop(
- self,
- labels: Index | np.ndarray | Iterable[Hashable],
- errors: IgnoreRaise = "raise",
- ) -> Index:
- """
- Make new Index with passed list of labels deleted.
-
- Parameters
- ----------
- labels : array-like or scalar
- errors : {'ignore', 'raise'}, default 'raise'
- If 'ignore', suppress error and existing labels are dropped.
-
- Returns
- -------
- Index
- Will be same type as self, except for RangeIndex.
-
- Raises
- ------
- KeyError
- If not all of the labels are found in the selected axis
- """
- if not isinstance(labels, Index):
- # avoid materializing e.g. RangeIndex
- arr_dtype = "object" if self.dtype == "object" else None
- labels = com.index_labels_to_array(labels, dtype=arr_dtype)
-
- indexer = self.get_indexer_for(labels)
- mask = indexer == -1
- if mask.any():
- if errors != "ignore":
- raise KeyError(f"{list(labels[mask])} not found in axis")
- indexer = indexer[~mask]
- return self.delete(indexer)
-
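A brief sketch of drop with and without ``errors='ignore'`` (illustrative values):

    import pandas as pd

    idx = pd.Index(["a", "b", "c"])
    print(idx.drop(["b"]))                   # Index(['a', 'c'], dtype='object')
    print(idx.drop(["z"], errors="ignore"))  # Index(['a', 'b', 'c'], dtype='object')
    # idx.drop(["z"]) would raise KeyError: "['z'] not found in axis"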
- def infer_objects(self, copy: bool = True) -> Index:
- """
- If we have an object dtype, try to infer a non-object dtype.
-
- Parameters
- ----------
- copy : bool, default True
- Whether to make a copy in cases where no inference occurs.
- """
- if self._is_multi:
- raise NotImplementedError(
- "infer_objects is not implemented for MultiIndex. "
- "Use index.to_frame().infer_objects() instead."
- )
- if self.dtype != object:
- return self.copy() if copy else self
-
- values = self._values
- values = cast("npt.NDArray[np.object_]", values)
- res_values = lib.maybe_convert_objects(
- values,
- convert_datetime=True,
- convert_timedelta=True,
- convert_period=True,
- convert_interval=True,
- )
- if copy and res_values is values:
- return self.copy()
- result = Index(res_values, name=self.name)
- if not copy and res_values is values and self._references is not None:
- result._references = self._references
- result._references.add_index_reference(result)
- return result
-
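A minimal sketch, assuming an object-dtype index that actually holds plain integers:

    import pandas as pd

    obj_idx = pd.Index([1, 2, 3], dtype=object)
    print(obj_idx.dtype)                  # object
    print(obj_idx.infer_objects().dtype)  # int64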
- # --------------------------------------------------------------------
- # Generated Arithmetic, Comparison, and Unary Methods
-
- def _cmp_method(self, other, op):
- """
- Wrapper used to dispatch comparison operations.
- """
- if self.is_(other):
- # fastpath
- if op in {operator.eq, operator.le, operator.ge}:
- arr = np.ones(len(self), dtype=bool)
- if self._can_hold_na and not isinstance(self, ABCMultiIndex):
- # TODO: should set MultiIndex._can_hold_na = False?
- arr[self.isna()] = False
- return arr
- elif op is operator.ne:
- arr = np.zeros(len(self), dtype=bool)
- if self._can_hold_na and not isinstance(self, ABCMultiIndex):
- arr[self.isna()] = True
- return arr
-
- if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)) and len(
- self
- ) != len(other):
- raise ValueError("Lengths must match to compare")
-
- if not isinstance(other, ABCMultiIndex):
- other = extract_array(other, extract_numpy=True)
- else:
- other = np.asarray(other)
-
- if is_object_dtype(self.dtype) and isinstance(other, ExtensionArray):
- # e.g. PeriodArray, Categorical
- with np.errstate(all="ignore"):
- result = op(self._values, other)
-
- elif isinstance(self._values, ExtensionArray):
- result = op(self._values, other)
-
- elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex):
- # don't pass MultiIndex
- with np.errstate(all="ignore"):
- result = ops.comp_method_OBJECT_ARRAY(op, self._values, other)
-
- else:
- with np.errstate(all="ignore"):
- result = ops.comparison_op(self._values, other, op)
-
- return result
-
- @final
- def _logical_method(self, other, op):
- res_name = ops.get_op_result_name(self, other)
-
- lvalues = self._values
- rvalues = extract_array(other, extract_numpy=True, extract_range=True)
-
- res_values = ops.logical_op(lvalues, rvalues, op)
- return self._construct_result(res_values, name=res_name)
-
- @final
- def _construct_result(self, result, name):
- if isinstance(result, tuple):
- return (
- Index(result[0], name=name, dtype=result[0].dtype),
- Index(result[1], name=name, dtype=result[1].dtype),
- )
- return Index(result, name=name, dtype=result.dtype)
-
- def _arith_method(self, other, op):
- if (
- isinstance(other, Index)
- and is_object_dtype(other.dtype)
- and type(other) is not Index
- ):
- # We return NotImplemented for object-dtype index *subclasses* so they have
- # a chance to implement ops before we unwrap them.
- # See https://github.com/pandas-dev/pandas/issues/31109
- return NotImplemented
-
- return super()._arith_method(other, op)
-
- @final
- def _unary_method(self, op):
- result = op(self._values)
- return Index(result, name=self.name)
-
- def __abs__(self) -> Index:
- return self._unary_method(operator.abs)
-
- def __neg__(self) -> Index:
- return self._unary_method(operator.neg)
-
- def __pos__(self) -> Index:
- return self._unary_method(operator.pos)
-
- def __invert__(self) -> Index:
- # GH#8875
- return self._unary_method(operator.inv)
-
- # --------------------------------------------------------------------
- # Reductions
-
- def any(self, *args, **kwargs):
- """
- Return whether any element is Truthy.
-
- Parameters
- ----------
- *args
- Required for compatibility with numpy.
- **kwargs
- Required for compatibility with numpy.
-
- Returns
- -------
- bool or array-like (if axis is specified)
- A single element array-like may be converted to bool.
-
- See Also
- --------
- Index.all : Return whether all elements are True.
- Series.all : Return whether all elements are True.
-
- Notes
- -----
- Not a Number (NaN), positive infinity and negative infinity
- evaluate to True because these are not equal to zero.
-
- Examples
- --------
- >>> index = pd.Index([0, 1, 2])
- >>> index.any()
- True
-
- >>> index = pd.Index([0, 0, 0])
- >>> index.any()
- False
- """
- nv.validate_any(args, kwargs)
- self._maybe_disable_logical_methods("any")
- # error: Argument 1 to "any" has incompatible type "ArrayLike"; expected
- # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int,
- # float, complex, str, bytes, generic]], Sequence[Sequence[Any]],
- # _SupportsArray]"
- return np.any(self.values) # type: ignore[arg-type]
-
- def all(self, *args, **kwargs):
- """
- Return whether all elements are Truthy.
-
- Parameters
- ----------
- *args
- Required for compatibility with numpy.
- **kwargs
- Required for compatibility with numpy.
-
- Returns
- -------
- bool or array-like (if axis is specified)
- A single element array-like may be converted to bool.
-
- See Also
- --------
- Index.any : Return whether any element in an Index is True.
- Series.any : Return whether any element in a Series is True.
- Series.all : Return whether all elements in a Series are True.
-
- Notes
- -----
- Not a Number (NaN), positive infinity and negative infinity
- evaluate to True because these are not equal to zero.
-
- Examples
- --------
- True, because nonzero integers are considered True.
-
- >>> pd.Index([1, 2, 3]).all()
- True
-
- False, because ``0`` is considered False.
-
- >>> pd.Index([0, 1, 2]).all()
- False
- """
- nv.validate_all(args, kwargs)
- self._maybe_disable_logical_methods("all")
- # error: Argument 1 to "all" has incompatible type "ArrayLike"; expected
- # "Union[Union[int, float, complex, str, bytes, generic], Sequence[Union[int,
- # float, complex, str, bytes, generic]], Sequence[Sequence[Any]],
- # _SupportsArray]"
- return np.all(self.values) # type: ignore[arg-type]
-
- @final
- def _maybe_disable_logical_methods(self, opname: str_t) -> None:
- """
- raise if this Index subclass does not support any or all.
- """
- if (
- isinstance(self, ABCMultiIndex)
- or needs_i8_conversion(self.dtype)
- or is_interval_dtype(self.dtype)
- or is_categorical_dtype(self.dtype)
- or is_float_dtype(self.dtype)
- ):
- # This call will raise
- make_invalid_op(opname)(self)
-
- @Appender(IndexOpsMixin.argmin.__doc__)
- def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
- nv.validate_argmin(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return -1
- return super().argmin(skipna=skipna)
-
- @Appender(IndexOpsMixin.argmax.__doc__)
- def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
- nv.validate_argmax(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return -1
- return super().argmax(skipna=skipna)
-
- @doc(IndexOpsMixin.min)
- def min(self, axis=None, skipna: bool = True, *args, **kwargs):
- nv.validate_min(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not len(self):
- return self._na_value
-
- if len(self) and self.is_monotonic_increasing:
- # quick check
- first = self[0]
- if not isna(first):
- return first
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return self._na_value
-
- if not self._is_multi and not isinstance(self._values, np.ndarray):
- return self._values._reduce(name="min", skipna=skipna)
-
- return super().min(skipna=skipna)
-
- @doc(IndexOpsMixin.max)
- def max(self, axis=None, skipna: bool = True, *args, **kwargs):
- nv.validate_max(args, kwargs)
- nv.validate_minmax_axis(axis)
-
- if not len(self):
- return self._na_value
-
- if len(self) and self.is_monotonic_increasing:
- # quick check
- last = self[-1]
- if not isna(last):
- return last
-
- if not self._is_multi and self.hasnans:
- # Take advantage of cache
- mask = self._isnan
- if not skipna or mask.all():
- return self._na_value
-
- if not self._is_multi and not isinstance(self._values, np.ndarray):
- return self._values._reduce(name="max", skipna=skipna)
-
- return super().max(skipna=skipna)
-
- # --------------------------------------------------------------------
-
- @final
- @property
- def shape(self) -> Shape:
- """
- Return a tuple of the shape of the underlying data.
- """
- # See GH#27775, GH#27384 for history/reasoning in how this is defined.
- return (len(self),)
-
-
-def ensure_index_from_sequences(sequences, names=None) -> Index:
- """
- Construct an index from sequences of data.
-
- A single sequence returns an Index. Multiple sequences return a
- MultiIndex.
-
- Parameters
- ----------
- sequences : sequence of sequences
- names : sequence of str
-
- Returns
- -------
- index : Index or MultiIndex
-
- Examples
- --------
- >>> ensure_index_from_sequences([[1, 2, 3]], names=["name"])
- Index([1, 2, 3], dtype='int64', name='name')
-
- >>> ensure_index_from_sequences([["a", "a"], ["a", "b"]], names=["L1", "L2"])
- MultiIndex([('a', 'a'),
- ('a', 'b')],
- names=['L1', 'L2'])
-
- See Also
- --------
- ensure_index
- """
- from pandas.core.indexes.multi import MultiIndex
-
- if len(sequences) == 1:
- if names is not None:
- names = names[0]
- return Index(sequences[0], name=names)
- else:
- return MultiIndex.from_arrays(sequences, names=names)
-
-
-def ensure_index(index_like: Axes, copy: bool = False) -> Index:
- """
- Ensure that we have an index from some index-like object.
-
- Parameters
- ----------
- index_like : sequence
- An Index or other sequence
- copy : bool, default False
-
- Returns
- -------
- index : Index or MultiIndex
-
- See Also
- --------
- ensure_index_from_sequences
-
- Examples
- --------
- >>> ensure_index(['a', 'b'])
- Index(['a', 'b'], dtype='object')
-
- >>> ensure_index([('a', 'a'), ('b', 'c')])
- Index([('a', 'a'), ('b', 'c')], dtype='object')
-
- >>> ensure_index([['a', 'a'], ['b', 'c']])
- MultiIndex([('a', 'b'),
- ('a', 'c')],
- )
- """
- if isinstance(index_like, Index):
- if copy:
- index_like = index_like.copy()
- return index_like
-
- if isinstance(index_like, ABCSeries):
- name = index_like.name
- return Index(index_like, name=name, copy=copy)
-
- if is_iterator(index_like):
- index_like = list(index_like)
-
- if isinstance(index_like, list):
- if type(index_like) is not list:
- # must check for exactly list here because of strict type
- # check in clean_index_list
- index_like = list(index_like)
-
- if len(index_like) and lib.is_all_arraylike(index_like):
- from pandas.core.indexes.multi import MultiIndex
-
- return MultiIndex.from_arrays(index_like)
- else:
- return Index(index_like, copy=copy, tupleize_cols=False)
- else:
- return Index(index_like, copy=copy)
-
-
-def ensure_has_len(seq):
- """
- If seq is an iterator, put its values into a list.
- """
- try:
- len(seq)
- except TypeError:
- return list(seq)
- else:
- return seq
-
-
-def trim_front(strings: list[str]) -> list[str]:
- """
- Trims leading spaces that are common to all strings.
-
- Examples
- --------
- >>> trim_front([" a", " b"])
- ['a', 'b']
-
- >>> trim_front([" a", " "])
- ['a', '']
- """
- if not strings:
- return strings
- while all(strings) and all(x[0] == " " for x in strings):
- strings = [x[1:] for x in strings]
- return strings
-
-
-def _validate_join_method(method: str) -> None:
- if method not in ["left", "right", "inner", "outer"]:
- raise ValueError(f"do not recognize join method {method}")
-
-
-def maybe_extract_name(name, obj, cls) -> Hashable:
- """
- If no name is passed, then extract it from data, validating hashability.
- """
- if name is None and isinstance(obj, (Index, ABCSeries)):
- # Note we don't just check for "name" attribute since that would
- # pick up e.g. dtype.name
- name = obj.name
-
- # GH#29069
- if not is_hashable(name):
- raise TypeError(f"{cls.__name__}.name must be a hashable type")
-
- return name
-
-
-def get_unanimous_names(*indexes: Index) -> tuple[Hashable, ...]:
- """
- Return common name if all indices agree, otherwise None (level-by-level).
-
- Parameters
- ----------
- indexes : list of Index objects
-
- Returns
- -------
- tuple
- A tuple of the unanimous 'names' found, with None where the names differ.
- """
- name_tups = [tuple(i.names) for i in indexes]
- name_sets = [{*ns} for ns in zip_longest(*name_tups)]
- names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets)
- return names
-
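A small sketch of the level-by-level name resolution (names are illustrative; the import path assumes this module's location):

    import pandas as pd
    from pandas.core.indexes.base import get_unanimous_names

    a = pd.Index([1, 2], name="key")
    b = pd.Index([3, 4], name="key")
    c = pd.Index([5, 6], name="other")

    print(get_unanimous_names(a, b))  # ('key',)
    print(get_unanimous_names(a, c))  # (None,)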
-
-def _unpack_nested_dtype(other: Index) -> Index:
- """
- When checking if our dtype is comparable with another, we need
- to unpack CategoricalDtype to look at its categories.dtype.
-
- Parameters
- ----------
- other : Index
-
- Returns
- -------
- Index
- """
- from pandas.core.arrays.arrow import ArrowDtype
-
- dtype = other.dtype
- if isinstance(dtype, CategoricalDtype):
- # If there is ever a SparseIndex, this could get dispatched
- # here too.
- return dtype.categories
- elif isinstance(dtype, ArrowDtype):
- # GH 53617
- import pyarrow as pa
-
- if pa.types.is_dictionary(dtype.pyarrow_dtype):
- other = other.astype(ArrowDtype(dtype.pyarrow_dtype.value_type))
- return other
-
-
-def _maybe_try_sort(result, sort):
- if sort is not False:
- try:
- result = algos.safe_sort(result)
- except TypeError as err:
- if sort is True:
- raise
- warnings.warn(
- f"{err}, sort order is undefined for incomparable objects.",
- RuntimeWarning,
- stacklevel=find_stack_level(),
- )
- return result
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/category.py b/contrib/python/pandas/py3/pandas/core/indexes/category.py
deleted file mode 100644
index 51ff92560fe..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/category.py
+++ /dev/null
@@ -1,486 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- Any,
- Hashable,
-)
-
-import numpy as np
-
-from pandas._libs import index as libindex
-from pandas._typing import (
- Dtype,
- DtypeObj,
- npt,
-)
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
- notna,
-)
-
-from pandas.core.arrays.categorical import (
- Categorical,
- contains,
-)
-from pandas.core.construction import extract_array
-import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import (
- Index,
- maybe_extract_name,
-)
-from pandas.core.indexes.extension import (
- NDArrayBackedExtensionIndex,
- inherit_names,
-)
-
-from pandas.io.formats.printing import pprint_thing
-
-_index_doc_kwargs: dict[str, str] = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update({"target_klass": "CategoricalIndex"})
-
-
-@inherit_names(
- [
- "argsort",
- "tolist",
- "codes",
- "categories",
- "ordered",
- "_reverse_indexer",
- "searchsorted",
- "min",
- "max",
- ],
- Categorical,
-)
-@inherit_names(
- [
- "rename_categories",
- "reorder_categories",
- "add_categories",
- "remove_categories",
- "remove_unused_categories",
- "set_categories",
- "as_ordered",
- "as_unordered",
- ],
- Categorical,
- wrap=True,
-)
-class CategoricalIndex(NDArrayBackedExtensionIndex):
- """
- Index based on an underlying :class:`Categorical`.
-
- CategoricalIndex, like Categorical, can only take on a limited,
- and usually fixed, number of possible values (`categories`). Also,
- like Categorical, it might have an order, but numerical operations
- (additions, divisions, ...) are not possible.
-
- Parameters
- ----------
- data : array-like (1-dimensional)
- The values of the categorical. If `categories` are given, values not in
- `categories` will be replaced with NaN.
- categories : index-like, optional
- The categories for the categorical. Items need to be unique.
- If the categories are not given here (and also not in `dtype`), they
- will be inferred from the `data`.
- ordered : bool, optional
- Whether or not this categorical is treated as an ordered
- categorical. If not given here or in `dtype`, the resulting
- categorical will be unordered.
- dtype : CategoricalDtype or "category", optional
- If :class:`CategoricalDtype`, cannot be used together with
- `categories` or `ordered`.
- copy : bool, default False
- Make a copy of input ndarray.
- name : object, optional
- Name to be stored in the index.
-
- Attributes
- ----------
- codes
- categories
- ordered
-
- Methods
- -------
- rename_categories
- reorder_categories
- add_categories
- remove_categories
- remove_unused_categories
- set_categories
- as_ordered
- as_unordered
- map
-
- Raises
- ------
- ValueError
- If the categories do not validate.
- TypeError
- If an explicit ``ordered=True`` is given but no `categories` and the
- `values` are not sortable.
-
- See Also
- --------
- Index : The base pandas Index type.
- Categorical : A categorical array.
- CategoricalDtype : Type for categorical data.
-
- Notes
- -----
- See the `user guide
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__
- for more.
-
- Examples
- --------
- >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
- CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
- categories=['a', 'b', 'c'], ordered=False, dtype='category')
-
- ``CategoricalIndex`` can also be instantiated from a ``Categorical``:
-
- >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])
- >>> pd.CategoricalIndex(c)
- CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
- categories=['a', 'b', 'c'], ordered=False, dtype='category')
-
- Ordered ``CategoricalIndex`` can have a min and max value.
-
- >>> ci = pd.CategoricalIndex(
- ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]
- ... )
- >>> ci
- CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
- categories=['c', 'b', 'a'], ordered=True, dtype='category')
- >>> ci.min()
- 'c'
- """
-
- _typ = "categoricalindex"
- _data_cls = Categorical
-
- @property
- def _can_hold_strings(self):
- return self.categories._can_hold_strings
-
- @cache_readonly
- def _should_fallback_to_positional(self) -> bool:
- return self.categories._should_fallback_to_positional
-
- codes: np.ndarray
- categories: Index
- ordered: bool | None
- _data: Categorical
- _values: Categorical
-
- @property
- def _engine_type(self) -> type[libindex.IndexEngine]:
- # self.codes can have dtype int8, int16, int32 or int64, so we need
- # to return the corresponding engine type (libindex.Int8Engine, etc.).
- return {
- np.int8: libindex.Int8Engine,
- np.int16: libindex.Int16Engine,
- np.int32: libindex.Int32Engine,
- np.int64: libindex.Int64Engine,
- }[self.codes.dtype.type]
-
- # --------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls,
- data=None,
- categories=None,
- ordered=None,
- dtype: Dtype | None = None,
- copy: bool = False,
- name: Hashable = None,
- ) -> CategoricalIndex:
- name = maybe_extract_name(name, data, cls)
-
- if is_scalar(data):
- # GH#38944 include None here, which pre-2.0 subbed in []
- cls._raise_scalar_data_error(data)
-
- data = Categorical(
- data, categories=categories, ordered=ordered, dtype=dtype, copy=copy
- )
-
- return cls._simple_new(data, name=name)
-
- # --------------------------------------------------------------------
-
- def _is_dtype_compat(self, other) -> Categorical:
- """
- *this is an internal non-public method*
-
- Provide a comparison between the dtype of self and other (coercing if
- needed).
-
- Parameters
- ----------
- other : Index
-
- Returns
- -------
- Categorical
-
- Raises
- ------
- TypeError if the dtypes are not compatible
- """
- if is_categorical_dtype(other):
- other = extract_array(other)
- if not other._categories_match_up_to_permutation(self):
- raise TypeError(
- "categories must match existing categories when appending"
- )
-
- elif other._is_multi:
- # preempt raising NotImplementedError in isna call
- raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")
- else:
- values = other
-
- cat = Categorical(other, dtype=self.dtype)
- other = CategoricalIndex(cat)
- if not other.isin(values).all():
- raise TypeError(
- "cannot append a non-category item to a CategoricalIndex"
- )
- other = other._values
-
- if not ((other == values) | (isna(other) & isna(values))).all():
- # GH#37667 see test_equals_non_category
- raise TypeError(
- "categories must match existing categories when appending"
- )
-
- return other
-
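In practice this check decides whether appended values keep the categorical dtype; when it raises, ``_concat`` (defined further down) falls back to a plain Index. A sketch, assuming ``pandas as pd``:

import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "a"])
ci.append(pd.Index(["b", "a"]))  # all values are existing categories -> stays categorical
ci.append(pd.Index(["z"]))       # 'z' is not a category -> falls back to an object-dtype Index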
- def equals(self, other: object) -> bool:
- """
- Determine if two CategoricalIndex objects contain the same elements.
-
- Returns
- -------
- bool
- True if the two CategoricalIndex objects have equal elements,
- otherwise False.
- """
- if self.is_(other):
- return True
-
- if not isinstance(other, Index):
- return False
-
- try:
- other = self._is_dtype_compat(other)
- except (TypeError, ValueError):
- return False
-
- return self._data.equals(other)
-
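A short sketch of the coercion performed through ``_is_dtype_compat`` above, assuming ``pandas as pd`` (results indicative):

import pandas as pd

c1 = pd.CategoricalIndex(["a", "b"], categories=["a", "b"])
c2 = pd.CategoricalIndex(["a", "b"], categories=["b", "a"])
c1.equals(c2)                     # True: same elements, category order does not matter
c1.equals(pd.Index(["a", "b"]))   # True: the plain Index is coerced before comparing
c1.equals(pd.Index(["a", "z"]))   # False: 'z' is not a known category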
- # --------------------------------------------------------------------
- # Rendering Methods
-
- @property
- def _formatter_func(self):
- return self.categories._formatter_func
-
- def _format_attrs(self):
- """
- Return a list of tuples of the (attr, formatted_value).
- """
- attrs: list[tuple[str, str | int | bool | None]]
-
- attrs = [
- (
- "categories",
- f"[{', '.join(self._data._repr_categories())}]",
- ),
- ("ordered", self.ordered),
- ]
- extra = super()._format_attrs()
- return attrs + extra
-
- def _format_with_header(self, header: list[str], na_rep: str) -> list[str]:
- result = [
- pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep
- for x in self._values
- ]
- return header + result
-
- # --------------------------------------------------------------------
-
- @property
- def inferred_type(self) -> str:
- return "categorical"
-
- @doc(Index.__contains__)
- def __contains__(self, key: Any) -> bool:
- # if key is a NaN, check if any NaN is in self.
- if is_valid_na_for_dtype(key, self.categories.dtype):
- return self.hasnans
-
- return contains(self, key, container=self._engine)
-
- def reindex(
- self, target, method=None, level=None, limit=None, tolerance=None
- ) -> tuple[Index, npt.NDArray[np.intp] | None]:
- """
- Create index with target's values (move/add/delete values as necessary)
-
- Returns
- -------
- new_index : pd.Index
- Resulting index
- indexer : np.ndarray[np.intp] or None
- Indices of output values in original index
-
- """
- if method is not None:
- raise NotImplementedError(
- "argument method is not implemented for CategoricalIndex.reindex"
- )
- if level is not None:
- raise NotImplementedError(
- "argument level is not implemented for CategoricalIndex.reindex"
- )
- if limit is not None:
- raise NotImplementedError(
- "argument limit is not implemented for CategoricalIndex.reindex"
- )
- return super().reindex(target)
-
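Given the three ``NotImplementedError`` branches above, only plain reindexing is supported; a sketch, assuming ``pandas as pd``:

import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "c"])
ci.reindex(["c", "a"])                  # ok: returns (new_index, indexer)
ci.reindex(["c", "a"], method="ffill")  # NotImplementedError: argument method is not implemented ...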
- # --------------------------------------------------------------------
- # Indexing Methods
-
- def _maybe_cast_indexer(self, key) -> int:
- # GH#41933: we have to do this instead of self._data._validate_scalar
- # because this will correctly get partial-indexing on Interval categories
- try:
- return self._data._unbox_scalar(key)
- except KeyError:
- if is_valid_na_for_dtype(key, self.categories.dtype):
- return -1
- raise
-
- def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
- if isinstance(values, CategoricalIndex):
- values = values._data
- if isinstance(values, Categorical):
- # Indexing on codes is more efficient if categories are the same,
- # so we can apply some optimizations based on the degree of
- # dtype-matching.
- cat = self._data._encode_with_my_categories(values)
- codes = cat._codes
- else:
- codes = self.categories.get_indexer(values)
- codes = codes.astype(self.codes.dtype, copy=False)
- cat = self._data._from_backing_data(codes)
- return type(self)._simple_new(cat)
-
- # --------------------------------------------------------------------
-
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- return self.categories._is_comparable_dtype(dtype)
-
- def map(self, mapper):
- """
- Map values using an input mapping or function.
-
- Maps the values (their categories, not the codes) of the index to new
- categories. If the mapping correspondence is one-to-one the result is a
- :class:`~pandas.CategoricalIndex` which has the same order property as
- the original, otherwise an :class:`~pandas.Index` is returned.
-
- If a `dict` or :class:`~pandas.Series` is used, any unmapped category is
- mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
- will be returned.
-
- Parameters
- ----------
- mapper : function, dict, or Series
- Mapping correspondence.
-
- Returns
- -------
- pandas.CategoricalIndex or pandas.Index
- Mapped index.
-
- See Also
- --------
- Index.map : Apply a mapping correspondence on an
- :class:`~pandas.Index`.
- Series.map : Apply a mapping correspondence on a
- :class:`~pandas.Series`.
- Series.apply : Apply more complex functions on a
- :class:`~pandas.Series`.
-
- Examples
- --------
- >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
- >>> idx
- CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
- ordered=False, dtype='category')
- >>> idx.map(lambda x: x.upper())
- CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
- ordered=False, dtype='category')
- >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
- CategoricalIndex(['first', 'second', 'third'], categories=['first',
- 'second', 'third'], ordered=False, dtype='category')
-
- If the mapping is one-to-one the ordering of the categories is
- preserved:
-
- >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
- >>> idx
- CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
- ordered=True, dtype='category')
- >>> idx.map({'a': 3, 'b': 2, 'c': 1})
- CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
- dtype='category')
-
- If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
-
- >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
- Index(['first', 'second', 'first'], dtype='object')
-
- If a `dict` is used, all unmapped categories are mapped to `NaN` and
- the result is an :class:`~pandas.Index`:
-
- >>> idx.map({'a': 'first', 'b': 'second'})
- Index(['first', 'second', nan], dtype='object')
- """
- mapped = self._values.map(mapper)
- return Index(mapped, name=self.name)
-
- def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
- # if calling index is category, don't check dtype of others
- try:
- cat = Categorical._concat_same_type(
- [self._is_dtype_compat(c) for c in to_concat]
- )
- except TypeError:
- # not all to_concat elements are among our categories (or NA)
- from pandas.core.dtypes.concat import concat_compat
-
- res = concat_compat([x._values for x in to_concat])
- return Index(res, name=name)
- else:
- return type(self)._simple_new(cat, name=name)
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/datetimelike.py b/contrib/python/pandas/py3/pandas/core/indexes/datetimelike.py
deleted file mode 100644
index 9237423fb03..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/datetimelike.py
+++ /dev/null
@@ -1,787 +0,0 @@
-"""
-Base and utility classes for tseries type pandas objects.
-"""
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-from datetime import datetime
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Sequence,
- TypeVar,
- cast,
- final,
-)
-
-import numpy as np
-
-from pandas._libs import (
- NaT,
- Timedelta,
- lib,
-)
-from pandas._libs.tslibs import (
- BaseOffset,
- Resolution,
- Tick,
- parsing,
- to_offset,
-)
-from pandas._typing import (
- Axis,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import NullFrequencyError
-from pandas.util._decorators import (
- Appender,
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_dtype_equal,
- is_integer,
- is_list_like,
-)
-from pandas.core.dtypes.concat import concat_compat
-
-from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
- PeriodArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
-import pandas.core.common as com
-import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import (
- Index,
- _index_shared_docs,
-)
-from pandas.core.indexes.extension import NDArrayBackedExtensionIndex
-from pandas.core.indexes.range import RangeIndex
-from pandas.core.tools.timedeltas import to_timedelta
-
-if TYPE_CHECKING:
- from pandas import CategoricalIndex
-
-_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-
-_T = TypeVar("_T", bound="DatetimeIndexOpsMixin")
-_TDT = TypeVar("_TDT", bound="DatetimeTimedeltaMixin")
-
-
-class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex, ABC):
- """
- Common ops mixin to support a unified interface for datetime-like Index types.
- """
-
- _can_hold_strings = False
- _data: DatetimeArray | TimedeltaArray | PeriodArray
-
- @doc(DatetimeLikeArrayMixin.mean)
- def mean(self, *, skipna: bool = True, axis: int | None = 0):
- return self._data.mean(skipna=skipna, axis=axis)
-
- @property
- def freq(self) -> BaseOffset | None:
- return self._data.freq
-
- @freq.setter
- def freq(self, value) -> None:
- # error: Property "freq" defined in "PeriodArray" is read-only [misc]
- self._data.freq = value # type: ignore[misc]
-
- @property
- def asi8(self) -> npt.NDArray[np.int64]:
- return self._data.asi8
-
- @property
- @doc(DatetimeLikeArrayMixin.freqstr)
- def freqstr(self) -> str | None:
- return self._data.freqstr
-
- @cache_readonly
- @abstractmethod
- def _resolution_obj(self) -> Resolution:
- ...
-
- @cache_readonly
- @doc(DatetimeLikeArrayMixin.resolution)
- def resolution(self) -> str:
- return self._data.resolution
-
- # ------------------------------------------------------------------------
-
- @cache_readonly
- def hasnans(self) -> bool:
- return self._data._hasna
-
- def equals(self, other: Any) -> bool:
- """
- Determines if two Index objects contain the same elements.
- """
- if self.is_(other):
- return True
-
- if not isinstance(other, Index):
- return False
- elif other.dtype.kind in ["f", "i", "u", "c"]:
- return False
- elif not isinstance(other, type(self)):
- should_try = False
- inferable = self._data._infer_matches
- if other.dtype == object:
- should_try = other.inferred_type in inferable
- elif is_categorical_dtype(other.dtype):
- other = cast("CategoricalIndex", other)
- should_try = other.categories.inferred_type in inferable
-
- if should_try:
- try:
- other = type(self)(other)
- except (ValueError, TypeError, OverflowError):
- # e.g.
- # ValueError -> cannot parse str entry, or OutOfBoundsDatetime
- # TypeError -> trying to convert IntervalIndex to DatetimeIndex
- # OverflowError -> Index([very_large_timedeltas])
- return False
-
- if not is_dtype_equal(self.dtype, other.dtype):
- # have different timezone
- return False
-
- return np.array_equal(self.asi8, other.asi8)
-
- @Appender(Index.__contains__.__doc__)
- def __contains__(self, key: Any) -> bool:
- hash(key)
- try:
- self.get_loc(key)
- except (KeyError, TypeError, ValueError):
- return False
- return True
-
- def _convert_tolerance(self, tolerance, target):
- tolerance = np.asarray(to_timedelta(tolerance).to_numpy())
- return super()._convert_tolerance(tolerance, target)
-
- # --------------------------------------------------------------------
- # Rendering Methods
-
- def format(
- self,
- name: bool = False,
- formatter: Callable | None = None,
- na_rep: str = "NaT",
- date_format: str | None = None,
- ) -> list[str]:
- """
- Render a string representation of the Index.
- """
- header = []
- if name:
- header.append(
- ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n"))
- if self.name is not None
- else ""
- )
-
- if formatter is not None:
- return header + list(self.map(formatter))
-
- return self._format_with_header(header, na_rep=na_rep, date_format=date_format)
-
- def _format_with_header(
- self, header: list[str], na_rep: str = "NaT", date_format: str | None = None
- ) -> list[str]:
- # matches base class except for whitespace padding and date_format
- return header + list(
- self._format_native_types(na_rep=na_rep, date_format=date_format)
- )
-
- @property
- def _formatter_func(self):
- return self._data._formatter()
-
- def _format_attrs(self):
- """
- Return a list of tuples of the (attr,formatted_value).
- """
- attrs = super()._format_attrs()
- for attrib in self._attributes:
- # iterating over _attributes prevents us from doing this for PeriodIndex
- if attrib == "freq":
- freq = self.freqstr
- if freq is not None:
- freq = repr(freq) # e.g. D -> 'D'
- attrs.append(("freq", freq))
- return attrs
-
- @Appender(Index._summary.__doc__)
- def _summary(self, name=None) -> str:
- result = super()._summary(name=name)
- if self.freq:
- result += f"\nFreq: {self.freqstr}"
-
- return result
-
- # --------------------------------------------------------------------
- # Indexing Methods
-
- @final
- def _can_partial_date_slice(self, reso: Resolution) -> bool:
- # e.g. test_getitem_setitem_periodindex
- # History of discussion: GH#3452, GH#3931, GH#2369, GH#14826
- return reso > self._resolution_obj
- # NB: for DTI/PI, not TDI
-
- def _parsed_string_to_bounds(self, reso: Resolution, parsed):
- raise NotImplementedError
-
- def _parse_with_reso(self, label: str):
- # overridden by TimedeltaIndex
- try:
- if self.freq is None or hasattr(self.freq, "rule_code"):
- freq = self.freq
- except NotImplementedError:
- freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None))
-
- freqstr: str | None
- if freq is not None and not isinstance(freq, str):
- freqstr = freq.rule_code
- else:
- freqstr = freq
-
- if isinstance(label, np.str_):
- # GH#45580
- label = str(label)
-
- parsed, reso_str = parsing.parse_datetime_string_with_reso(label, freqstr)
- reso = Resolution.from_attrname(reso_str)
- return parsed, reso
-
- def _get_string_slice(self, key: str):
- # overridden by TimedeltaIndex
- parsed, reso = self._parse_with_reso(key)
- try:
- return self._partial_date_slice(reso, parsed)
- except KeyError as err:
- raise KeyError(key) from err
-
- @final
- def _partial_date_slice(
- self,
- reso: Resolution,
- parsed: datetime,
- ):
- """
- Parameters
- ----------
- reso : Resolution
- parsed : datetime
-
- Returns
- -------
- slice or ndarray[intp]
- """
- if not self._can_partial_date_slice(reso):
- raise ValueError
-
- t1, t2 = self._parsed_string_to_bounds(reso, parsed)
- vals = self._data._ndarray
- unbox = self._data._unbox
-
- if self.is_monotonic_increasing:
- if len(self) and (
- (t1 < self[0] and t2 < self[0]) or (t1 > self[-1] and t2 > self[-1])
- ):
- # we are out of range
- raise KeyError
-
- # TODO: does this depend on being monotonic _increasing_?
-
- # a monotonic (sorted) series can be sliced
- left = vals.searchsorted(unbox(t1), side="left")
- right = vals.searchsorted(unbox(t2), side="right")
- return slice(left, right)
-
- else:
- lhs_mask = vals >= unbox(t1)
- rhs_mask = vals <= unbox(t2)
-
- # try to find the dates
- return (lhs_mask & rhs_mask).nonzero()[0]
-
- def _maybe_cast_slice_bound(self, label, side: str):
- """
- If label is a string, cast it to scalar type according to resolution.
-
- Parameters
- ----------
- label : object
- side : {'left', 'right'}
-
- Returns
- -------
- label : object
-
- Notes
- -----
- Value of `side` parameter should be validated in caller.
- """
- if isinstance(label, str):
- try:
- parsed, reso = self._parse_with_reso(label)
- except ValueError as err:
- # DTI -> parsing.DateParseError
- # TDI -> 'unit abbreviation w/o a number'
- # PI -> string cannot be parsed as datetime-like
- self._raise_invalid_indexer("slice", label, err)
-
- lower, upper = self._parsed_string_to_bounds(reso, parsed)
- return lower if side == "left" else upper
- elif not isinstance(label, self._data._recognized_scalars):
- self._raise_invalid_indexer("slice", label)
-
- return label
-
- # --------------------------------------------------------------------
- # Arithmetic Methods
-
- def shift(self: _T, periods: int = 1, freq=None) -> _T:
- """
- Shift index by desired number of time frequency increments.
-
- This method is for shifting the values of datetime-like indexes
- by a specified time increment a given number of times.
-
- Parameters
- ----------
- periods : int, default 1
- Number of periods (or increments) to shift by,
- can be positive or negative.
- freq : pandas.DateOffset, pandas.Timedelta or string, optional
- Frequency increment to shift by.
- If None, the index is shifted by its own `freq` attribute.
- Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc.
-
- Returns
- -------
- pandas.DatetimeIndex
- Shifted index.
-
- See Also
- --------
- Index.shift : Shift values of Index.
- PeriodIndex.shift : Shift values of PeriodIndex.
- """
- raise NotImplementedError
-
- # --------------------------------------------------------------------
-
- @doc(Index._maybe_cast_listlike_indexer)
- def _maybe_cast_listlike_indexer(self, keyarr):
- try:
- res = self._data._validate_listlike(keyarr, allow_object=True)
- except (ValueError, TypeError):
- if not isinstance(keyarr, ExtensionArray):
- # e.g. we don't want to cast DTA to ndarray[object]
- res = com.asarray_tuplesafe(keyarr)
- # TODO: com.asarray_tuplesafe shouldn't cast e.g. DatetimeArray
- else:
- res = keyarr
- return Index(res, dtype=res.dtype)
-
-
-class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, ABC):
- """
- Mixin class for methods shared by DatetimeIndex and TimedeltaIndex,
- but not PeriodIndex
- """
-
- _data: DatetimeArray | TimedeltaArray
- _comparables = ["name", "freq"]
- _attributes = ["name", "freq"]
-
- # Compat for frequency inference, see GH#23789
- _is_monotonic_increasing = Index.is_monotonic_increasing
- _is_monotonic_decreasing = Index.is_monotonic_decreasing
- _is_unique = Index.is_unique
-
- _join_precedence = 10
-
- @property
- def unit(self) -> str:
- return self._data.unit
-
- def as_unit(self: _TDT, unit: str) -> _TDT:
- """
- Convert to a dtype with the given unit resolution.
-
- Parameters
- ----------
- unit : {'s', 'ms', 'us', 'ns'}
-
- Returns
- -------
- same type as self
- """
- arr = self._data.as_unit(unit)
- return type(self)._simple_new(arr, name=self.name)
-
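A minimal sketch of the ``unit``/``as_unit`` pair defined above, assuming ``pandas as pd`` with pandas 2.0 non-nanosecond support:

import pandas as pd

dti = pd.to_datetime(["2016-01-01 10:00", "2016-01-02 10:00"])
dti.unit          # 'ns' (default nanosecond resolution)
dti.as_unit("s")  # same timestamps, now backed by datetime64[s]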
- def _with_freq(self, freq):
- arr = self._data._with_freq(freq)
- return type(self)._simple_new(arr, name=self._name)
-
- @property
- def values(self) -> np.ndarray:
- # NB: For Datetime64TZ this is lossy
- return self._data._ndarray
-
- @doc(DatetimeIndexOpsMixin.shift)
- def shift(self: _TDT, periods: int = 1, freq=None) -> _TDT:
- if freq is not None and freq != self.freq:
- if isinstance(freq, str):
- freq = to_offset(freq)
- offset = periods * freq
- return self + offset
-
- if periods == 0 or len(self) == 0:
- # GH#14811 empty case
- return self.copy()
-
- if self.freq is None:
- raise NullFrequencyError("Cannot shift with no freq")
-
- start = self[0] + periods * self.freq
- end = self[-1] + periods * self.freq
-
- # Note: in the DatetimeTZ case, _generate_range will infer the
- # appropriate timezone from `start` and `end`, so tz does not need
- # to be passed explicitly.
- result = self._data._generate_range(
- start=start, end=end, periods=None, freq=self.freq
- )
- return type(self)._simple_new(result, name=self.name)
-
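A sketch of the two branches of ``shift`` above (explicit ``freq`` vs. the index's own ``freq``), assuming ``pandas as pd``:

import pandas as pd

dti = pd.date_range("2018-01-01", periods=3, freq="D")
dti.shift(2)            # uses dti.freq -> ['2018-01-03', '2018-01-04', '2018-01-05']
dti.shift(2, freq="H")  # explicit freq -> every element moved by 2 hours

pd.DatetimeIndex(["2018-01-01", "2018-03-01"]).shift(1)  # NullFrequencyError: Cannot shift with no freq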
- @cache_readonly
- @doc(DatetimeLikeArrayMixin.inferred_freq)
- def inferred_freq(self) -> str | None:
- return self._data.inferred_freq
-
- # --------------------------------------------------------------------
- # Set Operation Methods
-
- @cache_readonly
- def _as_range_index(self) -> RangeIndex:
- # Convert our i8 representations to RangeIndex
- # Caller is responsible for checking isinstance(self.freq, Tick)
- freq = cast(Tick, self.freq)
- tick = freq.delta._value
- rng = range(self[0]._value, self[-1]._value + tick, tick)
- return RangeIndex(rng)
-
- def _can_range_setop(self, other):
- return isinstance(self.freq, Tick) and isinstance(other.freq, Tick)
-
- def _wrap_range_setop(self, other, res_i8):
- new_freq = None
- if not len(res_i8):
- # RangeIndex defaults to step=1, which we don't want.
- new_freq = self.freq
- elif isinstance(res_i8, RangeIndex):
- new_freq = to_offset(Timedelta(res_i8.step))
-
- # TODO(GH#41493): we cannot just do
- # type(self._data)(res_i8.values, dtype=self.dtype, freq=new_freq)
- # because test_setops_preserve_freq fails with _validate_frequency raising.
- # This raising is incorrect, as 'on_freq' is incorrect. This will
- # be fixed by GH#41493
- res_values = res_i8.values.view(self._data._ndarray.dtype)
- result = type(self._data)._simple_new(
- res_values, dtype=self.dtype, freq=new_freq
- )
- return self._wrap_setop_result(other, result)
-
- def _range_intersect(self, other, sort):
- # Dispatch to RangeIndex intersection logic.
- left = self._as_range_index
- right = other._as_range_index
- res_i8 = left.intersection(right, sort=sort)
- return self._wrap_range_setop(other, res_i8)
-
- def _range_union(self, other, sort):
- # Dispatch to RangeIndex union logic.
- left = self._as_range_index
- right = other._as_range_index
- res_i8 = left.union(right, sort=sort)
- return self._wrap_range_setop(other, res_i8)
-
- def _intersection(self, other: Index, sort: bool = False) -> Index:
- """
- intersection specialized to the case with matching dtypes and both non-empty.
- """
- other = cast("DatetimeTimedeltaMixin", other)
-
- if self._can_range_setop(other):
- return self._range_intersect(other, sort=sort)
-
- if not self._can_fast_intersect(other):
- result = Index._intersection(self, other, sort=sort)
- # We need to invalidate the freq because Index._intersection
- # uses _shallow_copy on a view of self._data, which will preserve
- # self.freq if we're not careful.
- # At this point we should have result.dtype == self.dtype
- # and type(result) is type(self._data)
- result = self._wrap_setop_result(other, result)
- return result._with_freq(None)._with_freq("infer")
-
- else:
- return self._fast_intersect(other, sort)
-
- def _fast_intersect(self, other, sort):
- # to make our life easier, "sort" the two ranges
- if self[0] <= other[0]:
- left, right = self, other
- else:
- left, right = other, self
-
- # after sorting, the intersection always starts with the right index
- # and ends with the index whose last element is smallest
- end = min(left[-1], right[-1])
- start = right[0]
-
- if end < start:
- result = self[:0]
- else:
- lslice = slice(*left.slice_locs(start, end))
- result = left._values[lslice]
-
- return result
-
- def _can_fast_intersect(self: _T, other: _T) -> bool:
- # Note: we only get here with len(self) > 0 and len(other) > 0
- if self.freq is None:
- return False
-
- elif other.freq != self.freq:
- return False
-
- elif not self.is_monotonic_increasing:
- # Because freq is not None, we must then be monotonic decreasing
- return False
-
- # this along with matching freqs ensure that we "line up",
- # so intersection will preserve freq
- # Note we are assuming away Ticks, as those go through _range_intersect
- # GH#42104
- return self.freq.n == 1
-
- def _can_fast_union(self: _T, other: _T) -> bool:
- # Assumes that type(self) == type(other), as per the annotation
- # The ability to fast_union also implies that `freq` should be
- # retained on union.
- freq = self.freq
-
- if freq is None or freq != other.freq:
- return False
-
- if not self.is_monotonic_increasing:
- # Because freq is not None, we must then be monotonic decreasing
- # TODO: do union on the reversed indexes?
- return False
-
- if len(self) == 0 or len(other) == 0:
- # only reached via union_many
- return True
-
- # to make our life easier, "sort" the two ranges
- if self[0] <= other[0]:
- left, right = self, other
- else:
- left, right = other, self
-
- right_start = right[0]
- left_end = left[-1]
-
- # Only need to "adjoin", not overlap
- return (right_start == left_end + freq) or right_start in left
-
- def _fast_union(self: _TDT, other: _TDT, sort=None) -> _TDT:
- # Caller is responsible for ensuring self and other are non-empty
-
- # to make our life easier, "sort" the two ranges
- if self[0] <= other[0]:
- left, right = self, other
- elif sort is False:
- # TDIs are not in the "correct" order and we don't want
- # to sort but want to remove overlaps
- left, right = self, other
- left_start = left[0]
- loc = right.searchsorted(left_start, side="left")
- right_chunk = right._values[:loc]
- dates = concat_compat((left._values, right_chunk))
- result = type(self)._simple_new(dates, name=self.name)
- return result
- else:
- left, right = other, self
-
- left_end = left[-1]
- right_end = right[-1]
-
- # concatenate
- if left_end < right_end:
- loc = right.searchsorted(left_end, side="right")
- right_chunk = right._values[loc:]
- dates = concat_compat([left._values, right_chunk])
- # The can_fast_union check ensures that the result.freq
- # should match self.freq
- dates = type(self._data)(dates, freq=self.freq)
- result = type(self)._simple_new(dates)
- return result
- else:
- return left
-
- def _union(self, other, sort):
- # We are called by `union`, which is responsible for this validation
- assert isinstance(other, type(self))
- assert self.dtype == other.dtype
-
- if self._can_range_setop(other):
- return self._range_union(other, sort=sort)
-
- if self._can_fast_union(other):
- result = self._fast_union(other, sort=sort)
- # in the case with sort=None, the _can_fast_union check ensures
- # that result.freq == self.freq
- return result
- else:
- return super()._union(other, sort)._with_freq("infer")
-
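A sketch of the fast/range union paths above, which let overlapping regular ranges keep their ``freq``, assuming ``pandas as pd``:

import pandas as pd

left = pd.date_range("2021-01-01", periods=3, freq="D")
right = pd.date_range("2021-01-03", periods=3, freq="D")
left.union(right)
# DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
#                '2021-01-05'], dtype='datetime64[ns]', freq='D')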
- # --------------------------------------------------------------------
- # Join Methods
-
- def _get_join_freq(self, other):
- """
- Get the freq to attach to the result of a join operation.
- """
- freq = None
- if self._can_fast_union(other):
- freq = self.freq
- return freq
-
- def _wrap_joined_index(
- self, joined, other, lidx: npt.NDArray[np.intp], ridx: npt.NDArray[np.intp]
- ):
- assert other.dtype == self.dtype, (other.dtype, self.dtype)
- result = super()._wrap_joined_index(joined, other, lidx, ridx)
- result._data._freq = self._get_join_freq(other)
- return result
-
- def _get_engine_target(self) -> np.ndarray:
- # engine methods and libjoin methods need dt64/td64 values cast to i8
- return self._data._ndarray.view("i8")
-
- def _from_join_target(self, result: np.ndarray):
- # view e.g. i8 back to M8[ns]
- result = result.view(self._data._ndarray.dtype)
- return self._data._from_backing_data(result)
-
- # --------------------------------------------------------------------
- # List-like Methods
-
- def _get_delete_freq(self, loc: int | slice | Sequence[int]):
- """
- Find the `freq` for self.delete(loc).
- """
- freq = None
- if self.freq is not None:
- if is_integer(loc):
- if loc in (0, -len(self), -1, len(self) - 1):
- freq = self.freq
- else:
- if is_list_like(loc):
- # error: Incompatible types in assignment (expression has
- # type "Union[slice, ndarray]", variable has type
- # "Union[int, slice, Sequence[int]]")
- loc = lib.maybe_indices_to_slice( # type: ignore[assignment]
- np.asarray(loc, dtype=np.intp), len(self)
- )
- if isinstance(loc, slice) and loc.step in (1, None):
- if loc.start in (0, None) or loc.stop in (len(self), None):
- freq = self.freq
- return freq
-
- def _get_insert_freq(self, loc: int, item):
- """
- Find the `freq` for self.insert(loc, item).
- """
- value = self._data._validate_scalar(item)
- item = self._data._box_func(value)
-
- freq = None
- if self.freq is not None:
- # freq can be preserved on edge cases
- if self.size:
- if item is NaT:
- pass
- elif loc in (0, -len(self)) and item + self.freq == self[0]:
- freq = self.freq
- elif (loc == len(self)) and item - self.freq == self[-1]:
- freq = self.freq
- else:
- # Adding a single item to an empty index may preserve freq
- if isinstance(self.freq, Tick):
- # all TimedeltaIndex cases go through here; is_on_offset
- # would raise TypeError
- freq = self.freq
- elif self.freq.is_on_offset(item):
- freq = self.freq
- return freq
-
- @doc(NDArrayBackedExtensionIndex.delete)
- def delete(self, loc) -> DatetimeTimedeltaMixin:
- result = super().delete(loc)
- result._data._freq = self._get_delete_freq(loc)
- return result
-
- @doc(NDArrayBackedExtensionIndex.insert)
- def insert(self, loc: int, item):
- result = super().insert(loc, item)
- if isinstance(result, type(self)):
- # i.e. parent class method did not cast
- result._data._freq = self._get_insert_freq(loc, item)
- return result
-
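The ``_get_delete_freq``/``_get_insert_freq`` helpers above only keep ``freq`` when the edit preserves the regular spacing; a sketch, assuming ``pandas as pd``:

import pandas as pd

dti = pd.date_range("2021-01-01", periods=4, freq="D")
dti.delete(0).freq                              # <Day>: edge deletion keeps the spacing
dti.delete(1).freq                              # None: interior deletion breaks it
dti.insert(4, pd.Timestamp("2021-01-05")).freq  # <Day>: the new item continues the pattern
dti.insert(2, pd.Timestamp("2021-06-01")).freq  # None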
- # --------------------------------------------------------------------
- # NDArray-Like Methods
-
- @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
- def take(
- self,
- indices,
- axis: Axis = 0,
- allow_fill: bool = True,
- fill_value=None,
- **kwargs,
- ):
- nv.validate_take((), kwargs)
- indices = np.asarray(indices, dtype=np.intp)
-
- result = NDArrayBackedExtensionIndex.take(
- self, indices, axis, allow_fill, fill_value, **kwargs
- )
-
- maybe_slice = lib.maybe_indices_to_slice(indices, len(self))
- if isinstance(maybe_slice, slice):
- freq = self._data._get_getitem_freq(maybe_slice)
- result._data._freq = freq
- return result
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/datetimes.py b/contrib/python/pandas/py3/pandas/core/indexes/datetimes.py
deleted file mode 100644
index 1d24af5293a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/datetimes.py
+++ /dev/null
@@ -1,1064 +0,0 @@
-from __future__ import annotations
-
-import datetime as dt
-import operator
-from typing import (
- TYPE_CHECKING,
- Hashable,
-)
-import warnings
-
-import numpy as np
-import pytz
-
-from pandas._libs import (
- NaT,
- Period,
- Timestamp,
- index as libindex,
- lib,
-)
-from pandas._libs.tslibs import (
- Resolution,
- periods_per_day,
- timezones,
- to_offset,
-)
-from pandas._libs.tslibs.offsets import prefix_mapping
-from pandas._typing import (
- Dtype,
- DtypeObj,
- Frequency,
- IntervalClosedType,
- TimeAmbiguous,
- TimeNonexistent,
- npt,
-)
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.common import (
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.generic import ABCSeries
-from pandas.core.dtypes.missing import is_valid_na_for_dtype
-
-from pandas.core.arrays.datetimes import (
- DatetimeArray,
- tz_to_dtype,
-)
-import pandas.core.common as com
-from pandas.core.indexes.base import (
- Index,
- maybe_extract_name,
-)
-from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin
-from pandas.core.indexes.extension import inherit_names
-from pandas.core.tools.times import to_time
-
-if TYPE_CHECKING:
- from pandas.core.api import (
- DataFrame,
- PeriodIndex,
- )
-
-
-def _new_DatetimeIndex(cls, d):
- """
- This is called upon unpickling, rather than the default, which doesn't
- accept arguments and breaks __new__.
- """
- if "data" in d and not isinstance(d["data"], DatetimeIndex):
- # Avoid need to verify integrity by calling simple_new directly
- data = d.pop("data")
- if not isinstance(data, DatetimeArray):
- # For backward compat with older pickles, we may need to construct
- # a DatetimeArray to adapt to the newer _simple_new signature
- tz = d.pop("tz")
- freq = d.pop("freq")
- dta = DatetimeArray._simple_new(data, dtype=tz_to_dtype(tz), freq=freq)
- else:
- dta = data
- for key in ["tz", "freq"]:
- # These are already stored in our DatetimeArray; if they are
- # also in the pickle and don't match, we have a problem.
- if key in d:
- assert d[key] == getattr(dta, key)
- d.pop(key)
- result = cls._simple_new(dta, **d)
- else:
- with warnings.catch_warnings():
- # TODO: If we knew what was going in to **d, we might be able to
- # go through _simple_new instead
- warnings.simplefilter("ignore")
- result = cls.__new__(cls, **d)
-
- return result
-
-
-@inherit_names(
- DatetimeArray._field_ops
- + [
- method
- for method in DatetimeArray._datetimelike_methods
- if method not in ("tz_localize", "tz_convert", "strftime")
- ],
- DatetimeArray,
- wrap=True,
-)
-@inherit_names(["is_normalized"], DatetimeArray, cache=True)
-@inherit_names(
- [
- "tz",
- "tzinfo",
- "dtype",
- "to_pydatetime",
- "_format_native_types",
- "date",
- "time",
- "timetz",
- "std",
- ]
- + DatetimeArray._bool_ops,
- DatetimeArray,
-)
-class DatetimeIndex(DatetimeTimedeltaMixin):
- """
- Immutable ndarray-like of datetime64 data.
-
- Represented internally as int64, which can be boxed to Timestamp objects
- that are subclasses of datetime and carry metadata.
-
- .. versionchanged:: 2.0.0
- The various numeric date/time attributes (:attr:`~DatetimeIndex.day`,
- :attr:`~DatetimeIndex.month`, :attr:`~DatetimeIndex.year` etc.) now have dtype
- ``int32``. Previously they had dtype ``int64``.
-
- Parameters
- ----------
- data : array-like (1-dimensional)
- Datetime-like data to construct index with.
- freq : str or pandas offset object, optional
- One of pandas date offset strings or corresponding objects. The string
- 'infer' can be passed in order to set the frequency of the index as the
- inferred frequency upon creation.
- tz : pytz.timezone or dateutil.tz.tzfile or datetime.tzinfo or str
- Set the Timezone of the data.
- normalize : bool, default False
- Normalize start/end dates to midnight before generating date range.
- closed : {'left', 'right'}, optional
- Set whether to include `start` and `end` that are on the
- boundary. The default includes boundary points on either end.
- ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
- When clocks moved backward due to DST, ambiguous times may arise.
- For example in Central European Time (UTC+01), when going from 03:00
- DST to 02:00 non-DST, 02:30:00 local time occurs both at 00:30:00 UTC
- and at 01:30:00 UTC. In such a situation, the `ambiguous` parameter
- dictates how ambiguous times should be handled.
-
- - 'infer' will attempt to infer fall dst-transition hours based on
- order
- - bool-ndarray where True signifies a DST time, False signifies a
- non-DST time (note that this flag is only applicable for ambiguous
- times)
- - 'NaT' will return NaT where there are ambiguous times
- - 'raise' will raise an AmbiguousTimeError if there are ambiguous times.
- dayfirst : bool, default False
- If True, parse dates in `data` with the day first order.
- yearfirst : bool, default False
- If True parse dates in `data` with the year first order.
- dtype : numpy.dtype or DatetimeTZDtype or str, default None
- Note that the only NumPy dtype allowed is 'datetime64[ns]'.
- copy : bool, default False
- Make a copy of input ndarray.
- name : label, default None
- Name to be stored in the index.
-
- Attributes
- ----------
- year
- month
- day
- hour
- minute
- second
- microsecond
- nanosecond
- date
- time
- timetz
- dayofyear
- day_of_year
- weekofyear
- week
- dayofweek
- day_of_week
- weekday
- quarter
- tz
- freq
- freqstr
- is_month_start
- is_month_end
- is_quarter_start
- is_quarter_end
- is_year_start
- is_year_end
- is_leap_year
- inferred_freq
-
- Methods
- -------
- normalize
- strftime
- snap
- tz_convert
- tz_localize
- round
- floor
- ceil
- to_period
- to_pydatetime
- to_series
- to_frame
- month_name
- day_name
- mean
- std
-
- See Also
- --------
- Index : The base pandas Index type.
- TimedeltaIndex : Index of timedelta64 data.
- PeriodIndex : Index of Period data.
- to_datetime : Convert argument to datetime.
- date_range : Create a fixed-frequency DatetimeIndex.
-
- Notes
- -----
- To learn more about the frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
- """
-
- _typ = "datetimeindex"
-
- _data_cls = DatetimeArray
- _supports_partial_string_indexing = True
-
- @property
- def _engine_type(self) -> type[libindex.DatetimeEngine]:
- return libindex.DatetimeEngine
-
- _data: DatetimeArray
- tz: dt.tzinfo | None
-
- # --------------------------------------------------------------------
- # methods that dispatch to DatetimeArray and wrap result
-
- @doc(DatetimeArray.strftime)
- def strftime(self, date_format) -> Index:
- arr = self._data.strftime(date_format)
- return Index(arr, name=self.name, dtype=object)
-
- @doc(DatetimeArray.tz_convert)
- def tz_convert(self, tz) -> DatetimeIndex:
- arr = self._data.tz_convert(tz)
- return type(self)._simple_new(arr, name=self.name, refs=self._references)
-
- @doc(DatetimeArray.tz_localize)
- def tz_localize(
- self,
- tz,
- ambiguous: TimeAmbiguous = "raise",
- nonexistent: TimeNonexistent = "raise",
- ) -> DatetimeIndex:
- arr = self._data.tz_localize(tz, ambiguous, nonexistent)
- return type(self)._simple_new(arr, name=self.name)
-
- @doc(DatetimeArray.to_period)
- def to_period(self, freq=None) -> PeriodIndex:
- from pandas.core.indexes.api import PeriodIndex
-
- arr = self._data.to_period(freq)
- return PeriodIndex._simple_new(arr, name=self.name)
-
- @doc(DatetimeArray.to_julian_date)
- def to_julian_date(self) -> Index:
- arr = self._data.to_julian_date()
- return Index._simple_new(arr, name=self.name)
-
- @doc(DatetimeArray.isocalendar)
- def isocalendar(self) -> DataFrame:
- df = self._data.isocalendar()
- return df.set_index(self)
-
- @cache_readonly
- def _resolution_obj(self) -> Resolution:
- return self._data._resolution_obj
-
- # --------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls,
- data=None,
- freq: Frequency | lib.NoDefault = lib.no_default,
- tz=lib.no_default,
- normalize: bool = False,
- closed=None,
- ambiguous: TimeAmbiguous = "raise",
- dayfirst: bool = False,
- yearfirst: bool = False,
- dtype: Dtype | None = None,
- copy: bool = False,
- name: Hashable = None,
- ) -> DatetimeIndex:
- if is_scalar(data):
- cls._raise_scalar_data_error(data)
-
- # - Cases checked above all return/raise before reaching here - #
-
- name = maybe_extract_name(name, data, cls)
-
- if (
- isinstance(data, DatetimeArray)
- and freq is lib.no_default
- and tz is lib.no_default
- and dtype is None
- ):
- # fastpath, similar logic in TimedeltaIndex.__new__;
- # Note in this particular case we retain non-nano.
- if copy:
- data = data.copy()
- return cls._simple_new(data, name=name)
-
- dtarr = DatetimeArray._from_sequence_not_strict(
- data,
- dtype=dtype,
- copy=copy,
- tz=tz,
- freq=freq,
- dayfirst=dayfirst,
- yearfirst=yearfirst,
- ambiguous=ambiguous,
- )
- refs = None
- if not copy and isinstance(data, (Index, ABCSeries)):
- refs = data._references
-
- subarr = cls._simple_new(dtarr, name=name, refs=refs)
- return subarr
-
- # --------------------------------------------------------------------
-
- @cache_readonly
- def _is_dates_only(self) -> bool:
- """
- Return a boolean indicating whether we only have dates (and no timezone).
-
- Returns
- -------
- bool
- """
- from pandas.io.formats.format import is_dates_only
-
- # error: Argument 1 to "is_dates_only" has incompatible type
- # "Union[ExtensionArray, ndarray]"; expected "Union[ndarray,
- # DatetimeArray, Index, DatetimeIndex]"
- return self.tz is None and is_dates_only(self._values) # type: ignore[arg-type]
-
- def __reduce__(self):
- d = {"data": self._data, "name": self.name}
- return _new_DatetimeIndex, (type(self), d), None
-
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- """
- Can we compare values of the given dtype to our own?
- """
- if self.tz is not None:
- # If we have tz, we can compare to tzaware
- return is_datetime64tz_dtype(dtype)
- # if we dont have tz, we can only compare to tznaive
- return is_datetime64_dtype(dtype)
-
- # --------------------------------------------------------------------
- # Rendering Methods
-
- @property
- def _formatter_func(self):
- from pandas.io.formats.format import get_format_datetime64
-
- formatter = get_format_datetime64(is_dates_only_=self._is_dates_only)
- return lambda x: f"'{formatter(x)}'"
-
- # --------------------------------------------------------------------
- # Set Operation Methods
-
- def _can_range_setop(self, other) -> bool:
- # GH 46702: If self or other have non-UTC tzs, DST transitions prevent
- # range representation due to no singular step
- if (
- self.tz is not None
- and not timezones.is_utc(self.tz)
- and not timezones.is_fixed_offset(self.tz)
- ):
- return False
- if (
- other.tz is not None
- and not timezones.is_utc(other.tz)
- and not timezones.is_fixed_offset(other.tz)
- ):
- return False
- return super()._can_range_setop(other)
-
- # --------------------------------------------------------------------
-
- def _get_time_micros(self) -> npt.NDArray[np.int64]:
- """
- Return the number of microseconds since midnight.
-
- Returns
- -------
- ndarray[int64_t]
- """
- values = self._data._local_timestamps()
-
- ppd = periods_per_day(self._data._creso)
-
- frac = values % ppd
- if self.unit == "ns":
- micros = frac // 1000
- elif self.unit == "us":
- micros = frac
- elif self.unit == "ms":
- micros = frac * 1000
- elif self.unit == "s":
- micros = frac * 1_000_000
- else: # pragma: no cover
- raise NotImplementedError(self.unit)
-
- micros[self._isnan] = -1
- return micros
-
- def snap(self, freq: Frequency = "S") -> DatetimeIndex:
- """
- Snap timestamps to the nearest occurring frequency.
-
- Returns
- -------
- DatetimeIndex
- """
- # Superdumb, punting on any optimizing
- freq = to_offset(freq)
-
- dta = self._data.copy()
-
- for i, v in enumerate(self):
- s = v
- if not freq.is_on_offset(s):
- t0 = freq.rollback(s)
- t1 = freq.rollforward(s)
- if abs(s - t0) < abs(t1 - s):
- s = t0
- else:
- s = t1
- dta[i] = s
-
- return DatetimeIndex._simple_new(dta, name=self.name)
-
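A usage sketch for ``snap`` with a non-Tick frequency, assuming ``pandas as pd`` (output indicative):

import pandas as pd

idx = pd.DatetimeIndex(["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"])
idx.snap("MS")  # each value moved to the nearest month start
# DatetimeIndex(['2023-01-01', '2023-01-01', '2023-02-01', '2023-02-01'],
#               dtype='datetime64[ns]', freq=None)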
- # --------------------------------------------------------------------
- # Indexing Methods
-
- def _parsed_string_to_bounds(self, reso: Resolution, parsed: dt.datetime):
- """
- Calculate datetime bounds for parsed time string and its resolution.
-
- Parameters
- ----------
- reso : Resolution
- Resolution provided by parsed string.
- parsed : datetime
- Datetime from parsed string.
-
- Returns
- -------
- lower, upper: pd.Timestamp
- """
- per = Period(parsed, freq=reso.attr_abbrev)
- start, end = per.start_time, per.end_time
-
- # GH 24076
- # If an incoming date string contained a UTC offset, need to localize
- # the parsed date to this offset first before aligning with the index's
- # timezone
- start = start.tz_localize(parsed.tzinfo)
- end = end.tz_localize(parsed.tzinfo)
-
- if parsed.tzinfo is not None:
- if self.tz is None:
- raise ValueError(
- "The index must be timezone aware when indexing "
- "with a date string with a UTC offset"
- )
- # The flipped case with parsed.tz is None and self.tz is not None
- # is ruled out bc parsed and reso are produced by _parse_with_reso,
- # which localizes parsed.
- return start, end
-
- def _parse_with_reso(self, label: str):
- parsed, reso = super()._parse_with_reso(label)
-
- parsed = Timestamp(parsed)
-
- if self.tz is not None and parsed.tzinfo is None:
- # we special-case timezone-naive strings and timezone-aware
- # DatetimeIndex
- # https://github.com/pandas-dev/pandas/pull/36148#issuecomment-687883081
- parsed = parsed.tz_localize(self.tz)
-
- return parsed, reso
-
- def _disallow_mismatched_indexing(self, key) -> None:
- """
- Check for mismatched-tzawareness indexing and re-raise as KeyError.
- """
- # we get here with isinstance(key, self._data._recognized_scalars)
- try:
- # GH#36148
- self._data._assert_tzawareness_compat(key)
- except TypeError as err:
- raise KeyError(key) from err
-
- def get_loc(self, key):
- """
- Get integer location for requested label
-
- Returns
- -------
- loc : int
- """
- self._check_indexing_error(key)
-
- orig_key = key
- if is_valid_na_for_dtype(key, self.dtype):
- key = NaT
-
- if isinstance(key, self._data._recognized_scalars):
- # needed to localize naive datetimes
- self._disallow_mismatched_indexing(key)
- key = Timestamp(key)
-
- elif isinstance(key, str):
- try:
- parsed, reso = self._parse_with_reso(key)
- except (ValueError, pytz.NonExistentTimeError) as err:
- raise KeyError(key) from err
- self._disallow_mismatched_indexing(parsed)
-
- if self._can_partial_date_slice(reso):
- try:
- return self._partial_date_slice(reso, parsed)
- except KeyError as err:
- raise KeyError(key) from err
-
- key = parsed
-
- elif isinstance(key, dt.timedelta):
- # GH#20464
- raise TypeError(
- f"Cannot index {type(self).__name__} with {type(key).__name__}"
- )
-
- elif isinstance(key, dt.time):
- return self.indexer_at_time(key)
-
- else:
- # unrecognized type
- raise KeyError(key)
-
- try:
- return Index.get_loc(self, key)
- except KeyError as err:
- raise KeyError(orig_key) from err
-
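A sketch of the label forms accepted by ``get_loc`` above, assuming ``pandas as pd``:

import datetime as dt
import pandas as pd

dti = pd.date_range("2021-01-01", periods=48, freq="H")
dti.get_loc("2021-01-01 05:00")  # exact string -> 5
dti.get_loc("2021-01-02")        # partial string -> slice(24, 48, None)
dti.get_loc(dt.time(5, 0))       # time of day -> array([ 5, 29])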
- @doc(DatetimeTimedeltaMixin._maybe_cast_slice_bound)
- def _maybe_cast_slice_bound(self, label, side: str):
- # GH#42855 handle date here instead of get_slice_bound
- if isinstance(label, dt.date) and not isinstance(label, dt.datetime):
- # Pandas supports slicing with dates, treated as datetimes at midnight.
- # https://github.com/pandas-dev/pandas/issues/31501
- label = Timestamp(label).to_pydatetime()
-
- label = super()._maybe_cast_slice_bound(label, side)
- self._data._assert_tzawareness_compat(label)
- return Timestamp(label)
-
- def slice_indexer(self, start=None, end=None, step=None):
- """
- Return indexer for specified label slice.
- Index.slice_indexer, customized to handle time slicing.
-
- In addition to functionality provided by Index.slice_indexer, does the
- following:
-
- - if both `start` and `end` are instances of `datetime.time`, it
- invokes `indexer_between_time`
- - if `start` and `end` are both either string or None perform
- value-based selection in non-monotonic cases.
-
- """
- # For historical reasons DatetimeIndex supports slices between two
- # instances of datetime.time as if it were applying a slice mask to
- # an array of (self.hour, self.minute, self.second, self.microsecond).
- if isinstance(start, dt.time) and isinstance(end, dt.time):
- if step is not None and step != 1:
- raise ValueError("Must have step size of 1 with time slices")
- return self.indexer_between_time(start, end)
-
- if isinstance(start, dt.time) or isinstance(end, dt.time):
- raise KeyError("Cannot mix time and non-time slice keys")
-
- def check_str_or_none(point) -> bool:
- return point is not None and not isinstance(point, str)
-
- # GH#33146 if start and end are combinations of str and None and Index is not
- # monotonic, we can not use Index.slice_indexer because it does not honor the
- # actual elements, is only searching for start and end
- if (
- check_str_or_none(start)
- or check_str_or_none(end)
- or self.is_monotonic_increasing
- ):
- return Index.slice_indexer(self, start, end, step)
-
- mask = np.array(True)
- raise_mask = np.array(True)
- if start is not None:
- start_casted = self._maybe_cast_slice_bound(start, "left")
- mask = start_casted <= self
- raise_mask = start_casted == self
-
- if end is not None:
- end_casted = self._maybe_cast_slice_bound(end, "right")
- mask = (self <= end_casted) & mask
- raise_mask = (end_casted == self) | raise_mask
-
- if not raise_mask.any():
- raise KeyError(
- "Value based partial slicing on non-monotonic DatetimeIndexes "
- "with non-existing keys is not allowed.",
- )
- indexer = mask.nonzero()[0][::step]
- if len(indexer) == len(self):
- return slice(None)
- else:
- return indexer
-
- # --------------------------------------------------------------------
-
- @property
- def inferred_type(self) -> str:
- # b/c datetime is represented as microseconds since the epoch, make
- # sure we can't have ambiguous indexing
- return "datetime64"
-
- def indexer_at_time(self, time, asof: bool = False) -> npt.NDArray[np.intp]:
- """
- Return index locations of values at particular time of day.
-
- Parameters
- ----------
- time : datetime.time or str
- Time passed in either as object (datetime.time) or as string in
- appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
- "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").
-
- Returns
- -------
- np.ndarray[np.intp]
-
- See Also
- --------
- indexer_between_time : Get index locations of values between particular
- times of day.
- DataFrame.at_time : Select values at particular time of day.
- """
- if asof:
- raise NotImplementedError("'asof' argument is not supported")
-
- if isinstance(time, str):
- from dateutil.parser import parse
-
- time = parse(time).time()
-
- if time.tzinfo:
- if self.tz is None:
- raise ValueError("Index must be timezone aware.")
- time_micros = self.tz_convert(time.tzinfo)._get_time_micros()
- else:
- time_micros = self._get_time_micros()
- micros = _time_to_micros(time)
- return (time_micros == micros).nonzero()[0]
-
- def indexer_between_time(
- self, start_time, end_time, include_start: bool = True, include_end: bool = True
- ) -> npt.NDArray[np.intp]:
- """
- Return index locations of values between particular times of day.
-
- Parameters
- ----------
- start_time, end_time : datetime.time, str
- Time passed either as object (datetime.time) or as string in
- appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
- "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p").
- include_start : bool, default True
- include_end : bool, default True
-
- Returns
- -------
- np.ndarray[np.intp]
-
- See Also
- --------
- indexer_at_time : Get index locations of values at particular time of day.
- DataFrame.between_time : Select values between particular times of day.
- """
- start_time = to_time(start_time)
- end_time = to_time(end_time)
- time_micros = self._get_time_micros()
- start_micros = _time_to_micros(start_time)
- end_micros = _time_to_micros(end_time)
-
- if include_start and include_end:
- lop = rop = operator.le
- elif include_start:
- lop = operator.le
- rop = operator.lt
- elif include_end:
- lop = operator.lt
- rop = operator.le
- else:
- lop = rop = operator.lt
-
- if start_time <= end_time:
- join_op = operator.and_
- else:
- join_op = operator.or_
-
- mask = join_op(lop(start_micros, time_micros), rop(time_micros, end_micros))
-
- return mask.nonzero()[0]
-
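A combined sketch for the two time-of-day indexers above, assuming ``pandas as pd``:

import pandas as pd

dti = pd.date_range("2023-01-01", periods=4, freq="6H")
dti.indexer_at_time("12:00")                                   # array([2])
dti.indexer_between_time("06:00", "18:00")                     # array([1, 2, 3])
dti.indexer_between_time("06:00", "18:00", include_end=False)  # array([1, 2])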
-
-def date_range(
- start=None,
- end=None,
- periods=None,
- freq=None,
- tz=None,
- normalize: bool = False,
- name: Hashable = None,
- inclusive: IntervalClosedType = "both",
- *,
- unit: str | None = None,
- **kwargs,
-) -> DatetimeIndex:
- """
- Return a fixed frequency DatetimeIndex.
-
- Returns the range of equally spaced time points (where the difference between any
- two adjacent points is specified by the given frequency) such that they all
- satisfy `start <[=] x <[=] end`, where the first one and the last one are, resp.,
- the first and last time points in that range that fall on the boundary of ``freq``
- (if given as a frequency string) or that are valid for ``freq`` (if given as a
- :class:`pandas.tseries.offsets.DateOffset`). (If exactly one of ``start``,
- ``end``, or ``freq`` is *not* specified, this missing parameter can be computed
- given ``periods``, the number of timesteps in the range. See the note below.)
-
- Parameters
- ----------
- start : str or datetime-like, optional
- Left bound for generating dates.
- end : str or datetime-like, optional
- Right bound for generating dates.
- periods : int, optional
- Number of periods to generate.
- freq : str, datetime.timedelta, or DateOffset, default 'D'
- Frequency strings can have multiples, e.g. '5H'. See
- :ref:`here <timeseries.offset_aliases>` for a list of
- frequency aliases.
- tz : str or tzinfo, optional
- Time zone name for returning localized DatetimeIndex, for example
- 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
- timezone-naive unless timezone-aware datetime-likes are passed.
- normalize : bool, default False
- Normalize start/end dates to midnight before generating date range.
- name : str, default None
- Name of the resulting DatetimeIndex.
- inclusive : {"both", "neither", "left", "right"}, default "both"
- Include boundaries; whether to set each bound as closed or open.
-
- .. versionadded:: 1.4.0
- unit : str, default None
- Specify the desired resolution of the result.
-
- .. versionadded:: 2.0.0
- **kwargs
- For compatibility. Has no effect on the result.
-
- Returns
- -------
- DatetimeIndex
-
- See Also
- --------
- DatetimeIndex : An immutable container for datetimes.
- timedelta_range : Return a fixed frequency TimedeltaIndex.
- period_range : Return a fixed frequency PeriodIndex.
- interval_range : Return a fixed frequency IntervalIndex.
-
- Notes
- -----
- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
- exactly three must be specified. If ``freq`` is omitted, the resulting
- ``DatetimeIndex`` will have ``periods`` linearly spaced elements between
- ``start`` and ``end`` (closed on both sides).
-
- To learn more about the frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- Examples
- --------
- **Specifying the values**
-
- The next four examples generate the same `DatetimeIndex`, but vary
- the combination of `start`, `end` and `periods`.
-
- Specify `start` and `end`, with the default daily frequency.
-
- >>> pd.date_range(start='1/1/2018', end='1/08/2018')
- DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
- '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
- dtype='datetime64[ns]', freq='D')
-
- Specify timezone-aware `start` and `end`, with the default daily frequency.
-
- >>> pd.date_range(
- ... start=pd.to_datetime("1/1/2018").tz_localize("Europe/Berlin"),
- ... end=pd.to_datetime("1/08/2018").tz_localize("Europe/Berlin"),
- ... )
- DatetimeIndex(['2018-01-01 00:00:00+01:00', '2018-01-02 00:00:00+01:00',
- '2018-01-03 00:00:00+01:00', '2018-01-04 00:00:00+01:00',
- '2018-01-05 00:00:00+01:00', '2018-01-06 00:00:00+01:00',
- '2018-01-07 00:00:00+01:00', '2018-01-08 00:00:00+01:00'],
- dtype='datetime64[ns, Europe/Berlin]', freq='D')
-
- Specify `start` and `periods`, the number of periods (days).
-
- >>> pd.date_range(start='1/1/2018', periods=8)
- DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
- '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
- dtype='datetime64[ns]', freq='D')
-
- Specify `end` and `periods`, the number of periods (days).
-
- >>> pd.date_range(end='1/1/2018', periods=8)
- DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
- '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
- dtype='datetime64[ns]', freq='D')
-
- Specify `start`, `end`, and `periods`; the frequency is generated
- automatically (linearly spaced).
-
- >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)
- DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
- '2018-04-27 00:00:00'],
- dtype='datetime64[ns]', freq=None)
-
- **Other Parameters**
-
- Changed the `freq` (frequency) to ``'M'`` (month end frequency).
-
- >>> pd.date_range(start='1/1/2018', periods=5, freq='M')
- DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
- '2018-05-31'],
- dtype='datetime64[ns]', freq='M')
-
- Multiples are allowed
-
- >>> pd.date_range(start='1/1/2018', periods=5, freq='3M')
- DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
- '2019-01-31'],
- dtype='datetime64[ns]', freq='3M')
-
- `freq` can also be specified as an Offset object.
-
- >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
- DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
- '2019-01-31'],
- dtype='datetime64[ns]', freq='3M')
-
- Specify `tz` to set the timezone.
-
- >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo')
- DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00',
- '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00',
- '2018-01-05 00:00:00+09:00'],
- dtype='datetime64[ns, Asia/Tokyo]', freq='D')
-
- `inclusive` controls whether to include `start` and `end` that are on the
- boundary. The default, "both", includes boundary points on either end.
-
- >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive="both")
- DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],
- dtype='datetime64[ns]', freq='D')
-
- Use ``inclusive='left'`` to exclude `end` if it falls on the boundary.
-
- >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='left')
- DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'],
- dtype='datetime64[ns]', freq='D')
-
- Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and
- similarly ``inclusive='neither'`` will exclude both `start` and `end`.
-
- >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
- DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
- dtype='datetime64[ns]', freq='D')
-
- **Specify a unit**
-
- >>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s")
- DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01',
- '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01',
- '2817-01-01', '2917-01-01'],
- dtype='datetime64[s]', freq='100AS-JAN')
- """
- if freq is None and com.any_none(periods, start, end):
- freq = "D"
-
- dtarr = DatetimeArray._generate_range(
- start=start,
- end=end,
- periods=periods,
- freq=freq,
- tz=tz,
- normalize=normalize,
- inclusive=inclusive,
- unit=unit,
- **kwargs,
- )
- return DatetimeIndex._simple_new(dtarr, name=name)
-
-
-def bdate_range(
- start=None,
- end=None,
- periods: int | None = None,
- freq: Frequency = "B",
- tz=None,
- normalize: bool = True,
- name: Hashable = None,
- weekmask=None,
- holidays=None,
- inclusive: IntervalClosedType = "both",
- **kwargs,
-) -> DatetimeIndex:
- """
- Return a fixed frequency DatetimeIndex with business day as the default.
-
- Parameters
- ----------
- start : str or datetime-like, default None
- Left bound for generating dates.
- end : str or datetime-like, default None
- Right bound for generating dates.
- periods : int, default None
- Number of periods to generate.
- freq : str, Timedelta, datetime.timedelta, or DateOffset, default 'B'
- Frequency strings can have multiples, e.g. '5H'. The default is
- business daily ('B').
- tz : str or None
- Time zone name for returning localized DatetimeIndex, for example
- Asia/Shanghai.
- normalize : bool, default True
- Normalize start/end dates to midnight before generating date range.
- name : str, default None
- Name of the resulting DatetimeIndex.
- weekmask : str or None, default None
- Weekmask of valid business days, passed to ``numpy.busdaycalendar``,
- only used when custom frequency strings are passed. The default
- value None is equivalent to 'Mon Tue Wed Thu Fri'.
- holidays : list-like or None, default None
- Dates to exclude from the set of valid business days, passed to
- ``numpy.busdaycalendar``, only used when custom frequency strings
- are passed.
- inclusive : {"both", "neither", "left", "right"}, default "both"
- Include boundaries; whether to set each bound as closed or open.
-
- .. versionadded:: 1.4.0
- **kwargs
- For compatibility. Has no effect on the result.
-
- Returns
- -------
- DatetimeIndex
-
- Notes
- -----
- Of the four parameters: ``start``, ``end``, ``periods``, and ``freq``,
- exactly three must be specified. Specifying ``freq`` is a requirement
- for ``bdate_range``. Use ``date_range`` if specifying ``freq`` is not
- desired.
-
- To learn more about the frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- Examples
- --------
- Note how the two weekend days are skipped in the result.
-
- >>> pd.bdate_range(start='1/1/2018', end='1/08/2018')
- DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
- '2018-01-05', '2018-01-08'],
- dtype='datetime64[ns]', freq='B')
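-
- A custom frequency string together with ``holidays`` skips the listed
- dates as well (the holiday below is purely illustrative):
-
- >>> pd.bdate_range(start='1/1/2018', end='1/08/2018', freq='C',
- ... holidays=['2018-01-05'])
- DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
- '2018-01-08'],
- dtype='datetime64[ns]', freq='C')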
- """
- if freq is None:
- msg = "freq must be specified for bdate_range; use date_range instead"
- raise TypeError(msg)
-
- if isinstance(freq, str) and freq.startswith("C"):
- try:
- weekmask = weekmask or "Mon Tue Wed Thu Fri"
- freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask)
- except (KeyError, TypeError) as err:
- msg = f"invalid custom frequency string: {freq}"
- raise ValueError(msg) from err
- elif holidays or weekmask:
- msg = (
- "a custom frequency string is required when holidays or "
- f"weekmask are passed, got frequency {freq}"
- )
- raise ValueError(msg)
-
- return date_range(
- start=start,
- end=end,
- periods=periods,
- freq=freq,
- tz=tz,
- normalize=normalize,
- name=name,
- inclusive=inclusive,
- **kwargs,
- )
-
-
-def _time_to_micros(time_obj: dt.time) -> int:
- seconds = time_obj.hour * 60 * 60 + 60 * time_obj.minute + time_obj.second
- return 1_000_000 * seconds + time_obj.microsecond
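-
-# Sanity check (illustrative, not part of the original module):
-# _time_to_micros(dt.time(1, 2, 3, 4)) == 1_000_000 * (1*3600 + 2*60 + 3) + 4
-# == 3_723_000_004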
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/extension.py b/contrib/python/pandas/py3/pandas/core/indexes/extension.py
deleted file mode 100644
index 81d502b60d6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/extension.py
+++ /dev/null
@@ -1,192 +0,0 @@
-"""
-Shared methods for Index subclasses backed by ExtensionArray.
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Callable,
- TypeVar,
-)
-
-import numpy as np
-
-from pandas._typing import (
- ArrayLike,
- npt,
-)
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.generic import ABCDataFrame
-
-from pandas.core.indexes.base import Index
-
-if TYPE_CHECKING:
- from pandas.core.arrays import IntervalArray
- from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-
-_T = TypeVar("_T", bound="NDArrayBackedExtensionIndex")
-_ExtensionIndexT = TypeVar("_ExtensionIndexT", bound="ExtensionIndex")
-
-
-def _inherit_from_data(
- name: str, delegate: type, cache: bool = False, wrap: bool = False
-):
- """
- Make an alias for a method of the underlying ExtensionArray.
-
- Parameters
- ----------
- name : str
- Name of an attribute the class should inherit from its EA parent.
- delegate : class
- cache : bool, default False
- Whether to convert wrapped properties into cache_readonly
- wrap : bool, default False
- Whether to wrap the inherited result in an Index.
-
- Returns
- -------
- attribute, method, property, or cache_readonly
- """
- attr = getattr(delegate, name)
-
- if isinstance(attr, property) or type(attr).__name__ == "getset_descriptor":
- # getset_descriptor i.e. property defined in cython class
- if cache:
-
- def cached(self):
- return getattr(self._data, name)
-
- cached.__name__ = name
- cached.__doc__ = attr.__doc__
- method = cache_readonly(cached)
-
- else:
-
- def fget(self):
- result = getattr(self._data, name)
- if wrap:
- if isinstance(result, type(self._data)):
- return type(self)._simple_new(result, name=self.name)
- elif isinstance(result, ABCDataFrame):
- return result.set_index(self)
- return Index(result, name=self.name)
- return result
-
- def fset(self, value) -> None:
- setattr(self._data, name, value)
-
- fget.__name__ = name
- fget.__doc__ = attr.__doc__
-
- method = property(fget, fset)
-
- elif not callable(attr):
- # just a normal attribute, no wrapping
- method = attr
-
- else:
- # error: Incompatible redefinition (redefinition with type "Callable[[Any,
- # VarArg(Any), KwArg(Any)], Any]", original type "property")
- def method(self, *args, **kwargs): # type: ignore[misc]
- if "inplace" in kwargs:
- raise ValueError(f"cannot use inplace with {type(self).__name__}")
- result = attr(self._data, *args, **kwargs)
- if wrap:
- if isinstance(result, type(self._data)):
- return type(self)._simple_new(result, name=self.name)
- elif isinstance(result, ABCDataFrame):
- return result.set_index(self)
- return Index(result, name=self.name)
- return result
-
- # error: "property" has no attribute "__name__"
- method.__name__ = name # type: ignore[attr-defined]
- method.__doc__ = attr.__doc__
- return method
-
-
-def inherit_names(
- names: list[str], delegate: type, cache: bool = False, wrap: bool = False
-) -> Callable[[type[_ExtensionIndexT]], type[_ExtensionIndexT]]:
- """
- Class decorator to pin attributes from an ExtensionArray to an Index subclass.
-
- Parameters
- ----------
- names : List[str]
- delegate : class
- cache : bool, default False
- wrap : bool, default False
- Whether to wrap the inherited result in an Index.
- """
-
- def wrapper(cls: type[_ExtensionIndexT]) -> type[_ExtensionIndexT]:
- for name in names:
- meth = _inherit_from_data(name, delegate, cache=cache, wrap=wrap)
- setattr(cls, name, meth)
-
- return cls
-
- return wrapper
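-
-
-# Illustrative usage sketch (hypothetical names, not part of the original
-# module): ``inherit_names`` is applied as a class decorator so that the
-# listed attributes are looked up on the wrapped ExtensionArray, e.g.
-#
-#     @inherit_names(["argmin", "argmax"], MyArray, wrap=False)
-#     class MyIndex(ExtensionIndex):
-#         _data: MyArray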
-
-
-class ExtensionIndex(Index):
- """
- Index subclass for indexes backed by ExtensionArray.
- """
-
- # The base class already passes through to _data:
- # size, __len__, dtype
-
- _data: IntervalArray | NDArrayBackedExtensionArray
-
- # ---------------------------------------------------------------------
-
- def _validate_fill_value(self, value):
- """
- Convert value to be insertable to underlying array.
- """
- return self._data._validate_setitem_value(value)
-
- @doc(Index.map)
- def map(self, mapper, na_action=None):
- # Try to run function on index first, and then on elements of index
- # Especially important for group-by functionality
- try:
- result = mapper(self)
-
- # Try to use this result if we can
- if isinstance(result, np.ndarray):
- result = Index(result)
-
- if not isinstance(result, Index):
- raise TypeError("The map function must return an Index object")
- return result
- except Exception:
- return self.astype(object).map(mapper)
-
- @cache_readonly
- def _isnan(self) -> npt.NDArray[np.bool_]:
- # error: Incompatible return value type (got "ExtensionArray", expected
- # "ndarray")
- return self._data.isna() # type: ignore[return-value]
-
-
-class NDArrayBackedExtensionIndex(ExtensionIndex):
- """
- Index subclass for indexes backed by NDArrayBackedExtensionArray.
- """
-
- _data: NDArrayBackedExtensionArray
-
- def _get_engine_target(self) -> np.ndarray:
- return self._data._ndarray
-
- def _from_join_target(self, result: np.ndarray) -> ArrayLike:
- assert result.dtype == self._data._ndarray.dtype
- return self._data._from_backing_data(result)
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/frozen.py b/contrib/python/pandas/py3/pandas/core/indexes/frozen.py
deleted file mode 100644
index 3b8aefdbeb8..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/frozen.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""
-frozen (immutable) data structures to support MultiIndexing
-
-These are used for:
-
-- .names (FrozenList)
-
-"""
-from __future__ import annotations
-
-from typing import (
- Any,
- NoReturn,
-)
-
-from pandas.core.base import PandasObject
-
-from pandas.io.formats.printing import pprint_thing
-
-
-class FrozenList(PandasObject, list):
- """
- Container that doesn't allow setting items, but, because it is
- technically hashable, can be used for lookups and similar
- read-only operations.
- """
-
- # Side note: This has to be of type list. Otherwise,
- # it messes up PyTables type checks.
-
- def union(self, other) -> FrozenList:
- """
- Returns a FrozenList with other concatenated to the end of self.
-
- Parameters
- ----------
- other : array-like
- The array-like whose elements we are concatenating.
-
- Returns
- -------
- FrozenList
- A FrozenList with other concatenated to the end of self.
- """
- if isinstance(other, tuple):
- other = list(other)
- return type(self)(super().__add__(other))
-
- def difference(self, other) -> FrozenList:
- """
- Returns a FrozenList with elements from other removed from self.
-
- Parameters
- ----------
- other : array-like
- The array-like whose elements we are removing from self.
-
- Returns
- -------
- FrozenList
- The collection difference between self and other.
- """
- other = set(other)
- temp = [x for x in self if x not in other]
- return type(self)(temp)
-
- # TODO: Consider deprecating these in favor of `union` (xref gh-15506)
- # error: Incompatible types in assignment (expression has type
- # "Callable[[FrozenList, Any], FrozenList]", base class "list" defined the
- # type as overloaded function)
- __add__ = __iadd__ = union # type: ignore[assignment]
-
- def __getitem__(self, n):
- if isinstance(n, slice):
- return type(self)(super().__getitem__(n))
- return super().__getitem__(n)
-
- def __radd__(self, other):
- if isinstance(other, tuple):
- other = list(other)
- return type(self)(other + list(self))
-
- def __eq__(self, other: Any) -> bool:
- if isinstance(other, (tuple, FrozenList)):
- other = list(other)
- return super().__eq__(other)
-
- __req__ = __eq__
-
- def __mul__(self, other):
- return type(self)(super().__mul__(other))
-
- __imul__ = __mul__
-
- def __reduce__(self):
- return type(self), (list(self),)
-
- # error: Signature of "__hash__" incompatible with supertype "list"
- def __hash__(self) -> int: # type: ignore[override]
- return hash(tuple(self))
-
- def _disabled(self, *args, **kwargs) -> NoReturn:
- """
- This method will not function because object is immutable.
- """
- raise TypeError(f"'{type(self).__name__}' does not support mutable operations.")
-
- def __str__(self) -> str:
- return pprint_thing(self, quote_strings=True, escape_chars=("\t", "\r", "\n"))
-
- def __repr__(self) -> str:
- return f"{type(self).__name__}({str(self)})"
-
- __setitem__ = __setslice__ = _disabled # type: ignore[assignment]
- __delitem__ = __delslice__ = _disabled
- pop = append = extend = _disabled
- remove = sort = insert = _disabled # type: ignore[assignment]
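-
-
-# Minimal usage sketch (illustrative, not part of the original module):
-#
-#     >>> fl = FrozenList(["a", "b"])
-#     >>> fl.union(["c"])
-#     FrozenList(['a', 'b', 'c'])
-#     >>> fl.difference(["b"])
-#     FrozenList(['a'])
-#     >>> fl.append("c")
-#     Traceback (most recent call last):
-#     ...
-#     TypeError: 'FrozenList' does not support mutable operations.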
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/interval.py b/contrib/python/pandas/py3/pandas/core/indexes/interval.py
deleted file mode 100644
index b1705b1f2c8..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/interval.py
+++ /dev/null
@@ -1,1137 +0,0 @@
-""" define the IntervalIndex """
-from __future__ import annotations
-
-from operator import (
- le,
- lt,
-)
-import textwrap
-from typing import (
- Any,
- Hashable,
- Literal,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.interval import (
- Interval,
- IntervalMixin,
- IntervalTree,
-)
-from pandas._libs.tslibs import (
- BaseOffset,
- Timedelta,
- Timestamp,
- to_offset,
-)
-from pandas._typing import (
- Dtype,
- DtypeObj,
- IntervalClosedType,
- npt,
-)
-from pandas.errors import InvalidIndexError
-from pandas.util._decorators import (
- Appender,
- cache_readonly,
-)
-from pandas.util._exceptions import rewrite_exception
-
-from pandas.core.dtypes.cast import (
- find_common_type,
- infer_dtype_from_scalar,
- maybe_box_datetimelike,
- maybe_downcast_numeric,
- maybe_upcast_numeric_to_64bit,
-)
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- is_datetime64tz_dtype,
- is_datetime_or_timedelta_dtype,
- is_dtype_equal,
- is_float,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_interval_dtype,
- is_list_like,
- is_number,
- is_object_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.dtypes import IntervalDtype
-from pandas.core.dtypes.missing import is_valid_na_for_dtype
-
-from pandas.core.algorithms import unique
-from pandas.core.arrays.interval import (
- IntervalArray,
- _interval_shared_docs,
-)
-import pandas.core.common as com
-from pandas.core.indexers import is_valid_positional_slice
-import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import (
- Index,
- _index_shared_docs,
- ensure_index,
- maybe_extract_name,
-)
-from pandas.core.indexes.datetimes import (
- DatetimeIndex,
- date_range,
-)
-from pandas.core.indexes.extension import (
- ExtensionIndex,
- inherit_names,
-)
-from pandas.core.indexes.multi import MultiIndex
-from pandas.core.indexes.timedeltas import (
- TimedeltaIndex,
- timedelta_range,
-)
-
-_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-
-_index_doc_kwargs.update(
- {
- "klass": "IntervalIndex",
- "qualname": "IntervalIndex",
- "target_klass": "IntervalIndex or list of Intervals",
- "name": textwrap.dedent(
- """\
- name : object, optional
- Name to be stored in the index.
- """
- ),
- }
-)
-
-
-def _get_next_label(label):
- dtype = getattr(label, "dtype", type(label))
- if isinstance(label, (Timestamp, Timedelta)):
- dtype = "datetime64"
- if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
- return label + np.timedelta64(1, "ns")
- elif is_integer_dtype(dtype):
- return label + 1
- elif is_float_dtype(dtype):
- return np.nextafter(label, np.infty)
- else:
- raise TypeError(f"cannot determine next label for type {repr(type(label))}")
-
-
-def _get_prev_label(label):
- dtype = getattr(label, "dtype", type(label))
- if isinstance(label, (Timestamp, Timedelta)):
- dtype = "datetime64"
- if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
- return label - np.timedelta64(1, "ns")
- elif is_integer_dtype(dtype):
- return label - 1
- elif is_float_dtype(dtype):
- return np.nextafter(label, -np.infty)
- else:
- raise TypeError(f"cannot determine next label for type {repr(type(label))}")
-
-
-def _new_IntervalIndex(cls, d):
- """
- This is called upon unpickling, rather than the default which doesn't have
- arguments and breaks __new__.
- """
- return cls.from_arrays(**d)
-
-
-@Appender(
- _interval_shared_docs["class"]
- % {
- "klass": "IntervalIndex",
- "summary": "Immutable index of intervals that are closed on the same side.",
- "name": _index_doc_kwargs["name"],
- "versionadded": "0.20.0",
- "extra_attributes": "is_overlapping\nvalues\n",
- "extra_methods": "",
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- A new ``IntervalIndex`` is typically constructed using
- :func:`interval_range`:
-
- >>> pd.interval_range(start=0, end=5)
- IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
- dtype='interval[int64, right]')
-
- It may also be constructed using one of the constructor
- methods: :meth:`IntervalIndex.from_arrays`,
- :meth:`IntervalIndex.from_breaks`, and :meth:`IntervalIndex.from_tuples`.
-
- See further examples in the doc strings of ``interval_range`` and the
- mentioned constructor methods.
- """
- ),
- }
-)
-@inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True)
-@inherit_names(
- [
- "__array__",
- "overlaps",
- "contains",
- "closed_left",
- "closed_right",
- "open_left",
- "open_right",
- "is_empty",
- ],
- IntervalArray,
-)
-@inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True)
-class IntervalIndex(ExtensionIndex):
- _typ = "intervalindex"
-
- # annotate properties pinned via inherit_names
- closed: IntervalClosedType
- is_non_overlapping_monotonic: bool
- closed_left: bool
- closed_right: bool
- open_left: bool
- open_right: bool
-
- _data: IntervalArray
- _values: IntervalArray
- _can_hold_strings = False
- _data_cls = IntervalArray
-
- # --------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls,
- data,
- closed=None,
- dtype: Dtype | None = None,
- copy: bool = False,
- name: Hashable = None,
- verify_integrity: bool = True,
- ) -> IntervalIndex:
- name = maybe_extract_name(name, data, cls)
-
- with rewrite_exception("IntervalArray", cls.__name__):
- array = IntervalArray(
- data,
- closed=closed,
- copy=copy,
- dtype=dtype,
- verify_integrity=verify_integrity,
- )
-
- return cls._simple_new(array, name)
-
- @classmethod
- @Appender(
- _interval_shared_docs["from_breaks"]
- % {
- "klass": "IntervalIndex",
- "name": textwrap.dedent(
- """
- name : str, optional
- Name of the resulting IntervalIndex."""
- ),
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3])
- IntervalIndex([(0, 1], (1, 2], (2, 3]],
- dtype='interval[int64, right]')
- """
- ),
- }
- )
- def from_breaks(
- cls,
- breaks,
- closed: IntervalClosedType | None = "right",
- name: Hashable = None,
- copy: bool = False,
- dtype: Dtype | None = None,
- ) -> IntervalIndex:
- with rewrite_exception("IntervalArray", cls.__name__):
- array = IntervalArray.from_breaks(
- breaks, closed=closed, copy=copy, dtype=dtype
- )
- return cls._simple_new(array, name=name)
-
- @classmethod
- @Appender(
- _interval_shared_docs["from_arrays"]
- % {
- "klass": "IntervalIndex",
- "name": textwrap.dedent(
- """
- name : str, optional
- Name of the resulting IntervalIndex."""
- ),
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])
- IntervalIndex([(0, 1], (1, 2], (2, 3]],
- dtype='interval[int64, right]')
- """
- ),
- }
- )
- def from_arrays(
- cls,
- left,
- right,
- closed: IntervalClosedType = "right",
- name: Hashable = None,
- copy: bool = False,
- dtype: Dtype | None = None,
- ) -> IntervalIndex:
- with rewrite_exception("IntervalArray", cls.__name__):
- array = IntervalArray.from_arrays(
- left, right, closed, copy=copy, dtype=dtype
- )
- return cls._simple_new(array, name=name)
-
- @classmethod
- @Appender(
- _interval_shared_docs["from_tuples"]
- % {
- "klass": "IntervalIndex",
- "name": textwrap.dedent(
- """
- name : str, optional
- Name of the resulting IntervalIndex."""
- ),
- "examples": textwrap.dedent(
- """\
- Examples
- --------
- >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])
- IntervalIndex([(0, 1], (1, 2]],
- dtype='interval[int64, right]')
- """
- ),
- }
- )
- def from_tuples(
- cls,
- data,
- closed: IntervalClosedType = "right",
- name: Hashable = None,
- copy: bool = False,
- dtype: Dtype | None = None,
- ) -> IntervalIndex:
- with rewrite_exception("IntervalArray", cls.__name__):
- arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype)
- return cls._simple_new(arr, name=name)
-
- # --------------------------------------------------------------------
- # error: Return type "IntervalTree" of "_engine" incompatible with return type
- # "Union[IndexEngine, ExtensionEngine]" in supertype "Index"
- @cache_readonly
- def _engine(self) -> IntervalTree: # type: ignore[override]
- # IntervalTree does not support numpy arrays unless they are 64 bit
- left = self._maybe_convert_i8(self.left)
- left = maybe_upcast_numeric_to_64bit(left)
- right = self._maybe_convert_i8(self.right)
- right = maybe_upcast_numeric_to_64bit(right)
- return IntervalTree(left, right, closed=self.closed)
-
- def __contains__(self, key: Any) -> bool:
- """
- Return a boolean indicating whether this key is IN the index.
-
- We *only* accept an Interval.
-
- Parameters
- ----------
- key : Interval
-
- Returns
- -------
- bool
- """
- hash(key)
- if not isinstance(key, Interval):
- if is_valid_na_for_dtype(key, self.dtype):
- return self.hasnans
- return False
-
- try:
- self.get_loc(key)
- return True
- except KeyError:
- return False
-
- @cache_readonly
- def _multiindex(self) -> MultiIndex:
- return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"])
-
- def __reduce__(self):
- d = {
- "left": self.left,
- "right": self.right,
- "closed": self.closed,
- "name": self.name,
- }
- return _new_IntervalIndex, (type(self), d), None
-
- @property
- def inferred_type(self) -> str:
- """Return a string of the type inferred from the values"""
- return "interval"
-
- # Cannot determine type of "memory_usage"
- @Appender(Index.memory_usage.__doc__) # type: ignore[has-type]
- def memory_usage(self, deep: bool = False) -> int:
- # we don't use an explicit engine
- # so return the bytes here
- return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep)
-
- # IntervalTree doesn't have a is_monotonic_decreasing, so have to override
- # the Index implementation
- @cache_readonly
- def is_monotonic_decreasing(self) -> bool:
- """
- Return True if the IntervalIndex is monotonic decreasing (only equal or
- decreasing values), else False.
- """
- return self[::-1].is_monotonic_increasing
-
- @cache_readonly
- def is_unique(self) -> bool:
- """
- Return True if the IntervalIndex contains unique elements, else False.
- """
- left = self.left
- right = self.right
-
- if self.isna().sum() > 1:
- return False
-
- if left.is_unique or right.is_unique:
- return True
-
- seen_pairs = set()
- check_idx = np.where(left.duplicated(keep=False))[0]
- for idx in check_idx:
- pair = (left[idx], right[idx])
- if pair in seen_pairs:
- return False
- seen_pairs.add(pair)
-
- return True
-
- @property
- def is_overlapping(self) -> bool:
- """
- Return True if the IntervalIndex has overlapping intervals, else False.
-
- Two intervals overlap if they share a common point, including closed
- endpoints. Intervals that only have an open endpoint in common do not
- overlap.
-
- Returns
- -------
- bool
- Boolean indicating if the IntervalIndex has overlapping intervals.
-
- See Also
- --------
- Interval.overlaps : Check whether two Interval objects overlap.
- IntervalIndex.overlaps : Check an IntervalIndex elementwise for
- overlaps.
-
- Examples
- --------
- >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)])
- >>> index
- IntervalIndex([(0, 2], (1, 3], (4, 5]],
- dtype='interval[int64, right]')
- >>> index.is_overlapping
- True
-
- Intervals that share closed endpoints overlap:
-
- >>> index = pd.interval_range(0, 3, closed='both')
- >>> index
- IntervalIndex([[0, 1], [1, 2], [2, 3]],
- dtype='interval[int64, both]')
- >>> index.is_overlapping
- True
-
- Intervals that only have an open endpoint in common do not overlap:
-
- >>> index = pd.interval_range(0, 3, closed='left')
- >>> index
- IntervalIndex([[0, 1), [1, 2), [2, 3)],
- dtype='interval[int64, left]')
- >>> index.is_overlapping
- False
- """
- # GH 23309
- return self._engine.is_overlapping
-
- def _needs_i8_conversion(self, key) -> bool:
- """
- Check if a given key needs i8 conversion. Conversion is necessary for
- Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An
- Interval-like requires conversion if its endpoints are one of the
- aforementioned types.
-
- Assumes that any list-like data has already been cast to an Index.
-
- Parameters
- ----------
- key : scalar or Index-like
- The key that should be checked for i8 conversion
-
- Returns
- -------
- bool
- """
- if is_interval_dtype(key) or isinstance(key, Interval):
- return self._needs_i8_conversion(key.left)
-
- i8_types = (Timestamp, Timedelta, DatetimeIndex, TimedeltaIndex)
- return isinstance(key, i8_types)
-
- def _maybe_convert_i8(self, key):
- """
- Maybe convert a given key to its equivalent i8 value(s). Used as a
- preprocessing step prior to IntervalTree queries (self._engine), which
- expects numeric data.
-
- Parameters
- ----------
- key : scalar or list-like
- The key that should maybe be converted to i8.
-
- Returns
- -------
- scalar or list-like
- The original key if no conversion occurred, int if converted scalar,
- Index with an int64 dtype if converted list-like.
- """
- if is_list_like(key):
- key = ensure_index(key)
- key = maybe_upcast_numeric_to_64bit(key)
-
- if not self._needs_i8_conversion(key):
- return key
-
- scalar = is_scalar(key)
- if is_interval_dtype(key) or isinstance(key, Interval):
- # convert left/right and reconstruct
- left = self._maybe_convert_i8(key.left)
- right = self._maybe_convert_i8(key.right)
- constructor = Interval if scalar else IntervalIndex.from_arrays
- # error: "object" not callable
- return constructor(
- left, right, closed=self.closed
- ) # type: ignore[operator]
-
- if scalar:
- # Timestamp/Timedelta
- key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True)
- if lib.is_period(key):
- key_i8 = key.ordinal
- elif isinstance(key_i8, Timestamp):
- key_i8 = key_i8._value
- elif isinstance(key_i8, (np.datetime64, np.timedelta64)):
- key_i8 = key_i8.view("i8")
- else:
- # DatetimeIndex/TimedeltaIndex
- key_dtype, key_i8 = key.dtype, Index(key.asi8)
- if key.hasnans:
- # convert NaT from its i8 value to np.nan so it's not viewed
- # as a valid value, maybe causing errors (e.g. is_overlapping)
- key_i8 = key_i8.where(~key._isnan)
-
- # ensure consistency with IntervalIndex subtype
- # error: Item "ExtensionDtype"/"dtype[Any]" of "Union[dtype[Any],
- # ExtensionDtype]" has no attribute "subtype"
- subtype = self.dtype.subtype # type: ignore[union-attr]
-
- if not is_dtype_equal(subtype, key_dtype):
- raise ValueError(
- f"Cannot index an IntervalIndex of subtype {subtype} with "
- f"values of dtype {key_dtype}"
- )
-
- return key_i8
-
- def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"):
- if not self.is_non_overlapping_monotonic:
- raise KeyError(
- "can only get slices from an IntervalIndex if bounds are "
- "non-overlapping and all monotonic increasing or decreasing"
- )
-
- if isinstance(label, (IntervalMixin, IntervalIndex)):
- raise NotImplementedError("Interval objects are not currently supported")
-
- # GH 20921: "not is_monotonic_increasing" for the second condition
- # instead of "is_monotonic_decreasing" to account for single element
- # indexes being both increasing and decreasing
- if (side == "left" and self.left.is_monotonic_increasing) or (
- side == "right" and not self.left.is_monotonic_increasing
- ):
- sub_idx = self.right
- if self.open_right:
- label = _get_next_label(label)
- else:
- sub_idx = self.left
- if self.open_left:
- label = _get_prev_label(label)
-
- return sub_idx._searchsorted_monotonic(label, side)
-
- # --------------------------------------------------------------------
- # Indexing Methods
-
- def get_loc(self, key) -> int | slice | np.ndarray:
- """
- Get integer location, slice or boolean mask for requested label.
-
- Parameters
- ----------
- key : label
-
- Returns
- -------
- int if unique index, slice if monotonic index, else mask
-
- Examples
- --------
- >>> i1, i2 = pd.Interval(0, 1), pd.Interval(1, 2)
- >>> index = pd.IntervalIndex([i1, i2])
- >>> index.get_loc(1)
- 0
-
- You can also supply a point inside an interval.
-
- >>> index.get_loc(1.5)
- 1
-
- If a label is in several intervals, you get the locations of all the
- relevant intervals.
-
- >>> i3 = pd.Interval(0, 2)
- >>> overlapping_index = pd.IntervalIndex([i1, i2, i3])
- >>> overlapping_index.get_loc(0.5)
- array([ True, False, True])
-
- Only exact matches will be returned if an interval is provided.
-
- >>> index.get_loc(pd.Interval(0, 1))
- 0
- """
- self._check_indexing_error(key)
-
- if isinstance(key, Interval):
- if self.closed != key.closed:
- raise KeyError(key)
- mask = (self.left == key.left) & (self.right == key.right)
- elif is_valid_na_for_dtype(key, self.dtype):
- mask = self.isna()
- else:
- # assume scalar
- op_left = le if self.closed_left else lt
- op_right = le if self.closed_right else lt
- try:
- mask = op_left(self.left, key) & op_right(key, self.right)
- except TypeError as err:
- # scalar is not comparable to II subtype --> invalid label
- raise KeyError(key) from err
-
- matches = mask.sum()
- if matches == 0:
- raise KeyError(key)
- if matches == 1:
- return mask.argmax()
-
- res = lib.maybe_booleans_to_slice(mask.view("u1"))
- if isinstance(res, slice) and res.stop is None:
- # TODO: DO this in maybe_booleans_to_slice?
- res = slice(res.start, len(self), res.step)
- return res
-
- def _get_indexer(
- self,
- target: Index,
- method: str | None = None,
- limit: int | None = None,
- tolerance: Any | None = None,
- ) -> npt.NDArray[np.intp]:
- if isinstance(target, IntervalIndex):
- # We only get here with not self.is_overlapping
- # -> at most one match per interval in target
- # want exact matches -> need both left/right to match, so defer to
- # left/right get_indexer, compare elementwise, equality -> match
- indexer = self._get_indexer_unique_sides(target)
-
- elif not is_object_dtype(target.dtype):
- # homogeneous scalar index: use IntervalTree
- # we should always have self._should_partial_index(target) here
- target = self._maybe_convert_i8(target)
- indexer = self._engine.get_indexer(target.values)
- else:
- # heterogeneous scalar index: defer elementwise to get_loc
- # we should always have self._should_partial_index(target) here
- return self._get_indexer_pointwise(target)[0]
-
- return ensure_platform_int(indexer)
-
- @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
- def get_indexer_non_unique(
- self, target: Index
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- target = ensure_index(target)
-
- if not self._should_compare(target) and not self._should_partial_index(target):
- # e.g. IntervalIndex with different closed or incompatible subtype
- # -> no matches
- return self._get_indexer_non_comparable(target, None, unique=False)
-
- elif isinstance(target, IntervalIndex):
- if self.left.is_unique and self.right.is_unique:
- # fastpath available even if we don't have self._index_as_unique
- indexer = self._get_indexer_unique_sides(target)
- missing = (indexer == -1).nonzero()[0]
- else:
- return self._get_indexer_pointwise(target)
-
- elif is_object_dtype(target.dtype) or not self._should_partial_index(target):
- # target might contain intervals: defer elementwise to get_loc
- return self._get_indexer_pointwise(target)
-
- else:
- # Note: this case behaves differently from other Index subclasses
- # because IntervalIndex does partial-int indexing
- target = self._maybe_convert_i8(target)
- indexer, missing = self._engine.get_indexer_non_unique(target.values)
-
- return ensure_platform_int(indexer), ensure_platform_int(missing)
-
- def _get_indexer_unique_sides(self, target: IntervalIndex) -> npt.NDArray[np.intp]:
- """
- _get_indexer specialized to the case where both of our sides are unique.
- """
- # Caller is responsible for checking
- # `self.left.is_unique and self.right.is_unique`
-
- left_indexer = self.left.get_indexer(target.left)
- right_indexer = self.right.get_indexer(target.right)
- indexer = np.where(left_indexer == right_indexer, left_indexer, -1)
- return indexer
-
- def _get_indexer_pointwise(
- self, target: Index
- ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- """
- pointwise implementation for get_indexer and get_indexer_non_unique.
- """
- indexer, missing = [], []
- for i, key in enumerate(target):
- try:
- locs = self.get_loc(key)
- if isinstance(locs, slice):
- # Only needed for get_indexer_non_unique
- locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp")
- elif lib.is_integer(locs):
- locs = np.array(locs, ndmin=1)
- else:
- # otherwise we have ndarray[bool]
- locs = np.where(locs)[0]
- except KeyError:
- missing.append(i)
- locs = np.array([-1])
- except InvalidIndexError:
- # i.e. non-scalar key e.g. a tuple.
- # see test_append_different_columns_types_raises
- missing.append(i)
- locs = np.array([-1])
-
- indexer.append(locs)
-
- indexer = np.concatenate(indexer)
- return ensure_platform_int(indexer), ensure_platform_int(missing)
-
- @cache_readonly
- def _index_as_unique(self) -> bool:
- return not self.is_overlapping and self._engine._na_count < 2
-
- _requires_unique_msg = (
- "cannot handle overlapping indices; use IntervalIndex.get_indexer_non_unique"
- )
-
- def _convert_slice_indexer(self, key: slice, kind: str):
- if not (key.step is None or key.step == 1):
- # GH#31658 if label-based, we require step == 1,
- # if positional, we disallow float start/stop
- msg = "label-based slicing with step!=1 is not supported for IntervalIndex"
- if kind == "loc":
- raise ValueError(msg)
- if kind == "getitem":
- if not is_valid_positional_slice(key):
- # i.e. this cannot be interpreted as a positional slice
- raise ValueError(msg)
-
- return super()._convert_slice_indexer(key, kind)
-
- @cache_readonly
- def _should_fallback_to_positional(self) -> bool:
- # integer lookups in Series.__getitem__ are unambiguously
- # positional in this case
- # error: Item "ExtensionDtype"/"dtype[Any]" of "Union[dtype[Any],
- # ExtensionDtype]" has no attribute "subtype"
- return self.dtype.subtype.kind in ["m", "M"] # type: ignore[union-attr]
-
- def _maybe_cast_slice_bound(self, label, side: str):
- return getattr(self, side)._maybe_cast_slice_bound(label, side)
-
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- if not isinstance(dtype, IntervalDtype):
- return False
- common_subtype = find_common_type([self.dtype, dtype])
- return not is_object_dtype(common_subtype)
-
- # --------------------------------------------------------------------
-
- @cache_readonly
- def left(self) -> Index:
- return Index(self._data.left, copy=False)
-
- @cache_readonly
- def right(self) -> Index:
- return Index(self._data.right, copy=False)
-
- @cache_readonly
- def mid(self) -> Index:
- return Index(self._data.mid, copy=False)
-
- @property
- def length(self) -> Index:
- return Index(self._data.length, copy=False)
-
- # --------------------------------------------------------------------
- # Rendering Methods
- # __repr__ associated methods are based on MultiIndex
-
- def _format_with_header(self, header: list[str], na_rep: str) -> list[str]:
- # matches base class except for whitespace padding
- return header + list(self._format_native_types(na_rep=na_rep))
-
- def _format_native_types(
- self, *, na_rep: str = "NaN", quoting=None, **kwargs
- ) -> npt.NDArray[np.object_]:
- # GH 28210: use base method but with different default na_rep
- return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs)
-
- def _format_data(self, name=None) -> str:
- # TODO: integrate with categorical and make generic
- # name argument is unused here; just for compat with base / categorical
- return f"{self._data._format_data()},{self._format_space()}"
-
- # --------------------------------------------------------------------
- # Set Operations
-
- def _intersection(self, other, sort):
- """
- intersection specialized to the case with matching dtypes.
- """
- # For IntervalIndex we also know other.closed == self.closed
- if self.left.is_unique and self.right.is_unique:
- taken = self._intersection_unique(other)
- elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1:
- # Swap other/self if other is unique and self does not have
- # multiple NaNs
- taken = other._intersection_unique(self)
- else:
- # duplicates
- taken = self._intersection_non_unique(other)
-
- if sort is None:
- taken = taken.sort_values()
-
- return taken
-
- def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex:
- """
- Return the intersection with another IntervalIndex.
-
- Used when neither the left nor the right endpoints of this
- IntervalIndex contain duplicates.
-
- Parameters
- ----------
- other : IntervalIndex
-
- Returns
- -------
- IntervalIndex
- """
- # Note: this is much more performant than super()._intersection(other)
- lindexer = self.left.get_indexer(other.left)
- rindexer = self.right.get_indexer(other.right)
-
- match = (lindexer == rindexer) & (lindexer != -1)
- indexer = lindexer.take(match.nonzero()[0])
- indexer = unique(indexer)
-
- return self.take(indexer)
-
- def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex:
- """
- Used when this IntervalIndex has duplicated endpoints,
- on either the left or the right side.
- Return the intersection with another IntervalIndex.
-
- Parameters
- ----------
- other : IntervalIndex
-
- Returns
- -------
- IntervalIndex
- """
- # Note: this is about 3.25x faster than super()._intersection(other)
- # in IntervalIndexMethod.time_intersection_both_duplicate(1000)
- mask = np.zeros(len(self), dtype=bool)
-
- if self.hasnans and other.hasnans:
- first_nan_loc = np.arange(len(self))[self.isna()][0]
- mask[first_nan_loc] = True
-
- other_tups = set(zip(other.left, other.right))
- for i, tup in enumerate(zip(self.left, self.right)):
- if tup in other_tups:
- mask[i] = True
-
- return self[mask]
-
- # --------------------------------------------------------------------
-
- def _get_engine_target(self) -> np.ndarray:
- # Note: we _could_ use libjoin functions by either casting to object
- # dtype or constructing tuples (faster than constructing Intervals)
- # but the libjoin fastpaths are no longer fast in these cases.
- raise NotImplementedError(
- "IntervalIndex does not use libjoin fastpaths or pass values to "
- "IndexEngine objects"
- )
-
- def _from_join_target(self, result):
- raise NotImplementedError("IntervalIndex does not use libjoin fastpaths")
-
- # TODO: arithmetic operations
-
-
-def _is_valid_endpoint(endpoint) -> bool:
- """
- Helper for interval_range to check if start/end are valid types.
- """
- return any(
- [
- is_number(endpoint),
- isinstance(endpoint, Timestamp),
- isinstance(endpoint, Timedelta),
- endpoint is None,
- ]
- )
-
-
-def _is_type_compatible(a, b) -> bool:
- """
- Helper for interval_range to check type compat of start/end/freq.
- """
- is_ts_compat = lambda x: isinstance(x, (Timestamp, BaseOffset))
- is_td_compat = lambda x: isinstance(x, (Timedelta, BaseOffset))
- return (
- (is_number(a) and is_number(b))
- or (is_ts_compat(a) and is_ts_compat(b))
- or (is_td_compat(a) and is_td_compat(b))
- or com.any_none(a, b)
- )
-
-
-def interval_range(
- start=None,
- end=None,
- periods=None,
- freq=None,
- name: Hashable = None,
- closed: IntervalClosedType = "right",
-) -> IntervalIndex:
- """
- Return a fixed frequency IntervalIndex.
-
- Parameters
- ----------
- start : numeric or datetime-like, default None
- Left bound for generating intervals.
- end : numeric or datetime-like, default None
- Right bound for generating intervals.
- periods : int, default None
- Number of periods to generate.
- freq : numeric, str, datetime.timedelta, or DateOffset, default None
- The length of each interval. Must be consistent with the type of start
- and end, e.g. 2 for numeric, or '5H' for datetime-like. Default is 1
- for numeric and 'D' for datetime-like.
- name : str, default None
- Name of the resulting IntervalIndex.
- closed : {'left', 'right', 'both', 'neither'}, default 'right'
- Whether the intervals are closed on the left-side, right-side, both
- or neither.
-
- Returns
- -------
- IntervalIndex
-
- See Also
- --------
- IntervalIndex : An Index of intervals that are all closed on the same side.
-
- Notes
- -----
- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
- exactly three must be specified. If ``freq`` is omitted, the resulting
- ``IntervalIndex`` will have ``periods`` linearly spaced elements between
- ``start`` and ``end``, inclusively.
-
- To learn more about datetime-like frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- Examples
- --------
- Numeric ``start`` and ``end`` are supported.
-
- >>> pd.interval_range(start=0, end=5)
- IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
- dtype='interval[int64, right]')
-
- Additionally, datetime-like input is also supported.
-
- >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
- ... end=pd.Timestamp('2017-01-04'))
- IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03],
- (2017-01-03, 2017-01-04]],
- dtype='interval[datetime64[ns], right]')
-
- The ``freq`` parameter specifies the frequency between the left and right
- endpoints of the individual intervals within the ``IntervalIndex``. For
- numeric ``start`` and ``end``, the frequency must also be numeric.
-
- >>> pd.interval_range(start=0, periods=4, freq=1.5)
- IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
- dtype='interval[float64, right]')
-
- Similarly, for datetime-like ``start`` and ``end``, the frequency must be
- convertible to a DateOffset.
-
- >>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
- ... periods=3, freq='MS')
- IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01],
- (2017-03-01, 2017-04-01]],
- dtype='interval[datetime64[ns], right]')
-
- Specify ``start``, ``end``, and ``periods``; the frequency is generated
- automatically (linearly spaced).
-
- >>> pd.interval_range(start=0, end=6, periods=4)
- IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
- dtype='interval[float64, right]')
-
- The ``closed`` parameter specifies which endpoints of the individual
- intervals within the ``IntervalIndex`` are closed.
-
- >>> pd.interval_range(end=5, periods=4, closed='both')
- IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]],
- dtype='interval[int64, both]')
- """
- start = maybe_box_datetimelike(start)
- end = maybe_box_datetimelike(end)
- endpoint = start if start is not None else end
-
- if freq is None and com.any_none(periods, start, end):
- freq = 1 if is_number(endpoint) else "D"
-
- if com.count_not_none(start, end, periods, freq) != 3:
- raise ValueError(
- "Of the four parameters: start, end, periods, and "
- "freq, exactly three must be specified"
- )
-
- if not _is_valid_endpoint(start):
- raise ValueError(f"start must be numeric or datetime-like, got {start}")
- if not _is_valid_endpoint(end):
- raise ValueError(f"end must be numeric or datetime-like, got {end}")
-
- if is_float(periods):
- periods = int(periods)
- elif not is_integer(periods) and periods is not None:
- raise TypeError(f"periods must be a number, got {periods}")
-
- if freq is not None and not is_number(freq):
- try:
- freq = to_offset(freq)
- except ValueError as err:
- raise ValueError(
- f"freq must be numeric or convertible to DateOffset, got {freq}"
- ) from err
-
- # verify type compatibility
- if not all(
- [
- _is_type_compatible(start, end),
- _is_type_compatible(start, freq),
- _is_type_compatible(end, freq),
- ]
- ):
- raise TypeError("start, end, freq need to be type compatible")
-
- # +1 to convert interval count to breaks count (n breaks = n-1 intervals)
- if periods is not None:
- periods += 1
-
- breaks: np.ndarray | TimedeltaIndex | DatetimeIndex
-
- if is_number(endpoint):
- # force consistency between start/end/freq (lower end if freq skips it)
- if com.all_not_none(start, end, freq):
- end -= (end - start) % freq
-
- # compute the period/start/end if unspecified (at most one)
- if periods is None:
- periods = int((end - start) // freq) + 1
- elif start is None:
- start = end - (periods - 1) * freq
- elif end is None:
- end = start + (periods - 1) * freq
-
- breaks = np.linspace(start, end, periods)
- if all(is_integer(x) for x in com.not_none(start, end, freq)):
- # np.linspace always produces float output
-
- # error: Argument 1 to "maybe_downcast_numeric" has incompatible type
- # "Union[ndarray[Any, Any], TimedeltaIndex, DatetimeIndex]";
- # expected "ndarray[Any, Any]" [
- breaks = maybe_downcast_numeric(
- breaks, # type: ignore[arg-type]
- np.dtype("int64"),
- )
- else:
- # delegate to the appropriate range function
- if isinstance(endpoint, Timestamp):
- breaks = date_range(start=start, end=end, periods=periods, freq=freq)
- else:
- breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq)
-
- return IntervalIndex.from_breaks(breaks, name=name, closed=closed)
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/multi.py b/contrib/python/pandas/py3/pandas/core/indexes/multi.py
deleted file mode 100644
index 2054cdae989..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/multi.py
+++ /dev/null
@@ -1,3918 +0,0 @@
-from __future__ import annotations
-
-from functools import wraps
-from sys import getsizeof
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Collection,
- Generator,
- Hashable,
- Iterable,
- List,
- Literal,
- Sequence,
- Tuple,
- cast,
-)
-import warnings
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import (
- algos as libalgos,
- index as libindex,
- lib,
-)
-from pandas._libs.hashtable import duplicated
-from pandas._typing import (
- AnyAll,
- AnyArrayLike,
- Axis,
- DropKeep,
- DtypeObj,
- F,
- IgnoreRaise,
- IndexLabel,
- Scalar,
- Shape,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import (
- InvalidIndexError,
- PerformanceWarning,
- UnsortedIndexError,
-)
-from pandas.util._decorators import (
- Appender,
- cache_readonly,
- doc,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.cast import coerce_indexer_dtype
-from pandas.core.dtypes.common import (
- ensure_int64,
- ensure_platform_int,
- is_categorical_dtype,
- is_extension_array_dtype,
- is_hashable,
- is_integer,
- is_iterator,
- is_list_like,
- is_object_dtype,
- is_scalar,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCDatetimeIndex,
- ABCTimedeltaIndex,
-)
-from pandas.core.dtypes.missing import (
- array_equivalent,
- isna,
-)
-
-import pandas.core.algorithms as algos
-from pandas.core.array_algos.putmask import validate_putmask
-from pandas.core.arrays import Categorical
-from pandas.core.arrays.categorical import factorize_from_iterables
-import pandas.core.common as com
-import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import (
- Index,
- _index_shared_docs,
- ensure_index,
- get_unanimous_names,
-)
-from pandas.core.indexes.frozen import FrozenList
-from pandas.core.ops.invalid import make_invalid_op
-from pandas.core.sorting import (
- get_group_index,
- indexer_from_factorized,
- lexsort_indexer,
-)
-
-from pandas.io.formats.printing import pprint_thing
-
-if TYPE_CHECKING:
- from pandas import (
- CategoricalIndex,
- DataFrame,
- Series,
- )
-
-_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update(
- {"klass": "MultiIndex", "target_klass": "MultiIndex or list of tuples"}
-)
-
-
-class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
- """
- This class manages a MultiIndex by mapping label combinations to positive
- integers.
- """
-
- _base = libindex.UInt64Engine
-
- def _codes_to_ints(self, codes):
- """
- Transform combination(s) of uint64 into one uint64 (each), in a strictly
- monotonic way (i.e. respecting the lexicographic order of integer
- combinations): see BaseMultiIndexCodesEngine documentation.
-
- Parameters
- ----------
- codes : 1- or 2-dimensional array of dtype uint64
- Combinations of integers (one per row)
-
- Returns
- -------
- scalar or 1-dimensional array, of dtype uint64
- Integer(s) representing one combination (each).
- """
- # Shift the representation of each level by the pre-calculated number
- # of bits:
- codes <<= self.offsets
-
- # Now sum and OR are in fact interchangeable. This is a simple
- # composition of the (disjunct) significant bits of each level (i.e.
- # each column in "codes") in a single positive integer:
- if codes.ndim == 1:
- # Single key
- return np.bitwise_or.reduce(codes)
-
- # Multiple keys
- return np.bitwise_or.reduce(codes, axis=1)
-
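-# A worked example (hypothetical offsets, for illustration only): with two
-# levels and offsets ``[3, 0]``, codes ``[1, 2]`` shift to
-# ``[1 << 3, 2 << 0] == [8, 2]`` and OR together to ``10``, preserving the
-# lexicographic order of the original code combinations.
-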
-
-class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
- """
- This class manages those (extreme) cases in which the number of possible
- label combinations overflows 64-bit integers, and uses an ObjectEngine
- containing Python integers.
- """
-
- _base = libindex.ObjectEngine
-
- def _codes_to_ints(self, codes):
- """
- Transform combination(s) of uint64 into one Python integer (each), in a
- strictly monotonic way (i.e. respecting the lexicographic order of
- integer combinations): see BaseMultiIndexCodesEngine documentation.
-
- Parameters
- ----------
- codes : 1- or 2-dimensional array of dtype uint64
- Combinations of integers (one per row)
-
- Returns
- -------
- int, or 1-dimensional array of dtype object
- Integer(s) representing one combination (each).
- """
- # Shift the representation of each level by the pre-calculated number
- # of bits. Since this can overflow uint64, first make sure we are
- # working with Python integers:
- codes = codes.astype("object") << self.offsets
-
- # Now sum and OR are in fact interchangeable. This is a simple
- # composition of the (disjunct) significant bits of each level (i.e.
- # each column in "codes") in a single positive integer (per row):
- if codes.ndim == 1:
- # Single key
- return np.bitwise_or.reduce(codes)
-
- # Multiple keys
- return np.bitwise_or.reduce(codes, axis=1)
-
-
-def names_compat(meth: F) -> F:
- """
- A decorator to allow either `name` or `names` keyword but not both.
-
- This makes it easier to share code with base class.
- """
-
- @wraps(meth)
- def new_meth(self_or_cls, *args, **kwargs):
- if "name" in kwargs and "names" in kwargs:
- raise TypeError("Can only provide one of `names` and `name`")
- if "name" in kwargs:
- kwargs["names"] = kwargs.pop("name")
-
- return meth(self_or_cls, *args, **kwargs)
-
- return cast(F, new_meth)
-
-
-class MultiIndex(Index):
- """
- A multi-level, or hierarchical, index object for pandas objects.
-
- Parameters
- ----------
- levels : sequence of arrays
- The unique labels for each level.
- codes : sequence of arrays
- Integers for each level designating which label at each location.
- sortorder : optional int
- Level of sortedness (must be lexicographically sorted by that
- level).
- names : optional sequence of objects
- Names for each of the index levels. (name is accepted for compat).
- copy : bool, default False
- Copy the meta-data.
- verify_integrity : bool, default True
- Check that the levels/codes are consistent and valid.
-
- Attributes
- ----------
- names
- levels
- codes
- nlevels
- levshape
- dtypes
-
- Methods
- -------
- from_arrays
- from_tuples
- from_product
- from_frame
- set_levels
- set_codes
- to_frame
- to_flat_index
- sortlevel
- droplevel
- swaplevel
- reorder_levels
- remove_unused_levels
- get_level_values
- get_indexer
- get_loc
- get_locs
- get_loc_level
- drop
-
- See Also
- --------
- MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
- MultiIndex.from_product : Create a MultiIndex from the cartesian product
- of iterables.
- MultiIndex.from_tuples : Convert list of tuples to a MultiIndex.
- MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
- Index : The base pandas Index type.
-
- Notes
- -----
- See the `user guide
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html>`__
- for more.
-
- Examples
- --------
- A new ``MultiIndex`` is typically constructed using one of the helper
- methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product`
- and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``):
-
- >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
- >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
- MultiIndex([(1, 'red'),
- (1, 'blue'),
- (2, 'red'),
- (2, 'blue')],
- names=['number', 'color'])
-
- See further examples for how to construct a MultiIndex in the doc strings
- of the mentioned helper methods.
- """
-
- _hidden_attrs = Index._hidden_attrs | frozenset()
-
- # initialize to zero-length tuples to make everything work
- _typ = "multiindex"
- _names: list[Hashable | None] = []
- _levels = FrozenList()
- _codes = FrozenList()
- _comparables = ["names"]
-
- sortorder: int | None
-
- # --------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls,
- levels=None,
- codes=None,
- sortorder=None,
- names=None,
- dtype=None,
- copy: bool = False,
- name=None,
- verify_integrity: bool = True,
- ) -> MultiIndex:
- # compat with Index
- if name is not None:
- names = name
- if levels is None or codes is None:
- raise TypeError("Must pass both levels and codes")
- if len(levels) != len(codes):
- raise ValueError("Length of levels and codes must be the same.")
- if len(levels) == 0:
- raise ValueError("Must pass non-zero number of levels/codes")
-
- result = object.__new__(cls)
- result._cache = {}
-
- # we've already validated levels and codes, so shortcut here
- result._set_levels(levels, copy=copy, validate=False)
- result._set_codes(codes, copy=copy, validate=False)
-
- result._names = [None] * len(levels)
- if names is not None:
- # handles name validation
- result._set_names(names)
-
- if sortorder is not None:
- result.sortorder = int(sortorder)
- else:
- result.sortorder = sortorder
-
- if verify_integrity:
- new_codes = result._verify_integrity()
- result._codes = new_codes
-
- result._reset_identity()
- result._references = None
-
- return result
-
- def _validate_codes(self, level: list, code: list):
- """
- Reassign code values as -1 if their corresponding levels are NaN.
-
- Parameters
- ----------
- code : list
- Code to reassign.
- level : list
- Level to check for missing values (NaN, NaT, None).
-
- Returns
- -------
- new code where code value = -1 if it corresponds
- to a level with missing values (NaN, NaT, None).
- """
- null_mask = isna(level)
- if np.any(null_mask):
- # error: Incompatible types in assignment
- # (expression has type "ndarray[Any, dtype[Any]]",
- # variable has type "List[Any]")
- code = np.where(null_mask[code], -1, code) # type: ignore[assignment]
- return code
-
- def _verify_integrity(self, codes: list | None = None, levels: list | None = None):
- """
- Parameters
- ----------
- codes : optional list
- Codes to check for validity. Defaults to current codes.
- levels : optional list
- Levels to check for validity. Defaults to current levels.
-
- Raises
- ------
- ValueError
- If length of levels and codes don't match, if the codes for any
- level would exceed level bounds, or there are any duplicate levels.
-
- Returns
- -------
- new codes where code value = -1 if it corresponds to a
- NaN level.
- """
- # NOTE: Currently does not check, among other things, that cached
- # nlevels matches nor that sortorder matches the actual sort order.
- codes = codes or self.codes
- levels = levels or self.levels
-
- if len(levels) != len(codes):
- raise ValueError(
- "Length of levels and codes must match. NOTE: "
- "this index is in an inconsistent state."
- )
- codes_length = len(codes[0])
- for i, (level, level_codes) in enumerate(zip(levels, codes)):
- if len(level_codes) != codes_length:
- raise ValueError(
- f"Unequal code lengths: {[len(code_) for code_ in codes]}"
- )
- if len(level_codes) and level_codes.max() >= len(level):
- raise ValueError(
- f"On level {i}, code max ({level_codes.max()}) >= length of "
- f"level ({len(level)}). NOTE: this index is in an "
- "inconsistent state"
- )
- if len(level_codes) and level_codes.min() < -1:
- raise ValueError(f"On level {i}, code value ({level_codes.min()}) < -1")
- if not level.is_unique:
- raise ValueError(
- f"Level values must be unique: {list(level)} on level {i}"
- )
- if self.sortorder is not None:
- if self.sortorder > _lexsort_depth(self.codes, self.nlevels):
- raise ValueError(
- "Value for sortorder must be inferior or equal to actual "
- f"lexsort_depth: sortorder {self.sortorder} "
- f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}"
- )
-
- codes = [
- self._validate_codes(level, code) for level, code in zip(levels, codes)
- ]
- new_codes = FrozenList(codes)
- return new_codes
-
- @classmethod
- def from_arrays(
- cls,
- arrays,
- sortorder=None,
- names: Sequence[Hashable] | Hashable | lib.NoDefault = lib.no_default,
- ) -> MultiIndex:
- """
- Convert arrays to MultiIndex.
-
- Parameters
- ----------
- arrays : list / sequence of array-likes
- Each array-like gives one level's value for each data point.
- len(arrays) is the number of levels.
- sortorder : int or None
- Level of sortedness (must be lexicographically sorted by that
- level).
- names : list / sequence of str, optional
- Names for the levels in the index.
-
- Returns
- -------
- MultiIndex
-
- See Also
- --------
- MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
- MultiIndex.from_product : Make a MultiIndex from cartesian product
- of iterables.
- MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
-
- Examples
- --------
- >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
- >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
- MultiIndex([(1, 'red'),
- (1, 'blue'),
- (2, 'red'),
- (2, 'blue')],
- names=['number', 'color'])
- """
- error_msg = "Input must be a list / sequence of array-likes."
- if not is_list_like(arrays):
- raise TypeError(error_msg)
- if is_iterator(arrays):
- arrays = list(arrays)
-
- # Check if elements of array are list-like
- for array in arrays:
- if not is_list_like(array):
- raise TypeError(error_msg)
-
- # Check if lengths of all arrays are equal or not,
- # raise ValueError, if not
- for i in range(1, len(arrays)):
- if len(arrays[i]) != len(arrays[i - 1]):
- raise ValueError("all arrays must be same length")
-
- codes, levels = factorize_from_iterables(arrays)
- if names is lib.no_default:
- names = [getattr(arr, "name", None) for arr in arrays]
-
- return cls(
- levels=levels,
- codes=codes,
- sortorder=sortorder,
- names=names,
- verify_integrity=False,
- )
-
- @classmethod
- @names_compat
- def from_tuples(
- cls,
- tuples: Iterable[tuple[Hashable, ...]],
- sortorder: int | None = None,
- names: Sequence[Hashable] | Hashable = None,
- ) -> MultiIndex:
- """
- Convert list of tuples to MultiIndex.
-
- Parameters
- ----------
- tuples : list / sequence of tuple-likes
- Each tuple is the index of one row/column.
- sortorder : int or None
- Level of sortedness (must be lexicographically sorted by that
- level).
- names : list / sequence of str, optional
- Names for the levels in the index.
-
- Returns
- -------
- MultiIndex
-
- See Also
- --------
- MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
- MultiIndex.from_product : Make a MultiIndex from cartesian product
- of iterables.
- MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
-
- Examples
- --------
- >>> tuples = [(1, 'red'), (1, 'blue'),
- ... (2, 'red'), (2, 'blue')]
- >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color'))
- MultiIndex([(1, 'red'),
- (1, 'blue'),
- (2, 'red'),
- (2, 'blue')],
- names=['number', 'color'])
- """
- if not is_list_like(tuples):
- raise TypeError("Input must be a list / sequence of tuple-likes.")
- if is_iterator(tuples):
- tuples = list(tuples)
- tuples = cast(Collection[Tuple[Hashable, ...]], tuples)
-
- # handling the empty tuple cases
- if len(tuples) and all(isinstance(e, tuple) and not e for e in tuples):
- codes = [np.zeros(len(tuples))]
- levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))]
- return cls(
- levels=levels,
- codes=codes,
- sortorder=sortorder,
- names=names,
- verify_integrity=False,
- )
-
- arrays: list[Sequence[Hashable]]
- if len(tuples) == 0:
- if names is None:
- raise TypeError("Cannot infer number of levels from empty list")
- # error: Argument 1 to "len" has incompatible type "Hashable";
- # expected "Sized"
- arrays = [[]] * len(names) # type: ignore[arg-type]
- elif isinstance(tuples, (np.ndarray, Index)):
- if isinstance(tuples, Index):
- tuples = np.asarray(tuples._values)
-
- arrays = list(lib.tuples_to_object_array(tuples).T)
- elif isinstance(tuples, list):
- arrays = list(lib.to_object_array_tuples(tuples).T)
- else:
- arrs = zip(*tuples)
- arrays = cast(List[Sequence[Hashable]], arrs)
-
- return cls.from_arrays(arrays, sortorder=sortorder, names=names)
-
- @classmethod
- def from_product(
- cls,
- iterables: Sequence[Iterable[Hashable]],
- sortorder: int | None = None,
- names: Sequence[Hashable] | Hashable | lib.NoDefault = lib.no_default,
- ) -> MultiIndex:
- """
- Make a MultiIndex from the cartesian product of multiple iterables.
-
- Parameters
- ----------
- iterables : list / sequence of iterables
- Each iterable has unique labels for each level of the index.
- sortorder : int or None
- Level of sortedness (must be lexicographically sorted by that
- level).
- names : list / sequence of str, optional
- Names for the levels in the index.
- If not explicitly provided, names will be inferred from the
- elements of iterables if an element has a name attribute.
-
- Returns
- -------
- MultiIndex
-
- See Also
- --------
- MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
- MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
- MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
-
- Examples
- --------
- >>> numbers = [0, 1, 2]
- >>> colors = ['green', 'purple']
- >>> pd.MultiIndex.from_product([numbers, colors],
- ... names=['number', 'color'])
- MultiIndex([(0, 'green'),
- (0, 'purple'),
- (1, 'green'),
- (1, 'purple'),
- (2, 'green'),
- (2, 'purple')],
- names=['number', 'color'])
- """
- from pandas.core.reshape.util import cartesian_product
-
- if not is_list_like(iterables):
- raise TypeError("Input must be a list / sequence of iterables.")
- if is_iterator(iterables):
- iterables = list(iterables)
-
- codes, levels = factorize_from_iterables(iterables)
- if names is lib.no_default:
- names = [getattr(it, "name", None) for it in iterables]
-
- # codes are all ndarrays, so cartesian_product is lossless
- codes = cartesian_product(codes)
- return cls(levels, codes, sortorder=sortorder, names=names)
-
- @classmethod
- def from_frame(cls, df: DataFrame, sortorder=None, names=None) -> MultiIndex:
- """
- Make a MultiIndex from a DataFrame.
-
- Parameters
- ----------
- df : DataFrame
- DataFrame to be converted to MultiIndex.
- sortorder : int, optional
- Level of sortedness (must be lexicographically sorted by that
- level).
- names : list-like, optional
- If no names are provided, use the column names, or tuple of column
- names if the columns is a MultiIndex. If a sequence, overwrite
- names with the given sequence.
-
- Returns
- -------
- MultiIndex
- The MultiIndex representation of the given DataFrame.
-
- See Also
- --------
- MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
- MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
- MultiIndex.from_product : Make a MultiIndex from cartesian product
- of iterables.
-
- Examples
- --------
- >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
- ... ['NJ', 'Temp'], ['NJ', 'Precip']],
- ... columns=['a', 'b'])
- >>> df
- a b
- 0 HI Temp
- 1 HI Precip
- 2 NJ Temp
- 3 NJ Precip
-
- >>> pd.MultiIndex.from_frame(df)
- MultiIndex([('HI', 'Temp'),
- ('HI', 'Precip'),
- ('NJ', 'Temp'),
- ('NJ', 'Precip')],
- names=['a', 'b'])
-
- Using explicit names, instead of the column names
-
- >>> pd.MultiIndex.from_frame(df, names=['state', 'observation'])
- MultiIndex([('HI', 'Temp'),
- ('HI', 'Precip'),
- ('NJ', 'Temp'),
- ('NJ', 'Precip')],
- names=['state', 'observation'])
- """
- if not isinstance(df, ABCDataFrame):
- raise TypeError("Input must be a DataFrame")
-
- column_names, columns = zip(*df.items())
- names = column_names if names is None else names
- return cls.from_arrays(columns, sortorder=sortorder, names=names)
-
- # --------------------------------------------------------------------
-
- @cache_readonly
- def _values(self) -> np.ndarray:
- # We override here, since our parent uses _data, which we don't use.
- values = []
-
- for i in range(self.nlevels):
- index = self.levels[i]
- codes = self.codes[i]
-
- vals = index
- if is_categorical_dtype(vals.dtype):
- vals = cast("CategoricalIndex", vals)
- vals = vals._data._internal_get_values()
-
- if isinstance(vals.dtype, ExtensionDtype) or isinstance(
- vals, (ABCDatetimeIndex, ABCTimedeltaIndex)
- ):
- vals = vals.astype(object)
-
- vals = np.array(vals, copy=False)
- vals = algos.take_nd(vals, codes, fill_value=index._na_value)
- values.append(vals)
-
- arr = lib.fast_zip(values)
- return arr
-
- @property
- def values(self) -> np.ndarray:
- return self._values
-
- @property
- def array(self):
- """
- Raises a ValueError for `MultiIndex` because there's no single
- array backing a MultiIndex.
-
- Raises
- ------
- ValueError
- """
- raise ValueError(
- "MultiIndex has no single backing array. Use "
- "'MultiIndex.to_numpy()' to get a NumPy array of tuples."
- )
-
- @cache_readonly
- def dtypes(self) -> Series:
- """
- Return the dtypes as a Series for the underlying MultiIndex.
- """
- from pandas import Series
-
- names = com.fill_missing_names([level.name for level in self.levels])
- return Series([level.dtype for level in self.levels], index=Index(names))
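-
- # A minimal usage sketch (level names here are arbitrary):
- #   mi = pd.MultiIndex.from_arrays([[1, 2], ['a', 'b']], names=['x', 'y'])
- #   mi.dtypes   # Series mapping 'x' -> int64 and 'y' -> object,
- #               # indexed by the (filled) level names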
-
- def __len__(self) -> int:
- return len(self.codes[0])
-
- @property
- def size(self) -> int:
- """
- Return the number of elements in the underlying data.
- """
- # override Index.size to avoid materializing _values
- return len(self)
-
- # --------------------------------------------------------------------
- # Levels Methods
-
- @cache_readonly
- def levels(self) -> FrozenList:
- # Use cache_readonly to ensure that self.get_locs doesn't repeatedly
- # create new IndexEngine
- # https://github.com/pandas-dev/pandas/issues/31648
- result = [x._rename(name=name) for x, name in zip(self._levels, self._names)]
- for level in result:
- # disallow midx.levels[0].name = "foo"
- level._no_setting_name = True
- return FrozenList(result)
-
- def _set_levels(
- self,
- levels,
- *,
- level=None,
- copy: bool = False,
- validate: bool = True,
- verify_integrity: bool = False,
- ) -> None:
- # This is NOT part of the levels property because setting levels
- # should not be allowed externally. User beware if you change
- # _levels directly
- if validate:
- if len(levels) == 0:
- raise ValueError("Must set non-zero number of levels.")
- if level is None and len(levels) != self.nlevels:
- raise ValueError("Length of levels must match number of levels.")
- if level is not None and len(levels) != len(level):
- raise ValueError("Length of levels must match length of level.")
-
- if level is None:
- new_levels = FrozenList(
- ensure_index(lev, copy=copy)._view() for lev in levels
- )
- else:
- level_numbers = [self._get_level_number(lev) for lev in level]
- new_levels_list = list(self._levels)
- for lev_num, lev in zip(level_numbers, levels):
- new_levels_list[lev_num] = ensure_index(lev, copy=copy)._view()
- new_levels = FrozenList(new_levels_list)
-
- if verify_integrity:
- new_codes = self._verify_integrity(levels=new_levels)
- self._codes = new_codes
-
- names = self.names
- self._levels = new_levels
- if any(names):
- self._set_names(names)
-
- self._reset_cache()
-
- def set_levels(
- self, levels, *, level=None, verify_integrity: bool = True
- ) -> MultiIndex:
- """
- Set new levels on MultiIndex. Defaults to returning new index.
-
- Parameters
- ----------
- levels : sequence or list of sequence
- New level(s) to apply.
- level : int, level name, or sequence of int/level names (default None)
- Level(s) to set (None for all levels).
- verify_integrity : bool, default True
- If True, checks that levels and codes are compatible.
-
- Returns
- -------
- MultiIndex
-
- Examples
- --------
- >>> idx = pd.MultiIndex.from_tuples(
- ... [
- ... (1, "one"),
- ... (1, "two"),
- ... (2, "one"),
- ... (2, "two"),
- ... (3, "one"),
- ... (3, "two")
- ... ],
- ... names=["foo", "bar"]
- ... )
- >>> idx
- MultiIndex([(1, 'one'),
- (1, 'two'),
- (2, 'one'),
- (2, 'two'),
- (3, 'one'),
- (3, 'two')],
- names=['foo', 'bar'])
-
- >>> idx.set_levels([['a', 'b', 'c'], [1, 2]])
- MultiIndex([('a', 1),
- ('a', 2),
- ('b', 1),
- ('b', 2),
- ('c', 1),
- ('c', 2)],
- names=['foo', 'bar'])
- >>> idx.set_levels(['a', 'b', 'c'], level=0)
- MultiIndex([('a', 'one'),
- ('a', 'two'),
- ('b', 'one'),
- ('b', 'two'),
- ('c', 'one'),
- ('c', 'two')],
- names=['foo', 'bar'])
- >>> idx.set_levels(['a', 'b'], level='bar')
- MultiIndex([(1, 'a'),
- (1, 'b'),
- (2, 'a'),
- (2, 'b'),
- (3, 'a'),
- (3, 'b')],
- names=['foo', 'bar'])
-
- If any of the levels passed to ``set_levels()`` exceeds the
- existing length, all of the values from that argument will
- be stored in the MultiIndex levels, though the values will
- be truncated in the MultiIndex output.
-
- >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1])
- MultiIndex([('a', 1),
- ('a', 2),
- ('b', 1),
- ('b', 2),
- ('c', 1),
- ('c', 2)],
- names=['foo', 'bar'])
- >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels
- FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]])
- """
-
- if is_list_like(levels) and not isinstance(levels, Index):
- levels = list(levels)
-
- level, levels = _require_listlike(level, levels, "Levels")
- idx = self._view()
- idx._reset_identity()
- idx._set_levels(
- levels, level=level, validate=True, verify_integrity=verify_integrity
- )
- return idx
-
- @property
- def nlevels(self) -> int:
- """
- Integer number of levels in this MultiIndex.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']])
- >>> mi
- MultiIndex([('a', 'b', 'c')],
- )
- >>> mi.nlevels
- 3
- """
- return len(self._levels)
-
- @property
- def levshape(self) -> Shape:
- """
- A tuple with the length of each level.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']])
- >>> mi
- MultiIndex([('a', 'b', 'c')],
- )
- >>> mi.levshape
- (1, 1, 1)
- """
- return tuple(len(x) for x in self.levels)
-
- # --------------------------------------------------------------------
- # Codes Methods
-
- @property
- def codes(self):
- return self._codes
-
- def _set_codes(
- self,
- codes,
- *,
- level=None,
- copy: bool = False,
- validate: bool = True,
- verify_integrity: bool = False,
- ) -> None:
- if validate:
- if level is None and len(codes) != self.nlevels:
- raise ValueError("Length of codes must match number of levels")
- if level is not None and len(codes) != len(level):
- raise ValueError("Length of codes must match length of levels.")
-
- if level is None:
- new_codes = FrozenList(
- _coerce_indexer_frozen(level_codes, lev, copy=copy).view()
- for lev, level_codes in zip(self._levels, codes)
- )
- else:
- level_numbers = [self._get_level_number(lev) for lev in level]
- new_codes_list = list(self._codes)
- for lev_num, level_codes in zip(level_numbers, codes):
- lev = self.levels[lev_num]
- new_codes_list[lev_num] = _coerce_indexer_frozen(
- level_codes, lev, copy=copy
- )
- new_codes = FrozenList(new_codes_list)
-
- if verify_integrity:
- new_codes = self._verify_integrity(codes=new_codes)
-
- self._codes = new_codes
-
- self._reset_cache()
-
- def set_codes(self, codes, *, level=None, verify_integrity: bool = True):
- """
- Set new codes on MultiIndex. Defaults to returning new index.
-
- Parameters
- ----------
- codes : sequence or list of sequence
- New codes to apply.
- level : int, level name, or sequence of int/level names (default None)
- Level(s) to set (None for all levels).
- verify_integrity : bool, default True
- If True, checks that levels and codes are compatible.
-
- Returns
- -------
- MultiIndex
-
- Examples
- --------
- >>> idx = pd.MultiIndex.from_tuples(
- ... [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"]
- ... )
- >>> idx
- MultiIndex([(1, 'one'),
- (1, 'two'),
- (2, 'one'),
- (2, 'two')],
- names=['foo', 'bar'])
-
- >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]])
- MultiIndex([(2, 'one'),
- (1, 'one'),
- (2, 'two'),
- (1, 'two')],
- names=['foo', 'bar'])
- >>> idx.set_codes([1, 0, 1, 0], level=0)
- MultiIndex([(2, 'one'),
- (1, 'two'),
- (2, 'one'),
- (1, 'two')],
- names=['foo', 'bar'])
- >>> idx.set_codes([0, 0, 1, 1], level='bar')
- MultiIndex([(1, 'one'),
- (1, 'one'),
- (2, 'two'),
- (2, 'two')],
- names=['foo', 'bar'])
- >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1])
- MultiIndex([(2, 'one'),
- (1, 'one'),
- (2, 'two'),
- (1, 'two')],
- names=['foo', 'bar'])
- """
-
- level, codes = _require_listlike(level, codes, "Codes")
- idx = self._view()
- idx._reset_identity()
- idx._set_codes(codes, level=level, verify_integrity=verify_integrity)
- return idx
-
- # --------------------------------------------------------------------
- # Index Internals
-
- @cache_readonly
- def _engine(self):
- # Calculate the number of bits needed to represent labels in each
- # level, as log2 of their sizes:
- # NaN values are shifted to 1, and values missing from the other index
- # while calculating an indexer are shifted to 0
- sizes = np.ceil(
- np.log2(
- [
- len(level)
- + libindex.multiindex_nulls_shift # type: ignore[attr-defined]
- for level in self.levels
- ]
- )
- )
-
- # Sum bit counts, starting from the _right_....
- lev_bits = np.cumsum(sizes[::-1])[::-1]
-
- # ... in order to obtain offsets such that sorting the combination of
- # shifted codes (one for each level, resulting in a unique integer) is
- # equivalent to sorting lexicographically the codes themselves. Notice
- # that each level needs to be shifted by the number of bits needed to
- # represent the _previous_ ones:
- offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64")
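-
- # Worked sketch (assuming a nulls shift of 2): for two levels of lengths 3
- # and 5, sizes = ceil(log2([3 + 2, 5 + 2])) = [3, 3];
- # lev_bits = cumsum([3, 3][::-1])[::-1] = [6, 3]; offsets = [3, 0].
- # Level-0 codes are shifted left by 3 bits, level-1 codes by 0, and
- # lev_bits[0] = 6 <= 64, so the uint64 engine is chosen below.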
-
- # Check the total number of bits needed for our representation:
- if lev_bits[0] > 64:
- # The levels would overflow a 64 bit uint - use Python integers:
- return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
- return MultiIndexUIntEngine(self.levels, self.codes, offsets)
-
- # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return
- # type "Type[MultiIndex]" in supertype "Index"
- @property
- def _constructor(self) -> Callable[..., MultiIndex]: # type: ignore[override]
- return type(self).from_tuples
-
- @doc(Index._shallow_copy)
- def _shallow_copy(self, values: np.ndarray, name=lib.no_default) -> MultiIndex:
- names = name if name is not lib.no_default else self.names
-
- return type(self).from_tuples(values, sortorder=None, names=names)
-
- def _view(self) -> MultiIndex:
- result = type(self)(
- levels=self.levels,
- codes=self.codes,
- sortorder=self.sortorder,
- names=self.names,
- verify_integrity=False,
- )
- result._cache = self._cache.copy()
- result._cache.pop("levels", None) # GH32669
- return result
-
- # --------------------------------------------------------------------
-
- # error: Signature of "copy" incompatible with supertype "Index"
- def copy( # type: ignore[override]
- self,
- names=None,
- deep: bool = False,
- name=None,
- ):
- """
- Make a copy of this object.
-
- Names, dtype, levels and codes can be passed and will be set on new copy.
-
- Parameters
- ----------
- names : sequence, optional
- deep : bool, default False
- name : Label
- Kept for compatibility with 1-dimensional Index. Should not be used.
-
- Returns
- -------
- MultiIndex
-
- Notes
- -----
- In most cases, there is no functional difference between ``deep=False``
- and ``deep=True``. If ``deep=True``, the levels and codes are deep-copied,
- which can be expensive on large MultiIndex objects.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']])
- >>> mi
- MultiIndex([('a', 'b', 'c')],
- )
- >>> mi.copy()
- MultiIndex([('a', 'b', 'c')],
- )
- """
- names = self._validate_names(name=name, names=names, deep=deep)
- keep_id = not deep
- levels, codes = None, None
-
- if deep:
- from copy import deepcopy
-
- levels = deepcopy(self.levels)
- codes = deepcopy(self.codes)
-
- levels = levels if levels is not None else self.levels
- codes = codes if codes is not None else self.codes
-
- new_index = type(self)(
- levels=levels,
- codes=codes,
- sortorder=self.sortorder,
- names=names,
- verify_integrity=False,
- )
- new_index._cache = self._cache.copy()
- new_index._cache.pop("levels", None) # GH32669
- if keep_id:
- new_index._id = self._id
- return new_index
-
- def __array__(self, dtype=None) -> np.ndarray:
- """the array interface, return my values"""
- return self.values
-
- def view(self, cls=None):
- """this is defined as a copy with the same identity"""
- result = self.copy()
- result._id = self._id
- return result
-
- @doc(Index.__contains__)
- def __contains__(self, key: Any) -> bool:
- hash(key)
- try:
- self.get_loc(key)
- return True
- except (LookupError, TypeError, ValueError):
- return False
-
- @cache_readonly
- def dtype(self) -> np.dtype:
- return np.dtype("O")
-
- def _is_memory_usage_qualified(self) -> bool:
- """return a boolean if we need a qualified .info display"""
-
- def f(level) -> bool:
- return "mixed" in level or "string" in level or "unicode" in level
-
- return any(f(level) for level in self._inferred_type_levels)
-
- # Cannot determine type of "memory_usage"
- @doc(Index.memory_usage) # type: ignore[has-type]
- def memory_usage(self, deep: bool = False) -> int:
- # we are overwriting our base class to avoid
- # computing .values here which could materialize
- # a tuple representation unnecessarily
- return self._nbytes(deep)
-
- @cache_readonly
- def nbytes(self) -> int:
- """return the number of bytes in the underlying data"""
- return self._nbytes(False)
-
- def _nbytes(self, deep: bool = False) -> int:
- """
- return the number of bytes in the underlying data
- deeply introspect the level data if deep=True
-
- include the engine hashtable
-
- *this is an internal routine*
-
- """
- # for implementations with no useful getsizeof (PyPy)
- objsize = 24
-
- level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels)
- label_nbytes = sum(i.nbytes for i in self.codes)
- names_nbytes = sum(getsizeof(i, objsize) for i in self.names)
- result = level_nbytes + label_nbytes + names_nbytes
-
- # include our engine hashtable
- result += self._engine.sizeof(deep=deep)
- return result
-
- # --------------------------------------------------------------------
- # Rendering Methods
-
- def _formatter_func(self, tup):
- """
- Formats each item in tup according to its level's formatter function.
- """
- formatter_funcs = [level._formatter_func for level in self.levels]
- return tuple(func(val) for func, val in zip(formatter_funcs, tup))
-
- def _format_native_types(
- self, *, na_rep: str = "nan", **kwargs
- ) -> npt.NDArray[np.object_]:
- new_levels = []
- new_codes = []
-
- # go through the levels and format them
- for level, level_codes in zip(self.levels, self.codes):
- level_strs = level._format_native_types(na_rep=na_rep, **kwargs)
- # add nan values, if there are any
- mask = level_codes == -1
- if mask.any():
- nan_index = len(level_strs)
- # numpy 1.21 deprecated implicit string casting
- level_strs = level_strs.astype(str)
- level_strs = np.append(level_strs, na_rep)
- assert not level_codes.flags.writeable # i.e. copy is needed
- level_codes = level_codes.copy() # make writeable
- level_codes[mask] = nan_index
- new_levels.append(level_strs)
- new_codes.append(level_codes)
-
- if len(new_levels) == 1:
- # a single-level multi-index
- return Index(new_levels[0].take(new_codes[0]))._format_native_types()
- else:
- # reconstruct the multi-index
- mi = MultiIndex(
- levels=new_levels,
- codes=new_codes,
- names=self.names,
- sortorder=self.sortorder,
- verify_integrity=False,
- )
- return mi._values
-
- def format(
- self,
- name: bool | None = None,
- formatter: Callable | None = None,
- na_rep: str | None = None,
- names: bool = False,
- space: int = 2,
- sparsify=None,
- adjoin: bool = True,
- ) -> list:
- if name is not None:
- names = name
-
- if len(self) == 0:
- return []
-
- stringified_levels = []
- for lev, level_codes in zip(self.levels, self.codes):
- na = na_rep if na_rep is not None else _get_na_rep(lev.dtype)
-
- if len(lev) > 0:
- formatted = lev.take(level_codes).format(formatter=formatter)
-
- # we have some NA
- mask = level_codes == -1
- if mask.any():
- formatted = np.array(formatted, dtype=object)
- formatted[mask] = na
- formatted = formatted.tolist()
-
- else:
- # weird all NA case
- formatted = [
- pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n"))
- for x in algos.take_nd(lev._values, level_codes)
- ]
- stringified_levels.append(formatted)
-
- result_levels = []
- for lev, lev_name in zip(stringified_levels, self.names):
- level = []
-
- if names:
- level.append(
- pprint_thing(lev_name, escape_chars=("\t", "\r", "\n"))
- if lev_name is not None
- else ""
- )
-
- level.extend(np.array(lev, dtype=object))
- result_levels.append(level)
-
- if sparsify is None:
- sparsify = get_option("display.multi_sparse")
-
- if sparsify:
- sentinel: Literal[""] | bool | lib.NoDefault = ""
- # GH3547 use value of sparsify as sentinel if it's "Falsey"
- assert isinstance(sparsify, bool) or sparsify is lib.no_default
- if sparsify in [False, lib.no_default]:
- sentinel = sparsify
- # little bit of a kludge job for #1217
- result_levels = sparsify_labels(
- result_levels, start=int(names), sentinel=sentinel
- )
-
- if adjoin:
- from pandas.io.formats.format import get_adjustment
-
- adj = get_adjustment()
- return adj.adjoin(space, *result_levels).split("\n")
- else:
- return result_levels
-
- # --------------------------------------------------------------------
- # Names Methods
-
- def _get_names(self) -> FrozenList:
- return FrozenList(self._names)
-
- def _set_names(self, names, *, level=None, validate: bool = True):
- """
- Set new names on index. Each name has to be a hashable type.
-
- Parameters
- ----------
- names : str or sequence
- name(s) to set
- level : int, level name, or sequence of int/level names (default None)
- If the index is a MultiIndex (hierarchical), level(s) to set (None
- for all levels). Otherwise level must be None
- validate : bool, default True
- validate that the names match level lengths
-
- Raises
- ------
- TypeError if any name is not hashable.
-
- Notes
- -----
- sets names on levels. WARNING: mutates!
-
- Note that you generally want to set this *after* changing levels, so
- that it only acts on copies
- """
- # GH 15110
- # Don't allow a single string for names in a MultiIndex
- if names is not None and not is_list_like(names):
- raise ValueError("Names should be list-like for a MultiIndex")
- names = list(names)
-
- if validate:
- if level is not None and len(names) != len(level):
- raise ValueError("Length of names must match length of level.")
- if level is None and len(names) != self.nlevels:
- raise ValueError(
- "Length of names must match number of levels in MultiIndex."
- )
-
- if level is None:
- level = range(self.nlevels)
- else:
- level = [self._get_level_number(lev) for lev in level]
-
- # set the name
- for lev, name in zip(level, names):
- if name is not None:
- # GH 20527
- # All items in 'names' need to be hashable:
- if not is_hashable(name):
- raise TypeError(
- f"{type(self).__name__}.name must be a hashable type"
- )
- self._names[lev] = name
-
- # If .levels has been accessed, the names in our cache will be stale.
- self._reset_cache()
-
- names = property(
- fset=_set_names,
- fget=_get_names,
- doc="""
- Names of levels in MultiIndex.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays(
- ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z'])
- >>> mi
- MultiIndex([(1, 3, 5),
- (2, 4, 6)],
- names=['x', 'y', 'z'])
- >>> mi.names
- FrozenList(['x', 'y', 'z'])
- """,
- )
-
- # --------------------------------------------------------------------
-
- @cache_readonly
- def inferred_type(self) -> str:
- return "mixed"
-
- def _get_level_number(self, level) -> int:
- count = self.names.count(level)
- if (count > 1) and not is_integer(level):
- raise ValueError(
- f"The name {level} occurs multiple times, use a level number"
- )
- try:
- level = self.names.index(level)
- except ValueError as err:
- if not is_integer(level):
- raise KeyError(f"Level {level} not found") from err
- if level < 0:
- level += self.nlevels
- if level < 0:
- orig_level = level - self.nlevels
- raise IndexError(
- f"Too many levels: Index has only {self.nlevels} levels, "
- f"{orig_level} is not a valid level number"
- ) from err
- # Note: levels are zero-based
- elif level >= self.nlevels:
- raise IndexError(
- f"Too many levels: Index has only {self.nlevels} levels, "
- f"not {level + 1}"
- ) from err
- return level
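-
- # Minimal sketch: with names ['x', 'y'], _get_level_number('y') and
- # _get_level_number(-1) both resolve to 1, while an out-of-range integer
- # such as 2 raises IndexError.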
-
- @cache_readonly
- def is_monotonic_increasing(self) -> bool:
- """
- Return True if the values are equal or increasing.
- """
- if any(-1 in code for code in self.codes):
- return False
-
- if all(level.is_monotonic_increasing for level in self.levels):
- # If each level is sorted, we can operate on the codes directly. GH27495
- return libalgos.is_lexsorted(
- [x.astype("int64", copy=False) for x in self.codes]
- )
-
- # reversed() because lexsort() wants the most significant key last.
- values = [
- self._get_level_values(i)._values for i in reversed(range(len(self.levels)))
- ]
- try:
- # error: Argument 1 to "lexsort" has incompatible type
- # "List[Union[ExtensionArray, ndarray[Any, Any]]]";
- # expected "Union[_SupportsArray[dtype[Any]],
- # _NestedSequence[_SupportsArray[dtype[Any]]], bool,
- # int, float, complex, str, bytes, _NestedSequence[Union
- # [bool, int, float, complex, str, bytes]]]"
- sort_order = np.lexsort(values) # type: ignore[arg-type]
- return Index(sort_order).is_monotonic_increasing
- except TypeError:
- # we have mixed types and np.lexsort is not happy
- return Index(self._values).is_monotonic_increasing
-
- @cache_readonly
- def is_monotonic_decreasing(self) -> bool:
- """
- Return True if the values are equal or decreasing.
- """
- # monotonic decreasing if and only if reverse is monotonic increasing
- return self[::-1].is_monotonic_increasing
-
- @cache_readonly
- def _inferred_type_levels(self) -> list[str]:
- """return a list of the inferred types, one for each level"""
- return [i.inferred_type for i in self.levels]
-
- @doc(Index.duplicated)
- def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
- shape = tuple(len(lev) for lev in self.levels)
- ids = get_group_index(self.codes, shape, sort=False, xnull=False)
-
- return duplicated(ids, keep)
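-
- # Rough sketch: get_group_index packs each row's codes into a single integer,
- # e.g. codes [[0, 0, 1], [0, 0, 1]] with shape (2, 2) give ids like [0, 0, 3],
- # and duplicated(ids, "first") then returns [False, True, False].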
-
- # error: Cannot override final attribute "_duplicated"
- # (previously declared in base class "IndexOpsMixin")
- _duplicated = duplicated # type: ignore[misc]
-
- def fillna(self, value=None, downcast=None):
- """
- fillna is not implemented for MultiIndex
- """
- raise NotImplementedError("fillna is not defined for MultiIndex")
-
- @doc(Index.dropna)
- def dropna(self, how: AnyAll = "any") -> MultiIndex:
- nans = [level_codes == -1 for level_codes in self.codes]
- if how == "any":
- indexer = np.any(nans, axis=0)
- elif how == "all":
- indexer = np.all(nans, axis=0)
- else:
- raise ValueError(f"invalid how option: {how}")
-
- new_codes = [level_codes[~indexer] for level_codes in self.codes]
- return self.set_codes(codes=new_codes)
-
- def _get_level_values(self, level: int, unique: bool = False) -> Index:
- """
- Return vector of label values for requested level,
- equal to the length of the index
-
- **this is an internal method**
-
- Parameters
- ----------
- level : int
- unique : bool, default False
- if True, drop duplicated values
-
- Returns
- -------
- Index
- """
- lev = self.levels[level]
- level_codes = self.codes[level]
- name = self._names[level]
- if unique:
- level_codes = algos.unique(level_codes)
- filled = algos.take_nd(lev._values, level_codes, fill_value=lev._na_value)
- return lev._shallow_copy(filled, name=name)
-
- def get_level_values(self, level):
- """
- Return vector of label values for requested level.
-
- Length of returned vector is equal to the length of the index.
-
- Parameters
- ----------
- level : int or str
- ``level`` is either the integer position of the level in the
- MultiIndex, or the name of the level.
-
- Returns
- -------
- Index
- Values is a level of this MultiIndex converted to
- a single :class:`Index` (or subclass thereof).
-
- Notes
- -----
- If the level contains missing values, the result may be casted to
- ``float`` with missing values specified as ``NaN``. This is because
- the level is converted to a regular ``Index``.
-
- Examples
- --------
- Create a MultiIndex:
-
- >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def')))
- >>> mi.names = ['level_1', 'level_2']
-
- Get level values by supplying level as either integer or name:
-
- >>> mi.get_level_values(0)
- Index(['a', 'b', 'c'], dtype='object', name='level_1')
- >>> mi.get_level_values('level_2')
- Index(['d', 'e', 'f'], dtype='object', name='level_2')
-
- If a level contains missing values, the return type of the level
- may be cast to ``float``.
-
- >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).dtypes
- level_0 int64
- level_1 int64
- dtype: object
- >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).get_level_values(0)
- Index([1.0, nan, 2.0], dtype='float64')
- """
- level = self._get_level_number(level)
- values = self._get_level_values(level)
- return values
-
- @doc(Index.unique)
- def unique(self, level=None):
- if level is None:
- return self.drop_duplicates()
- else:
- level = self._get_level_number(level)
- return self._get_level_values(level=level, unique=True)
-
- def to_frame(
- self,
- index: bool = True,
- name=lib.no_default,
- allow_duplicates: bool = False,
- ) -> DataFrame:
- """
- Create a DataFrame with the levels of the MultiIndex as columns.
-
- Column ordering is determined by the DataFrame constructor with data as
- a dict.
-
- Parameters
- ----------
- index : bool, default True
- Set the index of the returned DataFrame as the original MultiIndex.
-
- name : list / sequence of str, optional
- The passed names should substitute index level names.
-
- allow_duplicates : bool, optional default False
- Allow duplicate column labels to be created.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- DataFrame : Two-dimensional, size-mutable, potentially heterogeneous
- tabular data.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']])
- >>> mi
- MultiIndex([('a', 'c'),
- ('b', 'd')],
- )
-
- >>> df = mi.to_frame()
- >>> df
- 0 1
- a c a c
- b d b d
-
- >>> df = mi.to_frame(index=False)
- >>> df
- 0 1
- 0 a c
- 1 b d
-
- >>> df = mi.to_frame(name=['x', 'y'])
- >>> df
- x y
- a c a c
- b d b d
- """
- from pandas import DataFrame
-
- if name is not lib.no_default:
- if not is_list_like(name):
- raise TypeError("'name' must be a list / sequence of column names.")
-
- if len(name) != len(self.levels):
- raise ValueError(
- "'name' should have same length as number of levels on index."
- )
- idx_names = name
- else:
- idx_names = self._get_level_names()
-
- if not allow_duplicates and len(set(idx_names)) != len(idx_names):
- raise ValueError(
- "Cannot create duplicate column labels if allow_duplicates is False"
- )
-
- # Guarantee resulting column order - PY36+ dict maintains insertion order
- result = DataFrame(
- {level: self._get_level_values(level) for level in range(len(self.levels))},
- copy=False,
- )
- result.columns = idx_names
-
- if index:
- result.index = self
- return result
-
- # error: Return type "Index" of "to_flat_index" incompatible with return type
- # "MultiIndex" in supertype "Index"
- def to_flat_index(self) -> Index: # type: ignore[override]
- """
- Convert a MultiIndex to an Index of Tuples containing the level values.
-
- Returns
- -------
- pd.Index
- Index with the MultiIndex data represented in Tuples.
-
- See Also
- --------
- MultiIndex.from_tuples : Convert flat index back to MultiIndex.
-
- Notes
- -----
- This method will simply return the caller if called by anything other
- than a MultiIndex.
-
- Examples
- --------
- >>> index = pd.MultiIndex.from_product(
- ... [['foo', 'bar'], ['baz', 'qux']],
- ... names=['a', 'b'])
- >>> index.to_flat_index()
- Index([('foo', 'baz'), ('foo', 'qux'),
- ('bar', 'baz'), ('bar', 'qux')],
- dtype='object')
- """
- return Index(self._values, tupleize_cols=False)
-
- def _is_lexsorted(self) -> bool:
- """
- Return True if the codes are lexicographically sorted.
-
- Returns
- -------
- bool
-
- Examples
- --------
- In the below examples, the first level of the MultiIndex is sorted because
- a<b<c, so there is no need to look at the next level.
-
- >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'],
- ... ['d', 'e', 'f']])._is_lexsorted()
- True
- >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'],
- ... ['d', 'f', 'e']])._is_lexsorted()
- True
-
- In case there is a tie, the lexicographical sorting looks
- at the next level of the MultiIndex.
-
- >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']])._is_lexsorted()
- True
- >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']])._is_lexsorted()
- False
- >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
- ... ['aa', 'bb', 'aa', 'bb']])._is_lexsorted()
- True
- >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
- ... ['bb', 'aa', 'aa', 'bb']])._is_lexsorted()
- False
- """
- return self._lexsort_depth == self.nlevels
-
- @cache_readonly
- def _lexsort_depth(self) -> int:
- """
- Compute and return the lexsort_depth, the number of levels of the
- MultiIndex that are sorted lexically
-
- Returns
- -------
- int
- """
- if self.sortorder is not None:
- return self.sortorder
- return _lexsort_depth(self.codes, self.nlevels)
-
- def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIndex:
- """
- This is an *internal* function.
-
- Create a new MultiIndex from the current to monotonically sorted
- items IN the levels. This does not actually make the entire MultiIndex
- monotonic, JUST the levels.
-
- The resulting MultiIndex will have the same outward
- appearance, meaning the same .values and ordering. It will also
- be .equals() to the original.
-
- Returns
- -------
- MultiIndex
-
- Examples
- --------
- >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
- ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
- >>> mi
- MultiIndex([('a', 'bb'),
- ('a', 'aa'),
- ('b', 'bb'),
- ('b', 'aa')],
- )
-
- >>> mi.sort_values()
- MultiIndex([('a', 'aa'),
- ('a', 'bb'),
- ('b', 'aa'),
- ('b', 'bb')],
- )
- """
- if self._is_lexsorted() and self.is_monotonic_increasing:
- return self
-
- new_levels = []
- new_codes = []
-
- for lev, level_codes in zip(self.levels, self.codes):
- if not lev.is_monotonic_increasing:
- try:
- # indexer to reorder the levels
- indexer = lev.argsort()
- except TypeError:
- if raise_if_incomparable:
- raise
- else:
- lev = lev.take(indexer)
-
- # indexer to reorder the level codes
- indexer = ensure_platform_int(indexer)
- ri = lib.get_reverse_indexer(indexer, len(indexer))
- level_codes = algos.take_nd(ri, level_codes)
-
- new_levels.append(lev)
- new_codes.append(level_codes)
-
- return MultiIndex(
- new_levels,
- new_codes,
- names=self.names,
- sortorder=self.sortorder,
- verify_integrity=False,
- )
-
- def remove_unused_levels(self) -> MultiIndex:
- """
- Create new MultiIndex from current that removes unused levels.
-
- Unused level(s) means levels that are not expressed in the
- labels. The resulting MultiIndex will have the same outward
- appearance, meaning the same .values and ordering. It will
- also be .equals() to the original.
-
- Returns
- -------
- MultiIndex
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_product([range(2), list('ab')])
- >>> mi
- MultiIndex([(0, 'a'),
- (0, 'b'),
- (1, 'a'),
- (1, 'b')],
- )
-
- >>> mi[2:]
- MultiIndex([(1, 'a'),
- (1, 'b')],
- )
-
- The 0 from the first level is not represented
- and can be removed
-
- >>> mi2 = mi[2:].remove_unused_levels()
- >>> mi2.levels
- FrozenList([[1], ['a', 'b']])
- """
- new_levels = []
- new_codes = []
-
- changed = False
- for lev, level_codes in zip(self.levels, self.codes):
- # Since few levels are typically unused, bincount() is more
- # efficient than unique() - however it only accepts non-negative values
- # (and drops order):
- uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1
- has_na = int(len(uniques) and (uniques[0] == -1))
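-
- # Sketch of the bincount trick with toy codes: level_codes = [2, 2, 3] and
- # len(lev) = 4 give bincount([3, 3, 4]) = [0, 0, 0, 2, 1], so
- # uniques = [3, 4] - 1 = [2, 3] and has_na = 0; since 2 != 4, entries 0 and 1
- # of ``lev`` are unused and get dropped below.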
-
- if len(uniques) != len(lev) + has_na:
- if lev.isna().any() and len(uniques) == len(lev):
- break
- # We have unused levels
- changed = True
-
- # Recalculate uniques, now preserving order.
- # Can easily be cythonized by exploiting the already existing
- # "uniques" and stop parsing "level_codes" when all items
- # are found:
- uniques = algos.unique(level_codes)
- if has_na:
- na_idx = np.where(uniques == -1)[0]
- # Just ensure that -1 is in first position:
- uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]
-
- # codes get mapped from uniques to 0:len(uniques)
- # -1 (if present) is mapped to last position
- code_mapping = np.zeros(len(lev) + has_na)
- # ... and reassigned value -1:
- code_mapping[uniques] = np.arange(len(uniques)) - has_na
-
- level_codes = code_mapping[level_codes]
-
- # new levels are simple
- lev = lev.take(uniques[has_na:])
-
- new_levels.append(lev)
- new_codes.append(level_codes)
-
- result = self.view()
-
- if changed:
- result._reset_identity()
- result._set_levels(new_levels, validate=False)
- result._set_codes(new_codes, validate=False)
-
- return result
-
- # --------------------------------------------------------------------
- # Pickling Methods
-
- def __reduce__(self):
- """Necessary for making this object picklable"""
- d = {
- "levels": list(self.levels),
- "codes": list(self.codes),
- "sortorder": self.sortorder,
- "names": list(self.names),
- }
- return ibase._new_Index, (type(self), d), None
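-
- # Pickle round-trip sketch: pickle.loads(pickle.dumps(mi)) rebuilds the index
- # through ibase._new_Index(MultiIndex, d), and mi.equals(the round-tripped
- # index) is True.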
-
- # --------------------------------------------------------------------
-
- def __getitem__(self, key):
- if is_scalar(key):
- key = com.cast_scalar_indexer(key)
-
- retval = []
- for lev, level_codes in zip(self.levels, self.codes):
- if level_codes[key] == -1:
- retval.append(np.nan)
- else:
- retval.append(lev[level_codes[key]])
-
- return tuple(retval)
- else:
- # in general cannot be sure whether the result will be sorted
- sortorder = None
- if com.is_bool_indexer(key):
- key = np.asarray(key, dtype=bool)
- sortorder = self.sortorder
- elif isinstance(key, slice):
- if key.step is None or key.step > 0:
- sortorder = self.sortorder
- elif isinstance(key, Index):
- key = np.asarray(key)
-
- new_codes = [level_codes[key] for level_codes in self.codes]
-
- return MultiIndex(
- levels=self.levels,
- codes=new_codes,
- names=self.names,
- sortorder=sortorder,
- verify_integrity=False,
- )
-
- def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex:
- """
- Fastpath for __getitem__ when we know we have a slice.
- """
- sortorder = None
- if slobj.step is None or slobj.step > 0:
- sortorder = self.sortorder
-
- new_codes = [level_codes[slobj] for level_codes in self.codes]
-
- return type(self)(
- levels=self.levels,
- codes=new_codes,
- names=self._names,
- sortorder=sortorder,
- verify_integrity=False,
- )
-
- @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
- def take(
- self: MultiIndex,
- indices,
- axis: Axis = 0,
- allow_fill: bool = True,
- fill_value=None,
- **kwargs,
- ) -> MultiIndex:
- nv.validate_take((), kwargs)
- indices = ensure_platform_int(indices)
-
- # only fill if we are passing a non-None fill_value
- allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices)
-
- na_value = -1
-
- taken = [lab.take(indices) for lab in self.codes]
- if allow_fill:
- mask = indices == -1
- if mask.any():
- masked = []
- for new_label in taken:
- label_values = new_label
- label_values[mask] = na_value
- masked.append(np.asarray(label_values))
- taken = masked
-
- return MultiIndex(
- levels=self.levels, codes=taken, names=self.names, verify_integrity=False
- )
-
- def append(self, other):
- """
- Append a collection of Index options together.
-
- Parameters
- ----------
- other : Index or list/tuple of indices
-
- Returns
- -------
- Index
- The combined index.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([['a'], ['b']])
- >>> mi
- MultiIndex([('a', 'b')],
- )
- >>> mi.append(mi)
- MultiIndex([('a', 'b'), ('a', 'b')],
- )
- """
- if not isinstance(other, (list, tuple)):
- other = [other]
-
- if all(
- (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other
- ):
- arrays, names = [], []
- for i in range(self.nlevels):
- label = self._get_level_values(i)
- appended = [o._get_level_values(i) for o in other]
- arrays.append(label.append(appended))
- single_label_name = all(label.name == x.name for x in appended)
- names.append(label.name if single_label_name else None)
- return MultiIndex.from_arrays(arrays, names=names)
-
- to_concat = (self._values,) + tuple(k._values for k in other)
- new_tuples = np.concatenate(to_concat)
-
- # if all(isinstance(x, MultiIndex) for x in other):
- try:
- # We only get here if other contains at least one index with tuples,
- # setting names to None automatically
- return MultiIndex.from_tuples(new_tuples)
- except (TypeError, IndexError):
- return Index(new_tuples)
-
- def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
- if len(args) == 0 and len(kwargs) == 0:
- # lexsort is significantly faster than self._values.argsort()
- target = self._sort_levels_monotonic(raise_if_incomparable=True)
- return lexsort_indexer(target._get_codes_for_sorting())
- return self._values.argsort(*args, **kwargs)
-
- @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
- def repeat(self, repeats: int, axis=None) -> MultiIndex:
- nv.validate_repeat((), {"axis": axis})
- # error: Incompatible types in assignment (expression has type "ndarray",
- # variable has type "int")
- repeats = ensure_platform_int(repeats) # type: ignore[assignment]
- return MultiIndex(
- levels=self.levels,
- codes=[
- level_codes.view(np.ndarray).astype(np.intp, copy=False).repeat(repeats)
- for level_codes in self.codes
- ],
- names=self.names,
- sortorder=self.sortorder,
- verify_integrity=False,
- )
-
- # error: Signature of "drop" incompatible with supertype "Index"
- def drop( # type: ignore[override]
- self,
- codes,
- level: Index | np.ndarray | Iterable[Hashable] | None = None,
- errors: IgnoreRaise = "raise",
- ) -> MultiIndex:
- """
- Make new MultiIndex with passed list of codes deleted.
-
- Parameters
- ----------
- codes : array-like
- Must be a list of tuples when level is not specified.
- level : int or level name, default None
- errors : str, default 'raise'
-
- Returns
- -------
- MultiIndex
- """
- if level is not None:
- return self._drop_from_level(codes, level, errors)
-
- if not isinstance(codes, (np.ndarray, Index)):
- try:
- codes = com.index_labels_to_array(codes, dtype=np.dtype("object"))
- except ValueError:
- pass
-
- inds = []
- for level_codes in codes:
- try:
- loc = self.get_loc(level_codes)
- # get_loc returns either an integer, a slice, or a boolean
- # mask
- if isinstance(loc, int):
- inds.append(loc)
- elif isinstance(loc, slice):
- step = loc.step if loc.step is not None else 1
- inds.extend(range(loc.start, loc.stop, step))
- elif com.is_bool_indexer(loc):
- if self._lexsort_depth == 0:
- warnings.warn(
- "dropping on a non-lexsorted multi-index "
- "without a level parameter may impact performance.",
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
- loc = loc.nonzero()[0]
- inds.extend(loc)
- else:
- msg = f"unsupported indexer of type {type(loc)}"
- raise AssertionError(msg)
- except KeyError:
- if errors != "ignore":
- raise
-
- return self.delete(inds)
-
- def _drop_from_level(
- self, codes, level, errors: IgnoreRaise = "raise"
- ) -> MultiIndex:
- codes = com.index_labels_to_array(codes)
- i = self._get_level_number(level)
- index = self.levels[i]
- values = index.get_indexer(codes)
- # If nan should be dropped it will equal -1 here. We have to check which values
- # are not nan and equal -1; those are missing from the index.
- nan_codes = isna(codes)
- values[(np.equal(nan_codes, False)) & (values == -1)] = -2
- if index.shape[0] == self.shape[0]:
- values[np.equal(nan_codes, True)] = -2
-
- not_found = codes[values == -2]
- if len(not_found) != 0 and errors != "ignore":
- raise KeyError(f"labels {not_found} not found in level")
- mask = ~algos.isin(self.codes[i], values)
-
- return self[mask]
-
- def swaplevel(self, i=-2, j=-1) -> MultiIndex:
- """
- Swap level i with level j.
-
- Calling this method does not change the ordering of the values.
-
- Parameters
- ----------
- i : int, str, default -2
- First level of index to be swapped. Can pass level name as string.
- Type of parameters can be mixed.
- j : int, str, default -1
- Second level of index to be swapped. Can pass level name as string.
- Type of parameters can be mixed.
-
- Returns
- -------
- MultiIndex
- A new MultiIndex.
-
- See Also
- --------
- Series.swaplevel : Swap levels i and j in a MultiIndex.
- DataFrame.swaplevel : Swap levels i and j in a MultiIndex on a
- particular axis.
-
- Examples
- --------
- >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
- ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
- >>> mi
- MultiIndex([('a', 'bb'),
- ('a', 'aa'),
- ('b', 'bb'),
- ('b', 'aa')],
- )
- >>> mi.swaplevel(0, 1)
- MultiIndex([('bb', 'a'),
- ('aa', 'a'),
- ('bb', 'b'),
- ('aa', 'b')],
- )
- """
- new_levels = list(self.levels)
- new_codes = list(self.codes)
- new_names = list(self.names)
-
- i = self._get_level_number(i)
- j = self._get_level_number(j)
-
- new_levels[i], new_levels[j] = new_levels[j], new_levels[i]
- new_codes[i], new_codes[j] = new_codes[j], new_codes[i]
- new_names[i], new_names[j] = new_names[j], new_names[i]
-
- return MultiIndex(
- levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
- )
-
- def reorder_levels(self, order) -> MultiIndex:
- """
- Rearrange levels using input order. May not drop or duplicate levels.
-
- Parameters
- ----------
- order : list of int or list of str
- List representing new level order. Reference level by number
- (position) or by key (label).
-
- Returns
- -------
- MultiIndex
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y'])
- >>> mi
- MultiIndex([(1, 3),
- (2, 4)],
- names=['x', 'y'])
-
- >>> mi.reorder_levels(order=[1, 0])
- MultiIndex([(3, 1),
- (4, 2)],
- names=['y', 'x'])
-
- >>> mi.reorder_levels(order=['y', 'x'])
- MultiIndex([(3, 1),
- (4, 2)],
- names=['y', 'x'])
- """
- order = [self._get_level_number(i) for i in order]
- if len(order) != self.nlevels:
- raise AssertionError(
- f"Length of order must be same as number of levels ({self.nlevels}), "
- f"got {len(order)}"
- )
- new_levels = [self.levels[i] for i in order]
- new_codes = [self.codes[i] for i in order]
- new_names = [self.names[i] for i in order]
-
- return MultiIndex(
- levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
- )
-
- def _get_codes_for_sorting(self) -> list[Categorical]:
- """
- we are categorizing our codes by using the
- available categories (all, not just observed)
- excluding any missing ones (-1); this is in preparation
- for sorting, where we need to disambiguate that -1 is not
- a valid value
- """
-
- def cats(level_codes):
- return np.arange(
- np.array(level_codes).max() + 1 if len(level_codes) else 0,
- dtype=level_codes.dtype,
- )
-
- return [
- Categorical.from_codes(level_codes, cats(level_codes), ordered=True)
- for level_codes in self.codes
- ]
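-
- # Toy sketch: for level_codes = np.array([0, -1, 2, 1], dtype=np.int8), cats()
- # gives np.arange(3), and Categorical.from_codes marks the -1 entry as missing,
- # so sorting treats it as NaN rather than as a real category.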
-
- def sortlevel(
- self,
- level: IndexLabel = 0,
- ascending: bool | list[bool] = True,
- sort_remaining: bool = True,
- ) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
- """
- Sort MultiIndex at the requested level.
-
- The result will respect the original ordering of the associated
- factor at that level.
-
- Parameters
- ----------
- level : list-like, int or str, default 0
- If a string is given, must be a name of the level.
- If list-like must be names or ints of levels.
- ascending : bool, default True
- False to sort in descending order.
- Can also be a list to specify a directed ordering.
- sort_remaining : bool, default True
- If True, also sort by the remaining levels after ``level``.
-
- Returns
- -------
- sorted_index : pd.MultiIndex
- Resulting index.
- indexer : np.ndarray[np.intp]
- Indices of output values in original index.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]])
- >>> mi
- MultiIndex([(0, 2),
- (0, 1)],
- )
-
- >>> mi.sortlevel()
- (MultiIndex([(0, 1),
- (0, 2)],
- ), array([1, 0]))
-
- >>> mi.sortlevel(sort_remaining=False)
- (MultiIndex([(0, 2),
- (0, 1)],
- ), array([0, 1]))
-
- >>> mi.sortlevel(1)
- (MultiIndex([(0, 1),
- (0, 2)],
- ), array([1, 0]))
-
- >>> mi.sortlevel(1, ascending=False)
- (MultiIndex([(0, 2),
- (0, 1)],
- ), array([0, 1]))
- """
- if not is_list_like(level):
- level = [level]
- # error: Item "Hashable" of "Union[Hashable, Sequence[Hashable]]" has
- # no attribute "__iter__" (not iterable)
- level = [
- self._get_level_number(lev) for lev in level # type: ignore[union-attr]
- ]
- sortorder = None
-
- # we have a directed ordering via ascending
- if isinstance(ascending, list):
- if not len(level) == len(ascending):
- raise ValueError("level must have same length as ascending")
-
- indexer = lexsort_indexer(
- [self.codes[lev] for lev in level], orders=ascending
- )
-
- # level ordering
- else:
- codes = list(self.codes)
- shape = list(self.levshape)
-
- # partition codes and shape
- primary = tuple(codes[lev] for lev in level)
- primshp = tuple(shape[lev] for lev in level)
-
- # Reverse sorted to retain the order of
- # smaller indices that needs to be removed
- for lev in sorted(level, reverse=True):
- codes.pop(lev)
- shape.pop(lev)
-
- if sort_remaining:
- primary += primary + tuple(codes)
- primshp += primshp + tuple(shape)
- else:
- sortorder = level[0]
-
- indexer = indexer_from_factorized(primary, primshp, compress=False)
-
- if not ascending:
- indexer = indexer[::-1]
-
- indexer = ensure_platform_int(indexer)
- new_codes = [level_codes.take(indexer) for level_codes in self.codes]
-
- new_index = MultiIndex(
- codes=new_codes,
- levels=self.levels,
- names=self.names,
- sortorder=sortorder,
- verify_integrity=False,
- )
-
- return new_index, indexer
-
- def _wrap_reindex_result(self, target, indexer, preserve_names: bool):
- if not isinstance(target, MultiIndex):
- if indexer is None:
- target = self
- elif (indexer >= 0).all():
- target = self.take(indexer)
- else:
- try:
- target = MultiIndex.from_tuples(target)
- except TypeError:
- # not all tuples, see test_constructor_dict_multiindex_reindex_flat
- return target
-
- target = self._maybe_preserve_names(target, preserve_names)
- return target
-
- def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index:
- if (
- preserve_names
- and target.nlevels == self.nlevels
- and target.names != self.names
- ):
- target = target.copy(deep=False)
- target.names = self.names
- return target
-
- # --------------------------------------------------------------------
- # Indexing Methods
-
- def _check_indexing_error(self, key) -> None:
- if not is_hashable(key) or is_iterator(key):
- # We allow tuples if they are hashable, whereas other Index
- # subclasses require scalar.
- # We have to explicitly exclude generators, as these are hashable.
- raise InvalidIndexError(key)
-
- @cache_readonly
- def _should_fallback_to_positional(self) -> bool:
- """
- Should integer key(s) be treated as positional?
- """
- # GH#33355
- return self.levels[0]._should_fallback_to_positional
-
- def _get_indexer_strict(
- self, key, axis_name: str
- ) -> tuple[Index, npt.NDArray[np.intp]]:
- keyarr = key
- if not isinstance(keyarr, Index):
- keyarr = com.asarray_tuplesafe(keyarr)
-
- if len(keyarr) and not isinstance(keyarr[0], tuple):
- indexer = self._get_indexer_level_0(keyarr)
-
- self._raise_if_missing(key, indexer, axis_name)
- return self[indexer], indexer
-
- return super()._get_indexer_strict(key, axis_name)
-
- def _raise_if_missing(self, key, indexer, axis_name: str) -> None:
- keyarr = key
- if not isinstance(key, Index):
- keyarr = com.asarray_tuplesafe(key)
-
- if len(keyarr) and not isinstance(keyarr[0], tuple):
- # i.e. same condition for special case in MultiIndex._get_indexer_strict
-
- mask = indexer == -1
- if mask.any():
- check = self.levels[0].get_indexer(keyarr)
- cmask = check == -1
- if cmask.any():
- raise KeyError(f"{keyarr[cmask]} not in index")
- # We get here when levels still contain values which are not
- # actually in Index anymore
- raise KeyError(f"{keyarr} not in index")
- else:
- return super()._raise_if_missing(key, indexer, axis_name)
-
- def _get_indexer_level_0(self, target) -> npt.NDArray[np.intp]:
- """
- Optimized equivalent to `self.get_level_values(0).get_indexer_for(target)`.
- """
- lev = self.levels[0]
- codes = self._codes[0]
- cat = Categorical.from_codes(codes=codes, categories=lev)
- ci = Index(cat)
- return ci.get_indexer_for(target)
-
- def get_slice_bound(
- self,
- label: Hashable | Sequence[Hashable],
- side: Literal["left", "right"],
- ) -> int:
- """
- For an ordered MultiIndex, compute slice bound
- that corresponds to given label.
-
- Returns leftmost (one-past-the-rightmost if ``side=='right'``) position
- of given label.
-
- Parameters
- ----------
- label : object or tuple of objects
- side : {'left', 'right'}
-
- Returns
- -------
- int
- Index of label.
-
- Notes
- -----
- This method only works if level 0 index of the MultiIndex is lexsorted.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')])
-
- Get the locations from the leftmost 'b' in the first level
- until the end of the multiindex:
-
- >>> mi.get_slice_bound('b', side="left")
- 1
-
- Like above, but if you get the locations from the rightmost
- 'b' in the first level and 'f' in the second level:
-
- >>> mi.get_slice_bound(('b','f'), side="right")
- 3
-
- See Also
- --------
- MultiIndex.get_loc : Get location for a label or a tuple of labels.
- MultiIndex.get_locs : Get location for a label/slice/list/mask or a
- sequence of such.
- """
- if not isinstance(label, tuple):
- label = (label,)
- return self._partial_tup_index(label, side=side)
-
- # pylint: disable-next=useless-parent-delegation
- def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]:
- """
- For an ordered MultiIndex, compute the slice locations for input
- labels.
-
- The input labels can be tuples representing partial levels, e.g. for a
- MultiIndex with 3 levels, you can pass a single value (corresponding to
- the first level), or a 1-, 2-, or 3-tuple.
-
- Parameters
- ----------
- start : label or tuple, default None
- If None, defaults to the beginning.
- end : label or tuple, default None
- If None, defaults to the end.
- step : int, default None
- Slice step.
-
- Returns
- -------
- (start, end) : (int, int)
-
- Notes
- -----
- This method only works if the MultiIndex is properly lexsorted. So,
- if only the first 2 levels of a 3-level MultiIndex are lexsorted,
- you can only pass two levels to ``.slice_locs``.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')],
- ... names=['A', 'B'])
-
- Get the slice locations from the beginning of 'b' in the first level
- until the end of the multiindex:
-
- >>> mi.slice_locs(start='b')
- (1, 4)
-
- Like above, but stop at the end of 'b' in the first level and 'f' in
- the second level:
-
- >>> mi.slice_locs(start='b', end=('b', 'f'))
- (1, 3)
-
- See Also
- --------
- MultiIndex.get_loc : Get location for a label or a tuple of labels.
- MultiIndex.get_locs : Get location for a label/slice/list/mask or a
- sequence of such.
- """
- # This function adds nothing to its parent implementation (the magic
- # happens in get_slice_bound method), but it adds meaningful doc.
- return super().slice_locs(start, end, step)
-
- def _partial_tup_index(self, tup: tuple, side: Literal["left", "right"] = "left"):
- if len(tup) > self._lexsort_depth:
- raise UnsortedIndexError(
- f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth "
- f"({self._lexsort_depth})"
- )
-
- n = len(tup)
- start, end = 0, len(self)
- zipped = zip(tup, self.levels, self.codes)
- for k, (lab, lev, level_codes) in enumerate(zipped):
- section = level_codes[start:end]
-
- if lab not in lev and not isna(lab):
- # short circuit
- try:
- loc = algos.searchsorted(lev, lab, side=side)
- except TypeError as err:
- # non-comparable e.g. test_slice_locs_with_type_mismatch
- raise TypeError(f"Level type mismatch: {lab}") from err
- if not is_integer(loc):
- # non-comparable level, e.g. test_groupby_example
- raise TypeError(f"Level type mismatch: {lab}")
- if side == "right" and loc >= 0:
- loc -= 1
- return start + algos.searchsorted(section, loc, side=side)
-
- idx = self._get_loc_single_level_index(lev, lab)
- if isinstance(idx, slice) and k < n - 1:
- # Get start and end value from slice, necessary when a non-integer
- # interval is given as input GH#37707
- start = idx.start
- end = idx.stop
- elif k < n - 1:
- # error: Incompatible types in assignment (expression has type
- # "Union[ndarray[Any, dtype[signedinteger[Any]]]
- end = start + algos.searchsorted( # type: ignore[assignment]
- section, idx, side="right"
- )
- # error: Incompatible types in assignment (expression has type
- # "Union[ndarray[Any, dtype[signedinteger[Any]]]
- start = start + algos.searchsorted( # type: ignore[assignment]
- section, idx, side="left"
- )
- elif isinstance(idx, slice):
- idx = idx.start
- return start + algos.searchsorted(section, idx, side=side)
- else:
- return start + algos.searchsorted(section, idx, side=side)
-
- def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
- """
- If key is an NA value, the location in the index is uniformly -1.
-
- Parameters
- ----------
- level_index: Index
- key : label
-
- Returns
- -------
- loc : int
- If key is an NA value, loc is -1.
- Otherwise, the location of key in the index.
-
- See Also
- --------
- Index.get_loc : The get_loc method for (single-level) index.
- """
- if is_scalar(key) and isna(key):
- # TODO: need is_valid_na_for_dtype(key, level_index.dtype)
- return -1
- else:
- return level_index.get_loc(key)
-
- def get_loc(self, key):
- """
- Get location for a label or a tuple of labels.
-
- The location is returned as an integer/slice or boolean
- mask.
-
- Parameters
- ----------
- key : label or tuple of labels (one for each level)
-
- Returns
- -------
- int, slice object or boolean mask
- If the key is past the lexsort depth, the return may be a
- boolean mask array, otherwise it is always a slice or int.
-
- See Also
- --------
- Index.get_loc : The get_loc method for (single-level) index.
- MultiIndex.slice_locs : Get slice location given start label(s) and
- end label(s).
- MultiIndex.get_locs : Get location for a label/slice/list/mask or a
- sequence of such.
-
- Notes
- -----
- The key cannot be a slice, list of same-level labels, a boolean mask,
- or a sequence of such. If you want to use those, use
- :meth:`MultiIndex.get_locs` instead.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
-
- >>> mi.get_loc('b')
- slice(1, 3, None)
-
- >>> mi.get_loc(('b', 'e'))
- 1
- """
- self._check_indexing_error(key)
-
- def _maybe_to_slice(loc):
- """convert integer indexer to boolean mask or slice if possible"""
- if not isinstance(loc, np.ndarray) or loc.dtype != np.intp:
- return loc
-
- loc = lib.maybe_indices_to_slice(loc, len(self))
- if isinstance(loc, slice):
- return loc
-
- mask = np.empty(len(self), dtype="bool")
- mask.fill(False)
- mask[loc] = True
- return mask
-
- if not isinstance(key, tuple):
- loc = self._get_level_indexer(key, level=0)
- return _maybe_to_slice(loc)
-
- keylen = len(key)
- if self.nlevels < keylen:
- raise KeyError(
- f"Key length ({keylen}) exceeds index depth ({self.nlevels})"
- )
-
- if keylen == self.nlevels and self.is_unique:
- # TODO: what if we have an IntervalIndex level?
- # i.e. do we need _index_as_unique on that level?
- try:
- return self._engine.get_loc(key)
- except TypeError:
- # e.g. test_partial_slicing_with_multiindex partial string slicing
- loc, _ = self.get_loc_level(key, list(range(self.nlevels)))
- return loc
-
- # -- partial selection or non-unique index
- # break the key into 2 parts based on the lexsort_depth of the index;
- # the first part returns a contiguous slice of the index; the 2nd part
- # needs linear search within the slice
- i = self._lexsort_depth
- lead_key, follow_key = key[:i], key[i:]
-
- if not lead_key:
- start = 0
- stop = len(self)
- else:
- try:
- start, stop = self.slice_locs(lead_key, lead_key)
- except TypeError as err:
- # e.g. test_groupby_example key = ((0, 0, 1, 2), "new_col")
- # when self has 5 integer levels
- raise KeyError(key) from err
-
- if start == stop:
- raise KeyError(key)
-
- if not follow_key:
- return slice(start, stop)
-
- warnings.warn(
- "indexing past lexsort depth may impact performance.",
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
-
- loc = np.arange(start, stop, dtype=np.intp)
-
- for i, k in enumerate(follow_key, len(lead_key)):
- mask = self.codes[i][loc] == self._get_loc_single_level_index(
- self.levels[i], k
- )
- if not mask.all():
- loc = loc[mask]
- if not len(loc):
- raise KeyError(key)
-
- return _maybe_to_slice(loc) if len(loc) != stop - start else slice(start, stop)
-
- def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True):
- """
- Get location and sliced index for requested label(s)/level(s).
-
- Parameters
- ----------
- key : label or sequence of labels
- level : int/level name or list thereof, optional
- drop_level : bool, default True
- If ``False``, the resulting index will not drop any level.
-
- Returns
- -------
- tuple
- A 2-tuple where the elements :
-
- Element 0: int, slice object or boolean array.
-
- Element 1: The resulting sliced multiindex/index. If the key
- contains all levels, this will be ``None``.
-
- See Also
- --------
- MultiIndex.get_loc : Get location for a label or a tuple of labels.
- MultiIndex.get_locs : Get location for a label/slice/list/mask or a
- sequence of such.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')],
- ... names=['A', 'B'])
-
- >>> mi.get_loc_level('b')
- (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B'))
-
- >>> mi.get_loc_level('e', level='B')
- (array([False, True, False]), Index(['b'], dtype='object', name='A'))
-
- >>> mi.get_loc_level(['b', 'e'])
- (1, None)
- """
- if not isinstance(level, (list, tuple)):
- level = self._get_level_number(level)
- else:
- level = [self._get_level_number(lev) for lev in level]
-
- loc, mi = self._get_loc_level(key, level=level)
- if not drop_level:
- if lib.is_integer(loc):
- mi = self[loc : loc + 1]
- else:
- mi = self[loc]
- return loc, mi
-
- def _get_loc_level(self, key, level: int | list[int] = 0):
- """
- get_loc_level but with `level` known to be positional, not name-based.
- """
-
- # different name to distinguish from maybe_droplevels
- def maybe_mi_droplevels(indexer, levels):
- """
- If level does not exist or all levels were dropped, the exception
- has to be handled outside.
- """
- new_index = self[indexer]
-
- for i in sorted(levels, reverse=True):
- new_index = new_index._drop_level_numbers([i])
-
- return new_index
-
- if isinstance(level, (tuple, list)):
- if len(key) != len(level):
- raise AssertionError(
- "Key for location must have same length as number of levels"
- )
- result = None
- for lev, k in zip(level, key):
- loc, new_index = self._get_loc_level(k, level=lev)
- if isinstance(loc, slice):
- mask = np.zeros(len(self), dtype=bool)
- mask[loc] = True
- loc = mask
- result = loc if result is None else result & loc
-
- try:
- # FIXME: we should be only dropping levels on which we are
- # scalar-indexing
- mi = maybe_mi_droplevels(result, level)
- except ValueError:
- # droplevel failed because we tried to drop all levels,
- # i.e. len(level) == self.nlevels
- mi = self[result]
-
- return result, mi
-
- # kludge for #1796
- if isinstance(key, list):
- key = tuple(key)
-
- if isinstance(key, tuple) and level == 0:
- try:
- # Check if this tuple is a single key in our first level
- if key in self.levels[0]:
- indexer = self._get_level_indexer(key, level=level)
- new_index = maybe_mi_droplevels(indexer, [0])
- return indexer, new_index
- except (TypeError, InvalidIndexError):
- pass
-
- if not any(isinstance(k, slice) for k in key):
- if len(key) == self.nlevels and self.is_unique:
- # Complete key in unique index -> standard get_loc
- try:
- return (self._engine.get_loc(key), None)
- except KeyError as err:
- raise KeyError(key) from err
- except TypeError:
- # e.g. partial string indexing
- # test_partial_string_timestamp_multiindex
- pass
-
- # partial selection
- indexer = self.get_loc(key)
- ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
- if len(ilevels) == self.nlevels:
- if is_integer(indexer):
- # we are dropping all levels
- return indexer, None
-
- # TODO: in some cases we still need to drop some levels,
- # e.g. test_multiindex_perf_warn
- # test_partial_string_timestamp_multiindex
- ilevels = [
- i
- for i in range(len(key))
- if (
- not isinstance(key[i], str)
- or not self.levels[i]._supports_partial_string_indexing
- )
- and key[i] != slice(None, None)
- ]
- if len(ilevels) == self.nlevels:
- # TODO: why?
- ilevels = []
- return indexer, maybe_mi_droplevels(indexer, ilevels)
-
- else:
- indexer = None
- for i, k in enumerate(key):
- if not isinstance(k, slice):
- loc_level = self._get_level_indexer(k, level=i)
- if isinstance(loc_level, slice):
- if com.is_null_slice(loc_level) or com.is_full_slice(
- loc_level, len(self)
- ):
- # everything
- continue
-
- # e.g. test_xs_IndexSlice_argument_not_implemented
- k_index = np.zeros(len(self), dtype=bool)
- k_index[loc_level] = True
-
- else:
- k_index = loc_level
-
- elif com.is_null_slice(k):
- # taking everything, does not affect `indexer` below
- continue
-
- else:
- # FIXME: this message can be inaccurate, e.g.
- # test_series_varied_multiindex_alignment
- raise TypeError(f"Expected label or tuple of labels, got {key}")
-
- if indexer is None:
- indexer = k_index
- else:
- indexer &= k_index
- if indexer is None:
- indexer = slice(None, None)
- ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
- return indexer, maybe_mi_droplevels(indexer, ilevels)
- else:
- indexer = self._get_level_indexer(key, level=level)
- if (
- isinstance(key, str)
- and self.levels[level]._supports_partial_string_indexing
- ):
- # check to see if we did an exact lookup vs sliced
- check = self.levels[level].get_loc(key)
- if not is_integer(check):
- # e.g. test_partial_string_timestamp_multiindex
- return indexer, self[indexer]
-
- try:
- result_index = maybe_mi_droplevels(indexer, [level])
- except ValueError:
- result_index = self[indexer]
-
- return indexer, result_index
-
- def _get_level_indexer(
- self, key, level: int = 0, indexer: npt.NDArray[np.bool_] | None = None
- ):
- # `level` kwarg is _always_ positional, never name
- # return a boolean array or slice showing where the key is
- # in the totality of values
- # if the indexer is provided, then use this
-
- level_index = self.levels[level]
- level_codes = self.codes[level]
-
- def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
- # Compute a bool indexer to identify the positions to take.
- # If we have an existing indexer, we only need to examine the
- # subset of positions where the existing indexer is True.
- if indexer is not None:
- # we only need to look at the subset of codes where the
- # existing indexer equals True
- codes = codes[indexer]
-
- if step is None or step == 1:
- new_indexer = (codes >= start) & (codes < stop)
- else:
- r = np.arange(start, stop, step, dtype=codes.dtype)
- new_indexer = algos.isin(codes, r)
-
- if indexer is None:
- return new_indexer
-
- indexer = indexer.copy()
- indexer[indexer] = new_indexer
- return indexer
-
- if isinstance(key, slice):
- # handle a slice, returning a slice if we can
- # otherwise a boolean indexer
- step = key.step
- is_negative_step = step is not None and step < 0
-
- try:
- if key.start is not None:
- start = level_index.get_loc(key.start)
- elif is_negative_step:
- start = len(level_index) - 1
- else:
- start = 0
-
- if key.stop is not None:
- stop = level_index.get_loc(key.stop)
- elif is_negative_step:
- stop = 0
- elif isinstance(start, slice):
- stop = len(level_index)
- else:
- stop = len(level_index) - 1
- except KeyError:
- # we have a partial slice (like looking up a partial date
- # string)
- start = stop = level_index.slice_indexer(key.start, key.stop, key.step)
- step = start.step
-
- if isinstance(start, slice) or isinstance(stop, slice):
- # we have a slice for start and/or stop
- # a partial date slicer on a DatetimeIndex generates a slice
- # note that the stop ALREADY includes the stopped point (if
- # it was a string sliced)
- start = getattr(start, "start", start)
- stop = getattr(stop, "stop", stop)
- return convert_indexer(start, stop, step)
-
- elif level > 0 or self._lexsort_depth == 0 or step is not None:
- # we need the same right-searching semantics here as
- # when we are using a slice,
- # so adjust the stop by 1 (so we include stop)
- stop = (stop - 1) if is_negative_step else (stop + 1)
- return convert_indexer(start, stop, step)
- else:
- # sorted, so can return slice object -> view
- i = algos.searchsorted(level_codes, start, side="left")
- j = algos.searchsorted(level_codes, stop, side="right")
- return slice(i, j, step)
-
- else:
- idx = self._get_loc_single_level_index(level_index, key)
-
- if level > 0 or self._lexsort_depth == 0:
- # Desired level is not sorted
- if isinstance(idx, slice):
- # test_get_loc_partial_timestamp_multiindex
- locs = (level_codes >= idx.start) & (level_codes < idx.stop)
- return locs
-
- locs = np.array(level_codes == idx, dtype=bool, copy=False)
-
- if not locs.any():
- # The label is present in self.levels[level] but unused:
- raise KeyError(key)
- return locs
-
- if isinstance(idx, slice):
- # e.g. test_partial_string_timestamp_multiindex
- start = algos.searchsorted(level_codes, idx.start, side="left")
- # NB: "left" here bc of slice semantics
- end = algos.searchsorted(level_codes, idx.stop, side="left")
- else:
- start = algos.searchsorted(level_codes, idx, side="left")
- end = algos.searchsorted(level_codes, idx, side="right")
-
- if start == end:
- # The label is present in self.levels[level] but unused:
- raise KeyError(key)
- return slice(start, end)
-
- def get_locs(self, seq):
- """
- Get location for a sequence of labels.
-
- Parameters
- ----------
- seq : label, slice, list, mask or a sequence of such
- You should use one of the above for each level.
- If a level should not be used, set it to ``slice(None)``.
-
- Returns
- -------
- numpy.ndarray
- NumPy array of integers suitable for passing to iloc.
-
- See Also
- --------
- MultiIndex.get_loc : Get location for a label or a tuple of labels.
- MultiIndex.slice_locs : Get slice location given start label(s) and
- end label(s).
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
-
- >>> mi.get_locs('b') # doctest: +SKIP
- array([1, 2], dtype=int64)
-
- >>> mi.get_locs([slice(None), ['e', 'f']]) # doctest: +SKIP
- array([1, 2], dtype=int64)
-
- >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP
- array([2], dtype=int64)
- """
-
- # must be lexsorted to at least as many levels
- true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s]
- if true_slices and true_slices[-1] >= self._lexsort_depth:
- raise UnsortedIndexError(
- "MultiIndex slicing requires the index to be lexsorted: slicing "
- f"on levels {true_slices}, lexsort depth {self._lexsort_depth}"
- )
-
- if any(x is Ellipsis for x in seq):
- raise NotImplementedError(
- "MultiIndex does not support indexing with Ellipsis"
- )
-
- n = len(self)
-
- def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]:
- if isinstance(indexer, slice):
- new_indexer = np.zeros(n, dtype=np.bool_)
- new_indexer[indexer] = True
- return new_indexer
- return indexer
-
- # a bool indexer for the positions we want to take
- indexer: npt.NDArray[np.bool_] | None = None
-
- for i, k in enumerate(seq):
- lvl_indexer: npt.NDArray[np.bool_] | slice | None = None
-
- if com.is_bool_indexer(k):
- if len(k) != n:
- raise ValueError(
- "cannot index with a boolean indexer that "
- "is not the same length as the index"
- )
- lvl_indexer = np.asarray(k)
-
- elif is_list_like(k):
- # a collection of labels to include from this level (these are or'd)
-
- # GH#27591 check if this is a single tuple key in the level
- try:
- lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer)
- except (InvalidIndexError, TypeError, KeyError) as err:
- # InvalidIndexError e.g. non-hashable, fall back to treating
- # this as a sequence of labels
- # KeyError it can be ambiguous if this is a label or sequence
- # of labels
- # github.com/pandas-dev/pandas/issues/39424#issuecomment-871626708
- for x in k:
- if not is_hashable(x):
- # e.g. slice
- raise err
- # GH 39424: Ignore not founds
- # GH 42351: No longer ignore not founds & enforced in 2.0
- # TODO: how to handle IntervalIndex level? (no test cases)
- item_indexer = self._get_level_indexer(
- x, level=i, indexer=indexer
- )
- if lvl_indexer is None:
- lvl_indexer = _to_bool_indexer(item_indexer)
- elif isinstance(item_indexer, slice):
- lvl_indexer[item_indexer] = True # type: ignore[index]
- else:
- lvl_indexer |= item_indexer
-
- if lvl_indexer is None:
- # no matches, we are done
- # test_loc_getitem_duplicates_multiindex_empty_indexer
- return np.array([], dtype=np.intp)
-
- elif com.is_null_slice(k):
- # empty slice
- if indexer is None and i == len(seq) - 1:
- return np.arange(n, dtype=np.intp)
- continue
-
- else:
- # a slice or a single label
- lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer)
-
- # update indexer
- lvl_indexer = _to_bool_indexer(lvl_indexer)
- if indexer is None:
- indexer = lvl_indexer
- else:
- indexer &= lvl_indexer
- if not np.any(indexer) and np.any(lvl_indexer):
- raise KeyError(seq)
-
- # empty indexer
- if indexer is None:
- return np.array([], dtype=np.intp)
-
- pos_indexer = indexer.nonzero()[0]
- return self._reorder_indexer(seq, pos_indexer)
-
- # --------------------------------------------------------------------
-
- def _reorder_indexer(
- self,
- seq: tuple[Scalar | Iterable | AnyArrayLike, ...],
- indexer: npt.NDArray[np.intp],
- ) -> npt.NDArray[np.intp]:
- """
- Reorder an indexer of a MultiIndex (self) so that the labels are in the
- same order as given in seq
-
- Parameters
- ----------
- seq : label/slice/list/mask or a sequence of such
- indexer : a position indexer of self
-
- Returns
- -------
- indexer : a sorted position indexer of self ordered as seq
- """
-
- # check if sorting is necessary
- need_sort = False
- for i, k in enumerate(seq):
- if com.is_null_slice(k) or com.is_bool_indexer(k) or is_scalar(k):
- pass
- elif is_list_like(k):
- if len(k) <= 1: # type: ignore[arg-type]
- pass
- elif self._is_lexsorted():
- # If the index is lexsorted and the list_like label
- # in seq are sorted then we do not need to sort
- k_codes = self.levels[i].get_indexer(k)
- k_codes = k_codes[k_codes >= 0] # Filter absent keys
- # True if the given codes are not ordered
- need_sort = (k_codes[:-1] > k_codes[1:]).any()
- else:
- need_sort = True
- elif isinstance(k, slice):
- if self._is_lexsorted():
- need_sort = k.step is not None and k.step < 0
- else:
- need_sort = True
- else:
- need_sort = True
- if need_sort:
- break
- if not need_sort:
- return indexer
-
- n = len(self)
- keys: tuple[np.ndarray, ...] = ()
- # For each level of the sequence in seq, map the level codes to the
- # order in which they appear in the list-like sequence.
- # This mapping is then used to reorder the indexer.
- for i, k in enumerate(seq):
- if is_scalar(k):
- # GH#34603 we want to treat a scalar the same as an all equal list
- k = [k]
- if com.is_bool_indexer(k):
- new_order = np.arange(n)[indexer]
- elif is_list_like(k):
- # Generate a map with all level codes as sorted initially
- k = algos.unique(k)
- key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
- self.levels[i]
- )
- # Set order as given in the indexer list
- level_indexer = self.levels[i].get_indexer(k)
- level_indexer = level_indexer[level_indexer >= 0] # Filter absent keys
- key_order_map[level_indexer] = np.arange(len(level_indexer))
-
- new_order = key_order_map[self.codes[i][indexer]]
- elif isinstance(k, slice) and k.step is not None and k.step < 0:
- # flip order for negative step
- new_order = np.arange(n)[::-1][indexer]
- elif isinstance(k, slice) and k.start is None and k.stop is None:
- # slice(None) should not determine order GH#31330
- new_order = np.ones((n,), dtype=np.intp)[indexer]
- else:
- # For all other cases, use the same order as the level
- new_order = np.arange(n)[indexer]
- keys = (new_order,) + keys
-
- # Find the reordering using lexsort on the keys mapping
- ind = np.lexsort(keys)
- return indexer[ind]
-
- def truncate(self, before=None, after=None) -> MultiIndex:
- """
- Slice index between two labels / tuples, return new MultiIndex.
-
- Parameters
- ----------
- before : label or tuple, can be partial. Default None
- None defaults to start.
- after : label or tuple, can be partial. Default None
- None defaults to end.
-
- Returns
- -------
- MultiIndex
- The truncated MultiIndex.
-
- Examples
- --------
- >>> mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z']])
- >>> mi
- MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z')],
- )
- >>> mi.truncate(before='a', after='b')
- MultiIndex([('a', 'x'), ('b', 'y')],
- )
- """
- if after and before and after < before:
- raise ValueError("after < before")
-
- i, j = self.levels[0].slice_locs(before, after)
- left, right = self.slice_locs(before, after)
-
- new_levels = list(self.levels)
- new_levels[0] = new_levels[0][i:j]
-
- new_codes = [level_codes[left:right] for level_codes in self.codes]
- new_codes[0] = new_codes[0] - i
-
- return MultiIndex(
- levels=new_levels,
- codes=new_codes,
- names=self._names,
- verify_integrity=False,
- )
-
- def equals(self, other: object) -> bool:
- """
- Determines if two MultiIndex objects have the same labeling information
- (the levels themselves do not necessarily have to be the same)
-
- See Also
- --------
- equal_levels
- """
- if self.is_(other):
- return True
-
- if not isinstance(other, Index):
- return False
-
- if len(self) != len(other):
- return False
-
- if not isinstance(other, MultiIndex):
- # d-level MultiIndex can equal d-tuple Index
- if not self._should_compare(other):
- # object Index or Categorical[object] may contain tuples
- return False
- return array_equivalent(self._values, other._values)
-
- if self.nlevels != other.nlevels:
- return False
-
- for i in range(self.nlevels):
- self_codes = self.codes[i]
- other_codes = other.codes[i]
- self_mask = self_codes == -1
- other_mask = other_codes == -1
- if not np.array_equal(self_mask, other_mask):
- return False
- self_codes = self_codes[~self_mask]
- self_values = self.levels[i]._values.take(self_codes)
-
- other_codes = other_codes[~other_mask]
- other_values = other.levels[i]._values.take(other_codes)
-
- # since we use NaT for both datetime64 and timedelta64, we can have a
- # situation where a level is typed, say, timedelta64 in self (IOW it
- # has values other than NaT) but typed datetime64 in other (where
- # it is all NaT), yet these are equivalent
- if len(self_values) == 0 and len(other_values) == 0:
- continue
-
- if not isinstance(self_values, np.ndarray):
- # i.e. ExtensionArray
- if not self_values.equals(other_values):
- return False
- elif not isinstance(other_values, np.ndarray):
- # i.e. other is ExtensionArray
- if not other_values.equals(self_values):
- return False
- else:
- if not array_equivalent(self_values, other_values):
- return False
-
- return True
-
- def equal_levels(self, other: MultiIndex) -> bool:
- """
- Return True if the levels of both MultiIndex objects are the same
-
- """
- if self.nlevels != other.nlevels:
- return False
-
- for i in range(self.nlevels):
- if not self.levels[i].equals(other.levels[i]):
- return False
- return True
-
- # --------------------------------------------------------------------
- # Set Methods
-
- def _union(self, other, sort) -> MultiIndex:
- other, result_names = self._convert_can_do_setop(other)
- if other.has_duplicates:
- # This is only necessary if other has dupes,
- # otherwise difference is faster
- result = super()._union(other, sort)
-
- if isinstance(result, MultiIndex):
- return result
- return MultiIndex.from_arrays(
- zip(*result), sortorder=None, names=result_names
- )
-
- else:
- right_missing = other.difference(self, sort=False)
- if len(right_missing):
- result = self.append(right_missing)
- else:
- result = self._get_reconciled_name_object(other)
-
- if sort is not False:
- try:
- result = result.sort_values()
- except TypeError:
- if sort is True:
- raise
- warnings.warn(
- "The values in the array are unorderable. "
- "Pass `sort=False` to suppress this warning.",
- RuntimeWarning,
- stacklevel=find_stack_level(),
- )
- return result
-
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- return is_object_dtype(dtype)
-
- def _get_reconciled_name_object(self, other) -> MultiIndex:
- """
- If the result of a set operation will be self,
- return self, unless the names change, in which
- case make a shallow copy of self.
- """
- names = self._maybe_match_names(other)
- if self.names != names:
- # error: Cannot determine type of "rename"
- return self.rename(names) # type: ignore[has-type]
- return self
-
- def _maybe_match_names(self, other):
- """
- Try to find common names to attach to the result of an operation between
- self and other. Return a consensus list of names if they match at least
- partly, or a list of None if the names are completely different.
- """
- if len(self.names) != len(other.names):
- return [None] * len(self.names)
- names = []
- for a_name, b_name in zip(self.names, other.names):
- if a_name == b_name:
- names.append(a_name)
- else:
- # TODO: what if they both have np.nan for their names?
- names.append(None)
- return names
-
- def _wrap_intersection_result(self, other, result) -> MultiIndex:
- _, result_names = self._convert_can_do_setop(other)
- return result.set_names(result_names)
-
- def _wrap_difference_result(self, other, result: MultiIndex) -> MultiIndex:
- _, result_names = self._convert_can_do_setop(other)
-
- if len(result) == 0:
- return result.remove_unused_levels().set_names(result_names)
- else:
- return result.set_names(result_names)
-
- def _convert_can_do_setop(self, other):
- result_names = self.names
-
- if not isinstance(other, Index):
- if len(other) == 0:
- return self[:0], self.names
- else:
- msg = "other must be a MultiIndex or a list of tuples"
- try:
- other = MultiIndex.from_tuples(other, names=self.names)
- except (ValueError, TypeError) as err:
- # ValueError raised by tuples_to_object_array if we
- # have non-object dtype
- raise TypeError(msg) from err
- else:
- result_names = get_unanimous_names(self, other)
-
- return other, result_names
-
- # --------------------------------------------------------------------
-
- @doc(Index.astype)
- def astype(self, dtype, copy: bool = True):
- dtype = pandas_dtype(dtype)
- if is_categorical_dtype(dtype):
- msg = "> 1 ndim Categorical are not supported at this time"
- raise NotImplementedError(msg)
- if not is_object_dtype(dtype):
- raise TypeError(
- "Setting a MultiIndex dtype to anything other than object "
- "is not supported"
- )
- if copy is True:
- return self._view()
- return self
-
- def _validate_fill_value(self, item):
- if isinstance(item, MultiIndex):
- # GH#43212
- if item.nlevels != self.nlevels:
- raise ValueError("Item must have length equal to number of levels.")
- return item._values
- elif not isinstance(item, tuple):
- # Pad the key with empty strings if lower levels of the key
- # aren't specified:
- item = (item,) + ("",) * (self.nlevels - 1)
- elif len(item) != self.nlevels:
- raise ValueError("Item must have length equal to number of levels.")
- return item
-
- def putmask(self, mask, value: MultiIndex) -> MultiIndex:
- """
- Return a new MultiIndex of the values set with the mask.
-
- Parameters
- ----------
- mask : array like
- value : MultiIndex
- Must either be the same length as self or length one
-
- Returns
- -------
- MultiIndex
- """
- mask, noop = validate_putmask(self, mask)
- if noop:
- return self.copy()
-
- if len(mask) == len(value):
- subset = value[mask].remove_unused_levels()
- else:
- subset = value.remove_unused_levels()
-
- new_levels = []
- new_codes = []
-
- for i, (value_level, level, level_codes) in enumerate(
- zip(subset.levels, self.levels, self.codes)
- ):
- new_level = level.union(value_level, sort=False)
- value_codes = new_level.get_indexer_for(subset.get_level_values(i))
- new_code = ensure_int64(level_codes)
- new_code[mask] = value_codes
- new_levels.append(new_level)
- new_codes.append(new_code)
-
- return MultiIndex(
- levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False
- )
-
- def insert(self, loc: int, item) -> MultiIndex:
- """
- Make new MultiIndex inserting new item at location
-
- Parameters
- ----------
- loc : int
- item : tuple
- Must be same length as number of levels in the MultiIndex
-
- Returns
- -------
- new_index : Index
- """
- item = self._validate_fill_value(item)
-
- new_levels = []
- new_codes = []
- for k, level, level_codes in zip(item, self.levels, self.codes):
- if k not in level:
- # have to insert into level
- # must insert at end otherwise you have to recompute all the
- # other codes
- lev_loc = len(level)
- level = level.insert(lev_loc, k)
- else:
- lev_loc = level.get_loc(k)
-
- new_levels.append(level)
- new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc))
-
- return MultiIndex(
- levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False
- )
-
- def delete(self, loc) -> MultiIndex:
- """
- Make new index with passed location deleted
-
- Returns
- -------
- new_index : MultiIndex
- """
- new_codes = [np.delete(level_codes, loc) for level_codes in self.codes]
- return MultiIndex(
- levels=self.levels,
- codes=new_codes,
- names=self.names,
- verify_integrity=False,
- )
-
- @doc(Index.isin)
- def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
- if isinstance(values, Generator):
- values = list(values)
-
- if level is None:
- if len(values) == 0:
- return np.zeros((len(self),), dtype=np.bool_)
- if not isinstance(values, MultiIndex):
- values = MultiIndex.from_tuples(values)
- return values.unique().get_indexer_for(self) != -1
- else:
- num = self._get_level_number(level)
- levs = self.get_level_values(num)
-
- if levs.size == 0:
- return np.zeros(len(levs), dtype=np.bool_)
- return levs.isin(values)
-
- # error: Incompatible types in assignment (expression has type overloaded function,
- # base class "Index" defined the type as "Callable[[Index, Any, bool], Any]")
- rename = Index.set_names # type: ignore[assignment]
-
- # ---------------------------------------------------------------
- # Arithmetic/Numeric Methods - Disabled
-
- __add__ = make_invalid_op("__add__")
- __radd__ = make_invalid_op("__radd__")
- __iadd__ = make_invalid_op("__iadd__")
- __sub__ = make_invalid_op("__sub__")
- __rsub__ = make_invalid_op("__rsub__")
- __isub__ = make_invalid_op("__isub__")
- __pow__ = make_invalid_op("__pow__")
- __rpow__ = make_invalid_op("__rpow__")
- __mul__ = make_invalid_op("__mul__")
- __rmul__ = make_invalid_op("__rmul__")
- __floordiv__ = make_invalid_op("__floordiv__")
- __rfloordiv__ = make_invalid_op("__rfloordiv__")
- __truediv__ = make_invalid_op("__truediv__")
- __rtruediv__ = make_invalid_op("__rtruediv__")
- __mod__ = make_invalid_op("__mod__")
- __rmod__ = make_invalid_op("__rmod__")
- __divmod__ = make_invalid_op("__divmod__")
- __rdivmod__ = make_invalid_op("__rdivmod__")
- # Unary methods disabled
- __neg__ = make_invalid_op("__neg__")
- __pos__ = make_invalid_op("__pos__")
- __abs__ = make_invalid_op("__abs__")
- __invert__ = make_invalid_op("__invert__")
-
-
-def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int:
- """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted."""
- int64_codes = [ensure_int64(level_codes) for level_codes in codes]
- for k in range(nlevels, 0, -1):
- if libalgos.is_lexsorted(int64_codes[:k]):
- return k
- return 0
-
-
-def sparsify_labels(label_list, start: int = 0, sentinel: object = ""):
- pivoted = list(zip(*label_list))
- k = len(label_list)
-
- result = pivoted[: start + 1]
- prev = pivoted[start]
-
- for cur in pivoted[start + 1 :]:
- sparse_cur = []
-
- for i, (p, t) in enumerate(zip(prev, cur)):
- if i == k - 1:
- sparse_cur.append(t)
- result.append(sparse_cur)
- break
-
- if p == t:
- sparse_cur.append(sentinel)
- else:
- sparse_cur.extend(cur[i:])
- result.append(sparse_cur)
- break
-
- prev = cur
-
- return list(zip(*result))
-
-
-def _get_na_rep(dtype) -> str:
- if is_extension_array_dtype(dtype):
- return f"{dtype.na_value}"
- else:
- dtype = dtype.type
-
- return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN")
-
-
-def maybe_droplevels(index: Index, key) -> Index:
- """
- Attempt to drop level or levels from the given index.
-
- Parameters
- ----------
- index: Index
- key : scalar or tuple
-
- Returns
- -------
- Index
- """
- # drop levels
- original_index = index
- if isinstance(key, tuple):
- # Caller is responsible for ensuring the key is not an entry in the first
- # level of the MultiIndex.
- for _ in key:
- try:
- index = index._drop_level_numbers([0])
- except ValueError:
- # we have dropped too much, so back out
- return original_index
- else:
- try:
- index = index._drop_level_numbers([0])
- except ValueError:
- pass
-
- return index
-
-
-def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray:
- """
- Coerce the array-like indexer to the smallest integer dtype that can encode all
- of the given categories.
-
- Parameters
- ----------
- array_like : array-like
- categories : array-like
- copy : bool
-
- Returns
- -------
- np.ndarray
- Non-writeable.
- """
- array_like = coerce_indexer_dtype(array_like, categories)
- if copy:
- array_like = array_like.copy()
- array_like.flags.writeable = False
- return array_like
-
-
-def _require_listlike(level, arr, arrname: str):
- """
- Ensure that level is either None or listlike, and arr is list-of-listlike.
- """
- if level is not None and not is_list_like(level):
- if not is_list_like(arr):
- raise TypeError(f"{arrname} must be list-like")
- if len(arr) > 0 and is_list_like(arr[0]):
- raise TypeError(f"{arrname} must be list-like")
- level = [level]
- arr = [arr]
- elif level is None or is_list_like(level):
- if not is_list_like(arr) or not is_list_like(arr[0]):
- raise TypeError(f"{arrname} must be list of lists-like")
- return level, arr
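For reference, the MultiIndex lookup methods deleted above mostly document their behaviour through doctests. A minimal sketch of the main entry points, assuming a stock pandas install (the index values below are invented for illustration and are not part of this patch):

import pandas as pd

mi = pd.MultiIndex.from_arrays([list("abb"), list("def")], names=["A", "B"])

mi.get_loc("b")                          # slice(1, 3, None): contiguous block for level 0
mi.get_loc(("b", "e"))                   # 1: full key on a unique index gives an integer position
mi.get_locs([slice(None), ["e", "f"]])   # array([1, 2]): per-level label/slice/mask selection
mi.slice_locs(start="b")                 # (1, 3): bounds computed via get_slice_bound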
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/period.py b/contrib/python/pandas/py3/pandas/core/indexes/period.py
deleted file mode 100644
index eb898786e24..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/period.py
+++ /dev/null
@@ -1,547 +0,0 @@
-from __future__ import annotations
-
-from datetime import (
- datetime,
- timedelta,
-)
-from typing import Hashable
-
-import numpy as np
-
-from pandas._libs import index as libindex
-from pandas._libs.tslibs import (
- BaseOffset,
- NaT,
- Period,
- Resolution,
- Tick,
-)
-from pandas._typing import (
- Dtype,
- DtypeObj,
- npt,
-)
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.common import is_integer
-from pandas.core.dtypes.dtypes import PeriodDtype
-from pandas.core.dtypes.generic import ABCSeries
-from pandas.core.dtypes.missing import is_valid_na_for_dtype
-
-from pandas.core.arrays.period import (
- PeriodArray,
- period_array,
- raise_on_incompatible,
- validate_dtype_freq,
-)
-import pandas.core.common as com
-import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import maybe_extract_name
-from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
-from pandas.core.indexes.datetimes import (
- DatetimeIndex,
- Index,
-)
-from pandas.core.indexes.extension import inherit_names
-
-_index_doc_kwargs = dict(ibase._index_doc_kwargs)
-_index_doc_kwargs.update({"target_klass": "PeriodIndex or list of Periods"})
-_shared_doc_kwargs = {
- "klass": "PeriodArray",
-}
-
-# --- Period index sketch
-
-
-def _new_PeriodIndex(cls, **d):
- # GH13277 for unpickling
- values = d.pop("data")
- if values.dtype == "int64":
- freq = d.pop("freq", None)
- values = PeriodArray(values, freq=freq)
- return cls._simple_new(values, **d)
- else:
- return cls(values, **d)
-
-
-@inherit_names(
- ["strftime", "start_time", "end_time"] + PeriodArray._field_ops,
- PeriodArray,
- wrap=True,
-)
-@inherit_names(["is_leap_year", "_format_native_types"], PeriodArray)
-class PeriodIndex(DatetimeIndexOpsMixin):
- """
- Immutable ndarray holding ordinal values indicating regular periods in time.
-
- Index keys are boxed to Period objects, which carry the metadata (e.g.,
- frequency information).
-
- Parameters
- ----------
- data : array-like (1d int np.ndarray or PeriodArray), optional
- Optional period-like data to construct index with.
- copy : bool
- Make a copy of input ndarray.
- freq : str or period object, optional
- One of pandas period strings or corresponding objects.
- year : int, array, or Series, default None
- month : int, array, or Series, default None
- quarter : int, array, or Series, default None
- day : int, array, or Series, default None
- hour : int, array, or Series, default None
- minute : int, array, or Series, default None
- second : int, array, or Series, default None
- dtype : str or PeriodDtype, default None
-
- Attributes
- ----------
- day
- dayofweek
- day_of_week
- dayofyear
- day_of_year
- days_in_month
- daysinmonth
- end_time
- freq
- freqstr
- hour
- is_leap_year
- minute
- month
- quarter
- qyear
- second
- start_time
- week
- weekday
- weekofyear
- year
-
- Methods
- -------
- asfreq
- strftime
- to_timestamp
-
- See Also
- --------
- Index : The base pandas Index type.
- Period : Represents a period of time.
- DatetimeIndex : Index with datetime64 data.
- TimedeltaIndex : Index of timedelta64 data.
- period_range : Create a fixed-frequency PeriodIndex.
-
- Examples
- --------
- >>> idx = pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3])
- >>> idx
- PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]')
- """
-
- _typ = "periodindex"
-
- _data: PeriodArray
- freq: BaseOffset
- dtype: PeriodDtype
-
- _data_cls = PeriodArray
- _supports_partial_string_indexing = True
-
- @property
- def _engine_type(self) -> type[libindex.PeriodEngine]:
- return libindex.PeriodEngine
-
- @cache_readonly
- def _resolution_obj(self) -> Resolution:
- # for compat with DatetimeIndex
- return self.dtype._resolution_obj
-
- # --------------------------------------------------------------------
- # methods that dispatch to array and wrap result in Index
- # These are defined here instead of via inherit_names for mypy
-
- @doc(
- PeriodArray.asfreq,
- other="pandas.arrays.PeriodArray",
- other_name="PeriodArray",
- **_shared_doc_kwargs,
- )
- def asfreq(self, freq=None, how: str = "E") -> PeriodIndex:
- arr = self._data.asfreq(freq, how)
- return type(self)._simple_new(arr, name=self.name)
-
- @doc(PeriodArray.to_timestamp)
- def to_timestamp(self, freq=None, how: str = "start") -> DatetimeIndex:
- arr = self._data.to_timestamp(freq, how)
- return DatetimeIndex._simple_new(arr, name=self.name)
-
- @property
- @doc(PeriodArray.hour.fget)
- def hour(self) -> Index:
- return Index(self._data.hour, name=self.name)
-
- @property
- @doc(PeriodArray.minute.fget)
- def minute(self) -> Index:
- return Index(self._data.minute, name=self.name)
-
- @property
- @doc(PeriodArray.second.fget)
- def second(self) -> Index:
- return Index(self._data.second, name=self.name)
-
- # ------------------------------------------------------------------------
- # Index Constructors
-
- def __new__(
- cls,
- data=None,
- ordinal=None,
- freq=None,
- dtype: Dtype | None = None,
- copy: bool = False,
- name: Hashable = None,
- **fields,
- ) -> PeriodIndex:
- valid_field_set = {
- "year",
- "month",
- "day",
- "quarter",
- "hour",
- "minute",
- "second",
- }
-
- refs = None
- if not copy and isinstance(data, (Index, ABCSeries)):
- refs = data._references
-
- if not set(fields).issubset(valid_field_set):
- argument = list(set(fields) - valid_field_set)[0]
- raise TypeError(f"__new__() got an unexpected keyword argument {argument}")
-
- name = maybe_extract_name(name, data, cls)
-
- if data is None and ordinal is None:
- # range-based.
- if not fields:
- # test_pickle_compat_construction
- cls._raise_scalar_data_error(None)
-
- data, freq2 = PeriodArray._generate_range(None, None, None, freq, fields)
- # PeriodArray._generate_range does validation that fields is
- # empty when really using the range-based constructor.
- freq = freq2
-
- data = PeriodArray(data, freq=freq)
- else:
- freq = validate_dtype_freq(dtype, freq)
-
- # PeriodIndex allow PeriodIndex(period_index, freq=different)
- # Let's not encourage that kind of behavior in PeriodArray.
-
- if freq and isinstance(data, cls) and data.freq != freq:
- # TODO: We can do some of these with no-copy / coercion?
- # e.g. D -> 2D seems to be OK
- data = data.asfreq(freq)
-
- if data is None and ordinal is not None:
- # we strangely ignore `ordinal` if data is passed.
- ordinal = np.asarray(ordinal, dtype=np.int64)
- data = PeriodArray(ordinal, freq=freq)
- else:
- # don't pass copy here, since we copy later.
- data = period_array(data=data, freq=freq)
-
- if copy:
- data = data.copy()
-
- return cls._simple_new(data, name=name, refs=refs)
-
- # ------------------------------------------------------------------------
- # Data
-
- @property
- def values(self) -> np.ndarray:
- return np.asarray(self, dtype=object)
-
- def _maybe_convert_timedelta(self, other) -> int | npt.NDArray[np.int64]:
- """
- Convert timedelta-like input to an integer multiple of self.freq
-
- Parameters
- ----------
- other : timedelta, np.timedelta64, DateOffset, int, np.ndarray
-
- Returns
- -------
- converted : int, np.ndarray[int64]
-
- Raises
- ------
- IncompatibleFrequency : if the input cannot be written as a multiple
- of self.freq. Note IncompatibleFrequency subclasses ValueError.
- """
- if isinstance(other, (timedelta, np.timedelta64, Tick, np.ndarray)):
- if isinstance(self.freq, Tick):
- # _check_timedeltalike_freq_compat will raise if incompatible
- delta = self._data._check_timedeltalike_freq_compat(other)
- return delta
- elif isinstance(other, BaseOffset):
- if other.base == self.freq.base:
- return other.n
-
- raise raise_on_incompatible(self, other)
- elif is_integer(other):
- # integer is passed to .shift via
- # _add_datetimelike_methods basically
- # but ufunc may pass integer to _add_delta
- return other
-
- # raise when input doesn't have freq
- raise raise_on_incompatible(self, None)
-
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- """
- Can we compare values of the given dtype to our own?
- """
- if not isinstance(dtype, PeriodDtype):
- return False
- # For the subset of DateOffsets that can be a dtype.freq, it
- # suffices (and is much faster) to compare the dtype_code rather than
- # the freq itself.
- # See also: PeriodDtype.__eq__
- freq = dtype.freq
- own_freq = self.freq
- return (
- freq._period_dtype_code
- # error: "BaseOffset" has no attribute "_period_dtype_code"
- == own_freq._period_dtype_code # type: ignore[attr-defined]
- and freq.n == own_freq.n
- )
-
- # ------------------------------------------------------------------------
- # Index Methods
-
- def asof_locs(self, where: Index, mask: npt.NDArray[np.bool_]) -> np.ndarray:
- """
- where : array of timestamps
- mask : np.ndarray[bool]
- Array of booleans where data is not NA.
- """
- if isinstance(where, DatetimeIndex):
- where = PeriodIndex(where._values, freq=self.freq)
- elif not isinstance(where, PeriodIndex):
- raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex")
-
- return super().asof_locs(where, mask)
-
- @property
- def is_full(self) -> bool:
- """
- Returns True if this PeriodIndex is range-like in that all Periods
- between start and end are present, in order.
- """
- if len(self) == 0:
- return True
- if not self.is_monotonic_increasing:
- raise ValueError("Index is not monotonic")
- values = self.asi8
- return bool(((values[1:] - values[:-1]) < 2).all())
-
- @property
- def inferred_type(self) -> str:
- # because data is represented as ints, make sure we can't have ambiguous
- # indexing
- return "period"
-
- # ------------------------------------------------------------------------
- # Indexing Methods
-
- def _convert_tolerance(self, tolerance, target):
- # Returned tolerance must be in dtype/units so that
- # `|self._get_engine_target() - target._engine_target()| <= tolerance`
- # is meaningful. Since PeriodIndex returns int64 for engine_target,
- # we may need to convert timedelta64 tolerance to int64.
- tolerance = super()._convert_tolerance(tolerance, target)
-
- if self.dtype == target.dtype:
- # convert tolerance to i8
- tolerance = self._maybe_convert_timedelta(tolerance)
-
- return tolerance
-
- def get_loc(self, key):
- """
- Get integer location for requested label.
-
- Parameters
- ----------
- key : Period, NaT, str, or datetime
- String or datetime key must be parsable as Period.
-
- Returns
- -------
- loc : int or ndarray[int64]
-
- Raises
- ------
- KeyError
- Key is not present in the index.
- TypeError
- If key is listlike or otherwise not hashable.
- """
- orig_key = key
-
- self._check_indexing_error(key)
-
- if is_valid_na_for_dtype(key, self.dtype):
- key = NaT
-
- elif isinstance(key, str):
- try:
- parsed, reso = self._parse_with_reso(key)
- except ValueError as err:
- # A string with invalid format
- raise KeyError(f"Cannot interpret '{key}' as period") from err
-
- if self._can_partial_date_slice(reso):
- try:
- return self._partial_date_slice(reso, parsed)
- except KeyError as err:
- raise KeyError(key) from err
-
- if reso == self._resolution_obj:
- # the reso < self._resolution_obj case goes
- # through _get_string_slice
- key = self._cast_partial_indexing_scalar(parsed)
- else:
- raise KeyError(key)
-
- elif isinstance(key, Period):
- self._disallow_mismatched_indexing(key)
-
- elif isinstance(key, datetime):
- key = self._cast_partial_indexing_scalar(key)
-
- else:
- # in particular integer, which Period constructor would cast to string
- raise KeyError(key)
-
- try:
- return Index.get_loc(self, key)
- except KeyError as err:
- raise KeyError(orig_key) from err
-
- def _disallow_mismatched_indexing(self, key: Period) -> None:
- sfreq = self.freq
- kfreq = key.freq
- if not (
- sfreq.n == kfreq.n
- # error: "BaseOffset" has no attribute "_period_dtype_code"
- and sfreq._period_dtype_code # type: ignore[attr-defined]
- # error: "BaseOffset" has no attribute "_period_dtype_code"
- == kfreq._period_dtype_code # type: ignore[attr-defined]
- ):
- # GH#42247 For the subset of DateOffsets that can be Period freqs,
- # checking these two attributes is sufficient to check equality,
- # and much more performant than `self.freq == key.freq`
- raise KeyError(key)
-
- def _cast_partial_indexing_scalar(self, label: datetime) -> Period:
- try:
- period = Period(label, freq=self.freq)
- except ValueError as err:
- # we cannot construct the Period
- raise KeyError(label) from err
- return period
-
- @doc(DatetimeIndexOpsMixin._maybe_cast_slice_bound)
- def _maybe_cast_slice_bound(self, label, side: str):
- if isinstance(label, datetime):
- label = self._cast_partial_indexing_scalar(label)
-
- return super()._maybe_cast_slice_bound(label, side)
-
- def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime):
- iv = Period(parsed, freq=reso.attr_abbrev)
- return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end"))
-
- @doc(DatetimeIndexOpsMixin.shift)
- def shift(self, periods: int = 1, freq=None):
- if freq is not None:
- raise TypeError(
- f"`freq` argument is not supported for {type(self).__name__}.shift"
- )
- return self + periods
-
-
-def period_range(
- start=None, end=None, periods: int | None = None, freq=None, name=None
-) -> PeriodIndex:
- """
- Return a fixed frequency PeriodIndex.
-
- The day (calendar) is the default frequency.
-
- Parameters
- ----------
- start : str or period-like, default None
- Left bound for generating periods.
- end : str or period-like, default None
- Right bound for generating periods.
- periods : int, default None
- Number of periods to generate.
- freq : str or DateOffset, optional
- Frequency alias. By default the freq is taken from `start` or `end`
- if those are Period objects. Otherwise, the default is ``"D"`` for
- daily frequency.
- name : str, default None
- Name of the resulting PeriodIndex.
-
- Returns
- -------
- PeriodIndex
-
- Notes
- -----
- Of the three parameters: ``start``, ``end``, and ``periods``, exactly two
- must be specified.
-
- To learn more about the frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- Examples
- --------
- >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')
- PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
- '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12',
- '2018-01'],
- dtype='period[M]')
-
- If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor
- endpoints for a ``PeriodIndex`` with frequency matching that of the
- ``period_range`` constructor.
-
- >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'),
- ... end=pd.Period('2017Q2', freq='Q'), freq='M')
- PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'],
- dtype='period[M]')
- """
- if com.count_not_none(start, end, periods) != 2:
- raise ValueError(
- "Of the three parameters: start, end, and periods, "
- "exactly two must be specified"
- )
- if freq is None and (not isinstance(start, Period) and not isinstance(end, Period)):
- freq = "D"
-
- data, freq = PeriodArray._generate_range(start, end, periods, freq, fields={})
- data = PeriodArray(data, freq=freq)
- return PeriodIndex(data, name=name)
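Likewise, a brief sketch of the PeriodIndex surface removed above, assuming a stock pandas install (values chosen purely for illustration):

import pandas as pd

# Exactly two of start/end/periods must be specified for period_range.
pidx = pd.period_range(start="2017-01", periods=3, freq="M")
# PeriodIndex(['2017-01', '2017-02', '2017-03'], dtype='period[M]')

pidx.get_loc("2017-02")            # 1: string keys are parsed as Periods
pidx.asfreq("Q")                   # convert to quarterly frequency (default how="E")
pidx.to_timestamp(how="start")     # DatetimeIndex of period start timestamps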
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/range.py b/contrib/python/pandas/py3/pandas/core/indexes/range.py
deleted file mode 100644
index 03de8a1f320..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/range.py
+++ /dev/null
@@ -1,1037 +0,0 @@
-from __future__ import annotations
-
-from datetime import timedelta
-import operator
-from sys import getsizeof
-from typing import (
- Any,
- Callable,
- Hashable,
- Iterator,
- List,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import (
- index as libindex,
- lib,
-)
-from pandas._libs.algos import unique_deltas
-from pandas._libs.lib import no_default
-from pandas._typing import (
- Dtype,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.util._decorators import (
- cache_readonly,
- doc,
-)
-
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- ensure_python_int,
- is_float,
- is_integer,
- is_scalar,
- is_signed_integer_dtype,
- is_timedelta64_dtype,
-)
-from pandas.core.dtypes.generic import ABCTimedeltaIndex
-
-from pandas.core import ops
-import pandas.core.common as com
-from pandas.core.construction import extract_array
-import pandas.core.indexes.base as ibase
-from pandas.core.indexes.base import (
- Index,
- maybe_extract_name,
-)
-from pandas.core.ops.common import unpack_zerodim_and_defer
-
-_empty_range = range(0)
-
-
-class RangeIndex(Index):
- """
- Immutable Index implementing a monotonic integer range.
-
- RangeIndex is a memory-saving special case of an Index limited to representing
- monotonic ranges with a 64-bit dtype. Using RangeIndex may in some instances
- improve computing speed.
-
- This is the default index type used
- by DataFrame and Series when no explicit index is provided by the user.
-
- Parameters
- ----------
- start : int (default: 0), range, or other RangeIndex instance
- If int and "stop" is not given, interpreted as "stop" instead.
- stop : int (default: 0)
- step : int (default: 1)
- dtype : np.int64
- Unused, accepted for homogeneity with other index types.
- copy : bool, default False
- Unused, accepted for homogeneity with other index types.
- name : object, optional
- Name to be stored in the index.
-
- Attributes
- ----------
- start
- stop
- step
-
- Methods
- -------
- from_range
-
- See Also
- --------
- Index : The base pandas Index type.
- """
-
- _typ = "rangeindex"
- _dtype_validation_metadata = (is_signed_integer_dtype, "signed integer")
- _range: range
- _values: np.ndarray
-
- @property
- def _engine_type(self) -> type[libindex.Int64Engine]:
- return libindex.Int64Engine
-
- # --------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls,
- start=None,
- stop=None,
- step=None,
- dtype: Dtype | None = None,
- copy: bool = False,
- name: Hashable = None,
- ) -> RangeIndex:
- cls._validate_dtype(dtype)
- name = maybe_extract_name(name, start, cls)
-
- # RangeIndex
- if isinstance(start, RangeIndex):
- return start.copy(name=name)
- elif isinstance(start, range):
- return cls._simple_new(start, name=name)
-
- # validate the arguments
- if com.all_none(start, stop, step):
- raise TypeError("RangeIndex(...) must be called with integers")
-
- start = ensure_python_int(start) if start is not None else 0
-
- if stop is None:
- start, stop = 0, start
- else:
- stop = ensure_python_int(stop)
-
- step = ensure_python_int(step) if step is not None else 1
- if step == 0:
- raise ValueError("Step must not be zero")
-
- rng = range(start, stop, step)
- return cls._simple_new(rng, name=name)
-
- @classmethod
- def from_range(
- cls, data: range, name=None, dtype: Dtype | None = None
- ) -> RangeIndex:
- """
- Create RangeIndex from a range object.
-
- Returns
- -------
- RangeIndex
- """
- if not isinstance(data, range):
- raise TypeError(
- f"{cls.__name__}(...) must be called with object coercible to a "
- f"range, {repr(data)} was passed"
- )
- cls._validate_dtype(dtype)
- return cls._simple_new(data, name=name)
-
- # error: Argument 1 of "_simple_new" is incompatible with supertype "Index";
- # supertype defines the argument type as
- # "Union[ExtensionArray, ndarray[Any, Any]]" [override]
- @classmethod
- def _simple_new( # type: ignore[override]
- cls, values: range, name: Hashable = None
- ) -> RangeIndex:
- result = object.__new__(cls)
-
- assert isinstance(values, range)
-
- result._range = values
- result._name = name
- result._cache = {}
- result._reset_identity()
- result._references = None
- return result
-
- @classmethod
- def _validate_dtype(cls, dtype: Dtype | None) -> None:
- if dtype is None:
- return
-
- validation_func, expected = cls._dtype_validation_metadata
- if not validation_func(dtype):
- raise ValueError(
- f"Incorrect `dtype` passed: expected {expected}, received {dtype}"
- )
-
- # --------------------------------------------------------------------
-
- # error: Return type "Type[Index]" of "_constructor" incompatible with return
- # type "Type[RangeIndex]" in supertype "Index"
- @cache_readonly
- def _constructor(self) -> type[Index]: # type: ignore[override]
- """return the class to use for construction"""
- return Index
-
- # error: Signature of "_data" incompatible with supertype "Index"
- @cache_readonly
- def _data(self) -> np.ndarray: # type: ignore[override]
- """
- An int array that for performance reasons is created only when needed.
-
- The constructed array is saved in ``_cache``.
- """
- return np.arange(self.start, self.stop, self.step, dtype=np.int64)
-
- def _get_data_as_items(self):
- """return a list of tuples of start, stop, step"""
- rng = self._range
- return [("start", rng.start), ("stop", rng.stop), ("step", rng.step)]
-
- def __reduce__(self):
- d = {"name": self.name}
- d.update(dict(self._get_data_as_items()))
- return ibase._new_Index, (type(self), d), None
-
- # --------------------------------------------------------------------
- # Rendering Methods
-
- def _format_attrs(self):
- """
- Return a list of tuples of the (attr, formatted_value)
- """
- attrs = self._get_data_as_items()
- if self.name is not None:
- attrs.append(("name", ibase.default_pprint(self.name)))
- return attrs
-
- def _format_data(self, name=None):
- # we are formatting thru the attributes
- return None
-
- def _format_with_header(self, header: list[str], na_rep: str) -> list[str]:
- # Equivalent to Index implementation, but faster
- if not len(self._range):
- return header
- first_val_str = str(self._range[0])
- last_val_str = str(self._range[-1])
- max_length = max(len(first_val_str), len(last_val_str))
-
- return header + [f"{x:<{max_length}}" for x in self._range]
-
- # --------------------------------------------------------------------
-
- @property
- def start(self) -> int:
- """
- The value of the `start` parameter (``0`` if this was not supplied).
- """
- # GH 25710
- return self._range.start
-
- @property
- def stop(self) -> int:
- """
- The value of the `stop` parameter.
- """
- return self._range.stop
-
- @property
- def step(self) -> int:
- """
- The value of the `step` parameter (``1`` if this was not supplied).
- """
- # GH 25710
- return self._range.step
-
- @cache_readonly
- def nbytes(self) -> int:
- """
- Return the number of bytes in the underlying data.
- """
- rng = self._range
- return getsizeof(rng) + sum(
- getsizeof(getattr(rng, attr_name))
- for attr_name in ["start", "stop", "step"]
- )
-
- def memory_usage(self, deep: bool = False) -> int:
- """
- Memory usage of the values.
-
- Parameters
- ----------
- deep : bool
- Introspect the data deeply, interrogate
- `object` dtypes for system-level memory consumption
-
- Returns
- -------
- bytes used
-
- Notes
- -----
- Memory usage does not include memory consumed by elements that
- are not components of the array if deep=False
-
- See Also
- --------
- numpy.ndarray.nbytes
- """
- return self.nbytes
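-
- # Rough illustration (a sketch, not a guarantee): only the ``range`` metadata
- # is measured, so the reported size stays small and essentially constant no
- # matter how long the index is, e.g. ``pd.RangeIndex(10_000_000).memory_usage()``
- # is on the order of a hundred bytes, not tens of megabytes.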
-
- @property
- def dtype(self) -> np.dtype:
- return np.dtype(np.int64)
-
- @property
- def is_unique(self) -> bool:
- """return if the index has unique values"""
- return True
-
- @cache_readonly
- def is_monotonic_increasing(self) -> bool:
- return self._range.step > 0 or len(self) <= 1
-
- @cache_readonly
- def is_monotonic_decreasing(self) -> bool:
- return self._range.step < 0 or len(self) <= 1
-
- def __contains__(self, key: Any) -> bool:
- hash(key)
- try:
- key = ensure_python_int(key)
- except TypeError:
- return False
- return key in self._range
-
- @property
- def inferred_type(self) -> str:
- return "integer"
-
- # --------------------------------------------------------------------
- # Indexing Methods
-
- @doc(Index.get_loc)
- def get_loc(self, key):
- if is_integer(key) or (is_float(key) and key.is_integer()):
- new_key = int(key)
- try:
- return self._range.index(new_key)
- except ValueError as err:
- raise KeyError(key) from err
- if isinstance(key, Hashable):
- raise KeyError(key)
- self._check_indexing_error(key)
- raise KeyError(key)
-
- def _get_indexer(
- self,
- target: Index,
- method: str | None = None,
- limit: int | None = None,
- tolerance=None,
- ) -> npt.NDArray[np.intp]:
- if com.any_not_none(method, tolerance, limit):
- return super()._get_indexer(
- target, method=method, tolerance=tolerance, limit=limit
- )
-
- if self.step > 0:
- start, stop, step = self.start, self.stop, self.step
- else:
- # GH 28678: work on reversed range for simplicity
- reverse = self._range[::-1]
- start, stop, step = reverse.start, reverse.stop, reverse.step
-
- target_array = np.asarray(target)
- locs = target_array - start
- valid = (locs % step == 0) & (locs >= 0) & (target_array < stop)
- locs[~valid] = -1
- locs[valid] = locs[valid] / step
-
- if step != self.step:
- # We reversed this range: transform to original locs
- locs[valid] = len(self) - 1 - locs[valid]
- return ensure_platform_int(locs)
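-
- # Worked sketch of the arithmetic above (illustrative values): for
- # self = RangeIndex(2, 12, 3) -> [2, 5, 8, 11] and target [5, 6, 11]:
- #   locs = target - 2 = [3, 4, 9]; valid = [True, False, True]
- #   -> result [1, -1, 3] (6 is not representable in the range, hence -1)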
-
- @cache_readonly
- def _should_fallback_to_positional(self) -> bool:
- """
- Should an integer key be treated as positional?
- """
- return False
-
- # --------------------------------------------------------------------
-
- def tolist(self) -> list[int]:
- return list(self._range)
-
- @doc(Index.__iter__)
- def __iter__(self) -> Iterator[int]:
- yield from self._range
-
- @doc(Index._shallow_copy)
- def _shallow_copy(self, values, name: Hashable = no_default):
- name = self.name if name is no_default else name
-
- if values.dtype.kind == "f":
- return Index(values, name=name, dtype=np.float64)
- # GH 46675 & 43885: If values is equally spaced, return a
- # more memory-compact RangeIndex instead of Index with 64-bit dtype
- unique_diffs = unique_deltas(values)
- if len(unique_diffs) == 1 and unique_diffs[0] != 0:
- diff = unique_diffs[0]
- new_range = range(values[0], values[-1] + diff, diff)
- return type(self)._simple_new(new_range, name=name)
- else:
- return self._constructor._simple_new(values, name=name)
-
- def _view(self: RangeIndex) -> RangeIndex:
- result = type(self)._simple_new(self._range, name=self._name)
- result._cache = self._cache
- return result
-
- @doc(Index.copy)
- def copy(self, name: Hashable = None, deep: bool = False):
- name = self._validate_names(name=name, deep=deep)[0]
- new_index = self._rename(name=name)
- return new_index
-
- def _minmax(self, meth: str):
- no_steps = len(self) - 1
- if no_steps == -1:
- return np.nan
- elif (meth == "min" and self.step > 0) or (meth == "max" and self.step < 0):
- return self.start
-
- return self.start + self.step * no_steps
-
- def min(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
- """The minimum value of the RangeIndex"""
- nv.validate_minmax_axis(axis)
- nv.validate_min(args, kwargs)
- return self._minmax("min")
-
- def max(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
- """The maximum value of the RangeIndex"""
- nv.validate_minmax_axis(axis)
- nv.validate_max(args, kwargs)
- return self._minmax("max")
-
- def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
- """
- Returns the indices that would sort the index and its
- underlying data.
-
- Returns
- -------
- np.ndarray[np.intp]
-
- See Also
- --------
- numpy.ndarray.argsort
- """
- ascending = kwargs.pop("ascending", True) # EA compat
- kwargs.pop("kind", None) # e.g. "mergesort" is irrelevant
- nv.validate_argsort(args, kwargs)
-
- if self._range.step > 0:
- result = np.arange(len(self), dtype=np.intp)
- else:
- result = np.arange(len(self) - 1, -1, -1, dtype=np.intp)
-
- if not ascending:
- result = result[::-1]
- return result
-
- def factorize(
- self,
- sort: bool = False,
- use_na_sentinel: bool = True,
- ) -> tuple[npt.NDArray[np.intp], RangeIndex]:
- codes = np.arange(len(self), dtype=np.intp)
- uniques = self
- if sort and self.step < 0:
- codes = codes[::-1]
- uniques = uniques[::-1]
- return codes, uniques
-
- def equals(self, other: object) -> bool:
- """
- Determines if two Index objects contain the same elements.
- """
- if isinstance(other, RangeIndex):
- return self._range == other._range
- return super().equals(other)
-
- def sort_values(
- self,
- return_indexer: bool = False,
- ascending: bool = True,
- na_position: str = "last",
- key: Callable | None = None,
- ):
- if key is not None:
- return super().sort_values(
- return_indexer=return_indexer,
- ascending=ascending,
- na_position=na_position,
- key=key,
- )
- else:
- sorted_index = self
- inverse_indexer = False
- if ascending:
- if self.step < 0:
- sorted_index = self[::-1]
- inverse_indexer = True
- else:
- if self.step > 0:
- sorted_index = self[::-1]
- inverse_indexer = True
-
- if return_indexer:
- if inverse_indexer:
- rng = range(len(self) - 1, -1, -1)
- else:
- rng = range(len(self))
- return sorted_index, RangeIndex(rng)
- else:
- return sorted_index
-
- # --------------------------------------------------------------------
- # Set Operations
-
- def _intersection(self, other: Index, sort: bool = False):
- # caller is responsible for checking self and other are both non-empty
-
- if not isinstance(other, RangeIndex):
- return super()._intersection(other, sort=sort)
-
- first = self._range[::-1] if self.step < 0 else self._range
- second = other._range[::-1] if other.step < 0 else other._range
-
- # check whether intervals intersect
- # deals with in- and decreasing ranges
- int_low = max(first.start, second.start)
- int_high = min(first.stop, second.stop)
- if int_high <= int_low:
- return self._simple_new(_empty_range)
-
- # Method hint: linear Diophantine equation
- # solve intersection problem
- # performance hint: for identical step sizes, could use
- # cheaper alternative
- gcd, s, _ = self._extended_gcd(first.step, second.step)
-
- # check whether element sets intersect
- if (first.start - second.start) % gcd:
- return self._simple_new(_empty_range)
-
- # calculate parameters for the RangeIndex describing the
- # intersection disregarding the lower bounds
- tmp_start = first.start + (second.start - first.start) * first.step // gcd * s
- new_step = first.step * second.step // gcd
- new_range = range(tmp_start, int_high, new_step)
- new_index = self._simple_new(new_range)
-
- # adjust index to limiting interval
- new_start = new_index._min_fitting_element(int_low)
- new_range = range(new_start, new_index.stop, new_index.step)
- new_index = self._simple_new(new_range)
-
- if (self.step < 0 and other.step < 0) is not (new_index.step < 0):
- new_index = new_index[::-1]
-
- if sort is None:
- new_index = new_index.sort_values()
-
- return new_index
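-
- # Illustrative sketch (hedged) of the Diophantine approach above:
- #   RangeIndex(0, 20, 4) = [0, 4, 8, 12, 16]
- #   RangeIndex(2, 30, 6) = [2, 8, 14, 20, 26]
- # share step lcm(4, 6) = 12 with first common element 8, so the
- # intersection comes back as RangeIndex(start=8, stop=20, step=12).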
-
- def _min_fitting_element(self, lower_limit: int) -> int:
- """Returns the smallest element greater than or equal to the limit"""
- no_steps = -(-(lower_limit - self.start) // abs(self.step))
- return self.start + abs(self.step) * no_steps
-
- def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]:
- """
- Extended Euclidean algorithms to solve Bezout's identity:
- a*x + b*y = gcd(a, b)
- Finds one particular solution for x, y: s, t
- Returns: gcd, s, t
- """
- s, old_s = 0, 1
- t, old_t = 1, 0
- r, old_r = b, a
- while r:
- quotient = old_r // r
- old_r, r = r, old_r - quotient * r
- old_s, s = s, old_s - quotient * s
- old_t, t = t, old_t - quotient * t
- return old_r, old_s, old_t
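-
- # Worked example (sketch): _extended_gcd(6, 4) returns (2, 1, -1),
- # i.e. gcd = 2 with 6*1 + 4*(-1) = 2.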
-
- def _range_in_self(self, other: range) -> bool:
- """Check if other range is contained in self"""
- # https://stackoverflow.com/a/32481015
- if not other:
- return True
- if not self._range:
- return False
- if len(other) > 1 and other.step % self._range.step:
- return False
- return other.start in self._range and other[-1] in self._range
-
- def _union(self, other: Index, sort: bool | None):
- """
- Form the union of two Index objects and sorts if possible
-
- Parameters
- ----------
- other : Index or array-like
-
- sort : bool or None, default None
- Whether to sort (monotonically increasing) the resulting index.
- ``sort=None|True`` returns a ``RangeIndex`` if possible or a sorted
- ``Index`` with an int64 dtype if not.
- ``sort=False`` can return a ``RangeIndex`` if self is monotonically
- increasing and other is fully contained in self. Otherwise, returns
- an unsorted ``Index`` with an int64 dtype.
-
- Returns
- -------
- union : Index
- """
- if isinstance(other, RangeIndex):
- if sort in (None, True) or (
- sort is False and self.step > 0 and self._range_in_self(other._range)
- ):
- # GH 47557: Can still return a RangeIndex
- # if other range in self and sort=False
- start_s, step_s = self.start, self.step
- end_s = self.start + self.step * (len(self) - 1)
- start_o, step_o = other.start, other.step
- end_o = other.start + other.step * (len(other) - 1)
- if self.step < 0:
- start_s, step_s, end_s = end_s, -step_s, start_s
- if other.step < 0:
- start_o, step_o, end_o = end_o, -step_o, start_o
- if len(self) == 1 and len(other) == 1:
- step_s = step_o = abs(self.start - other.start)
- elif len(self) == 1:
- step_s = step_o
- elif len(other) == 1:
- step_o = step_s
- start_r = min(start_s, start_o)
- end_r = max(end_s, end_o)
- if step_o == step_s:
- if (
- (start_s - start_o) % step_s == 0
- and (start_s - end_o) <= step_s
- and (start_o - end_s) <= step_s
- ):
- return type(self)(start_r, end_r + step_s, step_s)
- if (
- (step_s % 2 == 0)
- and (abs(start_s - start_o) == step_s / 2)
- and (abs(end_s - end_o) == step_s / 2)
- ):
- # e.g. range(0, 10, 2) and range(1, 11, 2)
- # but not range(0, 20, 4) and range(1, 21, 4) GH#44019
- return type(self)(start_r, end_r + step_s / 2, step_s / 2)
-
- elif step_o % step_s == 0:
- if (
- (start_o - start_s) % step_s == 0
- and (start_o + step_s >= start_s)
- and (end_o - step_s <= end_s)
- ):
- return type(self)(start_r, end_r + step_s, step_s)
- elif step_s % step_o == 0:
- if (
- (start_s - start_o) % step_o == 0
- and (start_s + step_o >= start_o)
- and (end_s - step_o <= end_o)
- ):
- return type(self)(start_r, end_r + step_o, step_o)
-
- return super()._union(other, sort=sort)
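-
- # Illustrative sketch of the equal-step fast path above:
- #   RangeIndex(0, 10, 2).union(RangeIndex(10, 20, 2))
- # keeps a RangeIndex, RangeIndex(start=0, stop=20, step=2), rather than
- # materializing an int64 Index.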
-
- def _difference(self, other, sort=None):
- # optimized set operation if we have another RangeIndex
- self._validate_sort_keyword(sort)
- self._assert_can_do_setop(other)
- other, result_name = self._convert_can_do_setop(other)
-
- if not isinstance(other, RangeIndex):
- return super()._difference(other, sort=sort)
-
- if sort is not False and self.step < 0:
- return self[::-1]._difference(other)
-
- res_name = ops.get_op_result_name(self, other)
-
- first = self._range[::-1] if self.step < 0 else self._range
- overlap = self.intersection(other)
- if overlap.step < 0:
- overlap = overlap[::-1]
-
- if len(overlap) == 0:
- return self.rename(name=res_name)
- if len(overlap) == len(self):
- return self[:0].rename(res_name)
-
- # overlap.step will always be a multiple of self.step (see _intersection)
-
- if len(overlap) == 1:
- if overlap[0] == self[0]:
- return self[1:]
-
- elif overlap[0] == self[-1]:
- return self[:-1]
-
- elif len(self) == 3 and overlap[0] == self[1]:
- return self[::2]
-
- else:
- return super()._difference(other, sort=sort)
-
- elif len(overlap) == 2 and overlap[0] == first[0] and overlap[-1] == first[-1]:
- # e.g. range(-8, 20, 7) and range(13, -9, -3)
- return self[1:-1]
-
- if overlap.step == first.step:
- if overlap[0] == first.start:
- # The difference is everything after the intersection
- new_rng = range(overlap[-1] + first.step, first.stop, first.step)
- elif overlap[-1] == first[-1]:
- # The difference is everything before the intersection
- new_rng = range(first.start, overlap[0], first.step)
- elif overlap._range == first[1:-1]:
- # e.g. range(4) and range(1, 3)
- step = len(first) - 1
- new_rng = first[::step]
- else:
- # The difference is not range-like
- # e.g. range(1, 10, 1) and range(3, 7, 1)
- return super()._difference(other, sort=sort)
-
- else:
- # We must have len(self) > 1, bc we ruled out above
- # len(overlap) == 0 and len(overlap) == len(self)
- assert len(self) > 1
-
- if overlap.step == first.step * 2:
- if overlap[0] == first[0] and overlap[-1] in (first[-1], first[-2]):
- # e.g. range(1, 10, 1) and range(1, 10, 2)
- new_rng = first[1::2]
-
- elif overlap[0] == first[1] and overlap[-1] in (first[-1], first[-2]):
- # e.g. range(1, 10, 1) and range(2, 10, 2)
- new_rng = first[::2]
-
- else:
- # We can get here with e.g. range(20) and range(0, 10, 2)
- return super()._difference(other, sort=sort)
-
- else:
- # e.g. range(10) and range(0, 10, 3)
- return super()._difference(other, sort=sort)
-
- new_index = type(self)._simple_new(new_rng, name=res_name)
- if first is not self._range:
- new_index = new_index[::-1]
-
- return new_index
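-
- # Illustrative sketch of the "difference is everything before the
- # intersection" branch: RangeIndex(10).difference(RangeIndex(3, 10))
- # stays a RangeIndex, namely RangeIndex(start=0, stop=3, step=1).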
-
- def symmetric_difference(self, other, result_name: Hashable = None, sort=None):
- if not isinstance(other, RangeIndex) or sort is not None:
- return super().symmetric_difference(other, result_name, sort)
-
- left = self.difference(other)
- right = other.difference(self)
- result = left.union(right)
-
- if result_name is not None:
- result = result.rename(result_name)
- return result
-
- # --------------------------------------------------------------------
-
- # error: Return type "Index" of "delete" incompatible with return type
- # "RangeIndex" in supertype "Index"
- def delete(self, loc) -> Index: # type: ignore[override]
- # In some cases we can retain RangeIndex, see also
- # DatetimeTimedeltaMixin._get_delete_freq
- if is_integer(loc):
- if loc in (0, -len(self)):
- return self[1:]
- if loc in (-1, len(self) - 1):
- return self[:-1]
- if len(self) == 3 and loc in (1, -2):
- return self[::2]
-
- elif lib.is_list_like(loc):
- slc = lib.maybe_indices_to_slice(np.asarray(loc, dtype=np.intp), len(self))
-
- if isinstance(slc, slice):
- # defer to RangeIndex._difference, which is optimized to return
- # a RangeIndex whenever possible
- other = self[slc]
- return self.difference(other, sort=False)
-
- return super().delete(loc)
-
- def insert(self, loc: int, item) -> Index:
- if len(self) and (is_integer(item) or is_float(item)):
- # We can retain RangeIndex if inserting at the beginning or end,
- # or right in the middle.
- rng = self._range
- if loc == 0 and item == self[0] - self.step:
- new_rng = range(rng.start - rng.step, rng.stop, rng.step)
- return type(self)._simple_new(new_rng, name=self.name)
-
- elif loc == len(self) and item == self[-1] + self.step:
- new_rng = range(rng.start, rng.stop + rng.step, rng.step)
- return type(self)._simple_new(new_rng, name=self.name)
-
- elif len(self) == 2 and item == self[0] + self.step / 2:
- # e.g. inserting 1 into [0, 2]
- step = int(self.step / 2)
- new_rng = range(self.start, self.stop, step)
- return type(self)._simple_new(new_rng, name=self.name)
-
- return super().insert(loc, item)
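-
- # Illustrative sketch of the delete/insert fast paths above (hedged):
- #   RangeIndex(5).delete(0)       -> RangeIndex(start=1, stop=5, step=1)
- #   RangeIndex(1, 6).insert(0, 0) -> RangeIndex(start=0, stop=6, step=1)
- # Anything that breaks the even spacing falls back to a plain Index.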
-
- def _concat(self, indexes: list[Index], name: Hashable) -> Index:
- """
- Overriding parent method for the case of all RangeIndex instances.
-
- When all members of "indexes" are of type RangeIndex: result will be
- RangeIndex if possible, Index with an int64 dtype otherwise. E.g.:
- indexes = [RangeIndex(3), RangeIndex(3, 6)] -> RangeIndex(6)
- indexes = [RangeIndex(3), RangeIndex(4, 6)] -> Index([0,1,2,4,5], dtype='int64')
- """
- if not all(isinstance(x, RangeIndex) for x in indexes):
- return super()._concat(indexes, name)
-
- elif len(indexes) == 1:
- return indexes[0]
-
- rng_indexes = cast(List[RangeIndex], indexes)
-
- start = step = next_ = None
-
- # Filter the empty indexes
- non_empty_indexes = [obj for obj in rng_indexes if len(obj)]
-
- for obj in non_empty_indexes:
- rng = obj._range
-
- if start is None:
- # This is set by the first non-empty index
- start = rng.start
- if step is None and len(rng) > 1:
- step = rng.step
- elif step is None:
- # First non-empty index had only one element
- if rng.start == start:
- values = np.concatenate([x._values for x in rng_indexes])
- result = self._constructor(values)
- return result.rename(name)
-
- step = rng.start - start
-
- non_consecutive = (step != rng.step and len(rng) > 1) or (
- next_ is not None and rng.start != next_
- )
- if non_consecutive:
- result = self._constructor(
- np.concatenate([x._values for x in rng_indexes])
- )
- return result.rename(name)
-
- if step is not None:
- next_ = rng[-1] + step
-
- if non_empty_indexes:
- # Get the stop value from "next" or alternatively
- # from the last non-empty index
- stop = non_empty_indexes[-1].stop if next_ is None else next_
- return RangeIndex(start, stop, step).rename(name)
-
- # Here all "indexes" had 0 length, i.e. were empty.
- # In this case return an empty range index.
- return RangeIndex(0, 0).rename(name)
-
- def __len__(self) -> int:
- """
- return the length of the RangeIndex
- """
- return len(self._range)
-
- @property
- def size(self) -> int:
- return len(self)
-
- def __getitem__(self, key):
- """
- Conserve RangeIndex type for scalar and slice keys.
- """
- if isinstance(key, slice):
- new_range = self._range[key]
- return self._simple_new(new_range, name=self._name)
- elif is_integer(key):
- new_key = int(key)
- try:
- return self._range[new_key]
- except IndexError as err:
- raise IndexError(
- f"index {key} is out of bounds for axis 0 with size {len(self)}"
- ) from err
- elif is_scalar(key):
- raise IndexError(
- "only integers, slices (`:`), "
- "ellipsis (`...`), numpy.newaxis (`None`) "
- "and integer or boolean "
- "arrays are valid indices"
- )
- return super().__getitem__(key)
-
- def _getitem_slice(self: RangeIndex, slobj: slice) -> RangeIndex:
- """
- Fastpath for __getitem__ when we know we have a slice.
- """
- res = self._range[slobj]
- return type(self)._simple_new(res, name=self._name)
-
- @unpack_zerodim_and_defer("__floordiv__")
- def __floordiv__(self, other):
- if is_integer(other) and other != 0:
- if len(self) == 0 or self.start % other == 0 and self.step % other == 0:
- start = self.start // other
- step = self.step // other
- stop = start + len(self) * step
- new_range = range(start, stop, step or 1)
- return self._simple_new(new_range, name=self.name)
- if len(self) == 1:
- start = self.start // other
- new_range = range(start, start + 1, 1)
- return self._simple_new(new_range, name=self.name)
-
- return super().__floordiv__(other)
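-
- # Illustrative sketch: when start and step are both divisible by the divisor,
- # the result can stay a RangeIndex, e.g.
- #   RangeIndex(0, 20, 4) // 2 -> RangeIndex(start=0, stop=10, step=2)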
-
- # --------------------------------------------------------------------
- # Reductions
-
- def all(self, *args, **kwargs) -> bool:
- return 0 not in self._range
-
- def any(self, *args, **kwargs) -> bool:
- return any(self._range)
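-
- # Sketch of the shortcuts above: the only falsy element a range can hold is 0,
- # so e.g. RangeIndex(5).all() is False (0 is an element) while
- # RangeIndex(1, 5).all() and RangeIndex(5).any() are both True.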
-
- # --------------------------------------------------------------------
-
- def _cmp_method(self, other, op):
- if isinstance(other, RangeIndex) and self._range == other._range:
- # Both are immutable so if ._range attr. are equal, shortcut is possible
- return super()._cmp_method(self, op)
- return super()._cmp_method(other, op)
-
- def _arith_method(self, other, op):
- """
- Parameters
- ----------
- other : Any
- op : callable that accepts 2 params
- perform the binary op
- """
-
- if isinstance(other, ABCTimedeltaIndex):
- # Defer to TimedeltaIndex implementation
- return NotImplemented
- elif isinstance(other, (timedelta, np.timedelta64)):
- # GH#19333 is_integer evaluated True on timedelta64,
- # so we need to catch these explicitly
- return super()._arith_method(other, op)
- elif is_timedelta64_dtype(other):
- # Must be an np.ndarray; GH#22390
- return super()._arith_method(other, op)
-
- if op in [
- operator.pow,
- ops.rpow,
- operator.mod,
- ops.rmod,
- operator.floordiv,
- ops.rfloordiv,
- divmod,
- ops.rdivmod,
- ]:
- return super()._arith_method(other, op)
-
- step: Callable | None = None
- if op in [operator.mul, ops.rmul, operator.truediv, ops.rtruediv]:
- step = op
-
- # TODO: if other is a RangeIndex we may have more efficient options
- right = extract_array(other, extract_numpy=True, extract_range=True)
- left = self
-
- try:
- # apply if we have an override
- if step:
- with np.errstate(all="ignore"):
- rstep = step(left.step, right)
-
- # we don't have a representable op
- # so return a base index
- if not is_integer(rstep) or not rstep:
- raise ValueError
-
- else:
- rstep = left.step
-
- with np.errstate(all="ignore"):
- rstart = op(left.start, right)
- rstop = op(left.stop, right)
-
- res_name = ops.get_op_result_name(self, other)
- result = type(self)(rstart, rstop, rstep, name=res_name)
-
- # for compat with numpy / Index with int64 dtype
- # even if we can represent as a RangeIndex, return
- # as a float64 Index if we have float-like descriptors
- if not all(is_integer(x) for x in [rstart, rstop, rstep]):
- result = result.astype("float64")
-
- return result
-
- except (ValueError, TypeError, ZeroDivisionError):
- # test_arithmetic_explicit_conversions
- return super()._arith_method(other, op)
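-
- # Illustrative sketch of the fast path above: multiplication scales start,
- # stop and step in lockstep, so e.g. RangeIndex(0, 10, 2) * 3 can be
- # represented as RangeIndex(start=0, stop=30, step=6) without materializing
- # any values.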
diff --git a/contrib/python/pandas/py3/pandas/core/indexes/timedeltas.py b/contrib/python/pandas/py3/pandas/core/indexes/timedeltas.py
deleted file mode 100644
index 482c0da36f6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexes/timedeltas.py
+++ /dev/null
@@ -1,315 +0,0 @@
-""" implement the TimedeltaIndex """
-from __future__ import annotations
-
-from pandas._libs import (
- index as libindex,
- lib,
-)
-from pandas._libs.tslibs import (
- Resolution,
- Timedelta,
- to_offset,
-)
-from pandas._typing import DtypeObj
-
-from pandas.core.dtypes.common import (
- is_dtype_equal,
- is_scalar,
- is_timedelta64_dtype,
-)
-from pandas.core.dtypes.generic import ABCSeries
-
-from pandas.core.arrays import datetimelike as dtl
-from pandas.core.arrays.timedeltas import TimedeltaArray
-import pandas.core.common as com
-from pandas.core.indexes.base import (
- Index,
- maybe_extract_name,
-)
-from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin
-from pandas.core.indexes.extension import inherit_names
-
-
-@inherit_names(
- ["__neg__", "__pos__", "__abs__", "total_seconds", "round", "floor", "ceil"]
- + TimedeltaArray._field_ops,
- TimedeltaArray,
- wrap=True,
-)
-@inherit_names(
- [
- "components",
- "to_pytimedelta",
- "sum",
- "std",
- "median",
- "_format_native_types",
- ],
- TimedeltaArray,
-)
-class TimedeltaIndex(DatetimeTimedeltaMixin):
- """
- Immutable Index of timedelta64 data.
-
- Represented internally as int64, and scalars are returned as Timedelta objects.
-
- Parameters
- ----------
- data : array-like (1-dimensional), optional
- Optional timedelta-like data to construct index with.
- unit : str, optional
- The unit of the data (D, h, m, s, ms, us, ns), used when ``data`` is an
- integer or float rather than timedelta-like.
- freq : str or pandas offset object, optional
- One of pandas date offset strings or corresponding objects. The string
- 'infer' can be passed in order to set the frequency of the index as the
- inferred frequency upon creation.
- copy : bool
- Make a copy of input ndarray.
- name : object
- Name to be stored in the index.
-
- Attributes
- ----------
- days
- seconds
- microseconds
- nanoseconds
- components
- inferred_freq
-
- Methods
- -------
- to_pytimedelta
- to_series
- round
- floor
- ceil
- to_frame
- mean
-
- See Also
- --------
- Index : The base pandas Index type.
- Timedelta : Represents a duration between two dates or times.
- DatetimeIndex : Index of datetime64 data.
- PeriodIndex : Index of Period data.
- timedelta_range : Create a fixed-frequency TimedeltaIndex.
-
- Notes
- -----
- To learn more about the frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
- """
-
- _typ = "timedeltaindex"
-
- _data_cls = TimedeltaArray
-
- @property
- def _engine_type(self) -> type[libindex.TimedeltaEngine]:
- return libindex.TimedeltaEngine
-
- _data: TimedeltaArray
-
- # Use base class method instead of DatetimeTimedeltaMixin._get_string_slice
- _get_string_slice = Index._get_string_slice
-
- # error: Signature of "_resolution_obj" incompatible with supertype
- # "DatetimeIndexOpsMixin"
- @property
- def _resolution_obj(self) -> Resolution | None: # type: ignore[override]
- return self._data._resolution_obj
-
- # -------------------------------------------------------------------
- # Constructors
-
- def __new__(
- cls,
- data=None,
- unit=None,
- freq=lib.no_default,
- closed=None,
- dtype=None,
- copy: bool = False,
- name=None,
- ):
- name = maybe_extract_name(name, data, cls)
-
- if is_scalar(data):
- cls._raise_scalar_data_error(data)
-
- if unit in {"Y", "y", "M"}:
- raise ValueError(
- "Units 'M', 'Y', and 'y' are no longer supported, as they do not "
- "represent unambiguous timedelta values durations."
- )
-
- if (
- isinstance(data, TimedeltaArray)
- and freq is lib.no_default
- and (dtype is None or is_dtype_equal(dtype, data.dtype))
- ):
- if copy:
- data = data.copy()
- return cls._simple_new(data, name=name)
-
- if (
- isinstance(data, TimedeltaIndex)
- and freq is lib.no_default
- and name is None
- and (dtype is None or is_dtype_equal(dtype, data.dtype))
- ):
- if copy:
- return data.copy()
- else:
- return data._view()
-
- # - Cases checked above all return/raise before reaching here - #
-
- tdarr = TimedeltaArray._from_sequence_not_strict(
- data, freq=freq, unit=unit, dtype=dtype, copy=copy
- )
- refs = None
- if not copy and isinstance(data, (ABCSeries, Index)):
- refs = data._references
-
- return cls._simple_new(tdarr, name=name, refs=refs)
-
- # -------------------------------------------------------------------
-
- def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
- """
- Can we compare values of the given dtype to our own?
- """
- return is_timedelta64_dtype(dtype) # aka self._data._is_recognized_dtype
-
- # -------------------------------------------------------------------
- # Indexing Methods
-
- def get_loc(self, key):
- """
- Get integer location for requested label
-
- Returns
- -------
- loc : int, slice, or ndarray[int]
- """
- self._check_indexing_error(key)
-
- try:
- key = self._data._validate_scalar(key, unbox=False)
- except TypeError as err:
- raise KeyError(key) from err
-
- return Index.get_loc(self, key)
-
- def _parse_with_reso(self, label: str):
- # the "with_reso" is a no-op for TimedeltaIndex
- parsed = Timedelta(label)
- return parsed, None
-
- def _parsed_string_to_bounds(self, reso, parsed: Timedelta):
- # reso is unused, included to match signature of DTI/PI
- lbound = parsed.round(parsed.resolution_string)
- rbound = lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns")
- return lbound, rbound
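-
- # Illustrative sketch (hedged): for the string "1 day" the parsed Timedelta's
- # resolution is days, so the bounds cover the whole day, from ``1 days`` up to
- # one nanosecond before ``2 days``; a partial-string lookup then matches
- # anything in that window.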
-
- # -------------------------------------------------------------------
-
- @property
- def inferred_type(self) -> str:
- return "timedelta64"
-
-
-def timedelta_range(
- start=None,
- end=None,
- periods: int | None = None,
- freq=None,
- name=None,
- closed=None,
- *,
- unit: str | None = None,
-) -> TimedeltaIndex:
- """
- Return a fixed frequency TimedeltaIndex, with day as the default frequency.
-
- Parameters
- ----------
- start : str or timedelta-like, default None
- Left bound for generating timedeltas.
- end : str or timedelta-like, default None
- Right bound for generating timedeltas.
- periods : int, default None
- Number of periods to generate.
- freq : str or DateOffset, default 'D'
- Frequency strings can have multiples, e.g. '5H'.
- name : str, default None
- Name of the resulting TimedeltaIndex.
- closed : str, default None
- Make the interval closed with respect to the given frequency to
- the 'left', 'right', or both sides (None).
- unit : str, default None
- Specify the desired resolution of the result.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- TimedeltaIndex
-
- Notes
- -----
- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
- exactly three must be specified. If ``freq`` is omitted, the resulting
- ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between
- ``start`` and ``end`` (closed on both sides).
-
- To learn more about the frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- Examples
- --------
- >>> pd.timedelta_range(start='1 day', periods=4)
- TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'],
- dtype='timedelta64[ns]', freq='D')
-
- The ``closed`` parameter specifies which endpoint is included. The default
- behavior is to include both endpoints.
-
- >>> pd.timedelta_range(start='1 day', periods=4, closed='right')
- TimedeltaIndex(['2 days', '3 days', '4 days'],
- dtype='timedelta64[ns]', freq='D')
-
- The ``freq`` parameter specifies the frequency of the TimedeltaIndex.
- Only fixed frequencies can be passed, non-fixed frequencies such as
- 'M' (month end) will raise.
-
- >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H')
- TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00',
- '1 days 18:00:00', '2 days 00:00:00'],
- dtype='timedelta64[ns]', freq='6H')
-
- Specify ``start``, ``end``, and ``periods``; the frequency is generated
- automatically (linearly spaced).
-
- >>> pd.timedelta_range(start='1 day', end='5 days', periods=4)
- TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00',
- '5 days 00:00:00'],
- dtype='timedelta64[ns]', freq=None)
-
- **Specify a unit**
-
- >>> pd.timedelta_range("1 Day", periods=3, freq="100000D", unit="s")
- TimedeltaIndex(['1 days 00:00:00', '100001 days 00:00:00',
- '200001 days 00:00:00'],
- dtype='timedelta64[s]', freq='100000D')
- """
- if freq is None and com.any_none(periods, start, end):
- freq = "D"
-
- freq, _ = dtl.maybe_infer_freq(freq)
- tdarr = TimedeltaArray._generate_range(
- start, end, periods, freq, closed=closed, unit=unit
- )
- return TimedeltaIndex._simple_new(tdarr, name=name)
diff --git a/contrib/python/pandas/py3/pandas/core/indexing.py b/contrib/python/pandas/py3/pandas/core/indexing.py
deleted file mode 100644
index f8d78d21f74..00000000000
--- a/contrib/python/pandas/py3/pandas/core/indexing.py
+++ /dev/null
@@ -1,2629 +0,0 @@
-from __future__ import annotations
-
-from contextlib import suppress
-import sys
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Sequence,
- TypeVar,
- cast,
- final,
-)
-import warnings
-
-import numpy as np
-
-from pandas._config import using_copy_on_write
-
-from pandas._libs.indexing import NDFrameIndexerBase
-from pandas._libs.lib import item_from_zerodim
-from pandas._typing import (
- Axis,
- AxisInt,
-)
-from pandas.compat import PYPY
-from pandas.errors import (
- AbstractMethodError,
- ChainedAssignmentError,
- IndexingError,
- InvalidIndexError,
- LossySetitemError,
- _chained_assignment_msg,
-)
-from pandas.util._decorators import doc
-
-from pandas.core.dtypes.cast import (
- can_hold_element,
- maybe_promote,
-)
-from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_extension_array_dtype,
- is_hashable,
- is_integer,
- is_iterator,
- is_list_like,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- is_sequence,
-)
-from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- infer_fill_value,
- is_valid_na_for_dtype,
- isna,
- na_value_for_dtype,
-)
-
-from pandas.core import algorithms as algos
-import pandas.core.common as com
-from pandas.core.construction import (
- array as pd_array,
- extract_array,
-)
-from pandas.core.indexers import (
- check_array_indexer,
- is_list_like_indexer,
- is_scalar_indexer,
- length_of_indexer,
-)
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-_LocationIndexerT = TypeVar("_LocationIndexerT", bound="_LocationIndexer")
-
-# "null slice"
-_NS = slice(None, None)
-_one_ellipsis_message = "indexer may only contain one '...' entry"
-
-
-# the public IndexSlicerMaker
-class _IndexSlice:
- """
- Create an object to more easily perform multi-index slicing.
-
- See Also
- --------
- MultiIndex.remove_unused_levels : New MultiIndex with no unused levels.
-
- Notes
- -----
- See :ref:`Defined Levels <advanced.shown_levels>`
- for further info on slicing a MultiIndex.
-
- Examples
- --------
- >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']])
- >>> columns = ['foo', 'bar']
- >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))),
- ... index=midx, columns=columns)
-
- Using the default slice command:
-
- >>> dfmi.loc[(slice(None), slice('B0', 'B1')), :]
- foo bar
- A0 B0 0 1
- B1 2 3
- A1 B0 8 9
- B1 10 11
-
- Using the IndexSlice class for a more intuitive command:
-
- >>> idx = pd.IndexSlice
- >>> dfmi.loc[idx[:, 'B0':'B1'], :]
- foo bar
- A0 B0 0 1
- B1 2 3
- A1 B0 8 9
- B1 10 11
- """
-
- def __getitem__(self, arg):
- return arg
-
-
-IndexSlice = _IndexSlice()
-
-
-class IndexingMixin:
- """
- Mixin for adding .loc/.iloc/.at/.iat to DataFrames and Series.
- """
-
- @property
- def iloc(self) -> _iLocIndexer:
- """
- Purely integer-location based indexing for selection by position.
-
- ``.iloc[]`` is primarily integer position based (from ``0`` to
- ``length-1`` of the axis), but may also be used with a boolean
- array.
-
- Allowed inputs are:
-
- - An integer, e.g. ``5``.
- - A list or array of integers, e.g. ``[4, 3, 0]``.
- - A slice object with ints, e.g. ``1:7``.
- - A boolean array.
- - A ``callable`` function with one argument (the calling Series or
- DataFrame) that returns valid output for indexing (one of the above).
- This is useful in method chains, when you don't have a reference to the
- calling object, but would like to base your selection on some value.
- - A tuple of row and column indexes. The tuple elements consist of one of the
- above inputs, e.g. ``(0, 1)``.
-
- ``.iloc`` will raise ``IndexError`` if a requested indexer is
- out-of-bounds, except *slice* indexers which allow out-of-bounds
- indexing (this conforms with python/numpy *slice* semantics).
-
- See more at :ref:`Selection by Position <indexing.integer>`.
-
- See Also
- --------
- DataFrame.iat : Fast integer location scalar accessor.
- DataFrame.loc : Purely label-location based indexer for selection by label.
- Series.iloc : Purely integer-location based indexing for
- selection by position.
-
- Examples
- --------
- >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
- ... {'a': 100, 'b': 200, 'c': 300, 'd': 400},
- ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
- >>> df = pd.DataFrame(mydict)
- >>> df
- a b c d
- 0 1 2 3 4
- 1 100 200 300 400
- 2 1000 2000 3000 4000
-
- **Indexing just the rows**
-
- With a scalar integer.
-
- >>> type(df.iloc[0])
- <class 'pandas.core.series.Series'>
- >>> df.iloc[0]
- a 1
- b 2
- c 3
- d 4
- Name: 0, dtype: int64
-
- With a list of integers.
-
- >>> df.iloc[[0]]
- a b c d
- 0 1 2 3 4
- >>> type(df.iloc[[0]])
- <class 'pandas.core.frame.DataFrame'>
-
- >>> df.iloc[[0, 1]]
- a b c d
- 0 1 2 3 4
- 1 100 200 300 400
-
- With a `slice` object.
-
- >>> df.iloc[:3]
- a b c d
- 0 1 2 3 4
- 1 100 200 300 400
- 2 1000 2000 3000 4000
-
- With a boolean mask the same length as the index.
-
- >>> df.iloc[[True, False, True]]
- a b c d
- 0 1 2 3 4
- 2 1000 2000 3000 4000
-
- With a callable, useful in method chains. The `x` passed
- to the ``lambda`` is the DataFrame being sliced. This selects
- the rows whose index label is even.
-
- >>> df.iloc[lambda x: x.index % 2 == 0]
- a b c d
- 0 1 2 3 4
- 2 1000 2000 3000 4000
-
- **Indexing both axes**
-
- You can mix the indexer types for the index and columns. Use ``:`` to
- select the entire axis.
-
- With scalar integers.
-
- >>> df.iloc[0, 1]
- 2
-
- With lists of integers.
-
- >>> df.iloc[[0, 2], [1, 3]]
- b d
- 0 2 4
- 2 2000 4000
-
- With `slice` objects.
-
- >>> df.iloc[1:3, 0:3]
- a b c
- 1 100 200 300
- 2 1000 2000 3000
-
- With a boolean array whose length matches the columns.
-
- >>> df.iloc[:, [True, False, True, False]]
- a c
- 0 1 3
- 1 100 300
- 2 1000 3000
-
- With a callable function that expects the Series or DataFrame.
-
- >>> df.iloc[:, lambda df: [0, 2]]
- a c
- 0 1 3
- 1 100 300
- 2 1000 3000
- """
- return _iLocIndexer("iloc", self)
-
- @property
- def loc(self) -> _LocIndexer:
- """
- Access a group of rows and columns by label(s) or a boolean array.
-
- ``.loc[]`` is primarily label based, but may also be used with a
- boolean array.
-
- Allowed inputs are:
-
- - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
- interpreted as a *label* of the index, and **never** as an
- integer position along the index).
- - A list or array of labels, e.g. ``['a', 'b', 'c']``.
- - A slice object with labels, e.g. ``'a':'f'``.
-
- .. warning:: Note that contrary to usual python slices, **both** the
- start and the stop are included
-
- - A boolean array of the same length as the axis being sliced,
- e.g. ``[True, False, True]``.
- - An alignable boolean Series. The index of the key will be aligned before
- masking.
- - An alignable Index. The Index of the returned selection will be the input.
- - A ``callable`` function with one argument (the calling Series or
- DataFrame) that returns valid output for indexing (one of the above)
-
- See more at :ref:`Selection by Label <indexing.label>`.
-
- Raises
- ------
- KeyError
- If any items are not found.
- IndexingError
- If an indexed key is passed and its index is unalignable to the frame index.
-
- See Also
- --------
- DataFrame.at : Access a single value for a row/column label pair.
- DataFrame.iloc : Access group of rows and columns by integer position(s).
- DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the
- Series/DataFrame.
- Series.loc : Access group of values using labels.
-
- Examples
- --------
- **Getting values**
-
- >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
- ... index=['cobra', 'viper', 'sidewinder'],
- ... columns=['max_speed', 'shield'])
- >>> df
- max_speed shield
- cobra 1 2
- viper 4 5
- sidewinder 7 8
-
- Single label. Note this returns the row as a Series.
-
- >>> df.loc['viper']
- max_speed 4
- shield 5
- Name: viper, dtype: int64
-
- List of labels. Note using ``[[]]`` returns a DataFrame.
-
- >>> df.loc[['viper', 'sidewinder']]
- max_speed shield
- viper 4 5
- sidewinder 7 8
-
- Single label for row and column
-
- >>> df.loc['cobra', 'shield']
- 2
-
- Slice with labels for row and single label for column. As mentioned
- above, note that both the start and stop of the slice are included.
-
- >>> df.loc['cobra':'viper', 'max_speed']
- cobra 1
- viper 4
- Name: max_speed, dtype: int64
-
- Boolean list with the same length as the row axis
-
- >>> df.loc[[False, False, True]]
- max_speed shield
- sidewinder 7 8
-
- Alignable boolean Series:
-
- >>> df.loc[pd.Series([False, True, False],
- ... index=['viper', 'sidewinder', 'cobra'])]
- max_speed shield
- sidewinder 7 8
-
- Index (same behavior as ``df.reindex``)
-
- >>> df.loc[pd.Index(["cobra", "viper"], name="foo")]
- max_speed shield
- foo
- cobra 1 2
- viper 4 5
-
- Conditional that returns a boolean Series
-
- >>> df.loc[df['shield'] > 6]
- max_speed shield
- sidewinder 7 8
-
- Conditional that returns a boolean Series with column labels specified
-
- >>> df.loc[df['shield'] > 6, ['max_speed']]
- max_speed
- sidewinder 7
-
- Callable that returns a boolean Series
-
- >>> df.loc[lambda df: df['shield'] == 8]
- max_speed shield
- sidewinder 7 8
-
- **Setting values**
-
- Set value for all items matching the list of labels
-
- >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50
- >>> df
- max_speed shield
- cobra 1 2
- viper 4 50
- sidewinder 7 50
-
- Set value for an entire row
-
- >>> df.loc['cobra'] = 10
- >>> df
- max_speed shield
- cobra 10 10
- viper 4 50
- sidewinder 7 50
-
- Set value for an entire column
-
- >>> df.loc[:, 'max_speed'] = 30
- >>> df
- max_speed shield
- cobra 30 10
- viper 30 50
- sidewinder 30 50
-
- Set value for rows matching callable condition
-
- >>> df.loc[df['shield'] > 35] = 0
- >>> df
- max_speed shield
- cobra 30 10
- viper 0 0
- sidewinder 0 0
-
- **Getting values on a DataFrame with an index that has integer labels**
-
- Another example using integers for the index
-
- >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
- ... index=[7, 8, 9], columns=['max_speed', 'shield'])
- >>> df
- max_speed shield
- 7 1 2
- 8 4 5
- 9 7 8
-
- Slice with integer labels for rows. As mentioned above, note that both
- the start and stop of the slice are included.
-
- >>> df.loc[7:9]
- max_speed shield
- 7 1 2
- 8 4 5
- 9 7 8
-
- **Getting values with a MultiIndex**
-
- A number of examples using a DataFrame with a MultiIndex
-
- >>> tuples = [
- ... ('cobra', 'mark i'), ('cobra', 'mark ii'),
- ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'),
- ... ('viper', 'mark ii'), ('viper', 'mark iii')
- ... ]
- >>> index = pd.MultiIndex.from_tuples(tuples)
- >>> values = [[12, 2], [0, 4], [10, 20],
- ... [1, 4], [7, 1], [16, 36]]
- >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index)
- >>> df
- max_speed shield
- cobra mark i 12 2
- mark ii 0 4
- sidewinder mark i 10 20
- mark ii 1 4
- viper mark ii 7 1
- mark iii 16 36
-
- Single label. Note this returns a DataFrame with a single index.
-
- >>> df.loc['cobra']
- max_speed shield
- mark i 12 2
- mark ii 0 4
-
- Single index tuple. Note this returns a Series.
-
- >>> df.loc[('cobra', 'mark ii')]
- max_speed 0
- shield 4
- Name: (cobra, mark ii), dtype: int64
-
- Single label for row and column. Similar to passing in a tuple, this
- returns a Series.
-
- >>> df.loc['cobra', 'mark i']
- max_speed 12
- shield 2
- Name: (cobra, mark i), dtype: int64
-
- Single tuple. Note using ``[[]]`` returns a DataFrame.
-
- >>> df.loc[[('cobra', 'mark ii')]]
- max_speed shield
- cobra mark ii 0 4
-
- Single tuple for the index with a single label for the column
-
- >>> df.loc[('cobra', 'mark i'), 'shield']
- 2
-
- Slice from index tuple to single label
-
- >>> df.loc[('cobra', 'mark i'):'viper']
- max_speed shield
- cobra mark i 12 2
- mark ii 0 4
- sidewinder mark i 10 20
- mark ii 1 4
- viper mark ii 7 1
- mark iii 16 36
-
- Slice from index tuple to index tuple
-
- >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')]
- max_speed shield
- cobra mark i 12 2
- mark ii 0 4
- sidewinder mark i 10 20
- mark ii 1 4
- viper mark ii 7 1
-
- Please see the :ref:`user guide<advanced.advanced_hierarchical>`
- for more details and explanations of advanced indexing.
- """
- return _LocIndexer("loc", self)
-
- @property
- def at(self) -> _AtIndexer:
- """
- Access a single value for a row/column label pair.
-
- Similar to ``loc``, in that both provide label-based lookups. Use
- ``at`` if you only need to get or set a single value in a DataFrame
- or Series.
-
- Raises
- ------
- KeyError
- * If getting a value and 'label' does not exist in a DataFrame or
- Series.
- ValueError
- * If row/column label pair is not a tuple or if any label from
- the pair is not a scalar for DataFrame.
- * If label is list-like (*excluding* NamedTuple) for Series.
-
- See Also
- --------
- DataFrame.at : Access a single value for a row/column pair by label.
- DataFrame.iat : Access a single value for a row/column pair by integer
- position.
- DataFrame.loc : Access a group of rows and columns by label(s).
- DataFrame.iloc : Access a group of rows and columns by integer
- position(s).
- Series.at : Access a single value by label.
- Series.iat : Access a single value by integer position.
- Series.loc : Access a group of rows by label(s).
- Series.iloc : Access a group of rows by integer position(s).
-
- Notes
- -----
- See :ref:`Fast scalar value getting and setting <indexing.basics.get_value>`
- for more details.
-
- Examples
- --------
- >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
- ... index=[4, 5, 6], columns=['A', 'B', 'C'])
- >>> df
- A B C
- 4 0 2 3
- 5 0 4 1
- 6 10 20 30
-
- Get value at specified row/column pair
-
- >>> df.at[4, 'B']
- 2
-
- Set value at specified row/column pair
-
- >>> df.at[4, 'B'] = 10
- >>> df.at[4, 'B']
- 10
-
- Get value within a Series
-
- >>> df.loc[5].at['B']
- 4
- """
- return _AtIndexer("at", self)
-
- @property
- def iat(self) -> _iAtIndexer:
- """
- Access a single value for a row/column pair by integer position.
-
- Similar to ``iloc``, in that both provide integer-based lookups. Use
- ``iat`` if you only need to get or set a single value in a DataFrame
- or Series.
-
- Raises
- ------
- IndexError
- When integer position is out of bounds.
-
- See Also
- --------
- DataFrame.at : Access a single value for a row/column label pair.
- DataFrame.loc : Access a group of rows and columns by label(s).
- DataFrame.iloc : Access a group of rows and columns by integer position(s).
-
- Examples
- --------
- >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],
- ... columns=['A', 'B', 'C'])
- >>> df
- A B C
- 0 0 2 3
- 1 0 4 1
- 2 10 20 30
-
- Get value at specified row/column pair
-
- >>> df.iat[1, 2]
- 1
-
- Set value at specified row/column pair
-
- >>> df.iat[1, 2] = 10
- >>> df.iat[1, 2]
- 10
-
- Get value within a series
-
- >>> df.loc[0].iat[1]
- 2
- """
- return _iAtIndexer("iat", self)
-
-
-class _LocationIndexer(NDFrameIndexerBase):
- _valid_types: str
- axis: AxisInt | None = None
-
- # sub-classes need to set _takeable
- _takeable: bool
-
- @final
- def __call__(
- self: _LocationIndexerT, axis: Axis | None = None
- ) -> _LocationIndexerT:
- # we need to return a copy of ourselves
- new_self = type(self)(self.name, self.obj)
-
- if axis is not None:
- axis_int_none = self.obj._get_axis_number(axis)
- else:
- axis_int_none = axis
- new_self.axis = axis_int_none
- return new_self
-
- def _get_setitem_indexer(self, key):
- """
- Convert a potentially-label-based key into a positional indexer.
- """
- if self.name == "loc":
- # always holds here bc iloc overrides _get_setitem_indexer
- self._ensure_listlike_indexer(key)
-
- if isinstance(key, tuple):
- for x in key:
- check_dict_or_set_indexers(x)
-
- if self.axis is not None:
- key = _tupleize_axis_indexer(self.ndim, self.axis, key)
-
- ax = self.obj._get_axis(0)
-
- if isinstance(ax, MultiIndex) and self.name != "iloc" and is_hashable(key):
- with suppress(KeyError, InvalidIndexError):
- # TypeError e.g. passed a bool
- return ax.get_loc(key)
-
- if isinstance(key, tuple):
- with suppress(IndexingError):
- # suppress "Too many indexers"
- return self._convert_tuple(key)
-
- if isinstance(key, range):
- # GH#45479 test_loc_setitem_range_key
- key = list(key)
-
- return self._convert_to_indexer(key, axis=0)
-
- @final
- def _maybe_mask_setitem_value(self, indexer, value):
- """
- If we have obj.iloc[mask] = series_or_frame and series_or_frame has the
- same length as obj, we treat this as obj.iloc[mask] = series_or_frame[mask],
- similar to Series.__setitem__.
-
- Note this is only for loc, not iloc.
- """
-
- if (
- isinstance(indexer, tuple)
- and len(indexer) == 2
- and isinstance(value, (ABCSeries, ABCDataFrame))
- ):
- pi, icols = indexer
- ndim = value.ndim
- if com.is_bool_indexer(pi) and len(value) == len(pi):
- newkey = pi.nonzero()[0]
-
- if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1:
- # e.g. test_loc_setitem_boolean_mask_allfalse
- if len(newkey) == 0:
- # FIXME: kludge for test_loc_setitem_boolean_mask_allfalse
- # TODO(GH#45333): may be fixed when deprecation is enforced
-
- value = value.iloc[:0]
- else:
- # test_loc_setitem_ndframe_values_alignment
- value = self.obj.iloc._align_series(indexer, value)
- indexer = (newkey, icols)
-
- elif (
- isinstance(icols, np.ndarray)
- and icols.dtype.kind == "i"
- and len(icols) == 1
- ):
- if ndim == 1:
- # We implicitly broadcast, though numpy does not, see
- # github.com/pandas-dev/pandas/pull/45501#discussion_r789071825
- # test_loc_setitem_ndframe_values_alignment
- value = self.obj.iloc._align_series(indexer, value)
- indexer = (newkey, icols)
-
- elif ndim == 2 and value.shape[1] == 1:
- if len(newkey) == 0:
- # FIXME: kludge for
- # test_loc_setitem_all_false_boolean_two_blocks
- # TODO(GH#45333): may be fixed when deprecation is enforced
- value = value.iloc[:0]
- else:
- # test_loc_setitem_ndframe_values_alignment
- value = self.obj.iloc._align_frame(indexer, value)
- indexer = (newkey, icols)
- elif com.is_bool_indexer(indexer):
- indexer = indexer.nonzero()[0]
-
- return indexer, value
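-
- # Illustrative sketch of the docstring above (``df``, ``ser`` and "col" are
- # placeholder names): with a boolean mask the same length as the frame,
- # ``df.loc[mask, "col"] = ser`` is treated like assigning ``ser[mask]`` to the
- # selected rows, mirroring Series.__setitem__.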
-
- @final
- def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None:
- """
- Ensure that all column labels in a list-like key are present, adding any
- that do not already exist.
-
- Parameters
- ----------
- key : list-like of column labels
- Target labels.
- axis : key axis if known
- """
- column_axis = 1
-
- # column only exists in 2-dimensional DataFrame
- if self.ndim != 2:
- return
-
- orig_key = key
- if isinstance(key, tuple) and len(key) > 1:
- # key may be a tuple if we are .loc
- # if length of key is > 1 set key to column part
- key = key[column_axis]
- axis = column_axis
-
- if (
- axis == column_axis
- and not isinstance(self.obj.columns, MultiIndex)
- and is_list_like_indexer(key)
- and not com.is_bool_indexer(key)
- and all(is_hashable(k) for k in key)
- ):
- # GH#38148
- keys = self.obj.columns.union(key, sort=False)
- diff = Index(key).difference(self.obj.columns, sort=False)
-
- if len(diff) and com.is_null_slice(orig_key[0]):
- # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B"
- # is a new column, add the new columns with dtype=np.void
- # so that later when we go through setitem_single_column
- # we will use isetitem. Without this, the reindex_axis
- # below would create float64 columns in this example, which
- # would successfully hold 7, so we would end up with the wrong
- # dtype.
- indexer = np.arange(len(keys), dtype=np.intp)
- indexer[len(self.obj.columns) :] = -1
- new_mgr = self.obj._mgr.reindex_indexer(
- keys, indexer=indexer, axis=0, only_slice=True, use_na_proxy=True
- )
- self.obj._mgr = new_mgr
- return
-
- self.obj._mgr = self.obj._mgr.reindex_axis(keys, axis=0, only_slice=True)
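-
- # Illustrative effect (a sketch, names are placeholders): if ``df`` has only
- # column "A", then ``df.loc[:, ["A", "B"]] = 7`` first adds the missing "B"
- # column here, so the subsequent positional setitem can proceed.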
-
- @final
- def __setitem__(self, key, value) -> None:
- if not PYPY and using_copy_on_write():
- if sys.getrefcount(self.obj) <= 2:
- warnings.warn(
- _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
- )
-
- check_dict_or_set_indexers(key)
- if isinstance(key, tuple):
- key = tuple(list(x) if is_iterator(x) else x for x in key)
- key = tuple(com.apply_if_callable(x, self.obj) for x in key)
- else:
- key = com.apply_if_callable(key, self.obj)
- indexer = self._get_setitem_indexer(key)
- self._has_valid_setitem_indexer(key)
-
- iloc = self if self.name == "iloc" else self.obj.iloc
- iloc._setitem_with_indexer(indexer, value, self.name)
-
- def _validate_key(self, key, axis: AxisInt):
- """
- Ensure that key is valid for current indexer.
-
- Parameters
- ----------
- key : scalar, slice or list-like
- Key requested.
- axis : int
- Dimension on which the indexing is being made.
-
- Raises
- ------
- TypeError
- If the key (or some element of it) has wrong type.
- IndexError
- If the key (or some element of it) is out of bounds.
- KeyError
- If the key was not found.
- """
- raise AbstractMethodError(self)
-
- @final
- def _expand_ellipsis(self, tup: tuple) -> tuple:
- """
- If a tuple key includes an Ellipsis, replace it with an appropriate
- number of null slices.
- """
- if any(x is Ellipsis for x in tup):
- if tup.count(Ellipsis) > 1:
- raise IndexingError(_one_ellipsis_message)
-
- if len(tup) == self.ndim:
- # It is unambiguous what axis this Ellipsis is indexing,
- # treat as a single null slice.
- i = tup.index(Ellipsis)
- # FIXME: this assumes only one Ellipsis
- new_key = tup[:i] + (_NS,) + tup[i + 1 :]
- return new_key
-
- # TODO: other cases? only one test gets here, and that is covered
- # by _validate_key_length
- return tup
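
As a quick illustration of the Ellipsis handling above (a sketch against the vendored pandas 2.0.x): a single Ellipsis in a .loc/.iloc tuple expands to a null slice, and a second one is rejected with IndexingError.

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
# Ellipsis is replaced by slice(None) for the row axis:
assert df.loc[..., "A"].equals(df.loc[:, "A"])
try:
    df.loc[..., ..., "A"]   # more than one Ellipsis is ambiguous
except pd.errors.IndexingError as err:
    print(err)              # only one '...' entry is allowed
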
-
- @final
- def _validate_tuple_indexer(self, key: tuple) -> tuple:
- """
- Check the key for valid keys across my indexer.
- """
- key = self._validate_key_length(key)
- key = self._expand_ellipsis(key)
- for i, k in enumerate(key):
- try:
- self._validate_key(k, i)
- except ValueError as err:
- raise ValueError(
- "Location based indexing can only have "
- f"[{self._valid_types}] types"
- ) from err
- return key
-
- @final
- def _is_nested_tuple_indexer(self, tup: tuple) -> bool:
- """
- Returns
- -------
- bool
- """
- if any(isinstance(ax, MultiIndex) for ax in self.obj.axes):
- return any(is_nested_tuple(tup, ax) for ax in self.obj.axes)
- return False
-
- @final
- def _convert_tuple(self, key: tuple) -> tuple:
- # Note: we assume _tupleize_axis_indexer has been called, if necessary.
- self._validate_key_length(key)
- keyidx = [self._convert_to_indexer(k, axis=i) for i, k in enumerate(key)]
- return tuple(keyidx)
-
- @final
- def _validate_key_length(self, key: tuple) -> tuple:
- if len(key) > self.ndim:
- if key[0] is Ellipsis:
- # e.g. Series.iloc[..., 3] reduces to just Series.iloc[3]
- key = key[1:]
- if Ellipsis in key:
- raise IndexingError(_one_ellipsis_message)
- return self._validate_key_length(key)
- raise IndexingError("Too many indexers")
- return key
-
- @final
- def _getitem_tuple_same_dim(self, tup: tuple):
- """
- Index with indexers that should return an object of the same dimension
- as self.obj.
-
- This is only called after a failed call to _getitem_lowerdim.
- """
- retval = self.obj
- for i, key in enumerate(tup):
- if com.is_null_slice(key):
- continue
-
- retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
- # We should never have retval.ndim < self.ndim, as that should
- # be handled by the _getitem_lowerdim call above.
- assert retval.ndim == self.ndim
-
- if retval is self.obj:
- # if all axes were a null slice (`df.loc[:, :]`), ensure we still
- # return a new object (https://github.com/pandas-dev/pandas/pull/49469)
- retval = retval.copy(deep=False)
-
- return retval
-
- @final
- def _getitem_lowerdim(self, tup: tuple):
- # we can directly get the axis result since the axis is specified
- if self.axis is not None:
- axis = self.obj._get_axis_number(self.axis)
- return self._getitem_axis(tup, axis=axis)
-
- # we may have a nested tuples indexer here
- if self._is_nested_tuple_indexer(tup):
- return self._getitem_nested_tuple(tup)
-
-        # we may be using a tuple to represent multiple dimensions here
- ax0 = self.obj._get_axis(0)
- # ...but iloc should handle the tuple as simple integer-location
- # instead of checking it as multiindex representation (GH 13797)
- if (
- isinstance(ax0, MultiIndex)
- and self.name != "iloc"
- and not any(isinstance(x, slice) for x in tup)
- ):
- # Note: in all extant test cases, replacing the slice condition with
- # `all(is_hashable(x) or com.is_null_slice(x) for x in tup)`
- # is equivalent.
- # (see the other place where we call _handle_lowerdim_multi_index_axis0)
- with suppress(IndexingError):
- return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0(tup)
-
- tup = self._validate_key_length(tup)
-
- for i, key in enumerate(tup):
- if is_label_like(key):
- # We don't need to check for tuples here because those are
- # caught by the _is_nested_tuple_indexer check above.
- section = self._getitem_axis(key, axis=i)
-
- # We should never have a scalar section here, because
- # _getitem_lowerdim is only called after a check for
- # is_scalar_access, which that would be.
- if section.ndim == self.ndim:
- # we're in the middle of slicing through a MultiIndex
- # revise the key wrt to `section` by inserting an _NS
- new_key = tup[:i] + (_NS,) + tup[i + 1 :]
-
- else:
- # Note: the section.ndim == self.ndim check above
-                    # rules out having DataFrame here, so we don't need to worry
- # about transposing.
- new_key = tup[:i] + tup[i + 1 :]
-
- if len(new_key) == 1:
- new_key = new_key[0]
-
- # Slices should return views, but calling iloc/loc with a null
- # slice returns a new object.
- if com.is_null_slice(new_key):
- return section
- # This is an elided recursive call to iloc/loc
- return getattr(section, self.name)[new_key]
-
- raise IndexingError("not applicable")
-
- @final
- def _getitem_nested_tuple(self, tup: tuple):
- # we have a nested tuple so have at least 1 multi-index level
- # we should be able to match up the dimensionality here
-
- for key in tup:
- check_dict_or_set_indexers(key)
-
- # we have too many indexers for our dim, but have at least 1
- # multi-index dimension, try to see if we have something like
- # a tuple passed to a series with a multi-index
- if len(tup) > self.ndim:
- if self.name != "loc":
- # This should never be reached, but let's be explicit about it
- raise ValueError("Too many indices") # pragma: no cover
- if all(is_hashable(x) or com.is_null_slice(x) for x in tup):
- # GH#10521 Series should reduce MultiIndex dimensions instead of
- # DataFrame, IndexingError is not raised when slice(None,None,None)
- # with one row.
- with suppress(IndexingError):
- return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0(
- tup
- )
- elif isinstance(self.obj, ABCSeries) and any(
- isinstance(k, tuple) for k in tup
- ):
- # GH#35349 Raise if tuple in tuple for series
- # Do this after the all-hashable-or-null-slice check so that
- # we are only getting non-hashable tuples, in particular ones
- # that themselves contain a slice entry
- # See test_loc_series_getitem_too_many_dimensions
- raise IndexingError("Too many indexers")
-
- # this is a series with a multi-index specified a tuple of
- # selectors
- axis = self.axis or 0
- return self._getitem_axis(tup, axis=axis)
-
- # handle the multi-axis by taking sections and reducing
- # this is iterative
- obj = self.obj
- # GH#41369 Loop in reverse order ensures indexing along columns before rows
- # which selects only necessary blocks which avoids dtype conversion if possible
- axis = len(tup) - 1
- for key in tup[::-1]:
- if com.is_null_slice(key):
- axis -= 1
- continue
-
- obj = getattr(obj, self.name)._getitem_axis(key, axis=axis)
- axis -= 1
-
- # if we have a scalar, we are done
- if is_scalar(obj) or not hasattr(obj, "ndim"):
- break
-
- return obj
-
- def _convert_to_indexer(self, key, axis: AxisInt):
- raise AbstractMethodError(self)
-
- @final
- def __getitem__(self, key):
- check_dict_or_set_indexers(key)
- if type(key) is tuple:
- key = tuple(list(x) if is_iterator(x) else x for x in key)
- key = tuple(com.apply_if_callable(x, self.obj) for x in key)
- if self._is_scalar_access(key):
- return self.obj._get_value(*key, takeable=self._takeable)
- return self._getitem_tuple(key)
- else:
- # we by definition only have the 0th axis
- axis = self.axis or 0
-
- maybe_callable = com.apply_if_callable(key, self.obj)
- return self._getitem_axis(maybe_callable, axis=axis)
-
- def _is_scalar_access(self, key: tuple):
- raise NotImplementedError()
-
- def _getitem_tuple(self, tup: tuple):
- raise AbstractMethodError(self)
-
- def _getitem_axis(self, key, axis: AxisInt):
- raise NotImplementedError()
-
- def _has_valid_setitem_indexer(self, indexer) -> bool:
- raise AbstractMethodError(self)
-
- @final
- def _getbool_axis(self, key, axis: AxisInt):
- # caller is responsible for ensuring non-None axis
- labels = self.obj._get_axis(axis)
- key = check_bool_indexer(labels, key)
- inds = key.nonzero()[0]
- return self.obj._take_with_is_copy(inds, axis=axis)
-
-
-@doc(IndexingMixin.loc)
-class _LocIndexer(_LocationIndexer):
- _takeable: bool = False
- _valid_types = (
- "labels (MUST BE IN THE INDEX), slices of labels (BOTH "
- "endpoints included! Can be slices of integers if the "
- "index is integers), listlike of labels, boolean"
- )
-
- # -------------------------------------------------------------------
- # Key Checks
-
- @doc(_LocationIndexer._validate_key)
- def _validate_key(self, key, axis: Axis):
- # valid for a collection of labels (we check their presence later)
- # slice of labels (where start-end in labels)
- # slice of integers (only if in the labels)
- # boolean not in slice and with boolean index
- ax = self.obj._get_axis(axis)
- if isinstance(key, bool) and not (
- is_bool_dtype(ax)
- or ax.dtype.name == "boolean"
- or isinstance(ax, MultiIndex)
- and is_bool_dtype(ax.get_level_values(0))
- ):
- raise KeyError(
- f"{key}: boolean label can not be used without a boolean index"
- )
-
- if isinstance(key, slice) and (
- isinstance(key.start, bool) or isinstance(key.stop, bool)
- ):
- raise TypeError(f"{key}: boolean values can not be used in a slice")
-
- def _has_valid_setitem_indexer(self, indexer) -> bool:
- return True
-
- def _is_scalar_access(self, key: tuple) -> bool:
- """
- Returns
- -------
- bool
- """
- # this is a shortcut accessor to both .loc and .iloc
-        # that provides access equivalent to .at and .iat
- # a) avoid getting things via sections and (to minimize dtype changes)
- # b) provide a performant path
- if len(key) != self.ndim:
- return False
-
- for i, k in enumerate(key):
- if not is_scalar(k):
- return False
-
- ax = self.obj.axes[i]
- if isinstance(ax, MultiIndex):
- return False
-
- if isinstance(k, str) and ax._supports_partial_string_indexing:
- # partial string indexing, df.loc['2000', 'A']
- # should not be considered scalar
- return False
-
- if not ax._index_as_unique:
- return False
-
- return True
-
- # -------------------------------------------------------------------
- # MultiIndex Handling
-
- def _multi_take_opportunity(self, tup: tuple) -> bool:
- """
- Check whether there is the possibility to use ``_multi_take``.
-
- Currently the limit is that all axes being indexed, must be indexed with
- list-likes.
-
- Parameters
- ----------
- tup : tuple
- Tuple of indexers, one per axis.
-
- Returns
- -------
- bool
-            Whether the current indexing
-            can be passed through `_multi_take`.
- """
- if not all(is_list_like_indexer(x) for x in tup):
- return False
-
- # just too complicated
- return not any(com.is_bool_indexer(x) for x in tup)
-
- def _multi_take(self, tup: tuple):
- """
-        Create the indexers for the passed tuple of keys and
-        execute the take operation. This allows the take operation to be
-        executed all at once, rather than once for each dimension,
-        improving efficiency.
-
- Parameters
- ----------
- tup : tuple
- Tuple of indexers, one per axis.
-
- Returns
- -------
- values: same type as the object being indexed
- """
- # GH 836
- d = {
- axis: self._get_listlike_indexer(key, axis)
- for (key, axis) in zip(tup, self.obj._AXIS_ORDERS)
- }
- return self.obj._reindex_with_indexers(d, copy=True, allow_dups=True)
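
_multi_take is the fast path taken when every axis of a .loc call is indexed with a list-like of labels: the whole selection becomes one combined reindex instead of one take per axis. A sketch of the user-visible behaviour:

import pandas as pd

df = pd.DataFrame(
    {"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]},
    index=["a", "b", "c"],
)
# List-likes on both axes -> eligible for _multi_take (GH 836).
sub = df.loc[["a", "c"], ["x", "z"]]
print(sub)
#    x  z
# a  1  7
# c  3  9
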
-
- # -------------------------------------------------------------------
-
- def _getitem_iterable(self, key, axis: AxisInt):
- """
- Index current object with an iterable collection of keys.
-
- Parameters
- ----------
- key : iterable
- Targeted labels.
- axis : int
- Dimension on which the indexing is being made.
-
- Raises
- ------
- KeyError
- If no key was found. Will change in the future to raise if not all
- keys were found.
-
- Returns
- -------
- scalar, DataFrame, or Series: indexed value(s).
- """
- # we assume that not com.is_bool_indexer(key), as that is
- # handled before we get here.
- self._validate_key(key, axis)
-
- # A collection of keys
- keyarr, indexer = self._get_listlike_indexer(key, axis)
- return self.obj._reindex_with_indexers(
- {axis: [keyarr, indexer]}, copy=True, allow_dups=True
- )
-
- def _getitem_tuple(self, tup: tuple):
- with suppress(IndexingError):
- tup = self._expand_ellipsis(tup)
- return self._getitem_lowerdim(tup)
-
- # no multi-index, so validate all of the indexers
- tup = self._validate_tuple_indexer(tup)
-
- # ugly hack for GH #836
- if self._multi_take_opportunity(tup):
- return self._multi_take(tup)
-
- return self._getitem_tuple_same_dim(tup)
-
- def _get_label(self, label, axis: AxisInt):
- # GH#5567 this will fail if the label is not present in the axis.
- return self.obj.xs(label, axis=axis)
-
- def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
- # we have an axis0 multi-index, handle or raise
- axis = self.axis or 0
- try:
- # fast path for series or for tup devoid of slices
- return self._get_label(tup, axis=axis)
-
- except KeyError as ek:
- # raise KeyError if number of indexers match
- # else IndexingError will be raised
- if self.ndim < len(tup) <= self.obj.index.nlevels:
- raise ek
- raise IndexingError("No label returned") from ek
-
- def _getitem_axis(self, key, axis: AxisInt):
- key = item_from_zerodim(key)
- if is_iterator(key):
- key = list(key)
- if key is Ellipsis:
- key = slice(None)
-
- labels = self.obj._get_axis(axis)
-
- if isinstance(key, tuple) and isinstance(labels, MultiIndex):
- key = tuple(key)
-
- if isinstance(key, slice):
- self._validate_key(key, axis)
- return self._get_slice_axis(key, axis=axis)
- elif com.is_bool_indexer(key):
- return self._getbool_axis(key, axis=axis)
- elif is_list_like_indexer(key):
- # an iterable multi-selection
- if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)):
- if hasattr(key, "ndim") and key.ndim > 1:
- raise ValueError("Cannot index with multidimensional key")
-
- return self._getitem_iterable(key, axis=axis)
-
- # nested tuple slicing
- if is_nested_tuple(key, labels):
- locs = labels.get_locs(key)
- indexer = [slice(None)] * self.ndim
- indexer[axis] = locs
- return self.obj.iloc[tuple(indexer)]
-
- # fall thru to straight lookup
- self._validate_key(key, axis)
- return self._get_label(key, axis=axis)
-
- def _get_slice_axis(self, slice_obj: slice, axis: AxisInt):
- """
- This is pretty simple as we just have to deal with labels.
- """
- # caller is responsible for ensuring non-None axis
- obj = self.obj
- if not need_slice(slice_obj):
- return obj.copy(deep=False)
-
- labels = obj._get_axis(axis)
- indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
-
- if isinstance(indexer, slice):
- return self.obj._slice(indexer, axis=axis)
- else:
- # DatetimeIndex overrides Index.slice_indexer and may
- # return a DatetimeIndex instead of a slice object.
- return self.obj.take(indexer, axis=axis)
-
- def _convert_to_indexer(self, key, axis: AxisInt):
- """
- Convert indexing key into something we can use to do actual fancy
- indexing on a ndarray.
-
- Examples
- ix[:5] -> slice(0, 5)
- ix[[1,2,3]] -> [1,2,3]
- ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)
-
- Going by Zen of Python?
- 'In the face of ambiguity, refuse the temptation to guess.'
- raise AmbiguousIndexError with integer labels?
- - No, prefer label-based indexing
- """
- labels = self.obj._get_axis(axis)
-
- if isinstance(key, slice):
- return labels._convert_slice_indexer(key, kind="loc")
-
- if (
- isinstance(key, tuple)
- and not isinstance(labels, MultiIndex)
- and self.ndim < 2
- and len(key) > 1
- ):
- raise IndexingError("Too many indexers")
-
- if is_scalar(key) or (isinstance(labels, MultiIndex) and is_hashable(key)):
- # Otherwise get_loc will raise InvalidIndexError
-
- # if we are a label return me
- try:
- return labels.get_loc(key)
- except LookupError:
- if isinstance(key, tuple) and isinstance(labels, MultiIndex):
- if len(key) == labels.nlevels:
- return {"key": key}
- raise
- except InvalidIndexError:
- # GH35015, using datetime as column indices raises exception
- if not isinstance(labels, MultiIndex):
- raise
- except ValueError:
- if not is_integer(key):
- raise
- return {"key": key}
-
- if is_nested_tuple(key, labels):
- if self.ndim == 1 and any(isinstance(k, tuple) for k in key):
- # GH#35349 Raise if tuple in tuple for series
- raise IndexingError("Too many indexers")
- return labels.get_locs(key)
-
- elif is_list_like_indexer(key):
- if is_iterator(key):
- key = list(key)
-
- if com.is_bool_indexer(key):
- key = check_bool_indexer(labels, key)
- return key
- else:
- return self._get_listlike_indexer(key, axis)[1]
- else:
- try:
- return labels.get_loc(key)
- except LookupError:
- # allow a not found key only if we are a setter
- if not is_list_like_indexer(key):
- return {"key": key}
- raise
-
- def _get_listlike_indexer(self, key, axis: AxisInt):
- """
- Transform a list-like of keys into a new index and an indexer.
-
- Parameters
- ----------
- key : list-like
- Targeted labels.
- axis: int
- Dimension on which the indexing is being made.
-
- Raises
- ------
- KeyError
- If at least one key was requested but none was found.
-
- Returns
- -------
- keyarr: Index
- New index (coinciding with 'key' if the axis is unique).
- values : array-like
- Indexer for the return object, -1 denotes keys not found.
- """
- ax = self.obj._get_axis(axis)
- axis_name = self.obj._get_axis_name(axis)
-
- keyarr, indexer = ax._get_indexer_strict(key, axis_name)
-
- return keyarr, indexer
-
-
-@doc(IndexingMixin.iloc)
-class _iLocIndexer(_LocationIndexer):
- _valid_types = (
- "integer, integer slice (START point is INCLUDED, END "
- "point is EXCLUDED), listlike of integers, boolean array"
- )
- _takeable = True
-
- # -------------------------------------------------------------------
- # Key Checks
-
- def _validate_key(self, key, axis: AxisInt):
- if com.is_bool_indexer(key):
- if hasattr(key, "index") and isinstance(key.index, Index):
- if key.index.inferred_type == "integer":
- raise NotImplementedError(
- "iLocation based boolean "
- "indexing on an integer type "
- "is not available"
- )
- raise ValueError(
- "iLocation based boolean indexing cannot use "
- "an indexable as a mask"
- )
- return
-
- if isinstance(key, slice):
- return
- elif is_integer(key):
- self._validate_integer(key, axis)
- elif isinstance(key, tuple):
- # a tuple should already have been caught by this point
- # so don't treat a tuple as a valid indexer
- raise IndexingError("Too many indexers")
- elif is_list_like_indexer(key):
- if isinstance(key, ABCSeries):
- arr = key._values
- elif is_array_like(key):
- arr = key
- else:
- arr = np.array(key)
- len_axis = len(self.obj._get_axis(axis))
-
- # check that the key has a numeric dtype
- if not is_numeric_dtype(arr.dtype):
- raise IndexError(f".iloc requires numeric indexers, got {arr}")
-
- # check that the key does not exceed the maximum size of the index
- if len(arr) and (arr.max() >= len_axis or arr.min() < -len_axis):
- raise IndexError("positional indexers are out-of-bounds")
- else:
- raise ValueError(f"Can only index by location with a [{self._valid_types}]")
-
- def _has_valid_setitem_indexer(self, indexer) -> bool:
- """
-        Validate that a positional indexer cannot enlarge its target;
-        raise if needed. Does not modify the indexer externally.
-
- Returns
- -------
- bool
- """
- if isinstance(indexer, dict):
- raise IndexError("iloc cannot enlarge its target object")
-
- if isinstance(indexer, ABCDataFrame):
- raise TypeError(
- "DataFrame indexer for .iloc is not supported. "
- "Consider using .loc with a DataFrame indexer for automatic alignment.",
- )
-
- if not isinstance(indexer, tuple):
- indexer = _tuplify(self.ndim, indexer)
-
- for ax, i in zip(self.obj.axes, indexer):
- if isinstance(i, slice):
- # should check the stop slice?
- pass
- elif is_list_like_indexer(i):
- # should check the elements?
- pass
- elif is_integer(i):
- if i >= len(ax):
- raise IndexError("iloc cannot enlarge its target object")
- elif isinstance(i, dict):
- raise IndexError("iloc cannot enlarge its target object")
-
- return True
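
The validation above is what keeps .iloc strictly positional on assignment: positions outside the current length raise instead of enlarging, whereas label-based .loc may expand. A small sketch of the contrast:

import pandas as pd

s = pd.Series([10, 20, 30])
try:
    s.iloc[3] = 40          # position 3 does not exist
except IndexError as err:
    print(err)              # iloc cannot enlarge its target object

s.loc[3] = 40               # .loc may enlarge: label 3 is appended
print(s.tolist())           # [10, 20, 30, 40]
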
-
- def _is_scalar_access(self, key: tuple) -> bool:
- """
- Returns
- -------
- bool
- """
- # this is a shortcut accessor to both .loc and .iloc
-        # that provides access equivalent to .at and .iat
- # a) avoid getting things via sections and (to minimize dtype changes)
- # b) provide a performant path
- if len(key) != self.ndim:
- return False
-
- return all(is_integer(k) for k in key)
-
- def _validate_integer(self, key: int, axis: AxisInt) -> None:
- """
- Check that 'key' is a valid position in the desired axis.
-
- Parameters
- ----------
- key : int
- Requested position.
- axis : int
- Desired axis.
-
- Raises
- ------
- IndexError
- If 'key' is not a valid position in axis 'axis'.
- """
- len_axis = len(self.obj._get_axis(axis))
- if key >= len_axis or key < -len_axis:
- raise IndexError("single positional indexer is out-of-bounds")
-
- # -------------------------------------------------------------------
-
- def _getitem_tuple(self, tup: tuple):
- tup = self._validate_tuple_indexer(tup)
- with suppress(IndexingError):
- return self._getitem_lowerdim(tup)
-
- return self._getitem_tuple_same_dim(tup)
-
- def _get_list_axis(self, key, axis: AxisInt):
- """
- Return Series values by list or array of integers.
-
- Parameters
- ----------
- key : list-like positional indexer
- axis : int
-
- Returns
- -------
- Series object
-
- Notes
- -----
- `axis` can only be zero.
- """
- try:
- return self.obj._take_with_is_copy(key, axis=axis)
- except IndexError as err:
- # re-raise with different error message
- raise IndexError("positional indexers are out-of-bounds") from err
-
- def _getitem_axis(self, key, axis: AxisInt):
- if key is Ellipsis:
- key = slice(None)
- elif isinstance(key, ABCDataFrame):
- raise IndexError(
- "DataFrame indexer is not allowed for .iloc\n"
- "Consider using .loc for automatic alignment."
- )
-
- if isinstance(key, slice):
- return self._get_slice_axis(key, axis=axis)
-
- if is_iterator(key):
- key = list(key)
-
- if isinstance(key, list):
- key = np.asarray(key)
-
- if com.is_bool_indexer(key):
- self._validate_key(key, axis)
- return self._getbool_axis(key, axis=axis)
-
- # a list of integers
- elif is_list_like_indexer(key):
- return self._get_list_axis(key, axis=axis)
-
- # a single integer
- else:
- key = item_from_zerodim(key)
- if not is_integer(key):
- raise TypeError("Cannot index by location index with a non-integer key")
-
- # validate the location
- self._validate_integer(key, axis)
-
- return self.obj._ixs(key, axis=axis)
-
- def _get_slice_axis(self, slice_obj: slice, axis: AxisInt):
- # caller is responsible for ensuring non-None axis
- obj = self.obj
-
- if not need_slice(slice_obj):
- return obj.copy(deep=False)
-
- labels = obj._get_axis(axis)
- labels._validate_positional_slice(slice_obj)
- return self.obj._slice(slice_obj, axis=axis)
-
- def _convert_to_indexer(self, key, axis: AxisInt):
- """
- Much simpler as we only have to deal with our valid types.
- """
- return key
-
- def _get_setitem_indexer(self, key):
- # GH#32257 Fall through to let numpy do validation
- if is_iterator(key):
- key = list(key)
-
- if self.axis is not None:
- key = _tupleize_axis_indexer(self.ndim, self.axis, key)
-
- return key
-
- # -------------------------------------------------------------------
-
- def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
- """
- _setitem_with_indexer is for setting values on a Series/DataFrame
- using positional indexers.
-
- If the relevant keys are not present, the Series/DataFrame may be
- expanded.
-
- This method is currently broken when dealing with non-unique Indexes,
- since it goes from positional indexers back to labels when calling
- BlockManager methods, see GH#12991, GH#22046, GH#15686.
- """
- info_axis = self.obj._info_axis_number
-
- # maybe partial set
- take_split_path = not self.obj._mgr.is_single_block
-
- if not take_split_path and isinstance(value, ABCDataFrame):
- # Avoid cast of values
- take_split_path = not value._mgr.is_single_block
-
- # if there is only one block/type, still have to take split path
- # unless the block is one-dimensional or it can hold the value
- if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1:
- # in case of dict, keys are indices
- val = list(value.values()) if isinstance(value, dict) else value
- arr = self.obj._mgr.arrays[0]
- take_split_path = not can_hold_element(
- arr, extract_array(val, extract_numpy=True)
- )
-
- # if we have any multi-indexes that have non-trivial slices
- # (not null slices) then we must take the split path, xref
- # GH 10360, GH 27841
- if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes):
- for i, ax in zip(indexer, self.obj.axes):
- if isinstance(ax, MultiIndex) and not (
- is_integer(i) or com.is_null_slice(i)
- ):
- take_split_path = True
- break
-
- if isinstance(indexer, tuple):
- nindexer = []
- for i, idx in enumerate(indexer):
- if isinstance(idx, dict):
- # reindex the axis to the new value
- # and set inplace
- key, _ = convert_missing_indexer(idx)
-
- # if this is the items axes, then take the main missing
- # path first
- # this correctly sets the dtype and avoids cache issues
- # essentially this separates out the block that is needed
- # to possibly be modified
- if self.ndim > 1 and i == info_axis:
- # add the new item, and set the value
- # must have all defined axes if we have a scalar
- # or a list-like on the non-info axes if we have a
- # list-like
- if not len(self.obj):
- if not is_list_like_indexer(value):
- raise ValueError(
- "cannot set a frame with no "
- "defined index and a scalar"
- )
- self.obj[key] = value
- return
-
- # add a new item with the dtype setup
- if com.is_null_slice(indexer[0]):
- # We are setting an entire column
- self.obj[key] = value
- return
- elif is_array_like(value):
- # GH#42099
- arr = extract_array(value, extract_numpy=True)
- taker = -1 * np.ones(len(self.obj), dtype=np.intp)
- empty_value = algos.take_nd(arr, taker)
- if not isinstance(value, ABCSeries):
- # if not Series (in which case we need to align),
- # we can short-circuit
- if (
- isinstance(arr, np.ndarray)
- and arr.ndim == 1
- and len(arr) == 1
- ):
- # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
- arr = arr[0, ...]
- empty_value[indexer[0]] = arr
- self.obj[key] = empty_value
- return
-
- self.obj[key] = empty_value
-
- else:
- # FIXME: GH#42099#issuecomment-864326014
- self.obj[key] = infer_fill_value(value)
-
- new_indexer = convert_from_missing_indexer_tuple(
- indexer, self.obj.axes
- )
- self._setitem_with_indexer(new_indexer, value, name)
-
- return
-
- # reindex the axis
- # make sure to clear the cache because we are
- # just replacing the block manager here
- # so the object is the same
- index = self.obj._get_axis(i)
- labels = index.insert(len(index), key)
-
- # We are expanding the Series/DataFrame values to match
- # the length of thenew index `labels`. GH#40096 ensure
-                        # the length of the new index `labels`. GH#40096 ensure
- taker = np.arange(len(index) + 1, dtype=np.intp)
- taker[-1] = -1
- reindexers = {i: (labels, taker)}
- new_obj = self.obj._reindex_with_indexers(
- reindexers, allow_dups=True
- )
- self.obj._mgr = new_obj._mgr
- self.obj._maybe_update_cacher(clear=True)
- self.obj._is_copy = None
-
- nindexer.append(labels.get_loc(key))
-
- else:
- nindexer.append(idx)
-
- indexer = tuple(nindexer)
- else:
- indexer, missing = convert_missing_indexer(indexer)
-
- if missing:
- self._setitem_with_indexer_missing(indexer, value)
- return
-
- if name == "loc":
- # must come after setting of missing
- indexer, value = self._maybe_mask_setitem_value(indexer, value)
-
- # align and set the values
- if take_split_path:
- # We have to operate column-wise
- self._setitem_with_indexer_split_path(indexer, value, name)
- else:
- self._setitem_single_block(indexer, value, name)
-
- def _setitem_with_indexer_split_path(self, indexer, value, name: str):
- """
- Setitem column-wise.
- """
- # Above we only set take_split_path to True for 2D cases
- assert self.ndim == 2
-
- if not isinstance(indexer, tuple):
- indexer = _tuplify(self.ndim, indexer)
- if len(indexer) > self.ndim:
- raise IndexError("too many indices for array")
- if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2:
- raise ValueError(r"Cannot set values with ndim > 2")
-
- if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict):
- from pandas import Series
-
- value = self._align_series(indexer, Series(value))
-
- # Ensure we have something we can iterate over
- info_axis = indexer[1]
- ilocs = self._ensure_iterable_column_indexer(info_axis)
-
- pi = indexer[0]
- lplane_indexer = length_of_indexer(pi, self.obj.index)
- # lplane_indexer gives the expected length of obj[indexer[0]]
-
- # we need an iterable, with a ndim of at least 1
- # eg. don't pass through np.array(0)
- if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0:
- if isinstance(value, ABCDataFrame):
- self._setitem_with_indexer_frame_value(indexer, value, name)
-
- elif np.ndim(value) == 2:
- # TODO: avoid np.ndim call in case it isn't an ndarray, since
- # that will construct an ndarray, which will be wasteful
- self._setitem_with_indexer_2d_value(indexer, value)
-
- elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
- # We are setting multiple rows in a single column.
- self._setitem_single_column(ilocs[0], value, pi)
-
- elif len(ilocs) == 1 and 0 != lplane_indexer != len(value):
- # We are trying to set N values into M entries of a single
- # column, which is invalid for N != M
- # Exclude zero-len for e.g. boolean masking that is all-false
-
- if len(value) == 1 and not is_integer(info_axis):
- # This is a case like df.iloc[:3, [1]] = [0]
- # where we treat as df.iloc[:3, 1] = 0
- return self._setitem_with_indexer((pi, info_axis[0]), value[0])
-
- raise ValueError(
- "Must have equal len keys and value "
- "when setting with an iterable"
- )
-
- elif lplane_indexer == 0 and len(value) == len(self.obj.index):
- # We get here in one case via .loc with a all-False mask
- pass
-
- elif self._is_scalar_access(indexer) and is_object_dtype(
- self.obj.dtypes[ilocs[0]]
- ):
- # We are setting nested data, only possible for object dtype data
- self._setitem_single_column(indexer[1], value, pi)
-
- elif len(ilocs) == len(value):
- # We are setting multiple columns in a single row.
- for loc, v in zip(ilocs, value):
- self._setitem_single_column(loc, v, pi)
-
- elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0:
- # This is a setitem-with-expansion, see
- # test_loc_setitem_empty_append_expands_rows_mixed_dtype
- # e.g. df = DataFrame(columns=["x", "y"])
- # df["x"] = df["x"].astype(np.int64)
- # df.loc[:, "x"] = [1, 2, 3]
- self._setitem_single_column(ilocs[0], value, pi)
-
- else:
- raise ValueError(
- "Must have equal len keys and value "
- "when setting with an iterable"
- )
-
- else:
- # scalar value
- for loc in ilocs:
- self._setitem_single_column(loc, value, pi)
-
- def _setitem_with_indexer_2d_value(self, indexer, value):
- # We get here with np.ndim(value) == 2, excluding DataFrame,
- # which goes through _setitem_with_indexer_frame_value
- pi = indexer[0]
-
- ilocs = self._ensure_iterable_column_indexer(indexer[1])
-
- if not is_array_like(value):
- # cast lists to array
- value = np.array(value, dtype=object)
- if len(ilocs) != value.shape[1]:
- raise ValueError(
- "Must have equal len keys and value when setting with an ndarray"
- )
-
- for i, loc in enumerate(ilocs):
- value_col = value[:, i]
- if is_object_dtype(value_col.dtype):
- # casting to list so that we do type inference in setitem_single_column
- value_col = value_col.tolist()
- self._setitem_single_column(loc, value_col, pi)
-
- def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str):
- ilocs = self._ensure_iterable_column_indexer(indexer[1])
-
- sub_indexer = list(indexer)
- pi = indexer[0]
-
- multiindex_indexer = isinstance(self.obj.columns, MultiIndex)
-
- unique_cols = value.columns.is_unique
-
- # We do not want to align the value in case of iloc GH#37728
- if name == "iloc":
- for i, loc in enumerate(ilocs):
- val = value.iloc[:, i]
- self._setitem_single_column(loc, val, pi)
-
- elif not unique_cols and value.columns.equals(self.obj.columns):
- # We assume we are already aligned, see
- # test_iloc_setitem_frame_duplicate_columns_multiple_blocks
- for loc in ilocs:
- item = self.obj.columns[loc]
- if item in value:
- sub_indexer[1] = item
- val = self._align_series(
- tuple(sub_indexer),
- value.iloc[:, loc],
- multiindex_indexer,
- )
- else:
- val = np.nan
-
- self._setitem_single_column(loc, val, pi)
-
- elif not unique_cols:
- raise ValueError("Setting with non-unique columns is not allowed.")
-
- else:
- for loc in ilocs:
- item = self.obj.columns[loc]
- if item in value:
- sub_indexer[1] = item
- val = self._align_series(
- tuple(sub_indexer), value[item], multiindex_indexer
- )
- else:
- val = np.nan
-
- self._setitem_single_column(loc, val, pi)
-
- def _setitem_single_column(self, loc: int, value, plane_indexer) -> None:
- """
-        Set values into a single column of the object, operating in place
-        where possible and falling back to casting otherwise.
-
-        Parameters
-        ----------
-        loc : int
-            Indexer for column position
-        value : scalar or array-like
-            Value(s) to set into the column.
-        plane_indexer : int, slice, listlike[int]
-            The indexer we use for setitem along axis=0.
- """
- pi = plane_indexer
-
- is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj))
-
- is_null_setter = com.is_empty_slice(pi) or is_array_like(pi) and len(pi) == 0
-
- if is_null_setter:
- # no-op, don't cast dtype later
- return
-
- elif is_full_setter:
- try:
- self.obj._mgr.column_setitem(
- loc, plane_indexer, value, inplace_only=True
- )
- except (ValueError, TypeError, LossySetitemError):
- # If we're setting an entire column and we can't do it inplace,
- # then we can use value's dtype (or inferred dtype)
- # instead of object
- self.obj.isetitem(loc, value)
- else:
- # set value into the column (first attempting to operate inplace, then
- # falling back to casting if necessary)
- self.obj._mgr.column_setitem(loc, plane_indexer, value)
-
- self.obj._clear_item_cache()
-
- def _setitem_single_block(self, indexer, value, name: str) -> None:
- """
- _setitem_with_indexer for the case when we have a single Block.
- """
- from pandas import Series
-
- info_axis = self.obj._info_axis_number
- item_labels = self.obj._get_axis(info_axis)
- if isinstance(indexer, tuple):
- # if we are setting on the info axis ONLY
- # set using those methods to avoid block-splitting
- # logic here
- if (
- self.ndim == len(indexer) == 2
- and is_integer(indexer[1])
- and com.is_null_slice(indexer[0])
- ):
- col = item_labels[indexer[info_axis]]
- if len(item_labels.get_indexer_for([col])) == 1:
- # e.g. test_loc_setitem_empty_append_expands_rows
- loc = item_labels.get_loc(col)
- self._setitem_single_column(loc, value, indexer[0])
- return
-
- indexer = maybe_convert_ix(*indexer) # e.g. test_setitem_frame_align
-
- if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict):
- # TODO(EA): ExtensionBlock.setitem this causes issues with
- # setting for extensionarrays that store dicts. Need to decide
- # if it's worth supporting that.
- value = self._align_series(indexer, Series(value))
-
- elif isinstance(value, ABCDataFrame) and name != "iloc":
- value = self._align_frame(indexer, value)._values
-
- # check for chained assignment
- self.obj._check_is_chained_assignment_possible()
-
- # actually do the set
- self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
- self.obj._maybe_update_cacher(clear=True, inplace=True)
-
- def _setitem_with_indexer_missing(self, indexer, value):
- """
- Insert new row(s) or column(s) into the Series or DataFrame.
- """
- from pandas import Series
-
- # reindex the axis to the new value
- # and set inplace
- if self.ndim == 1:
- index = self.obj.index
- new_index = index.insert(len(index), indexer)
-
- # we have a coerced indexer, e.g. a float
- # that matches in an int64 Index, so
- # we will not create a duplicate index, rather
- # index to that element
- # e.g. 0.0 -> 0
- # GH#12246
- if index.is_unique:
-                # pass new_index[-1:] instead of [new_index[-1]]
- # so that we retain dtype
- new_indexer = index.get_indexer(new_index[-1:])
- if (new_indexer != -1).any():
- # We get only here with loc, so can hard code
- return self._setitem_with_indexer(new_indexer, value, "loc")
-
- # this preserves dtype of the value and of the object
- if not is_scalar(value):
- new_dtype = None
-
- elif is_valid_na_for_dtype(value, self.obj.dtype):
- if not is_object_dtype(self.obj.dtype):
- # Every NA value is suitable for object, no conversion needed
- value = na_value_for_dtype(self.obj.dtype, compat=False)
-
- new_dtype = maybe_promote(self.obj.dtype, value)[0]
-
- elif isna(value):
- new_dtype = None
- elif not self.obj.empty and not is_object_dtype(self.obj.dtype):
- # We should not cast, if we have object dtype because we can
- # set timedeltas into object series
- curr_dtype = self.obj.dtype
- curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype)
- new_dtype = maybe_promote(curr_dtype, value)[0]
- else:
- new_dtype = None
-
- new_values = Series([value], dtype=new_dtype)._values
-
- if len(self.obj._values):
- # GH#22717 handle casting compatibility that np.concatenate
- # does incorrectly
- new_values = concat_compat([self.obj._values, new_values])
- self.obj._mgr = self.obj._constructor(
- new_values, index=new_index, name=self.obj.name
- )._mgr
- self.obj._maybe_update_cacher(clear=True)
-
- elif self.ndim == 2:
- if not len(self.obj.columns):
- # no columns and scalar
- raise ValueError("cannot set a frame with no defined columns")
-
- has_dtype = hasattr(value, "dtype")
- if isinstance(value, ABCSeries):
- # append a Series
- value = value.reindex(index=self.obj.columns, copy=True)
- value.name = indexer
- elif isinstance(value, dict):
- value = Series(
- value, index=self.obj.columns, name=indexer, dtype=object
- )
- else:
- # a list-list
- if is_list_like_indexer(value):
- # must have conforming columns
- if len(value) != len(self.obj.columns):
- raise ValueError("cannot set a row with mismatched columns")
-
- value = Series(value, index=self.obj.columns, name=indexer)
-
- if not len(self.obj):
- # We will ignore the existing dtypes instead of using
- # internals.concat logic
- df = value.to_frame().T
-
- idx = self.obj.index
- if isinstance(idx, MultiIndex):
- name = idx.names
- else:
- name = idx.name
-
- df.index = Index([indexer], name=name)
- if not has_dtype:
- # i.e. if we already had a Series or ndarray, keep that
- # dtype. But if we had a list or dict, then do inference
- df = df.infer_objects(copy=False)
- self.obj._mgr = df._mgr
- else:
- self.obj._mgr = self.obj._append(value)._mgr
- self.obj._maybe_update_cacher(clear=True)
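
_setitem_with_indexer_missing is the expansion path reached when a .loc setter refers to a label that does not exist yet: a Series grows by one element, a DataFrame by one row built from a list/dict/Series conformed to the columns. A short sketch of what callers observe:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
df.loc["z"] = [5, 6]        # missing label -> row appended; the list
print(df.loc["z"].tolist()) # must match the number of columns: [5, 6]

s = pd.Series([1.0, 2.0])
s.loc[2] = 3.0              # Series expansion: new label 2 appended
print(s.tolist())           # [1.0, 2.0, 3.0]
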
-
- def _ensure_iterable_column_indexer(self, column_indexer):
- """
- Ensure that our column indexer is something that can be iterated over.
- """
- ilocs: Sequence[int] | np.ndarray
- if is_integer(column_indexer):
- ilocs = [column_indexer]
- elif isinstance(column_indexer, slice):
- ilocs = np.arange(len(self.obj.columns))[column_indexer]
- elif isinstance(column_indexer, np.ndarray) and is_bool_dtype(
- column_indexer.dtype
- ):
- ilocs = np.arange(len(column_indexer))[column_indexer]
- else:
- ilocs = column_indexer
- return ilocs
-
- def _align_series(self, indexer, ser: Series, multiindex_indexer: bool = False):
- """
- Parameters
- ----------
- indexer : tuple, slice, scalar
- Indexer used to get the locations that will be set to `ser`.
- ser : pd.Series
- Values to assign to the locations specified by `indexer`.
- multiindex_indexer : bool, optional
- Defaults to False. Should be set to True if `indexer` was from
- a `pd.MultiIndex`, to avoid unnecessary broadcasting.
-
- Returns
- -------
- `np.array` of `ser` broadcast to the appropriate shape for assignment
- to the locations selected by `indexer`
- """
- if isinstance(indexer, (slice, np.ndarray, list, Index)):
- indexer = (indexer,)
-
- if isinstance(indexer, tuple):
- # flatten np.ndarray indexers
- def ravel(i):
- return i.ravel() if isinstance(i, np.ndarray) else i
-
- indexer = tuple(map(ravel, indexer))
-
- aligners = [not com.is_null_slice(idx) for idx in indexer]
- sum_aligners = sum(aligners)
- single_aligner = sum_aligners == 1
- is_frame = self.ndim == 2
- obj = self.obj
-
- # are we a single alignable value on a non-primary
-        # dim (e.g. panel: 1, 2, or frame: 0)?
- # hence need to align to a single axis dimension
-        # rather than find all valid dims
-
- # frame
- if is_frame:
- single_aligner = single_aligner and aligners[0]
-
- # we have a frame, with multiple indexers on both axes; and a
- # series, so need to broadcast (see GH5206)
- if sum_aligners == self.ndim and all(is_sequence(_) for _ in indexer):
- ser_values = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values
-
- # single indexer
- if len(indexer) > 1 and not multiindex_indexer:
- len_indexer = len(indexer[1])
- ser_values = (
- np.tile(ser_values, len_indexer).reshape(len_indexer, -1).T
- )
-
- return ser_values
-
- for i, idx in enumerate(indexer):
- ax = obj.axes[i]
-
- # multiple aligners (or null slices)
- if is_sequence(idx) or isinstance(idx, slice):
- if single_aligner and com.is_null_slice(idx):
- continue
- new_ix = ax[idx]
- if not is_list_like_indexer(new_ix):
- new_ix = Index([new_ix])
- else:
- new_ix = Index(new_ix)
- if ser.index.equals(new_ix) or not len(new_ix):
- return ser._values.copy()
-
- return ser.reindex(new_ix)._values
-
- # 2 dims
- elif single_aligner:
- # reindex along index
- ax = self.obj.axes[1]
- if ser.index.equals(ax) or not len(ax):
- return ser._values.copy()
- return ser.reindex(ax)._values
-
- elif is_integer(indexer) and self.ndim == 1:
- if is_object_dtype(self.obj):
- return ser
- ax = self.obj._get_axis(0)
-
- if ser.index.equals(ax):
- return ser._values.copy()
-
- return ser.reindex(ax)._values[indexer]
-
- elif is_integer(indexer):
- ax = self.obj._get_axis(1)
-
- if ser.index.equals(ax):
- return ser._values.copy()
-
- return ser.reindex(ax)._values
-
- raise ValueError("Incompatible indexer with Series")
-
- def _align_frame(self, indexer, df: DataFrame) -> DataFrame:
- is_frame = self.ndim == 2
-
- if isinstance(indexer, tuple):
- idx, cols = None, None
- sindexers = []
- for i, ix in enumerate(indexer):
- ax = self.obj.axes[i]
- if is_sequence(ix) or isinstance(ix, slice):
- if isinstance(ix, np.ndarray):
- ix = ix.ravel()
- if idx is None:
- idx = ax[ix]
- elif cols is None:
- cols = ax[ix]
- else:
- break
- else:
- sindexers.append(i)
-
- if idx is not None and cols is not None:
- if df.index.equals(idx) and df.columns.equals(cols):
- val = df.copy()
- else:
- val = df.reindex(idx, columns=cols)
- return val
-
- elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame:
- ax = self.obj.index[indexer]
- if df.index.equals(ax):
- val = df.copy()
- else:
- # we have a multi-index and are trying to align
- # with a particular, level GH3738
- if (
- isinstance(ax, MultiIndex)
- and isinstance(df.index, MultiIndex)
- and ax.nlevels != df.index.nlevels
- ):
- raise TypeError(
-                        "cannot align on a multi-index without "
- "specifying the join levels"
- )
-
- val = df.reindex(index=ax)
- return val
-
- raise ValueError("Incompatible indexer with DataFrame")
-
-
-class _ScalarAccessIndexer(NDFrameIndexerBase):
- """
- Access scalars quickly.
- """
-
- # sub-classes need to set _takeable
- _takeable: bool
-
- def _convert_key(self, key):
- raise AbstractMethodError(self)
-
- def __getitem__(self, key):
- if not isinstance(key, tuple):
- # we could have a convertible item here (e.g. Timestamp)
- if not is_list_like_indexer(key):
- key = (key,)
- else:
- raise ValueError("Invalid call for scalar access (getting)!")
-
- key = self._convert_key(key)
- return self.obj._get_value(*key, takeable=self._takeable)
-
- def __setitem__(self, key, value) -> None:
- if isinstance(key, tuple):
- key = tuple(com.apply_if_callable(x, self.obj) for x in key)
- else:
- # scalar callable may return tuple
- key = com.apply_if_callable(key, self.obj)
-
- if not isinstance(key, tuple):
- key = _tuplify(self.ndim, key)
- key = list(self._convert_key(key))
- if len(key) != self.ndim:
- raise ValueError("Not enough indexers for scalar access (setting)!")
-
- self.obj._set_value(*key, value=value, takeable=self._takeable)
-
-
-@doc(IndexingMixin.at)
-class _AtIndexer(_ScalarAccessIndexer):
- _takeable = False
-
- def _convert_key(self, key):
- """
-        Require the keys to be the same type as the index (so we don't
-        fall back).
- """
- # GH 26989
- # For series, unpacking key needs to result in the label.
- # This is already the case for len(key) == 1; e.g. (1,)
- if self.ndim == 1 and len(key) > 1:
- key = (key,)
-
- return key
-
- @property
- def _axes_are_unique(self) -> bool:
- # Only relevant for self.ndim == 2
- assert self.ndim == 2
- return self.obj.index.is_unique and self.obj.columns.is_unique
-
- def __getitem__(self, key):
- if self.ndim == 2 and not self._axes_are_unique:
- # GH#33041 fall back to .loc
- if not isinstance(key, tuple) or not all(is_scalar(x) for x in key):
- raise ValueError("Invalid call for scalar access (getting)!")
- return self.obj.loc[key]
-
- return super().__getitem__(key)
-
- def __setitem__(self, key, value):
- if self.ndim == 2 and not self._axes_are_unique:
- # GH#33041 fall back to .loc
- if not isinstance(key, tuple) or not all(is_scalar(x) for x in key):
- raise ValueError("Invalid call for scalar access (setting)!")
-
- self.obj.loc[key] = value
- return
-
- return super().__setitem__(key, value)
-
-
-@doc(IndexingMixin.iat)
-class _iAtIndexer(_ScalarAccessIndexer):
- _takeable = True
-
- def _convert_key(self, key):
- """
-        Require integer args (and convert to label arguments).
- """
- for i in key:
- if not is_integer(i):
- raise ValueError("iAt based indexing can only have integer indexers")
- return key
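
`.at` and `.iat` are the scalar fast paths built on _ScalarAccessIndexer: `.at` takes labels (falling back to .loc when the axes are not unique, per GH#33041 above), while `.iat` accepts integer positions only. A minimal usage sketch:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
print(df.at["y", "b"])      # 4  (label based)
print(df.iat[1, 1])         # 4  (position based)

df.at["x", "a"] = 10        # scalar set through the same fast path
try:
    df.iat["x", "a"]        # non-integer keys are rejected
except ValueError as err:
    print(err)              # iAt based indexing can only have integer indexers
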
-
-
-def _tuplify(ndim: int, loc: Hashable) -> tuple[Hashable | slice, ...]:
- """
- Given an indexer for the first dimension, create an equivalent tuple
- for indexing over all dimensions.
-
- Parameters
- ----------
- ndim : int
- loc : object
-
- Returns
- -------
- tuple
- """
- _tup: list[Hashable | slice]
- _tup = [slice(None, None) for _ in range(ndim)]
- _tup[0] = loc
- return tuple(_tup)
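
_tuplify simply pads a single-axis indexer out to the object's dimensionality with null slices. To make the docstring concrete (note that _tuplify is a private helper and may change between versions):

from pandas.core.indexing import _tuplify

assert _tuplify(1, "a") == ("a",)
assert _tuplify(2, 0) == (0, slice(None, None))
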
-
-
-def _tupleize_axis_indexer(ndim: int, axis: AxisInt, key) -> tuple:
- """
- If we have an axis, adapt the given key to be axis-independent.
- """
- new_key = [slice(None)] * ndim
- new_key[axis] = key
- return tuple(new_key)
-
-
-def check_bool_indexer(index: Index, key) -> np.ndarray:
- """
- Check if key is a valid boolean indexer for an object with such index and
- perform reindexing or conversion if needed.
-
- This function assumes that is_bool_indexer(key) == True.
-
- Parameters
- ----------
- index : Index
- Index of the object on which the indexing is done.
- key : list-like
- Boolean indexer to check.
-
- Returns
- -------
- np.array
- Resulting key.
-
- Raises
- ------
- IndexError
- If the key does not have the same length as index.
- IndexingError
- If the index of the key is unalignable to index.
- """
- result = key
- if isinstance(key, ABCSeries) and not key.index.equals(index):
- indexer = result.index.get_indexer_for(index)
- if -1 in indexer:
- raise IndexingError(
- "Unalignable boolean Series provided as "
- "indexer (index of the boolean Series and of "
- "the indexed object do not match)."
- )
-
- result = result.take(indexer)
-
- # fall through for boolean
- if not is_extension_array_dtype(result.dtype):
- return result.astype(bool)._values
-
- if is_object_dtype(key):
- # key might be object-dtype bool, check_array_indexer needs bool array
- result = np.asarray(result, dtype=bool)
- elif not is_array_like(result):
- # GH 33924
- # key may contain nan elements, check_array_indexer needs bool array
- result = pd_array(result, dtype=bool)
- return check_array_indexer(index, result)
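
check_bool_indexer is why a boolean Series used as a mask is aligned by label before it is applied, and why a mask whose index cannot be aligned raises instead of being silently reordered. A sketch against the vendored pandas 2.0.x:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])
# Same labels, different order: the mask is reindexed to df.index first,
# so "y" and "z" are selected regardless of the mask's own ordering.
mask = pd.Series([True, True, False], index=["z", "y", "x"])
print(df[mask].index.tolist())   # ['y', 'z']

bad = pd.Series([True, False], index=["x", "q"])
try:
    df[bad]                      # "q" cannot be aligned to df.index
except pd.errors.IndexingError as err:
    print(type(err).__name__)    # IndexingError
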
-
-
-def convert_missing_indexer(indexer):
- """
-    Reverse convert a missing indexer, which is a dict;
-    return the scalar indexer and a boolean indicating if we converted.
- """
- if isinstance(indexer, dict):
- # a missing key (but not a tuple indexer)
- indexer = indexer["key"]
-
- if isinstance(indexer, bool):
- raise KeyError("cannot use a single bool to index into setitem")
- return indexer, True
-
- return indexer, False
-
-
-def convert_from_missing_indexer_tuple(indexer, axes):
- """
- Create a filtered indexer that doesn't have any missing indexers.
- """
-
- def get_indexer(_i, _idx):
- return axes[_i].get_loc(_idx["key"]) if isinstance(_idx, dict) else _idx
-
- return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer))
-
-
-def maybe_convert_ix(*args):
- """
- We likely want to take the cross-product.
- """
- for arg in args:
- if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)):
- return args
- return np.ix_(*args)
-
-
-def is_nested_tuple(tup, labels) -> bool:
- """
- Returns
- -------
- bool
- """
- # check for a compatible nested tuple and multiindexes among the axes
- if not isinstance(tup, tuple):
- return False
-
- for k in tup:
- if is_list_like(k) or isinstance(k, slice):
- return isinstance(labels, MultiIndex)
-
- return False
-
-
-def is_label_like(key) -> bool:
- """
- Returns
- -------
- bool
- """
- # select a label or row
- return (
- not isinstance(key, slice)
- and not is_list_like_indexer(key)
- and key is not Ellipsis
- )
-
-
-def need_slice(obj: slice) -> bool:
- """
- Returns
- -------
- bool
- """
- return (
- obj.start is not None
- or obj.stop is not None
- or (obj.step is not None and obj.step != 1)
- )
-
-
-def check_dict_or_set_indexers(key) -> None:
- """
- Check if the indexer is or contains a dict or set, which is no longer allowed.
- """
- if (
- isinstance(key, set)
- or isinstance(key, tuple)
- and any(isinstance(x, set) for x in key)
- ):
- raise TypeError(
- "Passing a set as an indexer is not supported. Use a list instead."
- )
-
- if (
- isinstance(key, dict)
- or isinstance(key, tuple)
- and any(isinstance(x, dict) for x in key)
- ):
- raise TypeError(
- "Passing a dict as an indexer is not supported. Use a list instead."
- )
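
check_dict_or_set_indexers turns what used to be a deprecation into a hard error: sets and dicts are not accepted as indexers in .loc/.iloc/[] (pandas 2.0 behaviour, which is the version vendored here). For example:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
try:
    df.loc[:, {"a", "b"}]    # sets are unordered -> rejected
except TypeError as err:
    print(err)  # Passing a set as an indexer is not supported. Use a list instead.

df.loc[:, ["a", "b"]]        # the supported spelling
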
diff --git a/contrib/python/pandas/py3/pandas/core/interchange/__init__.py b/contrib/python/pandas/py3/pandas/core/interchange/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/interchange/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/interchange/buffer.py b/contrib/python/pandas/py3/pandas/core/interchange/buffer.py
deleted file mode 100644
index 0f62dd00a0f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/interchange/buffer.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas.core.interchange.dataframe_protocol import (
- Buffer,
- DlpackDeviceType,
-)
-from pandas.util.version import Version
-
-_NUMPY_HAS_DLPACK = Version(np.__version__) >= Version("1.22.0")
-
-
-class PandasBuffer(Buffer):
- """
- Data in the buffer is guaranteed to be contiguous in memory.
- """
-
- def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None:
- """
- Handle only regular columns (= numpy arrays) for now.
- """
- if not x.strides == (x.dtype.itemsize,):
- # The protocol does not support strided buffers, so a copy is
- # necessary. If that's not allowed, we need to raise an exception.
- if allow_copy:
- x = x.copy()
- else:
- raise RuntimeError(
- "Exports cannot be zero-copy in the case "
- "of a non-contiguous buffer"
- )
-
- # Store the numpy array in which the data resides as a private
- # attribute, so we can use it to retrieve the public attributes
- self._x = x
-
- @property
- def bufsize(self) -> int:
- """
- Buffer size in bytes.
- """
- return self._x.size * self._x.dtype.itemsize
-
- @property
- def ptr(self) -> int:
- """
- Pointer to start of the buffer as an integer.
- """
- return self._x.__array_interface__["data"][0]
-
- def __dlpack__(self):
- """
- Represent this structure as DLPack interface.
- """
- if _NUMPY_HAS_DLPACK:
- return self._x.__dlpack__()
- raise NotImplementedError("__dlpack__")
-
- def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
- """
- Device type and device ID for where the data in the buffer resides.
- """
- return (DlpackDeviceType.CPU, None)
-
- def __repr__(self) -> str:
- return (
- "PandasBuffer("
- + str(
- {
- "bufsize": self.bufsize,
- "ptr": self.ptr,
- "device": self.__dlpack_device__()[0].name,
- }
- )
- + ")"
- )
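
PandasBuffer is the buffer object handed out through the DataFrame interchange protocol. In user code the protocol is normally driven via DataFrame.__dataframe__() on the producer side and pandas.api.interchange.from_dataframe() on the consumer side (both public since pandas 1.5; from_dataframe lives outside the files shown in this diff). A small sketch:

import pandas as pd
from pandas.api.interchange import from_dataframe

df = pd.DataFrame({"a": [1, 2, 3], "b": [0.5, 1.5, 2.5]})

xchg = df.__dataframe__()            # interchange object (producer side)
col = xchg.get_column_by_name("a")   # PandasColumn, see column.py below
buf, dtype = col.get_buffers()["data"]
print(buf.bufsize, buf.ptr != 0)     # size in bytes, non-null pointer

print(from_dataframe(df).shape)      # consumer side round-trip: (3, 2)
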
diff --git a/contrib/python/pandas/py3/pandas/core/interchange/column.py b/contrib/python/pandas/py3/pandas/core/interchange/column.py
deleted file mode 100644
index d8c6a58e774..00000000000
--- a/contrib/python/pandas/py3/pandas/core/interchange/column.py
+++ /dev/null
@@ -1,377 +0,0 @@
-from __future__ import annotations
-
-from typing import Any
-
-import numpy as np
-
-from pandas._libs.lib import infer_dtype
-from pandas._libs.tslibs import iNaT
-from pandas.errors import NoBufferPresent
-from pandas.util._decorators import cache_readonly
-
-import pandas as pd
-from pandas.api.types import (
- is_categorical_dtype,
- is_string_dtype,
-)
-from pandas.core.interchange.buffer import PandasBuffer
-from pandas.core.interchange.dataframe_protocol import (
- Column,
- ColumnBuffers,
- ColumnNullType,
- DtypeKind,
-)
-from pandas.core.interchange.utils import (
- ArrowCTypes,
- Endianness,
- dtype_to_arrow_c_fmt,
-)
-
-_NP_KINDS = {
- "i": DtypeKind.INT,
- "u": DtypeKind.UINT,
- "f": DtypeKind.FLOAT,
- "b": DtypeKind.BOOL,
- "U": DtypeKind.STRING,
- "M": DtypeKind.DATETIME,
- "m": DtypeKind.DATETIME,
-}
-
-_NULL_DESCRIPTION = {
- DtypeKind.FLOAT: (ColumnNullType.USE_NAN, None),
- DtypeKind.DATETIME: (ColumnNullType.USE_SENTINEL, iNaT),
- DtypeKind.INT: (ColumnNullType.NON_NULLABLE, None),
- DtypeKind.UINT: (ColumnNullType.NON_NULLABLE, None),
- DtypeKind.BOOL: (ColumnNullType.NON_NULLABLE, None),
- # Null values for categoricals are stored as `-1` sentinel values
-    # in the category data (e.g., `col.values.codes` is int8 np.ndarray)
- DtypeKind.CATEGORICAL: (ColumnNullType.USE_SENTINEL, -1),
- # follow Arrow in using 1 as valid value and 0 for missing/null value
- DtypeKind.STRING: (ColumnNullType.USE_BYTEMASK, 0),
-}
-
-_NO_VALIDITY_BUFFER = {
- ColumnNullType.NON_NULLABLE: "This column is non-nullable",
- ColumnNullType.USE_NAN: "This column uses NaN as null",
- ColumnNullType.USE_SENTINEL: "This column uses a sentinel value",
-}
-
-
-class PandasColumn(Column):
- """
- A column object, with only the methods and properties required by the
- interchange protocol defined.
- A column can contain one or more chunks. Each chunk can contain up to three
- buffers - a data buffer, a mask buffer (depending on null representation),
- and an offsets buffer (if variable-size binary; e.g., variable-length
- strings).
- Note: this Column object can only be produced by ``__dataframe__``, so
- doesn't need its own version or ``__column__`` protocol.
- """
-
- def __init__(self, column: pd.Series, allow_copy: bool = True) -> None:
- """
- Note: doesn't deal with extension arrays yet, just assume a regular
- Series/ndarray for now.
- """
- if not isinstance(column, pd.Series):
- raise NotImplementedError(f"Columns of type {type(column)} not handled yet")
-
- # Store the column as a private attribute
- self._col = column
- self._allow_copy = allow_copy
-
- def size(self) -> int:
- """
- Size of the column, in elements.
- """
- return self._col.size
-
- @property
- def offset(self) -> int:
- """
- Offset of first element. Always zero.
- """
- # TODO: chunks are implemented now, probably this should return something
- return 0
-
- @cache_readonly
- def dtype(self) -> tuple[DtypeKind, int, str, str]:
- dtype = self._col.dtype
-
- if is_categorical_dtype(dtype):
- codes = self._col.values.codes
- (
- _,
- bitwidth,
- c_arrow_dtype_f_str,
- _,
- ) = self._dtype_from_pandasdtype(codes.dtype)
- return (
- DtypeKind.CATEGORICAL,
- bitwidth,
- c_arrow_dtype_f_str,
- Endianness.NATIVE,
- )
- elif is_string_dtype(dtype):
- if infer_dtype(self._col) == "string":
- return (
- DtypeKind.STRING,
- 8,
- dtype_to_arrow_c_fmt(dtype),
- Endianness.NATIVE,
- )
- raise NotImplementedError("Non-string object dtypes are not supported yet")
- else:
- return self._dtype_from_pandasdtype(dtype)
-
- def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
- """
- See `self.dtype` for details.
- """
- # Note: 'c' (complex) not handled yet (not in array spec v1).
-        # 'b', 'B' (bytes), 'S', 'a' (old-style string), 'V' (void) not handled
- # datetime and timedelta both map to datetime (is timedelta handled?)
-
- kind = _NP_KINDS.get(dtype.kind, None)
- if kind is None:
- # Not a NumPy dtype. Check if it's a categorical maybe
- raise ValueError(f"Data type {dtype} not supported by interchange protocol")
-
- return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), dtype.byteorder
-
- @property
- def describe_categorical(self):
- """
- If the dtype is categorical, there are two options:
- - There are only values in the data buffer.
- - There is a separate non-categorical Column encoding for categorical values.
-
- Raises TypeError if the dtype is not categorical
-
- Content of returned dict:
- - "is_ordered" : bool, whether the ordering of dictionary indices is
- semantically meaningful.
- - "is_dictionary" : bool, whether a dictionary-style mapping of
- categorical values to other objects exists
- - "categories" : Column representing the (implicit) mapping of indices to
- category values (e.g. an array of cat1, cat2, ...).
- None if not a dictionary-style categorical.
- """
- if not self.dtype[0] == DtypeKind.CATEGORICAL:
- raise TypeError(
- "describe_categorical only works on a column with categorical dtype!"
- )
-
- return {
- "is_ordered": self._col.cat.ordered,
- "is_dictionary": True,
- "categories": PandasColumn(pd.Series(self._col.cat.categories)),
- }
-
- @property
- def describe_null(self):
- kind = self.dtype[0]
- try:
- null, value = _NULL_DESCRIPTION[kind]
- except KeyError:
- raise NotImplementedError(f"Data type {kind} not yet supported")
-
- return null, value
-
- @cache_readonly
- def null_count(self) -> int:
- """
- Number of null elements. Should always be known.
- """
- return self._col.isna().sum().item()
-
- @property
- def metadata(self) -> dict[str, pd.Index]:
- """
- Store specific metadata of the column.
- """
- return {"pandas.index": self._col.index}
-
- def num_chunks(self) -> int:
- """
- Return the number of chunks the column consists of.
- """
- return 1
-
- def get_chunks(self, n_chunks: int | None = None):
- """
- Return an iterator yielding the chunks.
- See `DataFrame.get_chunks` for details on ``n_chunks``.
- """
- if n_chunks and n_chunks > 1:
- size = len(self._col)
- step = size // n_chunks
- if size % n_chunks != 0:
- step += 1
- for start in range(0, step * n_chunks, step):
- yield PandasColumn(
- self._col.iloc[start : start + step], self._allow_copy
- )
- else:
- yield self
-
- def get_buffers(self) -> ColumnBuffers:
- """
- Return a dictionary containing the underlying buffers.
- The returned dictionary has the following contents:
- - "data": a two-element tuple whose first element is a buffer
- containing the data and whose second element is the data
- buffer's associated dtype.
- - "validity": a two-element tuple whose first element is a buffer
- containing mask values indicating missing data and
- whose second element is the mask value buffer's
- associated dtype. None if the null representation is
- not a bit or byte mask.
- - "offsets": a two-element tuple whose first element is a buffer
- containing the offset values for variable-size binary
- data (e.g., variable-length strings) and whose second
- element is the offsets buffer's associated dtype. None
- if the data buffer does not have an associated offsets
- buffer.
- """
- buffers: ColumnBuffers = {
- "data": self._get_data_buffer(),
- "validity": None,
- "offsets": None,
- }
-
- try:
- buffers["validity"] = self._get_validity_buffer()
- except NoBufferPresent:
- pass
-
- try:
- buffers["offsets"] = self._get_offsets_buffer()
- except NoBufferPresent:
- pass
-
- return buffers
-
- def _get_data_buffer(
- self,
- ) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple
- """
- Return the buffer containing the data and the buffer's associated dtype.
- """
- if self.dtype[0] in (
- DtypeKind.INT,
- DtypeKind.UINT,
- DtypeKind.FLOAT,
- DtypeKind.BOOL,
- DtypeKind.DATETIME,
- ):
- buffer = PandasBuffer(self._col.to_numpy(), allow_copy=self._allow_copy)
- dtype = self.dtype
- elif self.dtype[0] == DtypeKind.CATEGORICAL:
- codes = self._col.values._codes
- buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
- dtype = self._dtype_from_pandasdtype(codes.dtype)
- elif self.dtype[0] == DtypeKind.STRING:
- # Marshal the strings from a NumPy object array into a byte array
- buf = self._col.to_numpy()
- b = bytearray()
-
- # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later
- for obj in buf:
- if isinstance(obj, str):
- b.extend(obj.encode(encoding="utf-8"))
-
- # Convert the byte array to a Pandas "buffer" using
- # a NumPy array as the backing store
- buffer = PandasBuffer(np.frombuffer(b, dtype="uint8"))
-
- # Define the dtype for the returned buffer
- dtype = (
- DtypeKind.STRING,
- 8,
- ArrowCTypes.STRING,
- Endianness.NATIVE,
- ) # note: currently only support native endianness
- else:
- raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
-
- return buffer, dtype
-
- def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
- """
- Return the buffer containing the mask values indicating missing data and
- the buffer's associated dtype.
- Raises NoBufferPresent if null representation is not a bit or byte mask.
- """
- null, invalid = self.describe_null
-
- if self.dtype[0] == DtypeKind.STRING:
- # For now, use byte array as the mask.
- # TODO: maybe store as bit array to save space?..
- buf = self._col.to_numpy()
-
- # Determine the encoding for valid values
- valid = invalid == 0
- invalid = not valid
-
- mask = np.zeros(shape=(len(buf),), dtype=np.bool_)
- for i, obj in enumerate(buf):
- mask[i] = valid if isinstance(obj, str) else invalid
-
- # Convert the mask array to a Pandas "buffer" using
- # a NumPy array as the backing store
- buffer = PandasBuffer(mask)
-
- # Define the dtype of the returned buffer
- dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
-
- return buffer, dtype
-
- try:
- msg = f"{_NO_VALIDITY_BUFFER[null]} so does not have a separate mask"
- except KeyError:
- # TODO: implement for other bit/byte masks?
- raise NotImplementedError("See self.describe_null")
-
- raise NoBufferPresent(msg)
-
- def _get_offsets_buffer(self) -> tuple[PandasBuffer, Any]:
- """
- Return the buffer containing the offset values for variable-size binary
- data (e.g., variable-length strings) and the buffer's associated dtype.
- Raises NoBufferPresent if the data buffer does not have an associated
- offsets buffer.
- """
- if self.dtype[0] == DtypeKind.STRING:
- # For each string, we need to manually determine the next offset
- values = self._col.to_numpy()
- ptr = 0
- offsets = np.zeros(shape=(len(values) + 1,), dtype=np.int64)
- for i, v in enumerate(values):
- # For missing values (in this case, `np.nan` values)
- # we don't increment the pointer
- if isinstance(v, str):
- b = v.encode(encoding="utf-8")
- ptr += len(b)
-
- offsets[i + 1] = ptr
-
- # Convert the offsets to a Pandas "buffer" using
- # the NumPy array as the backing store
- buffer = PandasBuffer(offsets)
-
- # Assemble the buffer dtype info
- dtype = (
- DtypeKind.INT,
- 64,
- ArrowCTypes.INT64,
- Endianness.NATIVE,
- ) # note: currently only support native endianness
- else:
- raise NoBufferPresent(
- "This column has a fixed-length dtype so "
- "it does not have an offsets buffer"
- )
-
- return buffer, dtype
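
Note (not part of the diff): the PandasColumn implementation removed above is the producer side of the dataframe interchange protocol. As a rough, hedged sketch of what a consumer sees for a string column, assuming a pandas build (>= 1.5) that still ships this code; the printed values are approximate:

    import pandas as pd

    # Inspect the buffers a string column exposes through the protocol.
    df = pd.DataFrame({"s": ["a", None, "bcd"]})
    col = df.__dataframe__().get_column_by_name("s")

    print(col.dtype)          # e.g. (<DtypeKind.STRING: 21>, 8, 'u', '=')
    print(col.describe_null)  # e.g. (<ColumnNullType.USE_BYTEMASK: 4>, 0)

    bufs = col.get_buffers()
    data_buf, data_dtype = bufs["data"]              # UTF-8 code units as uint8
    offsets_buf, offsets_dtype = bufs["offsets"]     # int64 start/stop offsets
    validity_buf, validity_dtype = bufs["validity"]  # byte mask, 1 = valid
    print(data_buf.bufsize, offsets_buf.bufsize, validity_buf.bufsize)
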
diff --git a/contrib/python/pandas/py3/pandas/core/interchange/dataframe.py b/contrib/python/pandas/py3/pandas/core/interchange/dataframe.py
deleted file mode 100644
index 0de9b130f0a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/interchange/dataframe.py
+++ /dev/null
@@ -1,111 +0,0 @@
-from __future__ import annotations
-
-from collections import abc
-from typing import TYPE_CHECKING
-
-from pandas.core.interchange.column import PandasColumn
-from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- )
-
-
-class PandasDataFrameXchg(DataFrameXchg):
- """
- A data frame class, with only the methods required by the interchange
- protocol defined.
- Instances of this (private) class are returned from
- ``pd.DataFrame.__dataframe__`` as objects with the methods and
- attributes defined on this class.
- """
-
- def __init__(
- self, df: DataFrame, nan_as_null: bool = False, allow_copy: bool = True
- ) -> None:
- """
- Constructor - an instance of this (private) class is returned from
- `pd.DataFrame.__dataframe__`.
- """
- self._df = df
- # ``nan_as_null`` is a keyword intended for the consumer to tell the
- # producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
- # This currently has no effect; once support for nullable extension
- # dtypes is added, this value should be propagated to columns.
- self._nan_as_null = nan_as_null
- self._allow_copy = allow_copy
-
- def __dataframe__(
- self, nan_as_null: bool = False, allow_copy: bool = True
- ) -> PandasDataFrameXchg:
- return PandasDataFrameXchg(self._df, nan_as_null, allow_copy)
-
- @property
- def metadata(self) -> dict[str, Index]:
- # `index` isn't a regular column, and the protocol doesn't support row
- # labels - so we export it as Pandas-specific metadata here.
- return {"pandas.index": self._df.index}
-
- def num_columns(self) -> int:
- return len(self._df.columns)
-
- def num_rows(self) -> int:
- return len(self._df)
-
- def num_chunks(self) -> int:
- return 1
-
- def column_names(self) -> Index:
- return self._df.columns
-
- def get_column(self, i: int) -> PandasColumn:
- return PandasColumn(self._df.iloc[:, i], allow_copy=self._allow_copy)
-
- def get_column_by_name(self, name: str) -> PandasColumn:
- return PandasColumn(self._df[name], allow_copy=self._allow_copy)
-
- def get_columns(self) -> list[PandasColumn]:
- return [
- PandasColumn(self._df[name], allow_copy=self._allow_copy)
- for name in self._df.columns
- ]
-
- def select_columns(self, indices) -> PandasDataFrameXchg:
- if not isinstance(indices, abc.Sequence):
- raise ValueError("`indices` is not a sequence")
- if not isinstance(indices, list):
- indices = list(indices)
-
- return PandasDataFrameXchg(
- self._df.iloc[:, indices], self._nan_as_null, self._allow_copy
- )
-
- def select_columns_by_name(self, names) -> PandasDataFrameXchg:
- if not isinstance(names, abc.Sequence):
- raise ValueError("`names` is not a sequence")
- if not isinstance(names, list):
- names = list(names)
-
- return PandasDataFrameXchg(
- self._df.loc[:, names], self._nan_as_null, self._allow_copy
- )
-
- def get_chunks(self, n_chunks=None):
- """
- Return an iterator yielding the chunks.
- """
- if n_chunks and n_chunks > 1:
- size = len(self._df)
- step = size // n_chunks
- if size % n_chunks != 0:
- step += 1
- for start in range(0, step * n_chunks, step):
- yield PandasDataFrameXchg(
- self._df.iloc[start : start + step, :],
- self._nan_as_null,
- self._allow_copy,
- )
- else:
- yield self
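
Note (not part of the diff): the ceil-division chunking in PandasDataFrameXchg.get_chunks above splits a frame into chunks of size ceil(len(df) / n_chunks). A small hedged illustration, assuming pandas >= 1.5:

    import pandas as pd

    df = pd.DataFrame({"x": range(10)})
    chunks = list(df.__dataframe__().get_chunks(3))
    # step = 10 // 3 + 1 = 4, so the chunks cover rows [0:4], [4:8], [8:10]
    print([c.num_rows() for c in chunks])  # [4, 4, 2]
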
diff --git a/contrib/python/pandas/py3/pandas/core/interchange/dataframe_protocol.py b/contrib/python/pandas/py3/pandas/core/interchange/dataframe_protocol.py
deleted file mode 100644
index d36bda120e3..00000000000
--- a/contrib/python/pandas/py3/pandas/core/interchange/dataframe_protocol.py
+++ /dev/null
@@ -1,460 +0,0 @@
-"""
-A verbatim copy (vendored) of the spec from https://github.com/data-apis/dataframe-api
-"""
-
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-import enum
-from typing import (
- Any,
- Iterable,
- Sequence,
- TypedDict,
-)
-
-
-class DlpackDeviceType(enum.IntEnum):
- """Integer enum for device type codes matching DLPack."""
-
- CPU = 1
- CUDA = 2
- CPU_PINNED = 3
- OPENCL = 4
- VULKAN = 7
- METAL = 8
- VPI = 9
- ROCM = 10
-
-
-class DtypeKind(enum.IntEnum):
- """
- Integer enum for data types.
-
- Attributes
- ----------
- INT : int
- Matches to signed integer data type.
- UINT : int
- Matches to unsigned integer data type.
- FLOAT : int
- Matches to floating point data type.
- BOOL : int
- Matches to boolean data type.
- STRING : int
- Matches to string data type (UTF-8 encoded).
- DATETIME : int
- Matches to datetime data type.
- CATEGORICAL : int
- Matches to categorical data type.
- """
-
- INT = 0
- UINT = 1
- FLOAT = 2
- BOOL = 20
- STRING = 21 # UTF-8
- DATETIME = 22
- CATEGORICAL = 23
-
-
-class ColumnNullType(enum.IntEnum):
- """
- Integer enum for null type representation.
-
- Attributes
- ----------
- NON_NULLABLE : int
- Non-nullable column.
- USE_NAN : int
- Use explicit float NaN value.
- USE_SENTINEL : int
- Sentinel value besides NaN/NaT.
- USE_BITMASK : int
- The bit is set/unset representing a null on a certain position.
- USE_BYTEMASK : int
- The byte is set/unset representing a null on a certain position.
- """
-
- NON_NULLABLE = 0
- USE_NAN = 1
- USE_SENTINEL = 2
- USE_BITMASK = 3
- USE_BYTEMASK = 4
-
-
-class ColumnBuffers(TypedDict):
- # first element is a buffer containing the column data;
- # second element is the data buffer's associated dtype
- data: tuple[Buffer, Any]
-
- # first element is a buffer containing mask values indicating missing data;
- # second element is the mask value buffer's associated dtype.
- # None if the null representation is not a bit or byte mask
- validity: tuple[Buffer, Any] | None
-
- # first element is a buffer containing the offset values for
- # variable-size binary data (e.g., variable-length strings);
- # second element is the offsets buffer's associated dtype.
- # None if the data buffer does not have an associated offsets buffer
- offsets: tuple[Buffer, Any] | None
-
-
-class CategoricalDescription(TypedDict):
- # whether the ordering of dictionary indices is semantically meaningful
- is_ordered: bool
- # whether a dictionary-style mapping of categorical values to other objects exists
- is_dictionary: bool
- # Python-level only (e.g. ``{int: str}``).
- # None if not a dictionary-style categorical.
- categories: Column | None
-
-
-class Buffer(ABC):
- """
- Data in the buffer is guaranteed to be contiguous in memory.
-
- Note that there is no dtype attribute present, a buffer can be thought of
- as simply a block of memory. However, if the column that the buffer is
- attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
- implemented, then that dtype information will be contained in the return
- value from ``__dlpack__``.
-
-    This distinction is useful to support both (a) data exchange via DLPack on a
-    buffer and (b) dtypes like variable-length strings, which do not have a
- fixed number of bytes per element.
- """
-
- @property
- @abstractmethod
- def bufsize(self) -> int:
- """
- Buffer size in bytes.
- """
-
- @property
- @abstractmethod
- def ptr(self) -> int:
- """
- Pointer to start of the buffer as an integer.
- """
-
- @abstractmethod
- def __dlpack__(self):
- """
- Produce DLPack capsule (see array API standard).
-
- Raises:
-
- - TypeError : if the buffer contains unsupported dtypes.
- - NotImplementedError : if DLPack support is not implemented
-
-        Useful for connecting to array libraries. Support is optional because
- it's not completely trivial to implement for a Python-only library.
- """
- raise NotImplementedError("__dlpack__")
-
- @abstractmethod
- def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
- """
- Device type and device ID for where the data in the buffer resides.
- Uses device type codes matching DLPack.
- Note: must be implemented even if ``__dlpack__`` is not.
- """
-
-
-class Column(ABC):
- """
- A column object, with only the methods and properties required by the
- interchange protocol defined.
-
- A column can contain one or more chunks. Each chunk can contain up to three
- buffers - a data buffer, a mask buffer (depending on null representation),
- and an offsets buffer (if variable-size binary; e.g., variable-length
- strings).
-
- TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
- Instead, it seems to use "children" for both columns with a bit mask,
- and for nested dtypes. Unclear whether this is elegant or confusing.
- This design requires checking the null representation explicitly.
-
- The Arrow design requires checking:
- 1. the ARROW_FLAG_NULLABLE (for sentinel values)
- 2. if a column has two children, combined with one of those children
- having a null dtype.
-
- Making the mask concept explicit seems useful. One null dtype would
- not be enough to cover both bit and byte masks, so that would mean
- even more checking if we did it the Arrow way.
-
- TBD: there's also the "chunk" concept here, which is implicit in Arrow as
- multiple buffers per array (= column here). Semantically it may make
- sense to have both: chunks were meant for example for lazy evaluation
- of data which doesn't fit in memory, while multiple buffers per column
- could also come from doing a selection operation on a single
- contiguous buffer.
-
- Given these concepts, one would expect chunks to be all of the same
- size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
- while multiple buffers could have data-dependent lengths. Not an issue
- in pandas if one column is backed by a single NumPy array, but in
- Arrow it seems possible.
- Are multiple chunks *and* multiple buffers per column necessary for
- the purposes of this interchange protocol, or must producers either
- reuse the chunk concept for this or copy the data?
-
- Note: this Column object can only be produced by ``__dataframe__``, so
- doesn't need its own version or ``__column__`` protocol.
- """
-
- @abstractmethod
- def size(self) -> int:
- """
- Size of the column, in elements.
-
- Corresponds to DataFrame.num_rows() if column is a single chunk;
- equal to size of this current chunk otherwise.
- """
-
- @property
- @abstractmethod
- def offset(self) -> int:
- """
- Offset of first element.
-
- May be > 0 if using chunks; for example for a column with N chunks of
- equal size M (only the last chunk may be shorter),
- ``offset = n * M``, ``n = 0 .. N-1``.
- """
-
- @property
- @abstractmethod
- def dtype(self) -> tuple[DtypeKind, int, str, str]:
- """
- Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
-
- Bit-width : the number of bits as an integer
- Format string : data type description format string in Apache Arrow C
- Data Interface format.
-        Endianness : currently only native endianness (``=``) is supported
-
- Notes:
- - Kind specifiers are aligned with DLPack where possible (hence the
- jump to 20, leave enough room for future extension)
- - Masks must be specified as boolean with either bit width 1 (for bit
- masks) or 8 (for byte masks).
- - Dtype width in bits was preferred over bytes
- - Endianness isn't too useful, but included now in case in the future
- we need to support non-native endianness
- - Went with Apache Arrow format strings over NumPy format strings
- because they're more complete from a dataframe perspective
- - Format strings are mostly useful for datetime specification, and
- for categoricals.
- - For categoricals, the format string describes the type of the
- categorical in the data buffer. In case of a separate encoding of
- the categorical (e.g. an integer to string mapping), this can
- be derived from ``self.describe_categorical``.
- - Data types not included: complex, Arrow-style null, binary, decimal,
- and nested (list, struct, map, union) dtypes.
- """
-
- @property
- @abstractmethod
- def describe_categorical(self) -> CategoricalDescription:
- """
- If the dtype is categorical, there are two options:
- - There are only values in the data buffer.
- - There is a separate non-categorical Column encoding for categorical values.
-
- Raises TypeError if the dtype is not categorical
-
- Returns the dictionary with description on how to interpret the data buffer:
- - "is_ordered" : bool, whether the ordering of dictionary indices is
- semantically meaningful.
- - "is_dictionary" : bool, whether a mapping of
- categorical values to other objects exists
- - "categories" : Column representing the (implicit) mapping of indices to
- category values (e.g. an array of cat1, cat2, ...).
- None if not a dictionary-style categorical.
-
- TBD: are there any other in-memory representations that are needed?
- """
-
- @property
- @abstractmethod
- def describe_null(self) -> tuple[ColumnNullType, Any]:
- """
- Return the missing value (or "null") representation the column dtype
- uses, as a tuple ``(kind, value)``.
-
- Value : if kind is "sentinel value", the actual value. If kind is a bit
- mask or a byte mask, the value (0 or 1) indicating a missing value. None
- otherwise.
- """
-
- @property
- @abstractmethod
- def null_count(self) -> int | None:
- """
- Number of null elements, if known.
-
- Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
- """
-
- @property
- @abstractmethod
- def metadata(self) -> dict[str, Any]:
- """
- The metadata for the column. See `DataFrame.metadata` for more details.
- """
-
- @abstractmethod
- def num_chunks(self) -> int:
- """
- Return the number of chunks the column consists of.
- """
-
- @abstractmethod
- def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
- """
- Return an iterator yielding the chunks.
-
- See `DataFrame.get_chunks` for details on ``n_chunks``.
- """
-
- @abstractmethod
- def get_buffers(self) -> ColumnBuffers:
- """
- Return a dictionary containing the underlying buffers.
-
- The returned dictionary has the following contents:
-
- - "data": a two-element tuple whose first element is a buffer
- containing the data and whose second element is the data
- buffer's associated dtype.
- - "validity": a two-element tuple whose first element is a buffer
- containing mask values indicating missing data and
- whose second element is the mask value buffer's
- associated dtype. None if the null representation is
- not a bit or byte mask.
- - "offsets": a two-element tuple whose first element is a buffer
- containing the offset values for variable-size binary
- data (e.g., variable-length strings) and whose second
- element is the offsets buffer's associated dtype. None
- if the data buffer does not have an associated offsets
- buffer.
- """
-
-
-# def get_children(self) -> Iterable[Column]:
-# """
-# Children columns underneath the column, each object in this iterator
-# must adhere to the column specification.
-# """
-# pass
-
-
-class DataFrame(ABC):
- """
- A data frame class, with only the methods required by the interchange
- protocol defined.
-
- A "data frame" represents an ordered collection of named columns.
- A column's "name" must be a unique string.
- Columns may be accessed by name or by position.
-
- This could be a public data frame class, or an object with the methods and
- attributes defined on this DataFrame class could be returned from the
- ``__dataframe__`` method of a public data frame class in a library adhering
- to the dataframe interchange protocol specification.
- """
-
- version = 0 # version of the protocol
-
- @abstractmethod
- def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
- """Construct a new interchange object, potentially changing the parameters."""
-
- @property
- @abstractmethod
- def metadata(self) -> dict[str, Any]:
- """
- The metadata for the data frame, as a dictionary with string keys. The
- contents of `metadata` may be anything, they are meant for a library
- to store information that it needs to, e.g., roundtrip losslessly or
- for two implementations to share data that is not (yet) part of the
-        interchange protocol specification. To avoid collisions with other
-        entries, please name the keys with the name of the library
-        followed by a period and the desired name, e.g., ``pandas.indexcol``.
- """
-
- @abstractmethod
- def num_columns(self) -> int:
- """
- Return the number of columns in the DataFrame.
- """
-
- @abstractmethod
- def num_rows(self) -> int | None:
- # TODO: not happy with Optional, but need to flag it may be expensive
- # why include it if it may be None - what do we expect consumers
- # to do here?
- """
- Return the number of rows in the DataFrame, if available.
- """
-
- @abstractmethod
- def num_chunks(self) -> int:
- """
- Return the number of chunks the DataFrame consists of.
- """
-
- @abstractmethod
- def column_names(self) -> Iterable[str]:
- """
- Return an iterator yielding the column names.
- """
-
- @abstractmethod
- def get_column(self, i: int) -> Column:
- """
- Return the column at the indicated position.
- """
-
- @abstractmethod
- def get_column_by_name(self, name: str) -> Column:
- """
- Return the column whose name is the indicated name.
- """
-
- @abstractmethod
- def get_columns(self) -> Iterable[Column]:
- """
- Return an iterator yielding the columns.
- """
-
- @abstractmethod
- def select_columns(self, indices: Sequence[int]) -> DataFrame:
- """
- Create a new DataFrame by selecting a subset of columns by index.
- """
-
- @abstractmethod
- def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
- """
- Create a new DataFrame by selecting a subset of columns by name.
- """
-
- @abstractmethod
- def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
- """
- Return an iterator yielding the chunks.
-
- By default (None), yields the chunks that the data is stored as by the
- producer. If given, ``n_chunks`` must be a multiple of
- ``self.num_chunks()``, meaning the producer must subdivide each chunk
- before yielding it.
- """
diff --git a/contrib/python/pandas/py3/pandas/core/interchange/from_dataframe.py b/contrib/python/pandas/py3/pandas/core/interchange/from_dataframe.py
deleted file mode 100644
index 78e530f9151..00000000000
--- a/contrib/python/pandas/py3/pandas/core/interchange/from_dataframe.py
+++ /dev/null
@@ -1,499 +0,0 @@
-from __future__ import annotations
-
-import ctypes
-import re
-from typing import Any
-
-import numpy as np
-
-from pandas.compat._optional import import_optional_dependency
-
-import pandas as pd
-from pandas.core.interchange.dataframe_protocol import (
- Buffer,
- Column,
- ColumnNullType,
- DataFrame as DataFrameXchg,
- DtypeKind,
-)
-from pandas.core.interchange.utils import (
- ArrowCTypes,
- Endianness,
-)
-
-_NP_DTYPES: dict[DtypeKind, dict[int, Any]] = {
- DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
- DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
- DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
- DtypeKind.BOOL: {1: bool, 8: bool},
-}
-
-
-def from_dataframe(df, allow_copy: bool = True) -> pd.DataFrame:
- """
- Build a ``pd.DataFrame`` from any DataFrame supporting the interchange protocol.
-
- Parameters
- ----------
- df : DataFrameXchg
- Object supporting the interchange protocol, i.e. `__dataframe__` method.
- allow_copy : bool, default: True
- Whether to allow copying the memory to perform the conversion
- (if false then zero-copy approach is requested).
-
- Returns
- -------
- pd.DataFrame
- """
- if isinstance(df, pd.DataFrame):
- return df
-
- if not hasattr(df, "__dataframe__"):
- raise ValueError("`df` does not support __dataframe__")
-
- return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
-
-
-def _from_dataframe(df: DataFrameXchg, allow_copy: bool = True):
- """
- Build a ``pd.DataFrame`` from the DataFrame interchange object.
-
- Parameters
- ----------
- df : DataFrameXchg
- Object supporting the interchange protocol, i.e. `__dataframe__` method.
- allow_copy : bool, default: True
- Whether to allow copying the memory to perform the conversion
- (if false then zero-copy approach is requested).
-
- Returns
- -------
- pd.DataFrame
- """
- pandas_dfs = []
- for chunk in df.get_chunks():
- pandas_df = protocol_df_chunk_to_pandas(chunk)
- pandas_dfs.append(pandas_df)
-
- if not allow_copy and len(pandas_dfs) > 1:
- raise RuntimeError(
- "To join chunks a copy is required which is forbidden by allow_copy=False"
- )
- if len(pandas_dfs) == 1:
- pandas_df = pandas_dfs[0]
- else:
- pandas_df = pd.concat(pandas_dfs, axis=0, ignore_index=True, copy=False)
-
- index_obj = df.metadata.get("pandas.index", None)
- if index_obj is not None:
- pandas_df.index = index_obj
-
- return pandas_df
-
-
-def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
- """
- Convert interchange protocol chunk to ``pd.DataFrame``.
-
- Parameters
- ----------
- df : DataFrameXchg
-
- Returns
- -------
- pd.DataFrame
- """
- # We need a dict of columns here, with each column being a NumPy array (at
- # least for now, deal with non-NumPy dtypes later).
- columns: dict[str, Any] = {}
- buffers = [] # hold on to buffers, keeps memory alive
- for name in df.column_names():
- if not isinstance(name, str):
- raise ValueError(f"Column {name} is not a string")
- if name in columns:
- raise ValueError(f"Column {name} is not unique")
- col = df.get_column_by_name(name)
- dtype = col.dtype[0]
- if dtype in (
- DtypeKind.INT,
- DtypeKind.UINT,
- DtypeKind.FLOAT,
- DtypeKind.BOOL,
- ):
- columns[name], buf = primitive_column_to_ndarray(col)
- elif dtype == DtypeKind.CATEGORICAL:
- columns[name], buf = categorical_column_to_series(col)
- elif dtype == DtypeKind.STRING:
- columns[name], buf = string_column_to_ndarray(col)
- elif dtype == DtypeKind.DATETIME:
- columns[name], buf = datetime_column_to_ndarray(col)
- else:
- raise NotImplementedError(f"Data type {dtype} not handled yet")
-
- buffers.append(buf)
-
- pandas_df = pd.DataFrame(columns)
- pandas_df.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"] = buffers
- return pandas_df
-
-
-def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
- """
- Convert a column holding one of the primitive dtypes to a NumPy array.
-
- A primitive type is one of: int, uint, float, bool.
-
- Parameters
- ----------
- col : Column
-
- Returns
- -------
- tuple
- Tuple of np.ndarray holding the data and the memory owner object
- that keeps the memory alive.
- """
- buffers = col.get_buffers()
-
- data_buff, data_dtype = buffers["data"]
- data = buffer_to_ndarray(
- data_buff, data_dtype, offset=col.offset, length=col.size()
- )
-
- data = set_nulls(data, col, buffers["validity"])
- return data, buffers
-
-
-def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
- """
- Convert a column holding categorical data to a pandas Series.
-
- Parameters
- ----------
- col : Column
-
- Returns
- -------
- tuple
- Tuple of pd.Series holding the data and the memory owner object
- that keeps the memory alive.
- """
- categorical = col.describe_categorical
-
- if not categorical["is_dictionary"]:
- raise NotImplementedError("Non-dictionary categoricals not supported yet")
-
- cat_column = categorical["categories"]
- if hasattr(cat_column, "_col"):
- # Item "Column" of "Optional[Column]" has no attribute "_col"
- # Item "None" of "Optional[Column]" has no attribute "_col"
- categories = np.array(cat_column._col) # type: ignore[union-attr]
- else:
- raise NotImplementedError(
- "Interchanging categorical columns isn't supported yet, and our "
- "fallback of using the `col._col` attribute (a ndarray) failed."
- )
- buffers = col.get_buffers()
-
- codes_buff, codes_dtype = buffers["data"]
- codes = buffer_to_ndarray(
- codes_buff, codes_dtype, offset=col.offset, length=col.size()
- )
-
-    # Use modulo to avoid an ``IndexError`` for
- # out-of-bounds sentinel values in `codes`
- if len(categories) > 0:
- values = categories[codes % len(categories)]
- else:
- values = codes
-
- cat = pd.Categorical(
- values, categories=categories, ordered=categorical["is_ordered"]
- )
- data = pd.Series(cat)
-
- data = set_nulls(data, col, buffers["validity"])
- return data, buffers
-
-
-def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
- """
- Convert a column holding string data to a NumPy array.
-
- Parameters
- ----------
- col : Column
-
- Returns
- -------
- tuple
- Tuple of np.ndarray holding the data and the memory owner object
- that keeps the memory alive.
- """
- null_kind, sentinel_val = col.describe_null
-
- if null_kind not in (
- ColumnNullType.NON_NULLABLE,
- ColumnNullType.USE_BITMASK,
- ColumnNullType.USE_BYTEMASK,
- ):
- raise NotImplementedError(
- f"{null_kind} null kind is not yet supported for string columns."
- )
-
- buffers = col.get_buffers()
-
- assert buffers["offsets"], "String buffers must contain offsets"
- # Retrieve the data buffer containing the UTF-8 code units
- data_buff, protocol_data_dtype = buffers["data"]
- # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
- assert protocol_data_dtype[1] == 8
- assert protocol_data_dtype[2] in (
- ArrowCTypes.STRING,
- ArrowCTypes.LARGE_STRING,
- ) # format_str == utf-8
- # Convert the buffers to NumPy arrays. In order to go from STRING to
- # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
- data_dtype = (
- DtypeKind.UINT,
- 8,
- ArrowCTypes.UINT8,
- Endianness.NATIVE,
- )
- # Specify zero offset as we don't want to chunk the string data
- data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
-
- # Retrieve the offsets buffer containing the index offsets demarcating
- # the beginning and the ending of each string
- offset_buff, offset_dtype = buffers["offsets"]
- # Offsets buffer contains start-stop positions of strings in the data buffer,
-    # meaning that it has one more element than the column has strings, so use
-    # `col.size() + 1` here to pass a proper offsets buffer size
- offsets = buffer_to_ndarray(
- offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
- )
-
- null_pos = None
- if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
- assert buffers["validity"], "Validity buffers cannot be empty for masks"
- valid_buff, valid_dtype = buffers["validity"]
- null_pos = buffer_to_ndarray(
- valid_buff, valid_dtype, offset=col.offset, length=col.size()
- )
- if sentinel_val == 0:
- null_pos = ~null_pos
-
- # Assemble the strings from the code units
- str_list: list[None | float | str] = [None] * col.size()
- for i in range(col.size()):
- # Check for missing values
- if null_pos is not None and null_pos[i]:
- str_list[i] = np.nan
- continue
-
- # Extract a range of code units
- units = data[offsets[i] : offsets[i + 1]]
-
- # Convert the list of code units to bytes
- str_bytes = bytes(units)
-
- # Create the string
- string = str_bytes.decode(encoding="utf-8")
-
- # Add to our list of strings
- str_list[i] = string
-
- # Convert the string list to a NumPy array
- return np.asarray(str_list, dtype="object"), buffers
-
-
-def parse_datetime_format_str(format_str, data):
- """Parse datetime `format_str` to interpret the `data`."""
- # timestamp 'ts{unit}:tz'
- timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
- if timestamp_meta:
- unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
- if tz != "":
- raise NotImplementedError("Timezones are not supported yet")
- if unit != "s":
-            # the format string describes only the first letter of the unit, so
- # add one extra letter to convert the unit to numpy-style:
- # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
- unit += "s"
- data = data.astype(f"datetime64[{unit}]")
- return data
-
- # date 'td{Days/Ms}'
- date_meta = re.match(r"td([Dm])", format_str)
- if date_meta:
- unit = date_meta.group(1)
- if unit == "D":
- # NumPy doesn't support DAY unit, so converting days to seconds
- # (converting to uint64 to avoid overflow)
- data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
- elif unit == "m":
- data = data.astype("datetime64[ms]")
- else:
- raise NotImplementedError(f"Date unit is not supported: {unit}")
- return data
-
- raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
-
-
-def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
- """
- Convert a column holding DateTime data to a NumPy array.
-
- Parameters
- ----------
- col : Column
-
- Returns
- -------
- tuple
- Tuple of np.ndarray holding the data and the memory owner object
- that keeps the memory alive.
- """
- buffers = col.get_buffers()
-
- _, _, format_str, _ = col.dtype
- dbuf, dtype = buffers["data"]
-    # Treat the dtype as `uint` to get the number of units elapsed since 1970-01-01
- data = buffer_to_ndarray(
- dbuf,
- (
- DtypeKind.UINT,
- dtype[1],
- getattr(ArrowCTypes, f"UINT{dtype[1]}"),
- Endianness.NATIVE,
- ),
- offset=col.offset,
- length=col.size(),
- )
-
- data = parse_datetime_format_str(format_str, data)
- data = set_nulls(data, col, buffers["validity"])
- return data, buffers
-
-
-def buffer_to_ndarray(
- buffer: Buffer,
- dtype: tuple[DtypeKind, int, str, str],
- *,
- length: int,
- offset: int = 0,
-) -> np.ndarray:
- """
- Build a NumPy array from the passed buffer.
-
- Parameters
- ----------
- buffer : Buffer
- Buffer to build a NumPy array from.
- dtype : tuple
- Data type of the buffer conforming protocol dtypes format.
- offset : int, default: 0
- Number of elements to offset from the start of the buffer.
- length : int, optional
- If the buffer is a bit-mask, specifies a number of bits to read
- from the buffer. Has no effect otherwise.
-
- Returns
- -------
- np.ndarray
-
- Notes
- -----
- The returned array doesn't own the memory. The caller of this function is
- responsible for keeping the memory owner object alive as long as
- the returned NumPy array is being used.
- """
- kind, bit_width, _, _ = dtype
-
- column_dtype = _NP_DTYPES.get(kind, {}).get(bit_width, None)
- if column_dtype is None:
- raise NotImplementedError(f"Conversion for {dtype} is not yet supported.")
-
- # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
- # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
- # it since https://github.com/numpy/numpy/pull/19083
- ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
-
- if bit_width == 1:
- assert length is not None, "`length` must be specified for a bit-mask buffer."
- pa = import_optional_dependency("pyarrow")
- arr = pa.BooleanArray.from_buffers(
- pa.bool_(),
- length,
- [None, pa.foreign_buffer(buffer.ptr, length)],
- offset=offset,
- )
- return np.asarray(arr)
- else:
- data_pointer = ctypes.cast(
- buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
- )
- return np.ctypeslib.as_array(
- data_pointer,
- shape=(length,),
- )
-
-
-def set_nulls(
- data: np.ndarray | pd.Series,
- col: Column,
- validity: tuple[Buffer, tuple[DtypeKind, int, str, str]] | None,
- allow_modify_inplace: bool = True,
-):
- """
- Set null values for the data according to the column null kind.
-
- Parameters
- ----------
- data : np.ndarray or pd.Series
- Data to set nulls in.
- col : Column
- Column object that describes the `data`.
- validity : tuple(Buffer, dtype) or None
-        The validity entry of ``col.get_buffers()``. We do not call ``col.get_buffers()``
-        here so as not to take ownership of the memory of the buffer objects.
- allow_modify_inplace : bool, default: True
- Whether to modify the `data` inplace when zero-copy is possible (True) or always
- modify a copy of the `data` (False).
-
- Returns
- -------
- np.ndarray or pd.Series
- Data with the nulls being set.
- """
- null_kind, sentinel_val = col.describe_null
- null_pos = None
-
- if null_kind == ColumnNullType.USE_SENTINEL:
- null_pos = pd.Series(data) == sentinel_val
- elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
- assert validity, "Expected to have a validity buffer for the mask"
- valid_buff, valid_dtype = validity
- null_pos = buffer_to_ndarray(
- valid_buff, valid_dtype, offset=col.offset, length=col.size()
- )
- if sentinel_val == 0:
- null_pos = ~null_pos
- elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
- pass
- else:
- raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")
-
- if null_pos is not None and np.any(null_pos):
- if not allow_modify_inplace:
- data = data.copy()
- try:
- data[null_pos] = None
- except TypeError:
- # TypeError happens if the `data` dtype appears to be non-nullable
- # in numpy notation (bool, int, uint). If this happens,
- # cast the `data` to nullable float dtype.
- data = data.astype(float)
- data[null_pos] = None
-
- return data
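
Note (not part of the diff): the conversion helpers above back the public entry point pandas exposes as pandas.api.interchange.from_dataframe (pandas >= 1.5). A minimal hedged round-trip sketch:

    import pandas as pd
    from pandas.api.interchange import from_dataframe

    src = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    # Passing the interchange object (not the DataFrame itself) exercises the
    # chunk-by-chunk reconstruction implemented in _from_dataframe above.
    result = from_dataframe(src.__dataframe__())
    print(result.dtypes)       # a: int64, b: object
    print(result.equals(src))  # True for this simple frame
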
diff --git a/contrib/python/pandas/py3/pandas/core/interchange/utils.py b/contrib/python/pandas/py3/pandas/core/interchange/utils.py
deleted file mode 100644
index 5176423adde..00000000000
--- a/contrib/python/pandas/py3/pandas/core/interchange/utils.py
+++ /dev/null
@@ -1,92 +0,0 @@
-"""
-Utility functions and objects for implementing the interchange API.
-"""
-
-from __future__ import annotations
-
-import re
-import typing
-
-import numpy as np
-
-from pandas._typing import DtypeObj
-
-import pandas as pd
-from pandas.api.types import is_datetime64_dtype
-
-
-class ArrowCTypes:
- """
- Enum for Apache Arrow C type format strings.
-
- The Arrow C data interface:
- https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
- """
-
- NULL = "n"
- BOOL = "b"
- INT8 = "c"
- UINT8 = "C"
- INT16 = "s"
- UINT16 = "S"
- INT32 = "i"
- UINT32 = "I"
- INT64 = "l"
- UINT64 = "L"
- FLOAT16 = "e"
- FLOAT32 = "f"
- FLOAT64 = "g"
- STRING = "u" # utf-8
- LARGE_STRING = "U" # utf-8
- DATE32 = "tdD"
- DATE64 = "tdm"
-    # Resolution:
- # - seconds -> 's'
- # - milliseconds -> 'm'
- # - microseconds -> 'u'
- # - nanoseconds -> 'n'
- TIMESTAMP = "ts{resolution}:{tz}"
- TIME = "tt{resolution}"
-
-
-class Endianness:
- """Enum indicating the byte-order of a data-type."""
-
- LITTLE = "<"
- BIG = ">"
- NATIVE = "="
- NA = "|"
-
-
-def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
- """
- Represent pandas `dtype` as a format string in Apache Arrow C notation.
-
- Parameters
- ----------
- dtype : np.dtype
- Datatype of pandas DataFrame to represent.
-
- Returns
- -------
- str
- Format string in Apache Arrow C notation of the given `dtype`.
- """
- if isinstance(dtype, pd.CategoricalDtype):
- return ArrowCTypes.INT64
- elif dtype == np.dtype("O"):
- return ArrowCTypes.STRING
-
- format_str = getattr(ArrowCTypes, dtype.name.upper(), None)
- if format_str is not None:
- return format_str
-
- if is_datetime64_dtype(dtype):
- # Selecting the first char of resolution string:
- # dtype.str -> '<M8[ns]'
- resolution = re.findall(r"\[(.*)\]", typing.cast(np.dtype, dtype).str)[0][:1]
- return ArrowCTypes.TIMESTAMP.format(resolution=resolution, tz="")
-
- raise NotImplementedError(
- f"Conversion of {dtype} to Arrow C format string is not implemented."
- )
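
Note (not part of the diff): a few hedged spot checks of the dtype_to_arrow_c_fmt mapping above (internal import path; only meaningful against a pandas build that still vendors this module):

    import numpy as np
    import pandas as pd
    from pandas.core.interchange.utils import dtype_to_arrow_c_fmt

    print(dtype_to_arrow_c_fmt(np.dtype("int64")))           # 'l'
    print(dtype_to_arrow_c_fmt(np.dtype("float32")))         # 'f'
    print(dtype_to_arrow_c_fmt(np.dtype("datetime64[ns]")))  # 'tsn:'
    print(dtype_to_arrow_c_fmt(pd.CategoricalDtype()))       # 'l' (int64 codes)
    print(dtype_to_arrow_c_fmt(np.dtype("O")))               # 'u' (utf-8 string)
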
diff --git a/contrib/python/pandas/py3/pandas/core/internals/__init__.py b/contrib/python/pandas/py3/pandas/core/internals/__init__.py
deleted file mode 100644
index 0797e62de7a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from pandas.core.internals.api import make_block
-from pandas.core.internals.array_manager import (
- ArrayManager,
- SingleArrayManager,
-)
-from pandas.core.internals.base import (
- DataManager,
- SingleDataManager,
-)
-from pandas.core.internals.blocks import ( # io.pytables, io.packers
- Block,
- DatetimeTZBlock,
- ExtensionBlock,
- NumericBlock,
- ObjectBlock,
-)
-from pandas.core.internals.concat import concatenate_managers
-from pandas.core.internals.managers import (
- BlockManager,
- SingleBlockManager,
- create_block_manager_from_blocks,
-)
-
-__all__ = [
- "Block",
- "NumericBlock",
- "DatetimeTZBlock",
- "ExtensionBlock",
- "ObjectBlock",
- "make_block",
- "DataManager",
- "ArrayManager",
- "BlockManager",
- "SingleDataManager",
- "SingleBlockManager",
- "SingleArrayManager",
- "concatenate_managers",
- # this is preserved here for downstream compatibility (GH-33892)
- "create_block_manager_from_blocks",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/internals/api.py b/contrib/python/pandas/py3/pandas/core/internals/api.py
deleted file mode 100644
index 5e03621db2a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/api.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""
-This is a pseudo-public API for downstream libraries. We ask that downstream
-authors
-
-1) Try to avoid using internals directly altogether, and failing that,
-2) Use only functions exposed here (or in core.internals)
-
-"""
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._libs.internals import BlockPlacement
-from pandas._typing import Dtype
-
-from pandas.core.dtypes.common import (
- is_datetime64tz_dtype,
- is_period_dtype,
- pandas_dtype,
-)
-
-from pandas.core.arrays import DatetimeArray
-from pandas.core.construction import extract_array
-from pandas.core.internals.blocks import (
- Block,
- DatetimeTZBlock,
- ExtensionBlock,
- check_ndim,
- ensure_block_shape,
- extract_pandas_array,
- get_block_type,
- maybe_coerce_values,
-)
-
-
-def make_block(
- values, placement, klass=None, ndim=None, dtype: Dtype | None = None
-) -> Block:
- """
- This is a pseudo-public analogue to blocks.new_block.
-
- We ask that downstream libraries use this rather than any fully-internal
- APIs, including but not limited to:
-
- - core.internals.blocks.make_block
- - Block.make_block
- - Block.make_block_same_class
- - Block.__init__
- """
- if dtype is not None:
- dtype = pandas_dtype(dtype)
-
- values, dtype = extract_pandas_array(values, dtype, ndim)
-
- if klass is ExtensionBlock and is_period_dtype(values.dtype):
- # GH-44681 changed PeriodArray to be stored in the 2D
- # NDArrayBackedExtensionBlock instead of ExtensionBlock
- # -> still allow ExtensionBlock to be passed in this case for back compat
- klass = None
-
- if klass is None:
- dtype = dtype or values.dtype
- klass = get_block_type(dtype)
-
- elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values.dtype):
- # pyarrow calls get here
- values = DatetimeArray._simple_new(values, dtype=dtype)
-
- if not isinstance(placement, BlockPlacement):
- placement = BlockPlacement(placement)
-
- ndim = maybe_infer_ndim(values, placement, ndim)
- if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
- # GH#41168 ensure we can pass 1D dt64tz values
- # More generally, any EA dtype that isn't is_1d_only_ea_dtype
- values = extract_array(values, extract_numpy=True)
- values = ensure_block_shape(values, ndim)
-
- check_ndim(values, placement, ndim)
- values = maybe_coerce_values(values)
- return klass(values, ndim=ndim, placement=placement)
-
-
-def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int:
- """
- If `ndim` is not provided, infer it from placement and values.
- """
- if ndim is None:
- # GH#38134 Block constructor now assumes ndim is not None
- if not isinstance(values.dtype, np.dtype):
- if len(placement) != 1:
- ndim = 1
- else:
- ndim = 2
- else:
- ndim = values.ndim
- return ndim
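
Note (not part of the diff): make_block above is the pseudo-public hook that downstream libraries (pyarrow, for example) are asked to use instead of the fully internal Block constructors. A hedged sketch against this vendored pandas version; the concrete Block subclass may differ between releases:

    import numpy as np
    from pandas.core.internals.api import make_block

    # One int64 column of six rows, placed at position 0 of a would-be manager.
    values = np.arange(6, dtype="int64").reshape(1, 6)
    blk = make_block(values, placement=[0])

    print(type(blk).__name__)    # e.g. 'NumericBlock' in this pandas version
    print(blk.shape, blk.dtype)  # (1, 6) int64
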
diff --git a/contrib/python/pandas/py3/pandas/core/internals/array_manager.py b/contrib/python/pandas/py3/pandas/core/internals/array_manager.py
deleted file mode 100644
index a96025dfebc..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/array_manager.py
+++ /dev/null
@@ -1,1361 +0,0 @@
-"""
-Experimental manager based on storing a collection of 1D arrays
-"""
-from __future__ import annotations
-
-from typing import (
- Any,
- Callable,
- Hashable,
- Literal,
- TypeVar,
-)
-
-import numpy as np
-
-from pandas._libs import (
- NaT,
- algos as libalgos,
- lib,
-)
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- DtypeObj,
- QuantileInterpolation,
- npt,
-)
-from pandas.util._validators import validate_bool_kwarg
-
-from pandas.core.dtypes.astype import astype_array_safe
-from pandas.core.dtypes.cast import (
- ensure_dtype_can_hold_na,
- infer_dtype_from_scalar,
-)
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- is_datetime64_ns_dtype,
- is_dtype_equal,
- is_extension_array_dtype,
- is_integer,
- is_numeric_dtype,
- is_object_dtype,
- is_timedelta64_ns_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- ExtensionDtype,
- PandasDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- array_equals,
- isna,
- na_value_for_dtype,
-)
-
-import pandas.core.algorithms as algos
-from pandas.core.array_algos.quantile import quantile_compat
-from pandas.core.array_algos.take import take_1d
-from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
- PandasArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.sparse import SparseDtype
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
- sanitize_array,
-)
-from pandas.core.indexers import (
- maybe_convert_indices,
- validate_indices,
-)
-from pandas.core.indexes.api import (
- Index,
- ensure_index,
-)
-from pandas.core.internals.base import (
- DataManager,
- SingleDataManager,
- interleaved_dtype,
-)
-from pandas.core.internals.blocks import (
- ensure_block_shape,
- external_values,
- extract_pandas_array,
- maybe_coerce_values,
- new_block,
- to_native_types,
-)
-
-T = TypeVar("T", bound="BaseArrayManager")
-
-
-class BaseArrayManager(DataManager):
- """
- Core internal data structure to implement DataFrame and Series.
-
- Alternative to the BlockManager, storing a list of 1D arrays instead of
- Blocks.
-
- This is *not* a public API class
-
- Parameters
- ----------
- arrays : Sequence of arrays
- axes : Sequence of Index
- verify_integrity : bool, default True
-
- """
-
- __slots__ = [
- "_axes", # private attribute, because 'axes' has different order, see below
- "arrays",
- ]
-
- arrays: list[np.ndarray | ExtensionArray]
- _axes: list[Index]
-
- def __init__(
- self,
- arrays: list[np.ndarray | ExtensionArray],
- axes: list[Index],
- verify_integrity: bool = True,
- ) -> None:
- raise NotImplementedError
-
- def make_empty(self: T, axes=None) -> T:
- """Return an empty ArrayManager with the items axis of len 0 (no columns)"""
- if axes is None:
- axes = [self.axes[1:], Index([])]
-
- arrays: list[np.ndarray | ExtensionArray] = []
- return type(self)(arrays, axes)
-
- @property
- def items(self) -> Index:
- return self._axes[-1]
-
- @property
- # error: Signature of "axes" incompatible with supertype "DataManager"
- def axes(self) -> list[Index]: # type: ignore[override]
- # mypy doesn't work to override attribute with property
- # see https://github.com/python/mypy/issues/4125
- """Axes is BlockManager-compatible order (columns, rows)"""
- return [self._axes[1], self._axes[0]]
-
- @property
- def shape_proper(self) -> tuple[int, ...]:
- # this returns (n_rows, n_columns)
- return tuple(len(ax) for ax in self._axes)
-
- @staticmethod
- def _normalize_axis(axis: AxisInt) -> int:
- # switch axis
- axis = 1 if axis == 0 else 0
- return axis
-
- def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
- # Caller is responsible for ensuring we have an Index object.
- self._validate_set_axis(axis, new_labels)
- axis = self._normalize_axis(axis)
- self._axes[axis] = new_labels
-
- def get_dtypes(self) -> np.ndarray:
- return np.array([arr.dtype for arr in self.arrays], dtype="object")
-
- def add_references(self, mgr: BaseArrayManager) -> None:
- """
- Only implemented on the BlockManager level
- """
- return
-
- def __getstate__(self):
- return self.arrays, self._axes
-
- def __setstate__(self, state) -> None:
- self.arrays = state[0]
- self._axes = state[1]
-
- def __repr__(self) -> str:
- output = type(self).__name__
- output += f"\nIndex: {self._axes[0]}"
- if self.ndim == 2:
- output += f"\nColumns: {self._axes[1]}"
- output += f"\n{len(self.arrays)} arrays:"
- for arr in self.arrays:
- output += f"\n{arr.dtype}"
- return output
-
- def apply(
- self: T,
- f,
- align_keys: list[str] | None = None,
- **kwargs,
- ) -> T:
- """
- Iterate over the arrays, collect and create a new ArrayManager.
-
- Parameters
- ----------
- f : str or callable
- Name of the Array method to apply.
- align_keys: List[str] or None, default None
- **kwargs
- Keywords to pass to `f`
-
- Returns
- -------
- ArrayManager
- """
- assert "filter" not in kwargs
-
- align_keys = align_keys or []
- result_arrays: list[np.ndarray] = []
- # fillna: Series/DataFrame is responsible for making sure value is aligned
-
- aligned_args = {k: kwargs[k] for k in align_keys}
-
- if f == "apply":
- f = kwargs.pop("func")
-
- for i, arr in enumerate(self.arrays):
- if aligned_args:
- for k, obj in aligned_args.items():
- if isinstance(obj, (ABCSeries, ABCDataFrame)):
- # The caller is responsible for ensuring that
- # obj.axes[-1].equals(self.items)
- if obj.ndim == 1:
- kwargs[k] = obj.iloc[i]
- else:
- kwargs[k] = obj.iloc[:, i]._values
- else:
- # otherwise we have an array-like
- kwargs[k] = obj[i]
-
- if callable(f):
- applied = f(arr, **kwargs)
- else:
- applied = getattr(arr, f)(**kwargs)
-
- # if not isinstance(applied, ExtensionArray):
- # # TODO not all EA operations return new EAs (eg astype)
- # applied = array(applied)
- result_arrays.append(applied)
-
- new_axes = self._axes
-
- # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
- # expected "List[Union[ndarray, ExtensionArray]]"
- return type(self)(result_arrays, new_axes) # type: ignore[arg-type]
-
- def apply_with_block(
- self: T, f, align_keys=None, swap_axis: bool = True, **kwargs
- ) -> T:
- # switch axis to follow BlockManager logic
- if swap_axis and "axis" in kwargs and self.ndim == 2:
- kwargs["axis"] = 1 if kwargs["axis"] == 0 else 0
-
- align_keys = align_keys or []
- aligned_args = {k: kwargs[k] for k in align_keys}
-
- result_arrays = []
-
- for i, arr in enumerate(self.arrays):
- if aligned_args:
- for k, obj in aligned_args.items():
- if isinstance(obj, (ABCSeries, ABCDataFrame)):
- # The caller is responsible for ensuring that
- # obj.axes[-1].equals(self.items)
- if obj.ndim == 1:
- if self.ndim == 2:
- kwargs[k] = obj.iloc[slice(i, i + 1)]._values
- else:
- kwargs[k] = obj.iloc[:]._values
- else:
- kwargs[k] = obj.iloc[:, [i]]._values
- else:
- # otherwise we have an ndarray
- if obj.ndim == 2:
- kwargs[k] = obj[[i]]
-
- if isinstance(arr.dtype, np.dtype) and not isinstance(arr, np.ndarray):
- # i.e. TimedeltaArray, DatetimeArray with tz=None. Need to
- # convert for the Block constructors.
- arr = np.asarray(arr)
-
- if self.ndim == 2:
- arr = ensure_block_shape(arr, 2)
- block = new_block(arr, placement=slice(0, 1, 1), ndim=2)
- else:
- block = new_block(arr, placement=slice(0, len(self), 1), ndim=1)
-
- applied = getattr(block, f)(**kwargs)
- if isinstance(applied, list):
- applied = applied[0]
- arr = applied.values
- if self.ndim == 2 and arr.ndim == 2:
- # 2D for np.ndarray or DatetimeArray/TimedeltaArray
- assert len(arr) == 1
- # error: No overload variant of "__getitem__" of "ExtensionArray"
- # matches argument type "Tuple[int, slice]"
- arr = arr[0, :] # type: ignore[call-overload]
- result_arrays.append(arr)
-
- return type(self)(result_arrays, self._axes)
-
- def where(self: T, other, cond, align: bool) -> T:
- if align:
- align_keys = ["other", "cond"]
- else:
- align_keys = ["cond"]
- other = extract_array(other, extract_numpy=True)
-
- return self.apply_with_block(
- "where",
- align_keys=align_keys,
- other=other,
- cond=cond,
- )
-
- def round(self: T, decimals: int, using_cow: bool = False) -> T:
- return self.apply_with_block("round", decimals=decimals, using_cow=using_cow)
-
- def setitem(self: T, indexer, value) -> T:
- return self.apply_with_block("setitem", indexer=indexer, value=value)
-
- def putmask(self: T, mask, new, align: bool = True) -> T:
- if align:
- align_keys = ["new", "mask"]
- else:
- align_keys = ["mask"]
- new = extract_array(new, extract_numpy=True)
-
- return self.apply_with_block(
- "putmask",
- align_keys=align_keys,
- mask=mask,
- new=new,
- )
-
- def diff(self: T, n: int, axis: AxisInt) -> T:
- assert self.ndim == 2 and axis == 0 # caller ensures
- return self.apply(algos.diff, n=n, axis=axis)
-
- def interpolate(self: T, **kwargs) -> T:
- return self.apply_with_block("interpolate", swap_axis=False, **kwargs)
-
- def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
- if fill_value is lib.no_default:
- fill_value = None
-
- if axis == 1 and self.ndim == 2:
- # TODO column-wise shift
- raise NotImplementedError
-
- return self.apply_with_block(
- "shift", periods=periods, axis=axis, fill_value=fill_value
- )
-
- def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
- if limit is not None:
- # Do this validation even if we go through one of the no-op paths
- limit = libalgos.validate_limit(None, limit=limit)
-
- return self.apply_with_block(
- "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
- )
-
- def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
- if copy is None:
- copy = True
-
- return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)
-
- def convert(self: T, copy: bool | None) -> T:
- if copy is None:
- copy = True
-
- def _convert(arr):
- if is_object_dtype(arr.dtype):
- # extract PandasArray for tests that patch PandasArray._typ
- arr = np.asarray(arr)
- result = lib.maybe_convert_objects(
- arr,
- convert_datetime=True,
- convert_timedelta=True,
- convert_period=True,
- convert_interval=True,
- )
- if result is arr and copy:
- return arr.copy()
- return result
- else:
- return arr.copy() if copy else arr
-
- return self.apply(_convert)
-
- def replace_regex(self: T, **kwargs) -> T:
- return self.apply_with_block("_replace_regex", **kwargs)
-
- def replace(self: T, to_replace, value, inplace: bool) -> T:
- inplace = validate_bool_kwarg(inplace, "inplace")
- assert np.ndim(value) == 0, value
- # TODO "replace" is right now implemented on the blocks, we should move
- # it to general array algos so it can be reused here
- return self.apply_with_block(
- "replace", value=value, to_replace=to_replace, inplace=inplace
- )
-
- def replace_list(
- self: T,
- src_list: list[Any],
- dest_list: list[Any],
- inplace: bool = False,
- regex: bool = False,
- ) -> T:
- """do a list replace"""
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- return self.apply_with_block(
- "replace_list",
- src_list=src_list,
- dest_list=dest_list,
- inplace=inplace,
- regex=regex,
- )
-
- def to_native_types(self: T, **kwargs) -> T:
- return self.apply(to_native_types, **kwargs)
-
- @property
- def is_mixed_type(self) -> bool:
- return True
-
- @property
- def is_numeric_mixed_type(self) -> bool:
- return all(is_numeric_dtype(t) for t in self.get_dtypes())
-
- @property
- def any_extension_types(self) -> bool:
- """Whether any of the blocks in this manager are extension blocks"""
- return False # any(block.is_extension for block in self.blocks)
-
- @property
- def is_view(self) -> bool:
- """return a boolean if we are a single block and are a view"""
- # TODO what is this used for?
- return False
-
- @property
- def is_single_block(self) -> bool:
- return len(self.arrays) == 1
-
- def _get_data_subset(self: T, predicate: Callable) -> T:
- indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)]
- arrays = [self.arrays[i] for i in indices]
- # TODO copy?
- # Note: using Index.take ensures we can retain e.g. DatetimeIndex.freq,
- # see test_describe_datetime_columns
- taker = np.array(indices, dtype="intp")
- new_cols = self._axes[1].take(taker)
- new_axes = [self._axes[0], new_cols]
- return type(self)(arrays, new_axes, verify_integrity=False)
-
- def get_bool_data(self: T, copy: bool = False) -> T:
- """
- Select columns that are bool-dtype and object-dtype columns that are all-bool.
-
- Parameters
- ----------
- copy : bool, default False
- Whether to copy the blocks
- """
- return self._get_data_subset(lambda x: x.dtype == np.dtype(bool))
-
- def get_numeric_data(self: T, copy: bool = False) -> T:
- """
- Select columns that have a numeric dtype.
-
- Parameters
- ----------
- copy : bool, default False
- Whether to copy the blocks
- """
- return self._get_data_subset(
- lambda arr: is_numeric_dtype(arr.dtype)
- or getattr(arr.dtype, "_is_numeric", False)
- )
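-
- # A minimal usage sketch for the two selectors above (``mgr`` is a
- # hypothetical ArrayManager instance, not defined in this module):
- #
- # >>> bools = mgr.get_bool_data()       # columns with dtype == bool
- # >>> nums = mgr.get_numeric_data()     # numeric and numeric-like EA columns
- #
- # Both delegate to ``_get_data_subset`` with a per-column dtype predicate.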
-
- def copy(self: T, deep: bool | Literal["all"] | None = True) -> T:
- """
- Make deep or shallow copy of ArrayManager
-
- Parameters
- ----------
- deep : bool or string, default True
- If False, return shallow copy (do not copy data)
- If 'all', copy data and a deep copy of the index
-
- Returns
- -------
- ArrayManager
- """
- if deep is None:
- # ArrayManager does not yet support CoW, so deep=None always means
- # deep=True for now
- deep = True
-
- # this preserves the notion of view copying of axes
- if deep:
- # hit in e.g. tests.io.json.test_pandas
-
- def copy_func(ax):
- return ax.copy(deep=True) if deep == "all" else ax.view()
-
- new_axes = [copy_func(ax) for ax in self._axes]
- else:
- new_axes = list(self._axes)
-
- if deep:
- new_arrays = [arr.copy() for arr in self.arrays]
- else:
- new_arrays = list(self.arrays)
- return type(self)(new_arrays, new_axes, verify_integrity=False)
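-
- # A minimal sketch of the ``deep`` semantics above (``mgr`` is a hypothetical
- # ArrayManager instance):
- #
- # >>> shallow = mgr.copy(deep=False)   # arrays and axes are shared
- # >>> deep = mgr.copy(deep=True)       # arrays copied, axes taken as views
- # >>> full = mgr.copy(deep="all")      # arrays copied and axes deep-copied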
-
- def reindex_indexer(
- self: T,
- new_axis,
- indexer,
- axis: AxisInt,
- fill_value=None,
- allow_dups: bool = False,
- copy: bool | None = True,
- # ignored keywords
- only_slice: bool = False,
- # ArrayManager specific keywords
- use_na_proxy: bool = False,
- ) -> T:
- axis = self._normalize_axis(axis)
- return self._reindex_indexer(
- new_axis,
- indexer,
- axis,
- fill_value,
- allow_dups,
- copy,
- use_na_proxy,
- )
-
- def _reindex_indexer(
- self: T,
- new_axis,
- indexer: npt.NDArray[np.intp] | None,
- axis: AxisInt,
- fill_value=None,
- allow_dups: bool = False,
- copy: bool | None = True,
- use_na_proxy: bool = False,
- ) -> T:
- """
- Parameters
- ----------
- new_axis : Index
- indexer : ndarray[intp] or None
- pandas indexer with -1's only
- axis : int
- fill_value : object, default None
- allow_dups : bool, default False
- copy : bool, default True
- """
- if copy is None:
- # ArrayManager does not yet support CoW, so deep=None always means
- # deep=True for now
- copy = True
-
- if indexer is None:
- if new_axis is self._axes[axis] and not copy:
- return self
-
- result = self.copy(deep=copy)
- result._axes = list(self._axes)
- result._axes[axis] = new_axis
- return result
-
- # some axes don't allow reindexing with dups
- if not allow_dups:
- self._axes[axis]._validate_can_reindex(indexer)
-
- if axis >= self.ndim:
- raise IndexError("Requested axis not found in manager")
-
- if axis == 1:
- new_arrays = []
- for i in indexer:
- if i == -1:
- arr = self._make_na_array(
- fill_value=fill_value, use_na_proxy=use_na_proxy
- )
- else:
- arr = self.arrays[i]
- if copy:
- arr = arr.copy()
- new_arrays.append(arr)
-
- else:
- validate_indices(indexer, len(self._axes[0]))
- indexer = ensure_platform_int(indexer)
- mask = indexer == -1
- needs_masking = mask.any()
- new_arrays = [
- take_1d(
- arr,
- indexer,
- allow_fill=needs_masking,
- fill_value=fill_value,
- mask=mask,
- # if fill_value is not None else blk.fill_value
- )
- for arr in self.arrays
- ]
-
- new_axes = list(self._axes)
- new_axes[axis] = new_axis
-
- return type(self)(new_arrays, new_axes, verify_integrity=False)
-
- def take(
- self: T,
- indexer,
- axis: AxisInt = 1,
- verify: bool = True,
- convert_indices: bool = True,
- ) -> T:
- """
- Take items along any axis.
- """
- axis = self._normalize_axis(axis)
-
- indexer = (
- np.arange(indexer.start, indexer.stop, indexer.step, dtype="int64")
- if isinstance(indexer, slice)
- else np.asanyarray(indexer, dtype="int64")
- )
-
- if not indexer.ndim == 1:
- raise ValueError("indexer should be 1-dimensional")
-
- n = self.shape_proper[axis]
- if convert_indices:
- indexer = maybe_convert_indices(indexer, n, verify=verify)
-
- new_labels = self._axes[axis].take(indexer)
- return self._reindex_indexer(
- new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True
- )
-
- def _make_na_array(self, fill_value=None, use_na_proxy: bool = False):
- if use_na_proxy:
- assert fill_value is None
- return NullArrayProxy(self.shape_proper[0])
-
- if fill_value is None:
- fill_value = np.nan
-
- dtype, fill_value = infer_dtype_from_scalar(fill_value)
- # error: Argument "dtype" to "empty" has incompatible type "Union[dtype[Any],
- # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
- # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
- # _DTypeDict, Tuple[Any, Any]]]"
- values = np.empty(self.shape_proper[0], dtype=dtype) # type: ignore[arg-type]
- values.fill(fill_value)
- return values
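-
- # A minimal sketch of ``_make_na_array`` (``mgr`` is a hypothetical
- # ArrayManager with three rows):
- #
- # >>> mgr._make_na_array()                   # float64 ndarray filled with NaN
- # >>> mgr._make_na_array(use_na_proxy=True)  # NullArrayProxy(3), dtype decided later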
-
- def _equal_values(self, other) -> bool:
- """
- Used in .equals defined in base class. Only check the column values
- assuming shape and indexes have already been checked.
- """
- for left, right in zip(self.arrays, other.arrays):
- if not array_equals(left, right):
- return False
- return True
-
- # TODO
- # to_dict
-
-
-class ArrayManager(BaseArrayManager):
- @property
- def ndim(self) -> Literal[2]:
- return 2
-
- def __init__(
- self,
- arrays: list[np.ndarray | ExtensionArray],
- axes: list[Index],
- verify_integrity: bool = True,
- ) -> None:
- # Note: we are storing the axes in "_axes" in the (row, columns) order
- # which contrasts with the order in which it is stored in BlockManager
- self._axes = axes
- self.arrays = arrays
-
- if verify_integrity:
- self._axes = [ensure_index(ax) for ax in axes]
- arrays = [extract_pandas_array(x, None, 1)[0] for x in arrays]
- self.arrays = [maybe_coerce_values(arr) for arr in arrays]
- self._verify_integrity()
-
- def _verify_integrity(self) -> None:
- n_rows, n_columns = self.shape_proper
- if not len(self.arrays) == n_columns:
- raise ValueError(
- "Number of passed arrays must equal the size of the column Index: "
- f"{len(self.arrays)} arrays vs {n_columns} columns."
- )
- for arr in self.arrays:
- if not len(arr) == n_rows:
- raise ValueError(
- "Passed arrays should have the same length as the rows Index: "
- f"{len(arr)} vs {n_rows} rows"
- )
- if not isinstance(arr, (np.ndarray, ExtensionArray)):
- raise ValueError(
- "Passed arrays should be np.ndarray or ExtensionArray instances, "
- f"got {type(arr)} instead"
- )
- if not arr.ndim == 1:
- raise ValueError(
- "Passed arrays should be 1-dimensional, got array with "
- f"{arr.ndim} dimensions instead."
- )
-
- # --------------------------------------------------------------------
- # Indexing
-
- def fast_xs(self, loc: int) -> SingleArrayManager:
- """
- Return the array corresponding to `frame.iloc[loc]`.
-
- Parameters
- ----------
- loc : int
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- dtype = interleaved_dtype([arr.dtype for arr in self.arrays])
-
- values = [arr[loc] for arr in self.arrays]
- if isinstance(dtype, ExtensionDtype):
- result = dtype.construct_array_type()._from_sequence(values, dtype=dtype)
- # for datetime64/timedelta64, the np.ndarray constructor cannot handle pd.NaT
- elif is_datetime64_ns_dtype(dtype):
- result = DatetimeArray._from_sequence(values, dtype=dtype)._ndarray
- elif is_timedelta64_ns_dtype(dtype):
- result = TimedeltaArray._from_sequence(values, dtype=dtype)._ndarray
- else:
- result = np.array(values, dtype=dtype)
- return SingleArrayManager([result], [self._axes[1]])
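-
- # A minimal sketch of ``fast_xs`` (assumes ``import pandas as pd`` and that the
- # ArrayManager backend is active, e.g. via the ``mode.data_manager`` option):
- #
- # >>> df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
- # >>> df._mgr.fast_xs(0)   # SingleArrayManager holding array([1.0, 1.5]),
- # ...                      # i.e. the row cast to the interleaved float64 dtype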
-
- def get_slice(self, slobj: slice, axis: AxisInt = 0) -> ArrayManager:
- axis = self._normalize_axis(axis)
-
- if axis == 0:
- arrays = [arr[slobj] for arr in self.arrays]
- elif axis == 1:
- arrays = self.arrays[slobj]
-
- new_axes = list(self._axes)
- new_axes[axis] = new_axes[axis]._getitem_slice(slobj)
-
- return type(self)(arrays, new_axes, verify_integrity=False)
-
- def iget(self, i: int) -> SingleArrayManager:
- """
- Return the data as a SingleArrayManager.
- """
- values = self.arrays[i]
- return SingleArrayManager([values], [self._axes[0]])
-
- def iget_values(self, i: int) -> ArrayLike:
- """
- Return the data for column i as the values (ndarray or ExtensionArray).
- """
- return self.arrays[i]
-
- @property
- def column_arrays(self) -> list[ArrayLike]:
- """
- Used in the JSON C code to access column arrays.
- """
-
- return [np.asarray(arr) for arr in self.arrays]
-
- def iset(
- self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
- ) -> None:
- """
- Set new column(s).
-
- This changes the ArrayManager in-place: it replaces (an) existing
- column(s), without changing the column values in-place.
-
- Parameters
- ----------
- loc : integer, slice or boolean mask
- Positional location (already bounds checked)
- value : np.ndarray or ExtensionArray
- inplace : bool, default False
- Whether to overwrite the existing array, as opposed to replacing it.
- """
- # single column -> single integer index
- if lib.is_integer(loc):
- # TODO can we avoid needing to unpack this here? That means converting
- # DataFrame into 1D array when loc is an integer
- if isinstance(value, np.ndarray) and value.ndim == 2:
- assert value.shape[1] == 1
- value = value[:, 0]
-
- # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
- # but we should avoid that and pass directly the proper array
- value = maybe_coerce_values(value)
-
- assert isinstance(value, (np.ndarray, ExtensionArray))
- assert value.ndim == 1
- assert len(value) == len(self._axes[0])
- self.arrays[loc] = value
- return
-
- # multiple columns -> convert slice or array to integer indices
- elif isinstance(loc, slice):
- indices = range(
- loc.start if loc.start is not None else 0,
- loc.stop if loc.stop is not None else self.shape_proper[1],
- loc.step if loc.step is not None else 1,
- )
- else:
- assert isinstance(loc, np.ndarray)
- assert loc.dtype == "bool"
- # error: Incompatible types in assignment (expression has type "ndarray",
- # variable has type "range")
- indices = np.nonzero(loc)[0] # type: ignore[assignment]
-
- assert value.ndim == 2
- assert value.shape[0] == len(self._axes[0])
-
- for value_idx, mgr_idx in enumerate(indices):
- # error: No overload variant of "__getitem__" of "ExtensionArray" matches
- # argument type "Tuple[slice, int]"
- value_arr = value[:, value_idx] # type: ignore[call-overload]
- self.arrays[mgr_idx] = value_arr
- return
-
- def column_setitem(
- self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
- ) -> None:
- """
- Set values ("setitem") into a single column (not setting the full column).
-
- This is a method on the ArrayManager level, to avoid creating an
- intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
- """
- if not is_integer(loc):
- raise TypeError("The column index should be an integer")
- arr = self.arrays[loc]
- mgr = SingleArrayManager([arr], [self._axes[0]])
- if inplace_only:
- mgr.setitem_inplace(idx, value)
- else:
- new_mgr = mgr.setitem((idx,), value)
- # update existing ArrayManager in-place
- self.arrays[loc] = new_mgr.arrays[0]
-
- def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
- """
- Insert item at selected position.
-
- Parameters
- ----------
- loc : int
- item : hashable
- value : np.ndarray or ExtensionArray
- """
- # insert to the axis; this could possibly raise a TypeError
- new_axis = self.items.insert(loc, item)
-
- value = extract_array(value, extract_numpy=True)
- if value.ndim == 2:
- if value.shape[0] == 1:
- # error: No overload variant of "__getitem__" of "ExtensionArray"
- # matches argument type "Tuple[int, slice]"
- value = value[0, :] # type: ignore[call-overload]
- else:
- raise ValueError(
- f"Expected a 1D array, got an array with shape {value.shape}"
- )
- value = maybe_coerce_values(value)
-
- # TODO self.arrays can be empty
- # assert len(value) == len(self.arrays[0])
-
- # TODO is this copy needed?
- arrays = self.arrays.copy()
- arrays.insert(loc, value)
-
- self.arrays = arrays
- self._axes[1] = new_axis
-
- def idelete(self, indexer) -> ArrayManager:
- """
- Delete selected locations in-place (new arrays, same ArrayManager).
- """
- to_keep = np.ones(self.shape[0], dtype=np.bool_)
- to_keep[indexer] = False
-
- self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]]
- self._axes = [self._axes[0], self._axes[1][to_keep]]
- return self
-
- # --------------------------------------------------------------------
- # Array-wise Operation
-
- def grouped_reduce(self: T, func: Callable) -> T:
- """
- Apply grouped reduction function columnwise, returning a new ArrayManager.
-
- Parameters
- ----------
- func : grouped reduction function
-
- Returns
- -------
- ArrayManager
- """
- result_arrays: list[np.ndarray] = []
- result_indices: list[int] = []
-
- for i, arr in enumerate(self.arrays):
- # grouped_reduce functions all expect 2D arrays
- arr = ensure_block_shape(arr, ndim=2)
- res = func(arr)
- if res.ndim == 2:
- # reverse of ensure_block_shape
- assert res.shape[0] == 1
- res = res[0]
-
- result_arrays.append(res)
- result_indices.append(i)
-
- if len(result_arrays) == 0:
- nrows = 0
- else:
- nrows = result_arrays[0].shape[0]
- index = Index(range(nrows))
-
- columns = self.items
-
- # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
- # expected "List[Union[ndarray, ExtensionArray]]"
- return type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
-
- def reduce(self: T, func: Callable) -> T:
- """
- Apply reduction function column-wise, returning a single-row ArrayManager.
-
- Parameters
- ----------
- func : reduction function
-
- Returns
- -------
- ArrayManager
- """
- result_arrays: list[np.ndarray] = []
- for i, arr in enumerate(self.arrays):
- res = func(arr, axis=0)
-
- # TODO NaT doesn't preserve dtype, so we need to ensure to create
- # a timedelta result array if original was timedelta
- # what if datetime results in timedelta? (eg std)
- dtype = arr.dtype if res is NaT else None
- result_arrays.append(
- sanitize_array([res], None, dtype=dtype) # type: ignore[arg-type]
- )
-
- index = Index._simple_new(np.array([None], dtype=object)) # placeholder
- columns = self.items
-
- # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]";
- # expected "List[Union[ndarray, ExtensionArray]]"
- new_mgr = type(self)(result_arrays, [index, columns]) # type: ignore[arg-type]
- return new_mgr
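-
- # A minimal sketch of ``reduce`` (``mgr`` is a hypothetical two-column
- # ArrayManager; the function is called per column as ``func(arr, axis=0)``):
- #
- # >>> res = mgr.reduce(np.sum)   # single-row ArrayManager, one scalar per column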
-
- def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
- """
- Apply array_op blockwise with another (aligned) ArrayManager.
- """
- # TODO what if `other` is BlockManager ?
- left_arrays = self.arrays
- right_arrays = other.arrays
- result_arrays = [
- array_op(left, right) for left, right in zip(left_arrays, right_arrays)
- ]
- return type(self)(result_arrays, self._axes)
-
- def quantile(
- self,
- *,
- qs: Index, # with dtype float64
- axis: AxisInt = 0,
- transposed: bool = False,
- interpolation: QuantileInterpolation = "linear",
- ) -> ArrayManager:
- arrs = [ensure_block_shape(x, 2) for x in self.arrays]
- assert axis == 1
- new_arrs = [
- quantile_compat(x, np.asarray(qs._values), interpolation) for x in arrs
- ]
- for i, arr in enumerate(new_arrs):
- if arr.ndim == 2:
- assert arr.shape[0] == 1, arr.shape
- new_arrs[i] = arr[0]
-
- axes = [qs, self._axes[1]]
- return type(self)(new_arrs, axes)
-
- # ----------------------------------------------------------------
-
- def unstack(self, unstacker, fill_value) -> ArrayManager:
- """
- Return an ArrayManager with all arrays unstacked.
-
- Parameters
- ----------
- unstacker : reshape._Unstacker
- fill_value : Any
- fill_value for newly introduced missing values.
-
- Returns
- -------
- unstacked : ArrayManager
- """
- indexer, _ = unstacker._indexer_and_to_sort
- if unstacker.mask.all():
- new_indexer = indexer
- allow_fill = False
- new_mask2D = None
- needs_masking = None
- else:
- new_indexer = np.full(unstacker.mask.shape, -1)
- new_indexer[unstacker.mask] = indexer
- allow_fill = True
- # calculating the full mask once and passing it to take_1d is faster
- # than letting take_1d calculate it in each repeated call
- new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
- needs_masking = new_mask2D.any(axis=0)
- new_indexer2D = new_indexer.reshape(*unstacker.full_shape)
- new_indexer2D = ensure_platform_int(new_indexer2D)
-
- new_arrays = []
- for arr in self.arrays:
- for i in range(unstacker.full_shape[1]):
- if allow_fill:
- # error: Value of type "Optional[Any]" is not indexable [index]
- new_arr = take_1d(
- arr,
- new_indexer2D[:, i],
- allow_fill=needs_masking[i], # type: ignore[index]
- fill_value=fill_value,
- mask=new_mask2D[:, i], # type: ignore[index]
- )
- else:
- new_arr = take_1d(arr, new_indexer2D[:, i], allow_fill=False)
- new_arrays.append(new_arr)
-
- new_index = unstacker.new_index
- new_columns = unstacker.get_new_columns(self._axes[1])
- new_axes = [new_index, new_columns]
-
- return type(self)(new_arrays, new_axes, verify_integrity=False)
-
- def as_array(
- self,
- dtype=None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- """
- Convert the ArrayManager data into a numpy array.
-
- Parameters
- ----------
- dtype : object, default None
- Data type of the return array.
- copy : bool, default False
- If True then guarantee that a copy is returned. A value of
- False does not guarantee that the underlying data is not
- copied.
- na_value : object, default lib.no_default
- Value to be used as the missing value sentinel.
-
- Returns
- -------
- arr : ndarray
- """
- if len(self.arrays) == 0:
- empty_arr = np.empty(self.shape, dtype=float)
- return empty_arr.transpose()
-
- # We want to copy when na_value is provided to avoid
- # mutating the original object
- copy = copy or na_value is not lib.no_default
-
- if not dtype:
- dtype = interleaved_dtype([arr.dtype for arr in self.arrays])
-
- if isinstance(dtype, SparseDtype):
- dtype = dtype.subtype
- elif isinstance(dtype, PandasDtype):
- dtype = dtype.numpy_dtype
- elif is_extension_array_dtype(dtype):
- dtype = "object"
- elif is_dtype_equal(dtype, str):
- dtype = "object"
-
- result = np.empty(self.shape_proper, dtype=dtype)
-
- for i, arr in enumerate(self.arrays):
- arr = arr.astype(dtype, copy=copy)
- result[:, i] = arr
-
- if na_value is not lib.no_default:
- result[isna(result)] = na_value
-
- return result
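-
- # A minimal sketch of ``as_array`` (``mgr`` is a hypothetical ArrayManager):
- #
- # >>> mgr.as_array()                 # interleaved common dtype of all columns
- # >>> mgr.as_array(dtype="float64")  # force a float64 result
- # >>> mgr.as_array(na_value=0.0)     # forces a copy and fills missing values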
-
-
-class SingleArrayManager(BaseArrayManager, SingleDataManager):
- __slots__ = [
- "_axes", # private attribute, because 'axes' has different order, see below
- "arrays",
- ]
-
- arrays: list[np.ndarray | ExtensionArray]
- _axes: list[Index]
-
- @property
- def ndim(self) -> Literal[1]:
- return 1
-
- def __init__(
- self,
- arrays: list[np.ndarray | ExtensionArray],
- axes: list[Index],
- verify_integrity: bool = True,
- ) -> None:
- self._axes = axes
- self.arrays = arrays
-
- if verify_integrity:
- assert len(axes) == 1
- assert len(arrays) == 1
- self._axes = [ensure_index(ax) for ax in self._axes]
- arr = arrays[0]
- arr = maybe_coerce_values(arr)
- arr = extract_pandas_array(arr, None, 1)[0]
- self.arrays = [arr]
- self._verify_integrity()
-
- def _verify_integrity(self) -> None:
- (n_rows,) = self.shape
- assert len(self.arrays) == 1
- arr = self.arrays[0]
- assert len(arr) == n_rows
- if not arr.ndim == 1:
- raise ValueError(
- "Passed array should be 1-dimensional, got array with "
- f"{arr.ndim} dimensions instead."
- )
-
- @staticmethod
- def _normalize_axis(axis):
- return axis
-
- def make_empty(self, axes=None) -> SingleArrayManager:
- """Return an empty ArrayManager with index/array of length 0"""
- if axes is None:
- axes = [Index([], dtype=object)]
- array: np.ndarray = np.array([], dtype=self.dtype)
- return type(self)([array], axes)
-
- @classmethod
- def from_array(cls, array, index) -> SingleArrayManager:
- return cls([array], [index])
-
- @property
- def axes(self):
- return self._axes
-
- @property
- def index(self) -> Index:
- return self._axes[0]
-
- @property
- def dtype(self):
- return self.array.dtype
-
- def external_values(self):
- """The array that Series.values returns"""
- return external_values(self.array)
-
- def internal_values(self):
- """The array that Series._values returns"""
- return self.array
-
- def array_values(self):
- """The array that Series.array returns"""
- arr = self.array
- if isinstance(arr, np.ndarray):
- arr = PandasArray(arr)
- return arr
-
- @property
- def _can_hold_na(self) -> bool:
- if isinstance(self.array, np.ndarray):
- return self.array.dtype.kind not in ["b", "i", "u"]
- else:
- # ExtensionArray
- return self.array._can_hold_na
-
- @property
- def is_single_block(self) -> bool:
- return True
-
- def fast_xs(self, loc: int) -> SingleArrayManager:
- raise NotImplementedError("Use series._values[loc] instead")
-
- def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleArrayManager:
- if axis >= self.ndim:
- raise IndexError("Requested axis not found in manager")
-
- new_array = self.array[slobj]
- new_index = self.index._getitem_slice(slobj)
- return type(self)([new_array], [new_index], verify_integrity=False)
-
- def getitem_mgr(self, indexer) -> SingleArrayManager:
- new_array = self.array[indexer]
- new_index = self.index[indexer]
- return type(self)([new_array], [new_index])
-
- def apply(self, func, **kwargs):
- if callable(func):
- new_array = func(self.array, **kwargs)
- else:
- new_array = getattr(self.array, func)(**kwargs)
- return type(self)([new_array], self._axes)
-
- def setitem(self, indexer, value) -> SingleArrayManager:
- """
- Set values with indexer.
-
- For SingleArrayManager, this backs s[indexer] = value
-
- See `setitem_inplace` for a version that works inplace and doesn't
- return a new Manager.
- """
- if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
- raise ValueError(f"Cannot set values with ndim > {self.ndim}")
- return self.apply_with_block("setitem", indexer=indexer, value=value)
-
- def idelete(self, indexer) -> SingleArrayManager:
- """
- Delete selected locations in-place (new array, same ArrayManager)
- """
- to_keep = np.ones(self.shape[0], dtype=np.bool_)
- to_keep[indexer] = False
-
- self.arrays = [self.arrays[0][to_keep]]
- self._axes = [self._axes[0][to_keep]]
- return self
-
- def _get_data_subset(self, predicate: Callable) -> SingleArrayManager:
- # used in get_numeric_data / get_bool_data
- if predicate(self.array):
- return type(self)(self.arrays, self._axes, verify_integrity=False)
- else:
- return self.make_empty()
-
- def set_values(self, values: ArrayLike) -> None:
- """
- Set (replace) the values of the SingleArrayManager in place.
-
- Use at your own risk! This does not check if the passed values are
- valid for the current SingleArrayManager (length, dtype, etc).
- """
- self.arrays[0] = values
-
- def to_2d_mgr(self, columns: Index) -> ArrayManager:
- """
- Manager analogue of Series.to_frame
- """
- arrays = [self.arrays[0]]
- axes = [self.axes[0], columns]
-
- return ArrayManager(arrays, axes, verify_integrity=False)
-
-
-class NullArrayProxy:
- """
- Proxy object for an all-NA array.
-
- Only stores the length of the array, and not the dtype. The dtype
- will only be known when actually concatenating (after determining the
- common dtype, for which this proxy is ignored).
- Using this object avoids having internals/concat.py determine
- the proper dtype and array type.
- """
-
- ndim = 1
-
- def __init__(self, n: int) -> None:
- self.n = n
-
- @property
- def shape(self) -> tuple[int]:
- return (self.n,)
-
- def to_array(self, dtype: DtypeObj) -> ArrayLike:
- """
- Helper function to create the actual all-NA array from the NullArrayProxy
- object.
-
- Parameters
- ----------
- dtype : DtypeObj
- The dtype for the resulting array.
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- if isinstance(dtype, ExtensionDtype):
- empty = dtype.construct_array_type()._from_sequence([], dtype=dtype)
- indexer = -np.ones(self.n, dtype=np.intp)
- return empty.take(indexer, allow_fill=True)
- else:
- # when introducing missing values, int becomes float, bool becomes object
- dtype = ensure_dtype_can_hold_na(dtype)
- fill_value = na_value_for_dtype(dtype)
- arr = np.empty(self.n, dtype=dtype)
- arr.fill(fill_value)
- return ensure_wrapped_if_datetimelike(arr)
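-
- # A minimal sketch of how a NullArrayProxy is resolved (the dtype is supplied
- # by the concat machinery; shown here with plain numpy dtypes):
- #
- # >>> proxy = NullArrayProxy(3)
- # >>> proxy.to_array(np.dtype("int64"))    # float64 array of NaN (int64 cannot hold NA)
- # >>> proxy.to_array(np.dtype("object"))   # object array of NaN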
diff --git a/contrib/python/pandas/py3/pandas/core/internals/base.py b/contrib/python/pandas/py3/pandas/core/internals/base.py
deleted file mode 100644
index bf48e1ff0a6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/base.py
+++ /dev/null
@@ -1,224 +0,0 @@
-"""
-Base class for the internal managers. Both BlockManager and ArrayManager
-inherit from this class.
-"""
-from __future__ import annotations
-
-from typing import (
- Literal,
- TypeVar,
- final,
-)
-
-import numpy as np
-
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- DtypeObj,
- Shape,
-)
-from pandas.errors import AbstractMethodError
-
-from pandas.core.dtypes.cast import (
- find_common_type,
- np_can_hold_element,
-)
-
-from pandas.core.base import PandasObject
-from pandas.core.indexes.api import (
- Index,
- default_index,
-)
-
-T = TypeVar("T", bound="DataManager")
-
-
-class DataManager(PandasObject):
- # TODO share more methods/attributes
-
- axes: list[Index]
-
- @property
- def items(self) -> Index:
- raise AbstractMethodError(self)
-
- @final
- def __len__(self) -> int:
- return len(self.items)
-
- @property
- def ndim(self) -> int:
- return len(self.axes)
-
- @property
- def shape(self) -> Shape:
- return tuple(len(ax) for ax in self.axes)
-
- @final
- def _validate_set_axis(self, axis: AxisInt, new_labels: Index) -> None:
- # Caller is responsible for ensuring we have an Index object.
- old_len = len(self.axes[axis])
- new_len = len(new_labels)
-
- if axis == 1 and len(self.items) == 0:
- # If we are setting the index on a DataFrame with no columns,
- # it is OK to change the length.
- pass
-
- elif new_len != old_len:
- raise ValueError(
- f"Length mismatch: Expected axis has {old_len} elements, new "
- f"values have {new_len} elements"
- )
-
- def reindex_indexer(
- self: T,
- new_axis,
- indexer,
- axis: AxisInt,
- fill_value=None,
- allow_dups: bool = False,
- copy: bool = True,
- only_slice: bool = False,
- ) -> T:
- raise AbstractMethodError(self)
-
- @final
- def reindex_axis(
- self: T,
- new_index: Index,
- axis: AxisInt,
- fill_value=None,
- only_slice: bool = False,
- ) -> T:
- """
- Conform data manager to new index.
- """
- new_index, indexer = self.axes[axis].reindex(new_index)
-
- return self.reindex_indexer(
- new_index,
- indexer,
- axis=axis,
- fill_value=fill_value,
- copy=False,
- only_slice=only_slice,
- )
-
- def _equal_values(self: T, other: T) -> bool:
- """
- To be implemented by the subclasses. Only check the column values
- assuming shape and indexes have already been checked.
- """
- raise AbstractMethodError(self)
-
- @final
- def equals(self, other: object) -> bool:
- """
- Implementation for DataFrame.equals
- """
- if not isinstance(other, DataManager):
- return False
-
- self_axes, other_axes = self.axes, other.axes
- if len(self_axes) != len(other_axes):
- return False
- if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
- return False
-
- return self._equal_values(other)
-
- def apply(
- self: T,
- f,
- align_keys: list[str] | None = None,
- **kwargs,
- ) -> T:
- raise AbstractMethodError(self)
-
- @final
- def isna(self: T, func) -> T:
- return self.apply("apply", func=func)
-
- # --------------------------------------------------------------------
- # Consolidation: No-ops for all but BlockManager
-
- def is_consolidated(self) -> bool:
- return True
-
- def consolidate(self: T) -> T:
- return self
-
- def _consolidate_inplace(self) -> None:
- return
-
-
-class SingleDataManager(DataManager):
- @property
- def ndim(self) -> Literal[1]:
- return 1
-
- @final
- @property
- def array(self) -> ArrayLike:
- """
- Quick access to the backing array of the Block or SingleArrayManager.
- """
- # error: "SingleDataManager" has no attribute "arrays"; maybe "array"
- return self.arrays[0] # type: ignore[attr-defined]
-
- def setitem_inplace(self, indexer, value) -> None:
- """
- Set values with indexer.
-
- For Single[Block/Array]Manager, this backs s[indexer] = value
-
- This is an inplace version of `setitem()`, mutating the manager/values
- in place, not returning a new Manager (and Block), and thus never changing
- the dtype.
- """
- arr = self.array
-
- # EAs will do this validation in their own __setitem__ methods.
- if isinstance(arr, np.ndarray):
- # Note: checking for ndarray instead of np.dtype means we exclude
- # dt64/td64, which do their own validation.
- value = np_can_hold_element(arr.dtype, value)
-
- if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
- # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
- value = value[0, ...]
-
- arr[indexer] = value
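-
- # A minimal sketch of the validation step above, for a float64 backing array:
- #
- # >>> np_can_hold_element(np.dtype("float64"), 2)     # accepted, value returned
- # >>> np_can_hold_element(np.dtype("float64"), "x")   # raises LossySetitemError
- #
- # Callers such as Block.setitem catch LossySetitemError and coerce to a
- # common dtype before retrying.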
-
- def grouped_reduce(self, func):
- arr = self.array
- res = func(arr)
- index = default_index(len(res))
-
- mgr = type(self).from_array(res, index)
- return mgr
-
- @classmethod
- def from_array(cls, arr: ArrayLike, index: Index):
- raise AbstractMethodError(cls)
-
-
-def interleaved_dtype(dtypes: list[DtypeObj]) -> DtypeObj | None:
- """
- Find the common dtype for the given dtypes.
-
- Parameters
- ----------
- dtypes : list[DtypeObj]
-
- Returns
- -------
- dtype : np.dtype, ExtensionDtype, or None
- None is returned when `dtypes` is empty.
- """
- if not len(dtypes):
- return None
-
- return find_common_type(dtypes)
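-
- # A minimal usage sketch for ``interleaved_dtype`` (the results follow
- # ``find_common_type``):
- #
- # >>> interleaved_dtype([np.dtype("int64"), np.dtype("float64")])  # float64
- # >>> interleaved_dtype([np.dtype("int64"), np.dtype("object")])   # object
- # >>> interleaved_dtype([])                                        # None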
diff --git a/contrib/python/pandas/py3/pandas/core/internals/blocks.py b/contrib/python/pandas/py3/pandas/core/internals/blocks.py
deleted file mode 100644
index b2a6b1fa392..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/blocks.py
+++ /dev/null
@@ -1,2607 +0,0 @@
-from __future__ import annotations
-
-from functools import wraps
-import re
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Iterable,
- Sequence,
- cast,
- final,
-)
-
-import numpy as np
-
-from pandas._config import using_copy_on_write
-
-from pandas._libs import (
- internals as libinternals,
- lib,
- writers,
-)
-from pandas._libs.internals import (
- BlockPlacement,
- BlockValuesRefs,
-)
-from pandas._libs.missing import NA
-from pandas._libs.tslibs import IncompatibleFrequency
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- DtypeObj,
- F,
- FillnaOptions,
- IgnoreRaise,
- QuantileInterpolation,
- Shape,
- npt,
-)
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import cache_readonly
-from pandas.util._validators import validate_bool_kwarg
-
-from pandas.core.dtypes.astype import (
- astype_array_safe,
- astype_is_view,
-)
-from pandas.core.dtypes.cast import (
- LossySetitemError,
- can_hold_element,
- find_result_type,
- maybe_downcast_to_dtype,
- np_can_hold_element,
-)
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- is_1d_only_ea_dtype,
- is_1d_only_ea_obj,
- is_dtype_equal,
- is_interval_dtype,
- is_list_like,
- is_sparse,
- is_string_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- DatetimeTZDtype,
- ExtensionDtype,
- PandasDtype,
- PeriodDtype,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCPandasArray,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
- na_value_for_dtype,
-)
-
-from pandas.core import missing
-import pandas.core.algorithms as algos
-from pandas.core.array_algos.putmask import (
- extract_bool_array,
- putmask_inplace,
- putmask_without_repeat,
- setitem_datetimelike_compat,
- validate_putmask,
-)
-from pandas.core.array_algos.quantile import quantile_compat
-from pandas.core.array_algos.replace import (
- compare_or_regex_search,
- replace_regex,
- should_use_regex,
-)
-from pandas.core.array_algos.transforms import shift
-from pandas.core.arrays import (
- Categorical,
- DatetimeArray,
- ExtensionArray,
- IntervalArray,
- PandasArray,
- PeriodArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.sparse import SparseDtype
-from pandas.core.base import PandasObject
-import pandas.core.common as com
-from pandas.core.computation import expressions
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
-from pandas.core.indexers import check_setitem_lengths
-
-if TYPE_CHECKING:
- from pandas.core.api import Index
- from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-
-# comparison is faster than is_object_dtype
-_dtype_obj = np.dtype("object")
-
-
-def maybe_split(meth: F) -> F:
- """
- If we have a multi-column block, split and operate block-wise. Otherwise
- use the original method.
- """
-
- @wraps(meth)
- def newfunc(self, *args, **kwargs) -> list[Block]:
- if self.ndim == 1 or self.shape[0] == 1:
- return meth(self, *args, **kwargs)
- else:
- # Split and operate column-by-column
- return self.split_and_operate(meth, *args, **kwargs)
-
- return cast(F, newfunc)
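-
- # A minimal sketch of how ``maybe_split`` is applied (``_downcast_2d`` further
- # below is a real use): the decorated method only ever sees single-column
- # blocks; the decorator handles the column-wise dispatch.
- #
- # >>> @maybe_split
- # ... def _some_block_method(self, *args, **kwargs) -> list[Block]:
- # ...     ...  # hypothetical method; operates on a block with shape[0] == 1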
-
-
-class Block(PandasObject):
- """
- Canonical n-dimensional unit of homogeneous dtype contained in a pandas
- data structure
-
- Index-ignorant; let the container take care of that
- """
-
- values: np.ndarray | ExtensionArray
- ndim: int
- refs: BlockValuesRefs
- __init__: Callable
-
- __slots__ = ()
- is_numeric = False
- is_object = False
- is_extension = False
- _can_consolidate = True
- _validate_ndim = True
-
- @final
- @cache_readonly
- def _consolidate_key(self):
- return self._can_consolidate, self.dtype.name
-
- @final
- @cache_readonly
- def _can_hold_na(self) -> bool:
- """
- Can we store NA values in this Block?
- """
- dtype = self.dtype
- if isinstance(dtype, np.dtype):
- return dtype.kind not in ["b", "i", "u"]
- return dtype._can_hold_na
-
- @final
- @property
- def is_bool(self) -> bool:
- """
- We can be bool if a) we are bool dtype or b) object dtype with bool objects.
- """
- return self.values.dtype == np.dtype(bool)
-
- @final
- def external_values(self):
- return external_values(self.values)
-
- @final
- @cache_readonly
- def fill_value(self):
- # Used in reindex_indexer
- return na_value_for_dtype(self.dtype, compat=False)
-
- @final
- def _standardize_fill_value(self, value):
- # if we are passed a scalar None, convert it here
- if self.dtype != _dtype_obj and is_valid_na_for_dtype(value, self.dtype):
- value = self.fill_value
- return value
-
- @property
- def mgr_locs(self) -> BlockPlacement:
- return self._mgr_locs
-
- @mgr_locs.setter
- def mgr_locs(self, new_mgr_locs: BlockPlacement) -> None:
- self._mgr_locs = new_mgr_locs
-
- @final
- def make_block(
- self, values, placement=None, refs: BlockValuesRefs | None = None
- ) -> Block:
- """
- Create a new block, with type inference, propagating any values that
- are not specified.
- """
- if placement is None:
- placement = self._mgr_locs
- if self.is_extension:
- values = ensure_block_shape(values, ndim=self.ndim)
-
- # TODO: perf by not going through new_block
- # We assume maybe_coerce_values has already been called
- return new_block(values, placement=placement, ndim=self.ndim, refs=refs)
-
- @final
- def make_block_same_class(
- self,
- values,
- placement: BlockPlacement | None = None,
- refs: BlockValuesRefs | None = None,
- ) -> Block:
- """Wrap given values in a block of same type as self."""
- # Pre-2.0 we called ensure_wrapped_if_datetimelike because fastparquet
- # relied on it, as of 2.0 the caller is responsible for this.
- if placement is None:
- placement = self._mgr_locs
-
- # We assume maybe_coerce_values has already been called
- return type(self)(values, placement=placement, ndim=self.ndim, refs=refs)
-
- @final
- def __repr__(self) -> str:
- # don't want to print out all of the items here
- name = type(self).__name__
- if self.ndim == 1:
- result = f"{name}: {len(self)} dtype: {self.dtype}"
- else:
- shape = " x ".join([str(s) for s in self.shape])
- result = f"{name}: {self.mgr_locs.indexer}, {shape}, dtype: {self.dtype}"
-
- return result
-
- @final
- def __len__(self) -> int:
- return len(self.values)
-
- @final
- def getitem_block(self, slicer: slice | npt.NDArray[np.intp]) -> Block:
- """
- Perform __getitem__-like, return result as block.
-
- Only supports slices that preserve dimensionality.
- """
- # Note: the only place where we are called with ndarray[intp]
- # is from internals.concat, and we can verify that never happens
- # with 1-column blocks, i.e. never for ExtensionBlock.
-
- new_mgr_locs = self._mgr_locs[slicer]
-
- new_values = self._slice(slicer)
- refs = self.refs if isinstance(slicer, slice) else None
- return type(self)(new_values, new_mgr_locs, self.ndim, refs=refs)
-
- @final
- def getitem_block_columns(
- self, slicer: slice, new_mgr_locs: BlockPlacement
- ) -> Block:
- """
- Perform __getitem__-like, return result as block.
-
- Only supports slices that preserve dimensionality.
- """
- new_values = self._slice(slicer)
-
- if new_values.ndim != self.values.ndim:
- raise ValueError("Only same dim slicing is allowed")
-
- return type(self)(new_values, new_mgr_locs, self.ndim, refs=self.refs)
-
- @final
- def _can_hold_element(self, element: Any) -> bool:
- """require the same dtype as ourselves"""
- element = extract_array(element, extract_numpy=True)
- return can_hold_element(self.values, element)
-
- @final
- def should_store(self, value: ArrayLike) -> bool:
- """
- Should we set self.values[indexer] = value inplace or do we need to cast?
-
- Parameters
- ----------
- value : np.ndarray or ExtensionArray
-
- Returns
- -------
- bool
- """
- # faster equivalent to is_dtype_equal(value.dtype, self.dtype)
- try:
- return value.dtype == self.dtype
- except TypeError:
- return False
-
- # ---------------------------------------------------------------------
- # Apply/Reduce and Helpers
-
- @final
- def apply(self, func, **kwargs) -> list[Block]:
- """
- Apply the function to my values and return the result wrapped in a
- list of blocks.
- """
- result = func(self.values, **kwargs)
-
- return self._split_op_result(result)
-
- @final
- def reduce(self, func) -> list[Block]:
- # We will apply the function and reshape the result into a single-row
- # Block with the same mgr_locs; squeezing will be done at a higher level
- assert self.ndim == 2
-
- result = func(self.values)
-
- if self.values.ndim == 1:
- # TODO(EA2D): special case not needed with 2D EAs
- res_values = np.array([[result]])
- else:
- res_values = result.reshape(-1, 1)
-
- nb = self.make_block(res_values)
- return [nb]
-
- @final
- def _split_op_result(self, result: ArrayLike) -> list[Block]:
- # See also: split_and_operate
- if result.ndim > 1 and isinstance(result.dtype, ExtensionDtype):
- # TODO(EA2D): unnecessary with 2D EAs
- # if we get a 2D ExtensionArray, we need to split it into 1D pieces
- nbs = []
- for i, loc in enumerate(self._mgr_locs):
- if not is_1d_only_ea_obj(result):
- vals = result[i : i + 1]
- else:
- vals = result[i]
-
- block = self.make_block(values=vals, placement=loc)
- nbs.append(block)
- return nbs
-
- nb = self.make_block(result)
-
- return [nb]
-
- @final
- def _split(self) -> list[Block]:
- """
- Split a block into a list of single-column blocks.
- """
- assert self.ndim == 2
-
- new_blocks = []
- for i, ref_loc in enumerate(self._mgr_locs):
- vals = self.values[slice(i, i + 1)]
-
- bp = BlockPlacement(ref_loc)
- nb = type(self)(vals, placement=bp, ndim=2, refs=self.refs)
- new_blocks.append(nb)
- return new_blocks
-
- @final
- def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
- """
- Split the block and apply func column-by-column.
-
- Parameters
- ----------
- func : Block method
- *args
- **kwargs
-
- Returns
- -------
- List[Block]
- """
- assert self.ndim == 2 and self.shape[0] != 1
-
- res_blocks = []
- for nb in self._split():
- rbs = func(nb, *args, **kwargs)
- res_blocks.extend(rbs)
- return res_blocks
-
- # ---------------------------------------------------------------------
- # Up/Down-casting
-
- @final
- def coerce_to_target_dtype(self, other) -> Block:
- """
- Coerce the current block to a dtype compatible with other;
- we will return a block, possibly object, and not raise.
-
- We can also safely try to coerce to the same dtype
- and will receive the same block.
- """
- new_dtype = find_result_type(self.values, other)
-
- return self.astype(new_dtype, copy=False)
-
- @final
- def _maybe_downcast(
- self, blocks: list[Block], downcast=None, using_cow: bool = False
- ) -> list[Block]:
- if downcast is False:
- return blocks
-
- if self.dtype == _dtype_obj:
- # TODO: does it matter that self.dtype might not match blocks[i].dtype?
- # GH#44241 We downcast regardless of the argument;
- # respecting 'downcast=None' may be worthwhile at some point,
- # but ATM it breaks too much existing code.
- # split and convert the blocks
-
- return extend_blocks(
- [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
- )
-
- if downcast is None:
- return blocks
-
- return extend_blocks([b._downcast_2d(downcast, using_cow) for b in blocks])
-
- @final
- @maybe_split
- def _downcast_2d(self, dtype, using_cow: bool = False) -> list[Block]:
- """
- downcast specialized to 2D case post-validation.
-
- Refactored to allow use of maybe_split.
- """
- new_values = maybe_downcast_to_dtype(self.values, dtype=dtype)
- refs = self.refs if using_cow and new_values is self.values else None
- return [self.make_block(new_values, refs=refs)]
-
- def convert(
- self,
- *,
- copy: bool = True,
- using_cow: bool = False,
- ) -> list[Block]:
- """
- Attempt to coerce any object types to better types and return a copy
- of the block (if copy = True). By definition we are not an ObjectBlock
- here!
- """
- if not copy and using_cow:
- return [self.copy(deep=False)]
- return [self.copy()] if copy else [self]
-
- # ---------------------------------------------------------------------
- # Array-Like Methods
-
- @cache_readonly
- def dtype(self) -> DtypeObj:
- return self.values.dtype
-
- @final
- def astype(
- self,
- dtype: DtypeObj,
- copy: bool = False,
- errors: IgnoreRaise = "raise",
- using_cow: bool = False,
- ) -> Block:
- """
- Coerce to the new dtype.
-
- Parameters
- ----------
- dtype : np.dtype or ExtensionDtype
- copy : bool, default False
- copy if indicated
- errors : str, {'raise', 'ignore'}, default 'raise'
- - ``raise`` : allow exceptions to be raised
- - ``ignore`` : suppress exceptions. On error return original object
- using_cow: bool, default False
- Signaling if copy-on-write logic is used.
-
- Returns
- -------
- Block
- """
- values = self.values
-
- new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
-
- new_values = maybe_coerce_values(new_values)
-
- refs = None
- if using_cow and astype_is_view(values.dtype, new_values.dtype):
- refs = self.refs
-
- newb = self.make_block(new_values, refs=refs)
- if newb.shape != self.shape:
- raise TypeError(
- f"cannot set astype for copy = [{copy}] for dtype "
- f"({self.dtype.name} [{self.shape}]) to different shape "
- f"({newb.dtype.name} [{newb.shape}])"
- )
- return newb
-
- @final
- def to_native_types(self, na_rep: str = "nan", quoting=None, **kwargs) -> Block:
- """convert to our native types format"""
- result = to_native_types(self.values, na_rep=na_rep, quoting=quoting, **kwargs)
- return self.make_block(result)
-
- @final
- def copy(self, deep: bool = True) -> Block:
- """copy constructor"""
- values = self.values
- refs: BlockValuesRefs | None
- if deep:
- values = values.copy()
- refs = None
- else:
- refs = self.refs
- return type(self)(values, placement=self._mgr_locs, ndim=self.ndim, refs=refs)
-
- # ---------------------------------------------------------------------
- # Replace
-
- @final
- def replace(
- self,
- to_replace,
- value,
- inplace: bool = False,
- # mask may be pre-computed if we're called from replace_list
- mask: npt.NDArray[np.bool_] | None = None,
- using_cow: bool = False,
- ) -> list[Block]:
- """
- Replace the to_replace value with value; it is possible to create new
- blocks here. This is just a call to putmask.
- """
-
- # Note: the checks we do in NDFrame.replace ensure we never get
- # here with listlike to_replace or value, as those cases
- # go through replace_list
- values = self.values
-
- if isinstance(values, Categorical):
- # TODO: avoid special-casing
- # GH49404
- if using_cow and (self.refs.has_reference() or not inplace):
- blk = self.copy()
- elif using_cow:
- blk = self.copy(deep=False)
- else:
- blk = self if inplace else self.copy()
- values = cast(Categorical, blk.values)
- values._replace(to_replace=to_replace, value=value, inplace=True)
- return [blk]
-
- if not self._can_hold_element(to_replace):
- # We cannot hold `to_replace`, so we know immediately that
- # replacing it is a no-op.
- # Note: If to_replace were a list, NDFrame.replace would call
- # replace_list instead of replace.
- if using_cow:
- return [self.copy(deep=False)]
- else:
- return [self] if inplace else [self.copy()]
-
- if mask is None:
- mask = missing.mask_missing(values, to_replace)
- if not mask.any():
- # Note: we get here with test_replace_extension_other incorrectly
- # bc _can_hold_element is incorrect.
- if using_cow:
- return [self.copy(deep=False)]
- else:
- return [self] if inplace else [self.copy()]
-
- elif self._can_hold_element(value):
- # TODO(CoW): Maybe split here as well into columns where mask has True
- # and rest?
- if using_cow:
- if inplace:
- blk = self.copy(deep=self.refs.has_reference())
- else:
- blk = self.copy()
- else:
- blk = self if inplace else self.copy()
- putmask_inplace(blk.values, mask, value)
- if not (self.is_object and value is None):
- # if the user *explicitly* gave None, we keep None, otherwise
- # may downcast to NaN
- blocks = blk.convert(copy=False, using_cow=using_cow)
- else:
- blocks = [blk]
- return blocks
-
- elif self.ndim == 1 or self.shape[0] == 1:
- if value is None or value is NA:
- blk = self.astype(np.dtype(object))
- else:
- blk = self.coerce_to_target_dtype(value)
- return blk.replace(
- to_replace=to_replace,
- value=value,
- inplace=True,
- mask=mask,
- )
-
- else:
- # split so that we only upcast where necessary
- blocks = []
- for i, nb in enumerate(self._split()):
- blocks.extend(
- type(self).replace(
- nb,
- to_replace=to_replace,
- value=value,
- inplace=True,
- mask=mask[i : i + 1],
- using_cow=using_cow,
- )
- )
- return blocks
-
- @final
- def _replace_regex(
- self,
- to_replace,
- value,
- inplace: bool = False,
- mask=None,
- using_cow: bool = False,
- ) -> list[Block]:
- """
- Replace elements by the given value.
-
- Parameters
- ----------
- to_replace : object or pattern
- Scalar to replace or regular expression to match.
- value : object
- Replacement object.
- inplace : bool, default False
- Perform inplace modification.
- mask : array-like of bool, optional
- True indicates the corresponding element is ignored.
- using_cow: bool, default False
- Specifying if copy on write is enabled.
-
- Returns
- -------
- List[Block]
- """
- if not self._can_hold_element(to_replace):
- # i.e. only ObjectBlock, but could in principle include a
- # String ExtensionBlock
- if using_cow:
- return [self.copy(deep=False)]
- return [self] if inplace else [self.copy()]
-
- rx = re.compile(to_replace)
-
- if using_cow:
- if inplace and not self.refs.has_reference():
- refs = self.refs
- new_values = self.values
- else:
- refs = None
- new_values = self.values.copy()
- else:
- refs = None
- new_values = self.values if inplace else self.values.copy()
-
- replace_regex(new_values, rx, value, mask)
-
- block = self.make_block(new_values, refs=refs)
- return block.convert(copy=False, using_cow=using_cow)
-
- @final
- def replace_list(
- self,
- src_list: Iterable[Any],
- dest_list: Sequence[Any],
- inplace: bool = False,
- regex: bool = False,
- using_cow: bool = False,
- ) -> list[Block]:
- """
- See BlockManager.replace_list docstring.
- """
- values = self.values
-
- if isinstance(values, Categorical):
- # TODO: avoid special-casing
- # GH49404
- if using_cow and inplace:
- blk = self.copy(deep=self.refs.has_reference())
- else:
- blk = self if inplace else self.copy()
- values = cast(Categorical, blk.values)
- values._replace(to_replace=src_list, value=dest_list, inplace=True)
- return [blk]
-
- # Exclude anything that we know we won't contain
- pairs = [
- (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
- ]
- if not len(pairs):
- if using_cow:
- return [self.copy(deep=False)]
- # shortcut, nothing to replace
- return [self] if inplace else [self.copy()]
-
- src_len = len(pairs) - 1
-
- if is_string_dtype(values.dtype):
- # Calculate the mask once, prior to the call of comp
- # in order to avoid repeating the same computations
- na_mask = ~isna(values)
- masks: Iterable[npt.NDArray[np.bool_]] = (
- extract_bool_array(
- cast(
- ArrayLike,
- compare_or_regex_search(
- values, s[0], regex=regex, mask=na_mask
- ),
- )
- )
- for s in pairs
- )
- else:
- # GH#38086 faster if we know we don't need to check for regex
- masks = (missing.mask_missing(values, s[0]) for s in pairs)
- # Materialize if inplace = True, since the masks can change
- # as we replace
- if inplace:
- masks = list(masks)
-
- if using_cow and inplace:
- # Don't set up refs here, otherwise we will think that we have
- # references when we check again later
- rb = [self]
- else:
- rb = [self if inplace else self.copy()]
-
- for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
- convert = i == src_len # only convert once at the end
- new_rb: list[Block] = []
-
- # GH-39338: _replace_coerce can split a block into
- # single-column blocks, so track the index so we know
- # where to index into the mask
- for blk_num, blk in enumerate(rb):
- if len(rb) == 1:
- m = mask
- else:
- mib = mask
- assert not isinstance(mib, bool)
- m = mib[blk_num : blk_num + 1]
-
- # error: Argument "mask" to "_replace_coerce" of "Block" has
- # incompatible type "Union[ExtensionArray, ndarray[Any, Any], bool]";
- # expected "ndarray[Any, dtype[bool_]]"
- result = blk._replace_coerce(
- to_replace=src,
- value=dest,
- mask=m,
- inplace=inplace,
- regex=regex,
- using_cow=using_cow,
- )
- if convert and blk.is_object and not all(x is None for x in dest_list):
- # GH#44498 avoid unwanted cast-back
- result = extend_blocks(
- [
- b.convert(copy=True and not using_cow, using_cow=using_cow)
- for b in result
- ]
- )
- new_rb.extend(result)
- rb = new_rb
- return rb
-
- @final
- def _replace_coerce(
- self,
- to_replace,
- value,
- mask: npt.NDArray[np.bool_],
- inplace: bool = True,
- regex: bool = False,
- using_cow: bool = False,
- ) -> list[Block]:
- """
- Replace value corresponding to the given boolean array with another
- value.
-
- Parameters
- ----------
- to_replace : object or pattern
- Scalar to replace or regular expression to match.
- value : object
- Replacement object.
- mask : np.ndarray[bool]
- True indicates the corresponding element is ignored.
- inplace : bool, default True
- Perform inplace modification.
- regex : bool, default False
- If true, perform regular expression substitution.
-
- Returns
- -------
- List[Block]
- """
- if should_use_regex(regex, to_replace):
- return self._replace_regex(
- to_replace,
- value,
- inplace=inplace,
- mask=mask,
- )
- else:
- if value is None:
- # gh-45601, gh-45836, gh-46634
- if mask.any():
- has_ref = self.refs.has_reference()
- nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow)
- if (nb is self or using_cow) and not inplace:
- nb = nb.copy()
- elif inplace and has_ref and nb.refs.has_reference():
- # no copy in astype and we had refs before
- nb = nb.copy()
- putmask_inplace(nb.values, mask, value)
- return [nb]
- if using_cow:
- return [self.copy(deep=False)]
- return [self] if inplace else [self.copy()]
- return self.replace(
- to_replace=to_replace,
- value=value,
- inplace=inplace,
- mask=mask,
- using_cow=using_cow,
- )
-
- # ---------------------------------------------------------------------
- # 2D Methods - Shared by NumpyBlock and NDArrayBackedExtensionBlock
- # but not ExtensionBlock
-
- def _maybe_squeeze_arg(self, arg: np.ndarray) -> np.ndarray:
- """
- For compatibility with 1D-only ExtensionArrays.
- """
- return arg
-
- def _unwrap_setitem_indexer(self, indexer):
- """
- For compatibility with 1D-only ExtensionArrays.
- """
- return indexer
-
- # NB: this cannot be made cache_readonly because in mgr.set_values we pin
- # new .values that can have different shape GH#42631
- @property
- def shape(self) -> Shape:
- return self.values.shape
-
- def iget(self, i: int | tuple[int, int] | tuple[slice, int]) -> np.ndarray:
- # In the case where we have a tuple[slice, int], the slice will always
- # be slice(None)
- # Note: only reached with self.ndim == 2
- # Invalid index type "Union[int, Tuple[int, int], Tuple[slice, int]]"
- # for "Union[ndarray[Any, Any], ExtensionArray]"; expected type
- # "Union[int, integer[Any]]"
- return self.values[i] # type: ignore[index]
-
- def _slice(
- self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp]
- ) -> ArrayLike:
- """return a slice of my values"""
-
- return self.values[slicer]
-
- def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
- """
- Modify block values in-place with new item value.
-
- If copy=True, first copy the underlying values in place before modifying
- (for Copy-on-Write).
-
- Notes
- -----
- `set_inplace` never creates a new array or new Block, whereas `setitem`
- _may_ create a new array and always creates a new Block.
-
- Caller is responsible for checking values.dtype == self.dtype.
- """
- if copy:
- self.values = self.values.copy()
- self.values[locs] = values
-
- def take_nd(
- self,
- indexer: npt.NDArray[np.intp],
- axis: AxisInt,
- new_mgr_locs: BlockPlacement | None = None,
- fill_value=lib.no_default,
- ) -> Block:
- """
- Take values according to indexer and return them as a block.
- """
- values = self.values
-
- if fill_value is lib.no_default:
- fill_value = self.fill_value
- allow_fill = False
- else:
- allow_fill = True
-
- # Note: algos.take_nd has upcast logic similar to coerce_to_target_dtype
- new_values = algos.take_nd(
- values, indexer, axis=axis, allow_fill=allow_fill, fill_value=fill_value
- )
-
- # Called from three places in managers, all of which satisfy
- # these assertions
- if isinstance(self, ExtensionBlock):
- # NB: in this case, the 'axis' kwarg will be ignored in the
- # algos.take_nd call above.
- assert not (self.ndim == 1 and new_mgr_locs is None)
- assert not (axis == 0 and new_mgr_locs is None)
-
- if new_mgr_locs is None:
- new_mgr_locs = self._mgr_locs
-
- if not is_dtype_equal(new_values.dtype, self.dtype):
- return self.make_block(new_values, new_mgr_locs)
- else:
- return self.make_block_same_class(new_values, new_mgr_locs)
-
- def _unstack(
- self,
- unstacker,
- fill_value,
- new_placement: npt.NDArray[np.intp],
- needs_masking: npt.NDArray[np.bool_],
- ):
- """
- Return a list of unstacked blocks of self
-
- Parameters
- ----------
- unstacker : reshape._Unstacker
- fill_value : int
- Only used in ExtensionBlock._unstack
- new_placement : np.ndarray[np.intp]
- needs_masking : np.ndarray[bool]
-
- Returns
- -------
- blocks : list of Block
- New blocks of unstacked values.
- mask : array-like of bool
- The mask of columns of `blocks` we should keep.
- """
- new_values, mask = unstacker.get_new_values(
- self.values.T, fill_value=fill_value
- )
-
- mask = mask.any(0)
- # TODO: in all tests we have mask.all(); can we rely on that?
-
- # Note: these next two lines ensure that
- # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks)
- # which the calling function needs in order to pass verify_integrity=False
- # to the BlockManager constructor
- new_values = new_values.T[mask]
- new_placement = new_placement[mask]
-
- bp = BlockPlacement(new_placement)
- blocks = [new_block_2d(new_values, placement=bp)]
- return blocks, mask
-
- # ---------------------------------------------------------------------
-
- def setitem(self, indexer, value, using_cow: bool = False) -> Block:
- """
- Attempt self.values[indexer] = value, possibly creating a new array.
-
- Parameters
- ----------
- indexer : tuple, list-like, array-like, slice, int
- The subset of self.values to set
- value : object
- The value being set
- using_cow: bool, default False
- Signaling if CoW is used.
-
- Returns
- -------
- Block
-
- Notes
- -----
- `indexer` is a direct slice/positional indexer. `value` must
- be a compatible shape.
- """
-
- value = self._standardize_fill_value(value)
-
- values = cast(np.ndarray, self.values)
- if self.ndim == 2:
- values = values.T
-
- # length checking
- check_setitem_lengths(indexer, value, values)
-
- value = extract_array(value, extract_numpy=True)
- try:
- casted = np_can_hold_element(values.dtype, value)
- except LossySetitemError:
- # current dtype cannot store value, coerce to common dtype
- nb = self.coerce_to_target_dtype(value)
- return nb.setitem(indexer, value)
- else:
- if self.dtype == _dtype_obj:
- # TODO: avoid having to construct values[indexer]
- vi = values[indexer]
- if lib.is_list_like(vi):
- # checking lib.is_scalar here fails on
- # test_iloc_setitem_custom_object
- casted = setitem_datetimelike_compat(values, len(vi), casted)
-
- if using_cow and self.refs.has_reference():
- values = values.copy()
- self = self.make_block_same_class(
- values.T if values.ndim == 2 else values
- )
- if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1:
- # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
- casted = casted[0, ...]
- values[indexer] = casted
- return self
-
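- # A hedged, illustrative sketch (public pandas API, not part of this file)
- # of the LossySetitemError path above: assigning a value the current dtype
- # cannot hold upcasts the block before retrying the setitem.
- #
- #     >>> import pandas as pd
- #     >>> ser = pd.Series([1, 2, 3])        # int64-backed block
- #     >>> ser.iloc[0] = 1.5                 # lossy for int64 -> coerce + retry
- #     >>> ser.dtype
- #     dtype('float64')
-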
- def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
- """
- Apply putmask to the block's values; this may create block(s) with a
- new dtype.
-
- Return the resulting block(s).
-
- Parameters
- ----------
- mask : np.ndarray[bool], SparseArray[bool], or BooleanArray
- new : a ndarray/object
- using_cow: bool, default False
-
- Returns
- -------
- List[Block]
- """
- orig_mask = mask
- values = cast(np.ndarray, self.values)
- mask, noop = validate_putmask(values.T, mask)
- assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame))
-
- if new is lib.no_default:
- new = self.fill_value
-
- new = self._standardize_fill_value(new)
- new = extract_array(new, extract_numpy=True)
-
- if noop:
- if using_cow:
- return [self.copy(deep=False)]
- return [self]
-
- try:
- casted = np_can_hold_element(values.dtype, new)
-
- if using_cow and self.refs.has_reference():
- # Do this here to avoid copying twice
- values = values.copy()
- self = self.make_block_same_class(values)
-
- putmask_without_repeat(values.T, mask, casted)
- if using_cow:
- return [self.copy(deep=False)]
- return [self]
- except LossySetitemError:
- if self.ndim == 1 or self.shape[0] == 1:
- # no need to split columns
-
- if not is_list_like(new):
- # using just new[indexer] can't save us the need to cast
- return self.coerce_to_target_dtype(new).putmask(mask, new)
- else:
- indexer = mask.nonzero()[0]
- nb = self.setitem(indexer, new[indexer], using_cow=using_cow)
- return [nb]
-
- else:
- is_array = isinstance(new, np.ndarray)
-
- res_blocks = []
- nbs = self._split()
- for i, nb in enumerate(nbs):
- n = new
- if is_array:
- # we have a different value per-column
- n = new[:, i : i + 1]
-
- submask = orig_mask[:, i : i + 1]
- rbs = nb.putmask(submask, n, using_cow=using_cow)
- res_blocks.extend(rbs)
- return res_blocks
-
- def where(
- self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False
- ) -> list[Block]:
- """
- keep values where cond is True and use other elsewhere; return the resulting block(s)
-
- Parameters
- ----------
- other : a ndarray/object
- cond : np.ndarray[bool], SparseArray[bool], or BooleanArray
- _downcast : str or bool, default "infer"
- Private because we only specify it when calling from fillna.
-
- Returns
- -------
- List[Block]
- """
- assert cond.ndim == self.ndim
- assert not isinstance(other, (ABCIndex, ABCSeries, ABCDataFrame))
-
- transpose = self.ndim == 2
-
- cond = extract_bool_array(cond)
-
- # EABlocks override where
- values = cast(np.ndarray, self.values)
- orig_other = other
- if transpose:
- values = values.T
-
- icond, noop = validate_putmask(values, ~cond)
- if noop:
- # GH-39595: Always return a copy; short-circuit up/downcasting
- if using_cow:
- return [self.copy(deep=False)]
- return [self.copy()]
-
- if other is lib.no_default:
- other = self.fill_value
-
- other = self._standardize_fill_value(other)
-
- try:
- # try/except here is equivalent to a self._can_hold_element check,
- # but this gets us back 'casted' which we will re-use below;
- # without using 'casted', expressions.where may do unwanted upcasts.
- casted = np_can_hold_element(values.dtype, other)
- except (ValueError, TypeError, LossySetitemError):
- # we cannot coerce, return a compat dtype
-
- if self.ndim == 1 or self.shape[0] == 1:
- # no need to split columns
-
- block = self.coerce_to_target_dtype(other)
- blocks = block.where(orig_other, cond, using_cow=using_cow)
- return self._maybe_downcast(
- blocks, downcast=_downcast, using_cow=using_cow
- )
-
- else:
- # since _maybe_downcast would split blocks anyway, we
- # can avoid some potential upcast/downcast by splitting
- # on the front end.
- is_array = isinstance(other, (np.ndarray, ExtensionArray))
-
- res_blocks = []
- nbs = self._split()
- for i, nb in enumerate(nbs):
- oth = other
- if is_array:
- # we have a different value per-column
- oth = other[:, i : i + 1]
-
- submask = cond[:, i : i + 1]
- rbs = nb.where(
- oth, submask, _downcast=_downcast, using_cow=using_cow
- )
- res_blocks.extend(rbs)
- return res_blocks
-
- else:
- other = casted
- alt = setitem_datetimelike_compat(values, icond.sum(), other)
- if alt is not other:
- if is_list_like(other) and len(other) < len(values):
- # call np.where with other to get the appropriate ValueError
- np.where(~icond, values, other)
- raise NotImplementedError(
- "This should not be reached; call to np.where above is "
- "expected to raise ValueError. Please report a bug at "
- "github.com/pandas-dev/pandas"
- )
- result = values.copy()
- np.putmask(result, icond, alt)
- else:
- # By the time we get here, we should have all Series/Index
- # args extracted to ndarray
- if (
- is_list_like(other)
- and not isinstance(other, np.ndarray)
- and len(other) == self.shape[-1]
- ):
- # If we don't do this broadcasting here, then expressions.where
- # will broadcast a 1D other to be row-like instead of
- # column-like.
- other = np.array(other).reshape(values.shape)
- # If lengths don't match (or len(other)==1), we will raise
- # inside expressions.where, see test_series_where
-
- # Note: expressions.where may upcast.
- result = expressions.where(~icond, values, other)
- # The np_can_hold_element check _should_ ensure that we always
- # have result.dtype == self.dtype here.
-
- if transpose:
- result = result.T
-
- return [self.make_block(result)]
-
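- # A brief sketch (public API, hedged) of the coercion branch above: when
- # `other` (here the default NaN) cannot be held by an int64 block,
- # DataFrame.where upcasts to a compatible dtype.
- #
- #     >>> import pandas as pd
- #     >>> df = pd.DataFrame({"a": [1, 2, 3]})
- #     >>> df.where(df > 1)["a"].dtype
- #     dtype('float64')
-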
- def fillna(
- self,
- value,
- limit: int | None = None,
- inplace: bool = False,
- downcast=None,
- using_cow: bool = False,
- ) -> list[Block]:
- """
- fillna on the block with the value. If the current dtype cannot hold
- the value, coerce to a compatible dtype and try again.
- """
- # Caller is responsible for validating limit; if int it is strictly positive
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- if not self._can_hold_na:
- # can short-circuit the isna call
- noop = True
- else:
- mask = isna(self.values)
- mask, noop = validate_putmask(self.values, mask)
-
- if noop:
- # we can't process the value, but nothing to do
- if inplace:
- if using_cow:
- return [self.copy(deep=False)]
- # Arbitrarily imposing the convention that we ignore downcast
- # on no-op when inplace=True
- return [self]
- else:
- # GH#45423 consistent downcasting on no-ops.
- nb = self.copy(deep=not using_cow)
- nbs = nb._maybe_downcast([nb], downcast=downcast, using_cow=using_cow)
- return nbs
-
- if limit is not None:
- mask[mask.cumsum(self.ndim - 1) > limit] = False
-
- if inplace:
- nbs = self.putmask(mask.T, value, using_cow=using_cow)
- else:
- # without _downcast, we would break
- # test_fillna_dtype_conversion_equiv_replace
- nbs = self.where(value, ~mask.T, _downcast=False)
-
- # Note: blk._maybe_downcast vs self._maybe_downcast(nbs)
- # makes a difference bc blk may have object dtype, which has
- # different behavior in _maybe_downcast.
- return extend_blocks(
- [
- blk._maybe_downcast([blk], downcast=downcast, using_cow=using_cow)
- for blk in nbs
- ]
- )
-
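- # A small sketch (public API, hedged) of the `limit` masking above: with a
- # scalar value and no method, only the first `limit` NaNs per column are
- # filled.
- #
- #     >>> import numpy as np, pandas as pd
- #     >>> ser = pd.Series([np.nan, 2.0, np.nan, np.nan])
- #     >>> ser.fillna(0.0, limit=2).tolist()
- #     [0.0, 2.0, 0.0, nan]
-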
- def interpolate(
- self,
- *,
- method: FillnaOptions = "pad",
- axis: AxisInt = 0,
- index: Index | None = None,
- inplace: bool = False,
- limit: int | None = None,
- limit_direction: str = "forward",
- limit_area: str | None = None,
- fill_value: Any | None = None,
- downcast: str | None = None,
- using_cow: bool = False,
- **kwargs,
- ) -> list[Block]:
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- if not self._can_hold_na:
- # If there are no NAs, then interpolate is a no-op
- if using_cow:
- return [self.copy(deep=False)]
- return [self] if inplace else [self.copy()]
-
- try:
- m = missing.clean_fill_method(method)
- except ValueError:
- m = None
- if m is None and self.dtype.kind != "f":
- # only deal with floats
- # bc we already checked that can_hold_na, we don't have int dtype here
- # test_interp_basic checks that we make a copy here
- if using_cow:
- return [self.copy(deep=False)]
- return [self] if inplace else [self.copy()]
-
- if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0:
- # split improves performance in ndarray.copy()
- return self.split_and_operate(
- type(self).interpolate,
- method=method,
- axis=axis,
- index=index,
- inplace=inplace,
- limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- fill_value=fill_value,
- downcast=downcast,
- **kwargs,
- )
-
- refs = None
- if inplace:
- if using_cow and self.refs.has_reference():
- data = self.values.copy()
- else:
- data = self.values
- refs = self.refs
- else:
- data = self.values.copy()
- data = cast(np.ndarray, data) # bc overridden by ExtensionBlock
-
- missing.interpolate_array_2d(
- data,
- method=method,
- axis=axis,
- index=index,
- limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- fill_value=fill_value,
- **kwargs,
- )
-
- nb = self.make_block_same_class(data, refs=refs)
- return nb._maybe_downcast([nb], downcast, using_cow)
-
- def diff(self, n: int, axis: AxisInt = 1) -> list[Block]:
- """return block for the diff of the values"""
- # only reached with ndim == 2 and axis == 1
- new_values = algos.diff(self.values, n, axis=axis)
- return [self.make_block(values=new_values)]
-
- def shift(
- self, periods: int, axis: AxisInt = 0, fill_value: Any = None
- ) -> list[Block]:
- """shift the block by periods, possibly upcast"""
- # convert integer to float if necessary. need to do a lot more than
- # that, handle boolean etc also
-
- # Note: periods is never 0 here, as that is handled at the top of
- # NDFrame.shift. If that ever changes, we can do a check for periods=0
- # and possibly avoid coercing.
-
- if not lib.is_scalar(fill_value) and self.dtype != _dtype_obj:
- # with object dtype there is nothing to promote, and the user can
- # pass pretty much any weird fill_value they like
- # see test_shift_object_non_scalar_fill
- raise ValueError("fill_value must be a scalar")
-
- fill_value = self._standardize_fill_value(fill_value)
-
- try:
- # error: Argument 1 to "np_can_hold_element" has incompatible type
- # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
- casted = np_can_hold_element(
- self.dtype, fill_value # type: ignore[arg-type]
- )
- except LossySetitemError:
- nb = self.coerce_to_target_dtype(fill_value)
- return nb.shift(periods, axis=axis, fill_value=fill_value)
-
- else:
- values = cast(np.ndarray, self.values)
- new_values = shift(values, periods, axis, casted)
- return [self.make_block(new_values)]
-
- @final
- def quantile(
- self,
- qs: Index, # with dtype float64
- interpolation: QuantileInterpolation = "linear",
- axis: AxisInt = 0,
- ) -> Block:
- """
- compute the quantiles of the block's values
-
- Parameters
- ----------
- qs : Index
- The quantiles to be computed in float64.
- interpolation : str, default 'linear'
- Type of interpolation.
- axis : int, default 0
- Axis to compute.
-
- Returns
- -------
- Block
- """
- # We should always have ndim == 2 because Series dispatches to DataFrame
- assert self.ndim == 2
- assert axis == 1 # only ever called this way
- assert is_list_like(qs) # caller is responsible for this
-
- result = quantile_compat(self.values, np.asarray(qs._values), interpolation)
- # ensure_block_shape needed for cases where we start with EA and result
- # is ndarray, e.g. IntegerArray, SparseArray
- result = ensure_block_shape(result, ndim=2)
- return new_block_2d(result, placement=self._mgr_locs)
-
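- # A short sketch (public API, hedged) of the dispatch described above:
- # DataFrame.quantile reaches this method with 2D blocks and a float64
- # Index of quantiles.
- #
- #     >>> import pandas as pd
- #     >>> df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
- #     >>> df.quantile([0.25, 0.75])["a"].tolist()
- #     [1.75, 3.25]
-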
- def round(self, decimals: int, using_cow: bool = False) -> Block:
- """
- Rounds the values.
- If the block is not of an integer or float dtype, nothing happens.
- This is consistent with DataFrame.round behavior.
- (Note: Series.round would raise)
-
- Parameters
- ----------
- decimals : int
- Number of decimal places to round to.
- Caller is responsible for validating this.
- using_cow : bool
- Whether Copy-on-Write is enabled right now.
- """
- if not self.is_numeric or self.is_bool:
- return self.copy(deep=not using_cow)
- refs = None
- # TODO: round only defined on BaseMaskedArray
- # Series also does this, so would need to fix both places
- # error: Item "ExtensionArray" of "Union[ndarray[Any, Any], ExtensionArray]"
- # has no attribute "round"
- values = self.values.round(decimals) # type: ignore[union-attr]
- if values is self.values:
- refs = self.refs
- if not using_cow:
- # Normally would need to do this before, but
- # numpy only returns same array when round operation
- # is no-op
- # https://github.com/numpy/numpy/blob/486878b37fc7439a3b2b87747f50db9b62fea8eb/numpy/core/src/multiarray/calculation.c#L625-L636
- values = values.copy()
- return self.make_block_same_class(values, refs=refs)
-
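- # A hedged sketch of the behaviour documented above: via DataFrame.round,
- # only integer/float (non-bool) blocks are rounded; other dtypes pass
- # through unchanged.
- #
- #     >>> import pandas as pd
- #     >>> df = pd.DataFrame({"x": [1.234, 5.678], "s": ["a", "b"]})
- #     >>> df.round(2)["x"].tolist()
- #     [1.23, 5.68]
- #     >>> df.round(2)["s"].tolist()
- #     ['a', 'b']
-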
- # ---------------------------------------------------------------------
- # Abstract Methods Overridden By EABackedBlock and NumpyBlock
-
- def delete(self, loc) -> list[Block]:
- """Deletes the locs from the block.
-
- We split the block to avoid copying the underlying data. We create new
- blocks for every connected segment of the initial block that is not deleted.
- The new blocks point to the initial array.
- """
- if not is_list_like(loc):
- loc = [loc]
-
- if self.ndim == 1:
- values = cast(np.ndarray, self.values)
- values = np.delete(values, loc)
- mgr_locs = self._mgr_locs.delete(loc)
- return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
-
- if np.max(loc) >= self.values.shape[0]:
- raise IndexError
-
- # Add one out-of-bounds indexer as maximum to collect
- # all columns after our last indexer if any
- loc = np.concatenate([loc, [self.values.shape[0]]])
- mgr_locs_arr = self._mgr_locs.as_array
- new_blocks: list[Block] = []
-
- previous_loc = -1
- # TODO(CoW): This is tricky: if the parent block goes out of scope,
- # all split blocks reference each other even though they
- # don't share data
- refs = self.refs if self.refs.has_reference() else None
- for idx in loc:
- if idx == previous_loc + 1:
- # There is no column between current and last idx
- pass
- else:
- # No overload variant of "__getitem__" of "ExtensionArray" matches
- # argument type "Tuple[slice, slice]"
- values = self.values[previous_loc + 1 : idx, :] # type: ignore[call-overload] # noqa
- locs = mgr_locs_arr[previous_loc + 1 : idx]
- nb = type(self)(
- values, placement=BlockPlacement(locs), ndim=self.ndim, refs=refs
- )
- new_blocks.append(nb)
-
- previous_loc = idx
-
- return new_blocks
-
- @property
- def is_view(self) -> bool:
- """return a boolean if I am possibly a view"""
- raise AbstractMethodError(self)
-
- @property
- def array_values(self) -> ExtensionArray:
- """
- The array that Series.array returns. Always an ExtensionArray.
- """
- raise AbstractMethodError(self)
-
- def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
- """
- return an internal format, currently just the ndarray
- this is often overridden to handle to_dense-like operations
- """
- raise AbstractMethodError(self)
-
- def values_for_json(self) -> np.ndarray:
- raise AbstractMethodError(self)
-
-
-class EABackedBlock(Block):
- """
- Mixin for Block subclasses backed by ExtensionArray.
- """
-
- values: ExtensionArray
-
- def setitem(self, indexer, value, using_cow: bool = False):
- """
- Attempt self.values[indexer] = value, possibly creating a new array.
-
- This differs from Block.setitem by not allowing setitem to change
- the dtype of the Block.
-
- Parameters
- ----------
- indexer : tuple, list-like, array-like, slice, int
- The subset of self.values to set
- value : object
- The value being set
- using_cow: bool, default False
- Signaling if CoW is used.
-
- Returns
- -------
- Block
-
- Notes
- -----
- `indexer` is a direct slice/positional indexer. `value` must
- be a compatible shape.
- """
- orig_indexer = indexer
- orig_value = value
-
- indexer = self._unwrap_setitem_indexer(indexer)
- value = self._maybe_squeeze_arg(value)
-
- values = self.values
- if values.ndim == 2:
- # TODO(GH#45419): string[pyarrow] tests break if we transpose
- # unconditionally
- values = values.T
- check_setitem_lengths(indexer, value, values)
-
- try:
- values[indexer] = value
- except (ValueError, TypeError) as err:
- _catch_deprecated_value_error(err)
-
- if is_interval_dtype(self.dtype):
- # see TestSetitemFloatIntervalWithIntIntervalValues
- nb = self.coerce_to_target_dtype(orig_value)
- return nb.setitem(orig_indexer, orig_value)
-
- elif isinstance(self, NDArrayBackedExtensionBlock):
- nb = self.coerce_to_target_dtype(orig_value)
- return nb.setitem(orig_indexer, orig_value)
-
- else:
- raise
-
- else:
- return self
-
- def where(
- self, other, cond, _downcast: str | bool = "infer", using_cow: bool = False
- ) -> list[Block]:
- # _downcast private bc we only specify it when calling from fillna
- arr = self.values.T
-
- cond = extract_bool_array(cond)
-
- orig_other = other
- orig_cond = cond
- other = self._maybe_squeeze_arg(other)
- cond = self._maybe_squeeze_arg(cond)
-
- if other is lib.no_default:
- other = self.fill_value
-
- icond, noop = validate_putmask(arr, ~cond)
- if noop:
- # GH#44181, GH#45135
- # Avoid a) raising for Interval/PeriodDtype and b) unnecessary object upcast
- if using_cow:
- return [self.copy(deep=False)]
- return [self.copy()]
-
- try:
- res_values = arr._where(cond, other).T
- except (ValueError, TypeError) as err:
- _catch_deprecated_value_error(err)
-
- if self.ndim == 1 or self.shape[0] == 1:
- if is_interval_dtype(self.dtype):
- # TestSetitemFloatIntervalWithIntIntervalValues
- blk = self.coerce_to_target_dtype(orig_other)
- nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
- return self._maybe_downcast(
- nbs, downcast=_downcast, using_cow=using_cow
- )
-
- elif isinstance(self, NDArrayBackedExtensionBlock):
- # NB: not (yet) the same as
- # isinstance(values, NDArrayBackedExtensionArray)
- blk = self.coerce_to_target_dtype(orig_other)
- nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
- return self._maybe_downcast(
- nbs, downcast=_downcast, using_cow=using_cow
- )
-
- else:
- raise
-
- else:
- # Same pattern we use in Block.putmask
- is_array = isinstance(orig_other, (np.ndarray, ExtensionArray))
-
- res_blocks = []
- nbs = self._split()
- for i, nb in enumerate(nbs):
- n = orig_other
- if is_array:
- # we have a different value per-column
- n = orig_other[:, i : i + 1]
-
- submask = orig_cond[:, i : i + 1]
- rbs = nb.where(n, submask, using_cow=using_cow)
- res_blocks.extend(rbs)
- return res_blocks
-
- nb = self.make_block_same_class(res_values)
- return [nb]
-
- def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
- """
- See Block.putmask.__doc__
- """
- mask = extract_bool_array(mask)
- if new is lib.no_default:
- new = self.fill_value
-
- values = self.values
- if values.ndim == 2:
- values = values.T
-
- orig_new = new
- orig_mask = mask
- new = self._maybe_squeeze_arg(new)
- mask = self._maybe_squeeze_arg(mask)
-
- if not mask.any():
- if using_cow:
- return [self.copy(deep=False)]
- return [self]
-
- if using_cow and self.refs.has_reference():
- values = values.copy()
- self = self.make_block_same_class( # type: ignore[assignment]
- values.T if values.ndim == 2 else values
- )
-
- try:
- # Caller is responsible for ensuring matching lengths
- values._putmask(mask, new)
- except (TypeError, ValueError) as err:
- _catch_deprecated_value_error(err)
-
- if self.ndim == 1 or self.shape[0] == 1:
- if is_interval_dtype(self.dtype):
- # Discussion about what we want to support in the general
- # case GH#39584
- blk = self.coerce_to_target_dtype(orig_new)
- return blk.putmask(orig_mask, orig_new)
-
- elif isinstance(self, NDArrayBackedExtensionBlock):
- # NB: not (yet) the same as
- # isinstance(values, NDArrayBackedExtensionArray)
- blk = self.coerce_to_target_dtype(orig_new)
- return blk.putmask(orig_mask, orig_new)
-
- else:
- raise
-
- else:
- # Same pattern we use in Block.putmask
- is_array = isinstance(orig_new, (np.ndarray, ExtensionArray))
-
- res_blocks = []
- nbs = self._split()
- for i, nb in enumerate(nbs):
- n = orig_new
- if is_array:
- # we have a different value per-column
- n = orig_new[:, i : i + 1]
-
- submask = orig_mask[:, i : i + 1]
- rbs = nb.putmask(submask, n)
- res_blocks.extend(rbs)
- return res_blocks
-
- return [self]
-
- def delete(self, loc) -> list[Block]:
- # This will be unnecessary if/when __array_function__ is implemented
- if self.ndim == 1:
- values = self.values.delete(loc)
- mgr_locs = self._mgr_locs.delete(loc)
- return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
- elif self.values.ndim == 1:
- # We get here through to_stata
- return []
- return super().delete(loc)
-
- @cache_readonly
- def array_values(self) -> ExtensionArray:
- return self.values
-
- def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
- """
- return an object-dtype ndarray of boxed values, such as Timestamps/Timedeltas
- """
- values: ArrayLike = self.values
- if dtype == _dtype_obj:
- values = values.astype(object)
- # TODO(EA2D): reshape not needed with 2D EAs
- return np.asarray(values).reshape(self.shape)
-
- def values_for_json(self) -> np.ndarray:
- return np.asarray(self.values)
-
- def interpolate(
- self,
- *,
- method: FillnaOptions = "pad",
- axis: int = 0,
- inplace: bool = False,
- limit: int | None = None,
- fill_value=None,
- using_cow: bool = False,
- **kwargs,
- ):
- values = self.values
- if values.ndim == 2 and axis == 0:
- # NDArrayBackedExtensionArray.fillna assumes axis=1
- new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T
- else:
- new_values = values.fillna(value=fill_value, method=method, limit=limit)
- return self.make_block_same_class(new_values)
-
-
-class ExtensionBlock(libinternals.Block, EABackedBlock):
- """
- Block for holding extension types.
-
- Notes
- -----
- This holds all 3rd-party extension array types. It's also the immediate
- parent class for our internal extension types' blocks.
-
- ExtensionArrays are limited to 1-D.
- """
-
- _can_consolidate = False
- _validate_ndim = False
- is_extension = True
-
- values: ExtensionArray
-
- def fillna(
- self,
- value,
- limit: int | None = None,
- inplace: bool = False,
- downcast=None,
- using_cow: bool = False,
- ) -> list[Block]:
- if is_interval_dtype(self.dtype):
- # Block.fillna handles coercion (test_fillna_interval)
- return super().fillna(
- value=value,
- limit=limit,
- inplace=inplace,
- downcast=downcast,
- using_cow=using_cow,
- )
- if using_cow and self._can_hold_na and not self.values._hasna:
- refs = self.refs
- new_values = self.values
- else:
- refs = None
- new_values = self.values.fillna(value=value, method=None, limit=limit)
- nb = self.make_block_same_class(new_values, refs=refs)
- return nb._maybe_downcast([nb], downcast, using_cow=using_cow)
-
- @cache_readonly
- def shape(self) -> Shape:
- # TODO(EA2D): override unnecessary with 2D EAs
- if self.ndim == 1:
- return (len(self.values),)
- return len(self._mgr_locs), len(self.values)
-
- def iget(self, i: int | tuple[int, int] | tuple[slice, int]):
- # In the case where we have a tuple[slice, int], the slice will always
- # be slice(None)
- # We _could_ make the annotation more specific, but mypy would
- # complain about override mismatch:
- # Literal[0] | tuple[Literal[0], int] | tuple[slice, int]
-
- # Note: only reached with self.ndim == 2
-
- if isinstance(i, tuple):
- # TODO(EA2D): unnecessary with 2D EAs
- col, loc = i
- if not com.is_null_slice(col) and col != 0:
- raise IndexError(f"{self} only contains one item")
- if isinstance(col, slice):
- # the is_null_slice check above assures that col is slice(None)
- # so what we want is a view on all our columns and row loc
- if loc < 0:
- loc += len(self.values)
- # Note: loc:loc+1 vs [[loc]] makes a difference when called
- # from fast_xs because we want to get a view back.
- return self.values[loc : loc + 1]
- return self.values[loc]
- else:
- if i != 0:
- raise IndexError(f"{self} only contains one item")
- return self.values
-
- def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None:
- # When an ndarray, we should have locs.tolist() == [0]
- # When a BlockPlacement we should have list(locs) == [0]
- if copy:
- self.values = self.values.copy()
- self.values[:] = values
-
- def _maybe_squeeze_arg(self, arg):
- """
- If necessary, squeeze a (N, 1) ndarray to (N,)
- """
- # e.g. if we are passed a 2D mask for putmask
- if (
- isinstance(arg, (np.ndarray, ExtensionArray))
- and arg.ndim == self.values.ndim + 1
- ):
- # TODO(EA2D): unnecessary with 2D EAs
- assert arg.shape[1] == 1
- # error: No overload variant of "__getitem__" of "ExtensionArray"
- # matches argument type "Tuple[slice, int]"
- arg = arg[:, 0] # type: ignore[call-overload]
- elif isinstance(arg, ABCDataFrame):
- # 2022-01-06 only reached for setitem
- # TODO: should we avoid getting here with DataFrame?
- assert arg.shape[1] == 1
- arg = arg._ixs(0, axis=1)._values
-
- return arg
-
- def _unwrap_setitem_indexer(self, indexer):
- """
- Adapt a 2D-indexer to our 1D values.
-
- This is intended for 'setitem', not 'iget' or '_slice'.
- """
- # TODO: ATM this doesn't work for iget/_slice, can we change that?
-
- if isinstance(indexer, tuple) and len(indexer) == 2:
- # TODO(EA2D): not needed with 2D EAs
- # Should never have length > 2. Caller is responsible for checking.
- # Length 1 is reached via setitem_single_block and setitem_single_column,
- # each of which passes indexer=(pi,)
- if all(isinstance(x, np.ndarray) and x.ndim == 2 for x in indexer):
- # GH#44703 went through indexing.maybe_convert_ix
- first, second = indexer
- if not (
- second.size == 1 and (second == 0).all() and first.shape[1] == 1
- ):
- raise NotImplementedError(
- "This should not be reached. Please report a bug at "
- "github.com/pandas-dev/pandas/"
- )
- indexer = first[:, 0]
-
- elif lib.is_integer(indexer[1]) and indexer[1] == 0:
- # reached via setitem_single_block passing the whole indexer
- indexer = indexer[0]
-
- elif com.is_null_slice(indexer[1]):
- indexer = indexer[0]
-
- elif is_list_like(indexer[1]) and indexer[1][0] == 0:
- indexer = indexer[0]
-
- else:
- raise NotImplementedError(
- "This should not be reached. Please report a bug at "
- "github.com/pandas-dev/pandas/"
- )
- return indexer
-
- @property
- def is_view(self) -> bool:
- """Extension arrays are never treated as views."""
- return False
-
- @cache_readonly
- def is_numeric(self):
- return self.values.dtype._is_numeric
-
- def _slice(
- self, slicer: slice | npt.NDArray[np.bool_] | npt.NDArray[np.intp]
- ) -> ExtensionArray:
- """
- Return a slice of my values.
-
- Parameters
- ----------
- slicer : slice, ndarray[int], or ndarray[bool]
- Valid (non-reducing) indexer for self.values.
-
- Returns
- -------
- ExtensionArray
- """
- # Notes: ndarray[bool] is only reachable via getitem_mgr, which
- # is only for Series, i.e. self.ndim == 1.
-
- # return same dims as we currently have
- if self.ndim == 2:
- # reached via getitem_block via _slice_take_blocks_ax0
- # TODO(EA2D): won't be necessary with 2D EAs
-
- if not isinstance(slicer, slice):
- raise AssertionError(
- "invalid slicing for a 1-ndim ExtensionArray", slicer
- )
- # GH#32959 only full-slicers along fake-dim0 are valid
- # TODO(EA2D): won't be necessary with 2D EAs
- # range(1) instead of self._mgr_locs to avoid exception on [::-1]
- # see test_iloc_getitem_slice_negative_step_ea_block
- new_locs = range(1)[slicer]
- if not len(new_locs):
- raise AssertionError(
- "invalid slicing for a 1-ndim ExtensionArray", slicer
- )
- slicer = slice(None)
-
- return self.values[slicer]
-
- @final
- def getitem_block_index(self, slicer: slice) -> ExtensionBlock:
- """
- Perform __getitem__-like specialized to slicing along index.
- """
- # GH#42787 in principle this is equivalent to values[..., slicer], but we don't
- # require subclasses of ExtensionArray to support that form (for now).
- new_values = self.values[slicer]
- return type(self)(new_values, self._mgr_locs, ndim=self.ndim, refs=self.refs)
-
- def diff(self, n: int, axis: AxisInt = 1) -> list[Block]:
- # only reached with ndim == 2 and axis == 1
- # TODO(EA2D): Can share with NDArrayBackedExtensionBlock
- new_values = algos.diff(self.values, n, axis=0)
- return [self.make_block(values=new_values)]
-
- def shift(
- self, periods: int, axis: AxisInt = 0, fill_value: Any = None
- ) -> list[Block]:
- """
- Shift the block by `periods`.
-
- Dispatches to underlying ExtensionArray and re-boxes in an
- ExtensionBlock.
- """
- new_values = self.values.shift(periods=periods, fill_value=fill_value)
- return [self.make_block_same_class(new_values)]
-
- def _unstack(
- self,
- unstacker,
- fill_value,
- new_placement: npt.NDArray[np.intp],
- needs_masking: npt.NDArray[np.bool_],
- ):
- # ExtensionArray-safe unstack.
- # We override ObjectBlock._unstack, which unstacks directly on the
- # values of the array. For EA-backed blocks, this would require
- # converting to a 2-D ndarray of objects.
- # Instead, we unstack an ndarray of integer positions, followed by
- # a `take` on the actual values.
-
- # Caller is responsible for ensuring self.shape[-1] == len(unstacker.index)
- new_values, mask = unstacker.arange_result
-
- # Note: these next two lines ensure that
- # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks)
- # which the calling function needs in order to pass verify_integrity=False
- # to the BlockManager constructor
- new_values = new_values.T[mask]
- new_placement = new_placement[mask]
-
- # needs_masking[i] calculated once in BlockManager.unstack tells
- # us if there are any -1s in the relevant indices. When False,
- # that allows us to go through a faster path in 'take', among
- # other things avoiding e.g. Categorical._validate_scalar.
- blocks = [
- # TODO: could cast to object depending on fill_value?
- type(self)(
- self.values.take(
- indices, allow_fill=needs_masking[i], fill_value=fill_value
- ),
- BlockPlacement(place),
- ndim=2,
- )
- for i, (indices, place) in enumerate(zip(new_values, new_placement))
- ]
- return blocks, mask
-
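- # A hedged, public-API sketch of the EA-safe unstack above: because values
- # are re-taken rather than cast to a 2-D object ndarray, the extension
- # dtype survives.
- #
- #     >>> import pandas as pd
- #     >>> idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]])
- #     >>> ser = pd.Series(pd.array([1, 2, 3, 4], dtype="Int64"), index=idx)
- #     >>> ser.unstack().dtypes.unique().tolist()
- #     [Int64Dtype()]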
-
-class NumpyBlock(libinternals.NumpyBlock, Block):
- values: np.ndarray
-
- @property
- def is_view(self) -> bool:
- """return a boolean if I am possibly a view"""
- return self.values.base is not None
-
- @property
- def array_values(self) -> ExtensionArray:
- return PandasArray(self.values)
-
- def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray:
- if dtype == _dtype_obj:
- return self.values.astype(_dtype_obj)
- return self.values
-
- def values_for_json(self) -> np.ndarray:
- return self.values
-
-
-class NumericBlock(NumpyBlock):
- __slots__ = ()
- is_numeric = True
-
-
-class NDArrayBackedExtensionBlock(libinternals.NDArrayBackedBlock, EABackedBlock):
- """
- Block backed by an NDArrayBackedExtensionArray
- """
-
- values: NDArrayBackedExtensionArray
-
- # error: Signature of "is_extension" incompatible with supertype "Block"
- @cache_readonly
- def is_extension(self) -> bool: # type: ignore[override]
- # i.e. datetime64tz, PeriodDtype
- return not isinstance(self.dtype, np.dtype)
-
- @property
- def is_view(self) -> bool:
- """return a boolean if I am possibly a view"""
- # check the ndarray values of the DatetimeIndex values
- return self.values._ndarray.base is not None
-
- def diff(self, n: int, axis: AxisInt = 0) -> list[Block]:
- """
- 1st discrete difference.
-
- Parameters
- ----------
- n : int
- Number of periods to diff.
- axis : int, default 0
- Axis to diff upon.
-
- Returns
- -------
- A list with a new Block.
-
- Notes
- -----
- The arguments here are mimicking shift so they are called correctly
- by apply.
- """
- # only reached with ndim == 2 and axis == 1
- values = self.values
-
- new_values = values - values.shift(n, axis=axis)
- return [self.make_block(new_values)]
-
- def shift(
- self, periods: int, axis: AxisInt = 0, fill_value: Any = None
- ) -> list[Block]:
- values = self.values
- new_values = values.shift(periods, fill_value=fill_value, axis=axis)
- return [self.make_block_same_class(new_values)]
-
-
-def _catch_deprecated_value_error(err: Exception) -> None:
- """
- We catch ValueError for now, but only a specific one raised by DatetimeArray
- which will no longer be raised in version 2.0.
- """
- if isinstance(err, ValueError):
- if isinstance(err, IncompatibleFrequency):
- pass
- elif "'value.closed' is" in str(err):
- # IntervalDtype mismatched 'closed'
- pass
-
-
-class DatetimeLikeBlock(NDArrayBackedExtensionBlock):
- """Block for datetime64[ns], timedelta64[ns]."""
-
- __slots__ = ()
- is_numeric = False
- values: DatetimeArray | TimedeltaArray
-
- def values_for_json(self) -> np.ndarray:
- return self.values._ndarray
-
- def interpolate(
- self,
- *,
- method: FillnaOptions = "pad",
- index: Index | None = None,
- axis: int = 0,
- inplace: bool = False,
- limit: int | None = None,
- fill_value=None,
- using_cow: bool = False,
- **kwargs,
- ):
- values = self.values
-
- # error: Non-overlapping equality check (left operand type:
- # "Literal['backfill', 'bfill', 'ffill', 'pad']", right operand type:
- # "Literal['linear']") [comparison-overlap]
- if method == "linear": # type: ignore[comparison-overlap]
- # TODO: GH#50950 implement for arbitrary EAs
- refs = None
- if using_cow:
- if inplace and not self.refs.has_reference():
- data_out = values._ndarray
- refs = self.refs
- else:
- data_out = values._ndarray.copy()
- else:
- data_out = values._ndarray if inplace else values._ndarray.copy()
- missing.interpolate_array_2d(
- data_out, method=method, limit=limit, index=index, axis=axis
- )
- new_values = type(values)._simple_new(data_out, dtype=values.dtype)
- return self.make_block_same_class(new_values, refs=refs)
-
- elif values.ndim == 2 and axis == 0:
- # NDArrayBackedExtensionArray.fillna assumes axis=1
- new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T
- else:
- new_values = values.fillna(value=fill_value, method=method, limit=limit)
- return self.make_block_same_class(new_values)
-
-
-class DatetimeTZBlock(DatetimeLikeBlock):
- """implement a datetime64 block with a tz attribute"""
-
- values: DatetimeArray
-
- __slots__ = ()
- is_extension = True
- _validate_ndim = True
- _can_consolidate = False
-
- # Don't use values_for_json from DatetimeLikeBlock since it is
- # an invalid optimization here (it drops the tz)
- values_for_json = NDArrayBackedExtensionBlock.values_for_json
-
-
-class ObjectBlock(NumpyBlock):
- __slots__ = ()
- is_object = True
-
- @maybe_split
- def convert(
- self,
- *,
- copy: bool = True,
- using_cow: bool = False,
- ) -> list[Block]:
- """
- attempt to cast any object types to better types; return a copy of
- the block (if copy=True). By definition we ARE an ObjectBlock.
- """
- if self.dtype != _dtype_obj:
- # GH#50067 this should be impossible in ObjectBlock, but until
- # that is fixed, we short-circuit here.
- if using_cow:
- return [self.copy(deep=False)]
- return [self]
-
- values = self.values
- if values.ndim == 2:
- # maybe_split ensures we only get here with values.shape[0] == 1,
- # avoid doing .ravel as that might make a copy
- values = values[0]
-
- res_values = lib.maybe_convert_objects(
- values,
- convert_datetime=True,
- convert_timedelta=True,
- convert_period=True,
- convert_interval=True,
- )
- refs = None
- if copy and res_values is values:
- res_values = values.copy()
- elif res_values is values and using_cow:
- refs = self.refs
-
- res_values = ensure_block_shape(res_values, self.ndim)
- return [self.make_block(res_values, refs=refs)]
-
-
-# -----------------------------------------------------------------
-# Constructor Helpers
-
-
-def maybe_coerce_values(values: ArrayLike) -> ArrayLike:
- """
- Input validation for values passed to __init__. Ensure that
- any datetime64/timedelta64 dtypes are in nanoseconds. Ensure
- that we do not have string dtypes.
-
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
-
- Returns
- -------
- values : np.ndarray or ExtensionArray
- """
- # Caller is responsible for ensuring PandasArray is already extracted.
-
- if isinstance(values, np.ndarray):
- values = ensure_wrapped_if_datetimelike(values)
-
- if issubclass(values.dtype.type, str):
- values = np.array(values, dtype=object)
-
- if isinstance(values, (DatetimeArray, TimedeltaArray)) and values.freq is not None:
- # freq is only stored in DatetimeIndex/TimedeltaIndex, not in Series/DataFrame
- values = values._with_freq(None)
-
- return values
-
-
-def get_block_type(dtype: DtypeObj):
- """
- Find the appropriate Block subclass to use for the given values and dtype.
-
- Parameters
- ----------
- dtype : numpy or pandas dtype
-
- Returns
- -------
- cls : class, subclass of Block
- """
- # We use kind checks because it is much more performant
- # than is_foo_dtype
- kind = dtype.kind
-
- cls: type[Block]
-
- if isinstance(dtype, SparseDtype):
- # Need this first(ish) so that Sparse[datetime] is sparse
- cls = ExtensionBlock
- elif isinstance(dtype, DatetimeTZDtype):
- cls = DatetimeTZBlock
- elif isinstance(dtype, PeriodDtype):
- cls = NDArrayBackedExtensionBlock
- elif isinstance(dtype, ExtensionDtype):
- # Note: need to be sure PandasArray is unwrapped before we get here
- cls = ExtensionBlock
-
- elif kind in ["M", "m"]:
- cls = DatetimeLikeBlock
- elif kind in ["f", "c", "i", "u", "b"]:
- cls = NumericBlock
- else:
- cls = ObjectBlock
- return cls
-
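- # An illustrative sketch (hedged) of the mapping above, using the names
- # defined in this module (pandas.core.internals.blocks):
- #
- #     >>> import numpy as np
- #     >>> import pandas as pd
- #     >>> get_block_type(np.dtype("float64")) is NumericBlock
- #     True
- #     >>> get_block_type(np.dtype("M8[ns]")) is DatetimeLikeBlock
- #     True
- #     >>> get_block_type(pd.CategoricalDtype()) is ExtensionBlock
- #     True
- #     >>> get_block_type(np.dtype(object)) is ObjectBlock
- #     True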
-
-def new_block_2d(
- values: ArrayLike, placement: BlockPlacement, refs: BlockValuesRefs | None = None
-):
- # new_block specialized to case with
- # ndim=2
- # isinstance(placement, BlockPlacement)
- # check_ndim/ensure_block_shape already checked
- klass = get_block_type(values.dtype)
-
- values = maybe_coerce_values(values)
- return klass(values, ndim=2, placement=placement, refs=refs)
-
-
-def new_block(
- values, placement, *, ndim: int, refs: BlockValuesRefs | None = None
-) -> Block:
- # caller is responsible for ensuring values is NOT a PandasArray
-
- if not isinstance(placement, BlockPlacement):
- placement = BlockPlacement(placement)
-
- check_ndim(values, placement, ndim)
-
- klass = get_block_type(values.dtype)
-
- values = maybe_coerce_values(values)
- return klass(values, ndim=ndim, placement=placement, refs=refs)
-
-
-def check_ndim(values, placement: BlockPlacement, ndim: int) -> None:
- """
- ndim inference and validation.
-
- Validates that values.ndim and ndim are consistent.
- Validates that len(values) and len(placement) are consistent.
-
- Parameters
- ----------
- values : array-like
- placement : BlockPlacement
- ndim : int
-
- Raises
- ------
- ValueError : the number of dimensions do not match
- """
-
- if values.ndim > ndim:
- # Check for both np.ndarray and ExtensionArray
- raise ValueError(
- "Wrong number of dimensions. "
- f"values.ndim > ndim [{values.ndim} > {ndim}]"
- )
-
- if not is_1d_only_ea_dtype(values.dtype):
- # TODO(EA2D): special case not needed with 2D EAs
- if values.ndim != ndim:
- raise ValueError(
- "Wrong number of dimensions. "
- f"values.ndim != ndim [{values.ndim} != {ndim}]"
- )
- if len(placement) != len(values):
- raise ValueError(
- f"Wrong number of items passed {len(values)}, "
- f"placement implies {len(placement)}"
- )
- elif ndim == 2 and len(placement) != 1:
- # TODO(EA2D): special case unnecessary with 2D EAs
- raise ValueError("need to split")
-
-
-def extract_pandas_array(
- values: np.ndarray | ExtensionArray, dtype: DtypeObj | None, ndim: int
-) -> tuple[np.ndarray | ExtensionArray, DtypeObj | None]:
- """
- Ensure that we don't allow PandasArray / PandasDtype in internals.
- """
- # For now, blocks should be backed by ndarrays when possible.
- if isinstance(values, ABCPandasArray):
- values = values.to_numpy()
- if ndim and ndim > 1:
- # TODO(EA2D): special case not needed with 2D EAs
- values = np.atleast_2d(values)
-
- if isinstance(dtype, PandasDtype):
- dtype = dtype.numpy_dtype
-
- return values, dtype
-
-
-# -----------------------------------------------------------------
-
-
-def extend_blocks(result, blocks=None) -> list[Block]:
- """return a new extended list of blocks, given the result"""
- if blocks is None:
- blocks = []
- if isinstance(result, list):
- for r in result:
- if isinstance(r, list):
- blocks.extend(r)
- else:
- blocks.append(r)
- else:
- assert isinstance(result, Block), type(result)
- blocks.append(result)
- return blocks
-
-
-def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike:
- """
- Reshape if possible to have values.ndim == ndim.
- """
-
- if values.ndim < ndim:
- if not is_1d_only_ea_dtype(values.dtype):
- # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023
- # block.shape is incorrect for "2D" ExtensionArrays
- # We can't, and don't need to, reshape.
- values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values)
- values = values.reshape(1, -1)
-
- return values
-
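- # A minimal sketch (hedged) of the helper above: a 1-D ndarray is reshaped
- # to (1, N), while a 1-D-only ExtensionArray (e.g. Int64) is returned
- # unchanged.
- #
- #     >>> import numpy as np
- #     >>> import pandas as pd
- #     >>> ensure_block_shape(np.arange(3), ndim=2).shape
- #     (1, 3)
- #     >>> ensure_block_shape(pd.array([1, 2], dtype="Int64"), ndim=2).ndim
- #     1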
-
-def to_native_types(
- values: ArrayLike,
- *,
- na_rep: str = "nan",
- quoting=None,
- float_format=None,
- decimal: str = ".",
- **kwargs,
-) -> np.ndarray:
- """convert to our native types format"""
- if isinstance(values, Categorical) and values.categories.dtype.kind in "Mm":
- # GH#40754 Convert categorical datetimes to datetime array
- values = algos.take_nd(
- values.categories._values,
- ensure_platform_int(values._codes),
- fill_value=na_rep,
- )
-
- values = ensure_wrapped_if_datetimelike(values)
-
- if isinstance(values, (DatetimeArray, TimedeltaArray)):
- if values.ndim == 1:
- result = values._format_native_types(na_rep=na_rep, **kwargs)
- result = result.astype(object, copy=False)
- return result
-
- # GH#21734 Process every column separately, they might have different formats
- results_converted = []
- for i in range(len(values)):
- result = values[i, :]._format_native_types(na_rep=na_rep, **kwargs)
- results_converted.append(result.astype(object, copy=False))
- return np.vstack(results_converted)
-
- elif values.dtype.kind == "f" and not is_sparse(values):
- # see GH#13418: no special formatting is desired at the
- # output (important for appropriate 'quoting' behaviour),
- # so do not pass it through the FloatArrayFormatter
- if float_format is None and decimal == ".":
- mask = isna(values)
-
- if not quoting:
- values = values.astype(str)
- else:
- values = np.array(values, dtype="object")
-
- values[mask] = na_rep
- values = values.astype(object, copy=False)
- return values
-
- from pandas.io.formats.format import FloatArrayFormatter
-
- formatter = FloatArrayFormatter(
- values,
- na_rep=na_rep,
- float_format=float_format,
- decimal=decimal,
- quoting=quoting,
- fixed_width=False,
- )
- res = formatter.get_result_as_array()
- res = res.astype(object, copy=False)
- return res
-
- elif isinstance(values, ExtensionArray):
- mask = isna(values)
-
- new_values = np.asarray(values.astype(object))
- new_values[mask] = na_rep
- return new_values
-
- else:
- mask = isna(values)
- itemsize = writers.word_len(na_rep)
-
- if values.dtype != _dtype_obj and not quoting and itemsize:
- values = values.astype(str)
- if values.dtype.itemsize / np.dtype("U1").itemsize < itemsize:
- # enlarge for the na_rep
- values = values.astype(f"<U{itemsize}")
- else:
- values = np.array(values, dtype="object")
-
- values[mask] = na_rep
- values = values.astype(object, copy=False)
- return values
-
-
-def external_values(values: ArrayLike) -> ArrayLike:
- """
- The array that Series.values returns (public attribute).
-
- This has some historical constraints, and is overridden in block
- subclasses to return the correct array (e.g. period returns
- object ndarray and datetimetz a datetime64[ns] ndarray instead of
- proper extension array).
- """
- if isinstance(values, (PeriodArray, IntervalArray)):
- return values.astype(object)
- elif isinstance(values, (DatetimeArray, TimedeltaArray)):
- # NB: for datetime64tz this is different from np.asarray(values), since
- # that returns an object-dtype ndarray of Timestamps.
- # Avoid raising in .astype in casting from dt64tz to dt64
- values = values._ndarray
-
- if isinstance(values, np.ndarray) and using_copy_on_write():
- values = values.view()
- values.flags.writeable = False
-
- # TODO(CoW) we should also mark our ExtensionArrays as read-only
-
- return values
diff --git a/contrib/python/pandas/py3/pandas/core/internals/concat.py b/contrib/python/pandas/py3/pandas/core/internals/concat.py
deleted file mode 100644
index e9cf7a151b6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/concat.py
+++ /dev/null
@@ -1,791 +0,0 @@
-from __future__ import annotations
-
-import copy as cp
-import itertools
-from typing import (
- TYPE_CHECKING,
- Sequence,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import (
- NaT,
- internals as libinternals,
-)
-from pandas._libs.missing import NA
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- DtypeObj,
- Manager,
- Shape,
-)
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.astype import astype_array
-from pandas.core.dtypes.cast import (
- ensure_dtype_can_hold_na,
- find_common_type,
- np_find_common_type,
-)
-from pandas.core.dtypes.common import (
- is_1d_only_ea_dtype,
- is_dtype_equal,
- is_scalar,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import (
- DatetimeTZDtype,
- ExtensionDtype,
-)
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
- isna_all,
-)
-
-import pandas.core.algorithms as algos
-from pandas.core.arrays import (
- DatetimeArray,
- ExtensionArray,
-)
-from pandas.core.arrays.sparse import SparseDtype
-from pandas.core.construction import ensure_wrapped_if_datetimelike
-from pandas.core.internals.array_manager import (
- ArrayManager,
- NullArrayProxy,
-)
-from pandas.core.internals.blocks import (
- ensure_block_shape,
- new_block_2d,
-)
-from pandas.core.internals.managers import BlockManager
-
-if TYPE_CHECKING:
- from pandas import Index
- from pandas.core.internals.blocks import Block
-
-
-def _concatenate_array_managers(
- mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
-) -> Manager:
- """
- Concatenate array managers into one.
-
- Parameters
- ----------
- mgrs_indexers : list of (ArrayManager, {axis: indexer,...}) tuples
- axes : list of Index
- concat_axis : int
- copy : bool
-
- Returns
- -------
- ArrayManager
- """
- # reindex all arrays
- mgrs = []
- for mgr, indexers in mgrs_indexers:
- axis1_made_copy = False
- for ax, indexer in indexers.items():
- mgr = mgr.reindex_indexer(
- axes[ax], indexer, axis=ax, allow_dups=True, use_na_proxy=True
- )
- if ax == 1 and indexer is not None:
- axis1_made_copy = True
- if copy and concat_axis == 0 and not axis1_made_copy:
- # for concat_axis 1 we will always get a copy through concat_arrays
- mgr = mgr.copy()
- mgrs.append(mgr)
-
- if concat_axis == 1:
- # concatting along the rows -> concat the reindexed arrays
- # TODO(ArrayManager) doesn't yet preserve the correct dtype
- arrays = [
- concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
- for j in range(len(mgrs[0].arrays))
- ]
- else:
- # concatting along the columns -> combine reindexed arrays in a single manager
- assert concat_axis == 0
- arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
-
- new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
- return new_mgr
-
-
-def concat_arrays(to_concat: list) -> ArrayLike:
- """
- Alternative for concat_compat but specialized for use in the ArrayManager.
-
- Differences: only deals with 1D arrays (no axis keyword), assumes
- ensure_wrapped_if_datetimelike has already been called, and does not skip
- empty arrays when determining the dtype.
- In addition ensures that all NullArrayProxies get replaced with actual
- arrays.
-
- Parameters
- ----------
- to_concat : list of arrays
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- # ignore the all-NA proxies to determine the resulting dtype
- to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)]
-
- dtypes = {x.dtype for x in to_concat_no_proxy}
- single_dtype = len(dtypes) == 1
-
- if single_dtype:
- target_dtype = to_concat_no_proxy[0].dtype
- elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes):
- # GH#42092
- target_dtype = np_find_common_type(*dtypes)
- else:
- target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
-
- to_concat = [
- arr.to_array(target_dtype)
- if isinstance(arr, NullArrayProxy)
- else astype_array(arr, target_dtype, copy=False)
- for arr in to_concat
- ]
-
- if isinstance(to_concat[0], ExtensionArray):
- cls = type(to_concat[0])
- return cls._concat_same_type(to_concat)
-
- result = np.concatenate(to_concat)
-
- # TODO decide on exact behaviour (we shouldn't do this only for empty result)
- # see https://github.com/pandas-dev/pandas/issues/39817
- if len(result) == 0:
- # all empties -> check for bool to not coerce to float
- kinds = {obj.dtype.kind for obj in to_concat_no_proxy}
- if len(kinds) != 1:
- if "b" in kinds:
- result = result.astype(object)
- return result
-
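- # A short public-API sketch (hedged) of the common-dtype resolution used in
- # concatenation: int64 and float64 pieces are combined as float64.
- #
- #     >>> import pandas as pd
- #     >>> a = pd.DataFrame({"x": [1, 2]})      # int64
- #     >>> b = pd.DataFrame({"x": [0.5]})       # float64
- #     >>> pd.concat([a, b])["x"].dtype
- #     dtype('float64')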
-
-def concatenate_managers(
- mgrs_indexers, axes: list[Index], concat_axis: AxisInt, copy: bool
-) -> Manager:
- """
- Concatenate block managers into one.
-
- Parameters
- ----------
- mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
- axes : list of Index
- concat_axis : int
- copy : bool
-
- Returns
- -------
- BlockManager
- """
- # TODO(ArrayManager) this assumes that all managers are of the same type
- if isinstance(mgrs_indexers[0][0], ArrayManager):
- return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)
-
- # Assertions disabled for performance
- # for tup in mgrs_indexers:
- # # caller is responsible for ensuring this
- # indexers = tup[1]
- # assert concat_axis not in indexers
-
- if concat_axis == 0:
- return _concat_managers_axis0(mgrs_indexers, axes, copy)
-
- mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
-
- concat_plans = [
- _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
- ]
- concat_plan = _combine_concat_plans(concat_plans)
- blocks = []
-
- for placement, join_units in concat_plan:
- unit = join_units[0]
- blk = unit.block
-
- if len(join_units) == 1 and not join_units[0].indexers:
- values = blk.values
- if copy:
- values = values.copy()
- else:
- values = values.view()
- fastpath = True
- elif _is_uniform_join_units(join_units):
- vals = [ju.block.values for ju in join_units]
-
- if not blk.is_extension:
- # _is_uniform_join_units ensures a single dtype, so
- # we can use np.concatenate, which is more performant
- # than concat_compat
- values = np.concatenate(vals, axis=1)
- else:
- # TODO(EA2D): special-casing not needed with 2D EAs
- values = concat_compat(vals, axis=1)
- values = ensure_block_shape(values, ndim=2)
-
- values = ensure_wrapped_if_datetimelike(values)
-
- fastpath = blk.values.dtype == values.dtype
- else:
- values = _concatenate_join_units(join_units, copy=copy)
- fastpath = False
-
- if fastpath:
- b = blk.make_block_same_class(values, placement=placement)
- else:
- b = new_block_2d(values, placement=placement)
-
- blocks.append(b)
-
- return BlockManager(tuple(blocks), axes)
-
-
-def _concat_managers_axis0(
- mgrs_indexers, axes: list[Index], copy: bool
-) -> BlockManager:
- """
- concat_managers specialized to concat_axis=0, with reindexing already
- having been done in _maybe_reindex_columns_na_proxy.
- """
- had_reindexers = {
- i: len(mgrs_indexers[i][1]) > 0 for i in range(len(mgrs_indexers))
- }
- mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)
-
- mgrs = [x[0] for x in mgrs_indexers]
-
- offset = 0
- blocks = []
- for i, mgr in enumerate(mgrs):
- # If we already reindexed, then we definitely don't need another copy
- made_copy = had_reindexers[i]
-
- for blk in mgr.blocks:
- if made_copy:
- nb = blk.copy(deep=False)
- elif copy:
- nb = blk.copy()
- else:
- # by slicing instead of copy(deep=False), we get a new array
- # object, see test_concat_copy
- nb = blk.getitem_block(slice(None))
- nb._mgr_locs = nb._mgr_locs.add(offset)
- blocks.append(nb)
-
- offset += len(mgr.items)
-
- result = BlockManager(tuple(blocks), axes)
- return result
-
-
-def _maybe_reindex_columns_na_proxy(
- axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]]
-) -> list[tuple[BlockManager, dict[int, np.ndarray]]]:
- """
- Reindex along columns so that all of the BlockManagers being concatenated
- have matching columns.
-
- Columns added in this reindexing have dtype=np.void, indicating they
- should be ignored when choosing a column's final dtype.
- """
- new_mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] = []
-
- for mgr, indexers in mgrs_indexers:
- # For axis=0 (i.e. columns) we use_na_proxy and only_slice, so this
- # is a cheap reindexing.
- for i, indexer in indexers.items():
- mgr = mgr.reindex_indexer(
- axes[i],
- indexers[i],
- axis=i,
- copy=False,
- only_slice=True, # only relevant for i==0
- allow_dups=True,
- use_na_proxy=True, # only relevant for i==0
- )
- new_mgrs_indexers.append((mgr, {}))
- return new_mgrs_indexers
-
-
-def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]):
- """
- Construct concatenation plan for given block manager and indexers.
-
- Parameters
- ----------
- mgr : BlockManager
- indexers : dict of {axis: indexer}
-
- Returns
- -------
- plan : list of (BlockPlacement, JoinUnit) tuples
-
- """
- assert len(indexers) == 0
-
- # Calculate post-reindex shape, save for item axis which will be separate
- # for each block anyway.
- mgr_shape_list = list(mgr.shape)
- for ax, indexer in indexers.items():
- mgr_shape_list[ax] = len(indexer)
- mgr_shape = tuple(mgr_shape_list)
-
- assert 0 not in indexers
-
- if mgr.is_single_block:
- blk = mgr.blocks[0]
- return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
-
- blknos = mgr.blknos
- blklocs = mgr.blklocs
-
- plan = []
- for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):
- assert placements.is_slice_like
- assert blkno != -1
-
- join_unit_indexers = indexers.copy()
-
- shape_list = list(mgr_shape)
- shape_list[0] = len(placements)
- shape = tuple(shape_list)
-
- blk = mgr.blocks[blkno]
- ax0_blk_indexer = blklocs[placements.indexer]
-
- unit_no_ax0_reindexing = (
- len(placements) == len(blk.mgr_locs)
- and
- # Fastpath detection of join unit not
- # needing to reindex its block: no ax0
- # reindexing took place and block
- # placement was sequential before.
- (
- (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1)
- or
- # Slow-ish detection: all indexer locs
- # are sequential (and length match is
- # checked above).
- (np.diff(ax0_blk_indexer) == 1).all()
- )
- )
-
- # Omit indexer if no item reindexing is required.
- if unit_no_ax0_reindexing:
- join_unit_indexers.pop(0, None)
- else:
- join_unit_indexers[0] = ax0_blk_indexer
-
- unit = JoinUnit(blk, shape, join_unit_indexers)
-
- plan.append((placements, unit))
-
- return plan
-
-
-class JoinUnit:
- def __init__(self, block: Block, shape: Shape, indexers=None) -> None:
- # Passing shape explicitly is required for cases when block is None.
- # Note: block is None implies indexers is None, but not vice-versa
- if indexers is None:
- indexers = {}
- self.block = block
- self.indexers = indexers
- self.shape = shape
-
- def __repr__(self) -> str:
- return f"{type(self).__name__}({repr(self.block)}, {self.indexers})"
-
- @cache_readonly
- def needs_filling(self) -> bool:
- for indexer in self.indexers.values():
- # FIXME: cache results of indexer == -1 checks.
- if (indexer == -1).any():
- return True
-
- return False
-
- @cache_readonly
- def dtype(self) -> DtypeObj:
- blk = self.block
- if blk.values.dtype.kind == "V":
- raise AssertionError("Block is None, no dtype")
-
- if not self.needs_filling:
- return blk.dtype
- return ensure_dtype_can_hold_na(blk.dtype)
-
- def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
- """
- Check that this unit is all-NA and that its NA values are of a type/dtype
- compatible with the given dtype.
- Augments `self.is_na` with an additional check of the type of NA values.
- """
- if not self.is_na:
- return False
- if self.block.dtype.kind == "V":
- return True
-
- if self.dtype == object:
- values = self.block.values
- return all(is_valid_na_for_dtype(x, dtype) for x in values.ravel(order="K"))
-
- na_value = self.block.fill_value
- if na_value is NaT and not is_dtype_equal(self.dtype, dtype):
- # e.g. we are dt64 and other is td64
- # fill_values match but we should not cast self.block.values to dtype
- # TODO: this will need updating if we ever have non-nano dt64/td64
- return False
-
- if na_value is NA and needs_i8_conversion(dtype):
- # FIXME: kludge; test_append_empty_frame_with_timedelta64ns_nat
- # e.g. self.dtype == "Int64" and dtype is td64; we don't want
- # to consider these as matching
- return False
-
- # TODO: better to use can_hold_element?
- return is_valid_na_for_dtype(na_value, dtype)
-
- @cache_readonly
- def is_na(self) -> bool:
- blk = self.block
- if blk.dtype.kind == "V":
- return True
-
- if not blk._can_hold_na:
- return False
-
- values = blk.values
- if values.size == 0:
- return True
- if isinstance(values.dtype, SparseDtype):
- return False
-
- if values.ndim == 1:
- # TODO(EA2D): no need for special case with 2D EAs
- val = values[0]
- if not is_scalar(val) or not isna(val):
- # ideally isna_all would do this short-circuiting
- return False
- return isna_all(values)
- else:
- val = values[0][0]
- if not is_scalar(val) or not isna(val):
- # ideally isna_all would do this short-circuiting
- return False
- return all(isna_all(row) for row in values)
-
- def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
- values: ArrayLike
-
- if upcasted_na is None and self.block.dtype.kind != "V":
- # No upcasting is necessary
- fill_value = self.block.fill_value
- values = self.block.values
- else:
- fill_value = upcasted_na
-
- if self._is_valid_na_for(empty_dtype):
- # note: always holds when self.block.dtype.kind == "V"
- blk_dtype = self.block.dtype
-
- if blk_dtype == np.dtype("object"):
- # we want to avoid filling with np.nan if we are
- # using None; we already know that we are all
- # nulls
- values = self.block.values.ravel(order="K")
- if len(values) and values[0] is None:
- fill_value = None
-
- if isinstance(empty_dtype, DatetimeTZDtype):
- # NB: exclude e.g. pyarrow[dt64tz] dtypes
- i8values = np.full(self.shape, fill_value._value)
- return DatetimeArray(i8values, dtype=empty_dtype)
-
- elif is_1d_only_ea_dtype(empty_dtype):
- if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
- # avoid creating new empty array if we already have an array
- # with correct dtype that can be reindexed
- pass
- else:
- empty_dtype = cast(ExtensionDtype, empty_dtype)
- cls = empty_dtype.construct_array_type()
-
- missing_arr = cls._from_sequence([], dtype=empty_dtype)
- ncols, nrows = self.shape
- assert ncols == 1, ncols
- empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
- return missing_arr.take(
- empty_arr, allow_fill=True, fill_value=fill_value
- )
- elif isinstance(empty_dtype, ExtensionDtype):
- # TODO: no tests get here, a handful would if we disabled
- # the dt64tz special-case above (which is faster)
- cls = empty_dtype.construct_array_type()
- missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
- missing_arr[:] = fill_value
- return missing_arr
- else:
- # NB: we should never get here with empty_dtype integer or bool;
- # if we did, the missing_arr.fill would cast to gibberish
- missing_arr = np.empty(self.shape, dtype=empty_dtype)
- missing_arr.fill(fill_value)
- return missing_arr
-
- if (not self.indexers) and (not self.block._can_consolidate):
- # preserve these for validation in concat_compat
- return self.block.values
-
- if self.block.is_bool:
- # External code requested filling/upcasting, bool values must
- # be upcasted to object to avoid being upcasted to numeric.
- values = self.block.astype(np.dtype("object")).values
- else:
- # No dtype upcasting is done here, it will be performed during
- # concatenation itself.
- values = self.block.values
-
- if not self.indexers:
- # If there's no indexing to be done, we want to signal outside
- # code that this array must be copied explicitly. This is done
- # by returning a view and checking `retval.base`.
- values = values.view()
-
- else:
- for ax, indexer in self.indexers.items():
- values = algos.take_nd(values, indexer, axis=ax)
-
- return values
-
-
-def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike:
- """
- Concatenate values from several join units along axis=1.
- """
- empty_dtype = _get_empty_dtype(join_units)
-
- has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
- upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
-
- to_concat = [
- ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na)
- for ju in join_units
- ]
-
- if len(to_concat) == 1:
- # Only one block, nothing to concatenate.
- concat_values = to_concat[0]
- if copy:
- if isinstance(concat_values, np.ndarray):
- # non-reindexed (=not yet copied) arrays are made into a view
- # in JoinUnit.get_reindexed_values
- if concat_values.base is not None:
- concat_values = concat_values.copy()
- else:
- concat_values = concat_values.copy()
-
- elif any(is_1d_only_ea_dtype(t.dtype) for t in to_concat):
- # TODO(EA2D): special case not needed if all EAs used HybridBlocks
-
- # error: No overload variant of "__getitem__" of "ExtensionArray" matches
- # argument type "Tuple[int, slice]"
- to_concat = [
- t
- if is_1d_only_ea_dtype(t.dtype)
- else t[0, :] # type: ignore[call-overload]
- for t in to_concat
- ]
- concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True)
- concat_values = ensure_block_shape(concat_values, 2)
-
- else:
- concat_values = concat_compat(to_concat, axis=1)
-
- return concat_values
-
-
-def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
- """
- Find the NA value to go with this dtype.
- """
- if isinstance(dtype, ExtensionDtype):
- return dtype.na_value
- elif dtype.kind in ["m", "M"]:
- return dtype.type("NaT")
- elif dtype.kind in ["f", "c"]:
- return dtype.type("NaN")
- elif dtype.kind == "b":
- # different from missing.na_value_for_dtype
- return None
- elif dtype.kind in ["i", "u"]:
- if not has_none_blocks:
- # different from missing.na_value_for_dtype
- return None
- return np.nan
- elif dtype.kind == "O":
- return np.nan
- raise NotImplementedError
-
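-# For illustration of the mapping above (a sketch for two common cases):
-#   >>> import numpy as np
-#   >>> np.isnan(_dtype_to_na_value(np.dtype("float64"), False))
-#   True
-#   >>> _dtype_to_na_value(np.dtype("M8[ns]"), False)
-#   numpy.datetime64('NaT')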
-
-def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
- """
- Return the dtype to use when concatenating the specified join units.
-
- Returns
- -------
- dtype
- """
- if len(join_units) == 1:
- blk = join_units[0].block
- return blk.dtype
-
- if _is_uniform_reindex(join_units):
- empty_dtype = join_units[0].block.dtype
- return empty_dtype
-
- has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
-
- dtypes = [unit.dtype for unit in join_units if not unit.is_na]
- if not len(dtypes):
- dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"]
-
- dtype = find_common_type(dtypes)
- if has_none_blocks:
- dtype = ensure_dtype_can_hold_na(dtype)
- return dtype
-
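-# The common-dtype resolution above is what shows up publicly when
-# row-concatenating mismatched dtypes (a sketch with int64 + float64):
-#   >>> import pandas as pd
-#   >>> df1 = pd.DataFrame({"a": [1]}); df2 = pd.DataFrame({"a": [1.5]})
-#   >>> pd.concat([df1, df2])["a"].dtype
-#   dtype('float64')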
-
-def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
- """
- Check if the join units consist of blocks of uniform type that can
- be concatenated using Block.concat_same_type instead of the generic
- _concatenate_join_units (which uses `concat_compat`).
-
- """
- first = join_units[0].block
- if first.dtype.kind == "V":
- return False
- return (
- # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64
- all(type(ju.block) is type(first) for ju in join_units)
- and
- # e.g. DatetimeLikeBlock can be dt64 or td64, but these are not uniform
- all(
- is_dtype_equal(ju.block.dtype, first.dtype)
- # GH#42092 we only want the dtype_equal check for non-numeric blocks
- # (for now, may change but that would need a deprecation)
- or ju.block.dtype.kind in ["b", "i", "u"]
- for ju in join_units
- )
- and
- # no blocks that would get missing values (can lead to type upcasts)
- # unless we're an extension dtype.
- all(not ju.is_na or ju.block.is_extension for ju in join_units)
- and
- # no blocks with indexers (as then the dimensions do not fit)
- all(not ju.indexers for ju in join_units)
- and
- # only use this path when there is something to concatenate
- len(join_units) > 1
- )
-
-
-def _is_uniform_reindex(join_units) -> bool:
- return (
- # TODO: should this be ju.block._can_hold_na?
- all(ju.block.is_extension for ju in join_units)
- and len({ju.block.dtype.name for ju in join_units}) == 1
- )
-
-
-def _trim_join_unit(join_unit: JoinUnit, length: int) -> JoinUnit:
- """
- Reduce join_unit's shape along item axis to length.
-
- Extra items that didn't fit are returned as a separate block.
- """
- if 0 not in join_unit.indexers:
- extra_indexers = join_unit.indexers
-
- if join_unit.block is None:
- extra_block = None
- else:
- extra_block = join_unit.block.getitem_block(slice(length, None))
- join_unit.block = join_unit.block.getitem_block(slice(length))
- else:
- extra_block = join_unit.block
-
- extra_indexers = cp.copy(join_unit.indexers)
- extra_indexers[0] = extra_indexers[0][length:]
- join_unit.indexers[0] = join_unit.indexers[0][:length]
-
- extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
- join_unit.shape = (length,) + join_unit.shape[1:]
-
- return JoinUnit(block=extra_block, indexers=extra_indexers, shape=extra_shape)
-
-
-def _combine_concat_plans(plans):
- """
- Combine multiple concatenation plans into one.
-
- The input join units are updated in-place (via _trim_join_unit) as the
- plans are merged.
-
- We only get here with concat_axis == 1.
- """
- if len(plans) == 1:
- for p in plans[0]:
- yield p[0], [p[1]]
-
- else:
- # singleton list so we can modify it as a side-effect within _next_or_none
- num_ended = [0]
-
- def _next_or_none(seq):
- retval = next(seq, None)
- if retval is None:
- num_ended[0] += 1
- return retval
-
- plans = list(map(iter, plans))
- next_items = list(map(_next_or_none, plans))
-
- while num_ended[0] != len(next_items):
- if num_ended[0] > 0:
- raise ValueError("Plan shapes are not aligned")
-
- placements, units = zip(*next_items)
-
- lengths = list(map(len, placements))
- min_len, max_len = min(lengths), max(lengths)
-
- if min_len == max_len:
- yield placements[0], units
- next_items[:] = map(_next_or_none, plans)
- else:
- yielded_placement = None
- yielded_units = [None] * len(next_items)
- for i, (plc, unit) in enumerate(next_items):
- yielded_units[i] = unit
- if len(plc) > min_len:
- # _trim_join_unit updates unit in place, so only
- # placement needs to be sliced to skip min_len.
- next_items[i] = (plc[min_len:], _trim_join_unit(unit, min_len))
- else:
- yielded_placement = plc
- next_items[i] = _next_or_none(plans[i])
-
- yield yielded_placement, yielded_units
diff --git a/contrib/python/pandas/py3/pandas/core/internals/construction.py b/contrib/python/pandas/py3/pandas/core/internals/construction.py
deleted file mode 100644
index 8bfad966390..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/construction.py
+++ /dev/null
@@ -1,1069 +0,0 @@
-"""
-Functions for preparing various inputs passed to the DataFrame or Series
-constructors before passing them to a BlockManager.
-"""
-from __future__ import annotations
-
-from collections import abc
-from typing import (
- Any,
- Hashable,
- Sequence,
-)
-
-import numpy as np
-from numpy import ma
-
-from pandas._libs import lib
-from pandas._typing import (
- ArrayLike,
- DtypeObj,
- Manager,
- npt,
-)
-
-from pandas.core.dtypes.astype import astype_is_view
-from pandas.core.dtypes.cast import (
- construct_1d_arraylike_from_scalar,
- dict_compat,
- maybe_cast_to_datetime,
- maybe_convert_platform,
- maybe_infer_to_datetimelike,
-)
-from pandas.core.dtypes.common import (
- is_1d_only_ea_dtype,
- is_bool_dtype,
- is_datetime_or_timedelta_dtype,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_list_like,
- is_named_tuple,
- is_object_dtype,
-)
-from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-from pandas.core import (
- algorithms,
- common as com,
-)
-from pandas.core.arrays import (
- BooleanArray,
- ExtensionArray,
- FloatingArray,
- IntegerArray,
-)
-from pandas.core.arrays.string_ import StringDtype
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
- range_to_ndarray,
- sanitize_array,
-)
-from pandas.core.indexes.api import (
- DatetimeIndex,
- Index,
- TimedeltaIndex,
- default_index,
- ensure_index,
- get_objs_combined_axis,
- union_indexes,
-)
-from pandas.core.internals.array_manager import (
- ArrayManager,
- SingleArrayManager,
-)
-from pandas.core.internals.blocks import (
- BlockPlacement,
- ensure_block_shape,
- new_block_2d,
-)
-from pandas.core.internals.managers import (
- BlockManager,
- SingleBlockManager,
- create_block_manager_from_blocks,
- create_block_manager_from_column_arrays,
-)
-
-# ---------------------------------------------------------------------
-# BlockManager Interface
-
-
-def arrays_to_mgr(
- arrays,
- columns: Index,
- index,
- *,
- dtype: DtypeObj | None = None,
- verify_integrity: bool = True,
- typ: str | None = None,
- consolidate: bool = True,
-) -> Manager:
- """
- Segregate Series based on type and coerce into matrices.
-
- Needs to handle a lot of exceptional cases.
- """
- if verify_integrity:
- # figure out the index, if necessary
- if index is None:
- index = _extract_index(arrays)
- else:
- index = ensure_index(index)
-
- # don't force copy because getting jammed in an ndarray anyway
- arrays, refs = _homogenize(arrays, index, dtype)
- # _homogenize ensures
- # - all(len(x) == len(index) for x in arrays)
- # - all(x.ndim == 1 for x in arrays)
- # - all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
- # - all(type(x) is not PandasArray for x in arrays)
-
- else:
- index = ensure_index(index)
- arrays = [extract_array(x, extract_numpy=True) for x in arrays]
- # with _from_arrays, the passed arrays should never be Series objects
- refs = [None] * len(arrays)
-
- # Reached via DataFrame._from_arrays; we do minimal validation here
- for arr in arrays:
- if (
- not isinstance(arr, (np.ndarray, ExtensionArray))
- or arr.ndim != 1
- or len(arr) != len(index)
- ):
- raise ValueError(
- "Arrays must be 1-dimensional np.ndarray or ExtensionArray "
- "with length matching len(index)"
- )
-
- columns = ensure_index(columns)
- if len(columns) != len(arrays):
- raise ValueError("len(arrays) must match len(columns)")
-
- # from BlockManager perspective
- axes = [columns, index]
-
- if typ == "block":
- return create_block_manager_from_column_arrays(
- arrays, axes, consolidate=consolidate, refs=refs
- )
- elif typ == "array":
- return ArrayManager(arrays, [index, columns])
- else:
- raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
-
-
-def rec_array_to_mgr(
- data: np.recarray | np.ndarray,
- index,
- columns,
- dtype: DtypeObj | None,
- copy: bool,
- typ: str,
-) -> Manager:
- """
- Extract from a masked rec array and create the manager.
- """
- # essentially process a record array then fill it
- fdata = ma.getdata(data)
- if index is None:
- index = default_index(len(fdata))
- else:
- index = ensure_index(index)
-
- if columns is not None:
- columns = ensure_index(columns)
- arrays, arr_columns = to_arrays(fdata, columns)
-
- # create the manager
-
- arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, len(index))
- if columns is None:
- columns = arr_columns
-
- mgr = arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ)
-
- if copy:
- mgr = mgr.copy()
- return mgr
-
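-# Public entry point for this path (a sketch): constructing a DataFrame from
-# a numpy structured/record array takes the field names as columns.
-#   >>> import numpy as np, pandas as pd
-#   >>> rec = np.array([(1, 2.0), (3, 4.0)], dtype=[("a", "i8"), ("b", "f8")])
-#   >>> pd.DataFrame(rec).dtypes.tolist()
-#   [dtype('int64'), dtype('float64')]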
-
-def mgr_to_mgr(mgr, typ: str, copy: bool = True):
- """
- Convert to specific type of Manager. Does not copy if the type is already
- correct. Does not guarantee a copy otherwise. `copy` keyword only controls
- whether conversion from Block->ArrayManager copies the 1D arrays.
- """
- new_mgr: Manager
-
- if typ == "block":
- if isinstance(mgr, BlockManager):
- new_mgr = mgr
- else:
- if mgr.ndim == 2:
- new_mgr = arrays_to_mgr(
- mgr.arrays, mgr.axes[0], mgr.axes[1], typ="block"
- )
- else:
- new_mgr = SingleBlockManager.from_array(mgr.arrays[0], mgr.index)
- elif typ == "array":
- if isinstance(mgr, ArrayManager):
- new_mgr = mgr
- else:
- if mgr.ndim == 2:
- arrays = [mgr.iget_values(i) for i in range(len(mgr.axes[0]))]
- if copy:
- arrays = [arr.copy() for arr in arrays]
- new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
- else:
- array = mgr.internal_values()
- if copy:
- array = array.copy()
- new_mgr = SingleArrayManager([array], [mgr.index])
- else:
- raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")
- return new_mgr
-
-
-# ---------------------------------------------------------------------
-# DataFrame Constructor Interface
-
-
-def ndarray_to_mgr(
- values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str
-) -> Manager:
- # used in DataFrame.__init__
- # input must be a ndarray, list, Series, Index, ExtensionArray
-
- if isinstance(values, ABCSeries):
- if columns is None:
- if values.name is not None:
- columns = Index([values.name])
- if index is None:
- index = values.index
- else:
- values = values.reindex(index)
-
- # zero len case (GH #2234)
- if not len(values) and columns is not None and len(columns):
- values = np.empty((0, 1), dtype=object)
-
- # if the array preparation does a copy -> avoid this for ArrayManager,
- # since the copy is done on conversion to 1D arrays
- copy_on_sanitize = False if typ == "array" else copy
-
- vdtype = getattr(values, "dtype", None)
- refs = None
- if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
- # GH#19157
-
- if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
- # GH#12513 an EA dtype passed with a 2D array, split into
- # multiple EAs that view the values
- # error: No overload variant of "__getitem__" of "ExtensionArray"
- # matches argument type "Tuple[slice, int]"
- values = [
- values[:, n] # type: ignore[call-overload]
- for n in range(values.shape[1])
- ]
- else:
- values = [values]
-
- if columns is None:
- columns = Index(range(len(values)))
- else:
- columns = ensure_index(columns)
-
- return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)
-
- elif is_extension_array_dtype(vdtype):
- # i.e. Datetime64TZ, PeriodDtype; cases with is_1d_only_ea_dtype(vdtype)
- # are already caught above
- values = extract_array(values, extract_numpy=True)
- if copy:
- values = values.copy()
- if values.ndim == 1:
- values = values.reshape(-1, 1)
-
- elif isinstance(values, (ABCSeries, Index)):
- if not copy_on_sanitize and (
- dtype is None or astype_is_view(values.dtype, dtype)
- ):
- refs = values._references
-
- if copy_on_sanitize:
- values = values._values.copy()
- else:
- values = values._values
-
- values = _ensure_2d(values)
-
- elif isinstance(values, (np.ndarray, ExtensionArray)):
- # drop subclass info
- _copy = (
- copy_on_sanitize
- if (dtype is None or astype_is_view(values.dtype, dtype))
- else False
- )
- values = np.array(values, copy=_copy)
- values = _ensure_2d(values)
-
- else:
- # by definition an array here
- # the dtypes will be coerced to a single dtype
- values = _prep_ndarraylike(values, copy=copy_on_sanitize)
-
- if dtype is not None and not is_dtype_equal(values.dtype, dtype):
- # GH#40110 see similar check inside sanitize_array
- values = sanitize_array(
- values,
- None,
- dtype=dtype,
- copy=copy_on_sanitize,
- allow_2d=True,
- )
-
- # _prep_ndarraylike ensures that values.ndim == 2 at this point
- index, columns = _get_axes(
- values.shape[0], values.shape[1], index=index, columns=columns
- )
-
- _check_values_indices_shape_match(values, index, columns)
-
- if typ == "array":
- if issubclass(values.dtype.type, str):
- values = np.array(values, dtype=object)
-
- if dtype is None and is_object_dtype(values.dtype):
- arrays = [
- ensure_wrapped_if_datetimelike(
- maybe_infer_to_datetimelike(values[:, i])
- )
- for i in range(values.shape[1])
- ]
- else:
- if is_datetime_or_timedelta_dtype(values.dtype):
- values = ensure_wrapped_if_datetimelike(values)
- arrays = [values[:, i] for i in range(values.shape[1])]
-
- if copy:
- arrays = [arr.copy() for arr in arrays]
-
- return ArrayManager(arrays, [index, columns], verify_integrity=False)
-
- values = values.T
-
- # if we don't have a dtype specified, then try to convert objects
- # on the entire block; this is to convert if we have datetimelikes
- # embedded in an object type
- if dtype is None and is_object_dtype(values.dtype):
- obj_columns = list(values)
- maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
- # don't convert (and copy) the objects if no type inference occurs
- if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
- dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
- block_values = [
- new_block_2d(dvals_list[n], placement=BlockPlacement(n))
- for n in range(len(dvals_list))
- ]
- else:
- bp = BlockPlacement(slice(len(columns)))
- nb = new_block_2d(values, placement=bp, refs=refs)
- block_values = [nb]
- else:
- bp = BlockPlacement(slice(len(columns)))
- nb = new_block_2d(values, placement=bp, refs=refs)
- block_values = [nb]
-
- if len(columns) == 0:
- # TODO: check len(values) == 0?
- block_values = []
-
- return create_block_manager_from_blocks(
- block_values, [columns, index], verify_integrity=False
- )
-
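-# One subtle step above, seen publicly (a sketch, assuming an object array of
-# Timestamp objects): with dtype=None, object columns are inspected
-# column-by-column for datetimelikes.
-#   >>> import numpy as np, pandas as pd
-#   >>> arr = np.array([pd.Timestamp("2021-01-01"),
-#   ...                 pd.Timestamp("2021-01-02")], dtype=object)
-#   >>> pd.DataFrame(arr.reshape(2, 1)).dtypes[0]
-#   dtype('<M8[ns]')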
-
-def _check_values_indices_shape_match(
- values: np.ndarray, index: Index, columns: Index
-) -> None:
- """
- Check that the shape implied by our axes matches the actual shape of the
- data.
- """
- if values.shape[1] != len(columns) or values.shape[0] != len(index):
- # Could let this raise in Block constructor, but we get a more
- # helpful exception message this way.
- if values.shape[0] == 0:
- raise ValueError("Empty data passed with indices specified.")
-
- passed = values.shape
- implied = (len(index), len(columns))
- raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
-
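-# The friendlier error promised above looks like this (a sketch):
-#   >>> import numpy as np, pandas as pd
-#   >>> pd.DataFrame(np.zeros((2, 2)), columns=["a"])
-#   Traceback (most recent call last):
-#   ...
-#   ValueError: Shape of passed values is (2, 2), indices imply (2, 1)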
-
-def dict_to_mgr(
- data: dict,
- index,
- columns,
- *,
- dtype: DtypeObj | None = None,
- typ: str = "block",
- copy: bool = True,
-) -> Manager:
- """
- Segregate Series based on type and coerce into matrices.
- Needs to handle a lot of exceptional cases.
-
- Used in DataFrame.__init__
- """
- arrays: Sequence[Any] | Series
-
- if columns is not None:
- from pandas.core.series import Series
-
- arrays = Series(data, index=columns, dtype=object)
- missing = arrays.isna()
- if index is None:
- # GH10856
- # raise ValueError if only scalars in dict
- index = _extract_index(arrays[~missing])
- else:
- index = ensure_index(index)
-
- # no obvious "empty" int column
- if missing.any() and not is_integer_dtype(dtype):
- nan_dtype: DtypeObj
-
- if dtype is not None:
- # calling sanitize_array ensures we don't mix-and-match
- # NA dtypes
- midxs = missing.values.nonzero()[0]
- for i in midxs:
- arr = sanitize_array(arrays.iat[i], index, dtype=dtype)
- arrays.iat[i] = arr
- else:
- # GH#1783
- nan_dtype = np.dtype("object")
- val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
- nmissing = missing.sum()
- if copy:
- rhs = [val] * nmissing
- else:
- # GH#45369
- rhs = [val.copy() for _ in range(nmissing)]
- arrays.loc[missing] = rhs
-
- arrays = list(arrays)
- columns = ensure_index(columns)
-
- else:
- keys = list(data.keys())
- columns = Index(keys) if keys else default_index(0)
- arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
- arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]
-
- if copy:
- if typ == "block":
- # We only need to copy arrays that will not get consolidated, i.e.
- # only EA arrays
- arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
- else:
- # dtype check to exclude e.g. range objects, scalars
- arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]
-
- return arrays_to_mgr(arrays, columns, index, dtype=dtype, typ=typ, consolidate=copy)
-
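-# A sketch of the missing-column branch above: a requested column that is
-# absent from the dict comes back all-NaN with object dtype when no dtype is
-# given.
-#   >>> import pandas as pd
-#   >>> pd.DataFrame({"a": [1, 2]}, columns=["a", "b"])["b"].dtype
-#   dtype('O')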
-
-def nested_data_to_arrays(
- data: Sequence,
- columns: Index | None,
- index: Index | None,
- dtype: DtypeObj | None,
-) -> tuple[list[ArrayLike], Index, Index]:
- """
- Convert a nested sequence of row-like data into per-column arrays.
- """
- # By the time we get here we have already checked treat_as_nested(data)
-
- if is_named_tuple(data[0]) and columns is None:
- columns = ensure_index(data[0]._fields)
-
- arrays, columns = to_arrays(data, columns, dtype=dtype)
- columns = ensure_index(columns)
-
- if index is None:
- if isinstance(data[0], ABCSeries):
- index = _get_names_from_index(data)
- else:
- index = default_index(len(data))
-
- return arrays, columns, index
-
-
-def treat_as_nested(data) -> bool:
- """
- Check if we should use nested_data_to_arrays.
- """
- return (
- len(data) > 0
- and is_list_like(data[0])
- and getattr(data[0], "ndim", 1) == 1
- and not (isinstance(data, ExtensionArray) and data.ndim == 2)
- )
-
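-# For illustration, under the definition above:
-#   >>> treat_as_nested([[1, 2], [3, 4]])
-#   True
-#   >>> treat_as_nested([1, 2, 3])
-#   False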
-
-# ---------------------------------------------------------------------
-
-
-def _prep_ndarraylike(values, copy: bool = True) -> np.ndarray:
- # values is specifically _not_ ndarray, EA, Index, or Series
- # We only get here with `not treat_as_nested(values)`
-
- if len(values) == 0:
- # TODO: check for length-zero range, in which case return int64 dtype?
- # TODO: re-use anything in try_cast?
- return np.empty((0, 0), dtype=object)
- elif isinstance(values, range):
- arr = range_to_ndarray(values)
- return arr[..., np.newaxis]
-
- def convert(v):
- if not is_list_like(v) or isinstance(v, ABCDataFrame):
- return v
-
- v = extract_array(v, extract_numpy=True)
- res = maybe_convert_platform(v)
- # We don't do maybe_infer_to_datetimelike here bc we will end up doing
- # it column-by-column in ndarray_to_mgr
- return res
-
- # we could have a 1-dim or 2-dim list here
- # this is equiv of np.asarray, but does object conversion
- # and platform dtype preservation
- # does not convert e.g. [1, "a", True] to ["1", "a", "True"] like
- # np.asarray would
- if is_list_like(values[0]):
- values = np.array([convert(v) for v in values])
- elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
- # GH#21861 see test_constructor_list_of_lists
- values = np.array([convert(v) for v in values])
- else:
- values = convert(values)
-
- return _ensure_2d(values)
-
-
-def _ensure_2d(values: np.ndarray) -> np.ndarray:
- """
- Reshape 1D values to 2D; raise on anything other than 1D or 2D input.
- """
- if values.ndim == 1:
- values = values.reshape((values.shape[0], 1))
- elif values.ndim != 2:
- raise ValueError(f"Must pass 2-d input. shape={values.shape}")
- return values
-
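-# For example (a sketch of the contract above):
-#   >>> _ensure_2d(np.arange(3)).shape
-#   (3, 1)
-#   >>> _ensure_2d(np.zeros((2, 2, 2)))
-#   Traceback (most recent call last):
-#   ...
-#   ValueError: Must pass 2-d input. shape=(2, 2, 2)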
-
-def _homogenize(
- data, index: Index, dtype: DtypeObj | None
-) -> tuple[list[ArrayLike], list[Any]]:
- oindex = None
- homogenized = []
- # if the original array-like in `data` is a Series, keep track of this Series' refs
- refs: list[Any] = []
-
- for val in data:
- if isinstance(val, ABCSeries):
- if dtype is not None:
- val = val.astype(dtype, copy=False)
- if val.index is not index:
- # Forces alignment. No need to copy data since we
- # are putting it into an ndarray later
- val = val.reindex(index, copy=False)
- refs.append(val._references)
- val = val._values
- else:
- if isinstance(val, dict):
- # GH#41785 this _should_ be equivalent to (but faster than)
- # val = Series(val, index=index)._values
- if oindex is None:
- oindex = index.astype("O")
-
- if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
- # see test_constructor_dict_datetime64_index
- val = dict_compat(val)
- else:
- # see test_constructor_subclass_dict
- val = dict(val)
- val = lib.fast_multiget(val, oindex._values, default=np.nan)
-
- val = sanitize_array(val, index, dtype=dtype, copy=False)
- com.require_length_match(val, index)
- refs.append(None)
-
- homogenized.append(val)
-
- return homogenized, refs
-
-
-def _extract_index(data) -> Index:
- """
- Try to infer an Index from the passed data, raise ValueError on failure.
- """
- index: Index
- if len(data) == 0:
- return default_index(0)
-
- raw_lengths = []
- indexes: list[list[Hashable] | Index] = []
-
- have_raw_arrays = False
- have_series = False
- have_dicts = False
-
- for val in data:
- if isinstance(val, ABCSeries):
- have_series = True
- indexes.append(val.index)
- elif isinstance(val, dict):
- have_dicts = True
- indexes.append(list(val.keys()))
- elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
- have_raw_arrays = True
- raw_lengths.append(len(val))
- elif isinstance(val, np.ndarray) and val.ndim > 1:
- raise ValueError("Per-column arrays must each be 1-dimensional")
-
- if not indexes and not raw_lengths:
- raise ValueError("If using all scalar values, you must pass an index")
-
- if have_series:
- index = union_indexes(indexes)
- elif have_dicts:
- index = union_indexes(indexes, sort=False)
-
- if have_raw_arrays:
- lengths = list(set(raw_lengths))
- if len(lengths) > 1:
- raise ValueError("All arrays must be of the same length")
-
- if have_dicts:
- raise ValueError(
- "Mixing dicts with non-Series may lead to ambiguous ordering."
- )
-
- if have_series:
- if lengths[0] != len(index):
- msg = (
- f"array length {lengths[0]} does not match index "
- f"length {len(index)}"
- )
- raise ValueError(msg)
- else:
- index = default_index(lengths[0])
-
- return ensure_index(index)
-
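-# The "all scalar values" error raised above is the one users see as, e.g.
-# (a sketch):
-#   >>> import pandas as pd
-#   >>> pd.DataFrame({"a": 1})
-#   Traceback (most recent call last):
-#   ...
-#   ValueError: If using all scalar values, you must pass an index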
-
-def reorder_arrays(
- arrays: list[ArrayLike], arr_columns: Index, columns: Index | None, length: int
-) -> tuple[list[ArrayLike], Index]:
- """
- Pre-emptively (cheaply) reindex arrays with new columns.
- """
- # reorder according to the columns
- if columns is not None:
- if not columns.equals(arr_columns):
- # if they are equal, there is nothing to do
- new_arrays: list[ArrayLike | None]
- new_arrays = [None] * len(columns)
- indexer = arr_columns.get_indexer(columns)
- for i, k in enumerate(indexer):
- if k == -1:
- # by convention default is all-NaN object dtype
- arr = np.empty(length, dtype=object)
- arr.fill(np.nan)
- else:
- arr = arrays[k]
- new_arrays[i] = arr
-
- # Incompatible types in assignment (expression has type
- # "List[Union[ExtensionArray, ndarray[Any, Any], None]]", variable
- # has type "List[Union[ExtensionArray, ndarray[Any, Any]]]")
- arrays = new_arrays # type: ignore[assignment]
- arr_columns = columns
-
- return arrays, arr_columns
-
-
-def _get_names_from_index(data) -> Index:
- has_some_name = any(getattr(s, "name", None) is not None for s in data)
- if not has_some_name:
- return default_index(len(data))
-
- index: list[Hashable] = list(range(len(data)))
- count = 0
- for i, s in enumerate(data):
- n = getattr(s, "name", None)
- if n is not None:
- index[i] = n
- else:
- index[i] = f"Unnamed {count}"
- count += 1
-
- return Index(index)
-
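-# A sketch of the naming rule above, via the public constructor:
-#   >>> import pandas as pd
-#   >>> s1 = pd.Series([1], name="x"); s2 = pd.Series([2])
-#   >>> list(pd.DataFrame([s1, s2]).index)
-#   ['x', 'Unnamed 0']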
-
-def _get_axes(
- N: int, K: int, index: Index | None, columns: Index | None
-) -> tuple[Index, Index]:
- # helper to create the axes as indexes
- # return axes or defaults
-
- if index is None:
- index = default_index(N)
- else:
- index = ensure_index(index)
-
- if columns is None:
- columns = default_index(K)
- else:
- columns = ensure_index(columns)
- return index, columns
-
-
-def dataclasses_to_dicts(data):
- """
- Converts a list of dataclass instances to a list of dictionaries.
-
- Parameters
- ----------
- data : List[Type[dataclass]]
-
- Returns
- --------
- list_dict : List[dict]
-
- Examples
- --------
- >>> from dataclasses import dataclass
- >>> @dataclass
- ... class Point:
- ... x: int
- ... y: int
-
- >>> dataclasses_to_dicts([Point(1, 2), Point(2, 3)])
- [{'x': 1, 'y': 2}, {'x': 2, 'y': 3}]
-
- """
- from dataclasses import asdict
-
- return list(map(asdict, data))
-
-
-# ---------------------------------------------------------------------
-# Conversion of Inputs to Arrays
-
-
-def to_arrays(
- data, columns: Index | None, dtype: DtypeObj | None = None
-) -> tuple[list[ArrayLike], Index]:
- """
- Return list of arrays, columns.
-
- Returns
- -------
- list[ArrayLike]
- These will become columns in a DataFrame.
- Index
- This will become frame.columns.
-
- Notes
- -----
- Ensures that len(result_arrays) == len(result_index).
- """
- if isinstance(data, ABCDataFrame):
- # see test_from_records_with_index_data, test_from_records_bad_index_column
- if columns is not None:
- arrays = [
- data._ixs(i, axis=1)._values
- for i, col in enumerate(data.columns)
- if col in columns
- ]
- else:
- columns = data.columns
- arrays = [data._ixs(i, axis=1)._values for i in range(len(columns))]
-
- return arrays, columns
-
- if not len(data):
- if isinstance(data, np.ndarray):
- if data.dtype.names is not None:
- # i.e. numpy structured array
- columns = ensure_index(data.dtype.names)
- arrays = [data[name] for name in columns]
-
- if len(data) == 0:
- # GH#42456 the indexing above results in list of 2D ndarrays
- # TODO: is that an issue with numpy?
- for i, arr in enumerate(arrays):
- if arr.ndim == 2:
- arrays[i] = arr[:, 0]
-
- return arrays, columns
- return [], ensure_index([])
-
- elif isinstance(data, np.ndarray) and data.dtype.names is not None:
- # e.g. recarray
- columns = Index(list(data.dtype.names))
- arrays = [data[k] for k in columns]
- return arrays, columns
-
- if isinstance(data[0], (list, tuple)):
- arr = _list_to_arrays(data)
- elif isinstance(data[0], abc.Mapping):
- arr, columns = _list_of_dict_to_arrays(data, columns)
- elif isinstance(data[0], ABCSeries):
- arr, columns = _list_of_series_to_arrays(data, columns)
- else:
- # last ditch effort
- data = [tuple(x) for x in data]
- arr = _list_to_arrays(data)
-
- content, columns = _finalize_columns_and_data(arr, columns, dtype)
- return content, columns
-
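-# The list-of-dicts branch above backs, e.g. (a sketch):
-#   >>> import pandas as pd
-#   >>> pd.DataFrame([{"a": 1}, {"b": 2.5}]).columns.tolist()
-#   ['a', 'b']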
-
-def _list_to_arrays(data: list[tuple | list]) -> np.ndarray:
- # Returned np.ndarray has ndim = 2
- # Note: we already check len(data) > 0 before getting here
- if isinstance(data[0], tuple):
- content = lib.to_object_array_tuples(data)
- else:
- # list of lists
- content = lib.to_object_array(data)
- return content
-
-
-def _list_of_series_to_arrays(
- data: list,
- columns: Index | None,
-) -> tuple[np.ndarray, Index]:
- # returned np.ndarray has ndim == 2
-
- if columns is None:
- # We know pass_data is non-empty because data[0] is a Series
- pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
- columns = get_objs_combined_axis(pass_data, sort=False)
-
- indexer_cache: dict[int, np.ndarray] = {}
-
- aligned_values = []
- for s in data:
- index = getattr(s, "index", None)
- if index is None:
- index = default_index(len(s))
-
- if id(index) in indexer_cache:
- indexer = indexer_cache[id(index)]
- else:
- indexer = indexer_cache[id(index)] = index.get_indexer(columns)
-
- values = extract_array(s, extract_numpy=True)
- aligned_values.append(algorithms.take_nd(values, indexer))
-
- content = np.vstack(aligned_values)
- return content, columns
-
-
-def _list_of_dict_to_arrays(
- data: list[dict],
- columns: Index | None,
-) -> tuple[np.ndarray, Index]:
- """
- Convert list of dicts to numpy arrays
-
- if `columns` is not passed, column names are inferred from the records
- - for OrderedDict and dicts, the column names match
- the key insertion-order from the first record to the last.
- - For other kinds of dict-likes, the keys are lexically sorted.
-
- Parameters
- ----------
- data : iterable
- collection of records (OrderedDict, dict)
- columns : Index or None
-
- Returns
- -------
- content : np.ndarray[object, ndim=2]
- columns : Index
- """
- if columns is None:
- gen = (list(x.keys()) for x in data)
- sort = not any(isinstance(d, dict) for d in data)
- pre_cols = lib.fast_unique_multiple_list_gen(gen, sort=sort)
- columns = ensure_index(pre_cols)
-
- # assure that they are of the base dict class and not of derived
- # classes
- data = [d if type(d) is dict else dict(d) for d in data]
-
- content = lib.dicts_to_array(data, list(columns))
- return content, columns
-
-
-def _finalize_columns_and_data(
- content: np.ndarray, # ndim == 2
- columns: Index | None,
- dtype: DtypeObj | None,
-) -> tuple[list[ArrayLike], Index]:
- """
- Ensure we have valid columns, cast object dtypes if possible.
- """
- contents = list(content.T)
-
- try:
- columns = _validate_or_indexify_columns(contents, columns)
- except AssertionError as err:
- # GH#26429 do not raise user-facing AssertionError
- raise ValueError(err) from err
-
- if len(contents) and contents[0].dtype == np.object_:
- contents = convert_object_array(contents, dtype=dtype)
-
- return contents, columns
-
-
-def _validate_or_indexify_columns(
- content: list[np.ndarray], columns: Index | None
-) -> Index:
- """
- If columns is None, use positional integers as column names; otherwise,
- validate that columns has the expected length.
-
- Parameters
- ----------
- content : list of np.ndarrays
- columns : Index or None
-
- Returns
- -------
- Index
- If columns is None, assign positional column index value as columns.
-
- Raises
- ------
- 1. AssertionError when columns is not a list of lists and its length
- does not equal the number of content arrays.
- 2. ValueError when columns is a list of lists (MultiIndex case), but the
- sub-lists do not all have the same length.
- 3. ValueError when columns is a list of lists, but the length of each
- sub-list does not equal the number of content arrays.
- """
- if columns is None:
- columns = default_index(len(content))
- else:
- # Add mask for data which is composed of list of lists
- is_mi_list = isinstance(columns, list) and all(
- isinstance(col, list) for col in columns
- )
-
- if not is_mi_list and len(columns) != len(content): # pragma: no cover
- # caller's responsibility to check for this...
- raise AssertionError(
- f"{len(columns)} columns passed, passed data had "
- f"{len(content)} columns"
- )
- if is_mi_list:
- # check if nested list column, length of each sub-list should be equal
- if len({len(col) for col in columns}) > 1:
- raise ValueError(
- "Length of columns passed for MultiIndex columns is different"
- )
-
- # if columns is not empty and length of sublist is not equal to content
- if columns and len(columns[0]) != len(content):
- raise ValueError(
- f"{len(columns[0])} columns passed, passed data had "
- f"{len(content)} columns"
- )
- return columns
-
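-# The AssertionError above surfaces to users as a ValueError (re-raised in
-# _finalize_columns_and_data), e.g. (a sketch):
-#   >>> import pandas as pd
-#   >>> pd.DataFrame([[1, 2]], columns=["a"])
-#   Traceback (most recent call last):
-#   ...
-#   ValueError: 1 columns passed, passed data had 2 columns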
-
-def convert_object_array(
- content: list[npt.NDArray[np.object_]],
- dtype: DtypeObj | None,
- dtype_backend: str = "numpy",
- coerce_float: bool = False,
-) -> list[ArrayLike]:
- """
- Internal function to convert object array.
-
- Parameters
- ----------
- content: List[np.ndarray]
- dtype: np.dtype or ExtensionDtype
- dtype_backend: Controls if nullable/pyarrow dtypes are returned.
- coerce_float: If True, attempt to coerce object values (e.g. decimal.Decimal) to float.
-
- Returns
- -------
- List[ArrayLike]
- """
- # provide soft conversion of object dtypes
-
- def convert(arr):
- if dtype != np.dtype("O"):
- arr = lib.maybe_convert_objects(
- arr,
- try_float=coerce_float,
- convert_to_nullable_dtype=dtype_backend != "numpy",
- )
- # Notes on cases that get here 2023-02-15
- # 1) we DO get here when arr is all Timestamps and dtype=None
- # 2) disabling this doesn't break the world, so this must be
- # getting caught at a higher level
- # 3) passing convert_datetime to maybe_convert_objects gets this right
- # 4) convert_timedelta?
-
- if dtype is None:
- if arr.dtype == np.dtype("O"):
- # i.e. maybe_convert_objects didn't convert
- arr = maybe_infer_to_datetimelike(arr)
- if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
- arr = StringDtype().construct_array_type()._from_sequence(arr)
- elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
- if is_integer_dtype(arr.dtype):
- arr = IntegerArray(arr, np.zeros(arr.shape, dtype=np.bool_))
- elif is_bool_dtype(arr.dtype):
- arr = BooleanArray(arr, np.zeros(arr.shape, dtype=np.bool_))
- elif is_float_dtype(arr.dtype):
- arr = FloatingArray(arr, np.isnan(arr))
-
- elif isinstance(dtype, ExtensionDtype):
- # TODO: test(s) that get here
- # TODO: try to de-duplicate this convert function with
- # core.construction functions
- cls = dtype.construct_array_type()
- arr = cls._from_sequence(arr, dtype=dtype, copy=False)
- elif dtype.kind in ["m", "M"]:
- # This restriction is harmless bc these are the only cases
- # where maybe_cast_to_datetime is not a no-op.
- # Here we know:
- # 1) dtype.kind in ["m", "M"] and
- # 2) arr is either object or numeric dtype
- arr = maybe_cast_to_datetime(arr, dtype)
-
- return arr
-
- arrays = [convert(arr) for arr in content]
-
- return arrays
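-
-# The soft conversion above is why a plain nested list of ints comes out as
-# int64 rather than object (a sketch, default numpy dtype_backend):
-#   >>> import pandas as pd
-#   >>> pd.DataFrame([[1, 2], [3, 4]]).dtypes.tolist()
-#   [dtype('int64'), dtype('int64')]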
diff --git a/contrib/python/pandas/py3/pandas/core/internals/managers.py b/contrib/python/pandas/py3/pandas/core/internals/managers.py
deleted file mode 100644
index 1e81e7a445f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/managers.py
+++ /dev/null
@@ -1,2343 +0,0 @@
-from __future__ import annotations
-
-import itertools
-from typing import (
- Any,
- Callable,
- Hashable,
- Literal,
- Sequence,
- TypeVar,
- cast,
-)
-import warnings
-import weakref
-
-import numpy as np
-
-from pandas._config import using_copy_on_write
-
-from pandas._libs import (
- algos as libalgos,
- internals as libinternals,
- lib,
-)
-from pandas._libs.internals import (
- BlockPlacement,
- BlockValuesRefs,
-)
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- DtypeObj,
- QuantileInterpolation,
- Shape,
- npt,
- type_t,
-)
-from pandas.errors import PerformanceWarning
-from pandas.util._decorators import cache_readonly
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import validate_bool_kwarg
-
-from pandas.core.dtypes.cast import infer_dtype_from_scalar
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- is_1d_only_ea_dtype,
- is_dtype_equal,
- is_list_like,
-)
-from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- array_equals,
- isna,
-)
-
-import pandas.core.algorithms as algos
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-from pandas.core.arrays.sparse import SparseDtype
-import pandas.core.common as com
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
-from pandas.core.indexers import maybe_convert_indices
-from pandas.core.indexes.api import (
- Index,
- ensure_index,
-)
-from pandas.core.internals.base import (
- DataManager,
- SingleDataManager,
- interleaved_dtype,
-)
-from pandas.core.internals.blocks import (
- Block,
- NumpyBlock,
- ensure_block_shape,
- extend_blocks,
- get_block_type,
- new_block,
- new_block_2d,
-)
-from pandas.core.internals.ops import (
- blockwise_all,
- operate_blockwise,
-)
-
-T = TypeVar("T", bound="BaseBlockManager")
-
-
-class BaseBlockManager(DataManager):
- """
- Core internal data structure to implement DataFrame, Series, etc.
-
- Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
- lightweight blocked set of labeled data to be manipulated by the DataFrame
- public API class
-
- Attributes
- ----------
- shape
- ndim
- axes
- values
- items
-
- Methods
- -------
- set_axis(axis, new_labels)
- copy(deep=True)
-
- get_dtypes
-
- apply(func, axes, block_filter_fn)
-
- get_bool_data
- get_numeric_data
-
- get_slice(slice_like, axis)
- get(label)
- iget(loc)
-
- take(indexer, axis)
- reindex_axis(new_labels, axis)
- reindex_indexer(new_labels, indexer, axis)
-
- delete(label)
- insert(loc, label, value)
- set(label, value)
-
- Parameters
- ----------
- blocks: Sequence of Block
- axes: Sequence of Index
- verify_integrity: bool, default True
-
- Notes
- -----
- This is *not* a public API class
- """
-
- __slots__ = ()
-
- _blknos: npt.NDArray[np.intp]
- _blklocs: npt.NDArray[np.intp]
- blocks: tuple[Block, ...]
- axes: list[Index]
-
- @property
- def ndim(self) -> int:
- raise NotImplementedError
-
- _known_consolidated: bool
- _is_consolidated: bool
-
- def __init__(self, blocks, axes, verify_integrity: bool = True) -> None:
- raise NotImplementedError
-
- @classmethod
- def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T:
- raise NotImplementedError
-
- @property
- def blknos(self) -> npt.NDArray[np.intp]:
- """
- Suppose we want to find the array corresponding to our i'th column.
-
- blknos[i] identifies the block from self.blocks that contains this column.
-
- blklocs[i] identifies the column of interest within
- self.blocks[self.blknos[i]]
- """
- if self._blknos is None:
- # Note: these can be altered by other BlockManager methods.
- self._rebuild_blknos_and_blklocs()
-
- return self._blknos
-
- @property
- def blklocs(self) -> npt.NDArray[np.intp]:
- """
- See blknos.__doc__
- """
- if self._blklocs is None:
- # Note: these can be altered by other BlockManager methods.
- self._rebuild_blknos_and_blklocs()
-
- return self._blklocs
-
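- # A hypothetical layout, for illustration of the two maps above: columns
- # [a: float64, b: int64, c: float64] consolidated into one float block and
- # one int block could give
- #   blknos  == [0, 1, 0]   (which block holds each column)
- #   blklocs == [0, 0, 1]   (the column's position inside that block)
- # The block numbering itself is an implementation detail.
-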
- def make_empty(self: T, axes=None) -> T:
- """return an empty BlockManager with the items axis of len 0"""
- if axes is None:
- axes = [Index([])] + self.axes[1:]
-
- # preserve dtype if possible
- if self.ndim == 1:
- assert isinstance(self, SingleBlockManager) # for mypy
- blk = self.blocks[0]
- arr = blk.values[:0]
- bp = BlockPlacement(slice(0, 0))
- nb = blk.make_block_same_class(arr, placement=bp)
- blocks = [nb]
- else:
- blocks = []
- return type(self).from_blocks(blocks, axes)
-
- def __nonzero__(self) -> bool:
- return True
-
- # Python3 compat
- __bool__ = __nonzero__
-
- def _normalize_axis(self, axis: AxisInt) -> int:
- # switch axis to follow BlockManager logic
- if self.ndim == 2:
- axis = 1 if axis == 0 else 0
- return axis
-
- def set_axis(self, axis: AxisInt, new_labels: Index) -> None:
- # Caller is responsible for ensuring we have an Index object.
- self._validate_set_axis(axis, new_labels)
- self.axes[axis] = new_labels
-
- @property
- def is_single_block(self) -> bool:
- # Assumes we are 2D; overridden by SingleBlockManager
- return len(self.blocks) == 1
-
- @property
- def items(self) -> Index:
- return self.axes[0]
-
- def _has_no_reference(self, i: int) -> bool:
- """
- Check for column `i` if it has references.
- (whether it references another array or is itself being referenced)
- Returns True if the column has no references.
- """
- blkno = self.blknos[i]
- return self._has_no_reference_block(blkno)
-
- def _has_no_reference_block(self, blkno: int) -> bool:
- """
- Check for block `blkno` if it has references.
- (whether it references another array or is itself being referenced)
- Returns True if the block has no references.
- """
- return not self.blocks[blkno].refs.has_reference()
-
- def add_references(self, mgr: BaseBlockManager) -> None:
- """
- Adds the references from one manager to another. We assume that both
- managers have the same block structure.
- """
- if len(self.blocks) != len(mgr.blocks):
- # If block structure changes, then we made a copy
- return
- for i, blk in enumerate(self.blocks):
- blk.refs = mgr.blocks[i].refs
- # Argument 1 to "add_reference" of "BlockValuesRefs" has incompatible type
- # "Block"; expected "SharedBlock"
- blk.refs.add_reference(blk) # type: ignore[arg-type]
-
- def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool:
- """
- Checks if two blocks from two different block managers reference the
- same underlying values.
- """
- ref = weakref.ref(self.blocks[blkno])
- return ref in mgr.blocks[blkno].refs.referenced_blocks
-
- def get_dtypes(self):
- dtypes = np.array([blk.dtype for blk in self.blocks])
- return dtypes.take(self.blknos)
-
- @property
- def arrays(self) -> list[ArrayLike]:
- """
- Quick access to the backing arrays of the Blocks.
-
- Only for compatibility with ArrayManager for testing convenience.
- Not to be used in actual code, and return value is not the same as the
- ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs).
-
- Warning! The returned arrays don't handle Copy-on-Write, so this should
- be used with caution (only in read-mode).
- """
- return [blk.values for blk in self.blocks]
-
- def __repr__(self) -> str:
- output = type(self).__name__
- for i, ax in enumerate(self.axes):
- if i == 0:
- output += f"\nItems: {ax}"
- else:
- output += f"\nAxis {i}: {ax}"
-
- for block in self.blocks:
- output += f"\n{block}"
- return output
-
- def apply(
- self: T,
- f,
- align_keys: list[str] | None = None,
- **kwargs,
- ) -> T:
- """
- Iterate over the blocks, collect and create a new BlockManager.
-
- Parameters
- ----------
- f : str or callable
- Name of the Block method to apply.
- align_keys: List[str] or None, default None
- **kwargs
- Keywords to pass to `f`
-
- Returns
- -------
- BlockManager
- """
- assert "filter" not in kwargs
-
- align_keys = align_keys or []
- result_blocks: list[Block] = []
- # fillna: Series/DataFrame is responsible for making sure value is aligned
-
- aligned_args = {k: kwargs[k] for k in align_keys}
-
- for b in self.blocks:
- if aligned_args:
- for k, obj in aligned_args.items():
- if isinstance(obj, (ABCSeries, ABCDataFrame)):
- # The caller is responsible for ensuring that
- # obj.axes[-1].equals(self.items)
- if obj.ndim == 1:
- kwargs[k] = obj.iloc[b.mgr_locs.indexer]._values
- else:
- kwargs[k] = obj.iloc[:, b.mgr_locs.indexer]._values
- else:
- # otherwise we have an ndarray
- kwargs[k] = obj[b.mgr_locs.indexer]
-
- if callable(f):
- applied = b.apply(f, **kwargs)
- else:
- applied = getattr(b, f)(**kwargs)
- result_blocks = extend_blocks(applied, result_blocks)
-
- out = type(self).from_blocks(result_blocks, self.axes)
- return out
-
- def where(self: T, other, cond, align: bool) -> T:
- if align:
- align_keys = ["other", "cond"]
- else:
- align_keys = ["cond"]
- other = extract_array(other, extract_numpy=True)
-
- return self.apply(
- "where",
- align_keys=align_keys,
- other=other,
- cond=cond,
- using_cow=using_copy_on_write(),
- )
-
- def round(self: T, decimals: int, using_cow: bool = False) -> T:
- return self.apply(
- "round",
- decimals=decimals,
- using_cow=using_cow,
- )
-
- def setitem(self: T, indexer, value) -> T:
- """
- Set values with indexer.
-
- For SingleBlockManager, this backs s[indexer] = value
- """
- if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
- raise ValueError(f"Cannot set values with ndim > {self.ndim}")
-
- if using_copy_on_write() and not self._has_no_reference(0):
- # if being referenced -> perform Copy-on-Write and clear the reference
- # this method is only called if there is a single block -> hardcoded 0
- self = self.copy()
-
- return self.apply("setitem", indexer=indexer, value=value)
-
- def putmask(self, mask, new, align: bool = True):
- if align:
- align_keys = ["new", "mask"]
- else:
- align_keys = ["mask"]
- new = extract_array(new, extract_numpy=True)
-
- return self.apply(
- "putmask",
- align_keys=align_keys,
- mask=mask,
- new=new,
- using_cow=using_copy_on_write(),
- )
-
- def diff(self: T, n: int, axis: AxisInt) -> T:
- # only reached with self.ndim == 2 and axis == 1
- axis = self._normalize_axis(axis)
- return self.apply("diff", n=n, axis=axis)
-
- def interpolate(self: T, inplace: bool, **kwargs) -> T:
- return self.apply(
- "interpolate", inplace=inplace, **kwargs, using_cow=using_copy_on_write()
- )
-
- def shift(self: T, periods: int, axis: AxisInt, fill_value) -> T:
- axis = self._normalize_axis(axis)
- if fill_value is lib.no_default:
- fill_value = None
-
- return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value)
-
- def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
- if limit is not None:
- # Do this validation even if we go through one of the no-op paths
- limit = libalgos.validate_limit(None, limit=limit)
-
- return self.apply(
- "fillna",
- value=value,
- limit=limit,
- inplace=inplace,
- downcast=downcast,
- using_cow=using_copy_on_write(),
- )
-
- def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
- if copy is None:
- if using_copy_on_write():
- copy = False
- else:
- copy = True
- elif using_copy_on_write():
- copy = False
-
- return self.apply(
- "astype",
- dtype=dtype,
- copy=copy,
- errors=errors,
- using_cow=using_copy_on_write(),
- )
-
- def convert(self: T, copy: bool | None) -> T:
- if copy is None:
- if using_copy_on_write():
- copy = False
- else:
- copy = True
- elif using_copy_on_write():
- copy = False
-
- return self.apply("convert", copy=copy, using_cow=using_copy_on_write())
-
- def replace(self: T, to_replace, value, inplace: bool) -> T:
- inplace = validate_bool_kwarg(inplace, "inplace")
- # NDFrame.replace ensures the not-is_list_likes here
- assert not is_list_like(to_replace)
- assert not is_list_like(value)
- return self.apply(
- "replace",
- to_replace=to_replace,
- value=value,
- inplace=inplace,
- using_cow=using_copy_on_write(),
- )
-
- def replace_regex(self, **kwargs):
- return self.apply("_replace_regex", **kwargs, using_cow=using_copy_on_write())
-
- def replace_list(
- self: T,
- src_list: list[Any],
- dest_list: list[Any],
- inplace: bool = False,
- regex: bool = False,
- ) -> T:
- """do a list replace"""
- inplace = validate_bool_kwarg(inplace, "inplace")
-
- bm = self.apply(
- "replace_list",
- src_list=src_list,
- dest_list=dest_list,
- inplace=inplace,
- regex=regex,
- using_cow=using_copy_on_write(),
- )
- bm._consolidate_inplace()
- return bm
-
- def to_native_types(self: T, **kwargs) -> T:
- """
- Convert values to native types (strings / python objects) that are used
- in formatting (repr / csv).
- """
- return self.apply("to_native_types", **kwargs)
-
- @property
- def is_numeric_mixed_type(self) -> bool:
- return all(block.is_numeric for block in self.blocks)
-
- @property
- def any_extension_types(self) -> bool:
- """Whether any of the blocks in this manager are extension blocks"""
- return any(block.is_extension for block in self.blocks)
-
- @property
- def is_view(self) -> bool:
- """return a boolean if we are a single block and are a view"""
- if len(self.blocks) == 1:
- return self.blocks[0].is_view
-
- # It is technically possible to figure out which blocks are views
- # e.g. [ b.values.base is not None for b in self.blocks ]
- # but then we have the case of possibly some blocks being a view
- # and some blocks not. setting in theory is possible on the non-view
- # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
- # complicated
-
- return False
-
- def _get_data_subset(self: T, predicate: Callable) -> T:
- blocks = [blk for blk in self.blocks if predicate(blk.values)]
- return self._combine(blocks, copy=False)
-
- def get_bool_data(self: T, copy: bool = False) -> T:
- """
- Select blocks that are bool-dtype and columns from object-dtype blocks
- that are all-bool.
-
- Parameters
- ----------
- copy : bool, default False
- Whether to copy the blocks
- """
-
- new_blocks = []
-
- for blk in self.blocks:
- if blk.dtype == bool:
- new_blocks.append(blk)
-
- elif blk.is_object:
- nbs = blk._split()
- for nb in nbs:
- if nb.is_bool:
- new_blocks.append(nb)
-
- return self._combine(new_blocks, copy)
-
- def get_numeric_data(self: T, copy: bool = False) -> T:
- """
- Parameters
- ----------
- copy : bool, default False
- Whether to copy the blocks
- """
- numeric_blocks = [blk for blk in self.blocks if blk.is_numeric]
- if len(numeric_blocks) == len(self.blocks):
- # Avoid somewhat expensive _combine
- if copy:
- return self.copy(deep=True)
- return self
- return self._combine(numeric_blocks, copy)
-
- def _combine(
- self: T, blocks: list[Block], copy: bool = True, index: Index | None = None
- ) -> T:
- """return a new manager with the blocks"""
- if len(blocks) == 0:
- if self.ndim == 2:
- # retain our own Index dtype
- if index is not None:
- axes = [self.items[:0], index]
- else:
- axes = [self.items[:0]] + self.axes[1:]
- return self.make_empty(axes)
- return self.make_empty()
-
- # FIXME: optimization potential
- indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks]))
- inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
-
- new_blocks: list[Block] = []
- # TODO(CoW) we could optimize here if we know that the passed blocks
- # are fully "owned" (eg created from an operation, not coming from
- # an existing manager)
- for b in blocks:
- nb = b.copy(deep=copy)
- nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer])
- new_blocks.append(nb)
-
- axes = list(self.axes)
- if index is not None:
- axes[-1] = index
- axes[0] = self.items.take(indexer)
-
- return type(self).from_blocks(new_blocks, axes)
-
- @property
- def nblocks(self) -> int:
- return len(self.blocks)
-
- def copy(self: T, deep: bool | None | Literal["all"] = True) -> T:
- """
- Make deep or shallow copy of BlockManager
-
- Parameters
- ----------
-        deep : bool, string or None, default True
-            If True, copy data. If False, return a shallow copy (do not copy
-            data). If None, behave like False under Copy-on-Write and like
-            True otherwise. If 'all', also make a deep copy of the indexes.
-
- Returns
- -------
- BlockManager
- """
- if deep is None:
- if using_copy_on_write():
- # use shallow copy
- deep = False
- else:
- # preserve deep copy for BlockManager with copy=None
- deep = True
-
- # this preserves the notion of view copying of axes
- if deep:
- # hit in e.g. tests.io.json.test_pandas
-
- def copy_func(ax):
- return ax.copy(deep=True) if deep == "all" else ax.view()
-
- new_axes = [copy_func(ax) for ax in self.axes]
- else:
- new_axes = list(self.axes)
-
- res = self.apply("copy", deep=deep)
- res.axes = new_axes
-
- if self.ndim > 1:
- # Avoid needing to re-compute these
- blknos = self._blknos
- if blknos is not None:
- res._blknos = blknos.copy()
- res._blklocs = self._blklocs.copy()
-
- if deep:
- res._consolidate_inplace()
- return res
-
- def consolidate(self: T) -> T:
- """
- Join together blocks having same dtype
-
- Returns
- -------
- y : BlockManager
- """
- if self.is_consolidated():
- return self
-
- bm = type(self)(self.blocks, self.axes, verify_integrity=False)
- bm._is_consolidated = False
- bm._consolidate_inplace()
- return bm
-
- def reindex_indexer(
- self: T,
- new_axis: Index,
- indexer: npt.NDArray[np.intp] | None,
- axis: AxisInt,
- fill_value=None,
- allow_dups: bool = False,
- copy: bool | None = True,
- only_slice: bool = False,
- *,
- use_na_proxy: bool = False,
- ) -> T:
- """
- Parameters
- ----------
- new_axis : Index
-        indexer : ndarray[intp] or None
-            pandas-indexer with -1's only.
- axis : int
- fill_value : object, default None
- allow_dups : bool, default False
-        copy : bool or None, default True
-            If None, behave like False (shallow copy) under Copy-on-Write and
-            like True otherwise.
- only_slice : bool, default False
- Whether to take views, not copies, along columns.
- use_na_proxy : bool, default False
- Whether to use a np.void ndarray for newly introduced columns.
- """
- if copy is None:
- if using_copy_on_write():
- # use shallow copy
- copy = False
- else:
- # preserve deep copy for BlockManager with copy=None
- copy = True
-
- if indexer is None:
- if new_axis is self.axes[axis] and not copy:
- return self
-
- result = self.copy(deep=copy)
- result.axes = list(self.axes)
- result.axes[axis] = new_axis
- return result
-
- # Should be intp, but in some cases we get int64 on 32bit builds
- assert isinstance(indexer, np.ndarray)
-
- # some axes don't allow reindexing with dups
- if not allow_dups:
- self.axes[axis]._validate_can_reindex(indexer)
-
- if axis >= self.ndim:
- raise IndexError("Requested axis not found in manager")
-
- if axis == 0:
- new_blocks = self._slice_take_blocks_ax0(
- indexer,
- fill_value=fill_value,
- only_slice=only_slice,
- use_na_proxy=use_na_proxy,
- )
- else:
- new_blocks = [
- blk.take_nd(
- indexer,
- axis=1,
- fill_value=(
- fill_value if fill_value is not None else blk.fill_value
- ),
- )
- for blk in self.blocks
- ]
-
- new_axes = list(self.axes)
- new_axes[axis] = new_axis
-
- new_mgr = type(self).from_blocks(new_blocks, new_axes)
- if axis == 1:
- # We can avoid the need to rebuild these
- new_mgr._blknos = self.blknos.copy()
- new_mgr._blklocs = self.blklocs.copy()
- return new_mgr
-
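A user-level call that is expected to reach `reindex_indexer`, with a newly introduced column filled via `fill_value` (column names invented for the example):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    out = df.reindex(columns=["a", "c"], fill_value=0)
    print(out)
    #    a  c
    # 0  1  0
    # 1  2  0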
- def _slice_take_blocks_ax0(
- self,
- slice_or_indexer: slice | np.ndarray,
- fill_value=lib.no_default,
- only_slice: bool = False,
- *,
- use_na_proxy: bool = False,
- ) -> list[Block]:
- """
- Slice/take blocks along axis=0.
-
- Overloaded for SingleBlock
-
- Parameters
- ----------
- slice_or_indexer : slice or np.ndarray[int64]
- fill_value : scalar, default lib.no_default
- only_slice : bool, default False
- If True, we always return views on existing arrays, never copies.
- This is used when called from ops.blockwise.operate_blockwise.
- use_na_proxy : bool, default False
- Whether to use a np.void ndarray for newly introduced columns.
-
- Returns
- -------
- new_blocks : list of Block
- """
- allow_fill = fill_value is not lib.no_default
-
- sl_type, slobj, sllen = _preprocess_slice_or_indexer(
- slice_or_indexer, self.shape[0], allow_fill=allow_fill
- )
-
- if self.is_single_block:
- blk = self.blocks[0]
-
- if sl_type == "slice":
- # GH#32959 EABlock would fail since we can't make 0-width
- # TODO(EA2D): special casing unnecessary with 2D EAs
- if sllen == 0:
- return []
- bp = BlockPlacement(slice(0, sllen))
- return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)]
- elif not allow_fill or self.ndim == 1:
- if allow_fill and fill_value is None:
- fill_value = blk.fill_value
-
- if not allow_fill and only_slice:
- # GH#33597 slice instead of take, so we get
- # views instead of copies
- blocks = [
- blk.getitem_block_columns(
- slice(ml, ml + 1), new_mgr_locs=BlockPlacement(i)
- )
- for i, ml in enumerate(slobj)
- ]
- return blocks
- else:
- bp = BlockPlacement(slice(0, sllen))
- return [
- blk.take_nd(
- slobj,
- axis=0,
- new_mgr_locs=bp,
- fill_value=fill_value,
- )
- ]
-
- if sl_type == "slice":
- blknos = self.blknos[slobj]
- blklocs = self.blklocs[slobj]
- else:
- blknos = algos.take_nd(
- self.blknos, slobj, fill_value=-1, allow_fill=allow_fill
- )
- blklocs = algos.take_nd(
- self.blklocs, slobj, fill_value=-1, allow_fill=allow_fill
- )
-
- # When filling blknos, make sure blknos is updated before appending to
- # blocks list, that way new blkno is exactly len(blocks).
- blocks = []
- group = not only_slice
- for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group):
- if blkno == -1:
- # If we've got here, fill_value was not lib.no_default
-
- blocks.append(
- self._make_na_block(
- placement=mgr_locs,
- fill_value=fill_value,
- use_na_proxy=use_na_proxy,
- )
- )
- else:
- blk = self.blocks[blkno]
-
- # Otherwise, slicing along items axis is necessary.
- if not blk._can_consolidate and not blk._validate_ndim:
-                    # i.e. we don't go through here for DatetimeTZBlock
- # A non-consolidatable block, it's easy, because there's
- # only one item and each mgr loc is a copy of that single
- # item.
- deep = not (only_slice or using_copy_on_write())
- for mgr_loc in mgr_locs:
- newblk = blk.copy(deep=deep)
- newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1))
- blocks.append(newblk)
-
- else:
- # GH#32779 to avoid the performance penalty of copying,
- # we may try to only slice
- taker = blklocs[mgr_locs.indexer]
- max_len = max(len(mgr_locs), taker.max() + 1)
- if only_slice or using_copy_on_write():
- taker = lib.maybe_indices_to_slice(taker, max_len)
-
- if isinstance(taker, slice):
- nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs)
- blocks.append(nb)
- elif only_slice:
- # GH#33597 slice instead of take, so we get
- # views instead of copies
- for i, ml in zip(taker, mgr_locs):
- slc = slice(i, i + 1)
- bp = BlockPlacement(ml)
- nb = blk.getitem_block_columns(slc, new_mgr_locs=bp)
- # We have np.shares_memory(nb.values, blk.values)
- blocks.append(nb)
- else:
- nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs)
- blocks.append(nb)
-
- return blocks
-
- def _make_na_block(
- self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False
- ) -> Block:
- # Note: we only get here with self.ndim == 2
-
- if use_na_proxy:
- assert fill_value is None
- shape = (len(placement), self.shape[1])
- vals = np.empty(shape, dtype=np.void)
- nb = NumpyBlock(vals, placement, ndim=2)
- return nb
-
- if fill_value is None:
- fill_value = np.nan
- block_shape = list(self.shape)
- block_shape[0] = len(placement)
-
- dtype, fill_value = infer_dtype_from_scalar(fill_value)
- # error: Argument "dtype" to "empty" has incompatible type "Union[dtype,
- # ExtensionDtype]"; expected "Union[dtype, None, type, _SupportsDtype, str,
- # Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any], _DtypeDict,
- # Tuple[Any, Any]]"
- block_values = np.empty(block_shape, dtype=dtype) # type: ignore[arg-type]
- block_values.fill(fill_value)
- return new_block_2d(block_values, placement=placement)
-
- def take(
- self: T,
- indexer,
- axis: AxisInt = 1,
- verify: bool = True,
- convert_indices: bool = True,
- ) -> T:
- """
- Take items along any axis.
-
- indexer : np.ndarray or slice
- axis : int, default 1
- verify : bool, default True
- Check that all entries are between 0 and len(self) - 1, inclusive.
- Pass verify=False if this check has been done by the caller.
- convert_indices : bool, default True
- Whether to attempt to convert indices to positive values.
-
- Returns
- -------
- BlockManager
- """
- # We have 6 tests that get here with a slice
- indexer = (
- np.arange(indexer.start, indexer.stop, indexer.step, dtype=np.intp)
- if isinstance(indexer, slice)
- else np.asanyarray(indexer, dtype=np.intp)
- )
-
- n = self.shape[axis]
- if convert_indices:
- indexer = maybe_convert_indices(indexer, n, verify=verify)
-
- new_labels = self.axes[axis].take(indexer)
- return self.reindex_indexer(
- new_axis=new_labels,
- indexer=indexer,
- axis=axis,
- allow_dups=True,
- copy=None,
- )
-
-
-class BlockManager(libinternals.BlockManager, BaseBlockManager):
- """
- BaseBlockManager that holds 2D blocks.
- """
-
- ndim = 2
-
- # ----------------------------------------------------------------
- # Constructors
-
- def __init__(
- self,
- blocks: Sequence[Block],
- axes: Sequence[Index],
- verify_integrity: bool = True,
- ) -> None:
- if verify_integrity:
- # Assertion disabled for performance
- # assert all(isinstance(x, Index) for x in axes)
-
- for block in blocks:
- if self.ndim != block.ndim:
- raise AssertionError(
- f"Number of Block dimensions ({block.ndim}) must equal "
- f"number of axes ({self.ndim})"
- )
- # As of 2.0, the caller is responsible for ensuring that
- # DatetimeTZBlock with block.ndim == 2 has block.values.ndim ==2;
- # previously there was a special check for fastparquet compat.
-
- self._verify_integrity()
-
- def _verify_integrity(self) -> None:
- mgr_shape = self.shape
- tot_items = sum(len(x.mgr_locs) for x in self.blocks)
- for block in self.blocks:
- if block.shape[1:] != mgr_shape[1:]:
- raise_construction_error(tot_items, block.shape[1:], self.axes)
- if len(self.items) != tot_items:
- raise AssertionError(
- "Number of manager items must equal union of "
- f"block items\n# manager items: {len(self.items)}, # "
- f"tot_items: {tot_items}"
- )
-
- @classmethod
- def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager:
- """
- Constructor for BlockManager and SingleBlockManager with same signature.
- """
- return cls(blocks, axes, verify_integrity=False)
-
- # ----------------------------------------------------------------
- # Indexing
-
- def fast_xs(self, loc: int) -> SingleBlockManager:
- """
- Return the array corresponding to `frame.iloc[loc]`.
-
- Parameters
- ----------
- loc : int
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- if len(self.blocks) == 1:
- # TODO: this could be wrong if blk.mgr_locs is not slice(None)-like;
- # is this ruled out in the general case?
- result = self.blocks[0].iget((slice(None), loc))
- # in the case of a single block, the new block is a view
- block = new_block(
- result,
- placement=slice(0, len(result)),
- ndim=1,
- refs=self.blocks[0].refs,
- )
- return SingleBlockManager(block, self.axes[0])
-
- dtype = interleaved_dtype([blk.dtype for blk in self.blocks])
-
- n = len(self)
-
- # GH#46406
- immutable_ea = isinstance(dtype, SparseDtype)
-
- if isinstance(dtype, ExtensionDtype) and not immutable_ea:
- cls = dtype.construct_array_type()
- result = cls._empty((n,), dtype=dtype)
- else:
- # error: Argument "dtype" to "empty" has incompatible type
- # "Union[Type[object], dtype[Any], ExtensionDtype, None]"; expected
- # "None"
- result = np.empty(
- n, dtype=object if immutable_ea else dtype # type: ignore[arg-type]
- )
- result = ensure_wrapped_if_datetimelike(result)
-
- for blk in self.blocks:
- # Such assignment may incorrectly coerce NaT to None
- # result[blk.mgr_locs] = blk._slice((slice(None), loc))
- for i, rl in enumerate(blk.mgr_locs):
- result[rl] = blk.iget((i, loc))
-
- if immutable_ea:
- dtype = cast(ExtensionDtype, dtype)
- result = dtype.construct_array_type()._from_sequence(result, dtype=dtype)
-
- block = new_block(result, placement=slice(0, len(result)), ndim=1)
- return SingleBlockManager(block, self.axes[0])
-
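For context, positional row access such as `df.iloc[0]` is presumably served by `fast_xs`; with several numeric blocks the result takes the interleaved dtype:

    import pandas as pd

    df = pd.DataFrame({"i": [1, 2], "f": [1.5, 2.5]})
    row = df.iloc[0]
    print(row.dtype)  # float64 -- common dtype of the int64 and float64 blocks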
- def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager:
- """
- Return the data as a SingleBlockManager.
- """
- block = self.blocks[self.blknos[i]]
- values = block.iget(self.blklocs[i])
-
-        # shortcut for selecting a single column from a 2-dim BlockManager
- bp = BlockPlacement(slice(0, len(values)))
- nb = type(block)(
- values, placement=bp, ndim=1, refs=block.refs if track_ref else None
- )
- return SingleBlockManager(nb, self.axes[1])
-
- def iget_values(self, i: int) -> ArrayLike:
- """
- Return the data for column i as the values (ndarray or ExtensionArray).
-
- Warning! The returned array is a view but doesn't handle Copy-on-Write,
- so this should be used with caution.
- """
- # TODO(CoW) making the arrays read-only might make this safer to use?
- block = self.blocks[self.blknos[i]]
- values = block.iget(self.blklocs[i])
- return values
-
- @property
- def column_arrays(self) -> list[np.ndarray]:
- """
- Used in the JSON C code to access column arrays.
- This optimizes compared to using `iget_values` by converting each
-
- Warning! This doesn't handle Copy-on-Write, so should be used with
- caution (current use case of consuming this in the JSON code is fine).
- """
- # This is an optimized equivalent to
- # result = [self.iget_values(i) for i in range(len(self.items))]
- result: list[np.ndarray | None] = [None] * len(self.items)
-
- for blk in self.blocks:
- mgr_locs = blk._mgr_locs
- values = blk.values_for_json()
- if values.ndim == 1:
- # TODO(EA2D): special casing not needed with 2D EAs
- result[mgr_locs[0]] = values
-
- else:
- for i, loc in enumerate(mgr_locs):
- result[loc] = values[i]
-
- # error: Incompatible return value type (got "List[None]",
- # expected "List[ndarray[Any, Any]]")
- return result # type: ignore[return-value]
-
- def iset(
- self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
- ):
- """
- Set new item in-place. Does not consolidate. Adds new Block if not
- contained in the current set of items
- """
-
- # FIXME: refactor, clearly separate broadcasting & zip-like assignment
- # can prob also fix the various if tests for sparse/categorical
- if self._blklocs is None and self.ndim > 1:
- self._rebuild_blknos_and_blklocs()
-
- # Note: we exclude DTA/TDA here
- value_is_extension_type = is_1d_only_ea_dtype(value.dtype)
- if not value_is_extension_type:
- if value.ndim == 2:
- value = value.T
- else:
- value = ensure_block_shape(value, ndim=2)
-
- if value.shape[1:] != self.shape[1:]:
- raise AssertionError(
- "Shape of new values must be compatible with manager shape"
- )
-
- if lib.is_integer(loc):
- # We have 6 tests where loc is _not_ an int.
- # In this case, get_blkno_placements will yield only one tuple,
- # containing (self._blknos[loc], BlockPlacement(slice(0, 1, 1)))
-
- # Check if we can use _iset_single fastpath
- loc = cast(int, loc)
- blkno = self.blknos[loc]
- blk = self.blocks[blkno]
- if len(blk._mgr_locs) == 1: # TODO: fastest way to check this?
- return self._iset_single(
- loc,
- value,
- inplace=inplace,
- blkno=blkno,
- blk=blk,
- )
-
- # error: Incompatible types in assignment (expression has type
- # "List[Union[int, slice, ndarray]]", variable has type "Union[int,
- # slice, ndarray]")
- loc = [loc] # type: ignore[assignment]
-
- # categorical/sparse/datetimetz
- if value_is_extension_type:
-
- def value_getitem(placement):
- return value
-
- else:
-
- def value_getitem(placement):
- return value[placement.indexer]
-
- # Accessing public blknos ensures the public versions are initialized
- blknos = self.blknos[loc]
- blklocs = self.blklocs[loc].copy()
-
- unfit_mgr_locs = []
- unfit_val_locs = []
- removed_blknos = []
- for blkno_l, val_locs in libinternals.get_blkno_placements(blknos, group=True):
- blk = self.blocks[blkno_l]
- blk_locs = blklocs[val_locs.indexer]
- if inplace and blk.should_store(value):
- # Updating inplace -> check if we need to do Copy-on-Write
- if using_copy_on_write() and not self._has_no_reference_block(blkno_l):
- self._iset_split_block(blkno_l, blk_locs, value_getitem(val_locs))
- else:
- blk.set_inplace(blk_locs, value_getitem(val_locs))
- continue
- else:
- unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
- unfit_val_locs.append(val_locs)
-
- # If all block items are unfit, schedule the block for removal.
- if len(val_locs) == len(blk.mgr_locs):
- removed_blknos.append(blkno_l)
- continue
- else:
- # Defer setting the new values to enable consolidation
- self._iset_split_block(blkno_l, blk_locs)
-
- if len(removed_blknos):
- # Remove blocks & update blknos accordingly
- is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
- is_deleted[removed_blknos] = True
-
- new_blknos = np.empty(self.nblocks, dtype=np.intp)
- new_blknos.fill(-1)
- new_blknos[~is_deleted] = np.arange(self.nblocks - len(removed_blknos))
- self._blknos = new_blknos[self._blknos]
- self.blocks = tuple(
- blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos)
- )
-
- if unfit_val_locs:
- unfit_idxr = np.concatenate(unfit_mgr_locs)
- unfit_count = len(unfit_idxr)
-
- new_blocks: list[Block] = []
- # TODO(CoW) is this always correct to assume that the new_blocks
- # are not referencing anything else?
- if value_is_extension_type:
- # This code (ab-)uses the fact that EA blocks contain only
- # one item.
- # TODO(EA2D): special casing unnecessary with 2D EAs
- new_blocks.extend(
- new_block_2d(
- values=value,
- placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)),
- )
- for mgr_loc in unfit_idxr
- )
-
- self._blknos[unfit_idxr] = np.arange(unfit_count) + len(self.blocks)
- self._blklocs[unfit_idxr] = 0
-
- else:
- # unfit_val_locs contains BlockPlacement objects
- unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
-
- new_blocks.append(
- new_block_2d(
- values=value_getitem(unfit_val_items),
- placement=BlockPlacement(unfit_idxr),
- )
- )
-
- self._blknos[unfit_idxr] = len(self.blocks)
- self._blklocs[unfit_idxr] = np.arange(unfit_count)
-
- self.blocks += tuple(new_blocks)
-
- # Newly created block's dtype may already be present.
- self._known_consolidated = False
-
- def _iset_split_block(
- self,
- blkno_l: int,
- blk_locs: np.ndarray | list[int],
- value: ArrayLike | None = None,
- ) -> None:
- """Removes columns from a block by splitting the block.
-
-        Avoids copying the whole block by slicing it instead, and updates the
-        manager after determining the new block structure. Optionally adds a new
-        block; otherwise the caller is responsible for doing so.
-
- Parameters
- ----------
- blkno_l: The block number to operate on, relevant for updating the manager
- blk_locs: The locations of our block that should be deleted.
- value: The value to set as a replacement.
- """
- blk = self.blocks[blkno_l]
-
- if self._blklocs is None:
- self._rebuild_blknos_and_blklocs()
-
- nbs_tup = tuple(blk.delete(blk_locs))
- if value is not None:
- locs = blk.mgr_locs.as_array[blk_locs]
- first_nb = new_block_2d(value, BlockPlacement(locs))
- else:
- first_nb = nbs_tup[0]
- nbs_tup = tuple(nbs_tup[1:])
-
- nr_blocks = len(self.blocks)
- blocks_tup = (
- self.blocks[:blkno_l] + (first_nb,) + self.blocks[blkno_l + 1 :] + nbs_tup
- )
- self.blocks = blocks_tup
-
- if not nbs_tup and value is not None:
- # No need to update anything if split did not happen
- return
-
- self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
-
- for i, nb in enumerate(nbs_tup):
- self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb))
- self._blknos[nb.mgr_locs.indexer] = i + nr_blocks
-
- def _iset_single(
- self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block
- ) -> None:
- """
- Fastpath for iset when we are only setting a single position and
- the Block currently in that position is itself single-column.
-
- In this case we can swap out the entire Block and blklocs and blknos
- are unaffected.
- """
- # Caller is responsible for verifying value.shape
-
- if inplace and blk.should_store(value):
- copy = False
- if using_copy_on_write() and not self._has_no_reference_block(blkno):
- # perform Copy-on-Write and clear the reference
- copy = True
- iloc = self.blklocs[loc]
- blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy)
- return
-
- nb = new_block_2d(value, placement=blk._mgr_locs)
- old_blocks = self.blocks
- new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :]
- self.blocks = new_blocks
- return
-
- def column_setitem(
- self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
- ) -> None:
- """
- Set values ("setitem") into a single column (not setting the full column).
-
- This is a method on the BlockManager level, to avoid creating an
- intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
- """
- if using_copy_on_write() and not self._has_no_reference(loc):
- blkno = self.blknos[loc]
- # Split blocks to only copy the column we want to modify
- blk_loc = self.blklocs[loc]
- # Copy our values
- values = self.blocks[blkno].values
- if values.ndim == 1:
- values = values.copy()
- else:
- # Use [blk_loc] as indexer to keep ndim=2, this already results in a
- # copy
- values = values[[blk_loc]]
- self._iset_split_block(blkno, [blk_loc], values)
-
- # this manager is only created temporarily to mutate the values in place
- # so don't track references, otherwise the `setitem` would perform CoW again
- col_mgr = self.iget(loc, track_ref=False)
- if inplace_only:
- col_mgr.setitem_inplace(idx, value)
- else:
- new_mgr = col_mgr.setitem((idx,), value)
- self.iset(loc, new_mgr._block.values, inplace=True)
-
- def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
- """
- Insert item at selected position.
-
- Parameters
- ----------
- loc : int
- item : hashable
- value : np.ndarray or ExtensionArray
- """
- # insert to the axis; this could possibly raise a TypeError
- new_axis = self.items.insert(loc, item)
-
- if value.ndim == 2:
- value = value.T
- if len(value) > 1:
- raise ValueError(
- f"Expected a 1D array, got an array with shape {value.T.shape}"
- )
- else:
- value = ensure_block_shape(value, ndim=self.ndim)
-
- bp = BlockPlacement(slice(loc, loc + 1))
- # TODO(CoW) do we always "own" the passed `value`?
- block = new_block_2d(values=value, placement=bp)
-
- if not len(self.blocks):
- # Fastpath
- self._blklocs = np.array([0], dtype=np.intp)
- self._blknos = np.array([0], dtype=np.intp)
- else:
- self._insert_update_mgr_locs(loc)
- self._insert_update_blklocs_and_blknos(loc)
-
- self.axes[0] = new_axis
- self.blocks += (block,)
-
- self._known_consolidated = False
-
- if sum(not block.is_extension for block in self.blocks) > 100:
- warnings.warn(
- "DataFrame is highly fragmented. This is usually the result "
- "of calling `frame.insert` many times, which has poor performance. "
- "Consider joining all columns at once using pd.concat(axis=1) "
- "instead. To get a de-fragmented frame, use `newframe = frame.copy()`",
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
-
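The PerformanceWarning above is what users see after adding many columns one at a time; a hedged sketch of the pattern it recommends (column names invented):

    import numpy as np
    import pandas as pd

    base = pd.DataFrame(index=range(3))
    # Instead of 150 separate `base[col] = ...` inserts, build the new columns
    # once and concatenate them in a single step:
    new_cols = pd.DataFrame(
        {f"c{i}": np.arange(3) for i in range(150)}, index=base.index
    )
    df = pd.concat([base, new_cols], axis=1)
    # A frame that is already fragmented can be de-fragmented with a copy:
    df = df.copy()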
- def _insert_update_mgr_locs(self, loc) -> None:
- """
- When inserting a new Block at location 'loc', we increment
- all of the mgr_locs of blocks above that by one.
- """
- for blkno, count in _fast_count_smallints(self.blknos[loc:]):
- # .620 this way, .326 of which is in increment_above
- blk = self.blocks[blkno]
- blk._mgr_locs = blk._mgr_locs.increment_above(loc)
-
- def _insert_update_blklocs_and_blknos(self, loc) -> None:
- """
- When inserting a new Block at location 'loc', we update our
- _blklocs and _blknos.
- """
-
- # Accessing public blklocs ensures the public versions are initialized
- if loc == self.blklocs.shape[0]:
- # np.append is a lot faster, let's use it if we can.
- self._blklocs = np.append(self._blklocs, 0)
- self._blknos = np.append(self._blknos, len(self.blocks))
- elif loc == 0:
- # np.append is a lot faster, let's use it if we can.
- self._blklocs = np.append(self._blklocs[::-1], 0)[::-1]
- self._blknos = np.append(self._blknos[::-1], len(self.blocks))[::-1]
- else:
- new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
- self.blklocs, self.blknos, loc, len(self.blocks)
- )
- self._blklocs = new_blklocs
- self._blknos = new_blknos
-
- def idelete(self, indexer) -> BlockManager:
- """
- Delete selected locations, returning a new BlockManager.
- """
- is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
- is_deleted[indexer] = True
- taker = (~is_deleted).nonzero()[0]
-
- nbs = self._slice_take_blocks_ax0(taker, only_slice=True)
- new_columns = self.items[~is_deleted]
- axes = [new_columns, self.axes[1]]
- return type(self)(tuple(nbs), axes, verify_integrity=False)
-
- # ----------------------------------------------------------------
- # Block-wise Operation
-
- def grouped_reduce(self: T, func: Callable) -> T:
- """
- Apply grouped reduction function blockwise, returning a new BlockManager.
-
- Parameters
- ----------
- func : grouped reduction function
-
- Returns
- -------
- BlockManager
- """
- result_blocks: list[Block] = []
-
- for blk in self.blocks:
- if blk.is_object:
- # split on object-dtype blocks bc some columns may raise
- # while others do not.
- for sb in blk._split():
- applied = sb.apply(func)
- result_blocks = extend_blocks(applied, result_blocks)
- else:
- applied = blk.apply(func)
- result_blocks = extend_blocks(applied, result_blocks)
-
- if len(result_blocks) == 0:
- nrows = 0
- else:
- nrows = result_blocks[0].values.shape[-1]
- index = Index(range(nrows))
-
- return type(self).from_blocks(result_blocks, [self.axes[0], index])
-
- def reduce(self: T, func: Callable) -> T:
- """
- Apply reduction function blockwise, returning a single-row BlockManager.
-
- Parameters
- ----------
- func : reduction function
-
- Returns
- -------
- BlockManager
- """
- # If 2D, we assume that we're operating column-wise
- assert self.ndim == 2
-
- res_blocks: list[Block] = []
- for blk in self.blocks:
- nbs = blk.reduce(func)
- res_blocks.extend(nbs)
-
- index = Index([None]) # placeholder
- new_mgr = type(self).from_blocks(res_blocks, [self.items, index])
- return new_mgr
-
- def operate_blockwise(self, other: BlockManager, array_op) -> BlockManager:
- """
- Apply array_op blockwise with another (aligned) BlockManager.
- """
- return operate_blockwise(self, other, array_op)
-
- def _equal_values(self: BlockManager, other: BlockManager) -> bool:
- """
- Used in .equals defined in base class. Only check the column values
- assuming shape and indexes have already been checked.
- """
- return blockwise_all(self, other, array_equals)
-
- def quantile(
- self: T,
- *,
- qs: Index, # with dtype float 64
- axis: AxisInt = 0,
- interpolation: QuantileInterpolation = "linear",
- ) -> T:
- """
- Iterate over blocks applying quantile reduction.
- This routine is intended for reduction type operations and
- will do inference on the generated blocks.
-
- Parameters
- ----------
-        qs : Index
-            Float64 Index of the quantiles to be computed.
-        axis : int, default 0
-            Reduction axis.
-        interpolation : str, default 'linear'
-            Type of interpolation.
-
- Returns
- -------
- BlockManager
- """
- # Series dispatches to DataFrame for quantile, which allows us to
- # simplify some of the code here and in the blocks
- assert self.ndim >= 2
- assert is_list_like(qs) # caller is responsible for this
- assert axis == 1 # only ever called this way
-
- new_axes = list(self.axes)
- new_axes[1] = Index(qs, dtype=np.float64)
-
- blocks = [
- blk.quantile(axis=axis, qs=qs, interpolation=interpolation)
- for blk in self.blocks
- ]
-
- return type(self)(blocks, new_axes)
-
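At the user level this corresponds to `DataFrame.quantile` with a list of quantiles, which becomes the float64 row index of the result:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [10.0, 20.0, 30.0, 40.0]})
    print(df.quantile([0.25, 0.75]))
    # approximately:
    #          a     b
    # 0.25  1.75  17.5
    # 0.75  3.25  32.5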
- # ----------------------------------------------------------------
-
- def unstack(self, unstacker, fill_value) -> BlockManager:
- """
- Return a BlockManager with all blocks unstacked.
-
- Parameters
- ----------
- unstacker : reshape._Unstacker
- fill_value : Any
- fill_value for newly introduced missing values.
-
- Returns
- -------
- unstacked : BlockManager
- """
- new_columns = unstacker.get_new_columns(self.items)
- new_index = unstacker.new_index
-
- allow_fill = not unstacker.mask_all
- if allow_fill:
-            # calculating the full mask once and passing it to Block._unstack is
-            # faster than recalculating it in each repeated call
- new_mask2D = (~unstacker.mask).reshape(*unstacker.full_shape)
- needs_masking = new_mask2D.any(axis=0)
- else:
- needs_masking = np.zeros(unstacker.full_shape[1], dtype=bool)
-
- new_blocks: list[Block] = []
- columns_mask: list[np.ndarray] = []
-
- if len(self.items) == 0:
- factor = 1
- else:
- fac = len(new_columns) / len(self.items)
- assert fac == int(fac)
- factor = int(fac)
-
- for blk in self.blocks:
- mgr_locs = blk.mgr_locs
- new_placement = mgr_locs.tile_for_unstack(factor)
-
- blocks, mask = blk._unstack(
- unstacker,
- fill_value,
- new_placement=new_placement,
- needs_masking=needs_masking,
- )
-
- new_blocks.extend(blocks)
- columns_mask.extend(mask)
-
- # Block._unstack should ensure this holds,
- assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks)
- # In turn this ensures that in the BlockManager call below
- # we have len(new_columns) == sum(x.shape[0] for x in new_blocks)
-            # which suffices to allow us to pass verify_integrity=False
-
- new_columns = new_columns[columns_mask]
-
- bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False)
- return bm
-
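A small user-level example of the unstack path, where the missing (row, column) combination is filled with `fill_value`:

    import pandas as pd

    s = pd.Series(
        [1, 2, 3],
        index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "x")]),
    )
    print(s.unstack(fill_value=0))
    #    x  y
    # a  1  2
    # b  3  0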
- def to_dict(self, copy: bool = True):
- """
- Return a dict of str(dtype) -> BlockManager
-
- Parameters
- ----------
- copy : bool, default True
-
- Returns
- -------
- values : a dict of dtype -> BlockManager
- """
-
- bd: dict[str, list[Block]] = {}
- for b in self.blocks:
- bd.setdefault(str(b.dtype), []).append(b)
-
- # TODO(EA2D): the combine will be unnecessary with 2D EAs
- return {dtype: self._combine(blocks, copy=copy) for dtype, blocks in bd.items()}
-
- def as_array(
- self,
- dtype: np.dtype | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- """
-        Convert the BlockManager data into a numpy array.
-
- Parameters
- ----------
- dtype : np.dtype or None, default None
- Data type of the return array.
- copy : bool, default False
- If True then guarantee that a copy is returned. A value of
- False does not guarantee that the underlying data is not
- copied.
- na_value : object, default lib.no_default
- Value to be used as the missing value sentinel.
-
- Returns
- -------
- arr : ndarray
- """
- # TODO(CoW) handle case where resulting array is a view
- if len(self.blocks) == 0:
- arr = np.empty(self.shape, dtype=float)
- return arr.transpose()
-
- # We want to copy when na_value is provided to avoid
- # mutating the original object
- copy = copy or na_value is not lib.no_default
-
- if self.is_single_block:
- blk = self.blocks[0]
- if blk.is_extension:
- # Avoid implicit conversion of extension blocks to object
-
- # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
- # attribute "to_numpy"
- arr = blk.values.to_numpy( # type: ignore[union-attr]
- dtype=dtype,
- na_value=na_value,
- ).reshape(blk.shape)
- else:
- arr = np.asarray(blk.get_values())
- if dtype:
- arr = arr.astype(dtype, copy=False)
-
- if copy:
- arr = arr.copy()
- elif using_copy_on_write():
- arr = arr.view()
- arr.flags.writeable = False
- else:
- arr = self._interleave(dtype=dtype, na_value=na_value)
- # The underlying data was copied within _interleave, so no need
- # to further copy if copy=True or setting na_value
-
- if na_value is not lib.no_default:
- arr[isna(arr)] = na_value
-
- return arr.transpose()
-
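`DataFrame.to_numpy` delegates to `as_array` (and `_interleave` below) in this pandas version; when blocks have no common numpy dtype the result falls back to object:

    import pandas as pd

    df = pd.DataFrame({"i": [1, 2], "s": ["x", "y"]})
    arr = df.to_numpy()
    print(arr.dtype)   # object -- int64 and object blocks have no common dtype
    print(arr.shape)   # (2, 2)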
- def _interleave(
- self,
- dtype: np.dtype | None = None,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- """
-        Return ndarray from blocks with specified item order.
-        Items must be contained in the blocks.
- """
- if not dtype:
- # Incompatible types in assignment (expression has type
- # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has
- # type "Optional[dtype[Any]]")
- dtype = interleaved_dtype( # type: ignore[assignment]
- [blk.dtype for blk in self.blocks]
- )
-
- # TODO: https://github.com/pandas-dev/pandas/issues/22791
- # Give EAs some input on what happens here. Sparse needs this.
- if isinstance(dtype, SparseDtype):
- dtype = dtype.subtype
- dtype = cast(np.dtype, dtype)
- elif isinstance(dtype, ExtensionDtype):
- dtype = np.dtype("object")
- elif is_dtype_equal(dtype, str):
- dtype = np.dtype("object")
-
- result = np.empty(self.shape, dtype=dtype)
-
- itemmask = np.zeros(self.shape[0])
-
- if dtype == np.dtype("object") and na_value is lib.no_default:
- # much more performant than using to_numpy below
- for blk in self.blocks:
- rl = blk.mgr_locs
- arr = blk.get_values(dtype)
- result[rl.indexer] = arr
- itemmask[rl.indexer] = 1
- return result
-
- for blk in self.blocks:
- rl = blk.mgr_locs
- if blk.is_extension:
- # Avoid implicit conversion of extension blocks to object
-
- # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no
- # attribute "to_numpy"
- arr = blk.values.to_numpy( # type: ignore[union-attr]
- dtype=dtype,
- na_value=na_value,
- )
- else:
- arr = blk.get_values(dtype)
- result[rl.indexer] = arr
- itemmask[rl.indexer] = 1
-
- if not itemmask.all():
- raise AssertionError("Some items were not contained in blocks")
-
- return result
-
- # ----------------------------------------------------------------
- # Consolidation
-
- def is_consolidated(self) -> bool:
- """
-        Return True if no two consolidatable blocks share the same dtype
- """
- if not self._known_consolidated:
- self._consolidate_check()
- return self._is_consolidated
-
- def _consolidate_check(self) -> None:
- if len(self.blocks) == 1:
- # fastpath
- self._is_consolidated = True
- self._known_consolidated = True
- return
- dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate]
- self._is_consolidated = len(dtypes) == len(set(dtypes))
- self._known_consolidated = True
-
- def _consolidate_inplace(self) -> None:
- # In general, _consolidate_inplace should only be called via
- # DataFrame._consolidate_inplace, otherwise we will fail to invalidate
- # the DataFrame's _item_cache. The exception is for newly-created
- # BlockManager objects not yet attached to a DataFrame.
- if not self.is_consolidated():
- self.blocks = _consolidate(self.blocks)
- self._is_consolidated = True
- self._known_consolidated = True
- self._rebuild_blknos_and_blklocs()
-
-
-class SingleBlockManager(BaseBlockManager, SingleDataManager):
- """manage a single block with"""
-
- @property
- def ndim(self) -> Literal[1]:
- return 1
-
- _is_consolidated = True
- _known_consolidated = True
- __slots__ = ()
- is_single_block = True
-
- def __init__(
- self,
- block: Block,
- axis: Index,
- verify_integrity: bool = False,
- ) -> None:
- # Assertions disabled for performance
- # assert isinstance(block, Block), type(block)
- # assert isinstance(axis, Index), type(axis)
-
- self.axes = [axis]
- self.blocks = (block,)
-
- @classmethod
- def from_blocks(
- cls,
- blocks: list[Block],
- axes: list[Index],
- ) -> SingleBlockManager:
- """
- Constructor for BlockManager and SingleBlockManager with same signature.
- """
- assert len(blocks) == 1
- assert len(axes) == 1
- return cls(blocks[0], axes[0], verify_integrity=False)
-
- @classmethod
- def from_array(
- cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
- ) -> SingleBlockManager:
- """
- Constructor for if we have an array that is not yet a Block.
- """
- block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
- return cls(block, index)
-
- def to_2d_mgr(self, columns: Index) -> BlockManager:
- """
- Manager analogue of Series.to_frame
- """
- blk = self.blocks[0]
- arr = ensure_block_shape(blk.values, ndim=2)
- bp = BlockPlacement(0)
- new_blk = type(blk)(arr, placement=bp, ndim=2, refs=blk.refs)
- axes = [columns, self.axes[0]]
- return BlockManager([new_blk], axes=axes, verify_integrity=False)
-
- def _has_no_reference(self, i: int = 0) -> bool:
- """
-        Check whether column `i` has references, i.e. whether it references
-        another array or is itself being referenced.
-        Returns True if the column has no references.
- """
- return not self.blocks[0].refs.has_reference()
-
- def __getstate__(self):
- block_values = [b.values for b in self.blocks]
- block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
- axes_array = list(self.axes)
-
- extra_state = {
- "0.14.1": {
- "axes": axes_array,
- "blocks": [
- {"values": b.values, "mgr_locs": b.mgr_locs.indexer}
- for b in self.blocks
- ],
- }
- }
-
- # First three elements of the state are to maintain forward
- # compatibility with 0.13.1.
- return axes_array, block_values, block_items, extra_state
-
- def __setstate__(self, state):
- def unpickle_block(values, mgr_locs, ndim: int) -> Block:
- # TODO(EA2D): ndim would be unnecessary with 2D EAs
- # older pickles may store e.g. DatetimeIndex instead of DatetimeArray
- values = extract_array(values, extract_numpy=True)
- return new_block(values, placement=mgr_locs, ndim=ndim)
-
- if isinstance(state, tuple) and len(state) >= 4 and "0.14.1" in state[3]:
- state = state[3]["0.14.1"]
- self.axes = [ensure_index(ax) for ax in state["axes"]]
- ndim = len(self.axes)
- self.blocks = tuple(
- unpickle_block(b["values"], b["mgr_locs"], ndim=ndim)
- for b in state["blocks"]
- )
- else:
- raise NotImplementedError("pre-0.14.1 pickles are no longer supported")
-
- self._post_setstate()
-
- def _post_setstate(self) -> None:
- pass
-
- @cache_readonly
- def _block(self) -> Block:
- return self.blocks[0]
-
- @property
- def _blknos(self):
- """compat with BlockManager"""
- return None
-
- @property
- def _blklocs(self):
- """compat with BlockManager"""
- return None
-
- def getitem_mgr(self, indexer: slice | np.ndarray) -> SingleBlockManager:
- # similar to get_slice, but not restricted to slice indexer
- blk = self._block
- if (
- using_copy_on_write()
- and isinstance(indexer, np.ndarray)
- and len(indexer) > 0
- and com.is_bool_indexer(indexer)
- and indexer.all()
- ):
- return type(self)(blk.copy(deep=False), self.index)
- array = blk._slice(indexer)
- if array.ndim > 1:
- # This will be caught by Series._get_values
- raise ValueError("dimension-expanding indexing not allowed")
-
- bp = BlockPlacement(slice(0, len(array)))
- # TODO(CoW) in theory only need to track reference if new_array is a view
- block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
-
- new_idx = self.index[indexer]
- return type(self)(block, new_idx)
-
- def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager:
- # Assertion disabled for performance
- # assert isinstance(slobj, slice), type(slobj)
- if axis >= self.ndim:
- raise IndexError("Requested axis not found in manager")
-
- blk = self._block
- array = blk._slice(slobj)
- bp = BlockPlacement(slice(0, len(array)))
- # TODO this method is only used in groupby SeriesSplitter at the moment,
- # so passing refs is not yet covered by the tests
- block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
- new_index = self.index._getitem_slice(slobj)
- return type(self)(block, new_index)
-
- @property
- def index(self) -> Index:
- return self.axes[0]
-
- @property
- def dtype(self) -> DtypeObj:
- return self._block.dtype
-
- def get_dtypes(self) -> np.ndarray:
- return np.array([self._block.dtype])
-
- def external_values(self):
- """The array that Series.values returns"""
- return self._block.external_values()
-
- def internal_values(self):
- """The array that Series._values returns"""
- return self._block.values
-
- def array_values(self):
- """The array that Series.array returns"""
- return self._block.array_values
-
- def get_numeric_data(self, copy: bool = False):
- if self._block.is_numeric:
- return self.copy(deep=copy)
- return self.make_empty()
-
- @property
- def _can_hold_na(self) -> bool:
- return self._block._can_hold_na
-
- def setitem_inplace(self, indexer, value) -> None:
- """
- Set values with indexer.
-
- For Single[Block/Array]Manager, this backs s[indexer] = value
-
- This is an inplace version of `setitem()`, mutating the manager/values
- in place, not returning a new Manager (and Block), and thus never changing
- the dtype.
- """
- if using_copy_on_write() and not self._has_no_reference(0):
- self.blocks = (self._block.copy(),)
- self._cache.clear()
-
- super().setitem_inplace(indexer, value)
-
- def idelete(self, indexer) -> SingleBlockManager:
- """
- Delete single location from SingleBlockManager.
-
- Ensures that self.blocks doesn't become empty.
- """
- nb = self._block.delete(indexer)[0]
- self.blocks = (nb,)
- self.axes[0] = self.axes[0].delete(indexer)
- self._cache.clear()
- return self
-
- def fast_xs(self, loc):
- """
- fast path for getting a cross-section
- return a view of the data
- """
- raise NotImplementedError("Use series._values[loc] instead")
-
- def set_values(self, values: ArrayLike) -> None:
- """
- Set the values of the single block in place.
-
- Use at your own risk! This does not check if the passed values are
- valid for the current Block/SingleBlockManager (length, dtype, etc).
- """
- # TODO(CoW) do we need to handle copy on write here? Currently this is
- # only used for FrameColumnApply.series_generator (what if apply is
- # mutating inplace?)
- self.blocks[0].values = values
- self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values)))
-
- def _equal_values(self: T, other: T) -> bool:
- """
- Used in .equals defined in base class. Only check the column values
- assuming shape and indexes have already been checked.
- """
-        # For SingleBlockManager (i.e. Series)
- if other.ndim != 1:
- return False
- left = self.blocks[0].values
- right = other.blocks[0].values
- return array_equals(left, right)
-
-
-# --------------------------------------------------------------------
-# Constructor Helpers
-
-
-def create_block_manager_from_blocks(
- blocks: list[Block],
- axes: list[Index],
- consolidate: bool = True,
- verify_integrity: bool = True,
-) -> BlockManager:
- # If verify_integrity=False, then caller is responsible for checking
- # all(x.shape[-1] == len(axes[1]) for x in blocks)
- # sum(x.shape[0] for x in blocks) == len(axes[0])
- # set(x for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0])))
- # all(blk.ndim == 2 for blk in blocks)
- # This allows us to safely pass verify_integrity=False
-
- try:
- mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity)
-
- except ValueError as err:
- arrays = [blk.values for blk in blocks]
- tot_items = sum(arr.shape[0] for arr in arrays)
- raise_construction_error(tot_items, arrays[0].shape[1:], axes, err)
-
- if consolidate:
- mgr._consolidate_inplace()
- return mgr
-
-
-def create_block_manager_from_column_arrays(
- arrays: list[ArrayLike],
- axes: list[Index],
- consolidate: bool,
- refs: list,
-) -> BlockManager:
- # Assertions disabled for performance (caller is responsible for verifying)
- # assert isinstance(axes, list)
- # assert all(isinstance(x, Index) for x in axes)
- # assert all(isinstance(x, (np.ndarray, ExtensionArray)) for x in arrays)
- # assert all(type(x) is not PandasArray for x in arrays)
- # assert all(x.ndim == 1 for x in arrays)
- # assert all(len(x) == len(axes[1]) for x in arrays)
- # assert len(arrays) == len(axes[0])
- # These last three are sufficient to allow us to safely pass
- # verify_integrity=False below.
-
- try:
- blocks = _form_blocks(arrays, consolidate, refs)
- mgr = BlockManager(blocks, axes, verify_integrity=False)
- except ValueError as e:
- raise_construction_error(len(arrays), arrays[0].shape, axes, e)
- if consolidate:
- mgr._consolidate_inplace()
- return mgr
-
-
-def raise_construction_error(
- tot_items: int,
- block_shape: Shape,
- axes: list[Index],
- e: ValueError | None = None,
-):
- """raise a helpful message about our construction"""
- passed = tuple(map(int, [tot_items] + list(block_shape)))
- # Correcting the user facing error message during dataframe construction
- if len(passed) <= 2:
- passed = passed[::-1]
-
- implied = tuple(len(ax) for ax in axes)
- # Correcting the user facing error message during dataframe construction
- if len(implied) <= 2:
- implied = implied[::-1]
-
-    # If the passed shape matches what the axes imply, the original exception
-    # is more informative than the generic shape message, so re-raise it.
- if passed == implied and e is not None:
- raise e
- if block_shape[0] == 0:
- raise ValueError("Empty data passed with indices specified.")
- raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
-
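This is the familiar construction error seen, for example, when the number of column labels does not match the data (shapes below chosen for illustration):

    import numpy as np
    import pandas as pd

    try:
        pd.DataFrame(np.ones((2, 2)), columns=["a", "b", "c"])
    except ValueError as err:
        print(err)  # roughly: Shape of passed values is (2, 2), indices imply (2, 3)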
-
-# -----------------------------------------------------------------------
-
-
-def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
- # compat for numpy<1.21, in which comparing a np.dtype with an ExtensionDtype
- # raises instead of returning False. Once earlier numpy versions are dropped,
- # this can be simplified to `return tup[1].dtype`
- dtype = tup[1].dtype
-
- if is_1d_only_ea_dtype(dtype):
- # We know these won't be consolidated, so don't need to group these.
- # This avoids expensive comparisons of CategoricalDtype objects
- sep = id(dtype)
- else:
- sep = 0
-
- return sep, isinstance(dtype, np.dtype), dtype
-
-
-def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
- tuples = list(enumerate(arrays))
-
- if not consolidate:
- nbs = _tuples_to_blocks_no_consolidate(tuples, refs)
- return nbs
-
- # when consolidating, we can ignore refs (either stacking always copies,
- # or the EA is already copied in the calling dict_to_mgr)
- # TODO(CoW) check if this is also valid for rec_array_to_mgr
-
- # group by dtype
- grouper = itertools.groupby(tuples, _grouping_func)
-
- nbs = []
- for (_, _, dtype), tup_block in grouper:
- block_type = get_block_type(dtype)
-
- if isinstance(dtype, np.dtype):
- is_dtlike = dtype.kind in ["m", "M"]
-
- if issubclass(dtype.type, (str, bytes)):
- dtype = np.dtype(object)
-
- values, placement = _stack_arrays(list(tup_block), dtype)
- if is_dtlike:
- values = ensure_wrapped_if_datetimelike(values)
- blk = block_type(values, placement=BlockPlacement(placement), ndim=2)
- nbs.append(blk)
-
- elif is_1d_only_ea_dtype(dtype):
- dtype_blocks = [
- block_type(x[1], placement=BlockPlacement(x[0]), ndim=2)
- for x in tup_block
- ]
- nbs.extend(dtype_blocks)
-
- else:
- dtype_blocks = [
- block_type(
- ensure_block_shape(x[1], 2), placement=BlockPlacement(x[0]), ndim=2
- )
- for x in tup_block
- ]
- nbs.extend(dtype_blocks)
- return nbs
-
-
-def _tuples_to_blocks_no_consolidate(tuples, refs) -> list[Block]:
- # tuples produced within _form_blocks are of the form (placement, array)
- return [
- new_block_2d(
- ensure_block_shape(arr, ndim=2), placement=BlockPlacement(i), refs=ref
- )
- for ((i, arr), ref) in zip(tuples, refs)
- ]
-
-
-def _stack_arrays(tuples, dtype: np.dtype):
- placement, arrays = zip(*tuples)
-
- first = arrays[0]
- shape = (len(arrays),) + first.shape
-
- stacked = np.empty(shape, dtype=dtype)
- for i, arr in enumerate(arrays):
- stacked[i] = arr
-
- return stacked, placement
-
-
-def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]:
- """
-    Merge blocks having the same dtype; non-consolidatable blocks are left unmerged
- """
- # sort by _can_consolidate, dtype
- gkey = lambda x: x._consolidate_key
- grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
-
- new_blocks: list[Block] = []
- for (_can_consolidate, dtype), group_blocks in grouper:
- merged_blocks, _ = _merge_blocks(
- list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate
- )
- new_blocks = extend_blocks(merged_blocks, new_blocks)
- return tuple(new_blocks)
-
-
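Consolidation can be observed through the private `_mgr` attribute (an internal detail, used here only for illustration): repeated column inserts leave several same-dtype blocks, and a deep copy consolidates them again:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": np.arange(3)})
    for i in range(3):
        df[f"b{i}"] = np.arange(3)  # each assignment inserts a new int64 block
    print(df._mgr.nblocks)           # expected: 4 separate blocks
    df = df.copy()                   # deep copy consolidates same-dtype blocks
    print(df._mgr.nblocks)           # expected: 1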
-def _merge_blocks(
- blocks: list[Block], dtype: DtypeObj, can_consolidate: bool
-) -> tuple[list[Block], bool]:
- if len(blocks) == 1:
- return blocks, False
-
- if can_consolidate:
- # TODO: optimization potential in case all mgrs contain slices and
- # combination of those slices is a slice, too.
- new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
-
- new_values: ArrayLike
-
- if isinstance(blocks[0].dtype, np.dtype):
- # error: List comprehension has incompatible type List[Union[ndarray,
- # ExtensionArray]]; expected List[Union[complex, generic,
- # Sequence[Union[int, float, complex, str, bytes, generic]],
- # Sequence[Sequence[Any]], SupportsArray]]
- new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc]
- else:
- bvals = [blk.values for blk in blocks]
- bvals2 = cast(Sequence[NDArrayBackedExtensionArray], bvals)
- new_values = bvals2[0]._concat_same_type(bvals2, axis=0)
-
- argsort = np.argsort(new_mgr_locs)
- new_values = new_values[argsort]
- new_mgr_locs = new_mgr_locs[argsort]
-
- bp = BlockPlacement(new_mgr_locs)
- return [new_block_2d(new_values, placement=bp)], True
-
- # can't consolidate --> no merge
- return blocks, False
-
-
-def _fast_count_smallints(arr: npt.NDArray[np.intp]):
- """Faster version of set(arr) for sequences of small numbers."""
- counts = np.bincount(arr)
- nz = counts.nonzero()[0]
-    # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
- # in one benchmark by a factor of 11
- return zip(nz, counts[nz])
-
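The bincount trick above yields (value, count) pairs for small non-negative integers; a standalone illustration:

    import numpy as np

    arr = np.array([0, 2, 2, 5], dtype=np.intp)
    counts = np.bincount(arr)         # array([1, 0, 2, 0, 0, 1])
    nz = counts.nonzero()[0]          # array([0, 2, 5])
    print(list(zip(nz, counts[nz])))  # [(0, 1), (2, 2), (5, 1)]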
-
-def _preprocess_slice_or_indexer(
- slice_or_indexer: slice | np.ndarray, length: int, allow_fill: bool
-):
- if isinstance(slice_or_indexer, slice):
- return (
- "slice",
- slice_or_indexer,
- libinternals.slice_len(slice_or_indexer, length),
- )
- else:
- if (
- not isinstance(slice_or_indexer, np.ndarray)
- or slice_or_indexer.dtype.kind != "i"
- ):
- dtype = getattr(slice_or_indexer, "dtype", None)
- raise TypeError(type(slice_or_indexer), dtype)
-
- indexer = ensure_platform_int(slice_or_indexer)
- if not allow_fill:
- indexer = maybe_convert_indices(indexer, length)
- return "fancy", indexer, len(indexer)
diff --git a/contrib/python/pandas/py3/pandas/core/internals/ops.py b/contrib/python/pandas/py3/pandas/core/internals/ops.py
deleted file mode 100644
index 24fc51a96d9..00000000000
--- a/contrib/python/pandas/py3/pandas/core/internals/ops.py
+++ /dev/null
@@ -1,147 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Iterator,
- NamedTuple,
-)
-
-from pandas._typing import ArrayLike
-
-if TYPE_CHECKING:
- from pandas._libs.internals import BlockPlacement
-
- from pandas.core.internals.blocks import Block
- from pandas.core.internals.managers import BlockManager
-
-
-class BlockPairInfo(NamedTuple):
- lvals: ArrayLike
- rvals: ArrayLike
- locs: BlockPlacement
- left_ea: bool
- right_ea: bool
- rblk: Block
-
-
-def _iter_block_pairs(
- left: BlockManager, right: BlockManager
-) -> Iterator[BlockPairInfo]:
- # At this point we have already checked the parent DataFrames for
- # assert rframe._indexed_same(lframe)
-
- for blk in left.blocks:
- locs = blk.mgr_locs
- blk_vals = blk.values
-
- left_ea = blk_vals.ndim == 1
-
- rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True)
-
- # Assertions are disabled for performance, but should hold:
- # if left_ea:
- # assert len(locs) == 1, locs
- # assert len(rblks) == 1, rblks
- # assert rblks[0].shape[0] == 1, rblks[0].shape
-
- for rblk in rblks:
- right_ea = rblk.values.ndim == 1
-
- lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea)
- info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk)
- yield info
-
-
-def operate_blockwise(
- left: BlockManager, right: BlockManager, array_op
-) -> BlockManager:
- # At this point we have already checked the parent DataFrames for
- # assert rframe._indexed_same(lframe)
-
- res_blks: list[Block] = []
- for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right):
- res_values = array_op(lvals, rvals)
- if left_ea and not right_ea and hasattr(res_values, "reshape"):
- res_values = res_values.reshape(1, -1)
- nbs = rblk._split_op_result(res_values)
-
- # Assertions are disabled for performance, but should hold:
- # if right_ea or left_ea:
- # assert len(nbs) == 1
- # else:
- # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape)
-
- _reset_block_mgr_locs(nbs, locs)
-
- res_blks.extend(nbs)
-
- # Assertions are disabled for performance, but should hold:
- # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array}
- # nlocs = sum(len(nb.mgr_locs.as_array) for nb in res_blks)
- # assert nlocs == len(left.items), (nlocs, len(left.items))
- # assert len(slocs) == nlocs, (len(slocs), nlocs)
- # assert slocs == set(range(nlocs)), slocs
-
- new_mgr = type(right)(tuple(res_blks), axes=right.axes, verify_integrity=False)
- return new_mgr
-
-
-def _reset_block_mgr_locs(nbs: list[Block], locs) -> None:
- """
- Reset mgr_locs to correspond to our original DataFrame.
- """
- for nb in nbs:
- nblocs = locs[nb.mgr_locs.indexer]
- nb.mgr_locs = nblocs
- # Assertions are disabled for performance, but should hold:
- # assert len(nblocs) == nb.shape[0], (len(nblocs), nb.shape)
- # assert all(x in locs.as_array for x in nb.mgr_locs.as_array)
-
-
-def _get_same_shape_values(
- lblk: Block, rblk: Block, left_ea: bool, right_ea: bool
-) -> tuple[ArrayLike, ArrayLike]:
- """
- Slice lblk.values to align with rblk. Squeeze if we have EAs.
- """
- lvals = lblk.values
- rvals = rblk.values
-
- # Require that the indexing into lvals be slice-like
- assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs
-
- # TODO(EA2D): with 2D EAs only this first clause would be needed
- if not (left_ea or right_ea):
- # error: No overload variant of "__getitem__" of "ExtensionArray" matches
- # argument type "Tuple[Union[ndarray, slice], slice]"
- lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload]
- assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
- elif left_ea and right_ea:
- assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape)
- elif right_ea:
- # lvals are 2D, rvals are 1D
-
- # error: No overload variant of "__getitem__" of "ExtensionArray" matches
- # argument type "Tuple[Union[ndarray, slice], slice]"
- lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload]
- assert lvals.shape[0] == 1, lvals.shape
- lvals = lvals[0, :]
- else:
- # lvals are 1D, rvals are 2D
- assert rvals.shape[0] == 1, rvals.shape
- # error: No overload variant of "__getitem__" of "ExtensionArray" matches
- # argument type "Tuple[int, slice]"
- rvals = rvals[0, :] # type: ignore[call-overload]
-
- return lvals, rvals
-
-
-def blockwise_all(left: BlockManager, right: BlockManager, op) -> bool:
- """
- Blockwise `all` reduction.
- """
- for info in _iter_block_pairs(left, right):
- res = op(info.lvals, info.rvals)
- if not res:
- return False
- return True
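
The helpers above pair each block of the left manager with the aligned slice of the right manager and either apply an array op blockwise or short-circuit an all-style reduction. A minimal self-contained sketch of that pattern, using a plain dict of NumPy arrays as a stand-in for BlockManagers (the dict layout and function names are illustrative, not pandas internals):

import numpy as np

def iter_block_pairs(left: dict, right: dict):
    # Yield aligned (left_values, right_values) pairs, column group by column group.
    for key, lvals in left.items():
        yield lvals, right[key]

def blockwise_all(left: dict, right: dict, op) -> bool:
    # Blockwise `all` reduction with an early exit on the first failing pair.
    for lvals, rvals in iter_block_pairs(left, right):
        if not op(lvals, rvals):
            return False
    return True

left = {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 4.0])}
right = {"a": np.array([1.0, 2.0]), "b": np.array([3.0, 5.0])}
print(blockwise_all(left, right, np.array_equal))  # False: the "b" blocks differ
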
diff --git a/contrib/python/pandas/py3/pandas/core/methods/__init__.py b/contrib/python/pandas/py3/pandas/core/methods/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/methods/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/methods/describe.py b/contrib/python/pandas/py3/pandas/core/methods/describe.py
deleted file mode 100644
index ccd9ccfff80..00000000000
--- a/contrib/python/pandas/py3/pandas/core/methods/describe.py
+++ /dev/null
@@ -1,408 +0,0 @@
-"""
-Module responsible for execution of NDFrame.describe() method.
-
-Method NDFrame.describe() delegates actual execution to function describe_ndframe().
-"""
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Sequence,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs import Timestamp
-from pandas._typing import (
- DtypeObj,
- NDFrameT,
- npt,
-)
-from pandas.util._validators import validate_percentile
-
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_complex_dtype,
- is_extension_array_dtype,
- is_numeric_dtype,
-)
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
-
-from pandas.core.arrays.arrow.dtype import ArrowDtype
-from pandas.core.arrays.floating import Float64Dtype
-from pandas.core.reshape.concat import concat
-
-from pandas.io.formats.format import format_percentiles
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-
-def describe_ndframe(
- *,
- obj: NDFrameT,
- include: str | Sequence[str] | None,
- exclude: str | Sequence[str] | None,
- percentiles: Sequence[float] | np.ndarray | None,
-) -> NDFrameT:
- """Describe series or dataframe.
-
- Called from pandas.core.generic.NDFrame.describe()
-
- Parameters
- ----------
- obj: DataFrame or Series
- Either dataframe or series to be described.
- include : 'all', list-like of dtypes or None (default), optional
- A white list of data types to include in the result. Ignored for ``Series``.
- exclude : list-like of dtypes or None (default), optional
- A black list of data types to omit from the result. Ignored for ``Series``.
- percentiles : list-like of numbers, optional
- The percentiles to include in the output. All should fall between 0 and 1.
- The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
- 75th percentiles.
-
- Returns
- -------
- Dataframe or series description.
- """
- percentiles = refine_percentiles(percentiles)
-
- describer: NDFrameDescriberAbstract
- if obj.ndim == 1:
- describer = SeriesDescriber(
- obj=cast("Series", obj),
- )
- else:
- describer = DataFrameDescriber(
- obj=cast("DataFrame", obj),
- include=include,
- exclude=exclude,
- )
-
- result = describer.describe(percentiles=percentiles)
- return cast(NDFrameT, result)
-
-
-class NDFrameDescriberAbstract(ABC):
- """Abstract class for describing dataframe or series.
-
- Parameters
- ----------
- obj : Series or DataFrame
- Object to be described.
- """
-
- def __init__(self, obj: DataFrame | Series) -> None:
- self.obj = obj
-
- @abstractmethod
- def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
- """Do describe either series or dataframe.
-
- Parameters
- ----------
- percentiles : list-like of numbers
- The percentiles to include in the output.
- """
-
-
-class SeriesDescriber(NDFrameDescriberAbstract):
- """Class responsible for creating series description."""
-
- obj: Series
-
- def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
- describe_func = select_describe_func(
- self.obj,
- )
- return describe_func(self.obj, percentiles)
-
-
-class DataFrameDescriber(NDFrameDescriberAbstract):
- """Class responsible for creating dataobj description.
-
- Parameters
- ----------
- obj : DataFrame
- DataFrame to be described.
- include : 'all', list-like of dtypes or None
- A white list of data types to include in the result.
- exclude : list-like of dtypes or None
- A black list of data types to omit from the result.
- """
-
- def __init__(
- self,
- obj: DataFrame,
- *,
- include: str | Sequence[str] | None,
- exclude: str | Sequence[str] | None,
- ) -> None:
- self.include = include
- self.exclude = exclude
-
- if obj.ndim == 2 and obj.columns.size == 0:
- raise ValueError("Cannot describe a DataFrame without columns")
-
- super().__init__(obj)
-
- def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
- data = self._select_data()
-
- ldesc: list[Series] = []
- for _, series in data.items():
- describe_func = select_describe_func(series)
- ldesc.append(describe_func(series, percentiles))
-
- col_names = reorder_columns(ldesc)
- d = concat(
- [x.reindex(col_names, copy=False) for x in ldesc],
- axis=1,
- sort=False,
- )
- d.columns = data.columns.copy()
- return d
-
- def _select_data(self):
- """Select columns to be described."""
- if (self.include is None) and (self.exclude is None):
- # when some numerics are found, keep only numerics
- default_include: list[npt.DTypeLike] = [np.number, "datetime"]
- data = self.obj.select_dtypes(include=default_include)
- if len(data.columns) == 0:
- data = self.obj
- elif self.include == "all":
- if self.exclude is not None:
- msg = "exclude must be None when include is 'all'"
- raise ValueError(msg)
- data = self.obj
- else:
- data = self.obj.select_dtypes(
- include=self.include,
- exclude=self.exclude,
- )
- return data
-
-
-def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
- """Set a convenient order for rows for display."""
- names: list[Hashable] = []
- ldesc_indexes = sorted((x.index for x in ldesc), key=len)
- for idxnames in ldesc_indexes:
- for name in idxnames:
- if name not in names:
- names.append(name)
- return names
-
-
-def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
- """Describe series containing numerical data.
-
- Parameters
- ----------
- series : Series
- Series to be described.
- percentiles : list-like of numbers
- The percentiles to include in the output.
- """
- from pandas import Series
-
- formatted_percentiles = format_percentiles(percentiles)
-
- stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
- d = (
- [series.count(), series.mean(), series.std(), series.min()]
- + series.quantile(percentiles).tolist()
- + [series.max()]
- )
- # GH#48340 - always return float on non-complex numeric data
- dtype: DtypeObj | None
- if is_extension_array_dtype(series):
- if isinstance(series.dtype, ArrowDtype):
- if series.dtype.kind == "m":
- # GH53001: describe timedeltas with object dtype
- dtype = None
- else:
- import pyarrow as pa
-
- dtype = ArrowDtype(pa.float64())
- else:
- dtype = Float64Dtype()
- elif is_numeric_dtype(series) and not is_complex_dtype(series):
- dtype = np.dtype("float")
- else:
- dtype = None
- return Series(d, index=stat_index, name=series.name, dtype=dtype)
-
-
-def describe_categorical_1d(
- data: Series,
- percentiles_ignored: Sequence[float],
-) -> Series:
- """Describe series containing categorical data.
-
- Parameters
- ----------
- data : Series
- Series to be described.
- percentiles_ignored : list-like of numbers
- Ignored, but in place to unify interface.
- """
- names = ["count", "unique", "top", "freq"]
- objcounts = data.value_counts()
- count_unique = len(objcounts[objcounts != 0])
- if count_unique > 0:
- top, freq = objcounts.index[0], objcounts.iloc[0]
- dtype = None
- else:
- # If the DataFrame is empty, set 'top' and 'freq' to None
- # to maintain output shape consistency
- top, freq = np.nan, np.nan
- dtype = "object"
-
- result = [data.count(), count_unique, top, freq]
-
- from pandas import Series
-
- return Series(result, index=names, name=data.name, dtype=dtype)
-
-
-def describe_timestamp_as_categorical_1d(
- data: Series,
- percentiles_ignored: Sequence[float],
-) -> Series:
- """Describe series containing timestamp data treated as categorical.
-
- Parameters
- ----------
- data : Series
- Series to be described.
- percentiles_ignored : list-like of numbers
- Ignored, but in place to unify interface.
- """
- names = ["count", "unique"]
- objcounts = data.value_counts()
- count_unique = len(objcounts[objcounts != 0])
- result = [data.count(), count_unique]
- dtype = None
- if count_unique > 0:
- top, freq = objcounts.index[0], objcounts.iloc[0]
- tz = data.dt.tz
- asint = data.dropna().values.view("i8")
- top = Timestamp(top)
- if top.tzinfo is not None and tz is not None:
- # Don't tz_localize(None) if key is already tz-aware
- top = top.tz_convert(tz)
- else:
- top = top.tz_localize(tz)
- names += ["top", "freq", "first", "last"]
- result += [
- top,
- freq,
- Timestamp(asint.min(), tz=tz),
- Timestamp(asint.max(), tz=tz),
- ]
-
- # If the DataFrame is empty, set 'top' and 'freq' to None
- # to maintain output shape consistency
- else:
- names += ["top", "freq"]
- result += [np.nan, np.nan]
- dtype = "object"
-
- from pandas import Series
-
- return Series(result, index=names, name=data.name, dtype=dtype)
-
-
-def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
- """Describe series containing datetime64 dtype.
-
- Parameters
- ----------
- data : Series
- Series to be described.
- percentiles : list-like of numbers
- The percentiles to include in the output.
- """
- # GH-30164
- from pandas import Series
-
- formatted_percentiles = format_percentiles(percentiles)
-
- stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
- d = (
- [data.count(), data.mean(), data.min()]
- + data.quantile(percentiles).tolist()
- + [data.max()]
- )
- return Series(d, index=stat_index, name=data.name)
-
-
-def select_describe_func(
- data: Series,
-) -> Callable:
- """Select proper function for describing series based on data type.
-
- Parameters
- ----------
- data : Series
- Series to be described.
- """
- if is_bool_dtype(data.dtype):
- return describe_categorical_1d
- elif is_numeric_dtype(data):
- return describe_numeric_1d
- elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
- return describe_timestamp_1d
- elif data.dtype.kind == "m":
- return describe_numeric_1d
- else:
- return describe_categorical_1d
-
-
-def refine_percentiles(
- percentiles: Sequence[float] | np.ndarray | None,
-) -> np.ndarray[Any, np.dtype[np.float64]]:
- """
- Ensure that percentiles are unique and sorted.
-
- Parameters
- ----------
- percentiles : list-like of numbers, optional
- The percentiles to include in the output.
- """
- if percentiles is None:
- return np.array([0.25, 0.5, 0.75])
-
- # explicit conversion of `percentiles` to list
- percentiles = list(percentiles)
-
- # get them all to be in [0, 1]
- validate_percentile(percentiles)
-
- # median should always be included
- if 0.5 not in percentiles:
- percentiles.append(0.5)
-
- percentiles = np.asarray(percentiles)
-
- # sort and check for duplicates
- unique_pcts = np.unique(percentiles)
- assert percentiles is not None
- if len(unique_pcts) < len(percentiles):
- raise ValueError("percentiles cannot contain duplicates")
-
- return unique_pcts
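
The percentile handling above reduces to a small amount of NumPy. A hedged restatement of refine_percentiles as a standalone function (the inline range check stands in for validate_percentile):

import numpy as np

def refine_percentiles(percentiles=None) -> np.ndarray:
    # Default to the quartiles, force the median in, and reject bad input,
    # mirroring the deleted helper above.
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])
    percentiles = list(percentiles)
    if any(p < 0 or p > 1 for p in percentiles):
        raise ValueError("percentiles should all be in the interval [0, 1]")
    if 0.5 not in percentiles:
        percentiles.append(0.5)
    unique_pcts = np.unique(percentiles)
    if len(unique_pcts) < len(percentiles):
        raise ValueError("percentiles cannot contain duplicates")
    return unique_pcts

print(refine_percentiles())            # [0.25 0.5  0.75]
print(refine_percentiles([0.1, 0.9]))  # [0.1 0.5 0.9]
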
diff --git a/contrib/python/pandas/py3/pandas/core/methods/selectn.py b/contrib/python/pandas/py3/pandas/core/methods/selectn.py
deleted file mode 100644
index 241d55aa663..00000000000
--- a/contrib/python/pandas/py3/pandas/core/methods/selectn.py
+++ /dev/null
@@ -1,262 +0,0 @@
-"""
-Implementation of nlargest and nsmallest.
-"""
-
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Sequence,
- cast,
- final,
-)
-
-import numpy as np
-
-from pandas._libs import algos as libalgos
-from pandas._typing import (
- DtypeObj,
- IndexLabel,
-)
-
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_complex_dtype,
- is_integer_dtype,
- is_list_like,
- is_numeric_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.dtypes import BaseMaskedDtype
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-
-class SelectN:
- def __init__(self, obj, n: int, keep: str) -> None:
- self.obj = obj
- self.n = n
- self.keep = keep
-
- if self.keep not in ("first", "last", "all"):
- raise ValueError('keep must be either "first", "last" or "all"')
-
- def compute(self, method: str) -> DataFrame | Series:
- raise NotImplementedError
-
- @final
- def nlargest(self):
- return self.compute("nlargest")
-
- @final
- def nsmallest(self):
- return self.compute("nsmallest")
-
- @final
- @staticmethod
- def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
- """
- Helper function to determine if dtype is valid for
- nsmallest/nlargest methods
- """
- if is_numeric_dtype(dtype):
- return not is_complex_dtype(dtype)
- return needs_i8_conversion(dtype)
-
-
-class SelectNSeries(SelectN):
- """
- Implement n largest/smallest for Series
-
- Parameters
- ----------
- obj : Series
- n : int
- keep : {'first', 'last', 'all'}, default 'first'
-
- Returns
- -------
- nordered : Series
- """
-
- def compute(self, method: str) -> Series:
- from pandas.core.reshape.concat import concat
-
- n = self.n
- dtype = self.obj.dtype
- if not self.is_valid_dtype_n_method(dtype):
- raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")
-
- if n <= 0:
- return self.obj[[]]
-
- dropped = self.obj.dropna()
- nan_index = self.obj.drop(dropped.index)
-
- # slow method
- if n >= len(self.obj):
- ascending = method == "nsmallest"
- return self.obj.sort_values(ascending=ascending).head(n)
-
- # fast method
- new_dtype = dropped.dtype
-
- # Similar to algorithms._ensure_data
- arr = dropped._values
- if needs_i8_conversion(arr.dtype):
- arr = arr.view("i8")
- elif isinstance(arr.dtype, BaseMaskedDtype):
- arr = arr._data
- else:
- arr = np.asarray(arr)
- if arr.dtype.kind == "b":
- arr = arr.view(np.uint8)
-
- if method == "nlargest":
- arr = -arr
- if is_integer_dtype(new_dtype):
- # GH 21426: ensure reverse ordering at boundaries
- arr -= 1
-
- elif is_bool_dtype(new_dtype):
- # GH 26154: ensure False is smaller than True
- arr = 1 - (-arr)
-
- if self.keep == "last":
- arr = arr[::-1]
-
- nbase = n
- narr = len(arr)
- n = min(n, narr)
-
- # arr passed into kth_smallest must be contiguous. We copy
- # here because kth_smallest will modify its input
- kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1)
- (ns,) = np.nonzero(arr <= kth_val)
- inds = ns[arr[ns].argsort(kind="mergesort")]
-
- if self.keep != "all":
- inds = inds[:n]
- findex = nbase
- else:
- if len(inds) < nbase <= len(nan_index) + len(inds):
- findex = len(nan_index) + len(inds)
- else:
- findex = len(inds)
-
- if self.keep == "last":
- # reverse indices
- inds = narr - 1 - inds
-
- return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
-
-
-class SelectNFrame(SelectN):
- """
- Implement n largest/smallest for DataFrame
-
- Parameters
- ----------
- obj : DataFrame
- n : int
- keep : {'first', 'last', 'all'}, default 'first'
- columns : list or str
-
- Returns
- -------
- nordered : DataFrame
- """
-
- def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
- super().__init__(obj, n, keep)
- if not is_list_like(columns) or isinstance(columns, tuple):
- columns = [columns]
-
- columns = cast(Sequence[Hashable], columns)
- columns = list(columns)
- self.columns = columns
-
- def compute(self, method: str) -> DataFrame:
- from pandas.core.api import Index
-
- n = self.n
- frame = self.obj
- columns = self.columns
-
- for column in columns:
- dtype = frame[column].dtype
- if not self.is_valid_dtype_n_method(dtype):
- raise TypeError(
- f"Column {repr(column)} has dtype {dtype}, "
- f"cannot use method {repr(method)} with this dtype"
- )
-
- def get_indexer(current_indexer, other_indexer):
- """
- Helper function to concat `current_indexer` and `other_indexer`
- depending on `method`
- """
- if method == "nsmallest":
- return current_indexer.append(other_indexer)
- else:
- return other_indexer.append(current_indexer)
-
- # Below we save and reset the index in case index contains duplicates
- original_index = frame.index
- cur_frame = frame = frame.reset_index(drop=True)
- cur_n = n
- indexer = Index([], dtype=np.int64)
-
- for i, column in enumerate(columns):
- # For each column we apply method to cur_frame[column].
- # If it's the last column or if we have the number of
- # results desired we are done.
- # Otherwise there are duplicates of the largest/smallest
- # value and we need to look at the rest of the columns
- # to determine which of the rows with the largest/smallest
- # value in the column to keep.
- series = cur_frame[column]
- is_last_column = len(columns) - 1 == i
- values = getattr(series, method)(
- cur_n, keep=self.keep if is_last_column else "all"
- )
-
- if is_last_column or len(values) <= cur_n:
- indexer = get_indexer(indexer, values.index)
- break
-
- # Now find all values which are equal to
- # the (nsmallest: largest)/(nlargest: smallest)
- # from our series.
- border_value = values == values[values.index[-1]]
-
- # Some of these values are among the top-n
- # some aren't.
- unsafe_values = values[border_value]
-
- # These values are definitely among the top-n
- safe_values = values[~border_value]
- indexer = get_indexer(indexer, safe_values.index)
-
- # Go on and separate the unsafe_values on the remaining
- # columns.
- cur_frame = cur_frame.loc[unsafe_values.index]
- cur_n = n - len(indexer)
-
- frame = frame.take(indexer)
-
- # Restore the index on frame
- frame.index = original_index.take(indexer)
-
- # If there is only one column, the frame is already sorted.
- if len(columns) == 1:
- return frame
-
- ascending = method == "nsmallest"
-
- return frame.sort_values(columns, ascending=ascending, kind="mergesort")
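
The Series fast path above boils down to a partial selection followed by a stable sort. A rough NumPy-only sketch, assuming NaNs have already been dropped and using np.partition in place of libalgos.kth_smallest:

import numpy as np

def nsmallest_indices(arr: np.ndarray, n: int) -> np.ndarray:
    # Find the n-th smallest value, keep everything <= it, then order the
    # candidates with a stable mergesort, as in SelectNSeries.compute above.
    n = min(n, len(arr))
    kth_val = np.partition(arr, n - 1)[n - 1]
    (ns,) = np.nonzero(arr <= kth_val)
    inds = ns[arr[ns].argsort(kind="mergesort")]
    return inds[:n]

values = np.array([7, 1, 5, 1, 9, 3])
print(nsmallest_indices(values, 3))          # [1 3 5]
print(values[nsmallest_indices(values, 3)])  # [1 1 3]
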
diff --git a/contrib/python/pandas/py3/pandas/core/methods/to_dict.py b/contrib/python/pandas/py3/pandas/core/methods/to_dict.py
deleted file mode 100644
index f1cca4fd7ac..00000000000
--- a/contrib/python/pandas/py3/pandas/core/methods/to_dict.py
+++ /dev/null
@@ -1,207 +0,0 @@
-from __future__ import annotations
-
-from typing import Literal
-import warnings
-
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.cast import maybe_box_native
-from pandas.core.dtypes.common import (
- is_extension_array_dtype,
- is_object_dtype,
-)
-
-from pandas import DataFrame
-from pandas.core import common as com
-
-
-def to_dict(
- df: DataFrame,
- orient: Literal[
- "dict", "list", "series", "split", "tight", "records", "index"
- ] = "dict",
- into: type[dict] = dict,
- index: bool = True,
-) -> dict | list[dict]:
- """
- Convert the DataFrame to a dictionary.
-
- The type of the key-value pairs can be customized with the parameters
- (see below).
-
- Parameters
- ----------
- orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
- Determines the type of the values of the dictionary.
-
- - 'dict' (default) : dict like {column -> {index -> value}}
- - 'list' : dict like {column -> [values]}
- - 'series' : dict like {column -> Series(values)}
- - 'split' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
- - 'tight' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
- 'index_names' -> [index.names], 'column_names' -> [column.names]}
- - 'records' : list like
- [{column -> value}, ... , {column -> value}]
- - 'index' : dict like {index -> {column -> value}}
-
- .. versionadded:: 1.4.0
- 'tight' as an allowed value for the ``orient`` argument
-
- into : class, default dict
- The collections.abc.Mapping subclass used for all Mappings
- in the return value. Can be the actual class or an empty
- instance of the mapping type you want. If you want a
- collections.defaultdict, you must pass it initialized.
-
- index : bool, default True
- Whether to include the index item (and index_names item if `orient`
- is 'tight') in the returned dictionary. Can only be ``False``
- when `orient` is 'split' or 'tight'.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- dict, list or collections.abc.Mapping
- Return a collections.abc.Mapping object representing the DataFrame.
- The resulting transformation depends on the `orient` parameter.
- """
- if not df.columns.is_unique:
- warnings.warn(
- "DataFrame columns are not unique, some columns will be omitted.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- # GH16122
- into_c = com.standardize_mapping(into)
-
- # error: Incompatible types in assignment (expression has type "str",
- # variable has type "Literal['dict', 'list', 'series', 'split', 'tight',
- # 'records', 'index']")
- orient = orient.lower() # type: ignore[assignment]
-
- if not index and orient not in ["split", "tight"]:
- raise ValueError(
- "'index=False' is only valid when 'orient' is 'split' or 'tight'"
- )
-
- if orient == "series":
- # GH46470 Return quickly if orient is 'series' to avoid creating dtype objects
- return into_c((k, v) for k, v in df.items())
-
- box_native_indices = [
- i
- for i, col_dtype in enumerate(df.dtypes.values)
- if is_object_dtype(col_dtype) or is_extension_array_dtype(col_dtype)
- ]
- are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
-
- if orient == "dict":
- return into_c((k, v.to_dict(into)) for k, v in df.items())
-
- elif orient == "list":
- object_dtype_indices_as_set = set(box_native_indices)
- return into_c(
- (
- k,
- list(map(maybe_box_native, v.tolist()))
- if i in object_dtype_indices_as_set
- else v.tolist(),
- )
- for i, (k, v) in enumerate(df.items())
- )
-
- elif orient == "split":
- data = df._create_data_for_split_and_tight_to_dict(
- are_all_object_dtype_cols, box_native_indices
- )
-
- return into_c(
- ((("index", df.index.tolist()),) if index else ())
- + (
- ("columns", df.columns.tolist()),
- ("data", data),
- )
- )
-
- elif orient == "tight":
- data = df._create_data_for_split_and_tight_to_dict(
- are_all_object_dtype_cols, box_native_indices
- )
-
- return into_c(
- ((("index", df.index.tolist()),) if index else ())
- + (
- ("columns", df.columns.tolist()),
- (
- "data",
- [
- list(map(maybe_box_native, t))
- for t in df.itertuples(index=False, name=None)
- ],
- ),
- )
- + ((("index_names", list(df.index.names)),) if index else ())
- + (("column_names", list(df.columns.names)),)
- )
-
- elif orient == "records":
- columns = df.columns.tolist()
- if are_all_object_dtype_cols:
- rows = (
- dict(zip(columns, row)) for row in df.itertuples(index=False, name=None)
- )
- return [
- into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
- ]
- else:
- data = [
- into_c(zip(columns, t)) for t in df.itertuples(index=False, name=None)
- ]
- if box_native_indices:
- object_dtype_indices_as_set = set(box_native_indices)
- object_dtype_cols = {
- col
- for i, col in enumerate(df.columns)
- if i in object_dtype_indices_as_set
- }
- for row in data:
- for col in object_dtype_cols:
- row[col] = maybe_box_native(row[col])
- return data
-
- elif orient == "index":
- if not df.index.is_unique:
- raise ValueError("DataFrame index must be unique for orient='index'.")
- columns = df.columns.tolist()
- if are_all_object_dtype_cols:
- return into_c(
- (t[0], dict(zip(df.columns, map(maybe_box_native, t[1:]))))
- for t in df.itertuples(name=None)
- )
- elif box_native_indices:
- object_dtype_indices_as_set = set(box_native_indices)
- is_object_dtype_by_index = [
- i in object_dtype_indices_as_set for i in range(len(df.columns))
- ]
- return into_c(
- (
- t[0],
- {
- columns[i]: maybe_box_native(v)
- if is_object_dtype_by_index[i]
- else v
- for i, v in enumerate(t[1:])
- },
- )
- for t in df.itertuples(name=None)
- )
- else:
- return into_c(
- (t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None)
- )
-
- else:
- raise ValueError(f"orient '{orient}' not understood")
diff --git a/contrib/python/pandas/py3/pandas/core/missing.py b/contrib/python/pandas/py3/pandas/core/missing.py
deleted file mode 100644
index ff307b59273..00000000000
--- a/contrib/python/pandas/py3/pandas/core/missing.py
+++ /dev/null
@@ -1,1030 +0,0 @@
-"""
-Routines for filling missing data.
-"""
-from __future__ import annotations
-
-from functools import (
- partial,
- wraps,
-)
-from typing import (
- TYPE_CHECKING,
- Any,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import (
- NaT,
- algos,
- lib,
-)
-from pandas._typing import (
- ArrayLike,
- Axis,
- AxisInt,
- F,
- npt,
-)
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.dtypes.cast import infer_dtype_from
-from pandas.core.dtypes.common import (
- is_array_like,
- is_numeric_v_string_like,
- is_object_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.missing import (
- is_valid_na_for_dtype,
- isna,
- na_value_for_dtype,
-)
-
-if TYPE_CHECKING:
- from pandas import Index
-
-
-def check_value_size(value, mask: npt.NDArray[np.bool_], length: int):
- """
- Validate the size of the values passed to ExtensionArray.fillna.
- """
- if is_array_like(value):
- if len(value) != length:
- raise ValueError(
- f"Length of 'value' does not match. Got ({len(value)}) "
- f" expected {length}"
- )
- value = value[mask]
-
- return value
-
-
-def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]:
- """
- Return a boolean masking array of the same size/shape as arr,
- with entries set to True where they equal any member of values_to_mask.
-
- Parameters
- ----------
- arr : ArrayLike
- values_to_mask: list, tuple, or scalar
-
- Returns
- -------
- np.ndarray[bool]
- """
- # When called from Block.replace/replace_list, values_to_mask is a scalar
- # known to be holdable by arr.
- # When called from Series._single_replace, values_to_mask is tuple or list
- dtype, values_to_mask = infer_dtype_from(values_to_mask)
- # error: Argument "dtype" to "array" has incompatible type "Union[dtype[Any],
- # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
- # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
- # _DTypeDict, Tuple[Any, Any]]]"
- values_to_mask = np.array(values_to_mask, dtype=dtype) # type: ignore[arg-type]
-
- potential_na = False
- if is_object_dtype(arr):
- # pre-compute mask to avoid comparison to NA
- potential_na = True
- arr_mask = ~isna(arr)
-
- na_mask = isna(values_to_mask)
- nonna = values_to_mask[~na_mask]
-
- # GH 21977
- mask = np.zeros(arr.shape, dtype=bool)
- for x in nonna:
- if is_numeric_v_string_like(arr, x):
- # GH#29553 prevent numpy deprecation warnings
- pass
- else:
- if potential_na:
- new_mask = np.zeros(arr.shape, dtype=np.bool_)
- new_mask[arr_mask] = arr[arr_mask] == x
- else:
- new_mask = arr == x
-
- if not isinstance(new_mask, np.ndarray):
- # usually BooleanArray
- new_mask = new_mask.to_numpy(dtype=bool, na_value=False)
- mask |= new_mask
-
- if na_mask.any():
- mask |= isna(arr)
-
- return mask
-
-
-def clean_fill_method(method: str | None, allow_nearest: bool = False):
- # asfreq is compat for resampling
- if method in [None, "asfreq"]:
- return None
-
- if isinstance(method, str):
- method = method.lower()
- if method == "ffill":
- method = "pad"
- elif method == "bfill":
- method = "backfill"
-
- valid_methods = ["pad", "backfill"]
- expecting = "pad (ffill) or backfill (bfill)"
- if allow_nearest:
- valid_methods.append("nearest")
- expecting = "pad (ffill), backfill (bfill) or nearest"
- if method not in valid_methods:
- raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}")
- return method
-
-
-# interpolation methods that dispatch to np.interp
-
-NP_METHODS = ["linear", "time", "index", "values"]
-
-# interpolation methods that dispatch to _interpolate_scipy_wrapper
-
-SP_METHODS = [
- "nearest",
- "zero",
- "slinear",
- "quadratic",
- "cubic",
- "barycentric",
- "krogh",
- "spline",
- "polynomial",
- "from_derivatives",
- "piecewise_polynomial",
- "pchip",
- "akima",
- "cubicspline",
-]
-
-
-def clean_interp_method(method: str, index: Index, **kwargs) -> str:
- order = kwargs.get("order")
-
- if method in ("spline", "polynomial") and order is None:
- raise ValueError("You must specify the order of the spline or polynomial.")
-
- valid = NP_METHODS + SP_METHODS
- if method not in valid:
- raise ValueError(f"method must be one of {valid}. Got '{method}' instead.")
-
- if method in ("krogh", "piecewise_polynomial", "pchip"):
- if not index.is_monotonic_increasing:
- raise ValueError(
- f"{method} interpolation requires that the index be monotonic."
- )
-
- return method
-
-
-def find_valid_index(
- values, *, how: str, is_valid: npt.NDArray[np.bool_]
-) -> int | None:
- """
- Retrieves the index of the first valid value.
-
- Parameters
- ----------
- values : ndarray or ExtensionArray
- how : {'first', 'last'}
- Use this parameter to change between the first or last valid index.
- is_valid: np.ndarray
- Mask to find na_values.
-
- Returns
- -------
- int or None
- """
- assert how in ["first", "last"]
-
- if len(values) == 0: # early stop
- return None
-
- if values.ndim == 2:
- is_valid = is_valid.any(axis=1) # reduce axis 1
-
- if how == "first":
- idxpos = is_valid[::].argmax()
-
- elif how == "last":
- idxpos = len(values) - 1 - is_valid[::-1].argmax()
-
- chk_notna = is_valid[idxpos]
-
- if not chk_notna:
- return None
- # Incompatible return value type (got "signedinteger[Any]",
- # expected "Optional[int]")
- return idxpos # type: ignore[return-value]
-
-
-def interpolate_array_2d(
- data: np.ndarray,
- method: str = "pad",
- axis: AxisInt = 0,
- index: Index | None = None,
- limit: int | None = None,
- limit_direction: str = "forward",
- limit_area: str | None = None,
- fill_value: Any | None = None,
- coerce: bool = False,
- downcast: str | None = None,
- **kwargs,
-) -> None:
- """
- Wrapper to dispatch to either interpolate_2d or _interpolate_2d_with_fill.
-
- Notes
- -----
- Alters 'data' in-place.
- """
- try:
- m = clean_fill_method(method)
- except ValueError:
- m = None
-
- if m is not None:
- if fill_value is not None:
- # similar to validate_fillna_kwargs
- raise ValueError("Cannot pass both fill_value and method")
-
- interpolate_2d(
- data,
- method=m,
- axis=axis,
- limit=limit,
- limit_area=limit_area,
- )
- else:
- assert index is not None # for mypy
-
- _interpolate_2d_with_fill(
- data=data,
- index=index,
- axis=axis,
- method=method,
- limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- fill_value=fill_value,
- **kwargs,
- )
-
-
-def _interpolate_2d_with_fill(
- data: np.ndarray, # floating dtype
- index: Index,
- axis: AxisInt,
- method: str = "linear",
- limit: int | None = None,
- limit_direction: str = "forward",
- limit_area: str | None = None,
- fill_value: Any | None = None,
- **kwargs,
-) -> None:
- """
- Column-wise application of _interpolate_1d.
-
- Notes
- -----
- Alters 'data' in-place.
-
- The signature does differ from _interpolate_1d because it only
- includes what is needed for Block.interpolate.
- """
- # validate the interp method
- clean_interp_method(method, index, **kwargs)
-
- if is_valid_na_for_dtype(fill_value, data.dtype):
- fill_value = na_value_for_dtype(data.dtype, compat=False)
-
- if method == "time":
- if not needs_i8_conversion(index.dtype):
- raise ValueError(
- "time-weighted interpolation only works "
- "on Series or DataFrames with a "
- "DatetimeIndex"
- )
- method = "values"
-
- valid_limit_directions = ["forward", "backward", "both"]
- limit_direction = limit_direction.lower()
- if limit_direction not in valid_limit_directions:
- raise ValueError(
- "Invalid limit_direction: expecting one of "
- f"{valid_limit_directions}, got '{limit_direction}'."
- )
-
- if limit_area is not None:
- valid_limit_areas = ["inside", "outside"]
- limit_area = limit_area.lower()
- if limit_area not in valid_limit_areas:
- raise ValueError(
- f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
- f"{limit_area}."
- )
-
- # default limit is unlimited GH #16282
- limit = algos.validate_limit(nobs=None, limit=limit)
-
- indices = _index_to_interp_indices(index, method)
-
- def func(yvalues: np.ndarray) -> None:
- # process 1-d slices in the axis direction
-
- _interpolate_1d(
- indices=indices,
- yvalues=yvalues,
- method=method,
- limit=limit,
- limit_direction=limit_direction,
- limit_area=limit_area,
- fill_value=fill_value,
- bounds_error=False,
- **kwargs,
- )
-
- # error: Argument 1 to "apply_along_axis" has incompatible type
- # "Callable[[ndarray[Any, Any]], None]"; expected "Callable[...,
- # Union[_SupportsArray[dtype[<nothing>]], Sequence[_SupportsArray
- # [dtype[<nothing>]]], Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]],
- # Sequence[Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]]],
- # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]]]]]]"
- np.apply_along_axis(func, axis, data) # type: ignore[arg-type]
-
-
-def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
- """
- Convert Index to ndarray of indices to pass to NumPy/SciPy.
- """
- xarr = index._values
- if needs_i8_conversion(xarr.dtype):
- # GH#1646 for dt64tz
- xarr = xarr.view("i8")
-
- if method == "linear":
- inds = xarr
- inds = cast(np.ndarray, inds)
- else:
- inds = np.asarray(xarr)
-
- if method in ("values", "index"):
- if inds.dtype == np.object_:
- inds = lib.maybe_convert_objects(inds)
-
- return inds
-
-
-def _interpolate_1d(
- indices: np.ndarray,
- yvalues: np.ndarray,
- method: str | None = "linear",
- limit: int | None = None,
- limit_direction: str = "forward",
- limit_area: str | None = None,
- fill_value: Any | None = None,
- bounds_error: bool = False,
- order: int | None = None,
- **kwargs,
-) -> None:
- """
- Logic for the 1-d interpolation. The input
- indices and yvalues will each be 1-d arrays of the same length.
-
- Bounds_error is currently hardcoded to False since non-scipy ones don't
- take it as an argument.
-
- Notes
- -----
- Fills 'yvalues' in-place.
- """
-
- invalid = isna(yvalues)
- valid = ~invalid
-
- if not valid.any():
- return
-
- if valid.all():
- return
-
- # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
- all_nans = set(np.flatnonzero(invalid))
-
- first_valid_index = find_valid_index(yvalues, how="first", is_valid=valid)
- if first_valid_index is None: # no nan found in start
- first_valid_index = 0
- start_nans = set(range(first_valid_index))
-
- last_valid_index = find_valid_index(yvalues, how="last", is_valid=valid)
- if last_valid_index is None: # no nan found in end
- last_valid_index = len(yvalues)
- end_nans = set(range(1 + last_valid_index, len(valid)))
-
- # Like the sets above, preserve_nans contains indices of invalid values,
- # but in this case, it is the final set of indices that need to be
- # preserved as NaN after the interpolation.
-
- # For example if limit_direction='forward' then preserve_nans will
- # contain indices of NaNs at the beginning of the series, and NaNs that
- # are more than 'limit' away from the prior non-NaN.
-
- # set preserve_nans based on direction using _interp_limit
- preserve_nans: list | set
- if limit_direction == "forward":
- preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
- elif limit_direction == "backward":
- preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
- else:
- # both directions... just use _interp_limit
- preserve_nans = set(_interp_limit(invalid, limit, limit))
-
- # if limit_area is set, add either mid or outside indices
- # to preserve_nans GH #16284
- if limit_area == "inside":
- # preserve NaNs on the outside
- preserve_nans |= start_nans | end_nans
- elif limit_area == "outside":
- # preserve NaNs on the inside
- mid_nans = all_nans - start_nans - end_nans
- preserve_nans |= mid_nans
-
- # sort preserve_nans and convert to list
- preserve_nans = sorted(preserve_nans)
-
- is_datetimelike = needs_i8_conversion(yvalues.dtype)
-
- if is_datetimelike:
- yvalues = yvalues.view("i8")
-
- if method in NP_METHODS:
- # np.interp requires sorted X values, #21037
-
- indexer = np.argsort(indices[valid])
- yvalues[invalid] = np.interp(
- indices[invalid], indices[valid][indexer], yvalues[valid][indexer]
- )
- else:
- yvalues[invalid] = _interpolate_scipy_wrapper(
- indices[valid],
- yvalues[valid],
- indices[invalid],
- method=method,
- fill_value=fill_value,
- bounds_error=bounds_error,
- order=order,
- **kwargs,
- )
-
- if is_datetimelike:
- yvalues[preserve_nans] = NaT.value
- else:
- yvalues[preserve_nans] = np.nan
- return
-
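
In the common case with a NumPy method and no limits, the core of _interpolate_1d above is just np.interp against positional indices. A tiny standalone illustration (no limit or limit_area handling):

import numpy as np

y = np.array([0.0, np.nan, np.nan, 3.0, np.nan, 5.0])
x = np.arange(len(y))
invalid = np.isnan(y)
y[invalid] = np.interp(x[invalid], x[~invalid], y[~invalid])
print(y)  # [0. 1. 2. 3. 4. 5.]
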
-
-def _interpolate_scipy_wrapper(
- x,
- y,
- new_x,
- method,
- fill_value=None,
- bounds_error: bool = False,
- order=None,
- **kwargs,
-):
- """
- Passed off to scipy.interpolate.interp1d. method is scipy's kind.
- Returns an array interpolated at new_x. Add any new methods to
- the list in _clean_interp_method.
- """
- extra = f"{method} interpolation requires SciPy."
- import_optional_dependency("scipy", extra=extra)
- from scipy import interpolate
-
- new_x = np.asarray(new_x)
-
- # ignores some kwargs that could be passed along.
- alt_methods = {
- "barycentric": interpolate.barycentric_interpolate,
- "krogh": interpolate.krogh_interpolate,
- "from_derivatives": _from_derivatives,
- "piecewise_polynomial": _from_derivatives,
- }
-
- if getattr(x, "_is_all_dates", False):
- # GH 5975, scipy.interp1d can't handle datetime64s
- x, new_x = x._values.astype("i8"), new_x.astype("i8")
-
- if method == "pchip":
- alt_methods["pchip"] = interpolate.pchip_interpolate
- elif method == "akima":
- alt_methods["akima"] = _akima_interpolate
- elif method == "cubicspline":
- alt_methods["cubicspline"] = _cubicspline_interpolate
-
- interp1d_methods = [
- "nearest",
- "zero",
- "slinear",
- "quadratic",
- "cubic",
- "polynomial",
- ]
- if method in interp1d_methods:
- if method == "polynomial":
- method = order
- terp = interpolate.interp1d(
- x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error
- )
- new_y = terp(new_x)
- elif method == "spline":
- # GH #10633, #24014
- if isna(order) or (order <= 0):
- raise ValueError(
- f"order needs to be specified and greater than 0; got order: {order}"
- )
- terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
- new_y = terp(new_x)
- else:
- # GH 7295: need to be able to write for some reason
- # in some circumstances: check all three
- if not x.flags.writeable:
- x = x.copy()
- if not y.flags.writeable:
- y = y.copy()
- if not new_x.flags.writeable:
- new_x = new_x.copy()
- method = alt_methods[method]
- new_y = method(x, y, new_x, **kwargs)
- return new_y
-
-
-def _from_derivatives(
- xi, yi, x, order=None, der: int | list[int] | None = 0, extrapolate: bool = False
-):
- """
- Convenience function for interpolate.BPoly.from_derivatives.
-
- Construct a piecewise polynomial in the Bernstein basis, compatible
- with the specified values and derivatives at breakpoints.
-
- Parameters
- ----------
- xi : array-like
- sorted 1D array of x-coordinates
- yi : array-like or list of array-likes
- yi[i][j] is the j-th derivative known at xi[i]
- order: None or int or array-like of ints. Default: None.
- Specifies the degree of local polynomials. If not None, some
- derivatives are ignored.
- der : int or list
- How many derivatives to extract; None for all potentially nonzero
- derivatives (that is a number equal to the number of points), or a
- list of derivatives to extract. This number includes the function
- value as 0th derivative.
- extrapolate : bool, optional
-     Whether to extrapolate to out-of-bounds points based on first and last
-     intervals, or to return NaNs. Default: False.
-
- See Also
- --------
- scipy.interpolate.BPoly.from_derivatives
-
- Returns
- -------
- y : scalar or array-like
- The result, of length R or length M or M by R.
- """
- from scipy import interpolate
-
- # return the method for compat with scipy version & backwards compat
- method = interpolate.BPoly.from_derivatives
- m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)
-
- return m(x)
-
-
-def _akima_interpolate(xi, yi, x, der: int | list[int] | None = 0, axis: AxisInt = 0):
- """
- Convenience function for akima interpolation.
- xi and yi are arrays of values used to approximate some function f,
- with ``yi = f(xi)``.
-
- See `Akima1DInterpolator` for details.
-
- Parameters
- ----------
- xi : array-like
- A sorted list of x-coordinates, of length N.
- yi : array-like
- A 1-D array of real values. `yi`'s length along the interpolation
- axis must be equal to the length of `xi`. If N-D array, use axis
- parameter to select correct axis.
- x : scalar or array-like
- Of length M.
- der : int, optional
- How many derivatives to extract; None for all potentially
- nonzero derivatives (that is a number equal to the number
- of points), or a list of derivatives to extract. This number
- includes the function value as 0th derivative.
- axis : int, optional
- Axis in the yi array corresponding to the x-coordinate values.
-
- See Also
- --------
- scipy.interpolate.Akima1DInterpolator
-
- Returns
- -------
- y : scalar or array-like
- The result, of length R or length M or M by R,
-
- """
- from scipy import interpolate
-
- P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
-
- return P(x, nu=der)
-
-
-def _cubicspline_interpolate(
- xi,
- yi,
- x,
- axis: AxisInt = 0,
- bc_type: str | tuple[Any, Any] = "not-a-knot",
- extrapolate=None,
-):
- """
- Convenience function for cubic spline data interpolator.
-
- See `scipy.interpolate.CubicSpline` for details.
-
- Parameters
- ----------
- xi : array-like, shape (n,)
- 1-d array containing values of the independent variable.
- Values must be real, finite and in strictly increasing order.
- yi : array-like
- Array containing values of the dependent variable. It can have
- arbitrary number of dimensions, but the length along ``axis``
- (see below) must match the length of ``x``. Values must be finite.
- x : scalar or array-like, shape (m,)
- axis : int, optional
- Axis along which `y` is assumed to be varying. Meaning that for
- ``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``.
- Default is 0.
- bc_type : string or 2-tuple, optional
- Boundary condition type. Two additional equations, given by the
- boundary conditions, are required to determine all coefficients of
- polynomials on each segment [2]_.
- If `bc_type` is a string, then the specified condition will be applied
- at both ends of a spline. Available conditions are:
- * 'not-a-knot' (default): The first and second segment at a curve end
- are the same polynomial. It is a good default when there is no
- information on boundary conditions.
- * 'periodic': The interpolated function is assumed to be periodic
- of period ``x[-1] - x[0]``. The first and last value of `y` must be
- identical: ``y[0] == y[-1]``. This boundary condition will result in
- ``y'[0] == y'[-1]`` and ``y''[0] == y''[-1]``.
- * 'clamped': The first derivative at curve ends is zero. Assuming
- a 1D `y`, ``bc_type=((1, 0.0), (1, 0.0))`` is the same condition.
- * 'natural': The second derivative at curve ends is zero. Assuming
- a 1D `y`, ``bc_type=((2, 0.0), (2, 0.0))`` is the same condition.
- If `bc_type` is a 2-tuple, the first and the second value will be
- applied at the curve start and end respectively. The tuple values can
- be one of the previously mentioned strings (except 'periodic') or a
- tuple `(order, deriv_values)` allowing to specify arbitrary
- derivatives at curve ends:
- * `order`: the derivative order, 1 or 2.
- * `deriv_value`: array-like containing derivative values, shape must
- be the same as `y`, excluding ``axis`` dimension. For example, if
- `y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with
- the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D
- and have the shape (n0, n1).
- extrapolate : {bool, 'periodic', None}, optional
- If bool, determines whether to extrapolate to out-of-bounds points
- based on first and last intervals, or to return NaNs. If 'periodic',
- periodic extrapolation is used. If None (default), ``extrapolate`` is
- set to 'periodic' for ``bc_type='periodic'`` and to True otherwise.
-
- See Also
- --------
- scipy.interpolate.CubicHermiteSpline
-
- Returns
- -------
- y : scalar or array-like
- The result, of shape (m,)
-
- References
- ----------
- .. [1] `Cubic Spline Interpolation
- <https://en.wikiversity.org/wiki/Cubic_Spline_Interpolation>`_
- on Wikiversity.
- .. [2] Carl de Boor, "A Practical Guide to Splines", Springer-Verlag, 1978.
- """
- from scipy import interpolate
-
- P = interpolate.CubicSpline(
- xi, yi, axis=axis, bc_type=bc_type, extrapolate=extrapolate
- )
-
- return P(x)
-
-
-def _interpolate_with_limit_area(
- values: np.ndarray, method: str, limit: int | None, limit_area: str | None
-) -> None:
- """
- Apply interpolation and limit_area logic to values along a to-be-specified axis.
-
- Parameters
- ----------
- values: np.ndarray
- Input array.
- method: str
- Interpolation method. Could be "bfill" or "pad"
- limit: int, optional
- Index limit on interpolation.
- limit_area: str
- Limit area for interpolation. Can be "inside" or "outside"
-
- Notes
- -----
- Modifies values in-place.
- """
-
- invalid = isna(values)
- is_valid = ~invalid
-
- if not invalid.all():
- first = find_valid_index(values, how="first", is_valid=is_valid)
- if first is None:
- first = 0
- last = find_valid_index(values, how="last", is_valid=is_valid)
- if last is None:
- last = len(values)
-
- interpolate_2d(
- values,
- method=method,
- limit=limit,
- )
-
- if limit_area == "inside":
- invalid[first : last + 1] = False
- elif limit_area == "outside":
- invalid[:first] = invalid[last + 1 :] = False
-
- values[invalid] = np.nan
-
-
-def interpolate_2d(
- values: np.ndarray,
- method: str = "pad",
- axis: Axis = 0,
- limit: int | None = None,
- limit_area: str | None = None,
-) -> None:
- """
- Perform the actual interpolation of values; values will be made 2-d if
- needed and filled in place.
-
- Parameters
- ----------
- values: np.ndarray
- Input array.
- method: str, default "pad"
- Interpolation method. Could be "bfill" or "pad"
- axis: 0 or 1
- Interpolation axis
- limit: int, optional
- Index limit on interpolation.
- limit_area: str, optional
- Limit area for interpolation. Can be "inside" or "outside"
-
- Notes
- -----
- Modifies values in-place.
- """
- if limit_area is not None:
- np.apply_along_axis(
- # error: Argument 1 to "apply_along_axis" has incompatible type
- # "partial[None]"; expected
- # "Callable[..., Union[_SupportsArray[dtype[<nothing>]],
- # Sequence[_SupportsArray[dtype[<nothing>]]],
- # Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]],
- # Sequence[Sequence[Sequence[_SupportsArray[dtype[<nothing>]]]]],
- # Sequence[Sequence[Sequence[Sequence[_
- # SupportsArray[dtype[<nothing>]]]]]]]]"
- partial( # type: ignore[arg-type]
- _interpolate_with_limit_area,
- method=method,
- limit=limit,
- limit_area=limit_area,
- ),
- # error: Argument 2 to "apply_along_axis" has incompatible type
- # "Union[str, int]"; expected "SupportsIndex"
- axis, # type: ignore[arg-type]
- values,
- )
- return
-
- transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
-
- # reshape a 1 dim if needed
- if values.ndim == 1:
- if axis != 0: # pragma: no cover
- raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
- values = values.reshape(tuple((1,) + values.shape))
-
- method = clean_fill_method(method)
- tvalues = transf(values)
-
- # _pad_2d and _backfill_2d both modify tvalues inplace
- if method == "pad":
- _pad_2d(tvalues, limit=limit)
- else:
- _backfill_2d(tvalues, limit=limit)
-
- return
-
-
-def _fillna_prep(
- values, mask: npt.NDArray[np.bool_] | None = None
-) -> npt.NDArray[np.bool_]:
- # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d
-
- if mask is None:
- mask = isna(values)
-
- mask = mask.view(np.uint8)
- return mask
-
-
-def _datetimelike_compat(func: F) -> F:
- """
- Wrapper to handle datetime64 and timedelta64 dtypes.
- """
-
- @wraps(func)
- def new_func(values, limit=None, mask=None):
- if needs_i8_conversion(values.dtype):
- if mask is None:
- # This needs to occur before casting to int64
- mask = isna(values)
-
- result, mask = func(values.view("i8"), limit=limit, mask=mask)
- return result.view(values.dtype), mask
-
- return func(values, limit=limit, mask=mask)
-
- return cast(F, new_func)
-
-
-@_datetimelike_compat
-def _pad_1d(
- values: np.ndarray,
- limit: int | None = None,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
- mask = _fillna_prep(values, mask)
- algos.pad_inplace(values, mask, limit=limit)
- return values, mask
-
-
-@_datetimelike_compat
-def _backfill_1d(
- values: np.ndarray,
- limit: int | None = None,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
- mask = _fillna_prep(values, mask)
- algos.backfill_inplace(values, mask, limit=limit)
- return values, mask
-
-
-@_datetimelike_compat
-def _pad_2d(values: np.ndarray, limit=None, mask: npt.NDArray[np.bool_] | None = None):
- mask = _fillna_prep(values, mask)
-
- if np.all(values.shape):
- algos.pad_2d_inplace(values, mask, limit=limit)
- else:
- # for test coverage
- pass
- return values, mask
-
-
-@_datetimelike_compat
-def _backfill_2d(values, limit=None, mask: npt.NDArray[np.bool_] | None = None):
- mask = _fillna_prep(values, mask)
-
- if np.all(values.shape):
- algos.backfill_2d_inplace(values, mask, limit=limit)
- else:
- # for test coverage
- pass
- return values, mask
-
-
-_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
-
-
-def get_fill_func(method, ndim: int = 1):
- method = clean_fill_method(method)
- if ndim == 1:
- return _fill_methods[method]
- return {"pad": _pad_2d, "backfill": _backfill_2d}[method]
-
-
-def clean_reindex_fill_method(method) -> str | None:
- return clean_fill_method(method, allow_nearest=True)
-
-
-def _interp_limit(invalid: npt.NDArray[np.bool_], fw_limit, bw_limit):
- """
- Get indexers of values that won't be filled
- because they exceed the limits.
-
- Parameters
- ----------
- invalid : np.ndarray[bool]
- fw_limit : int or None
- forward limit to index
- bw_limit : int or None
- backward limit to index
-
- Returns
- -------
- set of indexers
-
- Notes
- -----
- This is equivalent to the more readable, but slower
-
- .. code-block:: python
-
- def _interp_limit(invalid, fw_limit, bw_limit):
- for x in np.where(invalid)[0]:
- if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
- yield x
- """
- # handle forward first; the backward direction is the same except
- # 1. operate on the reversed array
- # 2. subtract the returned indices from N - 1
- N = len(invalid)
- f_idx = set()
- b_idx = set()
-
- def inner(invalid, limit):
- limit = min(limit, N)
- windowed = _rolling_window(invalid, limit + 1).all(1)
- idx = set(np.where(windowed)[0] + limit) | set(
- np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
- )
- return idx
-
- if fw_limit is not None:
- if fw_limit == 0:
- f_idx = set(np.where(invalid)[0])
- else:
- f_idx = inner(invalid, fw_limit)
-
- if bw_limit is not None:
- if bw_limit == 0:
- # then we don't even need to care about backwards
- # just use forwards
- return f_idx
- else:
- b_idx_inv = list(inner(invalid[::-1], bw_limit))
- b_idx = set(N - 1 - np.asarray(b_idx_inv))
- if fw_limit == 0:
- return b_idx
-
- return f_idx & b_idx
-
-
-def _rolling_window(a: npt.NDArray[np.bool_], window: int) -> npt.NDArray[np.bool_]:
- """
- [True, True, False, True, False], 2 ->
-
- [
- [True, True],
- [True, False],
- [False, True],
- [True, False],
- ]
- """
- # https://stackoverflow.com/a/6811241
- shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
- strides = a.strides + (a.strides[-1],)
- return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
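
The strided trick above has a friendlier modern equivalent. A self-contained sketch using sliding_window_view (assumes NumPy 1.20+), reproducing the example from the docstring:

import numpy as np

def rolling_window(a: np.ndarray, window: int) -> np.ndarray:
    # Same result as the as_strided recipe above, without manual stride math.
    return np.lib.stride_tricks.sliding_window_view(a, window)

mask = np.array([True, True, False, True, False])
print(rolling_window(mask, 2))
# [[ True  True]
#  [ True False]
#  [False  True]
#  [ True False]]
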
diff --git a/contrib/python/pandas/py3/pandas/core/nanops.py b/contrib/python/pandas/py3/pandas/core/nanops.py
deleted file mode 100644
index 60c0d04ef28..00000000000
--- a/contrib/python/pandas/py3/pandas/core/nanops.py
+++ /dev/null
@@ -1,1767 +0,0 @@
-from __future__ import annotations
-
-import functools
-import itertools
-import operator
-from typing import (
- Any,
- Callable,
- cast,
-)
-import warnings
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import (
- NaT,
- NaTType,
- iNaT,
- lib,
-)
-from pandas._typing import (
- ArrayLike,
- AxisInt,
- CorrelationMethod,
- Dtype,
- DtypeObj,
- F,
- Scalar,
- Shape,
- npt,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- is_any_int_dtype,
- is_bool_dtype,
- is_complex,
- is_datetime64_any_dtype,
- is_float,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- is_timedelta64_dtype,
- needs_i8_conversion,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import PeriodDtype
-from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
- notna,
-)
-
-from pandas.core.construction import extract_array
-
-bn = import_optional_dependency("bottleneck", errors="warn")
-_BOTTLENECK_INSTALLED = bn is not None
-_USE_BOTTLENECK = False
-
-
-def set_use_bottleneck(v: bool = True) -> None:
- # set/unset to use bottleneck
- global _USE_BOTTLENECK
- if _BOTTLENECK_INSTALLED:
- _USE_BOTTLENECK = v
-
-
-set_use_bottleneck(get_option("compute.use_bottleneck"))
-
-
-class disallow:
- def __init__(self, *dtypes: Dtype) -> None:
- super().__init__()
- self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
-
- def check(self, obj) -> bool:
- return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)
-
- def __call__(self, f: F) -> F:
- @functools.wraps(f)
- def _f(*args, **kwargs):
- obj_iter = itertools.chain(args, kwargs.values())
- if any(self.check(obj) for obj in obj_iter):
- f_name = f.__name__.replace("nan", "")
- raise TypeError(
- f"reduction operation '{f_name}' not allowed for this dtype"
- )
- try:
- with np.errstate(invalid="ignore"):
- return f(*args, **kwargs)
- except ValueError as e:
- # we want to transform an object array
- # ValueError message to the more typical TypeError
- # e.g. this is normally a disallowed function on
- # object arrays that contain strings
- if is_object_dtype(args[0]):
- raise TypeError(e) from e
- raise
-
- return cast(F, _f)
-
-
-class bottleneck_switch:
- def __init__(self, name=None, **kwargs) -> None:
- self.name = name
- self.kwargs = kwargs
-
- def __call__(self, alt: F) -> F:
- bn_name = self.name or alt.__name__
-
- try:
- bn_func = getattr(bn, bn_name)
- except (AttributeError, NameError): # pragma: no cover
- bn_func = None
-
- @functools.wraps(alt)
- def f(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- **kwds,
- ):
- if len(self.kwargs) > 0:
- for k, v in self.kwargs.items():
- if k not in kwds:
- kwds[k] = v
-
- if values.size == 0 and kwds.get("min_count") is None:
- # We are empty, returning NA for our type
- # Only applies for the default `min_count` of None
- # since that affects how empty arrays are handled.
- # TODO(GH-18976) update all the nanops methods to
- # correctly handle empty inputs and remove this check.
- # It *may* just be `var`
- return _na_for_min_count(values, axis)
-
- if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
- if kwds.get("mask", None) is None:
- # `mask` is not recognised by bottleneck, would raise
- # TypeError if called
- kwds.pop("mask", None)
- result = bn_func(values, axis=axis, **kwds)
-
- # prefer to treat inf/-inf as NA, but must compute the func
- # twice :(
- if _has_infs(result):
- result = alt(values, axis=axis, skipna=skipna, **kwds)
- else:
- result = alt(values, axis=axis, skipna=skipna, **kwds)
- else:
- result = alt(values, axis=axis, skipna=skipna, **kwds)
-
- return result
-
- return cast(F, f)
-
-
-def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
- # Bottleneck chokes on datetime64, PeriodDtype (or any EA)
- if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
- # GH 42878
- # Bottleneck uses naive summation leading to O(n) loss of precision
- # unlike numpy which implements pairwise summation, which has O(log(n)) loss
- # crossref: https://github.com/pydata/bottleneck/issues/379
-
- # GH 15507
- # bottleneck does not properly upcast during the sum
- # so can overflow
-
- # GH 9422
- # further we also want to preserve NaN when all elements
- # are NaN, unlike bottleneck/numpy which consider this
- # to be 0
- return name not in ["nansum", "nanprod", "nanmean"]
- return False
-
-
-def _has_infs(result) -> bool:
- if isinstance(result, np.ndarray):
- if result.dtype in ("f8", "f4"):
- # Note: outside of a nanops-specific test, we always have
- # result.ndim == 1, so there is no risk of this ravel making a copy.
- return lib.has_infs(result.ravel("K"))
- try:
- return np.isinf(result).any()
- except (TypeError, NotImplementedError):
- # if it doesn't support infs, then it can't have infs
- return False
-
-
-def _get_fill_value(
- dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None
-):
- """return the correct fill value for the dtype of the values"""
- if fill_value is not None:
- return fill_value
- if _na_ok_dtype(dtype):
- if fill_value_typ is None:
- return np.nan
- else:
- if fill_value_typ == "+inf":
- return np.inf
- else:
- return -np.inf
- else:
- if fill_value_typ == "+inf":
- # need the max int here
- return lib.i8max
- else:
- return iNaT
-
-
-def _maybe_get_mask(
- values: np.ndarray, skipna: bool, mask: npt.NDArray[np.bool_] | None
-) -> npt.NDArray[np.bool_] | None:
- """
- Compute a mask if and only if necessary.
-
- A mask is never necessary if the values array is of boolean or integer
- dtype, as these cannot hold NaN; in that case None is returned. For any
- other dtype, a mask is computed with isna() when skipna is True or when
- the dtype needs i8 conversion (datetime64/timedelta64). Otherwise the
- provided mask (possibly None) is returned unchanged.
-
- Parameters
- ----------
- values : ndarray
- input array to potentially compute mask for
- skipna : bool
- boolean for whether NaNs should be skipped
- mask : Optional[ndarray]
- nan-mask if known
-
- Returns
- -------
- Optional[np.ndarray[bool]]
- """
- if mask is None:
- if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
- # Boolean data cannot contain nulls, so signal via mask being None
- return None
-
- if skipna or needs_i8_conversion(values.dtype):
- mask = isna(values)
-
- return mask
-
-
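-# Illustrative sketch of the masking rules above (arbitrary example values):
-# integer/boolean arrays cannot hold NaN, so no mask is materialized, while a
-# float array gets an isna() mask when skipna is True.
-#
-# >>> _maybe_get_mask(np.array([1, 2, 3]), skipna=True, mask=None) is None
-# True
-# >>> _maybe_get_mask(np.array([1.0, np.nan]), skipna=True, mask=None)
-# array([False,  True])
-
-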
-def _get_values(
- values: np.ndarray,
- skipna: bool,
- fill_value: Any = None,
- fill_value_typ: str | None = None,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None, np.dtype, np.dtype, Any]:
- """
- Utility to get the values view, mask, dtype, dtype_max, and fill_value.
-
- If both mask and fill_value/fill_value_typ are not None and skipna is True,
- the values array will be copied.
-
- For input arrays of boolean or integer dtypes, copies will only occur if a
- precomputed mask, a fill_value/fill_value_typ, and skipna=True are
- provided.
-
- Parameters
- ----------
- values : ndarray
- input array to potentially compute mask for
- skipna : bool
- boolean for whether NaNs should be skipped
- fill_value : Any
- value to fill NaNs with
- fill_value_typ : str
- Set to '+inf' or '-inf' to handle dtype-specific infinities
- mask : Optional[np.ndarray[bool]]
- nan-mask if known
-
- Returns
- -------
- values : ndarray
- Potential copy of input value array
- mask : Optional[ndarray[bool]]
- Mask for values, if deemed necessary to compute
- dtype : np.dtype
- dtype for values
- dtype_max : np.dtype
- platform independent dtype
- fill_value : Any
- fill value used
- """
- # _get_values is only called from within nanops, and in all cases with a
- # scalar fill_value. This guarantee is important for the np.where call below
- assert is_scalar(fill_value)
- # error: Incompatible types in assignment (expression has type "Union[Any,
- # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
- values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
-
- mask = _maybe_get_mask(values, skipna, mask)
-
- dtype = values.dtype
-
- datetimelike = False
- if needs_i8_conversion(values.dtype):
- # changing timedelta64/datetime64 to int64 needs to happen after
- # finding `mask` above
- values = np.asarray(values.view("i8"))
- datetimelike = True
-
- dtype_ok = _na_ok_dtype(dtype)
-
- # get our fill value (in case we need to provide an alternative
- # dtype for it)
- fill_value = _get_fill_value(
- dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
- )
-
- if skipna and (mask is not None) and (fill_value is not None):
- if mask.any():
- if dtype_ok or datetimelike:
- values = values.copy()
- np.putmask(values, mask, fill_value)
- else:
- # np.where will promote if needed
- values = np.where(~mask, values, fill_value)
-
- # return a platform independent precision dtype
- dtype_max = dtype
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- dtype_max = np.dtype(np.int64)
- elif is_float_dtype(dtype):
- dtype_max = np.dtype(np.float64)
-
- return values, mask, dtype, dtype_max, fill_value
-
-
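-# Illustrative sketch (arbitrary values): with skipna=True and fill_value=0,
-# NaN positions are filled in a copy of the input, and the original dtype plus
-# a platform-independent accumulator dtype are reported back.
-#
-# >>> vals, mask, dtype, dtype_max, fill = _get_values(
-# ...     np.array([1.0, np.nan]), skipna=True, fill_value=0
-# ... )
-# >>> vals, mask, dtype_max
-# (array([1., 0.]), array([False,  True]), dtype('float64'))
-
-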
-def _na_ok_dtype(dtype: DtypeObj) -> bool:
- if needs_i8_conversion(dtype):
- return False
- return not issubclass(dtype.type, np.integer)
-
-
-def _wrap_results(result, dtype: np.dtype, fill_value=None):
- """wrap our results if needed"""
- if result is NaT:
- pass
-
- elif is_datetime64_any_dtype(dtype):
- if fill_value is None:
- # GH#24293
- fill_value = iNaT
- if not isinstance(result, np.ndarray):
- assert not isna(fill_value), "Expected non-null fill_value"
- if result == fill_value:
- result = np.nan
-
- if isna(result):
- result = np.datetime64("NaT", "ns").astype(dtype)
- else:
- result = np.int64(result).view(dtype)
- # retain original unit
- result = result.astype(dtype, copy=False)
- else:
- # If we have float dtype, taking a view will give the wrong result
- result = result.astype(dtype)
- elif is_timedelta64_dtype(dtype):
- if not isinstance(result, np.ndarray):
- if result == fill_value or np.isnan(result):
- result = np.timedelta64("NaT").astype(dtype)
-
- elif np.fabs(result) > lib.i8max:
- # raise if we have a timedelta64[ns] which is too large
- raise ValueError("overflow in timedelta operation")
- else:
- # return a timedelta64 with the original unit
- result = np.int64(result).astype(dtype, copy=False)
-
- else:
- result = result.astype("m8[ns]").view(dtype)
-
- return result
-
-
-def _datetimelike_compat(func: F) -> F:
- """
- If we have datetime64 or timedelta64 values, ensure we have a correct
- mask before calling the wrapped function, then cast back afterwards.
- """
-
- @functools.wraps(func)
- def new_func(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
- **kwargs,
- ):
- orig_values = values
-
- datetimelike = values.dtype.kind in ["m", "M"]
- if datetimelike and mask is None:
- mask = isna(values)
-
- result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
-
- if datetimelike:
- result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
- if not skipna:
- assert mask is not None # checked above
- result = _mask_datetimelike_result(result, axis, mask, orig_values)
-
- return result
-
- return cast(F, new_func)
-
-
-def _na_for_min_count(values: np.ndarray, axis: AxisInt | None) -> Scalar | np.ndarray:
- """
- Return the missing value for `values`.
-
- Parameters
- ----------
- values : ndarray
- axis : int or None
- axis for the reduction, required if values.ndim > 1.
-
- Returns
- -------
- result : scalar or ndarray
- For 1-D values, returns a scalar of the correct missing type.
- For 2-D values, returns a 1-D array where each element is missing.
- """
- # we either return np.nan or pd.NaT
- if is_numeric_dtype(values):
- values = values.astype("float64")
- fill_value = na_value_for_dtype(values.dtype)
-
- if values.ndim == 1:
- return fill_value
- elif axis is None:
- return fill_value
- else:
- result_shape = values.shape[:axis] + values.shape[axis + 1 :]
-
- return np.full(result_shape, fill_value, dtype=values.dtype)
-
-
-def maybe_operate_rowwise(func: F) -> F:
- """
- NumPy operations on C-contiguous ndarrays with axis=1 can be
- very slow if axis 1 >> axis 0.
- Operate row-by-row and concatenate the results.
- """
-
- @functools.wraps(func)
- def newfunc(values: np.ndarray, *, axis: AxisInt | None = None, **kwargs):
- if (
- axis == 1
- and values.ndim == 2
- and values.flags["C_CONTIGUOUS"]
- # only takes this path for wide arrays (long dataframes), for threshold see
- # https://github.com/pandas-dev/pandas/pull/43311#issuecomment-974891737
- and (values.shape[1] / 1000) > values.shape[0]
- and values.dtype != object
- and values.dtype != bool
- ):
- arrs = list(values)
- if kwargs.get("mask") is not None:
- mask = kwargs.pop("mask")
- results = [
- func(arrs[i], mask=mask[i], **kwargs) for i in range(len(arrs))
- ]
- else:
- results = [func(x, **kwargs) for x in arrs]
- return np.array(results)
-
- return func(values, axis=axis, **kwargs)
-
- return cast(F, newfunc)
-
-
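-# Illustrative sketch of the row-wise condition above (arbitrary shape): a
-# C-contiguous (2, 5000) block satisfies 5000 / 1000 > 2, so a decorated
-# reduction called with axis=1 is applied row by row instead of in a single
-# numpy call.
-#
-# >>> wide = np.zeros((2, 5000))  # C-contiguous by default
-# >>> wide.shape[1] / 1000 > wide.shape[0]
-# True
-
-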
-def nanany(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> bool:
- """
- Check if any elements along an axis evaluate to True.
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : bool
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, 2])
- >>> nanops.nanany(s)
- True
-
- >>> from pandas.core import nanops
- >>> s = pd.Series([np.nan])
- >>> nanops.nanany(s)
- False
- """
- if needs_i8_conversion(values.dtype) and values.dtype.kind != "m":
- # GH#34479
- warnings.warn(
- "'any' with datetime64 dtypes is deprecated and will raise in a "
- "future version. Use (obj != pd.Timestamp(0)).any() instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
- values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
-
- # For object type, any won't necessarily return
- # boolean values (numpy/numpy#4352)
- if is_object_dtype(values):
- values = values.astype(bool)
-
- # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
- # "bool")
- return values.any(axis) # type: ignore[return-value]
-
-
-def nanall(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> bool:
- """
- Check if all elements along an axis evaluate to True.
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : bool
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, 2, np.nan])
- >>> nanops.nanall(s)
- True
-
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, 0])
- >>> nanops.nanall(s)
- False
- """
- if needs_i8_conversion(values.dtype) and values.dtype.kind != "m":
- # GH#34479
- warnings.warn(
- "'all' with datetime64 dtypes is deprecated and will raise in a "
- "future version. Use (obj != pd.Timestamp(0)).all() instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
- values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
-
- # For object type, all won't necessarily return
- # boolean values (numpy/numpy#4352)
- if is_object_dtype(values):
- values = values.astype(bool)
-
- # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
- # "bool")
- return values.all(axis) # type: ignore[return-value]
-
-
-@disallow("M8")
-@_datetimelike_compat
-@maybe_operate_rowwise
-def nansum(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- min_count: int = 0,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> float:
- """
- Sum the elements along an axis ignoring NaNs
-
- Parameters
- ----------
- values : ndarray[dtype]
- axis : int, optional
- skipna : bool, default True
- min_count: int, default 0
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : dtype
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, 2, np.nan])
- >>> nanops.nansum(s)
- 3.0
- """
- values, mask, dtype, dtype_max, _ = _get_values(
- values, skipna, fill_value=0, mask=mask
- )
- dtype_sum = dtype_max
- if is_float_dtype(dtype):
- dtype_sum = dtype
- elif is_timedelta64_dtype(dtype):
- dtype_sum = np.dtype(np.float64)
-
- the_sum = values.sum(axis, dtype=dtype_sum)
- the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
-
- return the_sum
-
-
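-# Illustrative sketch of the min_count handling above (arbitrary values):
-# with min_count=1 an all-NaN input yields NaN instead of the usual 0.0.
-#
-# >>> nansum(np.array([np.nan]))
-# 0.0
-# >>> nansum(np.array([np.nan]), min_count=1)
-# nan
-
-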
-def _mask_datetimelike_result(
- result: np.ndarray | np.datetime64 | np.timedelta64,
- axis: AxisInt | None,
- mask: npt.NDArray[np.bool_],
- orig_values: np.ndarray,
-) -> np.ndarray | np.datetime64 | np.timedelta64 | NaTType:
- if isinstance(result, np.ndarray):
- # we need to apply the mask
- result = result.astype("i8").view(orig_values.dtype)
- axis_mask = mask.any(axis=axis)
- # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any],
- # datetime64, timedelta64]")
- result[axis_mask] = iNaT # type: ignore[index]
- else:
- if mask.any():
- return np.int64(iNaT).view(orig_values.dtype)
- return result
-
-
-@disallow(PeriodDtype)
-@bottleneck_switch()
-@_datetimelike_compat
-def nanmean(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> float:
- """
- Compute the mean of the elements along an axis, ignoring NaNs
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- float
- Unless input is a float array, in which case use the same
- precision as the input array.
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, 2, np.nan])
- >>> nanops.nanmean(s)
- 1.5
- """
- values, mask, dtype, dtype_max, _ = _get_values(
- values, skipna, fill_value=0, mask=mask
- )
- dtype_sum = dtype_max
- dtype_count = np.dtype(np.float64)
-
- # not using needs_i8_conversion because that includes period
- if dtype.kind in ["m", "M"]:
- dtype_sum = np.dtype(np.float64)
- elif is_integer_dtype(dtype):
- dtype_sum = np.dtype(np.float64)
- elif is_float_dtype(dtype):
- dtype_sum = dtype
- dtype_count = dtype
-
- count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
- the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
-
- if axis is not None and getattr(the_sum, "ndim", False):
- count = cast(np.ndarray, count)
- with np.errstate(all="ignore"):
- # suppress division by zero warnings
- the_mean = the_sum / count
- ct_mask = count == 0
- if ct_mask.any():
- the_mean[ct_mask] = np.nan
- else:
- the_mean = the_sum / count if count > 0 else np.nan
-
- return the_mean
-
-
-@bottleneck_switch()
-def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None):
- """
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : float
- Unless input is a float array, in which case use the same
- precision as the input array.
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, np.nan, 2, 2])
- >>> nanops.nanmedian(s)
- 2.0
- """
-
- def get_median(x, _mask=None):
- if _mask is None:
- _mask = notna(x)
- else:
- _mask = ~_mask
- if not skipna and not _mask.all():
- return np.nan
- with warnings.catch_warnings():
- # Suppress RuntimeWarning about All-NaN slice
- warnings.filterwarnings(
- "ignore", "All-NaN slice encountered", RuntimeWarning
- )
- res = np.nanmedian(x[_mask])
- return res
-
- values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask, fill_value=0)
- if not is_float_dtype(values.dtype):
- try:
- values = values.astype("f8")
- except ValueError as err:
- # e.g. "could not convert string to float: 'a'"
- raise TypeError(str(err)) from err
- if mask is not None:
- values[mask] = np.nan
-
- notempty = values.size
-
- # an array from a frame
- if values.ndim > 1 and axis is not None:
- # there's a non-empty array to apply over otherwise numpy raises
- if notempty:
- if not skipna:
- res = np.apply_along_axis(get_median, axis, values)
-
- else:
- # fastpath for the skipna case
- with warnings.catch_warnings():
- # Suppress RuntimeWarning about All-NaN slice
- warnings.filterwarnings(
- "ignore", "All-NaN slice encountered", RuntimeWarning
- )
- res = np.nanmedian(values, axis)
-
- else:
- # must return the correct shape, but median is not defined for the
- # empty set so return nans of shape "everything but the passed axis"
- # since "axis" is where the reduction would occur if we had a nonempty
- # array
- res = get_empty_reduction_result(values.shape, axis, np.float_, np.nan)
-
- else:
- # otherwise return a scalar value
- res = get_median(values, mask) if notempty else np.nan
- return _wrap_results(res, dtype)
-
-
-def get_empty_reduction_result(
- shape: tuple[int, ...],
- axis: AxisInt,
- dtype: np.dtype | type[np.floating],
- fill_value: Any,
-) -> np.ndarray:
- """
- The result from a reduction on an empty ndarray.
-
- Parameters
- ----------
- shape : Tuple[int]
- axis : int
- dtype : np.dtype
- fill_value : Any
-
- Returns
- -------
- np.ndarray
- """
- shp = np.array(shape)
- dims = np.arange(len(shape))
- ret = np.empty(shp[dims != axis], dtype=dtype)
- ret.fill(fill_value)
- return ret
-
-
-def _get_counts_nanvar(
- values_shape: Shape,
- mask: npt.NDArray[np.bool_] | None,
- axis: AxisInt | None,
- ddof: int,
- dtype: np.dtype = np.dtype(np.float64),
-) -> tuple[float | np.ndarray, float | np.ndarray]:
- """
- Get the count of non-null values along an axis, accounting
- for degrees of freedom.
-
- Parameters
- ----------
- values_shape : Tuple[int, ...]
- shape tuple from values ndarray, used if mask is None
- mask : Optional[ndarray[bool]]
- locations in values that should be considered missing
- axis : Optional[int]
- axis to count along
- ddof : int
- degrees of freedom
- dtype : type, optional
- type to use for count
-
- Returns
- -------
- count : int, np.nan or np.ndarray
- d : int, np.nan or np.ndarray
- """
- count = _get_counts(values_shape, mask, axis, dtype=dtype)
- d = count - dtype.type(ddof)
-
- # always return NaN, never inf
- if is_scalar(count):
- if count <= ddof:
- count = np.nan
- d = np.nan
- else:
- # count is not narrowed by is_scalar check
- count = cast(np.ndarray, count)
- mask = count <= ddof
- if mask.any():
- np.putmask(d, mask, np.nan)
- np.putmask(count, mask, np.nan)
- return count, d
-
-
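-# Illustrative sketch (arbitrary values): with 5 observations, 2 of them
-# missing, and ddof=1, the non-null count is 3 and the variance divisor is 2.
-#
-# >>> mask = np.array([True, False, True, False, False])
-# >>> _get_counts_nanvar((5,), mask, axis=None, ddof=1)
-# (3.0, 2.0)
-
-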
-@bottleneck_switch(ddof=1)
-def nanstd(
- values,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- ddof: int = 1,
- mask=None,
-):
- """
- Compute the standard deviation along the given axis, ignoring NaNs
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
- where N represents the number of elements.
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : float
- Unless input is a float array, in which case use the same
- precision as the input array.
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, np.nan, 2, 3])
- >>> nanops.nanstd(s)
- 1.0
- """
- if values.dtype == "M8[ns]":
- values = values.view("m8[ns]")
-
- orig_dtype = values.dtype
- values, mask, _, _, _ = _get_values(values, skipna, mask=mask)
-
- result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
- return _wrap_results(result, orig_dtype)
-
-
-@disallow("M8", "m8")
-@bottleneck_switch(ddof=1)
-def nanvar(
- values,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- ddof: int = 1,
- mask=None,
-):
- """
- Compute the variance along the given axis, ignoring NaNs
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
- where N represents the number of elements.
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : float
- Unless input is a float array, in which case use the same
- precision as the input array.
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, np.nan, 2, 3])
- >>> nanops.nanvar(s)
- 1.0
- """
- values = extract_array(values, extract_numpy=True)
- dtype = values.dtype
- mask = _maybe_get_mask(values, skipna, mask)
- if is_any_int_dtype(dtype):
- values = values.astype("f8")
- if mask is not None:
- values[mask] = np.nan
-
- if is_float_dtype(values.dtype):
- count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
- else:
- count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)
-
- if skipna and mask is not None:
- values = values.copy()
- np.putmask(values, mask, 0)
-
- # xref GH10242
- # Compute variance via two-pass algorithm, which is stable against
- # cancellation errors and relatively accurate for small numbers of
- # observations.
- #
- # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
- if axis is not None:
- avg = np.expand_dims(avg, axis)
- sqr = _ensure_numeric((avg - values) ** 2)
- if mask is not None:
- np.putmask(sqr, mask, 0)
- result = sqr.sum(axis=axis, dtype=np.float64) / d
-
- # Return variance as np.float64 (the datatype used in the accumulator),
- # unless we were dealing with a float array, in which case use the same
- # precision as the original values array.
- if is_float_dtype(dtype):
- result = result.astype(dtype, copy=False)
- return result
-
-
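-# Worked sketch of the two-pass variance above (arbitrary values): for
-# [1.0, 2.0, 3.0, nan] with ddof=1 the masked count is 3, the mean is 2.0,
-# and the squared deviations sum to 2.0, giving 2.0 / (3 - 1) = 1.0.
-#
-# >>> nanvar(np.array([1.0, 2.0, 3.0, np.nan]))
-# 1.0
-
-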
-@disallow("M8", "m8")
-def nansem(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- ddof: int = 1,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> float:
- """
- Compute the standard error of the mean along the given axis, ignoring NaNs
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
- where N represents the number of elements.
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : float64
- Unless input is a float array, in which case use the same
- precision as the input array.
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, np.nan, 2, 3])
- >>> nanops.nansem(s)
- 0.5773502691896258
- """
- # This call checks whether non-numeric-like data was passed with
- # numeric_only=False and raises a TypeError if so
- nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
-
- mask = _maybe_get_mask(values, skipna, mask)
- if not is_float_dtype(values.dtype):
- values = values.astype("f8")
-
- if not skipna and mask is not None and mask.any():
- return np.nan
-
- count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
- var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
-
- return np.sqrt(var) / np.sqrt(count)
-
-
-def _nanminmax(meth, fill_value_typ):
- @bottleneck_switch(name=f"nan{meth}")
- @_datetimelike_compat
- def reduction(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
- ) -> Dtype:
- values, mask, dtype, dtype_max, fill_value = _get_values(
- values, skipna, fill_value_typ=fill_value_typ, mask=mask
- )
-
- if (axis is not None and values.shape[axis] == 0) or values.size == 0:
- try:
- result = getattr(values, meth)(axis, dtype=dtype_max)
- result.fill(np.nan)
- except (AttributeError, TypeError, ValueError):
- result = np.nan
- else:
- result = getattr(values, meth)(axis)
-
- result = _maybe_null_out(result, axis, mask, values.shape)
- return result
-
- return reduction
-
-
-nanmin = _nanminmax("min", fill_value_typ="+inf")
-nanmax = _nanminmax("max", fill_value_typ="-inf")
-
-
-@disallow("O")
-def nanargmax(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> int | np.ndarray:
- """
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : int or ndarray[int]
- The index/indices of the maximum value along the specified axis,
- or -1 in the NA case
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> arr = np.array([1, 2, 3, np.nan, 4])
- >>> nanops.nanargmax(arr)
- 4
-
- >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
- >>> arr[2:, 2] = np.nan
- >>> arr
- array([[ 0., 1., 2.],
- [ 3., 4., 5.],
- [ 6., 7., nan],
- [ 9., 10., nan]])
- >>> nanops.nanargmax(arr, axis=1)
- array([2, 2, 1, 1])
- """
- values, mask, _, _, _ = _get_values(values, True, fill_value_typ="-inf", mask=mask)
- # error: Need type annotation for 'result'
- result = values.argmax(axis) # type: ignore[var-annotated]
- result = _maybe_arg_null_out(result, axis, mask, skipna)
- return result
-
-
-@disallow("O")
-def nanargmin(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> int | np.ndarray:
- """
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : int or ndarray[int]
- The index/indices of the minimum value along the specified axis,
- or -1 in the NA case
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> arr = np.array([1, 2, 3, np.nan, 4])
- >>> nanops.nanargmin(arr)
- 0
-
- >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
- >>> arr[2:, 0] = np.nan
- >>> arr
- array([[ 0., 1., 2.],
- [ 3., 4., 5.],
- [nan, 7., 8.],
- [nan, 10., 11.]])
- >>> nanops.nanargmin(arr, axis=1)
- array([0, 0, 1, 1])
- """
- values, mask, _, _, _ = _get_values(values, True, fill_value_typ="+inf", mask=mask)
- # error: Need type annotation for 'result'
- result = values.argmin(axis) # type: ignore[var-annotated]
- result = _maybe_arg_null_out(result, axis, mask, skipna)
- return result
-
-
-@disallow("M8", "m8")
-@maybe_operate_rowwise
-def nanskew(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> float:
- """
- Compute the sample skewness.
-
- The statistic computed here is the adjusted Fisher-Pearson standardized
- moment coefficient G1. The algorithm computes this coefficient directly
- from the second and third central moment.
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : float64
- Unless input is a float array, in which case use the same
- precision as the input array.
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, np.nan, 1, 2])
- >>> nanops.nanskew(s)
- 1.7320508075688787
- """
- # error: Incompatible types in assignment (expression has type "Union[Any,
- # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
- values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
- mask = _maybe_get_mask(values, skipna, mask)
- if not is_float_dtype(values.dtype):
- values = values.astype("f8")
- count = _get_counts(values.shape, mask, axis)
- else:
- count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
-
- if skipna and mask is not None:
- values = values.copy()
- np.putmask(values, mask, 0)
- elif not skipna and mask is not None and mask.any():
- return np.nan
-
- mean = values.sum(axis, dtype=np.float64) / count
- if axis is not None:
- mean = np.expand_dims(mean, axis)
-
- adjusted = values - mean
- if skipna and mask is not None:
- np.putmask(adjusted, mask, 0)
- adjusted2 = adjusted**2
- adjusted3 = adjusted2 * adjusted
- m2 = adjusted2.sum(axis, dtype=np.float64)
- m3 = adjusted3.sum(axis, dtype=np.float64)
-
- # floating point error
- #
- # GH#18044: calc_skew in _libs/windows.pyx follows this behavior,
- # treating m2 < 1e-14 as zero to fix the floating point error
- m2 = _zero_out_fperr(m2)
- m3 = _zero_out_fperr(m3)
-
- with np.errstate(invalid="ignore", divide="ignore"):
- result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
-
- dtype = values.dtype
- if is_float_dtype(dtype):
- result = result.astype(dtype, copy=False)
-
- if isinstance(result, np.ndarray):
- result = np.where(m2 == 0, 0, result)
- result[count < 3] = np.nan
- else:
- result = 0 if m2 == 0 else result
- if count < 3:
- return np.nan
-
- return result
-
-
-@disallow("M8", "m8")
-@maybe_operate_rowwise
-def nankurt(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> float:
- """
- Compute the sample excess kurtosis
-
- The statistic computed here is the adjusted Fisher-Pearson standardized
- moment coefficient G2, computed directly from the second and fourth
- central moment.
-
- Parameters
- ----------
- values : ndarray
- axis : int, optional
- skipna : bool, default True
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- result : float64
- Unless input is a float array, in which case use the same
- precision as the input array.
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, np.nan, 1, 3, 2])
- >>> nanops.nankurt(s)
- -1.2892561983471076
- """
- # error: Incompatible types in assignment (expression has type "Union[Any,
- # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
- values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
- mask = _maybe_get_mask(values, skipna, mask)
- if not is_float_dtype(values.dtype):
- values = values.astype("f8")
- count = _get_counts(values.shape, mask, axis)
- else:
- count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
-
- if skipna and mask is not None:
- values = values.copy()
- np.putmask(values, mask, 0)
- elif not skipna and mask is not None and mask.any():
- return np.nan
-
- mean = values.sum(axis, dtype=np.float64) / count
- if axis is not None:
- mean = np.expand_dims(mean, axis)
-
- adjusted = values - mean
- if skipna and mask is not None:
- np.putmask(adjusted, mask, 0)
- adjusted2 = adjusted**2
- adjusted4 = adjusted2**2
- m2 = adjusted2.sum(axis, dtype=np.float64)
- m4 = adjusted4.sum(axis, dtype=np.float64)
-
- with np.errstate(invalid="ignore", divide="ignore"):
- adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
- numerator = count * (count + 1) * (count - 1) * m4
- denominator = (count - 2) * (count - 3) * m2**2
-
- # floating point error
- #
- # GH#18044: calc_kurt in _libs/windows.pyx follows this behavior,
- # treating denom < 1e-14 as zero to fix the floating point error
- numerator = _zero_out_fperr(numerator)
- denominator = _zero_out_fperr(denominator)
-
- if not isinstance(denominator, np.ndarray):
- # if ``denominator`` is a scalar, check these corner cases first before
- # doing the division
- if count < 4:
- return np.nan
- if denominator == 0:
- return 0
-
- with np.errstate(invalid="ignore", divide="ignore"):
- result = numerator / denominator - adj
-
- dtype = values.dtype
- if is_float_dtype(dtype):
- result = result.astype(dtype, copy=False)
-
- if isinstance(result, np.ndarray):
- result = np.where(denominator == 0, 0, result)
- result[count < 4] = np.nan
-
- return result
-
-
-@disallow("M8", "m8")
-@maybe_operate_rowwise
-def nanprod(
- values: np.ndarray,
- *,
- axis: AxisInt | None = None,
- skipna: bool = True,
- min_count: int = 0,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> float:
- """
- Parameters
- ----------
- values : ndarray[dtype]
- axis : int, optional
- skipna : bool, default True
- min_count: int, default 0
- mask : ndarray[bool], optional
- nan-mask if known
-
- Returns
- -------
- Dtype
- The product of all elements on a given axis (NaNs are treated as 1).
-
- Examples
- --------
- >>> from pandas.core import nanops
- >>> s = pd.Series([1, 2, 3, np.nan])
- >>> nanops.nanprod(s)
- 6.0
- """
- mask = _maybe_get_mask(values, skipna, mask)
-
- if skipna and mask is not None:
- values = values.copy()
- values[mask] = 1
- result = values.prod(axis)
- # error: Incompatible return value type (got "Union[ndarray, float]", expected
- # "float")
- return _maybe_null_out( # type: ignore[return-value]
- result, axis, mask, values.shape, min_count=min_count
- )
-
-
-def _maybe_arg_null_out(
- result: np.ndarray,
- axis: AxisInt | None,
- mask: npt.NDArray[np.bool_] | None,
- skipna: bool,
-) -> np.ndarray | int:
- # helper function for nanargmin/nanargmax
- if mask is None:
- return result
-
- if axis is None or not getattr(result, "ndim", False):
- if skipna:
- if mask.all():
- return -1
- else:
- if mask.any():
- return -1
- else:
- if skipna:
- na_mask = mask.all(axis)
- else:
- na_mask = mask.any(axis)
- if na_mask.any():
- result[na_mask] = -1
- return result
-
-
-def _get_counts(
- values_shape: Shape,
- mask: npt.NDArray[np.bool_] | None,
- axis: AxisInt | None,
- dtype: np.dtype = np.dtype(np.float64),
-) -> float | np.ndarray:
- """
- Get the count of non-null values along an axis
-
- Parameters
- ----------
- values_shape : tuple of int
- shape tuple from values ndarray, used if mask is None
- mask : Optional[ndarray[bool]]
- locations in values that should be considered missing
- axis : Optional[int]
- axis to count along
- dtype : type, optional
- type to use for count
-
- Returns
- -------
- count : scalar or array
- """
- if axis is None:
- if mask is not None:
- n = mask.size - mask.sum()
- else:
- n = np.prod(values_shape)
- return dtype.type(n)
-
- if mask is not None:
- count = mask.shape[axis] - mask.sum(axis)
- else:
- count = values_shape[axis]
-
- if is_scalar(count):
- return dtype.type(count)
- return count.astype(dtype, copy=False)
-
-
-def _maybe_null_out(
- result: np.ndarray | float | NaTType,
- axis: AxisInt | None,
- mask: npt.NDArray[np.bool_] | None,
- shape: tuple[int, ...],
- min_count: int = 1,
-) -> np.ndarray | float | NaTType:
- """
- Returns
- -------
- np.ndarray or float or NaTType
- The result, with entries set to NA wherever the count of non-null
- values falls below ``min_count``.
- """
- if mask is None and min_count == 0:
- # nothing to check; short-circuit
- return result
-
- if axis is not None and isinstance(result, np.ndarray):
- if mask is not None:
- null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
- else:
- # we have no nulls, kept mask=None in _maybe_get_mask
- below_count = shape[axis] - min_count < 0
- new_shape = shape[:axis] + shape[axis + 1 :]
- null_mask = np.broadcast_to(below_count, new_shape)
-
- if np.any(null_mask):
- if is_numeric_dtype(result):
- if np.iscomplexobj(result):
- result = result.astype("c16")
- elif not is_float_dtype(result):
- result = result.astype("f8", copy=False)
- result[null_mask] = np.nan
- else:
- # GH12941, use None to auto cast null
- result[null_mask] = None
- elif result is not NaT:
- if check_below_min_count(shape, mask, min_count):
- result_dtype = getattr(result, "dtype", None)
- if is_float_dtype(result_dtype):
- # error: Item "None" of "Optional[Any]" has no attribute "type"
- result = result_dtype.type("nan") # type: ignore[union-attr]
- else:
- result = np.nan
-
- return result
-
-
-def check_below_min_count(
- shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int
-) -> bool:
- """
- Check the `min_count` keyword. Returns True if below `min_count` (i.e. when
- a missing value should be returned from the reduction).
-
- Parameters
- ----------
- shape : tuple
- The shape of the values (`values.shape`).
- mask : ndarray[bool] or None
- Boolean numpy array (typically of same shape as `shape`) or None.
- min_count : int
- Keyword passed through from sum/prod call.
-
- Returns
- -------
- bool
- """
- if min_count > 0:
- if mask is None:
- # no missing values, only check size
- non_nulls = np.prod(shape)
- else:
- non_nulls = mask.size - mask.sum()
- if non_nulls < min_count:
- return True
- return False
-
-
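-# Illustrative sketch (arbitrary shapes): a fully valid (2, 3) block has six
-# observations, so min_count=7 reports "below" while the default of 0 never
-# does.
-#
-# >>> check_below_min_count((2, 3), None, min_count=7)
-# True
-# >>> check_below_min_count((2, 3), None, min_count=0)
-# False
-
-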
-def _zero_out_fperr(arg):
- # GH#18044 references this behavior to fix the rolling skew/kurt issue
- if isinstance(arg, np.ndarray):
- with np.errstate(invalid="ignore"):
- return np.where(np.abs(arg) < 1e-14, 0, arg)
- else:
- return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
-
-
-@disallow("M8", "m8")
-def nancorr(
- a: np.ndarray,
- b: np.ndarray,
- *,
- method: CorrelationMethod = "pearson",
- min_periods: int | None = None,
-) -> float:
- """
- a, b: ndarrays
- """
- if len(a) != len(b):
- raise AssertionError("Operands to nancorr must have same size")
-
- if min_periods is None:
- min_periods = 1
-
- valid = notna(a) & notna(b)
- if not valid.all():
- a = a[valid]
- b = b[valid]
-
- if len(a) < min_periods:
- return np.nan
-
- f = get_corr_func(method)
- return f(a, b)
-
-
-def get_corr_func(
- method: CorrelationMethod,
-) -> Callable[[np.ndarray, np.ndarray], float]:
- if method == "kendall":
- from scipy.stats import kendalltau
-
- def func(a, b):
- return kendalltau(a, b)[0]
-
- return func
- elif method == "spearman":
- from scipy.stats import spearmanr
-
- def func(a, b):
- return spearmanr(a, b)[0]
-
- return func
- elif method == "pearson":
-
- def func(a, b):
- return np.corrcoef(a, b)[0, 1]
-
- return func
- elif callable(method):
- return method
-
- raise ValueError(
- f"Unknown method '{method}', expected one of "
- "'kendall', 'spearman', 'pearson', or callable"
- )
-
-
-@disallow("M8", "m8")
-def nancov(
- a: np.ndarray,
- b: np.ndarray,
- *,
- min_periods: int | None = None,
- ddof: int | None = 1,
-) -> float:
- if len(a) != len(b):
- raise AssertionError("Operands to nancov must have same size")
-
- if min_periods is None:
- min_periods = 1
-
- valid = notna(a) & notna(b)
- if not valid.all():
- a = a[valid]
- b = b[valid]
-
- if len(a) < min_periods:
- return np.nan
-
- return np.cov(a, b, ddof=ddof)[0, 1]
-
-
-def _ensure_numeric(x):
- if isinstance(x, np.ndarray):
- if is_integer_dtype(x) or is_bool_dtype(x):
- x = x.astype(np.float64)
- elif is_object_dtype(x):
- try:
- x = x.astype(np.complex128)
- except (TypeError, ValueError):
- try:
- x = x.astype(np.float64)
- except ValueError as err:
- # GH#29941 we get here with object arrays containing strs
- raise TypeError(f"Could not convert {x} to numeric") from err
- else:
- if not np.any(np.imag(x)):
- x = x.real
- elif not (is_float(x) or is_integer(x) or is_complex(x)):
- try:
- x = float(x)
- except (TypeError, ValueError):
- # e.g. "1+1j" or "foo"
- try:
- x = complex(x)
- except ValueError as err:
- # e.g. "foo"
- raise TypeError(f"Could not convert {x} to numeric") from err
- return x
-
-
-# NA-friendly array comparisons
-
-
-def make_nancomp(op):
- def f(x, y):
- xmask = isna(x)
- ymask = isna(y)
- mask = xmask | ymask
-
- with np.errstate(all="ignore"):
- result = op(x, y)
-
- if mask.any():
- if is_bool_dtype(result):
- result = result.astype("O")
- np.putmask(result, mask, np.nan)
-
- return result
-
- return f
-
-
-nangt = make_nancomp(operator.gt)
-nange = make_nancomp(operator.ge)
-nanlt = make_nancomp(operator.lt)
-nanle = make_nancomp(operator.le)
-naneq = make_nancomp(operator.eq)
-nanne = make_nancomp(operator.ne)
-
-
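-# Illustrative sketch (arbitrary values): the wrapped comparison keeps NA
-# positions as NaN by upcasting a boolean result to object where needed.
-#
-# >>> naneq(np.array([1.0, np.nan]), np.array([1.0, 2.0]))
-# array([True, nan], dtype=object)
-
-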
-def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
- """
- Cumulative function with skipna support.
-
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate}
- skipna : bool
-
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- mask_a, mask_b = {
- np.cumprod: (1.0, np.nan),
- np.maximum.accumulate: (-np.inf, np.nan),
- np.cumsum: (0.0, np.nan),
- np.minimum.accumulate: (np.inf, np.nan),
- }[accum_func]
-
- # This should go through the EA interface
- assert values.dtype.kind not in ["m", "M"]
-
- # We will be applying this function to block values
- if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
- vals = values.copy()
- mask = isna(vals)
- vals[mask] = mask_a
- result = accum_func(vals, axis=0)
- result[mask] = mask_b
- else:
- result = accum_func(values, axis=0)
-
- return result
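-
-
-# Illustrative sketch (arbitrary values): a skipna cumulative sum fills NaN
-# with the identity element (0.0) before accumulating and restores NaN
-# afterwards.
-#
-# >>> na_accum_func(np.array([1.0, np.nan, 2.0]), np.cumsum, skipna=True)
-# array([ 1., nan,  3.])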
diff --git a/contrib/python/pandas/py3/pandas/core/ops/__init__.py b/contrib/python/pandas/py3/pandas/core/ops/__init__.py
deleted file mode 100644
index 64619fdc4b8..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/__init__.py
+++ /dev/null
@@ -1,535 +0,0 @@
-"""
-Arithmetic operations for PandasObjects
-
-This is not a public API.
-"""
-from __future__ import annotations
-
-import operator
-from typing import (
- TYPE_CHECKING,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
-from pandas._typing import (
- Axis,
- AxisInt,
- Level,
-)
-from pandas.util._decorators import Appender
-
-from pandas.core.dtypes.common import (
- is_array_like,
- is_list_like,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import (
- algorithms,
- roperator,
-)
-from pandas.core.ops.array_ops import (
- arithmetic_op,
- comp_method_OBJECT_ARRAY,
- comparison_op,
- get_array_op,
- logical_op,
- maybe_prepare_scalar_for_op,
-)
-from pandas.core.ops.common import (
- get_op_result_name,
- unpack_zerodim_and_defer,
-)
-from pandas.core.ops.docstrings import (
- _flex_comp_doc_FRAME,
- _op_descriptions,
- make_flex_doc,
-)
-from pandas.core.ops.invalid import invalid_comparison
-from pandas.core.ops.mask_ops import (
- kleene_and,
- kleene_or,
- kleene_xor,
-)
-from pandas.core.ops.methods import add_flex_arithmetic_methods
-from pandas.core.roperator import (
- radd,
- rand_,
- rdiv,
- rdivmod,
- rfloordiv,
- rmod,
- rmul,
- ror_,
- rpow,
- rsub,
- rtruediv,
- rxor,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-# -----------------------------------------------------------------------------
-# constants
-ARITHMETIC_BINOPS: set[str] = {
- "add",
- "sub",
- "mul",
- "pow",
- "mod",
- "floordiv",
- "truediv",
- "divmod",
- "radd",
- "rsub",
- "rmul",
- "rpow",
- "rmod",
- "rfloordiv",
- "rtruediv",
- "rdivmod",
-}
-
-
-COMPARISON_BINOPS: set[str] = {"eq", "ne", "lt", "gt", "le", "ge"}
-
-
-# -----------------------------------------------------------------------------
-# Masking NA values and fallbacks for operations numpy does not support
-
-
-def fill_binop(left, right, fill_value):
- """
- If a non-None fill_value is given, replace null entries in left and right
- with this value, but only in positions where _one_ of left/right is null,
- not both.
-
- Parameters
- ----------
- left : array-like
- right : array-like
- fill_value : object
-
- Returns
- -------
- left : array-like
- right : array-like
-
- Notes
- -----
- Makes copies if fill_value is not None and NAs are present.
- """
- if fill_value is not None:
- left_mask = isna(left)
- right_mask = isna(right)
-
- # one but not both
- mask = left_mask ^ right_mask
-
- if left_mask.any():
- # Avoid making a copy if we can
- left = left.copy()
- left[left_mask & mask] = fill_value
-
- if right_mask.any():
- # Avoid making a copy if we can
- right = right.copy()
- right[right_mask & mask] = fill_value
-
- return left, right
-
-
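-# Illustrative sketch (arbitrary values): only positions where exactly one
-# operand is null are filled; positions where both are null stay null.
-#
-# >>> _, right = fill_binop(
-# ...     np.array([1.0, np.nan]), np.array([np.nan, np.nan]), fill_value=0
-# ... )
-# >>> right
-# array([ 0., nan])
-
-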
-# -----------------------------------------------------------------------------
-# Series
-
-
-def align_method_SERIES(left: Series, right, align_asobject: bool = False):
- """align lhs and rhs Series"""
- # TODO: unlike align_method_FRAME, list, tuple and ndarray are not
- # coerced here, because Series has the inconsistencies described in
- # GH#13637
-
- if isinstance(right, ABCSeries):
- # avoid repeated alignment
- if not left.index.equals(right.index):
- if align_asobject:
- # to keep original value's dtype for bool ops
- left = left.astype(object)
- right = right.astype(object)
-
- left, right = left.align(right, copy=False)
-
- return left, right
-
-
-def flex_method_SERIES(op):
- name = op.__name__.strip("_")
- doc = make_flex_doc(name, "series")
-
- @Appender(doc)
- def flex_wrapper(self, other, level=None, fill_value=None, axis: Axis = 0):
- # validate axis
- if axis is not None:
- self._get_axis_number(axis)
-
- res_name = get_op_result_name(self, other)
-
- if isinstance(other, ABCSeries):
- return self._binop(other, op, level=level, fill_value=fill_value)
- elif isinstance(other, (np.ndarray, list, tuple)):
- if len(other) != len(self):
- raise ValueError("Lengths must be equal")
- other = self._constructor(other, self.index)
- result = self._binop(other, op, level=level, fill_value=fill_value)
- result.name = res_name
- return result
- else:
- if fill_value is not None:
- self = self.fillna(fill_value)
-
- return op(self, other)
-
- flex_wrapper.__name__ = name
- return flex_wrapper
-
-
-# -----------------------------------------------------------------------------
-# DataFrame
-
-
-def align_method_FRAME(
- left, right, axis, flex: bool | None = False, level: Level = None
-):
- """
- Convert rhs to meet lhs dims if input is list, tuple or np.ndarray.
-
- Parameters
- ----------
- left : DataFrame
- right : Any
- axis : int, str, or None
- flex : bool or None, default False
- Whether this is a flex op, in which case we reindex.
- None indicates not to check for alignment.
- level : int or level name, default None
-
- Returns
- -------
- left : DataFrame
- right : Any
- """
-
- def to_series(right):
- msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}"
-
- # pass dtype to avoid doing inference, which would break consistency
- # with Index/Series ops
- dtype = None
- if getattr(right, "dtype", None) == object:
- # can't pass right.dtype unconditionally as that would break on e.g.
- # datetime64[h] ndarray
- dtype = object
-
- if axis is not None and left._get_axis_name(axis) == "index":
- if len(left.index) != len(right):
- raise ValueError(
- msg.format(req_len=len(left.index), given_len=len(right))
- )
- right = left._constructor_sliced(right, index=left.index, dtype=dtype)
- else:
- if len(left.columns) != len(right):
- raise ValueError(
- msg.format(req_len=len(left.columns), given_len=len(right))
- )
- right = left._constructor_sliced(right, index=left.columns, dtype=dtype)
- return right
-
- if isinstance(right, np.ndarray):
- if right.ndim == 1:
- right = to_series(right)
-
- elif right.ndim == 2:
- # We need to pass dtype=right.dtype to retain object dtype
- # otherwise we lose consistency with Index and array ops
- dtype = None
- if getattr(right, "dtype", None) == object:
- # can't pass right.dtype unconditionally as that would break on e.g.
- # datetime64[h] ndarray
- dtype = object
-
- if right.shape == left.shape:
- right = left._constructor(
- right, index=left.index, columns=left.columns, dtype=dtype
- )
-
- elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
- # Broadcast across columns
- right = np.broadcast_to(right, left.shape)
- right = left._constructor(
- right, index=left.index, columns=left.columns, dtype=dtype
- )
-
- elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
- # Broadcast along rows
- right = to_series(right[0, :])
-
- else:
- raise ValueError(
- "Unable to coerce to DataFrame, shape "
- f"must be {left.shape}: given {right.shape}"
- )
-
- elif right.ndim > 2:
- raise ValueError(
- "Unable to coerce to Series/DataFrame, "
- f"dimension must be <= 2: {right.shape}"
- )
-
- elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)):
- # GH 36702. Raise when attempting arithmetic with list of array-like.
- if any(is_array_like(el) for el in right):
- raise ValueError(
- f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
- )
- # GH17901
- right = to_series(right)
-
- if flex is not None and isinstance(right, ABCDataFrame):
- if not left._indexed_same(right):
- if flex:
- left, right = left.align(right, join="outer", level=level, copy=False)
- else:
- raise ValueError(
- "Can only compare identically-labeled (both index and columns) "
- "DataFrame objects"
- )
- elif isinstance(right, ABCSeries):
- # axis=1 is default for DataFrame-with-Series op
- axis = left._get_axis_number(axis) if axis is not None else 1
-
- if not flex:
- if not left.axes[axis].equals(right.index):
- raise ValueError(
- "Operands are not aligned. Do "
- "`left, right = left.align(right, axis=1, copy=False)` "
- "before operating."
- )
-
- left, right = left.align(
- right, join="outer", axis=axis, level=level, copy=False
- )
- right = _maybe_align_series_as_frame(left, right, axis)
-
- return left, right
-
-
-def should_reindex_frame_op(
- left: DataFrame, right, op, axis: int, fill_value, level
-) -> bool:
- """
- Check if this is an operation between DataFrames that will need to reindex.
- """
- assert isinstance(left, ABCDataFrame)
-
- if op is operator.pow or op is roperator.rpow:
- # GH#32685 pow has special semantics for operating with null values
- return False
-
- if not isinstance(right, ABCDataFrame):
- return False
-
- if fill_value is None and level is None and axis == 1:
- # TODO: any other cases we should handle here?
-
- # Intersection is always unique so we have to check the unique columns
- left_uniques = left.columns.unique()
- right_uniques = right.columns.unique()
- cols = left_uniques.intersection(right_uniques)
- if len(cols) and not (
- len(cols) == len(left_uniques) and len(cols) == len(right_uniques)
- ):
- # TODO: is there a shortcut available when len(cols) == 0?
- return True
-
- return False
-
-
-def frame_arith_method_with_reindex(left: DataFrame, right: DataFrame, op) -> DataFrame:
- """
- For DataFrame-with-DataFrame operations that require reindexing,
- operate only on shared columns, then reindex.
-
- Parameters
- ----------
- left : DataFrame
- right : DataFrame
- op : binary operator
-
- Returns
- -------
- DataFrame
- """
- # GH#31623, only operate on shared columns
- cols, lcols, rcols = left.columns.join(
- right.columns, how="inner", level=None, return_indexers=True
- )
-
- new_left = left.iloc[:, lcols]
- new_right = right.iloc[:, rcols]
- result = op(new_left, new_right)
-
- # Do the join on the columns instead of using align_method_FRAME
- # to avoid constructing two potentially large/sparse DataFrames
- join_columns, _, _ = left.columns.join(
- right.columns, how="outer", level=None, return_indexers=True
- )
-
- if result.columns.has_duplicates:
- # Avoid reindexing with a duplicate axis.
- # https://github.com/pandas-dev/pandas/issues/35194
- indexer, _ = result.columns.get_indexer_non_unique(join_columns)
- indexer = algorithms.unique1d(indexer)
- result = result._reindex_with_indexers(
- {1: [join_columns, indexer]}, allow_dups=True
- )
- else:
- result = result.reindex(join_columns, axis=1)
-
- return result
-
-
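-# Illustrative sketch (assumes pandas imported as pd; arbitrary frames): only
-# the shared column "b" is actually computed; the non-shared columns come back
-# all-NaN after the outer reindex.
-#
-# >>> df1 = pd.DataFrame({"a": [1], "b": [2]})
-# >>> df2 = pd.DataFrame({"b": [10], "c": [20]})
-# >>> (df1 + df2).columns.tolist()
-# ['a', 'b', 'c']
-
-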
-def _maybe_align_series_as_frame(frame: DataFrame, series: Series, axis: AxisInt):
- """
- If the Series operand is not EA-dtype, we can broadcast to 2D and operate
- blockwise.
- """
- rvalues = series._values
- if not isinstance(rvalues, np.ndarray):
- # TODO(EA2D): no need to special-case with 2D EAs
- if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"):
- # We can losslessly+cheaply cast to ndarray
- rvalues = np.asarray(rvalues)
- else:
- return series
-
- if axis == 0:
- rvalues = rvalues.reshape(-1, 1)
- else:
- rvalues = rvalues.reshape(1, -1)
-
- rvalues = np.broadcast_to(rvalues, frame.shape)
- # pass dtype to avoid doing inference
- return type(frame)(
- rvalues, index=frame.index, columns=frame.columns, dtype=rvalues.dtype
- )
-
-
-def flex_arith_method_FRAME(op):
- op_name = op.__name__.strip("_")
-
- na_op = get_array_op(op)
- doc = make_flex_doc(op_name, "dataframe")
-
- @Appender(doc)
- def f(self, other, axis: Axis = "columns", level=None, fill_value=None):
- axis = self._get_axis_number(axis) if axis is not None else 1
- axis = cast(int, axis)
-
- if should_reindex_frame_op(self, other, op, axis, fill_value, level):
- return frame_arith_method_with_reindex(self, other, op)
-
- if isinstance(other, ABCSeries) and fill_value is not None:
- # TODO: We could allow this in cases where we end up going
- # through the DataFrame path
- raise NotImplementedError(f"fill_value {fill_value} not supported.")
-
- other = maybe_prepare_scalar_for_op(other, self.shape)
- self, other = align_method_FRAME(self, other, axis, flex=True, level=level)
-
- if isinstance(other, ABCDataFrame):
- # Another DataFrame
- new_data = self._combine_frame(other, na_op, fill_value)
-
- elif isinstance(other, ABCSeries):
- new_data = self._dispatch_frame_op(other, op, axis=axis)
- else:
- # in this case we always have `np.ndim(other) == 0`
- if fill_value is not None:
- self = self.fillna(fill_value)
-
- new_data = self._dispatch_frame_op(other, op)
-
- return self._construct_result(new_data)
-
- f.__name__ = op_name
-
- return f
-
-
-def flex_comp_method_FRAME(op):
- op_name = op.__name__.strip("_")
-
- doc = _flex_comp_doc_FRAME.format(
- op_name=op_name, desc=_op_descriptions[op_name]["desc"]
- )
-
- @Appender(doc)
- def f(self, other, axis: Axis = "columns", level=None):
- axis = self._get_axis_number(axis) if axis is not None else 1
-
- self, other = align_method_FRAME(self, other, axis, flex=True, level=level)
-
- new_data = self._dispatch_frame_op(other, op, axis=axis)
- return self._construct_result(new_data)
-
- f.__name__ = op_name
-
- return f
-
-
-__all__ = [
- "add_flex_arithmetic_methods",
- "align_method_FRAME",
- "align_method_SERIES",
- "ARITHMETIC_BINOPS",
- "arithmetic_op",
- "COMPARISON_BINOPS",
- "comparison_op",
- "comp_method_OBJECT_ARRAY",
- "fill_binop",
- "flex_arith_method_FRAME",
- "flex_comp_method_FRAME",
- "flex_method_SERIES",
- "frame_arith_method_with_reindex",
- "invalid_comparison",
- "kleene_and",
- "kleene_or",
- "kleene_xor",
- "logical_op",
- "maybe_dispatch_ufunc_to_dunder_op",
- "radd",
- "rand_",
- "rdiv",
- "rdivmod",
- "rfloordiv",
- "rmod",
- "rmul",
- "ror_",
- "rpow",
- "rsub",
- "rtruediv",
- "rxor",
- "should_reindex_frame_op",
- "unpack_zerodim_and_defer",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/ops/array_ops.py b/contrib/python/pandas/py3/pandas/core/ops/array_ops.py
deleted file mode 100644
index 0ac8b377b19..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/array_ops.py
+++ /dev/null
@@ -1,544 +0,0 @@
-"""
-Functions for arithmetic and comparison operations on NumPy arrays and
-ExtensionArrays.
-"""
-from __future__ import annotations
-
-import datetime
-from functools import partial
-import operator
-from typing import Any
-
-import numpy as np
-
-from pandas._libs import (
- NaT,
- Timedelta,
- Timestamp,
- lib,
- ops as libops,
-)
-from pandas._libs.tslibs import (
- BaseOffset,
- get_supported_reso,
- get_unit_from_dtype,
- is_supported_unit,
- is_unitless,
- npy_unit_to_abbrev,
-)
-from pandas._typing import (
- ArrayLike,
- Shape,
-)
-
-from pandas.core.dtypes.cast import (
- construct_1d_object_array_from_listlike,
- find_common_type,
-)
-from pandas.core.dtypes.common import (
- ensure_object,
- is_bool_dtype,
- is_integer_dtype,
- is_list_like,
- is_numeric_v_string_like,
- is_object_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.generic import (
- ABCExtensionArray,
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- isna,
- notna,
-)
-
-from pandas.core.computation import expressions
-from pandas.core.construction import ensure_wrapped_if_datetimelike
-from pandas.core.ops import (
- missing,
- roperator,
-)
-from pandas.core.ops.dispatch import should_extension_dispatch
-from pandas.core.ops.invalid import invalid_comparison
-
-
-def comp_method_OBJECT_ARRAY(op, x, y):
- if isinstance(y, list):
- y = construct_1d_object_array_from_listlike(y)
-
- if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
- if not is_object_dtype(y.dtype):
- y = y.astype(np.object_)
-
- if isinstance(y, (ABCSeries, ABCIndex)):
- y = y._values
-
- if x.shape != y.shape:
- raise ValueError("Shapes must match", x.shape, y.shape)
- result = libops.vec_compare(x.ravel(), y.ravel(), op)
- else:
- result = libops.scalar_compare(x.ravel(), y, op)
- return result.reshape(x.shape)
-
-
-def _masked_arith_op(x: np.ndarray, y, op):
- """
- If the given arithmetic operation fails, attempt it again on
- only the non-null elements of the input array(s).
-
- Parameters
- ----------
- x : np.ndarray
- y : np.ndarray, Series, Index
- op : binary operator
- """
- # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes
- # the logic valid for both Series and DataFrame ops.
- xrav = x.ravel()
- assert isinstance(x, np.ndarray), type(x)
- if isinstance(y, np.ndarray):
- dtype = find_common_type([x.dtype, y.dtype])
- result = np.empty(x.size, dtype=dtype)
-
- if len(x) != len(y):
- raise ValueError(x.shape, y.shape)
- ymask = notna(y)
-
- # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex
- # we would get int64 dtype, see GH#19956
- yrav = y.ravel()
- mask = notna(xrav) & ymask.ravel()
-
- # See GH#5284, GH#5035, GH#19448 for historical reference
- if mask.any():
- result[mask] = op(xrav[mask], yrav[mask])
-
- else:
- if not is_scalar(y):
- raise TypeError(
- f"Cannot broadcast np.ndarray with operand of type { type(y) }"
- )
-
- # mask is only meaningful for x
- result = np.empty(x.size, dtype=x.dtype)
- mask = notna(xrav)
-
- # 1 ** np.nan is 1. So we have to unmask those.
- if op is pow:
- mask = np.where(x == 1, False, mask)
- elif op is roperator.rpow:
- mask = np.where(y == 1, False, mask)
-
- if mask.any():
- result[mask] = op(xrav[mask], y)
-
- np.putmask(result, ~mask, np.nan)
- result = result.reshape(x.shape) # 2D compat
- return result
-
-
-def _na_arithmetic_op(left: np.ndarray, right, op, is_cmp: bool = False):
- """
- Return the result of evaluating op on the passed in values.
-
- If native types are not compatible, try coercion to object dtype.
-
- Parameters
- ----------
- left : np.ndarray
- right : np.ndarray or scalar
- Excludes DataFrame, Series, Index, ExtensionArray.
- is_cmp : bool, default False
- If this is a comparison operation.
-
- Returns
- -------
- array-like
-
- Raises
- ------
- TypeError : invalid operation
- """
- if isinstance(right, str):
- # can never use numexpr
- func = op
- else:
- func = partial(expressions.evaluate, op)
-
- try:
- result = func(left, right)
- except TypeError:
- if not is_cmp and (is_object_dtype(left.dtype) or is_object_dtype(right)):
- # For object dtype, fallback to a masked operation (only operating
- # on the non-missing values)
- # Don't do this for comparisons, as that will handle complex numbers
- # incorrectly, see GH#32047
- result = _masked_arith_op(left, right, op)
- else:
- raise
-
- if is_cmp and (is_scalar(result) or result is NotImplemented):
- # numpy returned a scalar instead of operating element-wise
- # e.g. numeric array vs str
- # TODO: can remove this after dropping some future numpy version?
- return invalid_comparison(left, right, op)
-
- return missing.dispatch_fill_zeros(op, left, right, result)
-
-
-def arithmetic_op(left: ArrayLike, right: Any, op):
- """
- Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ...
-
- Note: the caller is responsible for ensuring that numpy warnings are
- suppressed (with np.errstate(all="ignore")) if needed.
-
- Parameters
- ----------
- left : np.ndarray or ExtensionArray
- right : object
- Cannot be a DataFrame or Index. Series is *not* excluded.
- op : {operator.add, operator.sub, ...}
- Or one of the reversed variants from roperator.
-
- Returns
- -------
- ndarray or ExtensionArray
- Or a 2-tuple of these in the case of divmod or rdivmod.
- """
- # NB: We assume that extract_array and ensure_wrapped_if_datetimelike
- # have already been called on `left` and `right`,
- # and `maybe_prepare_scalar_for_op` has already been called on `right`
- # We need to special-case datetime64/timedelta64 dtypes (e.g. because numpy
- # casts integer dtypes to timedelta64 when operating with timedelta64 - GH#22390)
-
- if (
- should_extension_dispatch(left, right)
- or isinstance(right, (Timedelta, BaseOffset, Timestamp))
- or right is NaT
- ):
- # Timedelta/Timestamp and other custom scalars are included in the check
- # because numexpr will fail on it, see GH#31457
- res_values = op(left, right)
- else:
- # TODO we should handle EAs consistently and move this check before the if/else
- # (https://github.com/pandas-dev/pandas/issues/41165)
- _bool_arith_check(op, left, right)
-
- # error: Argument 1 to "_na_arithmetic_op" has incompatible type
- # "Union[ExtensionArray, ndarray[Any, Any]]"; expected "ndarray[Any, Any]"
- res_values = _na_arithmetic_op(left, right, op) # type: ignore[arg-type]
-
- return res_values
-
-
-def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
- """
- Evaluate a comparison operation `==`, `!=`, `>=`, `>`, `<=`, or `<`.
-
- Note: the caller is responsible for ensuring that numpy warnings are
- suppressed (with np.errstate(all="ignore")) if needed.
-
- Parameters
- ----------
- left : np.ndarray or ExtensionArray
- right : object
- Cannot be a DataFrame, Series, or Index.
- op : {operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le}
-
- Returns
- -------
- ndarray or ExtensionArray
- """
- # NB: We assume extract_array has already been called on left and right
- lvalues = ensure_wrapped_if_datetimelike(left)
- rvalues = ensure_wrapped_if_datetimelike(right)
-
- rvalues = lib.item_from_zerodim(rvalues)
- if isinstance(rvalues, list):
- # We don't catch tuple here bc we may be comparing e.g. MultiIndex
- # to a tuple that represents a single entry, see test_compare_tuple_strs
- rvalues = np.asarray(rvalues)
-
- if isinstance(rvalues, (np.ndarray, ABCExtensionArray)):
- # TODO: make this treatment consistent across ops and classes.
- # We are not catching all listlikes here (e.g. frozenset, tuple)
- # The ambiguous case is object-dtype. See GH#27803
- if len(lvalues) != len(rvalues):
- raise ValueError(
- "Lengths must match to compare", lvalues.shape, rvalues.shape
- )
-
- if should_extension_dispatch(lvalues, rvalues) or (
- (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT)
- and not is_object_dtype(lvalues.dtype)
- ):
- # Call the method on lvalues
- res_values = op(lvalues, rvalues)
-
- elif is_scalar(rvalues) and isna(rvalues): # TODO: but not pd.NA?
- # numpy does not like comparisons vs None
- if op is operator.ne:
- res_values = np.ones(lvalues.shape, dtype=bool)
- else:
- res_values = np.zeros(lvalues.shape, dtype=bool)
-
- elif is_numeric_v_string_like(lvalues, rvalues):
- # GH#36377 going through the numexpr path would incorrectly raise
- return invalid_comparison(lvalues, rvalues, op)
-
- elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str):
- res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
-
- else:
- res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)
-
- return res_values
-
-
-def na_logical_op(x: np.ndarray, y, op):
- try:
- # For exposition, write:
- # yarr = isinstance(y, np.ndarray)
- # yint = is_integer(y) or (yarr and y.dtype.kind == "i")
- # ybool = is_bool(y) or (yarr and y.dtype.kind == "b")
- # xint = x.dtype.kind == "i"
- # xbool = x.dtype.kind == "b"
- # Then cases where this goes through without raising include:
- # (xint or xbool) and (yint or ybool)
- result = op(x, y)
- except TypeError:
- if isinstance(y, np.ndarray):
- # bool-bool dtype operations should be OK, should not get here
- assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype))
- x = ensure_object(x)
- y = ensure_object(y)
- result = libops.vec_binop(x.ravel(), y.ravel(), op)
- else:
- # let null fall thru
- assert lib.is_scalar(y)
- if not isna(y):
- y = bool(y)
- try:
- result = libops.scalar_binop(x, y, op)
- except (
- TypeError,
- ValueError,
- AttributeError,
- OverflowError,
- NotImplementedError,
- ) as err:
- typ = type(y).__name__
- raise TypeError(
- f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array "
- f"and scalar of type [{typ}]"
- ) from err
-
- return result.reshape(x.shape)
-
-
-def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike:
- """
- Evaluate a logical operation `|`, `&`, or `^`.
-
- Parameters
- ----------
- left : np.ndarray or ExtensionArray
- right : object
- Cannot be a DataFrame, Series, or Index.
- op : {operator.and_, operator.or_, operator.xor}
- Or one of the reversed variants from roperator.
-
- Returns
- -------
- ndarray or ExtensionArray
- """
- fill_int = lambda x: x
-
- def fill_bool(x, left=None):
- # if `left` is specifically not-boolean, we do not cast to bool
- if x.dtype.kind in ["c", "f", "O"]:
- # dtypes that can hold NA
- mask = isna(x)
- if mask.any():
- x = x.astype(object)
- x[mask] = False
-
- if left is None or is_bool_dtype(left.dtype):
- x = x.astype(bool)
- return x
-
- is_self_int_dtype = is_integer_dtype(left.dtype)
-
- right = lib.item_from_zerodim(right)
- if is_list_like(right) and not hasattr(right, "dtype"):
- # e.g. list, tuple
- right = construct_1d_object_array_from_listlike(right)
-
- # NB: We assume extract_array has already been called on left and right
- lvalues = ensure_wrapped_if_datetimelike(left)
- rvalues = right
-
- if should_extension_dispatch(lvalues, rvalues):
- # Call the method on lvalues
- res_values = op(lvalues, rvalues)
-
- else:
- if isinstance(rvalues, np.ndarray):
- is_other_int_dtype = is_integer_dtype(rvalues.dtype)
- rvalues = rvalues if is_other_int_dtype else fill_bool(rvalues, lvalues)
-
- else:
- # i.e. scalar
- is_other_int_dtype = lib.is_integer(rvalues)
-
- # For int vs int `^`, `|`, `&` are bitwise operators and return
- # integer dtypes. Otherwise these are boolean ops
- filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool
-
- res_values = na_logical_op(lvalues, rvalues, op)
- # error: Cannot call function of unknown type
- res_values = filler(res_values) # type: ignore[operator]
-
- return res_values
-
-
-def get_array_op(op):
- """
- Return a binary array operation corresponding to the given operator op.
-
- Parameters
- ----------
- op : function
- Binary operator from operator or roperator module.
-
- Returns
- -------
- functools.partial
- """
- if isinstance(op, partial):
- # We get here via dispatch_to_series in DataFrame case
- # e.g. test_rolling_consistency_var_debiasing_factors
- return op
-
- op_name = op.__name__.strip("_").lstrip("r")
- if op_name == "arith_op":
- # Reached via DataFrame._combine_frame i.e. flex methods
- # e.g. test_df_add_flex_filled_mixed_dtypes
- return op
-
- if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
- return partial(comparison_op, op=op)
- elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
- return partial(logical_op, op=op)
- elif op_name in {
- "add",
- "sub",
- "mul",
- "truediv",
- "floordiv",
- "mod",
- "divmod",
- "pow",
- }:
- return partial(arithmetic_op, op=op)
- else:
- raise NotImplementedError(op_name)
-
-
-def maybe_prepare_scalar_for_op(obj, shape: Shape):
- """
- Cast non-pandas objects to pandas types to unify behavior of arithmetic
- and comparison operations.
-
- Parameters
- ----------
- obj: object
- shape : tuple[int]
-
- Returns
- -------
- out : object
-
- Notes
- -----
- Be careful to call this *after* determining the `name` attribute to be
- attached to the result of the arithmetic operation.
- """
- if type(obj) is datetime.timedelta:
- # GH#22390 cast up to Timedelta to rely on Timedelta
- # implementation; otherwise operation against numeric-dtype
- # raises TypeError
- return Timedelta(obj)
- elif type(obj) is datetime.datetime:
- # cast up to Timestamp to rely on Timestamp implementation, see Timedelta above
- return Timestamp(obj)
- elif isinstance(obj, np.datetime64):
- # GH#28080 numpy casts integer-dtype to datetime64 when doing
- # array[int] + datetime64, which we do not allow
- if isna(obj):
- from pandas.core.arrays import DatetimeArray
-
- # Avoid possible ambiguities with pd.NaT
- # GH 52295
- if is_unitless(obj.dtype):
- obj = obj.astype("datetime64[ns]")
- elif not is_supported_unit(get_unit_from_dtype(obj.dtype)):
- unit = get_unit_from_dtype(obj.dtype)
- closest_unit = npy_unit_to_abbrev(get_supported_reso(unit))
- obj = obj.astype(f"datetime64[{closest_unit}]")
- right = np.broadcast_to(obj, shape)
- return DatetimeArray(right)
-
- return Timestamp(obj)
-
- elif isinstance(obj, np.timedelta64):
- if isna(obj):
- from pandas.core.arrays import TimedeltaArray
-
- # wrapping timedelta64("NaT") in Timedelta returns NaT,
- # which would incorrectly be treated as a datetime-NaT, so
- # we broadcast and wrap in a TimedeltaArray
- # GH 52295
- if is_unitless(obj.dtype):
- obj = obj.astype("timedelta64[ns]")
- elif not is_supported_unit(get_unit_from_dtype(obj.dtype)):
- unit = get_unit_from_dtype(obj.dtype)
- closest_unit = npy_unit_to_abbrev(get_supported_reso(unit))
- obj = obj.astype(f"timedelta64[{closest_unit}]")
- right = np.broadcast_to(obj, shape)
- return TimedeltaArray(right)
-
- # In particular non-nanosecond timedelta64 needs to be cast to
- # nanoseconds, or else we get undesired behavior like
- # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D')
- return Timedelta(obj)
-
- return obj
-
-
-_BOOL_OP_NOT_ALLOWED = {
- operator.truediv,
- roperator.rtruediv,
- operator.floordiv,
- roperator.rfloordiv,
- operator.pow,
- roperator.rpow,
-}
-
-
-def _bool_arith_check(op, a, b):
- """
- In contrast to numpy, pandas raises an error for certain operations
- with booleans.
- """
- if op in _BOOL_OP_NOT_ALLOWED:
- if is_bool_dtype(a.dtype) and (
- is_bool_dtype(b) or isinstance(b, (bool, np.bool_))
- ):
- op_name = op.__name__.strip("_").lstrip("r")
- raise NotImplementedError(
- f"operator '{op_name}' not implemented for bool dtypes"
- )
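
A quick public-API illustration of the boolean guard above (behaviour of the vendored pandas; numpy, by contrast, happily divides booleans):

import numpy as np
import pandas as pd

print(np.array([True, False]) / np.array([True, True]))  # numpy returns [1. 0.]

try:
    pd.Series([True, False]) / pd.Series([True, True])
except NotImplementedError as exc:
    print(exc)  # operator 'truediv' not implemented for bool dtypes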
diff --git a/contrib/python/pandas/py3/pandas/core/ops/common.py b/contrib/python/pandas/py3/pandas/core/ops/common.py
deleted file mode 100644
index d4ae1433722..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/common.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""
-Boilerplate functions used in defining binary operations.
-"""
-from __future__ import annotations
-
-from functools import wraps
-import sys
-from typing import Callable
-
-from pandas._libs.lib import item_from_zerodim
-from pandas._libs.missing import is_matching_na
-from pandas._typing import F
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCSeries,
-)
-
-
-def unpack_zerodim_and_defer(name: str) -> Callable[[F], F]:
- """
- Boilerplate for pandas conventions in arithmetic and comparison methods.
-
- Parameters
- ----------
- name : str
-
- Returns
- -------
- decorator
- """
-
- def wrapper(method: F) -> F:
- return _unpack_zerodim_and_defer(method, name)
-
- return wrapper
-
-
-def _unpack_zerodim_and_defer(method, name: str):
- """
- Boilerplate for pandas conventions in arithmetic and comparison methods.
-
- Ensure method returns NotImplemented when operating against "senior"
- classes. Ensure zero-dimensional ndarrays are always unpacked.
-
- Parameters
- ----------
- method : binary method
- name : str
-
- Returns
- -------
- method
- """
- if sys.version_info < (3, 9):
- from pandas.util._str_methods import (
- removeprefix,
- removesuffix,
- )
-
- stripped_name = removesuffix(removeprefix(name, "__"), "__")
- else:
- stripped_name = name.removeprefix("__").removesuffix("__")
- is_cmp = stripped_name in {"eq", "ne", "lt", "le", "gt", "ge"}
-
- @wraps(method)
- def new_method(self, other):
- if is_cmp and isinstance(self, ABCIndex) and isinstance(other, ABCSeries):
- # For comparison ops, Index does *not* defer to Series
- pass
- else:
- for cls in [ABCDataFrame, ABCSeries, ABCIndex]:
- if isinstance(self, cls):
- break
- if isinstance(other, cls):
- return NotImplemented
-
- other = item_from_zerodim(other)
-
- return method(self, other)
-
- return new_method
-
-
-def get_op_result_name(left, right):
- """
- Find the appropriate name to pin to an operation result. This result
- should always be either an Index or a Series.
-
- Parameters
- ----------
- left : {Series, Index}
- right : object
-
- Returns
- -------
- name : object
- Usually a string
- """
- if isinstance(right, (ABCSeries, ABCIndex)):
- name = _maybe_match_name(left, right)
- else:
- name = left.name
- return name
-
-
-def _maybe_match_name(a, b):
- """
- Try to find a name to attach to the result of an operation between
- a and b. If only one of these has a `name` attribute, return that
- name. Otherwise return a consensus name if they match or None if
- they have different names.
-
- Parameters
- ----------
- a : object
- b : object
-
- Returns
- -------
- name : str or None
-
- See Also
- --------
- pandas.core.common.consensus_name_attr
- """
- a_has = hasattr(a, "name")
- b_has = hasattr(b, "name")
- if a_has and b_has:
- try:
- if a.name == b.name:
- return a.name
- elif is_matching_na(a.name, b.name):
- # e.g. both are np.nan
- return a.name
- else:
- return None
- except TypeError:
- # pd.NA
- if is_matching_na(a.name, b.name):
- return a.name
- return None
- except ValueError:
- # e.g. np.int64(1) vs (np.int64(1), np.int64(2))
- return None
- elif a_has:
- return a.name
- elif b_has:
- return b.name
- return None
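
The name-propagation rules implemented by get_op_result_name/_maybe_match_name are observable from the public API; a small sketch:

import pandas as pd

a = pd.Series([1, 2], name="x")
b = pd.Series([10, 20], name="x")
c = pd.Series([10, 20], name="y")

print((a + b).name)  # 'x'  -- both operands agree on the name, so it is kept
print((a + c).name)  # None -- names differ, so the result is unnamed
print((a + 1).name)  # 'x'  -- scalar on the right, the left name wins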
diff --git a/contrib/python/pandas/py3/pandas/core/ops/dispatch.py b/contrib/python/pandas/py3/pandas/core/ops/dispatch.py
deleted file mode 100644
index 2f500703ccf..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/dispatch.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""
-Functions for defining unary operations.
-"""
-from __future__ import annotations
-
-from typing import Any
-
-from pandas._typing import ArrayLike
-
-from pandas.core.dtypes.generic import ABCExtensionArray
-
-
-def should_extension_dispatch(left: ArrayLike, right: Any) -> bool:
- """
- Identify cases where Series operation should dispatch to ExtensionArray method.
-
- Parameters
- ----------
- left : np.ndarray or ExtensionArray
- right : object
-
- Returns
- -------
- bool
- """
- return isinstance(left, ABCExtensionArray) or isinstance(right, ABCExtensionArray)
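
A minimal illustration of what this dispatch rule means in practice: when either operand is an ExtensionArray, the operation is delegated to the array type itself, so e.g. nullable integer arrays keep their dtype and NA propagation:

import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")  # an ExtensionArray

res = arr + 1          # handled by the masked-array op, not plain numpy
print(res)             # [2, 3, <NA>]
print(res.dtype)       # Int64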
diff --git a/contrib/python/pandas/py3/pandas/core/ops/docstrings.py b/contrib/python/pandas/py3/pandas/core/ops/docstrings.py
deleted file mode 100644
index cdf1c120719..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/docstrings.py
+++ /dev/null
@@ -1,765 +0,0 @@
-"""
-Templating for ops docstrings
-"""
-from __future__ import annotations
-
-
-def make_flex_doc(op_name: str, typ: str) -> str:
- """
- Make the appropriate substitutions for the given operation and class-typ
- into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring
- to attach to a generated method.
-
- Parameters
- ----------
- op_name : str {'__add__', '__sub__', ... '__eq__', '__ne__', ...}
- typ : str {'series', 'dataframe'}
-
- Returns
- -------
- doc : str
- """
- op_name = op_name.replace("__", "")
- op_desc = _op_descriptions[op_name]
-
- op_desc_op = op_desc["op"]
- assert op_desc_op is not None # for mypy
- if op_name.startswith("r"):
- equiv = f"other {op_desc_op} {typ}"
- elif op_name == "divmod":
- equiv = f"{op_name}({typ}, other)"
- else:
- equiv = f"{typ} {op_desc_op} other"
-
- if typ == "series":
- base_doc = _flex_doc_SERIES
- if op_desc["reverse"]:
- base_doc += _see_also_reverse_SERIES.format(
- reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"]
- )
- doc_no_examples = base_doc.format(
- desc=op_desc["desc"],
- op_name=op_name,
- equiv=equiv,
- series_returns=op_desc["series_returns"],
- )
- ser_example = op_desc["series_examples"]
- if ser_example:
- doc = doc_no_examples + ser_example
- else:
- doc = doc_no_examples
- elif typ == "dataframe":
- base_doc = _flex_doc_FRAME
- doc = base_doc.format(
- desc=op_desc["desc"],
- op_name=op_name,
- equiv=equiv,
- reverse=op_desc["reverse"],
- )
- else:
- raise AssertionError("Invalid typ argument.")
- return doc
-
-
-_common_examples_algebra_SERIES = """
-Examples
---------
->>> a = pd.Series([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'])
->>> a
-a 1.0
-b 1.0
-c 1.0
-d NaN
-dtype: float64
->>> b = pd.Series([1, np.nan, 1, np.nan], index=['a', 'b', 'd', 'e'])
->>> b
-a 1.0
-b NaN
-d 1.0
-e NaN
-dtype: float64"""
-
-_common_examples_comparison_SERIES = """
-Examples
---------
->>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e'])
->>> a
-a 1.0
-b 1.0
-c 1.0
-d NaN
-e 1.0
-dtype: float64
->>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f'])
->>> b
-a 0.0
-b 1.0
-c 2.0
-d NaN
-f 1.0
-dtype: float64"""
-
-_add_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.add(b, fill_value=0)
-a 2.0
-b 1.0
-c 1.0
-d 1.0
-e NaN
-dtype: float64
-"""
-)
-
-_sub_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.subtract(b, fill_value=0)
-a 0.0
-b 1.0
-c 1.0
-d -1.0
-e NaN
-dtype: float64
-"""
-)
-
-_mul_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.multiply(b, fill_value=0)
-a 1.0
-b 0.0
-c 0.0
-d 0.0
-e NaN
-dtype: float64
-"""
-)
-
-_div_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.divide(b, fill_value=0)
-a 1.0
-b inf
-c inf
-d 0.0
-e NaN
-dtype: float64
-"""
-)
-
-_floordiv_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.floordiv(b, fill_value=0)
-a 1.0
-b inf
-c inf
-d 0.0
-e NaN
-dtype: float64
-"""
-)
-
-_divmod_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.divmod(b, fill_value=0)
-(a 1.0
- b NaN
- c NaN
- d 0.0
- e NaN
- dtype: float64,
- a 0.0
- b NaN
- c NaN
- d 0.0
- e NaN
- dtype: float64)
-"""
-)
-
-_mod_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.mod(b, fill_value=0)
-a 0.0
-b NaN
-c NaN
-d 0.0
-e NaN
-dtype: float64
-"""
-)
-_pow_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.pow(b, fill_value=0)
-a 1.0
-b 1.0
-c 1.0
-d 0.0
-e NaN
-dtype: float64
-"""
-)
-
-_ne_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.ne(b, fill_value=0)
-a False
-b True
-c True
-d True
-e True
-dtype: bool
-"""
-)
-
-_eq_example_SERIES = (
- _common_examples_algebra_SERIES
- + """
->>> a.eq(b, fill_value=0)
-a True
-b False
-c False
-d False
-e False
-dtype: bool
-"""
-)
-
-_lt_example_SERIES = (
- _common_examples_comparison_SERIES
- + """
->>> a.lt(b, fill_value=0)
-a False
-b False
-c True
-d False
-e False
-f True
-dtype: bool
-"""
-)
-
-_le_example_SERIES = (
- _common_examples_comparison_SERIES
- + """
->>> a.le(b, fill_value=0)
-a False
-b True
-c True
-d False
-e False
-f True
-dtype: bool
-"""
-)
-
-_gt_example_SERIES = (
- _common_examples_comparison_SERIES
- + """
->>> a.gt(b, fill_value=0)
-a True
-b False
-c False
-d False
-e True
-f False
-dtype: bool
-"""
-)
-
-_ge_example_SERIES = (
- _common_examples_comparison_SERIES
- + """
->>> a.ge(b, fill_value=0)
-a True
-b True
-c False
-d False
-e True
-f False
-dtype: bool
-"""
-)
-
-_returns_series = """Series\n The result of the operation."""
-
-_returns_tuple = """2-Tuple of Series\n The result of the operation."""
-
-_op_descriptions: dict[str, dict[str, str | None]] = {
- # Arithmetic Operators
- "add": {
- "op": "+",
- "desc": "Addition",
- "reverse": "radd",
- "series_examples": _add_example_SERIES,
- "series_returns": _returns_series,
- },
- "sub": {
- "op": "-",
- "desc": "Subtraction",
- "reverse": "rsub",
- "series_examples": _sub_example_SERIES,
- "series_returns": _returns_series,
- },
- "mul": {
- "op": "*",
- "desc": "Multiplication",
- "reverse": "rmul",
- "series_examples": _mul_example_SERIES,
- "series_returns": _returns_series,
- "df_examples": None,
- },
- "mod": {
- "op": "%",
- "desc": "Modulo",
- "reverse": "rmod",
- "series_examples": _mod_example_SERIES,
- "series_returns": _returns_series,
- },
- "pow": {
- "op": "**",
- "desc": "Exponential power",
- "reverse": "rpow",
- "series_examples": _pow_example_SERIES,
- "series_returns": _returns_series,
- "df_examples": None,
- },
- "truediv": {
- "op": "/",
- "desc": "Floating division",
- "reverse": "rtruediv",
- "series_examples": _div_example_SERIES,
- "series_returns": _returns_series,
- "df_examples": None,
- },
- "floordiv": {
- "op": "//",
- "desc": "Integer division",
- "reverse": "rfloordiv",
- "series_examples": _floordiv_example_SERIES,
- "series_returns": _returns_series,
- "df_examples": None,
- },
- "divmod": {
- "op": "divmod",
- "desc": "Integer division and modulo",
- "reverse": "rdivmod",
- "series_examples": _divmod_example_SERIES,
- "series_returns": _returns_tuple,
- "df_examples": None,
- },
- # Comparison Operators
- "eq": {
- "op": "==",
- "desc": "Equal to",
- "reverse": None,
- "series_examples": _eq_example_SERIES,
- "series_returns": _returns_series,
- },
- "ne": {
- "op": "!=",
- "desc": "Not equal to",
- "reverse": None,
- "series_examples": _ne_example_SERIES,
- "series_returns": _returns_series,
- },
- "lt": {
- "op": "<",
- "desc": "Less than",
- "reverse": None,
- "series_examples": _lt_example_SERIES,
- "series_returns": _returns_series,
- },
- "le": {
- "op": "<=",
- "desc": "Less than or equal to",
- "reverse": None,
- "series_examples": _le_example_SERIES,
- "series_returns": _returns_series,
- },
- "gt": {
- "op": ">",
- "desc": "Greater than",
- "reverse": None,
- "series_examples": _gt_example_SERIES,
- "series_returns": _returns_series,
- },
- "ge": {
- "op": ">=",
- "desc": "Greater than or equal to",
- "reverse": None,
- "series_examples": _ge_example_SERIES,
- "series_returns": _returns_series,
- },
-}
-
-_py_num_ref = """see
- `Python documentation
- <https://docs.python.org/3/reference/datamodel.html#emulating-numeric-types>`_
- for more details"""
-_op_names = list(_op_descriptions.keys())
-for key in _op_names:
- reverse_op = _op_descriptions[key]["reverse"]
- if reverse_op is not None:
- _op_descriptions[reverse_op] = _op_descriptions[key].copy()
- _op_descriptions[reverse_op]["reverse"] = key
- _op_descriptions[key][
- "see_also_desc"
- ] = f"Reverse of the {_op_descriptions[key]['desc']} operator, {_py_num_ref}"
- _op_descriptions[reverse_op][
- "see_also_desc"
- ] = f"Element-wise {_op_descriptions[key]['desc']}, {_py_num_ref}"
-
-_flex_doc_SERIES = """
-Return {desc} of series and other, element-wise (binary operator `{op_name}`).
-
-Equivalent to ``{equiv}``, but with support to substitute a fill_value for
-missing data in either one of the inputs.
-
-Parameters
-----------
-other : Series or scalar value
-level : int or name
- Broadcast across a level, matching Index values on the
- passed MultiIndex level.
-fill_value : None or float value, default None (NaN)
- Fill existing missing (NaN) values, and any new element needed for
- successful Series alignment, with this value before computation.
- If data in both corresponding Series locations is missing
- the result of filling (at that location) will be missing.
-axis : {{0 or 'index'}}
- Unused. Parameter needed for compatibility with DataFrame.
-
-Returns
--------
-{series_returns}
-"""
-
-_see_also_reverse_SERIES = """
-See Also
---------
-Series.{reverse} : {see_also_desc}.
-"""
-
-_flex_doc_FRAME = """
-Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
-
-Equivalent to ``{equiv}``, but with support to substitute a fill_value
-for missing data in one of the inputs. With reverse version, `{reverse}`.
-
-Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to
-arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`.
-
-Parameters
-----------
-other : scalar, sequence, Series, dict or DataFrame
- Any single or multiple element data structure, or list-like object.
-axis : {{0 or 'index', 1 or 'columns'}}
- Whether to compare by the index (0 or 'index') or columns.
- (1 or 'columns'). For Series input, axis to match Series index on.
-level : int or label
- Broadcast across a level, matching Index values on the
- passed MultiIndex level.
-fill_value : float or None, default None
- Fill existing missing (NaN) values, and any new element needed for
- successful DataFrame alignment, with this value before computation.
- If data in both corresponding DataFrame locations is missing
- the result will be missing.
-
-Returns
--------
-DataFrame
- Result of the arithmetic operation.
-
-See Also
---------
-DataFrame.add : Add DataFrames.
-DataFrame.sub : Subtract DataFrames.
-DataFrame.mul : Multiply DataFrames.
-DataFrame.div : Divide DataFrames (float division).
-DataFrame.truediv : Divide DataFrames (float division).
-DataFrame.floordiv : Divide DataFrames (integer division).
-DataFrame.mod : Calculate modulo (remainder after division).
-DataFrame.pow : Calculate exponential power.
-
-Notes
------
-Mismatched indices will be unioned together.
-
-Examples
---------
->>> df = pd.DataFrame({{'angles': [0, 3, 4],
-... 'degrees': [360, 180, 360]}},
-... index=['circle', 'triangle', 'rectangle'])
->>> df
- angles degrees
-circle 0 360
-triangle 3 180
-rectangle 4 360
-
-Add a scalar with the operator version, which returns the same
-results.
-
->>> df + 1
- angles degrees
-circle 1 361
-triangle 4 181
-rectangle 5 361
-
->>> df.add(1)
- angles degrees
-circle 1 361
-triangle 4 181
-rectangle 5 361
-
-Divide by a constant with the reverse version.
-
->>> df.div(10)
- angles degrees
-circle 0.0 36.0
-triangle 0.3 18.0
-rectangle 0.4 36.0
-
->>> df.rdiv(10)
- angles degrees
-circle inf 0.027778
-triangle 3.333333 0.055556
-rectangle 2.500000 0.027778
-
-Subtract a list and Series by axis with operator version.
-
->>> df - [1, 2]
- angles degrees
-circle -1 358
-triangle 2 178
-rectangle 3 358
-
->>> df.sub([1, 2], axis='columns')
- angles degrees
-circle -1 358
-triangle 2 178
-rectangle 3 358
-
->>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']),
-... axis='index')
- angles degrees
-circle -1 359
-triangle 2 179
-rectangle 3 359
-
-Multiply a dictionary by axis.
-
->>> df.mul({{'angles': 0, 'degrees': 2}})
- angles degrees
-circle 0 720
-triangle 0 360
-rectangle 0 720
-
->>> df.mul({{'circle': 0, 'triangle': 2, 'rectangle': 3}}, axis='index')
- angles degrees
-circle 0 0
-triangle 6 360
-rectangle 12 1080
-
-Multiply a DataFrame of different shape with operator version.
-
->>> other = pd.DataFrame({{'angles': [0, 3, 4]}},
-... index=['circle', 'triangle', 'rectangle'])
->>> other
- angles
-circle 0
-triangle 3
-rectangle 4
-
->>> df * other
- angles degrees
-circle 0 NaN
-triangle 9 NaN
-rectangle 16 NaN
-
->>> df.mul(other, fill_value=0)
- angles degrees
-circle 0 0.0
-triangle 9 0.0
-rectangle 16 0.0
-
-Divide by a MultiIndex by level.
-
->>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6],
-... 'degrees': [360, 180, 360, 360, 540, 720]}},
-... index=[['A', 'A', 'A', 'B', 'B', 'B'],
-... ['circle', 'triangle', 'rectangle',
-... 'square', 'pentagon', 'hexagon']])
->>> df_multindex
- angles degrees
-A circle 0 360
- triangle 3 180
- rectangle 4 360
-B square 4 360
- pentagon 5 540
- hexagon 6 720
-
->>> df.div(df_multindex, level=1, fill_value=0)
- angles degrees
-A circle NaN 1.0
- triangle 1.0 1.0
- rectangle 1.0 1.0
-B square 0.0 0.0
- pentagon 0.0 0.0
- hexagon 0.0 0.0
-"""
-
-_flex_comp_doc_FRAME = """
-Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`).
-
-Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
-operators.
-
-Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis
-(rows or columns) and level for comparison.
-
-Parameters
-----------
-other : scalar, sequence, Series, or DataFrame
- Any single or multiple element data structure, or list-like object.
-axis : {{0 or 'index', 1 or 'columns'}}, default 'columns'
- Whether to compare by the index (0 or 'index') or columns
- (1 or 'columns').
-level : int or label
- Broadcast across a level, matching Index values on the passed
- MultiIndex level.
-
-Returns
--------
-DataFrame of bool
- Result of the comparison.
-
-See Also
---------
-DataFrame.eq : Compare DataFrames for equality elementwise.
-DataFrame.ne : Compare DataFrames for inequality elementwise.
-DataFrame.le : Compare DataFrames for less than inequality
- or equality elementwise.
-DataFrame.lt : Compare DataFrames for strictly less than
- inequality elementwise.
-DataFrame.ge : Compare DataFrames for greater than inequality
- or equality elementwise.
-DataFrame.gt : Compare DataFrames for strictly greater than
- inequality elementwise.
-
-Notes
------
-Mismatched indices will be unioned together.
-`NaN` values are considered different (i.e. `NaN` != `NaN`).
-
-Examples
---------
->>> df = pd.DataFrame({{'cost': [250, 150, 100],
-... 'revenue': [100, 250, 300]}},
-... index=['A', 'B', 'C'])
->>> df
- cost revenue
-A 250 100
-B 150 250
-C 100 300
-
-Comparison with a scalar, using either the operator or method:
-
->>> df == 100
- cost revenue
-A False True
-B False False
-C True False
-
->>> df.eq(100)
- cost revenue
-A False True
-B False False
-C True False
-
-When `other` is a :class:`Series`, the columns of a DataFrame are aligned
-with the index of `other` and broadcast:
-
->>> df != pd.Series([100, 250], index=["cost", "revenue"])
- cost revenue
-A True True
-B True False
-C False True
-
-Use the method to control the broadcast axis:
-
->>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis='index')
- cost revenue
-A True False
-B True True
-C True True
-D True True
-
-When comparing to an arbitrary sequence, the number of columns must
-match the number of elements in `other`:
-
->>> df == [250, 100]
- cost revenue
-A True True
-B False False
-C False False
-
-Use the method to control the axis:
-
->>> df.eq([250, 250, 100], axis='index')
- cost revenue
-A True False
-B False True
-C True False
-
-Compare to a DataFrame of different shape.
-
->>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}},
-... index=['A', 'B', 'C', 'D'])
->>> other
- revenue
-A 300
-B 250
-C 100
-D 150
-
->>> df.gt(other)
- cost revenue
-A False False
-B False False
-C False True
-D False False
-
-Compare to a MultiIndex by level.
-
->>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220],
-... 'revenue': [100, 250, 300, 200, 175, 225]}},
-... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'],
-... ['A', 'B', 'C', 'A', 'B', 'C']])
->>> df_multindex
- cost revenue
-Q1 A 250 100
- B 150 250
- C 100 300
-Q2 A 150 200
- B 300 175
- C 220 225
-
->>> df.le(df_multindex, level=1)
- cost revenue
-Q1 A True True
- B True True
- C True True
-Q2 A False True
- B True False
- C True False
-"""
diff --git a/contrib/python/pandas/py3/pandas/core/ops/invalid.py b/contrib/python/pandas/py3/pandas/core/ops/invalid.py
deleted file mode 100644
index eb27cf74501..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/invalid.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""
-Templates for invalid operations.
-"""
-from __future__ import annotations
-
-import operator
-
-import numpy as np
-
-
-def invalid_comparison(left, right, op) -> np.ndarray:
- """
- If a comparison has mismatched types and is not necessarily meaningful,
- follow python3 conventions by:
-
- - returning all-False for equality
- - returning all-True for inequality
- - raising TypeError otherwise
-
- Parameters
- ----------
- left : array-like
- right : scalar, array-like
- op : operator.{eq, ne, lt, le, gt, ge}
-
- Raises
- ------
- TypeError : on inequality comparisons
- """
- if op is operator.eq:
- res_values = np.zeros(left.shape, dtype=bool)
- elif op is operator.ne:
- res_values = np.ones(left.shape, dtype=bool)
- else:
- typ = type(right).__name__
- raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
- return res_values
-
-
-def make_invalid_op(name: str):
- """
- Return a binary method that always raises a TypeError.
-
- Parameters
- ----------
- name : str
-
- Returns
- -------
- invalid_op : function
- """
-
- def invalid_op(self, other=None):
- typ = type(self).__name__
- raise TypeError(f"cannot perform {name} with this index type: {typ}")
-
- invalid_op.__name__ = name
- return invalid_op
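
The conventions encoded by invalid_comparison show up directly in the public API; a short sketch:

import pandas as pd

s = pd.Series([1, 2, 3])

print(s == "a")   # all False: mismatched types are simply "not equal"
print(s != "a")   # all True

try:
    s < "a"       # ordering against an incompatible type raises
except TypeError as exc:
    print(exc)    # Invalid comparison between dtype=int64 and str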
diff --git a/contrib/python/pandas/py3/pandas/core/ops/mask_ops.py b/contrib/python/pandas/py3/pandas/core/ops/mask_ops.py
deleted file mode 100644
index adc1f63c568..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/mask_ops.py
+++ /dev/null
@@ -1,189 +0,0 @@
-"""
-Ops for masked arrays.
-"""
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- missing as libmissing,
-)
-
-
-def kleene_or(
- left: bool | np.ndarray | libmissing.NAType,
- right: bool | np.ndarray | libmissing.NAType,
- left_mask: np.ndarray | None,
- right_mask: np.ndarray | None,
-):
- """
- Boolean ``or`` using Kleene logic.
-
- Values are NA where we have ``NA | NA`` or ``NA | False``.
- ``NA | True`` is considered True.
-
- Parameters
- ----------
- left, right : ndarray, NA, or bool
- The values of the array.
- left_mask, right_mask : ndarray, optional
- The masks. Only one of these may be None, which implies that
- the associated `left` or `right` value is a scalar.
-
- Returns
- -------
- result, mask: ndarray[bool]
- The result of the logical or, and the new mask.
- """
- # To reduce the number of cases, we ensure that `left` & `left_mask`
- # always come from an array, not a scalar. This is safe, since
- # A | B == B | A
- if left_mask is None:
- return kleene_or(right, left, right_mask, left_mask)
-
- if not isinstance(left, np.ndarray):
- raise TypeError("Either `left` or `right` need to be a np.ndarray.")
-
- raise_for_nan(right, method="or")
-
- if right is libmissing.NA:
- result = left.copy()
- else:
- result = left | right
-
- if right_mask is not None:
- # output is unknown where (False | NA), (NA | False), (NA | NA)
- left_false = ~(left | left_mask)
- right_false = ~(right | right_mask)
- mask = (
- (left_false & right_mask)
- | (right_false & left_mask)
- | (left_mask & right_mask)
- )
- else:
- if right is True:
- mask = np.zeros_like(left_mask)
- elif right is libmissing.NA:
- mask = (~left & ~left_mask) | left_mask
- else:
- # False
- mask = left_mask.copy()
-
- return result, mask
-
-
-def kleene_xor(
- left: bool | np.ndarray | libmissing.NAType,
- right: bool | np.ndarray | libmissing.NAType,
- left_mask: np.ndarray | None,
- right_mask: np.ndarray | None,
-):
- """
- Boolean ``xor`` using Kleene logic.
-
- This is the same as ``or``, with the following adjustments
-
- * True, True -> False
- * True, NA -> NA
-
- Parameters
- ----------
- left, right : ndarray, NA, or bool
- The values of the array.
- left_mask, right_mask : ndarray, optional
- The masks. Only one of these may be None, which implies that
- the associated `left` or `right` value is a scalar.
-
- Returns
- -------
- result, mask: ndarray[bool]
- The result of the logical xor, and the new mask.
- """
- # To reduce the number of cases, we ensure that `left` & `left_mask`
- # always come from an array, not a scalar. This is safe, since
- # A ^ B == B ^ A
- if left_mask is None:
- return kleene_xor(right, left, right_mask, left_mask)
-
- if not isinstance(left, np.ndarray):
- raise TypeError("Either `left` or `right` need to be a np.ndarray.")
-
- raise_for_nan(right, method="xor")
- if right is libmissing.NA:
- result = np.zeros_like(left)
- else:
- result = left ^ right
-
- if right_mask is None:
- if right is libmissing.NA:
- mask = np.ones_like(left_mask)
- else:
- mask = left_mask.copy()
- else:
- mask = left_mask | right_mask
-
- return result, mask
-
-
-def kleene_and(
- left: bool | libmissing.NAType | np.ndarray,
- right: bool | libmissing.NAType | np.ndarray,
- left_mask: np.ndarray | None,
- right_mask: np.ndarray | None,
-):
- """
- Boolean ``and`` using Kleene logic.
-
- Values are ``NA`` for ``NA & NA`` or ``True & NA``.
-
- Parameters
- ----------
- left, right : ndarray, NA, or bool
- The values of the array.
- left_mask, right_mask : ndarray, optional
- The masks. Only one of these may be None, which implies that
- the associated `left` or `right` value is a scalar.
-
- Returns
- -------
- result, mask: ndarray[bool]
- The result of the logical and, and the new mask.
- """
- # To reduce the number of cases, we ensure that `left` & `left_mask`
- # always come from an array, not a scalar. This is safe, since
- # A & B == B & A
- if left_mask is None:
- return kleene_and(right, left, right_mask, left_mask)
-
- if not isinstance(left, np.ndarray):
- raise TypeError("Either `left` or `right` need to be a np.ndarray.")
- raise_for_nan(right, method="and")
-
- if right is libmissing.NA:
- result = np.zeros_like(left)
- else:
- result = left & right
-
- if right_mask is None:
- # Scalar `right`
- if right is libmissing.NA:
- mask = (left & ~left_mask) | left_mask
-
- else:
- mask = left_mask.copy()
- if right is False:
- # unmask everything
- mask[:] = False
- else:
- # unmask where either left or right is False
- left_false = ~(left | left_mask)
- right_false = ~(right | right_mask)
- mask = (left_mask & ~right_false) | (right_mask & ~left_false)
-
- return result, mask
-
-
-def raise_for_nan(value, method: str) -> None:
- if lib.is_float(value) and np.isnan(value):
- raise ValueError(f"Cannot perform logical '{method}' with floating NaN")
diff --git a/contrib/python/pandas/py3/pandas/core/ops/methods.py b/contrib/python/pandas/py3/pandas/core/ops/methods.py
deleted file mode 100644
index be7c1205305..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/methods.py
+++ /dev/null
@@ -1,124 +0,0 @@
-"""
-Functions to generate methods and pin them to the appropriate classes.
-"""
-from __future__ import annotations
-
-import operator
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-from pandas.core.ops import roperator
-
-
-def _get_method_wrappers(cls):
- """
- Find the appropriate operation-wrappers to use when defining flex/special
- arithmetic, boolean, and comparison operations with the given class.
-
- Parameters
- ----------
- cls : class
-
- Returns
- -------
- arith_flex : function or None
- comp_flex : function or None
- """
- # TODO: make these non-runtime imports once the relevant functions
- # are no longer in __init__
- from pandas.core.ops import (
- flex_arith_method_FRAME,
- flex_comp_method_FRAME,
- flex_method_SERIES,
- )
-
- if issubclass(cls, ABCSeries):
- # Just Series
- arith_flex = flex_method_SERIES
- comp_flex = flex_method_SERIES
- elif issubclass(cls, ABCDataFrame):
- arith_flex = flex_arith_method_FRAME
- comp_flex = flex_comp_method_FRAME
- return arith_flex, comp_flex
-
-
-def add_flex_arithmetic_methods(cls) -> None:
- """
- Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``)
- to the class.
-
- Parameters
- ----------
- cls : class
- flex methods will be defined and pinned to this class
- """
- flex_arith_method, flex_comp_method = _get_method_wrappers(cls)
- new_methods = _create_methods(cls, flex_arith_method, flex_comp_method)
- new_methods.update(
- {
- "multiply": new_methods["mul"],
- "subtract": new_methods["sub"],
- "divide": new_methods["div"],
- }
- )
- # opt out of bool flex methods for now
- assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_"))
-
- _add_methods(cls, new_methods=new_methods)
-
-
-def _create_methods(cls, arith_method, comp_method):
- # creates actual flex methods based upon arithmetic, and comp method
- # constructors.
-
- have_divmod = issubclass(cls, ABCSeries)
- # divmod is available for Series
-
- new_methods = {}
-
- new_methods.update(
- {
- "add": arith_method(operator.add),
- "radd": arith_method(roperator.radd),
- "sub": arith_method(operator.sub),
- "mul": arith_method(operator.mul),
- "truediv": arith_method(operator.truediv),
- "floordiv": arith_method(operator.floordiv),
- "mod": arith_method(operator.mod),
- "pow": arith_method(operator.pow),
- "rmul": arith_method(roperator.rmul),
- "rsub": arith_method(roperator.rsub),
- "rtruediv": arith_method(roperator.rtruediv),
- "rfloordiv": arith_method(roperator.rfloordiv),
- "rpow": arith_method(roperator.rpow),
- "rmod": arith_method(roperator.rmod),
- }
- )
- new_methods["div"] = new_methods["truediv"]
- new_methods["rdiv"] = new_methods["rtruediv"]
- if have_divmod:
- # divmod doesn't have an op that is supported by numexpr
- new_methods["divmod"] = arith_method(divmod)
- new_methods["rdivmod"] = arith_method(roperator.rdivmod)
-
- new_methods.update(
- {
- "eq": comp_method(operator.eq),
- "ne": comp_method(operator.ne),
- "lt": comp_method(operator.lt),
- "gt": comp_method(operator.gt),
- "le": comp_method(operator.le),
- "ge": comp_method(operator.ge),
- }
- )
-
- new_methods = {k.strip("_"): v for k, v in new_methods.items()}
- return new_methods
-
-
-def _add_methods(cls, new_methods) -> None:
- for name, method in new_methods.items():
- setattr(cls, name, method)
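
Since _create_methods pins ``div``/``rdiv`` to the same callables as ``truediv``/``rtruediv`` and adds the long-form aliases, the generated methods are interchangeable from the public API; a quick check:

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])

print(s.multiply(2).equals(s.mul(2)))     # True -- 'multiply' aliases 'mul'
print(s.subtract(1).equals(s.sub(1)))     # True -- 'subtract' aliases 'sub'
print(s.divide(2).equals(s.truediv(2)))   # True -- 'divide'/'div' alias 'truediv'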
diff --git a/contrib/python/pandas/py3/pandas/core/ops/missing.py b/contrib/python/pandas/py3/pandas/core/ops/missing.py
deleted file mode 100644
index 3ba611c3bd0..00000000000
--- a/contrib/python/pandas/py3/pandas/core/ops/missing.py
+++ /dev/null
@@ -1,180 +0,0 @@
-"""
-Missing data handling for arithmetic operations.
-
-In particular, pandas conventions regarding division by zero differ
-from numpy in the following ways:
- 1) np.array([-1, 0, 1], dtype=dtype1) // np.array([0, 0, 0], dtype=dtype2)
- gives [nan, nan, nan] for most dtype combinations, and [0, 0, 0] for
- the remaining pairs
- (the remaining being dtype1==dtype2==intN and dtype1==dtype2==uintN).
-
- pandas convention is to return [-inf, nan, inf] for all dtype
- combinations.
-
- Note: the numpy behavior described here is py3-specific.
-
- 2) np.array([-1, 0, 1], dtype=dtype1) % np.array([0, 0, 0], dtype=dtype2)
- gives precisely the same results as the // operation.
-
- pandas convention is to return [nan, nan, nan] for all dtype
- combinations.
-
- 3) divmod behavior consistent with 1) and 2).
-"""
-from __future__ import annotations
-
-import operator
-
-import numpy as np
-
-from pandas.core.dtypes.common import (
- is_float_dtype,
- is_integer_dtype,
- is_scalar,
-)
-
-from pandas.core.ops import roperator
-
-
-def _fill_zeros(result, x, y):
- """
- If this is a reversed op, then flip x,y
-
- If we have an integer value (or array in y)
- and we have 0's, fill them with np.nan,
- return the result.
-
- Mask the nan's from x.
- """
- if is_float_dtype(result.dtype):
- return result
-
- is_variable_type = hasattr(y, "dtype")
- is_scalar_type = is_scalar(y)
-
- if not is_variable_type and not is_scalar_type:
- return result
-
- if is_scalar_type:
- y = np.array(y)
-
- if is_integer_dtype(y.dtype):
- ymask = y == 0
- if ymask.any():
- # GH#7325, mask and nans must be broadcastable
- mask = ymask & ~np.isnan(result)
-
- # GH#9308 doing ravel on result and mask can improve putmask perf,
- # but can also make unwanted copies.
- result = result.astype("float64", copy=False)
-
- np.putmask(result, mask, np.nan)
-
- return result
-
-
-def mask_zero_div_zero(x, y, result: np.ndarray) -> np.ndarray:
- """
- Set results of 0 // 0 to np.nan, regardless of the dtypes
- of the numerator or the denominator.
-
- Parameters
- ----------
- x : ndarray
- y : ndarray
- result : ndarray
-
- Returns
- -------
- ndarray
- The filled result.
-
- Examples
- --------
- >>> x = np.array([1, 0, -1], dtype=np.int64)
- >>> x
- array([ 1, 0, -1])
- >>> y = 0 # int 0; numpy behavior is different with float
- >>> result = x // y
- >>> result # raw numpy result does not fill division by zero
- array([0, 0, 0])
- >>> mask_zero_div_zero(x, y, result)
- array([ inf, nan, -inf])
- """
-
- if not hasattr(y, "dtype"):
- # e.g. scalar, tuple
- y = np.array(y)
- if not hasattr(x, "dtype"):
- # e.g scalar, tuple
- x = np.array(x)
-
- zmask = y == 0
-
- if zmask.any():
- # Flip sign if necessary for -0.0
- zneg_mask = zmask & np.signbit(y)
- zpos_mask = zmask & ~zneg_mask
-
- x_lt0 = x < 0
- x_gt0 = x > 0
- nan_mask = zmask & (x == 0)
- with np.errstate(invalid="ignore"):
- neginf_mask = (zpos_mask & x_lt0) | (zneg_mask & x_gt0)
- posinf_mask = (zpos_mask & x_gt0) | (zneg_mask & x_lt0)
-
- if nan_mask.any() or neginf_mask.any() or posinf_mask.any():
- # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN
- result = result.astype("float64", copy=False)
-
- result[nan_mask] = np.nan
- result[posinf_mask] = np.inf
- result[neginf_mask] = -np.inf
-
- return result
-
-
-def dispatch_fill_zeros(op, left, right, result):
- """
- Call _fill_zeros with the appropriate fill value depending on the operation,
- with special logic for divmod and rdivmod.
-
- Parameters
- ----------
- op : function (operator.add, operator.div, ...)
- left : object (np.ndarray for non-reversed ops)
- right : object (np.ndarray for reversed ops)
- result : ndarray
-
- Returns
- -------
- result : np.ndarray
-
- Notes
- -----
- For divmod and rdivmod, the `result` parameter and returned `result`
- is a 2-tuple of ndarray objects.
- """
- if op is divmod:
- result = (
- mask_zero_div_zero(left, right, result[0]),
- _fill_zeros(result[1], left, right),
- )
- elif op is roperator.rdivmod:
- result = (
- mask_zero_div_zero(right, left, result[0]),
- _fill_zeros(result[1], right, left),
- )
- elif op is operator.floordiv:
- # Note: no need to do this for truediv; in py3 numpy behaves the way
- # we want.
- result = mask_zero_div_zero(left, right, result)
- elif op is roperator.rfloordiv:
- # Note: no need to do this for rtruediv; in py3 numpy behaves the way
- # we want.
- result = mask_zero_div_zero(right, left, result)
- elif op is operator.mod:
- result = _fill_zeros(result, left, right)
- elif op is roperator.rmod:
- result = _fill_zeros(result, right, left)
- return result
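
The division-by-zero conventions described at the top of this module are easy to see from the public API (raw numpy integer division by zero yields 0 with a warning, while pandas fills ±inf/NaN):

import numpy as np
import pandas as pd

x = np.array([-1, 0, 1], dtype=np.int64)

with np.errstate(all="ignore"):
    print(x // np.zeros(3, dtype=np.int64))   # numpy: [0 0 0]

s = pd.Series(x)
print(s // 0)   # pandas: [-inf, nan, inf]
print(s % 0)    # pandas: [nan, nan, nan]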
diff --git a/contrib/python/pandas/py3/pandas/core/resample.py b/contrib/python/pandas/py3/pandas/core/resample.py
deleted file mode 100644
index 546785fbf6f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/resample.py
+++ /dev/null
@@ -1,2302 +0,0 @@
-from __future__ import annotations
-
-import copy
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- Literal,
- cast,
- final,
- no_type_check,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.tslibs import (
- BaseOffset,
- IncompatibleFrequency,
- NaT,
- Period,
- Timedelta,
- Timestamp,
- to_offset,
-)
-from pandas._typing import (
- AnyArrayLike,
- Axis,
- AxisInt,
- Frequency,
- IndexLabel,
- NDFrameT,
- QuantileInterpolation,
- T,
- TimedeltaConvertibleTypes,
- TimeGrouperOrigin,
- TimestampConvertibleTypes,
- npt,
-)
-from pandas.compat.numpy import function as nv
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import (
- Appender,
- Substitution,
- doc,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-import pandas.core.algorithms as algos
-from pandas.core.apply import ResamplerWindowApply
-from pandas.core.base import PandasObject
-import pandas.core.common as com
-from pandas.core.generic import (
- NDFrame,
- _shared_docs,
-)
-from pandas.core.groupby.generic import SeriesGroupBy
-from pandas.core.groupby.groupby import (
- BaseGroupBy,
- GroupBy,
- _pipe_template,
- get_groupby,
-)
-from pandas.core.groupby.grouper import Grouper
-from pandas.core.groupby.ops import BinGrouper
-from pandas.core.indexes.datetimes import (
- DatetimeIndex,
- date_range,
-)
-from pandas.core.indexes.period import (
- PeriodIndex,
- period_range,
-)
-from pandas.core.indexes.timedeltas import (
- TimedeltaIndex,
- timedelta_range,
-)
-
-from pandas.tseries.frequencies import (
- is_subperiod,
- is_superperiod,
-)
-from pandas.tseries.offsets import (
- Day,
- Tick,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
-
-_shared_docs_kwargs: dict[str, str] = {}
-
-
-class Resampler(BaseGroupBy, PandasObject):
- """
- Class for resampling datetimelike data, a groupby-like operation.
- See aggregate, transform, and apply functions on this object.
-
- It's easiest to use obj.resample(...) to construct a Resampler.
-
- Parameters
- ----------
- obj : Series or DataFrame
- groupby : TimeGrouper
- axis : int, default 0
- kind : str or None
- 'period', 'timestamp' to override default index treatment
-
- Returns
- -------
- a Resampler of the appropriate type
-
- Notes
- -----
- After resampling, see aggregate, apply, and transform functions.
- """
-
- grouper: BinGrouper
- _timegrouper: TimeGrouper
- binner: DatetimeIndex | TimedeltaIndex | PeriodIndex # depends on subclass
- exclusions: frozenset[Hashable] = frozenset() # for SelectionMixin compat
- _internal_names_set = set({"obj", "ax", "_indexer"})
-
- # to the groupby descriptor
- _attributes = [
- "freq",
- "axis",
- "closed",
- "label",
- "convention",
- "kind",
- "origin",
- "offset",
- ]
-
- def __init__(
- self,
- obj: NDFrame,
- timegrouper: TimeGrouper,
- axis: Axis = 0,
- kind=None,
- *,
- gpr_index: Index,
- group_keys: bool = False,
- selection=None,
- ) -> None:
- self._timegrouper = timegrouper
- self.keys = None
- self.sort = True
- self.axis = obj._get_axis_number(axis)
- self.kind = kind
- self.group_keys = group_keys
- self.as_index = True
-
- self.obj, self.ax, self._indexer = self._timegrouper._set_grouper(
- self._convert_obj(obj), sort=True, gpr_index=gpr_index
- )
- self.binner, self.grouper = self._get_binner()
- self._selection = selection
- if self._timegrouper.key is not None:
- self.exclusions = frozenset([self._timegrouper.key])
- else:
- self.exclusions = frozenset()
-
- def __str__(self) -> str:
- """
- Provide a nice str repr of our resampler object.
- """
- attrs = (
- f"{k}={getattr(self._timegrouper, k)}"
- for k in self._attributes
- if getattr(self._timegrouper, k, None) is not None
- )
- return f"{type(self).__name__} [{', '.join(attrs)}]"
-
- def __getattr__(self, attr: str):
- if attr in self._internal_names_set:
- return object.__getattribute__(self, attr)
- if attr in self._attributes:
- return getattr(self._timegrouper, attr)
- if attr in self.obj:
- return self[attr]
-
- return object.__getattribute__(self, attr)
-
- @property
- def _from_selection(self) -> bool:
- """
- Is the resampling from a DataFrame column or MultiIndex level.
- """
- # upsampling and PeriodIndex resampling do not work
- # with selection, this state used to catch and raise an error
- return self._timegrouper is not None and (
- self._timegrouper.key is not None or self._timegrouper.level is not None
- )
-
- def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
- """
- Provide any conversions for the object in order to correctly handle.
-
- Parameters
- ----------
- obj : Series or DataFrame
-
- Returns
- -------
- Series or DataFrame
- """
- return obj._consolidate()
-
- def _get_binner_for_time(self):
- raise AbstractMethodError(self)
-
- @final
- def _get_binner(self):
- """
- Create the BinGrouper, assume that self.set_grouper(obj)
- has already been called.
- """
- binner, bins, binlabels = self._get_binner_for_time()
- assert len(bins) == len(binlabels)
- bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer)
- return binner, bin_grouper
-
- @Substitution(
- klass="Resampler",
- examples="""
- >>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
- ... index=pd.date_range('2012-08-02', periods=4))
- >>> df
- A
- 2012-08-02 1
- 2012-08-03 2
- 2012-08-04 3
- 2012-08-05 4
-
- To get the difference between each 2-day period's maximum and minimum
- value in one pass, you can do
-
- >>> df.resample('2D').pipe(lambda x: x.max() - x.min())
- A
- 2012-08-02 1
- 2012-08-04 1""",
- )
- @Appender(_pipe_template)
- def pipe(
- self,
- func: Callable[..., T] | tuple[Callable[..., T], str],
- *args,
- **kwargs,
- ) -> T:
- return super().pipe(func, *args, **kwargs)
-
- _agg_see_also_doc = dedent(
- """
- See Also
- --------
- DataFrame.groupby.aggregate : Aggregate using callable, string, dict,
- or list of string/callables.
- DataFrame.resample.transform : Transforms the Series on each group
- based on the given function.
- DataFrame.aggregate: Aggregate using one or more
- operations over the specified axis.
- """
- )
-
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4, 5],
- ... index=pd.date_range('20130101', periods=5, freq='s'))
- >>> s
- 2013-01-01 00:00:00 1
- 2013-01-01 00:00:01 2
- 2013-01-01 00:00:02 3
- 2013-01-01 00:00:03 4
- 2013-01-01 00:00:04 5
- Freq: S, dtype: int64
-
- >>> r = s.resample('2s')
-
- >>> r.agg(np.sum)
- 2013-01-01 00:00:00 3
- 2013-01-01 00:00:02 7
- 2013-01-01 00:00:04 5
- Freq: 2S, dtype: int64
-
- >>> r.agg(['sum', 'mean', 'max'])
- sum mean max
- 2013-01-01 00:00:00 3 1.5 2
- 2013-01-01 00:00:02 7 3.5 4
- 2013-01-01 00:00:04 5 5.0 5
-
- >>> r.agg({'result': lambda x: x.mean() / x.std(),
- ... 'total': np.sum})
- result total
- 2013-01-01 00:00:00 2.121320 3
- 2013-01-01 00:00:02 4.949747 7
- 2013-01-01 00:00:04 NaN 5
-
- >>> r.agg(average="mean", total="sum")
- average total
- 2013-01-01 00:00:00 1.5 3
- 2013-01-01 00:00:02 3.5 7
- 2013-01-01 00:00:04 5.0 5
- """
- )
-
- @doc(
- _shared_docs["aggregate"],
- see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- klass="DataFrame",
- axis="",
- )
- def aggregate(self, func=None, *args, **kwargs):
- result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
- if result is None:
- how = func
- result = self._groupby_and_aggregate(how, *args, **kwargs)
-
- return result
-
- agg = aggregate
- apply = aggregate
-
- def transform(self, arg, *args, **kwargs):
- """
- Call function producing a like-indexed Series on each group.
-
- Return a Series with the transformed values.
-
- Parameters
- ----------
- arg : function
- To apply to each group. Should return a Series with the same index.
-
- Returns
- -------
- Series
-
- Examples
- --------
- >>> s = pd.Series([1, 2],
- ... index=pd.date_range('20180101',
- ... periods=2,
- ... freq='1h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- Freq: H, dtype: int64
-
- >>> resampled = s.resample('15min')
- >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
- 2018-01-01 00:00:00 NaN
- 2018-01-01 01:00:00 NaN
- Freq: H, dtype: float64
- """
- return self._selected_obj.groupby(self._timegrouper).transform(
- arg, *args, **kwargs
- )
-
- def _downsample(self, f, **kwargs):
- raise AbstractMethodError(self)
-
- def _upsample(self, f, limit=None, fill_value=None):
- raise AbstractMethodError(self)
-
- def _gotitem(self, key, ndim: int, subset=None):
- """
- Sub-classes should define this. Return a sliced object.
-
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- grouper = self.grouper
- if subset is None:
- subset = self.obj
- if key is not None:
- subset = subset[key]
- else:
- # reached via Apply.agg_dict_like with selection=None and ndim=1
- assert subset.ndim == 1
- if ndim == 1:
- assert subset.ndim == 1
-
- grouped = get_groupby(
- subset, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
- )
- return grouped
-
- def _groupby_and_aggregate(self, how, *args, **kwargs):
- """
- Re-evaluate the obj with a groupby aggregation.
- """
- grouper = self.grouper
-
- if self._selected_obj.ndim == 1:
- obj = self._selected_obj
- else:
- # Excludes `on` column when provided
- obj = self._obj_with_exclusions
- grouped = get_groupby(
- obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
- )
-
- try:
- if callable(how):
- # TODO: test_resample_apply_with_additional_args fails if we go
- # through the non-lambda path, not clear that it should.
- func = lambda x: how(x, *args, **kwargs)
- result = grouped.aggregate(func)
- else:
- result = grouped.aggregate(how, *args, **kwargs)
- except (AttributeError, KeyError):
- # we have a non-reducing function; try to evaluate
- # alternatively we want to evaluate only a column of the input
-
- # test_apply_to_one_column_of_df the function being applied references
- # a DataFrame column, but aggregate_item_by_item operates column-wise
- # on Series, raising AttributeError or KeyError
- # (depending on whether the column lookup uses getattr/__getitem__)
- result = grouped.apply(how, *args, **kwargs)
-
- except ValueError as err:
- if "Must produce aggregated value" in str(err):
- # raised in _aggregate_named
- # see test_apply_without_aggregation, test_apply_with_mutated_index
- pass
- else:
- raise
-
- # we have a non-reducing function
- # try to evaluate
- result = grouped.apply(how, *args, **kwargs)
-
- return self._wrap_result(result)
-
- def _get_resampler_for_grouping(self, groupby: GroupBy, key):
- """
- Return the correct class for resampling with groupby.
- """
- return self._resampler_for_grouping(groupby=groupby, key=key, parent=self)
-
- def _wrap_result(self, result):
- """
- Potentially wrap any results.
- """
- # GH 47705
- obj = self.obj
- if (
- isinstance(result, ABCDataFrame)
- and len(result) == 0
- and not isinstance(result.index, PeriodIndex)
- ):
- result = result.set_index(
- _asfreq_compat(obj.index[:0], freq=self.freq), append=True
- )
-
- if isinstance(result, ABCSeries) and self._selection is not None:
- result.name = self._selection
-
- if isinstance(result, ABCSeries) and result.empty:
- # When index is all NaT, result is empty but index is not
- result.index = _asfreq_compat(obj.index[:0], freq=self.freq)
- result.name = getattr(obj, "name", None)
-
- return result
-
- def ffill(self, limit=None):
- """
- Forward fill the values.
-
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
-
- Returns
- -------
- An upsampled Series.
-
- See Also
- --------
- Series.fillna: Fill NA/NaN values using the specified method.
- DataFrame.fillna: Fill NA/NaN values using the specified method.
- """
- return self._upsample("ffill", limit=limit)
-
- def nearest(self, limit=None):
- """
- Resample by using the nearest value.
-
- When resampling data, missing values may appear (e.g., when the
- resampling frequency is higher than the original frequency).
- The `nearest` method will replace ``NaN`` values that appeared in
- the resampled data with the value from the nearest member of the
- sequence, based on the index value.
- Missing values that existed in the original data will not be modified.
- If `limit` is given, fill only this many values in each direction for
- each of the original values.
-
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
-
- Returns
- -------
- Series or DataFrame
- An upsampled Series or DataFrame with ``NaN`` values filled with
- their nearest value.
-
- See Also
- --------
- backfill : Backward fill the new missing values in the resampled data.
- pad : Forward fill ``NaN`` values.
-
- Examples
- --------
- >>> s = pd.Series([1, 2],
- ... index=pd.date_range('20180101',
- ... periods=2,
- ... freq='1h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- Freq: H, dtype: int64
-
- >>> s.resample('15min').nearest()
- 2018-01-01 00:00:00 1
- 2018-01-01 00:15:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 00:45:00 2
- 2018-01-01 01:00:00 2
- Freq: 15T, dtype: int64
-
- Limit the number of upsampled values imputed by the nearest:
-
- >>> s.resample('15min').nearest(limit=1)
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:15:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 00:45:00 2.0
- 2018-01-01 01:00:00 2.0
- Freq: 15T, dtype: float64
- """
- return self._upsample("nearest", limit=limit)
-
- def bfill(self, limit=None):
- """
- Backward fill the new missing values in the resampled data.
-
- In statistics, imputation is the process of replacing missing data with
- substituted values [1]_. When resampling data, missing values may
- appear (e.g., when the resampling frequency is higher than the original
- frequency). The backward fill will replace NaN values that appeared in
- the resampled data with the next value in the original sequence.
- Missing values that existed in the original data will not be modified.
-
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
-
- Returns
- -------
- Series, DataFrame
- An upsampled Series or DataFrame with backward filled NaN values.
-
- See Also
- --------
- bfill : Alias of backfill.
- fillna : Fill NaN values using the specified method, which can be
- 'backfill'.
- nearest : Fill NaN values with nearest neighbor starting from center.
- ffill : Forward fill NaN values.
- Series.fillna : Fill NaN values in the Series using the
- specified method, which can be 'backfill'.
- DataFrame.fillna : Fill NaN values in the DataFrame using the
- specified method, which can be 'backfill'.
-
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
-
- Examples
- --------
- Resampling a Series:
-
- >>> s = pd.Series([1, 2, 3],
- ... index=pd.date_range('20180101', periods=3, freq='h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- 2018-01-01 02:00:00 3
- Freq: H, dtype: int64
-
- >>> s.resample('30min').bfill()
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 3
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
-
- >>> s.resample('15min').bfill(limit=2)
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:15:00 NaN
- 2018-01-01 00:30:00 2.0
- 2018-01-01 00:45:00 2.0
- 2018-01-01 01:00:00 2.0
- 2018-01-01 01:15:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 01:45:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 15T, dtype: float64
-
- Resampling a DataFrame that has missing values:
-
- >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
- ... index=pd.date_range('20180101', periods=3,
- ... freq='h'))
- >>> df
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 02:00:00 6.0 5
-
- >>> df.resample('30min').bfill()
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 00:30:00 NaN 3
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 01:30:00 6.0 5
- 2018-01-01 02:00:00 6.0 5
-
- >>> df.resample('15min').bfill(limit=2)
- a b
- 2018-01-01 00:00:00 2.0 1.0
- 2018-01-01 00:15:00 NaN NaN
- 2018-01-01 00:30:00 NaN 3.0
- 2018-01-01 00:45:00 NaN 3.0
- 2018-01-01 01:00:00 NaN 3.0
- 2018-01-01 01:15:00 NaN NaN
- 2018-01-01 01:30:00 6.0 5.0
- 2018-01-01 01:45:00 6.0 5.0
- 2018-01-01 02:00:00 6.0 5.0
- """
- return self._upsample("bfill", limit=limit)
-
- def fillna(self, method, limit=None):
- """
- Fill missing values introduced by upsampling.
-
- In statistics, imputation is the process of replacing missing data with
- substituted values [1]_. When resampling data, missing values may
- appear (e.g., when the resampling frequency is higher than the original
- frequency).
-
- Missing values that existed in the original data will
- not be modified.
-
- Parameters
- ----------
- method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'}
- Method to use for filling holes in resampled data
-
- * 'pad' or 'ffill': use previous valid observation to fill gap
- (forward fill).
- * 'backfill' or 'bfill': use next valid observation to fill gap.
- * 'nearest': use nearest valid observation to fill gap.
-
- limit : int, optional
- Limit of how many consecutive missing values to fill.
-
- Returns
- -------
- Series or DataFrame
- An upsampled Series or DataFrame with missing values filled.
-
- See Also
- --------
- bfill : Backward fill NaN values in the resampled data.
- ffill : Forward fill NaN values in the resampled data.
- nearest : Fill NaN values in the resampled data
- with nearest neighbor starting from center.
- interpolate : Fill NaN values using interpolation.
- Series.fillna : Fill NaN values in the Series using the
- specified method, which can be 'bfill' and 'ffill'.
- DataFrame.fillna : Fill NaN values in the DataFrame using the
- specified method, which can be 'bfill' and 'ffill'.
-
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
-
- Examples
- --------
- Resampling a Series:
-
- >>> s = pd.Series([1, 2, 3],
- ... index=pd.date_range('20180101', periods=3, freq='h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- 2018-01-01 02:00:00 3
- Freq: H, dtype: int64
-
- Without filling the missing values you get:
-
- >>> s.resample("30min").asfreq()
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 01:00:00 2.0
- 2018-01-01 01:30:00 NaN
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
-
- >>> s.resample('30min').fillna("backfill")
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 3
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
-
- >>> s.resample('15min').fillna("backfill", limit=2)
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:15:00 NaN
- 2018-01-01 00:30:00 2.0
- 2018-01-01 00:45:00 2.0
- 2018-01-01 01:00:00 2.0
- 2018-01-01 01:15:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 01:45:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 15T, dtype: float64
-
- >>> s.resample('30min').fillna("pad")
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 1
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 2
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
-
- >>> s.resample('30min').fillna("nearest")
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 3
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
-
- Missing values present before the upsampling are not affected.
-
- >>> sm = pd.Series([1, None, 3],
- ... index=pd.date_range('20180101', periods=3, freq='h'))
- >>> sm
- 2018-01-01 00:00:00 1.0
- 2018-01-01 01:00:00 NaN
- 2018-01-01 02:00:00 3.0
- Freq: H, dtype: float64
-
- >>> sm.resample('30min').fillna('backfill')
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 01:00:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
-
- >>> sm.resample('30min').fillna('pad')
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 1.0
- 2018-01-01 01:00:00 NaN
- 2018-01-01 01:30:00 NaN
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
-
- >>> sm.resample('30min').fillna('nearest')
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 01:00:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
-
- DataFrame resampling is done column-wise. All the same options are
- available.
-
- >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
- ... index=pd.date_range('20180101', periods=3,
- ... freq='h'))
- >>> df
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 02:00:00 6.0 5
-
- >>> df.resample('30min').fillna("bfill")
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 00:30:00 NaN 3
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 01:30:00 6.0 5
- 2018-01-01 02:00:00 6.0 5
- """
- return self._upsample(method, limit=limit)
-
- @doc(NDFrame.interpolate, **_shared_docs_kwargs)
- def interpolate(
- self,
- method: QuantileInterpolation = "linear",
- *,
- axis: Axis = 0,
- limit=None,
- inplace: bool = False,
- limit_direction: Literal["forward", "backward", "both"] = "forward",
- limit_area=None,
- downcast=None,
- **kwargs,
- ):
- """
- Interpolate values according to different methods.
- """
- result = self._upsample("asfreq")
- return result.interpolate(
- method=method,
- axis=axis,
- limit=limit,
- inplace=inplace,
- limit_direction=limit_direction,
- limit_area=limit_area,
- downcast=downcast,
- **kwargs,
- )
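# A minimal sketch of Resampler.interpolate (assumed data): it upsamples with
# asfreq() and then interpolates across the newly introduced gaps.
import pandas as pd

s = pd.Series([0.0, 4.0], index=pd.date_range("2023-01-01", periods=2, freq="h"))
# The intermediate 15-minute points are filled by linear interpolation.
smoothed = s.resample("15min").interpolate("linear")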
-
- def asfreq(self, fill_value=None):
- """
- Return the values at the new freq, essentially a reindex.
-
- Parameters
- ----------
- fill_value : scalar, optional
- Value to use for missing values, applied during upsampling (note
- this does not fill NaNs that already were present).
-
- Returns
- -------
- DataFrame or Series
- Values at the specified freq.
-
- See Also
- --------
- Series.asfreq: Convert TimeSeries to specified frequency.
- DataFrame.asfreq: Convert TimeSeries to specified frequency.
- """
- return self._upsample("asfreq", fill_value=fill_value)
-
- def sum(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs)
- nv.validate_resampler_func("sum", args, kwargs)
- return self._downsample("sum", numeric_only=numeric_only, min_count=min_count)
-
- @doc(GroupBy.prod)
- def prod(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "prod", args, kwargs)
- nv.validate_resampler_func("prod", args, kwargs)
- return self._downsample("prod", numeric_only=numeric_only, min_count=min_count)
-
- def min(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "min", args, kwargs)
- nv.validate_resampler_func("min", args, kwargs)
- return self._downsample("min", numeric_only=numeric_only, min_count=min_count)
-
- def max(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "max", args, kwargs)
- nv.validate_resampler_func("max", args, kwargs)
- return self._downsample("max", numeric_only=numeric_only, min_count=min_count)
-
- @doc(GroupBy.first)
- def first(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "first", args, kwargs)
- nv.validate_resampler_func("first", args, kwargs)
- return self._downsample("first", numeric_only=numeric_only, min_count=min_count)
-
- @doc(GroupBy.last)
- def last(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "last", args, kwargs)
- nv.validate_resampler_func("last", args, kwargs)
- return self._downsample("last", numeric_only=numeric_only, min_count=min_count)
-
- @doc(GroupBy.median)
- def median(self, numeric_only: bool = False, *args, **kwargs):
- maybe_warn_args_and_kwargs(type(self), "median", args, kwargs)
- nv.validate_resampler_func("median", args, kwargs)
- return self._downsample("median", numeric_only=numeric_only)
-
- def mean(
- self,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- """
- Compute mean of groups, excluding missing values.
-
- Parameters
- ----------
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionchanged:: 2.0.0
-
- numeric_only now defaults to ``False``.
-
- Returns
- -------
- DataFrame or Series
- Mean of values within each group.
- """
- maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs)
- nv.validate_resampler_func("mean", args, kwargs)
- return self._downsample("mean", numeric_only=numeric_only)
-
- def std(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- """
- Compute standard deviation of groups, excluding missing values.
-
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
-
- numeric_only now defaults to ``False``.
-
- Returns
- -------
- DataFrame or Series
- Standard deviation of values within each group.
- """
- maybe_warn_args_and_kwargs(type(self), "std", args, kwargs)
- nv.validate_resampler_func("std", args, kwargs)
- return self._downsample("std", ddof=ddof, numeric_only=numeric_only)
-
- def var(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- """
- Compute variance of groups, excluding missing values.
-
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
-
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- .. versionchanged:: 2.0.0
-
- numeric_only now defaults to ``False``.
-
- Returns
- -------
- DataFrame or Series
- Variance of values within each group.
- """
- maybe_warn_args_and_kwargs(type(self), "var", args, kwargs)
- nv.validate_resampler_func("var", args, kwargs)
- return self._downsample("var", ddof=ddof, numeric_only=numeric_only)
-
- @doc(GroupBy.sem)
- def sem(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "sem", args, kwargs)
- nv.validate_resampler_func("sem", args, kwargs)
- return self._downsample("sem", ddof=ddof, numeric_only=numeric_only)
-
- @doc(GroupBy.ohlc)
- def ohlc(
- self,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "ohlc", args, kwargs)
- nv.validate_resampler_func("ohlc", args, kwargs)
- return self._downsample("ohlc")
-
- @doc(SeriesGroupBy.nunique)
- def nunique(
- self,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "nunique", args, kwargs)
- nv.validate_resampler_func("nunique", args, kwargs)
- return self._downsample("nunique")
-
- @doc(GroupBy.size)
- def size(self):
- result = self._downsample("size")
-
- # If the result is a non-empty DataFrame we stack to get a Series
- # GH 46826
- if isinstance(result, ABCDataFrame) and not result.empty:
- result = result.stack()
-
- if not len(self.ax):
- from pandas import Series
-
- if self._selected_obj.ndim == 1:
- name = self._selected_obj.name
- else:
- name = None
- result = Series([], index=result.index, dtype="int64", name=name)
- return result
-
- @doc(GroupBy.count)
- def count(self):
- result = self._downsample("count")
- if not len(self.ax):
- if self._selected_obj.ndim == 1:
- result = type(self._selected_obj)(
- [], index=result.index, dtype="int64", name=self._selected_obj.name
- )
- else:
- from pandas import DataFrame
-
- result = DataFrame(
- [], index=result.index, columns=result.columns, dtype="int64"
- )
-
- return result
-
- def quantile(self, q: float | AnyArrayLike = 0.5, **kwargs):
- """
- Return value at the given quantile.
-
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
-
- Returns
- -------
- DataFrame or Series
- Quantile of values within each group.
-
- See Also
- --------
- Series.quantile
- Return a series, where the index is q and the values are the quantiles.
- DataFrame.quantile
- Return a DataFrame, where the columns are the columns of self,
- and the values are the quantiles.
- DataFrameGroupBy.quantile
- Return a DataFrame, where the columns are groupby columns,
- and the values are its quantiles.
- """
- return self._downsample("quantile", q=q, **kwargs)
-
-
-class _GroupByMixin(PandasObject):
- """
- Provide the groupby facilities.
- """
-
- _attributes: list[str] # in practice the same as Resampler._attributes
- _selection: IndexLabel | None = None
- _groupby: GroupBy
- _timegrouper: TimeGrouper
-
- def __init__(
- self,
- *,
- parent: Resampler,
- groupby: GroupBy,
- key=None,
- selection: IndexLabel | None = None,
- ) -> None:
- # reached via ._gotitem and _get_resampler_for_grouping
-
- assert isinstance(groupby, GroupBy), type(groupby)
-
- # parent is always a Resampler, sometimes a _GroupByMixin
- assert isinstance(parent, Resampler), type(parent)
-
- # initialize our GroupByMixin object with
- # the resampler attributes
- for attr in self._attributes:
- setattr(self, attr, getattr(parent, attr))
- self._selection = selection
-
- self.binner = parent.binner
- self.key = key
-
- self._groupby = groupby
- self._timegrouper = copy.copy(parent._timegrouper)
-
- self.ax = parent.ax
- self.obj = parent.obj
-
- @no_type_check
- def _apply(self, f, *args, **kwargs):
- """
- Dispatch to _upsample; we are stripping all of the _upsample kwargs and
- performing the original function call on the grouped object.
- """
-
- def func(x):
- x = self._resampler_cls(x, timegrouper=self._timegrouper, gpr_index=self.ax)
-
- if isinstance(f, str):
- return getattr(x, f)(**kwargs)
-
- return x.apply(f, *args, **kwargs)
-
- result = self._groupby.apply(func)
- return self._wrap_result(result)
-
- _upsample = _apply
- _downsample = _apply
- _groupby_and_aggregate = _apply
-
- @final
- def _gotitem(self, key, ndim, subset=None):
- """
- Sub-classes should define this. Return a sliced object.
-
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- # create a new object to prevent aliasing
- if subset is None:
- subset = self.obj
- if key is not None:
- subset = subset[key]
- else:
- # reached via Apply.agg_dict_like with selection=None, ndim=1
- assert subset.ndim == 1
-
- # Try to select from a DataFrame, falling back to a Series
- try:
- if isinstance(key, list) and self.key not in key and self.key is not None:
- key.append(self.key)
- groupby = self._groupby[key]
- except IndexError:
- groupby = self._groupby
-
- selection = None
- if subset.ndim == 2 and (
- (lib.is_scalar(key) and key in subset) or lib.is_list_like(key)
- ):
- selection = key
- elif subset.ndim == 1 and lib.is_scalar(key) and key == subset.name:
- selection = key
-
- new_rs = type(self)(
- groupby=groupby,
- parent=cast(Resampler, self),
- selection=selection,
- )
- return new_rs
-
-
-class DatetimeIndexResampler(Resampler):
- @property
- def _resampler_for_grouping(self):
- return DatetimeIndexResamplerGroupby
-
- def _get_binner_for_time(self):
- # this is how we are actually creating the bins
- if self.kind == "period":
- return self._timegrouper._get_time_period_bins(self.ax)
- return self._timegrouper._get_time_bins(self.ax)
-
- def _downsample(self, how, **kwargs):
- """
- Downsample using the given (possibly cython-mapped) function.
-
- Parameters
- ----------
- how : string / cython mapped function
- **kwargs : kw args passed to how function
- """
- how = com.get_cython_func(how) or how
- ax = self.ax
- if self._selected_obj.ndim == 1:
- obj = self._selected_obj
- else:
- # Excludes `on` column when provided
- obj = self._obj_with_exclusions
-
- if not len(ax):
- # reset to the new freq
- obj = obj.copy()
- obj.index = obj.index._with_freq(self.freq)
- assert obj.index.freq == self.freq, (obj.index.freq, self.freq)
- return obj
-
- # do we have a regular frequency
-
- # error: Item "None" of "Optional[Any]" has no attribute "binlabels"
- if (
- (ax.freq is not None or ax.inferred_freq is not None)
- and len(self.grouper.binlabels) > len(ax)
- and how is None
- ):
- # let's do an asfreq
- return self.asfreq()
-
- # we are downsampling
- # we want to call the actual grouper method here
- result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs)
-
- return self._wrap_result(result)
-
- def _adjust_binner_for_upsample(self, binner):
- """
- Adjust our binner when upsampling.
-
- The range of the new index should not be outside the specified range.
- """
- if self.closed == "right":
- binner = binner[1:]
- else:
- binner = binner[:-1]
- return binner
-
- def _upsample(self, method, limit=None, fill_value=None):
- """
- Parameters
- ----------
- method : {'backfill', 'bfill', 'pad', 'ffill', 'asfreq'}
- Method for upsampling.
- limit : int, default None
- Maximum size gap to fill when reindexing
- fill_value : scalar, default None
- Value to use for missing values
-
- See Also
- --------
- .fillna: Fill NA/NaN values using the specified method.
-
- """
- if self.axis:
- raise AssertionError("axis must be 0")
- if self._from_selection:
- raise ValueError(
- "Upsampling from level= or on= selection "
- "is not supported, use .set_index(...) "
- "to explicitly set index to datetime-like"
- )
-
- ax = self.ax
- obj = self._selected_obj
- binner = self.binner
- res_index = self._adjust_binner_for_upsample(binner)
-
- # if we have the same frequency as our axis, then we are equal sampling
- if (
- limit is None
- and to_offset(ax.inferred_freq) == self.freq
- and len(obj) == len(res_index)
- ):
- result = obj.copy()
- result.index = res_index
- else:
- result = obj.reindex(
- res_index, method=method, limit=limit, fill_value=fill_value
- )
-
- return self._wrap_result(result)
-
- def _wrap_result(self, result):
- result = super()._wrap_result(result)
-
- # we may have a different kind than we were asked for originally;
- # convert if needed
- if self.kind == "period" and not isinstance(result.index, PeriodIndex):
- result.index = result.index.to_period(self.freq)
- return result
-
-
-class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler):
- """
- Provides a resample of a groupby implementation.
- """
-
- @property
- def _resampler_cls(self):
- return DatetimeIndexResampler
-
-
-class PeriodIndexResampler(DatetimeIndexResampler):
- @property
- def _resampler_for_grouping(self):
- return PeriodIndexResamplerGroupby
-
- def _get_binner_for_time(self):
- if self.kind == "timestamp":
- return super()._get_binner_for_time()
- return self._timegrouper._get_period_bins(self.ax)
-
- def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
- obj = super()._convert_obj(obj)
-
- if self._from_selection:
- # see GH 14008, GH 12871
- msg = (
- "Resampling from level= or on= selection "
- "with a PeriodIndex is not currently supported, "
- "use .set_index(...) to explicitly set index"
- )
- raise NotImplementedError(msg)
-
- # convert to timestamp
- if self.kind == "timestamp":
- obj = obj.to_timestamp(how=self.convention)
-
- return obj
-
- def _downsample(self, how, **kwargs):
- """
- Downsample using the given (possibly cython-mapped) function.
-
- Parameters
- ----------
- how : string / cython mapped function
- **kwargs : kw args passed to how function
- """
- # we may need to actually resample as if we are timestamps
- if self.kind == "timestamp":
- return super()._downsample(how, **kwargs)
-
- how = com.get_cython_func(how) or how
- ax = self.ax
-
- if is_subperiod(ax.freq, self.freq):
- # Downsampling
- return self._groupby_and_aggregate(how, **kwargs)
- elif is_superperiod(ax.freq, self.freq):
- if how == "ohlc":
- # GH #13083
- # upsampling to subperiods is handled as an asfreq, which works
- # for pure aggregating/reducing methods
- # OHLC reduces along the time dimension, but creates multiple
- # values for each period -> handle by _groupby_and_aggregate()
- return self._groupby_and_aggregate(how)
- return self.asfreq()
- elif ax.freq == self.freq:
- return self.asfreq()
-
- raise IncompatibleFrequency(
- f"Frequency {ax.freq} cannot be resampled to {self.freq}, "
- "as they are not sub or super periods"
- )
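# A public-API sketch of the sub-/super-period dispatch above (assumed data):
# monthly -> quarterly is a subperiod relation (groupby aggregation), while
# monthly -> daily is a superperiod relation (filled via reindex-style methods
# such as ffill/asfreq rather than aggregation).
import pandas as pd

s = pd.Series([1, 2, 3, 4, 5, 6],
              index=pd.period_range("2020-01", periods=6, freq="M"))
quarterly = s.resample("Q").sum()  # downsampling: sums the months in each quarter
daily = s.resample("D").ffill()    # upsampling: repeats each monthly value per day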
-
- def _upsample(self, method, limit=None, fill_value=None):
- """
- Parameters
- ----------
- method : {'backfill', 'bfill', 'pad', 'ffill'}
- Method for upsampling.
- limit : int, default None
- Maximum size gap to fill when reindexing.
- fill_value : scalar, default None
- Value to use for missing values.
-
- See Also
- --------
- .fillna: Fill NA/NaN values using the specified method.
-
- """
- # we may need to actually resample as if we are timestamps
- if self.kind == "timestamp":
- return super()._upsample(method, limit=limit, fill_value=fill_value)
-
- ax = self.ax
- obj = self.obj
- new_index = self.binner
-
- # Start vs. end of period
- memb = ax.asfreq(self.freq, how=self.convention)
-
- # Get the fill indexer
- indexer = memb.get_indexer(new_index, method=method, limit=limit)
- new_obj = _take_new_index(
- obj,
- indexer,
- new_index,
- axis=self.axis,
- )
- return self._wrap_result(new_obj)
-
-
-class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
- """
- Provides a resample of a groupby implementation.
- """
-
- @property
- def _resampler_cls(self):
- return PeriodIndexResampler
-
-
-class TimedeltaIndexResampler(DatetimeIndexResampler):
- @property
- def _resampler_for_grouping(self):
- return TimedeltaIndexResamplerGroupby
-
- def _get_binner_for_time(self):
- return self._timegrouper._get_time_delta_bins(self.ax)
-
- def _adjust_binner_for_upsample(self, binner):
- """
- Adjust our binner when upsampling.
-
- The range of the new index is allowed to be greater than the original range,
- so we don't need to change the length of the binner. See GH 13022.
- """
- return binner
-
-
-class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler):
- """
- Provides a resample of a groupby implementation.
- """
-
- @property
- def _resampler_cls(self):
- return TimedeltaIndexResampler
-
-
-def get_resampler(obj: Series | DataFrame, kind=None, **kwds) -> Resampler:
- """
- Create a TimeGrouper and return our resampler.
- """
- tg = TimeGrouper(**kwds)
- return tg._get_resampler(obj, kind=kind)
-
-
-get_resampler.__doc__ = Resampler.__doc__
-
-
-def get_resampler_for_grouping(
- groupby: GroupBy,
- rule,
- how=None,
- fill_method=None,
- limit=None,
- kind=None,
- on=None,
- **kwargs,
-) -> Resampler:
- """
- Return our appropriate resampler when grouping as well.
- """
- # .resample uses 'on' similar to how .groupby uses 'key'
- tg = TimeGrouper(freq=rule, key=on, **kwargs)
- resampler = tg._get_resampler(groupby.obj, kind=kind)
- return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key)
-
-
-class TimeGrouper(Grouper):
- """
- Custom groupby class for time-interval grouping.
-
- Parameters
- ----------
- freq : pandas date offset or offset alias for identifying bin edges
- closed : closed end of interval; 'left' or 'right'
- label : interval boundary to use for labeling; 'left' or 'right'
- convention : {'start', 'end', 'e', 's'}
- Whether to use the start or the end of each period; only used when the axis is a PeriodIndex.
- """
-
- _attributes = Grouper._attributes + (
- "closed",
- "label",
- "how",
- "kind",
- "convention",
- "origin",
- "offset",
- )
-
- origin: TimeGrouperOrigin
-
- def __init__(
- self,
- freq: Frequency = "Min",
- closed: Literal["left", "right"] | None = None,
- label: Literal["left", "right"] | None = None,
- how: str = "mean",
- axis: Axis = 0,
- fill_method=None,
- limit=None,
- kind: str | None = None,
- convention: Literal["start", "end", "e", "s"] | None = None,
- origin: Literal["epoch", "start", "start_day", "end", "end_day"]
- | TimestampConvertibleTypes = "start_day",
- offset: TimedeltaConvertibleTypes | None = None,
- group_keys: bool = False,
- **kwargs,
- ) -> None:
- # Check for correctness of the keyword arguments which would
- # otherwise silently use the default if misspelled
- if label not in {None, "left", "right"}:
- raise ValueError(f"Unsupported value {label} for `label`")
- if closed not in {None, "left", "right"}:
- raise ValueError(f"Unsupported value {closed} for `closed`")
- if convention not in {None, "start", "end", "e", "s"}:
- raise ValueError(f"Unsupported value {convention} for `convention`")
-
- freq = to_offset(freq)
-
- end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"}
- rule = freq.rule_code
- if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types):
- if closed is None:
- closed = "right"
- if label is None:
- label = "right"
- else:
- # The backward resample sets ``closed`` to ``'right'`` by default
- # since the last value should be considered as the edge point for
- # the last bin. When origin is "end" or "end_day", the value for a
- # specific ``Timestamp`` index stands for the resample result from
- # the current ``Timestamp`` minus ``freq`` to the current
- # ``Timestamp`` with a right close.
- if origin in ["end", "end_day"]:
- if closed is None:
- closed = "right"
- if label is None:
- label = "right"
- else:
- if closed is None:
- closed = "left"
- if label is None:
- label = "left"
-
- self.closed = closed
- self.label = label
- self.kind = kind
- self.convention = convention if convention is not None else "e"
- self.how = how
- self.fill_method = fill_method
- self.limit = limit
- self.group_keys = group_keys
-
- if origin in ("epoch", "start", "start_day", "end", "end_day"):
- # error: Incompatible types in assignment (expression has type "Union[Union[
- # Timestamp, datetime, datetime64, signedinteger[_64Bit], float, str],
- # Literal['epoch', 'start', 'start_day', 'end', 'end_day']]", variable has
- # type "Union[Timestamp, Literal['epoch', 'start', 'start_day', 'end',
- # 'end_day']]")
- self.origin = origin # type: ignore[assignment]
- else:
- try:
- self.origin = Timestamp(origin)
- except (ValueError, TypeError) as err:
- raise ValueError(
- "'origin' should be equal to 'epoch', 'start', 'start_day', "
- "'end', 'end_day' or "
- f"should be a Timestamp convertible type. Got '{origin}' instead."
- ) from err
-
- try:
- self.offset = Timedelta(offset) if offset is not None else None
- except (ValueError, TypeError) as err:
- raise ValueError(
- "'offset' should be a Timedelta convertible type. "
- f"Got '{offset}' instead."
- ) from err
-
- # always sort time groupers
- kwargs["sort"] = True
-
- super().__init__(freq=freq, axis=axis, **kwargs)
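# A minimal sketch of the closed/label defaulting above (assumed data):
# end-anchored rules such as 'M' (month end) close and label bins on the right,
# while most other rules such as 'D' use the left edge.
import pandas as pd

s = pd.Series(1, index=pd.date_range("2023-01-01", periods=60, freq="D"))
monthly = s.resample("M").sum()   # labels fall on month ends (right-closed bins)
two_day = s.resample("2D").sum()  # labels fall on the left edge of each 2-day bin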
-
- def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler:
- """
- Return my resampler or raise if we have an invalid axis.
-
- Parameters
- ----------
- obj : Series or DataFrame
- kind : string, optional
- 'period','timestamp','timedelta' are valid
-
- Returns
- -------
- Resampler
-
- Raises
- ------
- TypeError if incompatible axis
-
- """
- _, ax, indexer = self._set_grouper(obj, gpr_index=None)
-
- if isinstance(ax, DatetimeIndex):
- return DatetimeIndexResampler(
- obj,
- timegrouper=self,
- kind=kind,
- axis=self.axis,
- group_keys=self.group_keys,
- gpr_index=ax,
- )
- elif isinstance(ax, PeriodIndex) or kind == "period":
- return PeriodIndexResampler(
- obj,
- timegrouper=self,
- kind=kind,
- axis=self.axis,
- group_keys=self.group_keys,
- gpr_index=ax,
- )
- elif isinstance(ax, TimedeltaIndex):
- return TimedeltaIndexResampler(
- obj,
- timegrouper=self,
- axis=self.axis,
- group_keys=self.group_keys,
- gpr_index=ax,
- )
-
- raise TypeError(
- "Only valid with DatetimeIndex, "
- "TimedeltaIndex or PeriodIndex, "
- f"but got an instance of '{type(ax).__name__}'"
- )
-
- def _get_grouper(
- self, obj: NDFrameT, validate: bool = True
- ) -> tuple[BinGrouper, NDFrameT]:
- # create the resampler and return our binner
- r = self._get_resampler(obj)
- return r.grouper, cast(NDFrameT, r.obj)
-
- def _get_time_bins(self, ax: DatetimeIndex):
- if not isinstance(ax, DatetimeIndex):
- raise TypeError(
- "axis must be a DatetimeIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
-
- if len(ax) == 0:
- binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
- return binner, [], labels
-
- first, last = _get_timestamp_range_edges(
- ax.min(),
- ax.max(),
- self.freq,
- unit=ax.unit,
- closed=self.closed,
- origin=self.origin,
- offset=self.offset,
- )
- # GH #12037
- # use first/last directly instead of calling replace() on them,
- # because replace() will swallow the nanosecond part,
- # so the last bin may end slightly before the end if the end contains a
- # nanosecond part, leading to a `Values falls after last bin` error
- # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
- # has noted that ambiguous=True provides the most sensible result
- binner = labels = date_range(
- freq=self.freq,
- start=first,
- end=last,
- tz=ax.tz,
- name=ax.name,
- ambiguous=True,
- nonexistent="shift_forward",
- unit=ax.unit,
- )
-
- ax_values = ax.asi8
- binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
-
- # general version, knowing nothing about relative frequencies
- bins = lib.generate_bins_dt64(
- ax_values, bin_edges, self.closed, hasnans=ax.hasnans
- )
-
- if self.closed == "right":
- labels = binner
- if self.label == "right":
- labels = labels[1:]
- elif self.label == "right":
- labels = labels[1:]
-
- if ax.hasnans:
- binner = binner.insert(0, NaT)
- labels = labels.insert(0, NaT)
-
- # if we end up with more labels than bins
- # adjust the labels
- # GH4076
- if len(bins) < len(labels):
- labels = labels[: len(bins)]
-
- return binner, bins, labels
-
- def _adjust_bin_edges(
- self, binner: DatetimeIndex, ax_values: npt.NDArray[np.int64]
- ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]:
- # Some hacks for > daily data, see #1471, #1458, #1483
-
- if self.freq != "D" and is_superperiod(self.freq, "D"):
- if self.closed == "right":
- # GH 21459, GH 9119: Adjust the bins relative to the wall time
- edges_dti = binner.tz_localize(None)
- edges_dti = (
- edges_dti
- + Timedelta(days=1, unit=edges_dti.unit).as_unit(edges_dti.unit)
- - Timedelta(1, unit=edges_dti.unit).as_unit(edges_dti.unit)
- )
- bin_edges = edges_dti.tz_localize(binner.tz).asi8
- else:
- bin_edges = binner.asi8
-
- # intraday values on last day
- if bin_edges[-2] > ax_values.max():
- bin_edges = bin_edges[:-1]
- binner = binner[:-1]
- else:
- bin_edges = binner.asi8
- return binner, bin_edges
-
- def _get_time_delta_bins(self, ax: TimedeltaIndex):
- if not isinstance(ax, TimedeltaIndex):
- raise TypeError(
- "axis must be a TimedeltaIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
-
- if not len(ax):
- binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name)
- return binner, [], labels
-
- start, end = ax.min(), ax.max()
-
- if self.closed == "right":
- end += self.freq
-
- labels = binner = timedelta_range(
- start=start, end=end, freq=self.freq, name=ax.name
- )
-
- end_stamps = labels
- if self.closed == "left":
- end_stamps += self.freq
-
- bins = ax.searchsorted(end_stamps, side=self.closed)
-
- if self.offset:
- # GH 10530 & 31809
- labels += self.offset
-
- return binner, bins, labels
-
- def _get_time_period_bins(self, ax: DatetimeIndex):
- if not isinstance(ax, DatetimeIndex):
- raise TypeError(
- "axis must be a DatetimeIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
-
- freq = self.freq
-
- if not len(ax):
- binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name)
- return binner, [], labels
-
- labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name)
-
- end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp()
- if ax.tz:
- end_stamps = end_stamps.tz_localize(ax.tz)
- bins = ax.searchsorted(end_stamps, side="left")
-
- return binner, bins, labels
-
- def _get_period_bins(self, ax: PeriodIndex):
- if not isinstance(ax, PeriodIndex):
- raise TypeError(
- "axis must be a PeriodIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
-
- memb = ax.asfreq(self.freq, how=self.convention)
-
- # NaT handling as in pandas._libs.lib.generate_bins_dt64()
- nat_count = 0
- if memb.hasnans:
- # error: Incompatible types in assignment (expression has type
- # "bool_", variable has type "int") [assignment]
- nat_count = np.sum(memb._isnan) # type: ignore[assignment]
- memb = memb[~memb._isnan]
-
- if not len(memb):
- # index contains no valid (non-NaT) values
- bins = np.array([], dtype=np.int64)
- binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
- if len(ax) > 0:
- # index is all NaT
- binner, bins, labels = _insert_nat_bin(binner, bins, labels, len(ax))
- return binner, bins, labels
-
- freq_mult = self.freq.n
-
- start = ax.min().asfreq(self.freq, how=self.convention)
- end = ax.max().asfreq(self.freq, how="end")
- bin_shift = 0
-
- if isinstance(self.freq, Tick):
- # GH 23882 & 31809: get adjusted bin edge labels with 'origin'
- # and 'origin' support. This call only makes sense if the freq is a
- # Tick since offset and origin are only used in those cases.
- # Not doing this check could create an extra empty bin.
- p_start, end = _get_period_range_edges(
- start,
- end,
- self.freq,
- closed=self.closed,
- origin=self.origin,
- offset=self.offset,
- )
-
- # Get offset for bin edge (not label edge) adjustment
- start_offset = Period(start, self.freq) - Period(p_start, self.freq)
- # error: Item "Period" of "Union[Period, Any]" has no attribute "n"
- bin_shift = start_offset.n % freq_mult # type: ignore[union-attr]
- start = p_start
-
- labels = binner = period_range(
- start=start, end=end, freq=self.freq, name=ax.name
- )
-
- i8 = memb.asi8
-
- # when upsampling to subperiods, we need to generate enough bins
- expected_bins_count = len(binner) * freq_mult
- i8_extend = expected_bins_count - (i8[-1] - i8[0])
- rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
- rng += freq_mult
- # adjust bin edge indexes to account for base
- rng -= bin_shift
-
- # Wrap in PeriodArray for PeriodArray.searchsorted
- prng = type(memb._data)(rng, dtype=memb.dtype)
- bins = memb.searchsorted(prng, side="left")
-
- if nat_count > 0:
- binner, bins, labels = _insert_nat_bin(binner, bins, labels, nat_count)
-
- return binner, bins, labels
-
-
-def _take_new_index(
- obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0
-) -> NDFrameT:
- if isinstance(obj, ABCSeries):
- new_values = algos.take_nd(obj._values, indexer)
- # error: Incompatible return value type (got "Series", expected "NDFrameT")
- return obj._constructor( # type: ignore[return-value]
- new_values, index=new_index, name=obj.name
- )
- elif isinstance(obj, ABCDataFrame):
- if axis == 1:
- raise NotImplementedError("axis 1 is not supported")
- new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
- # error: Incompatible return value type
- # (got "DataFrame", expected "NDFrameT")
- return obj._constructor(new_mgr) # type: ignore[return-value]
- else:
- raise ValueError("'obj' should be either a Series or a DataFrame")
-
-
-def _get_timestamp_range_edges(
- first: Timestamp,
- last: Timestamp,
- freq: BaseOffset,
- unit: str,
- closed: Literal["right", "left"] = "left",
- origin: TimeGrouperOrigin = "start_day",
- offset: Timedelta | None = None,
-) -> tuple[Timestamp, Timestamp]:
- """
- Adjust the `first` Timestamp to the preceding Timestamp that resides on
- the provided offset. Adjust the `last` Timestamp to the following
- Timestamp that resides on the provided offset. Input Timestamps that
- already reside on the offset will be adjusted depending on the type of
- offset and the `closed` parameter.
-
- Parameters
- ----------
- first : pd.Timestamp
- The beginning Timestamp of the range to be adjusted.
- last : pd.Timestamp
- The ending Timestamp of the range to be adjusted.
- freq : pd.DateOffset
- The dateoffset to which the Timestamps will be adjusted.
- closed : {'right', 'left'}, default "left"
- Which side of bin interval is closed.
- origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day'
- The timestamp on which to adjust the grouping. The timezone of origin must
- match the timezone of the index.
- If a timestamp is not used, these values are also supported:
-
- - 'epoch': `origin` is 1970-01-01
- - 'start': `origin` is the first value of the timeseries
- - 'start_day': `origin` is the first day at midnight of the timeseries
- offset : pd.Timedelta, default is None
- An offset timedelta added to the origin.
-
- Returns
- -------
- A tuple of length 2, containing the adjusted pd.Timestamp objects.
- """
- if isinstance(freq, Tick):
- index_tz = first.tz
- if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
- raise ValueError("The origin must have the same timezone as the index.")
- if origin == "epoch":
- # set the epoch based on the timezone to get similar bin results when
- # resampling the same kind of indexes on different timezones
- origin = Timestamp("1970-01-01", tz=index_tz)
-
- if isinstance(freq, Day):
- # _adjust_dates_anchored assumes 'D' means 24H, but first/last
- # might contain a DST transition (23H, 24H, or 25H).
- # So "pretend" the dates are naive when adjusting the endpoints
- first = first.tz_localize(None)
- last = last.tz_localize(None)
- if isinstance(origin, Timestamp):
- origin = origin.tz_localize(None)
-
- first, last = _adjust_dates_anchored(
- first, last, freq, closed=closed, origin=origin, offset=offset, unit=unit
- )
- if isinstance(freq, Day):
- first = first.tz_localize(index_tz)
- last = last.tz_localize(index_tz)
- else:
- first = first.normalize()
- last = last.normalize()
-
- if closed == "left":
- first = Timestamp(freq.rollback(first))
- else:
- first = Timestamp(first - freq)
-
- last = Timestamp(last + freq)
-
- return first, last
-
-
-def _get_period_range_edges(
- first: Period,
- last: Period,
- freq: BaseOffset,
- closed: Literal["right", "left"] = "left",
- origin: TimeGrouperOrigin = "start_day",
- offset: Timedelta | None = None,
-) -> tuple[Period, Period]:
- """
- Adjust the provided `first` and `last` Periods to the respective Period of
- the given offset that encompasses them.
-
- Parameters
- ----------
- first : pd.Period
- The beginning Period of the range to be adjusted.
- last : pd.Period
- The ending Period of the range to be adjusted.
- freq : pd.DateOffset
- The freq to which the Periods will be adjusted.
- closed : {'right', 'left'}, default "left"
- Which side of bin interval is closed.
- origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day'
- The timestamp on which to adjust the grouping. The timezone of origin must
- match the timezone of the index.
-
- If a timestamp is not used, these values are also supported:
-
- - 'epoch': `origin` is 1970-01-01
- - 'start': `origin` is the first value of the timeseries
- - 'start_day': `origin` is the first day at midnight of the timeseries
- offset : pd.Timedelta, default is None
- An offset timedelta added to the origin.
-
- Returns
- -------
- A tuple of length 2, containing the adjusted pd.Period objects.
- """
- if not all(isinstance(obj, Period) for obj in [first, last]):
- raise TypeError("'first' and 'last' must be instances of type Period")
-
- # GH 23882
- first_ts = first.to_timestamp()
- last_ts = last.to_timestamp()
- adjust_first = not freq.is_on_offset(first_ts)
- adjust_last = freq.is_on_offset(last_ts)
-
- first_ts, last_ts = _get_timestamp_range_edges(
- first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset
- )
-
- first = (first_ts + int(adjust_first) * freq).to_period(freq)
- last = (last_ts - int(adjust_last) * freq).to_period(freq)
- return first, last
-
-
-def _insert_nat_bin(
- binner: PeriodIndex, bins: np.ndarray, labels: PeriodIndex, nat_count: int
-) -> tuple[PeriodIndex, np.ndarray, PeriodIndex]:
- # NaT handling as in pandas._libs.lib.generate_bins_dt64()
- # shift bins by the number of NaT
- assert nat_count > 0
- bins += nat_count
- bins = np.insert(bins, 0, nat_count)
-
- # Incompatible types in assignment (expression has type "Index", variable
- # has type "PeriodIndex")
- binner = binner.insert(0, NaT) # type: ignore[assignment]
- # Incompatible types in assignment (expression has type "Index", variable
- # has type "PeriodIndex")
- labels = labels.insert(0, NaT) # type: ignore[assignment]
- return binner, bins, labels
-
-
-def _adjust_dates_anchored(
- first: Timestamp,
- last: Timestamp,
- freq: Tick,
- closed: Literal["right", "left"] = "right",
- origin: TimeGrouperOrigin = "start_day",
- offset: Timedelta | None = None,
- unit: str = "ns",
-) -> tuple[Timestamp, Timestamp]:
- # First and last offsets should be calculated from the start day to fix an
- # error caused by resampling across multiple days when a one-day period is
- # not a multiple of the frequency. See GH 8683
- # To handle frequencies that are not multiples of, or divisible by, a day,
- # we allow a fixed origin timestamp to be defined. See GH 31809
- first = first.as_unit(unit)
- last = last.as_unit(unit)
- if offset is not None:
- offset = offset.as_unit(unit)
-
- freq_value = Timedelta(freq).as_unit(unit)._value
-
- origin_timestamp = 0 # origin == "epoch"
- if origin == "start_day":
- origin_timestamp = first.normalize()._value
- elif origin == "start":
- origin_timestamp = first._value
- elif isinstance(origin, Timestamp):
- origin_timestamp = origin.as_unit(unit)._value
- elif origin in ["end", "end_day"]:
- origin_last = last if origin == "end" else last.ceil("D")
- sub_freq_times = (origin_last._value - first._value) // freq_value
- if closed == "left":
- sub_freq_times += 1
- first = origin_last - sub_freq_times * freq
- origin_timestamp = first._value
- origin_timestamp += offset._value if offset else 0
-
- # GH 10117 & GH 19375. If first and last contain timezone information,
- # Perform the calculation in UTC in order to avoid localizing on an
- # Ambiguous or Nonexistent time.
- first_tzinfo = first.tzinfo
- last_tzinfo = last.tzinfo
- if first_tzinfo is not None:
- first = first.tz_convert("UTC")
- if last_tzinfo is not None:
- last = last.tz_convert("UTC")
-
- foffset = (first._value - origin_timestamp) % freq_value
- loffset = (last._value - origin_timestamp) % freq_value
-
- if closed == "right":
- if foffset > 0:
- # roll back
- fresult_int = first._value - foffset
- else:
- fresult_int = first._value - freq_value
-
- if loffset > 0:
- # roll forward
- lresult_int = last._value + (freq_value - loffset)
- else:
- # already the end of the road
- lresult_int = last._value
- else: # closed == 'left'
- if foffset > 0:
- fresult_int = first._value - foffset
- else:
- # start of the road
- fresult_int = first._value
-
- if loffset > 0:
- # roll forward
- lresult_int = last._value + (freq_value - loffset)
- else:
- lresult_int = last._value + freq_value
- fresult = Timestamp(fresult_int, unit=unit)
- lresult = Timestamp(lresult_int, unit=unit)
- if first_tzinfo is not None:
- fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
- if last_tzinfo is not None:
- lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo)
- return fresult, lresult
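# A public-API sketch of the anchoring above (assumed data): when the frequency
# does not evenly divide a day, the bin edges depend on where 'origin' is anchored.
import pandas as pd

idx = pd.date_range("2000-10-01 23:30:00", periods=10, freq="7min")
ts = pd.Series(range(10), index=idx)
by_day = ts.resample("17min").sum()                    # origin='start_day' (default)
by_start = ts.resample("17min", origin="start").sum()  # anchored at the first value
by_epoch = ts.resample("17min", origin="epoch").sum()  # anchored at 1970-01-01
# All three cover the same data, but their bin edges (and labels) differ.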
-
-
-def asfreq(
- obj: NDFrameT,
- freq,
- method=None,
- how=None,
- normalize: bool = False,
- fill_value=None,
-) -> NDFrameT:
- """
- Utility frequency conversion method for Series/DataFrame.
-
- See :meth:`pandas.NDFrame.asfreq` for full documentation.
- """
- if isinstance(obj.index, PeriodIndex):
- if method is not None:
- raise NotImplementedError("'method' argument is not supported")
-
- if how is None:
- how = "E"
-
- new_obj = obj.copy()
- new_obj.index = obj.index.asfreq(freq, how=how)
-
- elif len(obj.index) == 0:
- new_obj = obj.copy()
-
- new_obj.index = _asfreq_compat(obj.index, freq)
- else:
- dti = date_range(obj.index.min(), obj.index.max(), freq=freq)
- dti.name = obj.index.name
- new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
- if normalize:
- new_obj.index = new_obj.index.normalize()
-
- return new_obj
-
-
-def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq):
- """
- Helper to mimic asfreq on an (empty) DatetimeIndex, TimedeltaIndex, or PeriodIndex.
-
- Parameters
- ----------
- index : PeriodIndex, DatetimeIndex, or TimedeltaIndex
- freq : DateOffset
-
- Returns
- -------
- same type as index
- """
- if len(index) != 0:
- # This should never be reached, always checked by the caller
- raise ValueError(
- "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex"
- )
- new_index: Index
- if isinstance(index, PeriodIndex):
- new_index = index.asfreq(freq=freq)
- elif isinstance(index, DatetimeIndex):
- new_index = DatetimeIndex([], dtype=index.dtype, freq=freq, name=index.name)
- elif isinstance(index, TimedeltaIndex):
- new_index = TimedeltaIndex([], dtype=index.dtype, freq=freq, name=index.name)
- else: # pragma: no cover
- raise TypeError(type(index))
- return new_index
-
-
-def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None:
- """
- Warn for deprecation of args and kwargs in resample functions.
-
- Parameters
- ----------
- cls : type
- Class to warn about.
- kernel : str
- Operation name.
- args : tuple or None
- args passed by user. Will be None if and only if kernel does not have args.
- kwargs : dict or None
- kwargs passed by user. Will be None if and only if kernel does not have kwargs.
- """
- warn_args = args is not None and len(args) > 0
- warn_kwargs = kwargs is not None and len(kwargs) > 0
- if warn_args and warn_kwargs:
- msg = "args and kwargs"
- elif warn_args:
- msg = "args"
- elif warn_kwargs:
- msg = "kwargs"
- else:
- return
- warnings.warn(
- f"Passing additional {msg} to {cls.__name__}.{kernel} has "
- "no impact on the result and is deprecated. This will "
- "raise a TypeError in a future version of pandas.",
- category=FutureWarning,
- stacklevel=find_stack_level(),
- )
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/__init__.py b/contrib/python/pandas/py3/pandas/core/reshape/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/api.py b/contrib/python/pandas/py3/pandas/core/reshape/api.py
deleted file mode 100644
index b1884c497f0..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/api.py
+++ /dev/null
@@ -1,41 +0,0 @@
-from pandas.core.reshape.concat import concat
-from pandas.core.reshape.encoding import (
- from_dummies,
- get_dummies,
-)
-from pandas.core.reshape.melt import (
- lreshape,
- melt,
- wide_to_long,
-)
-from pandas.core.reshape.merge import (
- merge,
- merge_asof,
- merge_ordered,
-)
-from pandas.core.reshape.pivot import (
- crosstab,
- pivot,
- pivot_table,
-)
-from pandas.core.reshape.tile import (
- cut,
- qcut,
-)
-
-__all__ = [
- "concat",
- "crosstab",
- "cut",
- "from_dummies",
- "get_dummies",
- "lreshape",
- "melt",
- "merge",
- "merge_asof",
- "merge_ordered",
- "pivot",
- "pivot_table",
- "qcut",
- "wide_to_long",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/concat.py b/contrib/python/pandas/py3/pandas/core/reshape/concat.py
deleted file mode 100644
index 79f130451a9..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/concat.py
+++ /dev/null
@@ -1,823 +0,0 @@
-"""
-Concat routines.
-"""
-from __future__ import annotations
-
-from collections import abc
-from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- Iterable,
- Literal,
- Mapping,
- cast,
- overload,
-)
-
-import numpy as np
-
-from pandas._config import using_copy_on_write
-
-from pandas._typing import (
- Axis,
- AxisInt,
- HashableT,
-)
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.inference import is_bool
-from pandas.core.dtypes.missing import isna
-
-from pandas.core.arrays.categorical import (
- factorize_from_iterable,
- factorize_from_iterables,
-)
-import pandas.core.common as com
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
- all_indexes_same,
- default_index,
- ensure_index,
- get_objs_combined_axis,
- get_unanimous_names,
-)
-from pandas.core.internals import concatenate_managers
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
- from pandas.core.generic import NDFrame
-
-# ---------------------------------------------------------------------
-# Concatenate DataFrame objects
-
-
-@overload
-def concat(
- objs: Iterable[DataFrame] | Mapping[HashableT, DataFrame],
- *,
- axis: Literal[0, "index"] = ...,
- join: str = ...,
- ignore_index: bool = ...,
- keys=...,
- levels=...,
- names=...,
- verify_integrity: bool = ...,
- sort: bool = ...,
- copy: bool | None = ...,
-) -> DataFrame:
- ...
-
-
-@overload
-def concat(
- objs: Iterable[Series] | Mapping[HashableT, Series],
- *,
- axis: Literal[0, "index"] = ...,
- join: str = ...,
- ignore_index: bool = ...,
- keys=...,
- levels=...,
- names=...,
- verify_integrity: bool = ...,
- sort: bool = ...,
- copy: bool | None = ...,
-) -> Series:
- ...
-
-
-@overload
-def concat(
- objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
- *,
- axis: Literal[0, "index"] = ...,
- join: str = ...,
- ignore_index: bool = ...,
- keys=...,
- levels=...,
- names=...,
- verify_integrity: bool = ...,
- sort: bool = ...,
- copy: bool | None = ...,
-) -> DataFrame | Series:
- ...
-
-
-@overload
-def concat(
- objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
- *,
- axis: Literal[1, "columns"],
- join: str = ...,
- ignore_index: bool = ...,
- keys=...,
- levels=...,
- names=...,
- verify_integrity: bool = ...,
- sort: bool = ...,
- copy: bool | None = ...,
-) -> DataFrame:
- ...
-
-
-@overload
-def concat(
- objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
- *,
- axis: Axis = ...,
- join: str = ...,
- ignore_index: bool = ...,
- keys=...,
- levels=...,
- names=...,
- verify_integrity: bool = ...,
- sort: bool = ...,
- copy: bool | None = ...,
-) -> DataFrame | Series:
- ...
-
-
-def concat(
- objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
- *,
- axis: Axis = 0,
- join: str = "outer",
- ignore_index: bool = False,
- keys=None,
- levels=None,
- names=None,
- verify_integrity: bool = False,
- sort: bool = False,
- copy: bool | None = None,
-) -> DataFrame | Series:
- """
- Concatenate pandas objects along a particular axis.
-
- Allows optional set logic along the other axes.
-
- Can also add a layer of hierarchical indexing on the concatenation axis,
- which may be useful if the labels are the same (or overlapping) on
- the passed axis number.
-
- Parameters
- ----------
- objs : a sequence or mapping of Series or DataFrame objects
- If a mapping is passed, the sorted keys will be used as the `keys`
- argument, unless it is passed, in which case the values will be
- selected (see below). Any None objects will be dropped silently unless
- they are all None in which case a ValueError will be raised.
- axis : {0/'index', 1/'columns'}, default 0
- The axis to concatenate along.
- join : {'inner', 'outer'}, default 'outer'
- How to handle indexes on other axis (or axes).
- ignore_index : bool, default False
- If True, do not use the index values along the concatenation axis. The
- resulting axis will be labeled 0, ..., n - 1. This is useful if you are
- concatenating objects where the concatenation axis does not have
- meaningful indexing information. Note the index values on the other
- axes are still respected in the join.
- keys : sequence, default None
- If multiple levels passed, should contain tuples. Construct
- hierarchical index using the passed keys as the outermost level.
- levels : list of sequences, default None
- Specific levels (unique values) to use for constructing a
- MultiIndex. Otherwise they will be inferred from the keys.
- names : list, default None
- Names for the levels in the resulting hierarchical index.
- verify_integrity : bool, default False
- Check whether the new concatenated axis contains duplicates. This can
- be very expensive relative to the actual data concatenation.
- sort : bool, default False
- Sort non-concatenation axis if it is not already aligned.
-
- copy : bool or None, default None
- If False, do not copy data unnecessarily. When None, this defaults to True
- unless copy-on-write is enabled, in which case no data is copied.
-
- Returns
- -------
- object, type of objs
- When concatenating all ``Series`` along the index (axis=0), a
- ``Series`` is returned. When ``objs`` contains at least one
- ``DataFrame``, a ``DataFrame`` is returned. When concatenating along
- the columns (axis=1), a ``DataFrame`` is returned.
-
- See Also
- --------
- DataFrame.join : Join DataFrames using indexes.
- DataFrame.merge : Merge DataFrames by indexes or columns.
-
- Notes
- -----
- The keys, levels, and names arguments are all optional.
-
- A walkthrough of how this method fits in with other tools for combining
- pandas objects can be found `here
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html>`__.
-
- It is not recommended to build DataFrames by adding single rows in a
- for loop. Build a list of rows and make a DataFrame in a single concat.
-
- Examples
- --------
- Combine two ``Series``.
-
- >>> s1 = pd.Series(['a', 'b'])
- >>> s2 = pd.Series(['c', 'd'])
- >>> pd.concat([s1, s2])
- 0 a
- 1 b
- 0 c
- 1 d
- dtype: object
-
- Clear the existing index and reset it in the result
- by setting the ``ignore_index`` option to ``True``.
-
- >>> pd.concat([s1, s2], ignore_index=True)
- 0 a
- 1 b
- 2 c
- 3 d
- dtype: object
-
- Add a hierarchical index at the outermost level of
- the data with the ``keys`` option.
-
- >>> pd.concat([s1, s2], keys=['s1', 's2'])
- s1 0 a
- 1 b
- s2 0 c
- 1 d
- dtype: object
-
- Label the index keys you create with the ``names`` option.
-
- >>> pd.concat([s1, s2], keys=['s1', 's2'],
- ... names=['Series name', 'Row ID'])
- Series name Row ID
- s1 0 a
- 1 b
- s2 0 c
- 1 d
- dtype: object
-
- Combine two ``DataFrame`` objects with identical columns.
-
- >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],
- ... columns=['letter', 'number'])
- >>> df1
- letter number
- 0 a 1
- 1 b 2
- >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],
- ... columns=['letter', 'number'])
- >>> df2
- letter number
- 0 c 3
- 1 d 4
- >>> pd.concat([df1, df2])
- letter number
- 0 a 1
- 1 b 2
- 0 c 3
- 1 d 4
-
- Combine ``DataFrame`` objects with overlapping columns
- and return everything. Columns outside the intersection will
- be filled with ``NaN`` values.
-
- >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
- ... columns=['letter', 'number', 'animal'])
- >>> df3
- letter number animal
- 0 c 3 cat
- 1 d 4 dog
- >>> pd.concat([df1, df3], sort=False)
- letter number animal
- 0 a 1 NaN
- 1 b 2 NaN
- 0 c 3 cat
- 1 d 4 dog
-
- Combine ``DataFrame`` objects with overlapping columns
- and return only those that are shared by passing ``inner`` to
- the ``join`` keyword argument.
-
- >>> pd.concat([df1, df3], join="inner")
- letter number
- 0 a 1
- 1 b 2
- 0 c 3
- 1 d 4
-
- Combine ``DataFrame`` objects horizontally along the x axis by
- passing in ``axis=1``.
-
- >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
- ... columns=['animal', 'name'])
- >>> pd.concat([df1, df4], axis=1)
- letter number animal name
- 0 a 1 bird polly
- 1 b 2 monkey george
-
- Prevent the result from including duplicate index values with the
- ``verify_integrity`` option.
-
- >>> df5 = pd.DataFrame([1], index=['a'])
- >>> df5
- 0
- a 1
- >>> df6 = pd.DataFrame([2], index=['a'])
- >>> df6
- 0
- a 2
- >>> pd.concat([df5, df6], verify_integrity=True)
- Traceback (most recent call last):
- ...
- ValueError: Indexes have overlapping values: ['a']
-
- Append a single row to the end of a ``DataFrame`` object.
-
- >>> df7 = pd.DataFrame({'a': 1, 'b': 2}, index=[0])
- >>> df7
- a b
- 0 1 2
- >>> new_row = pd.Series({'a': 3, 'b': 4})
- >>> new_row
- a 3
- b 4
- dtype: int64
- >>> pd.concat([df7, new_row.to_frame().T], ignore_index=True)
- a b
- 0 1 2
- 1 3 4
- """
- if copy is None:
- if using_copy_on_write():
- copy = False
- else:
- copy = True
- elif copy and using_copy_on_write():
- copy = False
-
- op = _Concatenator(
- objs,
- axis=axis,
- ignore_index=ignore_index,
- join=join,
- keys=keys,
- levels=levels,
- names=names,
- verify_integrity=verify_integrity,
- copy=copy,
- sort=sort,
- )
-
- return op.get_result()
-
-
-class _Concatenator:
- """
- Orchestrates a concatenation operation for BlockManagers
- """
-
- def __init__(
- self,
- objs: Iterable[NDFrame] | Mapping[HashableT, NDFrame],
- axis: Axis = 0,
- join: str = "outer",
- keys=None,
- levels=None,
- names=None,
- ignore_index: bool = False,
- verify_integrity: bool = False,
- copy: bool = True,
- sort: bool = False,
- ) -> None:
- if isinstance(objs, (ABCSeries, ABCDataFrame, str)):
- raise TypeError(
- "first argument must be an iterable of pandas "
- f'objects, you passed an object of type "{type(objs).__name__}"'
- )
-
- if join == "outer":
- self.intersect = False
- elif join == "inner":
- self.intersect = True
- else: # pragma: no cover
- raise ValueError(
- "Only 'inner' (intersect) or 'outer' (union) joins are supported on the other axis"
- )
-
- if isinstance(objs, abc.Mapping):
- if keys is None:
- keys = list(objs.keys())
- objs = [objs[k] for k in keys]
- else:
- objs = list(objs)
-
- if len(objs) == 0:
- raise ValueError("No objects to concatenate")
-
- if keys is None:
- objs = list(com.not_none(*objs))
- else:
- # #1649
- clean_keys = []
- clean_objs = []
- for k, v in zip(keys, objs):
- if v is None:
- continue
- clean_keys.append(k)
- clean_objs.append(v)
- objs = clean_objs
-
- if isinstance(keys, MultiIndex):
- # TODO: retain levels?
- keys = type(keys).from_tuples(clean_keys, names=keys.names)
- else:
- name = getattr(keys, "name", None)
- keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None))
-
- if len(objs) == 0:
- raise ValueError("All objects passed were None")
-
- # figure out what our result ndim is going to be
- ndims = set()
- for obj in objs:
- if not isinstance(obj, (ABCSeries, ABCDataFrame)):
- msg = (
- f"cannot concatenate object of type '{type(obj)}'; "
- "only Series and DataFrame objs are valid"
- )
- raise TypeError(msg)
-
- ndims.add(obj.ndim)
-
- # get the sample
- # want the highest ndim that we have, and must be non-empty
- # unless all objs are empty
- sample: NDFrame | None = None
- if len(ndims) > 1:
- max_ndim = max(ndims)
- for obj in objs:
- if obj.ndim == max_ndim and np.sum(obj.shape):
- sample = obj
- break
-
- else:
- # filter out the empties if we have no multi-index possibilities
- # note: keep empty Series, as it affects the result columns / name
- non_empties = [
- obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, ABCSeries)
- ]
-
- if len(non_empties) and (
- keys is None and names is None and levels is None and not self.intersect
- ):
- objs = non_empties
- sample = objs[0]
-
- if sample is None:
- sample = objs[0]
- self.objs = objs
-
- # Standardize axis parameter to int
- if isinstance(sample, ABCSeries):
- from pandas import DataFrame
-
- axis = DataFrame._get_axis_number(axis)
- else:
- axis = sample._get_axis_number(axis)
-
- # Need to flip BlockManager axis in the DataFrame special case
- self._is_frame = isinstance(sample, ABCDataFrame)
- if self._is_frame:
- axis = sample._get_block_manager_axis(axis)
-
- self._is_series = isinstance(sample, ABCSeries)
- if not 0 <= axis <= sample.ndim:
- raise AssertionError(
- f"axis must be between 0 and {sample.ndim}, input was {axis}"
- )
-
- # if we have mixed ndims, then convert to highest ndim
- # creating column numbers as needed
- if len(ndims) > 1:
- current_column = 0
- max_ndim = sample.ndim
- self.objs, objs = [], self.objs
- for obj in objs:
- ndim = obj.ndim
- if ndim == max_ndim:
- pass
-
- elif ndim != max_ndim - 1:
- raise ValueError(
- "cannot concatenate unaligned mixed "
- "dimensional NDFrame objects"
- )
-
- else:
- name = getattr(obj, "name", None)
- if ignore_index or name is None:
- name = current_column
- current_column += 1
-
- # doing a row-wise concatenation so need everything
- # to line up
- if self._is_frame and axis == 1:
- name = 0
- # mypy needs to know sample is not an NDFrame
- sample = cast("DataFrame | Series", sample)
- obj = sample._constructor({name: obj}, copy=False)
-
- self.objs.append(obj)
-
- # note: this is the BlockManager axis (since DataFrame is transposed)
- self.bm_axis = axis
- self.axis = 1 - self.bm_axis if self._is_frame else 0
- self.keys = keys
- self.names = names or getattr(keys, "names", None)
- self.levels = levels
-
- if not is_bool(sort):
- raise ValueError(
- f"The 'sort' keyword only accepts boolean values; {sort} was passed."
- )
- self.sort = sort
-
- self.ignore_index = ignore_index
- self.verify_integrity = verify_integrity
- self.copy = copy
-
- self.new_axes = self._get_new_axes()
-
- def get_result(self):
- cons: Callable[..., DataFrame | Series]
- sample: DataFrame | Series
-
- # series only
- if self._is_series:
- sample = cast("Series", self.objs[0])
-
- # stack blocks
- if self.bm_axis == 0:
- name = com.consensus_name_attr(self.objs)
- cons = sample._constructor
-
- arrs = [ser._values for ser in self.objs]
-
- res = concat_compat(arrs, axis=0)
- result = cons(res, index=self.new_axes[0], name=name, dtype=res.dtype)
- return result.__finalize__(self, method="concat")
-
- # combine as columns in a frame
- else:
- data = dict(zip(range(len(self.objs)), self.objs))
-
- # GH28330 Preserves subclassed objects through concat
- cons = sample._constructor_expanddim
-
- index, columns = self.new_axes
- df = cons(data, index=index, copy=self.copy)
- df.columns = columns
- return df.__finalize__(self, method="concat")
-
- # combine block managers
- else:
- sample = cast("DataFrame", self.objs[0])
-
- mgrs_indexers = []
- for obj in self.objs:
- indexers = {}
- for ax, new_labels in enumerate(self.new_axes):
- # ::-1 to convert BlockManager ax to DataFrame ax
- if ax == self.bm_axis:
- # Suppress reindexing on concat axis
- continue
-
- # 1-ax to convert BlockManager axis to DataFrame axis
- obj_labels = obj.axes[1 - ax]
- if not new_labels.equals(obj_labels):
- indexers[ax] = obj_labels.get_indexer(new_labels)
-
- mgrs_indexers.append((obj._mgr, indexers))
-
- new_data = concatenate_managers(
- mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
- )
- if not self.copy and not using_copy_on_write():
- new_data._consolidate_inplace()
-
- cons = sample._constructor
- return cons(new_data).__finalize__(self, method="concat")
-
- def _get_result_dim(self) -> int:
- if self._is_series and self.bm_axis == 1:
- return 2
- else:
- return self.objs[0].ndim
-
- def _get_new_axes(self) -> list[Index]:
- ndim = self._get_result_dim()
- return [
- self._get_concat_axis if i == self.bm_axis else self._get_comb_axis(i)
- for i in range(ndim)
- ]
-
- def _get_comb_axis(self, i: AxisInt) -> Index:
- data_axis = self.objs[0]._get_block_manager_axis(i)
- return get_objs_combined_axis(
- self.objs,
- axis=data_axis,
- intersect=self.intersect,
- sort=self.sort,
- copy=self.copy,
- )
-
- @cache_readonly
- def _get_concat_axis(self) -> Index:
- """
- Return index to be used along concatenation axis.
- """
- if self._is_series:
- if self.bm_axis == 0:
- indexes = [x.index for x in self.objs]
- elif self.ignore_index:
- idx = default_index(len(self.objs))
- return idx
- elif self.keys is None:
- names: list[Hashable] = [None] * len(self.objs)
- num = 0
- has_names = False
- for i, x in enumerate(self.objs):
- if not isinstance(x, ABCSeries):
- raise TypeError(
- f"Cannot concatenate type 'Series' with "
- f"object of type '{type(x).__name__}'"
- )
- if x.name is not None:
- names[i] = x.name
- has_names = True
- else:
- names[i] = num
- num += 1
- if has_names:
- return Index(names)
- else:
- return default_index(len(self.objs))
- else:
- return ensure_index(self.keys).set_names(self.names)
- else:
- indexes = [x.axes[self.axis] for x in self.objs]
-
- if self.ignore_index:
- idx = default_index(sum(len(i) for i in indexes))
- return idx
-
- if self.keys is None:
- if self.levels is not None:
- raise ValueError("levels supported only when keys is not None")
- concat_axis = _concat_indexes(indexes)
- else:
- concat_axis = _make_concat_multiindex(
- indexes, self.keys, self.levels, self.names
- )
-
- self._maybe_check_integrity(concat_axis)
-
- return concat_axis
-
- def _maybe_check_integrity(self, concat_index: Index):
- if self.verify_integrity:
- if not concat_index.is_unique:
- overlap = concat_index[concat_index.duplicated()].unique()
- raise ValueError(f"Indexes have overlapping values: {overlap}")
-
-
-def _concat_indexes(indexes) -> Index:
- return indexes[0].append(indexes[1:])
-
-
-def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiIndex:
- if (levels is None and isinstance(keys[0], tuple)) or (
- levels is not None and len(levels) > 1
- ):
- zipped = list(zip(*keys))
- if names is None:
- names = [None] * len(zipped)
-
- if levels is None:
- _, levels = factorize_from_iterables(zipped)
- else:
- levels = [ensure_index(x) for x in levels]
- else:
- zipped = [keys]
- if names is None:
- names = [None]
-
- if levels is None:
- levels = [ensure_index(keys).unique()]
- else:
- levels = [ensure_index(x) for x in levels]
-
- for level in levels:
- if not level.is_unique:
- raise ValueError(f"Level values not unique: {level.tolist()}")
-
- if not all_indexes_same(indexes) or not all(level.is_unique for level in levels):
- codes_list = []
-
- # things are potentially different sizes, so compute the exact codes
- # for each level and pass those to MultiIndex.from_arrays
-
- for hlevel, level in zip(zipped, levels):
- to_concat = []
- if isinstance(hlevel, Index) and hlevel.equals(level):
- lens = [len(idx) for idx in indexes]
- codes_list.append(np.repeat(np.arange(len(hlevel)), lens))
- else:
- for key, index in zip(hlevel, indexes):
- # Find matching codes, include matching nan values as equal.
- mask = (isna(level) & isna(key)) | (level == key)
- if not mask.any():
- raise ValueError(f"Key {key} not in level {level}")
- i = np.nonzero(mask)[0][0]
-
- to_concat.append(np.repeat(i, len(index)))
- codes_list.append(np.concatenate(to_concat))
-
- concat_index = _concat_indexes(indexes)
-
- # these go at the end
- if isinstance(concat_index, MultiIndex):
- levels.extend(concat_index.levels)
- codes_list.extend(concat_index.codes)
- else:
- codes, categories = factorize_from_iterable(concat_index)
- levels.append(categories)
- codes_list.append(codes)
-
- if len(names) == len(levels):
- names = list(names)
- else:
- # make sure that all of the passed indices have the same nlevels
- if not len({idx.nlevels for idx in indexes}) == 1:
- raise AssertionError(
- "Cannot concat indices that do not have the same number of levels"
- )
-
- # also copies
- names = list(names) + list(get_unanimous_names(*indexes))
-
- return MultiIndex(
- levels=levels, codes=codes_list, names=names, verify_integrity=False
- )
-
- new_index = indexes[0]
- n = len(new_index)
- kpieces = len(indexes)
-
- # also copies
- new_names = list(names)
- new_levels = list(levels)
-
- # construct codes
- new_codes = []
-
- # do something a bit more speedy
-
- for hlevel, level in zip(zipped, levels):
- hlevel = ensure_index(hlevel)
- mapped = level.get_indexer(hlevel)
-
- mask = mapped == -1
- if mask.any():
- raise ValueError(f"Values not found in passed level: {hlevel[mask]!s}")
-
- new_codes.append(np.repeat(mapped, n))
-
- if isinstance(new_index, MultiIndex):
- new_levels.extend(new_index.levels)
- new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
- else:
- new_levels.append(new_index.unique())
- single_codes = new_index.unique().get_indexer(new_index)
- new_codes.append(np.tile(single_codes, kpieces))
-
- if len(new_names) < len(new_levels):
- new_names.extend(new_index.names)
-
- return MultiIndex(
- levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
- )
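For reference, the keys handling that _make_concat_multiindex implements in the removed concat.py adds the passed keys as the outermost index level, so each piece can be selected back out by its key. A minimal sketch with made-up frames:

    import pandas as pd

    df_a = pd.DataFrame({"x": [1, 2]})
    df_b = pd.DataFrame({"x": [3, 4]})

    # keys become the outermost level of a MultiIndex on the concat axis
    out = pd.concat([df_a, df_b], keys=["a", "b"], names=["src", "row"])
    print(out.index)      # MultiIndex([('a', 0), ('a', 1), ('b', 0), ('b', 1)], ...)
    print(out.loc["b"])   # recovers df_b by its key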
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/encoding.py b/contrib/python/pandas/py3/pandas/core/reshape/encoding.py
deleted file mode 100644
index b907cf34626..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/encoding.py
+++ /dev/null
@@ -1,533 +0,0 @@
-from __future__ import annotations
-
-from collections import defaultdict
-import itertools
-from typing import (
- Hashable,
- Iterable,
-)
-
-import numpy as np
-
-from pandas._libs.sparse import IntIndex
-from pandas._typing import NpDtype
-
-from pandas.core.dtypes.common import (
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- pandas_dtype,
-)
-
-from pandas.core.arrays import SparseArray
-from pandas.core.arrays.categorical import factorize_from_iterable
-from pandas.core.frame import DataFrame
-from pandas.core.indexes.api import (
- Index,
- default_index,
-)
-from pandas.core.series import Series
-
-
-def get_dummies(
- data,
- prefix=None,
- prefix_sep: str | Iterable[str] | dict[str, str] = "_",
- dummy_na: bool = False,
- columns=None,
- sparse: bool = False,
- drop_first: bool = False,
- dtype: NpDtype | None = None,
-) -> DataFrame:
- """
- Convert categorical variable into dummy/indicator variables.
-
- Each variable is converted into as many 0/1 variables as there are different
- values. Columns in the output are each named after a value; if the input is
- a DataFrame, the name of the original variable is prepended to the value.
-
- Parameters
- ----------
- data : array-like, Series, or DataFrame
- Data of which to get dummy indicators.
- prefix : str, list of str, or dict of str, default None
- String to prepend to the resulting dummy column names.
- Pass a list with length equal to the number of columns
- when calling get_dummies on a DataFrame. Alternatively, `prefix`
- can be a dictionary mapping column names to prefixes.
- prefix_sep : str, default '_'
- If appending prefix, separator/delimiter to use. Or pass a
- list or dictionary as with `prefix`.
- dummy_na : bool, default False
- Add a column to indicate NaNs, if False NaNs are ignored.
- columns : list-like, default None
- Column names in the DataFrame to be encoded.
- If `columns` is None then all the columns with
- `object`, `string`, or `category` dtype will be converted.
- sparse : bool, default False
- Whether the dummy-encoded columns should be backed by
- a :class:`SparseArray` (True) or a regular NumPy array (False).
- drop_first : bool, default False
- Whether to get k-1 dummies out of k categorical levels by removing the
- first level.
- dtype : dtype, default bool
- Data type for new columns. Only a single dtype is allowed.
-
- Returns
- -------
- DataFrame
- Dummy-coded data. If `data` contains other columns than the
- dummy-coded one(s), these will be prepended, unaltered, to the result.
-
- See Also
- --------
- Series.str.get_dummies : Convert Series of strings to dummy codes.
- :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
-
- Notes
- -----
- Reference :ref:`the user guide <reshaping.dummies>` for more examples.
-
- Examples
- --------
- >>> s = pd.Series(list('abca'))
-
- >>> pd.get_dummies(s)
- a b c
- 0 True False False
- 1 False True False
- 2 False False True
- 3 True False False
-
- >>> s1 = ['a', 'b', np.nan]
-
- >>> pd.get_dummies(s1)
- a b
- 0 True False
- 1 False True
- 2 False False
-
- >>> pd.get_dummies(s1, dummy_na=True)
- a b NaN
- 0 True False False
- 1 False True False
- 2 False False True
-
- >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
- ... 'C': [1, 2, 3]})
-
- >>> pd.get_dummies(df, prefix=['col1', 'col2'])
- C col1_a col1_b col2_a col2_b col2_c
- 0 1 True False False True False
- 1 2 False True True False False
- 2 3 True False False False True
-
- >>> pd.get_dummies(pd.Series(list('abcaa')))
- a b c
- 0 True False False
- 1 False True False
- 2 False False True
- 3 True False False
- 4 True False False
-
- >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
- b c
- 0 False False
- 1 True False
- 2 False True
- 3 False False
- 4 False False
-
- >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
- a b c
- 0 1.0 0.0 0.0
- 1 0.0 1.0 0.0
- 2 0.0 0.0 1.0
- """
- from pandas.core.reshape.concat import concat
-
- dtypes_to_encode = ["object", "string", "category"]
-
- if isinstance(data, DataFrame):
- # determine columns being encoded
- if columns is None:
- data_to_encode = data.select_dtypes(include=dtypes_to_encode)
- elif not is_list_like(columns):
- raise TypeError("Input must be a list-like for parameter `columns`")
- else:
- data_to_encode = data[columns]
-
- # validate prefixes and separator to avoid silently dropping cols
- def check_len(item, name):
- if is_list_like(item):
- if not len(item) == data_to_encode.shape[1]:
- len_msg = (
- f"Length of '{name}' ({len(item)}) did not match the "
- "length of the columns being encoded "
- f"({data_to_encode.shape[1]})."
- )
- raise ValueError(len_msg)
-
- check_len(prefix, "prefix")
- check_len(prefix_sep, "prefix_sep")
-
- if isinstance(prefix, str):
- prefix = itertools.cycle([prefix])
- if isinstance(prefix, dict):
- prefix = [prefix[col] for col in data_to_encode.columns]
-
- if prefix is None:
- prefix = data_to_encode.columns
-
- # validate separators
- if isinstance(prefix_sep, str):
- prefix_sep = itertools.cycle([prefix_sep])
- elif isinstance(prefix_sep, dict):
- prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
-
- with_dummies: list[DataFrame]
- if data_to_encode.shape == data.shape:
- # Encoding the entire df, do not prepend any dropped columns
- with_dummies = []
- elif columns is not None:
- # Encoding only cols specified in columns. Get all cols not in
- # columns to prepend to result.
- with_dummies = [data.drop(columns, axis=1)]
- else:
- # Encoding only object and category dtype columns. Get remaining
- # columns to prepend to result.
- with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
-
- for col, pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
- # col is (column_name, column), use just column data here
- dummy = _get_dummies_1d(
- col[1],
- prefix=pre,
- prefix_sep=sep,
- dummy_na=dummy_na,
- sparse=sparse,
- drop_first=drop_first,
- dtype=dtype,
- )
- with_dummies.append(dummy)
- result = concat(with_dummies, axis=1)
- else:
- result = _get_dummies_1d(
- data,
- prefix,
- prefix_sep,
- dummy_na,
- sparse=sparse,
- drop_first=drop_first,
- dtype=dtype,
- )
- return result
-
-
-def _get_dummies_1d(
- data,
- prefix,
- prefix_sep: str | Iterable[str] | dict[str, str] = "_",
- dummy_na: bool = False,
- sparse: bool = False,
- drop_first: bool = False,
- dtype: NpDtype | None = None,
-) -> DataFrame:
- from pandas.core.reshape.concat import concat
-
- # Series avoids inconsistent NaN handling
- codes, levels = factorize_from_iterable(Series(data, copy=False))
-
- if dtype is None:
- dtype = np.dtype(bool)
- _dtype = pandas_dtype(dtype)
-
- if is_object_dtype(_dtype):
- raise ValueError("dtype=object is not a valid dtype for get_dummies")
-
- def get_empty_frame(data) -> DataFrame:
- index: Index | np.ndarray
- if isinstance(data, Series):
- index = data.index
- else:
- index = default_index(len(data))
- return DataFrame(index=index)
-
- # if all NaN
- if not dummy_na and len(levels) == 0:
- return get_empty_frame(data)
-
- codes = codes.copy()
- if dummy_na:
- codes[codes == -1] = len(levels)
- levels = levels.insert(len(levels), np.nan)
-
- # if dummy_na, we just fake a nan level. drop_first will drop it again
- if drop_first and len(levels) == 1:
- return get_empty_frame(data)
-
- number_of_cols = len(levels)
-
- if prefix is None:
- dummy_cols = levels
- else:
- dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])
-
- index: Index | None
- if isinstance(data, Series):
- index = data.index
- else:
- index = None
-
- if sparse:
- fill_value: bool | float
- if is_integer_dtype(dtype):
- fill_value = 0
- elif dtype == np.dtype(bool):
- fill_value = False
- else:
- fill_value = 0.0
-
- sparse_series = []
- N = len(data)
- sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
- mask = codes != -1
- codes = codes[mask]
- n_idx = np.arange(N)[mask]
-
- for ndx, code in zip(n_idx, codes):
- sp_indices[code].append(ndx)
-
- if drop_first:
- # remove first categorical level to avoid perfect collinearity
- # GH12042
- sp_indices = sp_indices[1:]
- dummy_cols = dummy_cols[1:]
- for col, ixs in zip(dummy_cols, sp_indices):
- sarr = SparseArray(
- np.ones(len(ixs), dtype=dtype),
- sparse_index=IntIndex(N, ixs),
- fill_value=fill_value,
- dtype=dtype,
- )
- sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))
-
- return concat(sparse_series, axis=1, copy=False)
-
- else:
- # take on axis=1 + transpose to ensure ndarray layout is column-major
- eye_dtype: NpDtype
- if isinstance(_dtype, np.dtype):
- eye_dtype = _dtype
- else:
- eye_dtype = np.bool_
- dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T
-
- if not dummy_na:
- # reset NaN GH4446
- dummy_mat[codes == -1] = 0
-
- if drop_first:
- # remove first GH12042
- dummy_mat = dummy_mat[:, 1:]
- dummy_cols = dummy_cols[1:]
- return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
-
-
-def from_dummies(
- data: DataFrame,
- sep: None | str = None,
- default_category: None | Hashable | dict[str, Hashable] = None,
-) -> DataFrame:
- """
- Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.
-
- Inverts the operation performed by :func:`~pandas.get_dummies`.
-
- .. versionadded:: 1.5.0
-
- Parameters
- ----------
- data : DataFrame
- Data which contains dummy-coded variables in the form of integer columns of
- 1's and 0's.
- sep : str, default None
- Separator used in the column names of the dummy categories; it is the
- character indicating the separation of the categorical names from the prefixes.
- For example, if your column names are 'prefix_A' and 'prefix_B',
- you can strip the underscore by specifying sep='_'.
- default_category : None, Hashable or dict of Hashables, default None
- The implied category for a row in which none of the listed categories is
- set to one, i.e. when all dummies in that row are zero. Can be a single
- value used for all variables, or a dict mapping each variable's prefix to
- its default category.
-
- Returns
- -------
- DataFrame
- Categorical data decoded from the dummy input-data.
-
- Raises
- ------
- ValueError
- * When the input ``DataFrame`` ``data`` contains NA values.
- * When the input ``DataFrame`` ``data`` contains column names with separators
- that do not match the separator specified with ``sep``.
- * When a ``dict`` passed to ``default_category`` does not include an implied
- category for each prefix.
- * When a value in ``data`` has more than one category assigned to it.
- * When ``default_category=None`` and a value in ``data`` has no category
- assigned to it.
- TypeError
- * When the input ``data`` is not of type ``DataFrame``.
- * When the input ``DataFrame`` ``data`` contains non-dummy data.
- * When the passed ``sep`` is of a wrong data type.
- * When the passed ``default_category`` is of a wrong data type.
-
- See Also
- --------
- :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
- :class:`~pandas.Categorical` : Represent a categorical variable in classic R / S-plus fashion.
-
- Notes
- -----
- The columns of the passed dummy data should only include 1's and 0's,
- or boolean values.
-
- Examples
- --------
- >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
- ... "c": [0, 0, 1, 0]})
-
- >>> df
- a b c
- 0 1 0 0
- 1 0 1 0
- 2 0 0 1
- 3 1 0 0
-
- >>> pd.from_dummies(df)
- 0 a
- 1 b
- 2 c
- 3 a
-
- >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
- ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
- ... "col2_c": [0, 0, 1]})
-
- >>> df
- col1_a col1_b col2_a col2_b col2_c
- 0 1 0 0 1 0
- 1 0 1 1 0 0
- 2 1 0 0 0 1
-
- >>> pd.from_dummies(df, sep="_")
- col1 col2
- 0 a b
- 1 b a
- 2 a c
-
- >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
- ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
- ... "col2_c": [0, 0, 0]})
-
- >>> df
- col1_a col1_b col2_a col2_b col2_c
- 0 1 0 0 1 0
- 1 0 1 1 0 0
- 2 0 0 0 0 0
-
- >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
- col1 col2
- 0 a b
- 1 b a
- 2 d e
- """
- from pandas.core.reshape.concat import concat
-
- if not isinstance(data, DataFrame):
- raise TypeError(
- "Expected 'data' to be a 'DataFrame'; "
- f"Received 'data' of type: {type(data).__name__}"
- )
-
- if data.isna().any().any():
- raise ValueError(
- "Dummy DataFrame contains NA value in column: "
- f"'{data.isna().any().idxmax()}'"
- )
-
- # index data with a list of all columns that are dummies
- try:
- data_to_decode = data.astype("boolean", copy=False)
- except TypeError:
- raise TypeError("Passed DataFrame contains non-dummy data")
-
- # collect prefixes and get lists to slice data for each prefix
- variables_slice = defaultdict(list)
- if sep is None:
- variables_slice[""] = list(data.columns)
- elif isinstance(sep, str):
- for col in data_to_decode.columns:
- prefix = col.split(sep)[0]
- if len(prefix) == len(col):
- raise ValueError(f"Separator not specified for column: {col}")
- variables_slice[prefix].append(col)
- else:
- raise TypeError(
- "Expected 'sep' to be of type 'str' or 'None'; "
- f"Received 'sep' of type: {type(sep).__name__}"
- )
-
- if default_category is not None:
- if isinstance(default_category, dict):
- if not len(default_category) == len(variables_slice):
- len_msg = (
- f"Length of 'default_category' ({len(default_category)}) "
- f"did not match the length of the columns being encoded "
- f"({len(variables_slice)})"
- )
- raise ValueError(len_msg)
- elif isinstance(default_category, Hashable):
- default_category = dict(
- zip(variables_slice, [default_category] * len(variables_slice))
- )
- else:
- raise TypeError(
- "Expected 'default_category' to be of type "
- "'None', 'Hashable', or 'dict'; "
- "Received 'default_category' of type: "
- f"{type(default_category).__name__}"
- )
-
- cat_data = {}
- for prefix, prefix_slice in variables_slice.items():
- if sep is None:
- cats = prefix_slice.copy()
- else:
- cats = [col[len(prefix + sep) :] for col in prefix_slice]
- assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
- if any(assigned > 1):
- raise ValueError(
- "Dummy DataFrame contains multi-assignment(s); "
- f"First instance in row: {assigned.idxmax()}"
- )
- if any(assigned == 0):
- if isinstance(default_category, dict):
- cats.append(default_category[prefix])
- else:
- raise ValueError(
- "Dummy DataFrame contains unassigned value(s); "
- f"First instance in row: {assigned.idxmin()}"
- )
- data_slice = concat(
- (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
- )
- else:
- data_slice = data_to_decode.loc[:, prefix_slice]
- cats_array = np.array(cats, dtype="object")
- # get indices of True entries along axis=1
- cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]
-
- return DataFrame(cat_data)
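The two entry points in the removed encoding.py are inverses when the same separator is used: from_dummies strips the prefix that get_dummies added. A minimal round-trip sketch with made-up data:

    import pandas as pd

    df = pd.DataFrame({"colour": ["red", "blue", "red"]})

    # one boolean column per category, named "<prefix><sep><category>"
    dummies = pd.get_dummies(df, prefix="colour", prefix_sep="_")

    # strip the prefix again and recover the original labels
    restored = pd.from_dummies(dummies, sep="_")
    print(dummies)
    print(restored)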
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/melt.py b/contrib/python/pandas/py3/pandas/core/reshape/melt.py
deleted file mode 100644
index 8ed8dd14664..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/melt.py
+++ /dev/null
@@ -1,540 +0,0 @@
-from __future__ import annotations
-
-import re
-from typing import (
- TYPE_CHECKING,
- Hashable,
-)
-
-import numpy as np
-
-from pandas.util._decorators import Appender
-
-from pandas.core.dtypes.common import (
- is_extension_array_dtype,
- is_list_like,
-)
-from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.missing import notna
-
-import pandas.core.algorithms as algos
-from pandas.core.arrays import Categorical
-import pandas.core.common as com
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
-)
-from pandas.core.reshape.concat import concat
-from pandas.core.reshape.util import tile_compat
-from pandas.core.shared_docs import _shared_docs
-from pandas.core.tools.numeric import to_numeric
-
-if TYPE_CHECKING:
- from pandas._typing import AnyArrayLike
-
- from pandas import DataFrame
-
-
-@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
-def melt(
- frame: DataFrame,
- id_vars=None,
- value_vars=None,
- var_name=None,
- value_name: Hashable = "value",
- col_level=None,
- ignore_index: bool = True,
-) -> DataFrame:
- # If multiindex, gather names of columns on all levels for checking presence
- # of `id_vars` and `value_vars`
- if isinstance(frame.columns, MultiIndex):
- cols = [x for c in frame.columns for x in c]
- else:
- cols = list(frame.columns)
-
- if value_name in frame.columns:
- raise ValueError(
- f"value_name ({value_name}) cannot match an element in "
- "the DataFrame columns."
- )
-
- if id_vars is not None:
- if not is_list_like(id_vars):
- id_vars = [id_vars]
- elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
- raise ValueError(
- "id_vars must be a list of tuples when columns are a MultiIndex"
- )
- else:
- # Check that `id_vars` are in frame
- id_vars = list(id_vars)
- missing = Index(com.flatten(id_vars)).difference(cols)
- if not missing.empty:
- raise KeyError(
- "The following 'id_vars' are not present "
- f"in the DataFrame: {list(missing)}"
- )
- else:
- id_vars = []
-
- if value_vars is not None:
- if not is_list_like(value_vars):
- value_vars = [value_vars]
- elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
- raise ValueError(
- "value_vars must be a list of tuples when columns are a MultiIndex"
- )
- else:
- value_vars = list(value_vars)
- # Check that `value_vars` are in frame
- missing = Index(com.flatten(value_vars)).difference(cols)
- if not missing.empty:
- raise KeyError(
- "The following 'value_vars' are not present in "
- f"the DataFrame: {list(missing)}"
- )
- if col_level is not None:
- idx = frame.columns.get_level_values(col_level).get_indexer(
- id_vars + value_vars
- )
- else:
- idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
- frame = frame.iloc[:, idx]
- else:
- frame = frame.copy()
-
- if col_level is not None: # allow list or other?
- # frame is a copy
- frame.columns = frame.columns.get_level_values(col_level)
-
- if var_name is None:
- if isinstance(frame.columns, MultiIndex):
- if len(frame.columns.names) == len(set(frame.columns.names)):
- var_name = frame.columns.names
- else:
- var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
- else:
- var_name = [
- frame.columns.name if frame.columns.name is not None else "variable"
- ]
- if isinstance(var_name, str):
- var_name = [var_name]
-
- N, K = frame.shape
- K -= len(id_vars)
-
- mdata: dict[Hashable, AnyArrayLike] = {}
- for col in id_vars:
- id_data = frame.pop(col)
- if is_extension_array_dtype(id_data):
- if K > 0:
- id_data = concat([id_data] * K, ignore_index=True)
- else:
- # We can't concat an empty list. (GH 46044)
- id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
- else:
- # error: Incompatible types in assignment (expression has type
- # "ndarray[Any, dtype[Any]]", variable has type "Series")
- id_data = np.tile(id_data._values, K) # type: ignore[assignment]
- mdata[col] = id_data
-
- mcolumns = id_vars + var_name + [value_name]
-
- if frame.shape[1] > 0:
- mdata[value_name] = concat(
- [frame.iloc[:, i] for i in range(frame.shape[1])]
- ).values
- else:
- mdata[value_name] = frame._values.ravel("F")
- for i, col in enumerate(var_name):
- # asanyarray will keep the columns as an Index
- mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)
-
- result = frame._constructor(mdata, columns=mcolumns)
-
- if not ignore_index:
- result.index = tile_compat(frame.index, K)
-
- return result
-
-
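melt() above takes its docstring from _shared_docs via the Appender decorator, so no example appears in this file. A minimal sketch (made-up frame) of the id_vars/value_vars path it implements:

    import pandas as pd

    df = pd.DataFrame({"id": [1, 2], "x": [10, 20], "y": [30, 40]})

    # id columns are repeated, value columns are stacked into variable/value pairs
    long = pd.melt(df, id_vars="id", value_vars=["x", "y"],
                   var_name="metric", value_name="reading")
    print(long)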
-def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame:
- """
- Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.
-
- Accepts a dictionary, ``groups``, in which each key is a new column name
- and each value is a list of old column names that will be "melted" under
- the new column name as part of the reshape.
-
- Parameters
- ----------
- data : DataFrame
- The wide-format DataFrame.
- groups : dict
- {new_name : list_of_columns}.
- dropna : bool, default True
- Do not include columns whose entries are all NaN.
-
- Returns
- -------
- DataFrame
- Reshaped DataFrame.
-
- See Also
- --------
- melt : Unpivot a DataFrame from wide to long format, optionally leaving
- identifiers set.
- pivot : Create a spreadsheet-style pivot table as a DataFrame.
- DataFrame.pivot : Pivot without aggregation that can handle
- non-numeric data.
- DataFrame.pivot_table : Generalization of pivot that can handle
- duplicate values for one index/column pair.
- DataFrame.unstack : Pivot based on the index values instead of a
- column.
- wide_to_long : Wide panel to long format. Less flexible but more
- user-friendly than melt.
-
- Examples
- --------
- >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
- ... 'team': ['Red Sox', 'Yankees'],
- ... 'year1': [2007, 2007], 'year2': [2008, 2008]})
- >>> data
- hr1 hr2 team year1 year2
- 0 514 545 Red Sox 2007 2008
- 1 573 526 Yankees 2007 2008
-
- >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
- team year hr
- 0 Red Sox 2007 514
- 1 Yankees 2007 573
- 2 Red Sox 2008 545
- 3 Yankees 2008 526
- """
- if isinstance(groups, dict):
- keys = list(groups.keys())
- values = list(groups.values())
- else:
- keys, values = zip(*groups)
-
- all_cols = list(set.union(*(set(x) for x in values)))
- id_cols = list(data.columns.difference(all_cols))
-
- K = len(values[0])
-
- for seq in values:
- if len(seq) != K:
- raise ValueError("All column lists must be same length")
-
- mdata = {}
- pivot_cols = []
-
- for target, names in zip(keys, values):
- to_concat = [data[col]._values for col in names]
-
- mdata[target] = concat_compat(to_concat)
- pivot_cols.append(target)
-
- for col in id_cols:
- mdata[col] = np.tile(data[col]._values, K)
-
- if dropna:
- mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
- for c in pivot_cols:
- mask &= notna(mdata[c])
- if not mask.all():
- mdata = {k: v[mask] for k, v in mdata.items()}
-
- return data._constructor(mdata, columns=id_cols + pivot_cols)
-
-
-def wide_to_long(
- df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
-) -> DataFrame:
- r"""
- Unpivot a DataFrame from wide to long format.
-
- Less flexible but more user-friendly than melt.
-
- With stubnames ['A', 'B'], this function expects to find one or more
- groups of columns with the format
- A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
- You specify what you want to call this suffix in the resulting long format
- with `j` (for example `j='year'`)
-
- Each row of these wide variables is assumed to be uniquely identified by
- `i` (can be a single column name or a list of column names)
-
- All remaining variables in the data frame are left intact.
-
- Parameters
- ----------
- df : DataFrame
- The wide-format DataFrame.
- stubnames : str or list-like
- The stub name(s). The wide format variables are assumed to
- start with the stub names.
- i : str or list-like
- Column(s) to use as id variable(s).
- j : str
- The name of the sub-observation variable. What you wish to name your
- suffix in the long format.
- sep : str, default ""
- A character indicating the separation of the variable names
- in the wide format, to be stripped from the names in the long format.
- For example, if your column names are A-suffix1, A-suffix2, you
- can strip the hyphen by specifying `sep='-'`.
- suffix : str, default '\\d+'
- A regular expression capturing the wanted suffixes. '\\d+' captures
- numeric suffixes. Suffixes with no numbers could be specified with the
- negated character class '\\D+'. You can also further disambiguate
- suffixes, for example, if your wide variables are of the form A-one,
- B-two,.., and you have an unrelated column A-rating, you can ignore the
- last one by specifying `suffix='(one|two)'`. When all suffixes are
- numeric, they are cast to int64/float64.
-
- Returns
- -------
- DataFrame
- A DataFrame that contains each stub name as a variable, with new index
- (i, j).
-
- See Also
- --------
- melt : Unpivot a DataFrame from wide to long format, optionally leaving
- identifiers set.
- pivot : Create a spreadsheet-style pivot table as a DataFrame.
- DataFrame.pivot : Pivot without aggregation that can handle
- non-numeric data.
- DataFrame.pivot_table : Generalization of pivot that can handle
- duplicate values for one index/column pair.
- DataFrame.unstack : Pivot based on the index values instead of a
- column.
-
- Notes
- -----
- All extra variables are left untouched. This simply uses
- `pandas.melt` under the hood, but is hard-coded to "do the right thing"
- in a typical case.
-
- Examples
- --------
- >>> np.random.seed(123)
- >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
- ... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
- ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
- ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
- ... "X" : dict(zip(range(3), np.random.randn(3)))
- ... })
- >>> df["id"] = df.index
- >>> df
- A1970 A1980 B1970 B1980 X id
- 0 a d 2.5 3.2 -1.085631 0
- 1 b e 1.2 1.3 0.997345 1
- 2 c f 0.7 0.1 0.282978 2
- >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
- ... # doctest: +NORMALIZE_WHITESPACE
- X A B
- id year
- 0 1970 -1.085631 a 2.5
- 1 1970 0.997345 b 1.2
- 2 1970 0.282978 c 0.7
- 0 1980 -1.085631 d 3.2
- 1 1980 0.997345 e 1.3
- 2 1980 0.282978 f 0.1
-
- With multiple id columns
-
- >>> df = pd.DataFrame({
- ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
- ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
- ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
- ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
- ... })
- >>> df
- famid birth ht1 ht2
- 0 1 1 2.8 3.4
- 1 1 2 2.9 3.8
- 2 1 3 2.2 2.9
- 3 2 1 2.0 3.2
- 4 2 2 1.8 2.8
- 5 2 3 1.9 2.4
- 6 3 1 2.2 3.3
- 7 3 2 2.3 3.4
- 8 3 3 2.1 2.9
- >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
- >>> l
- ... # doctest: +NORMALIZE_WHITESPACE
- ht
- famid birth age
- 1 1 1 2.8
- 2 3.4
- 2 1 2.9
- 2 3.8
- 3 1 2.2
- 2 2.9
- 2 1 1 2.0
- 2 3.2
- 2 1 1.8
- 2 2.8
- 3 1 1.9
- 2 2.4
- 3 1 1 2.2
- 2 3.3
- 2 1 2.3
- 2 3.4
- 3 1 2.1
- 2 2.9
-
- Going from long back to wide just takes some creative use of `unstack`
-
- >>> w = l.unstack()
- >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
- >>> w.reset_index()
- famid birth ht1 ht2
- 0 1 1 2.8 3.4
- 1 1 2 2.9 3.8
- 2 1 3 2.2 2.9
- 3 2 1 2.0 3.2
- 4 2 2 1.8 2.8
- 5 2 3 1.9 2.4
- 6 3 1 2.2 3.3
- 7 3 2 2.3 3.4
- 8 3 3 2.1 2.9
-
- Less wieldy column names are also handled
-
- >>> np.random.seed(0)
- >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
- ... 'A(weekly)-2011': np.random.rand(3),
- ... 'B(weekly)-2010': np.random.rand(3),
- ... 'B(weekly)-2011': np.random.rand(3),
- ... 'X' : np.random.randint(3, size=3)})
- >>> df['id'] = df.index
- >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
- A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id
- 0 0.548814 0.544883 0.437587 0.383442 0 0
- 1 0.715189 0.423655 0.891773 0.791725 1 1
- 2 0.602763 0.645894 0.963663 0.528895 1 2
-
- >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
- ... j='year', sep='-')
- ... # doctest: +NORMALIZE_WHITESPACE
- X A(weekly) B(weekly)
- id year
- 0 2010 0 0.548814 0.437587
- 1 2010 1 0.715189 0.891773
- 2 2010 1 0.602763 0.963663
- 0 2011 0 0.544883 0.383442
- 1 2011 1 0.423655 0.791725
- 2 2011 1 0.645894 0.528895
-
- If we have many columns, we could also use a regex to find our
- stubnames and pass that list on to wide_to_long
-
- >>> stubnames = sorted(
- ... set([match[0] for match in df.columns.str.findall(
- ... r'[A-B]\(.*\)').values if match != []])
- ... )
- >>> list(stubnames)
- ['A(weekly)', 'B(weekly)']
-
- All of the above examples have integers as suffixes. It is possible to
- have non-integers as suffixes.
-
- >>> df = pd.DataFrame({
- ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
- ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
- ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
- ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
- ... })
- >>> df
- famid birth ht_one ht_two
- 0 1 1 2.8 3.4
- 1 1 2 2.9 3.8
- 2 1 3 2.2 2.9
- 3 2 1 2.0 3.2
- 4 2 2 1.8 2.8
- 5 2 3 1.9 2.4
- 6 3 1 2.2 3.3
- 7 3 2 2.3 3.4
- 8 3 3 2.1 2.9
-
- >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
- ... sep='_', suffix=r'\w+')
- >>> l
- ... # doctest: +NORMALIZE_WHITESPACE
- ht
- famid birth age
- 1 1 one 2.8
- two 3.4
- 2 one 2.9
- two 3.8
- 3 one 2.2
- two 2.9
- 2 1 one 2.0
- two 3.2
- 2 one 1.8
- two 2.8
- 3 one 1.9
- two 2.4
- 3 1 one 2.2
- two 3.3
- 2 one 2.3
- two 3.4
- 3 one 2.1
- two 2.9
- """
-
- def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]:
- regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
- pattern = re.compile(regex)
- return [col for col in df.columns if pattern.match(col)]
-
- def melt_stub(df, stub: str, i, j, value_vars, sep: str):
- newdf = melt(
- df,
- id_vars=i,
- value_vars=value_vars,
- value_name=stub.rstrip(sep),
- var_name=j,
- )
- newdf[j] = Categorical(newdf[j])
- newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)
-
- # GH17627 Cast numerics suffixes to int/float
- newdf[j] = to_numeric(newdf[j], errors="ignore")
-
- return newdf.set_index(i + [j])
-
- if not is_list_like(stubnames):
- stubnames = [stubnames]
- else:
- stubnames = list(stubnames)
-
- if any(col in stubnames for col in df.columns):
- raise ValueError("stubname can't be identical to a column name")
-
- if not is_list_like(i):
- i = [i]
- else:
- i = list(i)
-
- if df[i].duplicated().any():
- raise ValueError("the id variables need to uniquely identify each row")
-
- value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
-
- value_vars_flattened = [e for sublist in value_vars for e in sublist]
- id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
-
- _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
- melted = _melted[0].join(_melted[1:], how="outer")
-
- if len(i) == 1:
- new = df[id_vars].set_index(i).join(melted)
- return new
-
- new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
-
- return new
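wide_to_long in the removed melt.py is a thin wrapper that locates the stub columns by regex and dispatches to melt, producing an (i, j) MultiIndex. A minimal sketch with a made-up frame:

    import pandas as pd

    df = pd.DataFrame({
        "id": [0, 1],
        "A2010": [1.0, 2.0],
        "A2011": [3.0, 4.0],
    })

    # columns matching ^A\d+$ are melted; the numeric suffix becomes 'year'
    long = pd.wide_to_long(df, stubnames="A", i="id", j="year")
    print(long)   # indexed by (id, year) with a single 'A' column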
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/merge.py b/contrib/python/pandas/py3/pandas/core/reshape/merge.py
deleted file mode 100644
index b1fb6773648..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/merge.py
+++ /dev/null
@@ -1,2645 +0,0 @@
-"""
-SQL-style merge routines
-"""
-from __future__ import annotations
-
-import copy as cp
-import datetime
-from functools import partial
-import string
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Literal,
- Sequence,
- cast,
-)
-import uuid
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- Timedelta,
- hashtable as libhashtable,
- join as libjoin,
- lib,
-)
-from pandas._libs.lib import is_range_indexer
-from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- AxisInt,
- DtypeObj,
- IndexLabel,
- JoinHow,
- MergeHow,
- Shape,
- Suffixes,
- npt,
-)
-from pandas.errors import MergeError
-from pandas.util._decorators import (
- Appender,
- Substitution,
- cache_readonly,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.base import ExtensionDtype
-from pandas.core.dtypes.cast import find_common_type
-from pandas.core.dtypes.common import (
- ensure_float64,
- ensure_int64,
- ensure_object,
- is_array_like,
- is_bool,
- is_bool_dtype,
- is_categorical_dtype,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_number,
- is_numeric_dtype,
- is_object_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
-)
-
-from pandas import (
- ArrowDtype,
- Categorical,
- Index,
- MultiIndex,
- Series,
-)
-import pandas.core.algorithms as algos
-from pandas.core.arrays import (
- ArrowExtensionArray,
- BaseMaskedArray,
- ExtensionArray,
-)
-from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
-import pandas.core.common as com
-from pandas.core.construction import (
- ensure_wrapped_if_datetimelike,
- extract_array,
-)
-from pandas.core.frame import _merge_doc
-from pandas.core.indexes.api import default_index
-from pandas.core.sorting import is_int64_overflow_possible
-
-if TYPE_CHECKING:
- from pandas import DataFrame
- from pandas.core import groupby
- from pandas.core.arrays import DatetimeArray
-
-_factorizers = {
- np.int64: libhashtable.Int64Factorizer,
- np.longlong: libhashtable.Int64Factorizer,
- np.int32: libhashtable.Int32Factorizer,
- np.int16: libhashtable.Int16Factorizer,
- np.int8: libhashtable.Int8Factorizer,
- np.uint64: libhashtable.UInt64Factorizer,
- np.uint32: libhashtable.UInt32Factorizer,
- np.uint16: libhashtable.UInt16Factorizer,
- np.uint8: libhashtable.UInt8Factorizer,
- np.bool_: libhashtable.UInt8Factorizer,
- np.float64: libhashtable.Float64Factorizer,
- np.float32: libhashtable.Float32Factorizer,
- np.complex64: libhashtable.Complex64Factorizer,
- np.complex128: libhashtable.Complex128Factorizer,
- np.object_: libhashtable.ObjectFactorizer,
-}
-
-# See https://github.com/pandas-dev/pandas/issues/52451
-if np.intc is not np.int32:
- _factorizers[np.intc] = libhashtable.Int64Factorizer
-
-
-@Substitution("\nleft : DataFrame or named Series")
-@Appender(_merge_doc, indents=0)
-def merge(
- left: DataFrame | Series,
- right: DataFrame | Series,
- how: MergeHow = "inner",
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- left_index: bool = False,
- right_index: bool = False,
- sort: bool = False,
- suffixes: Suffixes = ("_x", "_y"),
- copy: bool | None = None,
- indicator: str | bool = False,
- validate: str | None = None,
-) -> DataFrame:
- op = _MergeOperation(
- left,
- right,
- how=how,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- sort=sort,
- suffixes=suffixes,
- indicator=indicator,
- validate=validate,
- )
- return op.get_result(copy=copy)
-
-
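merge() above is a thin wrapper that forwards its arguments to _MergeOperation. A minimal sketch (made-up frames) of two of the forwarded options, indicator= and validate=:

    import pandas as pd

    left = pd.DataFrame({"key": [1, 2, 3], "lval": ["a", "b", "c"]})
    right = pd.DataFrame({"key": [2, 3, 4], "rval": ["x", "y", "z"]})

    # indicator adds a _merge column; validate raises MergeError on duplicate keys
    out = pd.merge(left, right, on="key", how="outer",
                   indicator=True, validate="one_to_one")
    print(out)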
-def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
- """
- groupby & merge; we are always performing a left-by type operation
-
- Parameters
- ----------
- by: field to group
- left: DataFrame
- right: DataFrame
- merge_pieces: function for merging
- """
- pieces = []
- if not isinstance(by, (list, tuple)):
- by = [by]
-
- lby = left.groupby(by, sort=False)
- rby: groupby.DataFrameGroupBy | None = None
-
- # if we can groupby the rhs
- # then we can get vastly better perf
- if all(item in right.columns for item in by):
- rby = right.groupby(by, sort=False)
-
- for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):
- if rby is None:
- rhs = right
- else:
- try:
- rhs = right.take(rby.indices[key])
- except KeyError:
- # key doesn't exist in right
- lcols = lhs.columns.tolist()
- cols = lcols + [r for r in right.columns if r not in set(lcols)]
- merged = lhs.reindex(columns=cols)
- merged.index = range(len(merged))
- pieces.append(merged)
- continue
-
- merged = merge_pieces(lhs, rhs)
-
- # make sure join keys are in the merged
- # TODO, should merge_pieces do this?
- merged[by] = key
-
- pieces.append(merged)
-
- # preserve the original order
- # if we have a missing piece this can be reset
- from pandas.core.reshape.concat import concat
-
- result = concat(pieces, ignore_index=True)
- result = result.reindex(columns=pieces[0].columns, copy=False)
- return result, lby
-
-
-def merge_ordered(
- left: DataFrame,
- right: DataFrame,
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- left_by=None,
- right_by=None,
- fill_method: str | None = None,
- suffixes: Suffixes = ("_x", "_y"),
- how: JoinHow = "outer",
-) -> DataFrame:
- """
- Perform a merge for ordered data with optional filling/interpolation.
-
- Designed for ordered data like time series data. Optionally
- perform group-wise merge (see examples).
-
- Parameters
- ----------
- left : DataFrame or named Series
- right : DataFrame or named Series
- on : label or list
- Field names to join on. Must be found in both DataFrames.
- left_on : label or list, or array-like
- Field names to join on in left DataFrame. Can be a vector or list of
- vectors of the length of the DataFrame to use a particular vector as
- the join key instead of columns.
- right_on : label or list, or array-like
- Field names to join on in right DataFrame or vector/list of vectors per
- left_on docs.
- left_by : column name or list of column names
- Group left DataFrame by group columns and merge piece by piece with
- right DataFrame. Must be None if either left or right are a Series.
- right_by : column name or list of column names
- Group right DataFrame by group columns and merge piece by piece with
- left DataFrame. Must be None if either left or right are a Series.
- fill_method : {'ffill', None}, default None
- Interpolation method for data.
- suffixes : list-like, default is ("_x", "_y")
- A length-2 sequence where each element is optionally a string
- indicating the suffix to add to overlapping column names in
- `left` and `right` respectively. Pass a value of `None` instead
- of a string to indicate that the column name from `left` or
- `right` should be left as-is, with no suffix. At least one of the
- values must not be None.
-
- how : {'left', 'right', 'outer', 'inner'}, default 'outer'
- * left: use only keys from left frame (SQL: left outer join)
- * right: use only keys from right frame (SQL: right outer join)
- * outer: use union of keys from both frames (SQL: full outer join)
- * inner: use intersection of keys from both frames (SQL: inner join).
-
- Returns
- -------
- DataFrame
- The merged DataFrame output type will be the same as
- 'left', if it is a subclass of DataFrame.
-
- See Also
- --------
- merge : Merge with a database-style join.
- merge_asof : Merge on nearest keys.
-
- Examples
- --------
- >>> from pandas import merge_ordered
- >>> df1 = pd.DataFrame(
- ... {
- ... "key": ["a", "c", "e", "a", "c", "e"],
- ... "lvalue": [1, 2, 3, 1, 2, 3],
- ... "group": ["a", "a", "a", "b", "b", "b"]
- ... }
- ... )
- >>> df1
- key lvalue group
- 0 a 1 a
- 1 c 2 a
- 2 e 3 a
- 3 a 1 b
- 4 c 2 b
- 5 e 3 b
-
- >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
- >>> df2
- key rvalue
- 0 b 1
- 1 c 2
- 2 d 3
-
- >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
- key lvalue group rvalue
- 0 a 1 a NaN
- 1 b 1 a 1.0
- 2 c 2 a 2.0
- 3 d 2 a 3.0
- 4 e 3 a 3.0
- 5 a 1 b NaN
- 6 b 1 b 1.0
- 7 c 2 b 2.0
- 8 d 2 b 3.0
- 9 e 3 b 3.0
- """
-
- def _merger(x, y) -> DataFrame:
- # perform the ordered merge operation
- op = _OrderedMerge(
- x,
- y,
- on=on,
- left_on=left_on,
- right_on=right_on,
- suffixes=suffixes,
- fill_method=fill_method,
- how=how,
- )
- return op.get_result()
-
- if left_by is not None and right_by is not None:
- raise ValueError("Can only group either left or right frames")
- if left_by is not None:
- if isinstance(left_by, str):
- left_by = [left_by]
- check = set(left_by).difference(left.columns)
- if len(check) != 0:
- raise KeyError(f"{check} not found in left columns")
- result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
- elif right_by is not None:
- if isinstance(right_by, str):
- right_by = [right_by]
- check = set(right_by).difference(right.columns)
- if len(check) != 0:
- raise KeyError(f"{check} not found in right columns")
- result, _ = _groupby_and_merge(
- right_by, right, left, lambda x, y: _merger(y, x)
- )
- else:
- result = _merger(left, right)
- return result
-
-
-def merge_asof(
- left: DataFrame | Series,
- right: DataFrame | Series,
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- left_index: bool = False,
- right_index: bool = False,
- by=None,
- left_by=None,
- right_by=None,
- suffixes: Suffixes = ("_x", "_y"),
- tolerance=None,
- allow_exact_matches: bool = True,
- direction: str = "backward",
-) -> DataFrame:
- """
- Perform a merge by key distance.
-
- This is similar to a left-join except that we match on nearest
- key rather than equal keys. Both DataFrames must be sorted by the key.
-
- For each row in the left DataFrame:
-
- - A "backward" search selects the last row in the right DataFrame whose
- 'on' key is less than or equal to the left's key.
-
- - A "forward" search selects the first row in the right DataFrame whose
- 'on' key is greater than or equal to the left's key.
-
- - A "nearest" search selects the row in the right DataFrame whose 'on'
- key is closest in absolute distance to the left's key.
-
- The default is "backward" and is compatible in versions below 0.20.0.
- The direction parameter was added in version 0.20.0 and introduces
- "forward" and "nearest".
-
- Optionally match on equivalent keys with 'by' before searching with 'on'.
-
- Parameters
- ----------
- left : DataFrame or named Series
- right : DataFrame or named Series
- on : label
- Field name to join on. Must be found in both DataFrames.
- The data MUST be ordered. Furthermore, this must be a numeric column,
- such as datetimelike, integer, or float. On or left_on/right_on
- must be given.
- left_on : label
- Field name to join on in left DataFrame.
- right_on : label
- Field name to join on in right DataFrame.
- left_index : bool
- Use the index of the left DataFrame as the join key.
- right_index : bool
- Use the index of the right DataFrame as the join key.
- by : column name or list of column names
- Match on these columns before performing merge operation.
- left_by : column name
- Field names to match on in the left DataFrame.
- right_by : column name
- Field names to match on in the right DataFrame.
- suffixes : 2-length sequence (tuple, list, ...)
- Suffix to apply to overlapping column names in the left and right
- side, respectively.
- tolerance : int or Timedelta, optional, default None
- Select asof tolerance within this range; must be compatible
- with the merge index.
- allow_exact_matches : bool, default True
-
- - If True, allow matching with the same 'on' value
- (i.e. less-than-or-equal-to / greater-than-or-equal-to)
- - If False, don't match the same 'on' value
- (i.e., strictly less-than / strictly greater-than).
-
- direction : 'backward' (default), 'forward', or 'nearest'
- Whether to search for prior, subsequent, or closest matches.
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- merge : Merge with a database-style join.
- merge_ordered : Merge with optional filling/interpolation.
-
- Examples
- --------
- >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
- >>> left
- a left_val
- 0 1 a
- 1 5 b
- 2 10 c
-
- >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
- >>> right
- a right_val
- 0 1 1
- 1 2 2
- 2 3 3
- 3 6 6
- 4 7 7
-
- >>> pd.merge_asof(left, right, on="a")
- a left_val right_val
- 0 1 a 1
- 1 5 b 3
- 2 10 c 7
-
- >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
- a left_val right_val
- 0 1 a NaN
- 1 5 b 3.0
- 2 10 c 7.0
-
- >>> pd.merge_asof(left, right, on="a", direction="forward")
- a left_val right_val
- 0 1 a 1.0
- 1 5 b 6.0
- 2 10 c NaN
-
- >>> pd.merge_asof(left, right, on="a", direction="nearest")
- a left_val right_val
- 0 1 a 1
- 1 5 b 6
- 2 10 c 7
-
- We can use indexed DataFrames as well.
-
- >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
- >>> left
- left_val
- 1 a
- 5 b
- 10 c
-
- >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
- >>> right
- right_val
- 1 1
- 2 2
- 3 3
- 6 6
- 7 7
-
- >>> pd.merge_asof(left, right, left_index=True, right_index=True)
- left_val right_val
- 1 a 1
- 5 b 3
- 10 c 7
-
- Here is a real-world time-series example
-
- >>> quotes = pd.DataFrame(
- ... {
- ... "time": [
- ... pd.Timestamp("2016-05-25 13:30:00.023"),
- ... pd.Timestamp("2016-05-25 13:30:00.023"),
- ... pd.Timestamp("2016-05-25 13:30:00.030"),
- ... pd.Timestamp("2016-05-25 13:30:00.041"),
- ... pd.Timestamp("2016-05-25 13:30:00.048"),
- ... pd.Timestamp("2016-05-25 13:30:00.049"),
- ... pd.Timestamp("2016-05-25 13:30:00.072"),
- ... pd.Timestamp("2016-05-25 13:30:00.075")
- ... ],
- ... "ticker": [
- ... "GOOG",
- ... "MSFT",
- ... "MSFT",
- ... "MSFT",
- ... "GOOG",
- ... "AAPL",
- ... "GOOG",
- ... "MSFT"
- ... ],
- ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
- ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
- ... }
- ... )
- >>> quotes
- time ticker bid ask
- 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93
- 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96
- 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98
- 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00
- 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93
- 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01
- 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88
- 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03
-
- >>> trades = pd.DataFrame(
- ... {
- ... "time": [
- ... pd.Timestamp("2016-05-25 13:30:00.023"),
- ... pd.Timestamp("2016-05-25 13:30:00.038"),
- ... pd.Timestamp("2016-05-25 13:30:00.048"),
- ... pd.Timestamp("2016-05-25 13:30:00.048"),
- ... pd.Timestamp("2016-05-25 13:30:00.048")
- ... ],
- ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
- ... "price": [51.95, 51.95, 720.77, 720.92, 98.0],
- ... "quantity": [75, 155, 100, 100, 100]
- ... }
- ... )
- >>> trades
- time ticker price quantity
- 0 2016-05-25 13:30:00.023 MSFT 51.95 75
- 1 2016-05-25 13:30:00.038 MSFT 51.95 155
- 2 2016-05-25 13:30:00.048 GOOG 720.77 100
- 3 2016-05-25 13:30:00.048 GOOG 720.92 100
- 4 2016-05-25 13:30:00.048 AAPL 98.00 100
-
- By default we are taking the asof of the quotes
-
- >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
- time ticker price quantity bid ask
- 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96
- 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98
- 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93
- 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
- 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
-
- We only asof within 2ms between the quote time and the trade time
-
- >>> pd.merge_asof(
- ... trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
- ... )
- time ticker price quantity bid ask
- 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96
- 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN
- 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93
- 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93
- 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
-
- We only asof within 10ms between the quote time and the trade time
- and we exclude exact matches on time. However *prior* data will
- propagate forward
-
- >>> pd.merge_asof(
- ... trades,
- ... quotes,
- ... on="time",
- ... by="ticker",
- ... tolerance=pd.Timedelta("10ms"),
- ... allow_exact_matches=False
- ... )
- time ticker price quantity bid ask
- 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN
- 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98
- 2 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN
- 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN
- 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN
- """
- op = _AsOfMerge(
- left,
- right,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- by=by,
- left_by=left_by,
- right_by=right_by,
- suffixes=suffixes,
- how="asof",
- tolerance=tolerance,
- allow_exact_matches=allow_exact_matches,
- direction=direction,
- )
- return op.get_result()
-
-
-# TODO: transformations??
-# TODO: only copy DataFrames when modification necessary
-class _MergeOperation:
- """
- Perform a database (SQL) merge operation between two DataFrame or Series
- objects using either columns as keys or their row indexes
- """
-
- _merge_type = "merge"
- how: MergeHow | Literal["asof"]
- on: IndexLabel | None
- # left_on/right_on may be None when passed, but in validate_specification
- # get replaced with non-None.
- left_on: Sequence[Hashable | AnyArrayLike]
- right_on: Sequence[Hashable | AnyArrayLike]
- left_index: bool
- right_index: bool
- axis: AxisInt
- bm_axis: AxisInt
- sort: bool
- suffixes: Suffixes
- copy: bool
- indicator: str | bool
- validate: str | None
- join_names: list[Hashable]
- right_join_keys: list[AnyArrayLike]
- left_join_keys: list[AnyArrayLike]
-
- def __init__(
- self,
- left: DataFrame | Series,
- right: DataFrame | Series,
- how: MergeHow | Literal["asof"] = "inner",
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- axis: AxisInt = 1,
- left_index: bool = False,
- right_index: bool = False,
- sort: bool = True,
- suffixes: Suffixes = ("_x", "_y"),
- indicator: str | bool = False,
- validate: str | None = None,
- ) -> None:
- _left = _validate_operand(left)
- _right = _validate_operand(right)
- self.left = self.orig_left = _left
- self.right = self.orig_right = _right
- self.how = how
-
- # bm_axis -> the axis on the BlockManager
- self.bm_axis = axis
- # axis --> the axis on the Series/DataFrame
- self.axis = 1 - axis if self.left.ndim == 2 else 0
-
- self.on = com.maybe_make_list(on)
-
- self.suffixes = suffixes
- self.sort = sort
-
- self.left_index = left_index
- self.right_index = right_index
-
- self.indicator = indicator
-
- if not is_bool(left_index):
- raise ValueError(
- f"left_index parameter must be of type bool, not {type(left_index)}"
- )
- if not is_bool(right_index):
- raise ValueError(
- f"right_index parameter must be of type bool, not {type(right_index)}"
- )
-
- # GH 40993: raise when merging between different levels; enforced in 2.0
- if _left.columns.nlevels != _right.columns.nlevels:
- msg = (
- "Not allowed to merge between different levels. "
- f"({_left.columns.nlevels} levels on the left, "
- f"{_right.columns.nlevels} on the right)"
- )
- raise MergeError(msg)
-
- self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)
-
- cross_col = None
- if self.how == "cross":
- (
- self.left,
- self.right,
- self.how,
- cross_col,
- ) = self._create_cross_configuration(self.left, self.right)
- self.left_on = self.right_on = [cross_col]
- self._cross = cross_col
-
- # note this function has side effects
- (
- self.left_join_keys,
- self.right_join_keys,
- self.join_names,
- ) = self._get_merge_keys()
-
- # validate the merge keys dtypes. We may need to coerce
- # to avoid incompatible dtypes
- self._maybe_coerce_merge_keys()
-
- # If argument passed to validate,
- # check if columns specified as unique
- # are in fact unique.
- if validate is not None:
- self._validate(validate)
-
- def _reindex_and_concat(
- self,
- join_index: Index,
- left_indexer: npt.NDArray[np.intp] | None,
- right_indexer: npt.NDArray[np.intp] | None,
- copy: bool | None,
- ) -> DataFrame:
- """
- reindex along index and concat along columns.
- """
- # Take views so we do not alter the originals
- left = self.left[:]
- right = self.right[:]
-
- llabels, rlabels = _items_overlap_with_suffix(
- self.left._info_axis, self.right._info_axis, self.suffixes
- )
-
- if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
- # Pinning the index here (and in the right code just below) is not
- # necessary, but makes the `.take` more performant if we have e.g.
- # a MultiIndex for left.index.
- lmgr = left._mgr.reindex_indexer(
- join_index,
- left_indexer,
- axis=1,
- copy=False,
- only_slice=True,
- allow_dups=True,
- use_na_proxy=True,
- )
- left = left._constructor(lmgr)
- left.index = join_index
-
- if right_indexer is not None and not is_range_indexer(
- right_indexer, len(right)
- ):
- rmgr = right._mgr.reindex_indexer(
- join_index,
- right_indexer,
- axis=1,
- copy=False,
- only_slice=True,
- allow_dups=True,
- use_na_proxy=True,
- )
- right = right._constructor(rmgr)
- right.index = join_index
-
- from pandas import concat
-
- left.columns = llabels
- right.columns = rlabels
- result = concat([left, right], axis=1, copy=copy)
- return result
-
- def get_result(self, copy: bool | None = True) -> DataFrame:
- if self.indicator:
- self.left, self.right = self._indicator_pre_merge(self.left, self.right)
-
- join_index, left_indexer, right_indexer = self._get_join_info()
-
- result = self._reindex_and_concat(
- join_index, left_indexer, right_indexer, copy=copy
- )
- result = result.__finalize__(self, method=self._merge_type)
-
- if self.indicator:
- result = self._indicator_post_merge(result)
-
- self._maybe_add_join_keys(result, left_indexer, right_indexer)
-
- self._maybe_restore_index_levels(result)
-
- self._maybe_drop_cross_column(result, self._cross)
-
- return result.__finalize__(self, method="merge")
-
- def _maybe_drop_cross_column(
- self, result: DataFrame, cross_col: str | None
- ) -> None:
- if cross_col is not None:
- del result[cross_col]
-
- @cache_readonly
- def _indicator_name(self) -> str | None:
- if isinstance(self.indicator, str):
- return self.indicator
- elif isinstance(self.indicator, bool):
- return "_merge" if self.indicator else None
- else:
- raise ValueError(
- "indicator option can only accept boolean or string arguments"
- )
-
- def _indicator_pre_merge(
- self, left: DataFrame, right: DataFrame
- ) -> tuple[DataFrame, DataFrame]:
- columns = left.columns.union(right.columns)
-
- for i in ["_left_indicator", "_right_indicator"]:
- if i in columns:
- raise ValueError(
- "Cannot use `indicator=True` option when "
- f"data contains a column named {i}"
- )
- if self._indicator_name in columns:
- raise ValueError(
- "Cannot use name of an existing column for indicator column"
- )
-
- left = left.copy()
- right = right.copy()
-
- left["_left_indicator"] = 1
- left["_left_indicator"] = left["_left_indicator"].astype("int8")
-
- right["_right_indicator"] = 2
- right["_right_indicator"] = right["_right_indicator"].astype("int8")
-
- return left, right
-
- def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
- result["_left_indicator"] = result["_left_indicator"].fillna(0)
- result["_right_indicator"] = result["_right_indicator"].fillna(0)
-
- result[self._indicator_name] = Categorical(
- (result["_left_indicator"] + result["_right_indicator"]),
- categories=[1, 2, 3],
- )
- result[self._indicator_name] = result[
- self._indicator_name
- ].cat.rename_categories(["left_only", "right_only", "both"])
-
- result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
- return result
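- # Editorial sketch, not part of the original module: the two helpers above
- # implement merge(..., indicator=True). Assuming "import pandas as pd" and
- # placeholder frames df_left/df_right sharing a "key" column, a call such as
- #
- # >>> pd.merge(df_left, df_right, on="key", how="outer", indicator=True)
- #
- # adds a categorical "_merge" column taking the values "left_only",
- # "right_only" or "both", derived from the temporary indicator columns.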
-
- def _maybe_restore_index_levels(self, result: DataFrame) -> None:
- """
- Restore index levels specified as `on` parameters
-
- Here we check for cases where `self.left_on` and `self.right_on` pairs
- each reference an index level in their respective DataFrames. The
- joined columns corresponding to these pairs are then restored to the
- index of `result`.
-
- **Note:** This method has side effects. It modifies `result` in-place
-
- Parameters
- ----------
- result: DataFrame
- merge result
-
- Returns
- -------
- None
- """
- names_to_restore = []
- for name, left_key, right_key in zip(
- self.join_names, self.left_on, self.right_on
- ):
- if (
- # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
- # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
- # "Hashable"
- self.orig_left._is_level_reference(left_key) # type: ignore[arg-type]
- # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
- # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
- # "Hashable"
- and self.orig_right._is_level_reference(
- right_key # type: ignore[arg-type]
- )
- and left_key == right_key
- and name not in result.index.names
- ):
- names_to_restore.append(name)
-
- if names_to_restore:
- result.set_index(names_to_restore, inplace=True)
-
- def _maybe_add_join_keys(
- self,
- result: DataFrame,
- left_indexer: np.ndarray | None,
- right_indexer: np.ndarray | None,
- ) -> None:
- left_has_missing = None
- right_has_missing = None
-
- assert all(is_array_like(x) for x in self.left_join_keys)
-
- keys = zip(self.join_names, self.left_on, self.right_on)
- for i, (name, lname, rname) in enumerate(keys):
- if not _should_fill(lname, rname):
- continue
-
- take_left, take_right = None, None
-
- if name in result:
- if left_indexer is not None and right_indexer is not None:
- if name in self.left:
- if left_has_missing is None:
- left_has_missing = (left_indexer == -1).any()
-
- if left_has_missing:
- take_right = self.right_join_keys[i]
-
- if not is_dtype_equal(
- result[name].dtype, self.left[name].dtype
- ):
- take_left = self.left[name]._values
-
- elif name in self.right:
- if right_has_missing is None:
- right_has_missing = (right_indexer == -1).any()
-
- if right_has_missing:
- take_left = self.left_join_keys[i]
-
- if not is_dtype_equal(
- result[name].dtype, self.right[name].dtype
- ):
- take_right = self.right[name]._values
-
- elif left_indexer is not None:
- take_left = self.left_join_keys[i]
- take_right = self.right_join_keys[i]
-
- if take_left is not None or take_right is not None:
- if take_left is None:
- lvals = result[name]._values
- else:
- # TODO: can we pin down take_left's type earlier?
- take_left = extract_array(take_left, extract_numpy=True)
- lfill = na_value_for_dtype(take_left.dtype)
- lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)
-
- if take_right is None:
- rvals = result[name]._values
- else:
- # TODO: can we pin down take_right's type earlier?
- taker = extract_array(take_right, extract_numpy=True)
- rfill = na_value_for_dtype(taker.dtype)
- rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)
-
- # if we have an all missing left_indexer
- # make sure to just use the right values or vice-versa
- mask_left = left_indexer == -1
- # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
- if mask_left.all(): # type: ignore[union-attr]
- key_col = Index(rvals)
- result_dtype = rvals.dtype
- elif right_indexer is not None and (right_indexer == -1).all():
- key_col = Index(lvals)
- result_dtype = lvals.dtype
- else:
- key_col = Index(lvals).where(~mask_left, rvals)
- result_dtype = find_common_type([lvals.dtype, rvals.dtype])
- if (
- lvals.dtype.kind == "M"
- and rvals.dtype.kind == "M"
- and result_dtype.kind == "O"
- ):
- # TODO(non-nano) Workaround for common_type not dealing
- # with different resolutions
- result_dtype = key_col.dtype
-
- if result._is_label_reference(name):
- result[name] = Series(
- key_col, dtype=result_dtype, index=result.index
- )
- elif result._is_level_reference(name):
- if isinstance(result.index, MultiIndex):
- key_col.name = name
- idx_list = [
- result.index.get_level_values(level_name)
- if level_name != name
- else key_col
- for level_name in result.index.names
- ]
-
- result.set_index(idx_list, inplace=True)
- else:
- result.index = Index(key_col, name=name)
- else:
- result.insert(i, name or f"key_{i}", key_col)
-
- def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- """return the join indexers"""
- return get_join_indexers(
- self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
- )
-
- def _get_join_info(
- self,
- ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
- # make mypy happy
- assert self.how != "cross"
- left_ax = self.left.axes[self.axis]
- right_ax = self.right.axes[self.axis]
-
- if self.left_index and self.right_index and self.how != "asof":
- join_index, left_indexer, right_indexer = left_ax.join(
- right_ax, how=self.how, return_indexers=True, sort=self.sort
- )
-
- elif self.right_index and self.how == "left":
- join_index, left_indexer, right_indexer = _left_join_on_index(
- left_ax, right_ax, self.left_join_keys, sort=self.sort
- )
-
- elif self.left_index and self.how == "right":
- join_index, right_indexer, left_indexer = _left_join_on_index(
- right_ax, left_ax, self.right_join_keys, sort=self.sort
- )
- else:
- (left_indexer, right_indexer) = self._get_join_indexers()
-
- if self.right_index:
- if len(self.left) > 0:
- join_index = self._create_join_index(
- self.left.index,
- self.right.index,
- left_indexer,
- how="right",
- )
- else:
- join_index = self.right.index.take(right_indexer)
- elif self.left_index:
- if self.how == "asof":
- # GH#33463 asof should always behave like a left merge
- join_index = self._create_join_index(
- self.left.index,
- self.right.index,
- left_indexer,
- how="left",
- )
-
- elif len(self.right) > 0:
- join_index = self._create_join_index(
- self.right.index,
- self.left.index,
- right_indexer,
- how="left",
- )
- else:
- join_index = self.left.index.take(left_indexer)
- else:
- join_index = default_index(len(left_indexer))
-
- if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
- join_index = default_index(0).set_names(join_index.name)
- return join_index, left_indexer, right_indexer
-
- def _create_join_index(
- self,
- index: Index,
- other_index: Index,
- indexer: npt.NDArray[np.intp],
- how: JoinHow = "left",
- ) -> Index:
- """
- Create a join index by rearranging one index to match another
-
- Parameters
- ----------
- index : Index being rearranged
- other_index : Index used to supply values not found in index
- indexer : np.ndarray[np.intp] how to rearrange index
- how : str
- Replacement is only necessary if the indexer is based on other_index.
-
- Returns
- -------
- Index
- """
- if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
- # if final index requires values in other_index but not target
- # index, indexer may hold missing (-1) values, causing Index.take
- # to take the final value in target index. So, we set the last
- # element to be the desired fill value. We do not use allow_fill
- # and fill_value because it throws a ValueError on integer indices
- mask = indexer == -1
- if np.any(mask):
- fill_value = na_value_for_dtype(index.dtype, compat=False)
- index = index.append(Index([fill_value]))
- return index.take(indexer)
-
- def _get_merge_keys(
- self,
- ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
- """
- Note: has side effects (copy/delete key columns)
-
- Parameters
- ----------
- left
- right
- on
-
- Returns
- -------
- left_keys, right_keys, join_names
- """
- # left_keys, right_keys entries can actually be anything listlike
- # with a 'dtype' attr
- left_keys: list[AnyArrayLike] = []
- right_keys: list[AnyArrayLike] = []
- join_names: list[Hashable] = []
- right_drop: list[Hashable] = []
- left_drop: list[Hashable] = []
-
- left, right = self.left, self.right
-
- is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
- is_rkey = lambda x: is_array_like(x) and len(x) == len(right)
-
- # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
- # user could, for example, request 'left_index' and 'left_by'. In a
- # regular pd.merge(), users cannot specify both 'left_index' and
- # 'left_on'. (Instead, users have a MultiIndex). That means the
- # self.left_on in this function is always empty in a pd.merge(), but
- # a pd.merge_asof(left_index=True, left_by=...) will result in a
- # self.left_on array with a None in the middle of it. This requires
- # a work-around as designated in the code below.
- # See _validate_left_right_on() for where this happens.
-
- # ugh, spaghetti re #733
- if _any(self.left_on) and _any(self.right_on):
- for lk, rk in zip(self.left_on, self.right_on):
- if is_lkey(lk):
- lk = cast(AnyArrayLike, lk)
- left_keys.append(lk)
- if is_rkey(rk):
- rk = cast(AnyArrayLike, rk)
- right_keys.append(rk)
- join_names.append(None) # what to do?
- else:
- # Then we're either Hashable or a wrong-length arraylike,
- # the latter of which will raise
- rk = cast(Hashable, rk)
- if rk is not None:
- right_keys.append(right._get_label_or_level_values(rk))
- join_names.append(rk)
- else:
- # work-around for merge_asof(right_index=True)
- right_keys.append(right.index)
- join_names.append(right.index.name)
- else:
- if not is_rkey(rk):
- # Then we're either Hashable or a wrong-length arraylike,
- # the latter of which will raise
- rk = cast(Hashable, rk)
- if rk is not None:
- right_keys.append(right._get_label_or_level_values(rk))
- else:
- # work-around for merge_asof(right_index=True)
- right_keys.append(right.index)
- if lk is not None and lk == rk: # FIXME: what about other NAs?
- # avoid key upcast in corner case (length-0)
- lk = cast(Hashable, lk)
- if len(left) > 0:
- right_drop.append(rk)
- else:
- left_drop.append(lk)
- else:
- rk = cast(AnyArrayLike, rk)
- right_keys.append(rk)
- if lk is not None:
- # Then we're either Hashable or a wrong-length arraylike,
- # the latter of which will raise
- lk = cast(Hashable, lk)
- left_keys.append(left._get_label_or_level_values(lk))
- join_names.append(lk)
- else:
- # work-around for merge_asof(left_index=True)
- left_keys.append(left.index)
- join_names.append(left.index.name)
- elif _any(self.left_on):
- for k in self.left_on:
- if is_lkey(k):
- k = cast(AnyArrayLike, k)
- left_keys.append(k)
- join_names.append(None)
- else:
- # Then we're either Hashable or a wrong-length arraylike,
- # the latter of which will raise
- k = cast(Hashable, k)
- left_keys.append(left._get_label_or_level_values(k))
- join_names.append(k)
- if isinstance(self.right.index, MultiIndex):
- right_keys = [
- lev._values.take(lev_codes)
- for lev, lev_codes in zip(
- self.right.index.levels, self.right.index.codes
- )
- ]
- else:
- right_keys = [self.right.index._values]
- elif _any(self.right_on):
- for k in self.right_on:
- if is_rkey(k):
- k = cast(AnyArrayLike, k)
- right_keys.append(k)
- join_names.append(None)
- else:
- # Then we're either Hashable or a wrong-length arraylike,
- # the latter of which will raise
- k = cast(Hashable, k)
- right_keys.append(right._get_label_or_level_values(k))
- join_names.append(k)
- if isinstance(self.left.index, MultiIndex):
- left_keys = [
- lev._values.take(lev_codes)
- for lev, lev_codes in zip(
- self.left.index.levels, self.left.index.codes
- )
- ]
- else:
- left_keys = [self.left.index._values]
-
- if left_drop:
- self.left = self.left._drop_labels_or_levels(left_drop)
-
- if right_drop:
- self.right = self.right._drop_labels_or_levels(right_drop)
-
- return left_keys, right_keys, join_names
-
- def _maybe_coerce_merge_keys(self) -> None:
- # we have valid merges but we may have to further
- # coerce these if they are originally incompatible types
- #
- # for example if these are categorical, but are not dtype_equal
- # or if we have object and integer dtypes
-
- for lk, rk, name in zip(
- self.left_join_keys, self.right_join_keys, self.join_names
- ):
- if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
- continue
-
- lk = extract_array(lk, extract_numpy=True)
- rk = extract_array(rk, extract_numpy=True)
-
- lk_is_cat = is_categorical_dtype(lk.dtype)
- rk_is_cat = is_categorical_dtype(rk.dtype)
- lk_is_object = is_object_dtype(lk.dtype)
- rk_is_object = is_object_dtype(rk.dtype)
-
- # if either left or right is a categorical
- # then they must match exactly in categories & ordered
- if lk_is_cat and rk_is_cat:
- lk = cast(Categorical, lk)
- rk = cast(Categorical, rk)
- if lk._categories_match_up_to_permutation(rk):
- continue
-
- elif lk_is_cat or rk_is_cat:
- pass
-
- elif is_dtype_equal(lk.dtype, rk.dtype):
- continue
-
- msg = (
- f"You are trying to merge on {lk.dtype} and "
- f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
- )
-
- # if we are numeric, then allow differing
- # kinds to proceed, eg. int64 and int8, int and float
- # further if we are object, but we infer to
- # the same, then proceed
- if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
- if lk.dtype.kind == rk.dtype.kind:
- continue
-
- # check whether ints and floats
- if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
- # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
- with np.errstate(invalid="ignore"):
- # error: Argument 1 to "astype" of "ndarray" has incompatible
- # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
- # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
- casted = lk.astype(rk.dtype) # type: ignore[arg-type]
-
- mask = ~np.isnan(lk)
- match = lk == casted
- if not match[mask].all():
- warnings.warn(
- "You are merging on int and float "
- "columns where the float values "
- "are not equal to their int representation.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- continue
-
- if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
- # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
- with np.errstate(invalid="ignore"):
- # error: Argument 1 to "astype" of "ndarray" has incompatible
- # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
- # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
- casted = rk.astype(lk.dtype) # type: ignore[arg-type]
-
- mask = ~np.isnan(rk)
- match = rk == casted
- if not match[mask].all():
- warnings.warn(
- "You are merging on int and float "
- "columns where the float values "
- "are not equal to their int representation.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- continue
-
- # let's infer and see if we are ok
- if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
- rk, skipna=False
- ):
- continue
-
- # Check if we are trying to merge on obviously
- # incompatible dtypes GH 9780, GH 15800
-
- # bool values are coerced to object
- elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
- is_bool_dtype(lk.dtype) and rk_is_object
- ):
- pass
-
- # object values are allowed to be merged
- elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
- is_numeric_dtype(lk.dtype) and rk_is_object
- ):
- inferred_left = lib.infer_dtype(lk, skipna=False)
- inferred_right = lib.infer_dtype(rk, skipna=False)
- bool_types = ["integer", "mixed-integer", "boolean", "empty"]
- string_types = ["string", "unicode", "mixed", "bytes", "empty"]
-
- # inferred bool
- if inferred_left in bool_types and inferred_right in bool_types:
- pass
-
- # unless we are merging non-string-like with string-like
- elif (
- inferred_left in string_types and inferred_right not in string_types
- ) or (
- inferred_right in string_types and inferred_left not in string_types
- ):
- raise ValueError(msg)
-
- # datetimelikes must match exactly
- elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
- raise ValueError(msg)
- elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
- raise ValueError(msg)
- elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
- rk.dtype, DatetimeTZDtype
- ):
- raise ValueError(msg)
- elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
- rk.dtype, DatetimeTZDtype
- ):
- raise ValueError(msg)
- elif (
- isinstance(lk.dtype, DatetimeTZDtype)
- and isinstance(rk.dtype, DatetimeTZDtype)
- ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
- # allows datetime with different resolutions
- continue
-
- elif lk_is_object and rk_is_object:
- continue
-
- # Houston, we have a problem!
- # let's coerce to object if the dtypes aren't
- # categorical, otherwise coerce to the category
- # dtype. If we coerced categories to object,
- # then we would lose type information on some
- # columns, and end up trying to merge
- # incompatible dtypes. See GH 16900.
- if name in self.left.columns:
- typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
- self.left = self.left.copy()
- self.left[name] = self.left[name].astype(typ)
- if name in self.right.columns:
- typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
- self.right = self.right.copy()
- self.right[name] = self.right[name].astype(typ)
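- # Editorial sketch, not part of the original module: one user-visible effect of
- # the coercion above is the int/float warning. Assuming "import pandas as pd",
- # something like
- #
- # >>> a = pd.DataFrame({"k": [1, 2, 3]})        # integer key
- # >>> b = pd.DataFrame({"k": [1.0, 2.5, 3.0]})  # float key, 2.5 has no int match
- # >>> pd.merge(a, b, on="k")
- #
- # should emit the UserWarning about merging int and float columns whose float
- # values do not equal their int representation, while still returning a result.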
-
- def _create_cross_configuration(
- self, left: DataFrame, right: DataFrame
- ) -> tuple[DataFrame, DataFrame, JoinHow, str]:
- """
- Creates the configuration to dispatch the cross operation to inner join,
- e.g. adding a join column and resetting parameters. The join column is added
- to new objects; the inputs are not modified in place.
-
- Parameters
- ----------
- left : DataFrame
- right : DataFrame
-
- Returns
- -------
- a tuple (left, right, how, cross_col) representing the adjusted
- DataFrames with cross_col, the merge operation set to inner and the column
- to join over.
- """
- cross_col = f"_cross_{uuid.uuid4()}"
- how: JoinHow = "inner"
- return (
- left.assign(**{cross_col: 1}),
- right.assign(**{cross_col: 1}),
- how,
- cross_col,
- )
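- # Editorial sketch, not part of the original module: in this version a cross
- # merge is routed through the helper above, so, assuming "import pandas as pd",
- # pd.merge(a, b, how="cross") is roughly equivalent to giving both frames the
- # same constant key, inner-merging on it, and dropping it again:
- #
- # >>> tmp = "_tmp_cross_key"  # hypothetical name; the real code uses a uuid
- # >>> out = pd.merge(a.assign(**{tmp: 1}), b.assign(**{tmp: 1}), on=tmp).drop(columns=tmp)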
-
- def _validate_left_right_on(self, left_on, right_on):
- left_on = com.maybe_make_list(left_on)
- right_on = com.maybe_make_list(right_on)
-
- if self.how == "cross":
- if (
- self.left_index
- or self.right_index
- or right_on is not None
- or left_on is not None
- or self.on is not None
- ):
- raise MergeError(
- "Can not pass on, right_on, left_on or set right_index=True or "
- "left_index=True"
- )
- # Hm, any way to make this logic less complicated??
- elif self.on is None and left_on is None and right_on is None:
- if self.left_index and self.right_index:
- left_on, right_on = (), ()
- elif self.left_index:
- raise MergeError("Must pass right_on or right_index=True")
- elif self.right_index:
- raise MergeError("Must pass left_on or left_index=True")
- else:
- # use the common columns
- left_cols = self.left.columns
- right_cols = self.right.columns
- common_cols = left_cols.intersection(right_cols)
- if len(common_cols) == 0:
- raise MergeError(
- "No common columns to perform merge on. "
- f"Merge options: left_on={left_on}, "
- f"right_on={right_on}, "
- f"left_index={self.left_index}, "
- f"right_index={self.right_index}"
- )
- if (
- not left_cols.join(common_cols, how="inner").is_unique
- or not right_cols.join(common_cols, how="inner").is_unique
- ):
- raise MergeError(f"Data columns not unique: {repr(common_cols)}")
- left_on = right_on = common_cols
- elif self.on is not None:
- if left_on is not None or right_on is not None:
- raise MergeError(
- 'Can only pass argument "on" OR "left_on" '
- 'and "right_on", not a combination of both.'
- )
- if self.left_index or self.right_index:
- raise MergeError(
- 'Can only pass argument "on" OR "left_index" '
- 'and "right_index", not a combination of both.'
- )
- left_on = right_on = self.on
- elif left_on is not None:
- if self.left_index:
- raise MergeError(
- 'Can only pass argument "left_on" OR "left_index" not both.'
- )
- if not self.right_index and right_on is None:
- raise MergeError('Must pass "right_on" OR "right_index".')
- n = len(left_on)
- if self.right_index:
- if len(left_on) != self.right.index.nlevels:
- raise ValueError(
- "len(left_on) must equal the number "
- 'of levels in the index of "right"'
- )
- right_on = [None] * n
- elif right_on is not None:
- if self.right_index:
- raise MergeError(
- 'Can only pass argument "right_on" OR "right_index" not both.'
- )
- if not self.left_index and left_on is None:
- raise MergeError('Must pass "left_on" OR "left_index".')
- n = len(right_on)
- if self.left_index:
- if len(right_on) != self.left.index.nlevels:
- raise ValueError(
- "len(right_on) must equal the number "
- 'of levels in the index of "left"'
- )
- left_on = [None] * n
- if self.how != "cross" and len(right_on) != len(left_on):
- raise ValueError("len(right_on) must equal len(left_on)")
-
- return left_on, right_on
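- # Editorial sketch, not part of the original module: the checks above surface as
- # MergeError at the user level. For example, assuming "import pandas as pd",
- # passing "on" together with "left_on"/"right_on", e.g.
- #
- # >>> pd.merge(a, b, on="k", left_on="k", right_on="k")
- #
- # is expected to raise MergeError, as is merging two frames with no common
- # columns when no keys are specified at all.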
-
- def _validate(self, validate: str) -> None:
- # Check uniqueness of each
- if self.left_index:
- left_unique = self.orig_left.index.is_unique
- else:
- left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique
-
- if self.right_index:
- right_unique = self.orig_right.index.is_unique
- else:
- right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique
-
- # Check data integrity
- if validate in ["one_to_one", "1:1"]:
- if not left_unique and not right_unique:
- raise MergeError(
- "Merge keys are not unique in either left "
- "or right dataset; not a one-to-one merge"
- )
- if not left_unique:
- raise MergeError(
- "Merge keys are not unique in left dataset; not a one-to-one merge"
- )
- if not right_unique:
- raise MergeError(
- "Merge keys are not unique in right dataset; not a one-to-one merge"
- )
-
- elif validate in ["one_to_many", "1:m"]:
- if not left_unique:
- raise MergeError(
- "Merge keys are not unique in left dataset; not a one-to-many merge"
- )
-
- elif validate in ["many_to_one", "m:1"]:
- if not right_unique:
- raise MergeError(
- "Merge keys are not unique in right dataset; "
- "not a many-to-one merge"
- )
-
- elif validate in ["many_to_many", "m:m"]:
- pass
-
- else:
- raise ValueError(
- f'"{validate}" is not a valid argument. '
- "Valid arguments are:\n"
- '- "1:1"\n'
- '- "1:m"\n'
- '- "m:1"\n'
- '- "m:m"\n'
- '- "one_to_one"\n'
- '- "one_to_many"\n'
- '- "many_to_one"\n'
- '- "many_to_many"'
- )
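- # Editorial sketch, not part of the original module: the validate= argument of
- # pd.merge() ends up in the method above. Assuming "import pandas as pd",
- #
- # >>> left = pd.DataFrame({"k": [1, 1], "x": [10, 20]})
- # >>> right = pd.DataFrame({"k": [1, 2], "y": [30, 40]})
- # >>> pd.merge(left, right, on="k", validate="one_to_one")
- #
- # is expected to raise MergeError because the left keys are not unique, whereas
- # validate="many_to_one" would pass for the same frames.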
-
-
-def get_join_indexers(
- left_keys,
- right_keys,
- sort: bool = False,
- how: MergeHow | Literal["asof"] = "inner",
- **kwargs,
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- """
-
- Parameters
- ----------
- left_keys : ndarray, Index, Series
- right_keys : ndarray, Index, Series
- sort : bool, default False
- how : {'inner', 'outer', 'left', 'right'}, default 'inner'
-
- Returns
- -------
- np.ndarray[np.intp]
- Indexer into the left_keys.
- np.ndarray[np.intp]
- Indexer into the right_keys.
- """
- assert len(left_keys) == len(
- right_keys
- ), "left_key and right_keys must be the same length"
-
- # fast-path for empty left/right
- left_n = len(left_keys[0])
- right_n = len(right_keys[0])
- if left_n == 0:
- if how in ["left", "inner", "cross"]:
- return _get_empty_indexer()
- elif not sort and how in ["right", "outer"]:
- return _get_no_sort_one_missing_indexer(right_n, True)
- elif right_n == 0:
- if how in ["right", "inner", "cross"]:
- return _get_empty_indexer()
- elif not sort and how in ["left", "outer"]:
- return _get_no_sort_one_missing_indexer(left_n, False)
-
- # get left & right join labels and num. of levels at each location
- mapped = (
- _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
- for n in range(len(left_keys))
- )
- zipped = zip(*mapped)
- llab, rlab, shape = (list(x) for x in zipped)
-
- # get flat i8 keys from label lists
- lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)
-
- # factorize keys to a dense i8 space
- # `count` is the num. of unique keys
- # set(lkey) | set(rkey) == range(count)
-
- lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
- # preserve left frame order if how == 'left' and sort == False
- kwargs = cp.copy(kwargs)
- if how in ("left", "right"):
- kwargs["sort"] = sort
- join_func = {
- "inner": libjoin.inner_join,
- "left": libjoin.left_outer_join,
- "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
- y, x, count, **kwargs
- )[::-1],
- "outer": libjoin.full_outer_join,
- }[how]
-
- # error: Cannot call function of unknown type
- return join_func(lkey, rkey, count, **kwargs) # type: ignore[operator]
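- # Editorial sketch, not part of the original module: the two arrays returned
- # above are positional indexers into the left and right keys. Roughly, for
- # single inner-join keys (assuming numpy is imported as np)
- #
- # >>> get_join_indexers([np.array([1, 2, 3])], [np.array([2, 3, 4])], how="inner")
- #
- # pairs left positions [1, 2] with right positions [0, 1]; in outer joins, -1
- # marks rows that have no match on the other side.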
-
-
-def restore_dropped_levels_multijoin(
- left: MultiIndex,
- right: MultiIndex,
- dropped_level_names,
- join_index: Index,
- lindexer: npt.NDArray[np.intp],
- rindexer: npt.NDArray[np.intp],
-) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
- """
- *this is an internal non-public method*
-
- Returns the levels, labels and names of a multi-index to multi-index join.
- Depending on the type of join, this method restores the appropriate
- dropped levels of the joined multi-index.
- The method relies on lindexer and rindexer, which hold the index positions of
- left and right where a join was feasible.
-
- Parameters
- ----------
- left : MultiIndex
- left index
- right : MultiIndex
- right index
- dropped_level_names : str array
- list of non-common level names
- join_index : Index
- the index of the join between the
- common levels of left and right
- lindexer : np.ndarray[np.intp]
- left indexer
- rindexer : np.ndarray[np.intp]
- right indexer
-
- Returns
- -------
- levels : list of Index
- levels of combined multiindexes
- labels : np.ndarray[np.intp]
- labels of combined multiindexes
- names : List[Hashable]
- names of combined multiindex levels
-
- """
-
- def _convert_to_multiindex(index: Index) -> MultiIndex:
- if isinstance(index, MultiIndex):
- return index
- else:
- return MultiIndex.from_arrays([index._values], names=[index.name])
-
- # For multi-multi joins with one overlapping level,
- # the returned index is of type Index.
- # Ensure that join_index is of type MultiIndex
- # so that dropped levels can be appended.
- join_index = _convert_to_multiindex(join_index)
-
- join_levels = join_index.levels
- join_codes = join_index.codes
- join_names = join_index.names
-
- # Iterate through the levels that must be restored
- for dropped_level_name in dropped_level_names:
- if dropped_level_name in left.names:
- idx = left
- indexer = lindexer
- else:
- idx = right
- indexer = rindexer
-
- # The index of the level name to be restored
- name_idx = idx.names.index(dropped_level_name)
-
- restore_levels = idx.levels[name_idx]
- # Inject -1 in the codes list where a join was not possible
- # IOW indexer[i]=-1
- codes = idx.codes[name_idx]
- if indexer is None:
- restore_codes = codes
- else:
- restore_codes = algos.take_nd(codes, indexer, fill_value=-1)
-
- # error: Cannot determine type of "__add__"
- join_levels = join_levels + [restore_levels] # type: ignore[has-type]
- join_codes = join_codes + [restore_codes]
- join_names = join_names + [dropped_level_name]
-
- return join_levels, join_codes, join_names
-
-
-class _OrderedMerge(_MergeOperation):
- _merge_type = "ordered_merge"
-
- def __init__(
- self,
- left: DataFrame | Series,
- right: DataFrame | Series,
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- left_index: bool = False,
- right_index: bool = False,
- axis: AxisInt = 1,
- suffixes: Suffixes = ("_x", "_y"),
- fill_method: str | None = None,
- how: JoinHow | Literal["asof"] = "outer",
- ) -> None:
- self.fill_method = fill_method
- _MergeOperation.__init__(
- self,
- left,
- right,
- on=on,
- left_on=left_on,
- left_index=left_index,
- right_index=right_index,
- right_on=right_on,
- axis=axis,
- how=how,
- suffixes=suffixes,
- sort=True, # factorize sorts
- )
-
- def get_result(self, copy: bool | None = True) -> DataFrame:
- join_index, left_indexer, right_indexer = self._get_join_info()
-
- llabels, rlabels = _items_overlap_with_suffix(
- self.left._info_axis, self.right._info_axis, self.suffixes
- )
-
- left_join_indexer: np.ndarray | None
- right_join_indexer: np.ndarray | None
-
- if self.fill_method == "ffill":
- if left_indexer is None:
- raise TypeError("left_indexer cannot be None")
- left_indexer, right_indexer = cast(np.ndarray, left_indexer), cast(
- np.ndarray, right_indexer
- )
- left_join_indexer = libjoin.ffill_indexer(left_indexer)
- right_join_indexer = libjoin.ffill_indexer(right_indexer)
- else:
- left_join_indexer = left_indexer
- right_join_indexer = right_indexer
-
- result = self._reindex_and_concat(
- join_index, left_join_indexer, right_join_indexer, copy=copy
- )
- self._maybe_add_join_keys(result, left_indexer, right_indexer)
-
- return result
-
-
-def _asof_by_function(direction: str):
- name = f"asof_join_{direction}_on_X_by_Y"
- return getattr(libjoin, name, None)
-
-
-_type_casters = {
- "int64_t": ensure_int64,
- "double": ensure_float64,
- "object": ensure_object,
-}
-
-
-def _get_cython_type_upcast(dtype: DtypeObj) -> str:
- """Upcast a dtype to 'int64_t', 'double', or 'object'"""
- if is_integer_dtype(dtype):
- return "int64_t"
- elif is_float_dtype(dtype):
- return "double"
- else:
- return "object"
-
-
-class _AsOfMerge(_OrderedMerge):
- _merge_type = "asof_merge"
-
- def __init__(
- self,
- left: DataFrame | Series,
- right: DataFrame | Series,
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- left_index: bool = False,
- right_index: bool = False,
- by=None,
- left_by=None,
- right_by=None,
- axis: AxisInt = 1,
- suffixes: Suffixes = ("_x", "_y"),
- copy: bool = True,
- fill_method: str | None = None,
- how: Literal["asof"] = "asof",
- tolerance=None,
- allow_exact_matches: bool = True,
- direction: str = "backward",
- ) -> None:
- self.by = by
- self.left_by = left_by
- self.right_by = right_by
- self.tolerance = tolerance
- self.allow_exact_matches = allow_exact_matches
- self.direction = direction
-
- _OrderedMerge.__init__(
- self,
- left,
- right,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- axis=axis,
- how=how,
- suffixes=suffixes,
- fill_method=fill_method,
- )
-
- def _validate_left_right_on(self, left_on, right_on):
- left_on, right_on = super()._validate_left_right_on(left_on, right_on)
-
- # we only allow 'on' to be a single key
- if len(left_on) != 1 and not self.left_index:
- raise MergeError("can only asof on a key for left")
-
- if len(right_on) != 1 and not self.right_index:
- raise MergeError("can only asof on a key for right")
-
- if self.left_index and isinstance(self.left.index, MultiIndex):
- raise MergeError("left can only have one index")
-
- if self.right_index and isinstance(self.right.index, MultiIndex):
- raise MergeError("right can only have one index")
-
- # set 'by' columns
- if self.by is not None:
- if self.left_by is not None or self.right_by is not None:
- raise MergeError("Can only pass by OR left_by and right_by")
- self.left_by = self.right_by = self.by
- if self.left_by is None and self.right_by is not None:
- raise MergeError("missing left_by")
- if self.left_by is not None and self.right_by is None:
- raise MergeError("missing right_by")
-
- # GH#29130 Check that merge keys do not have dtype object
- if not self.left_index:
- left_on_0 = left_on[0]
- if is_array_like(left_on_0):
- lo_dtype = left_on_0.dtype
- else:
- lo_dtype = (
- self.left._get_label_or_level_values(left_on_0).dtype
- if left_on_0 in self.left.columns
- else self.left.index.get_level_values(left_on_0)
- )
- else:
- lo_dtype = self.left.index.dtype
-
- if not self.right_index:
- right_on_0 = right_on[0]
- if is_array_like(right_on_0):
- ro_dtype = right_on_0.dtype
- else:
- ro_dtype = (
- self.right._get_label_or_level_values(right_on_0).dtype
- if right_on_0 in self.right.columns
- else self.right.index.get_level_values(right_on_0)
- )
- else:
- ro_dtype = self.right.index.dtype
-
- if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype):
- raise MergeError(
- f"Incompatible merge dtype, {repr(ro_dtype)} and "
- f"{repr(lo_dtype)}, both sides must have numeric dtype"
- )
-
- # add 'by' to our key-list so we can have it in the
- # output as a key
- if self.left_by is not None:
- if not is_list_like(self.left_by):
- self.left_by = [self.left_by]
- if not is_list_like(self.right_by):
- self.right_by = [self.right_by]
-
- if len(self.left_by) != len(self.right_by):
- raise MergeError("left_by and right_by must be same length")
-
- left_on = self.left_by + list(left_on)
- right_on = self.right_by + list(right_on)
-
- # check 'direction' is valid
- if self.direction not in ["backward", "forward", "nearest"]:
- raise MergeError(f"direction invalid: {self.direction}")
-
- return left_on, right_on
-
- def _get_merge_keys(
- self,
- ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
- # note this function has side effects
- (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()
-
- # validate index types are the same
- for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
- if not is_dtype_equal(lk.dtype, rk.dtype):
- if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
- # The generic error message is confusing for categoricals.
- #
- # In this function, the join keys include both the original
- # ones of the merge_asof() call, and also the keys passed
- # to its by= argument. Unordered but equal categories
- # are not supported for the former, but will fail
- # later with a ValueError, so we don't *need* to check
- # for them here.
- msg = (
- f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
- f"{repr(rk.dtype)}, both sides category, but not equal ones"
- )
- else:
- msg = (
- f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
- f"{repr(rk.dtype)}, must be the same type"
- )
- raise MergeError(msg)
-
- # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
- if self.tolerance is not None:
- if self.left_index:
- # Actually more specifically an Index
- lt = cast(AnyArrayLike, self.left.index)
- else:
- lt = left_join_keys[-1]
-
- msg = (
- f"incompatible tolerance {self.tolerance}, must be compat "
- f"with type {repr(lt.dtype)}"
- )
-
- if needs_i8_conversion(lt):
- if not isinstance(self.tolerance, datetime.timedelta):
- raise MergeError(msg)
- if self.tolerance < Timedelta(0):
- raise MergeError("tolerance must be positive")
-
- elif is_integer_dtype(lt):
- if not is_integer(self.tolerance):
- raise MergeError(msg)
- if self.tolerance < 0:
- raise MergeError("tolerance must be positive")
-
- elif is_float_dtype(lt):
- if not is_number(self.tolerance):
- raise MergeError(msg)
- if self.tolerance < 0:
- raise MergeError("tolerance must be positive")
-
- else:
- raise MergeError("key must be integer, timestamp or float")
-
- # validate allow_exact_matches
- if not is_bool(self.allow_exact_matches):
- msg = (
- "allow_exact_matches must be boolean, "
- f"passed {self.allow_exact_matches}"
- )
- raise MergeError(msg)
-
- return left_join_keys, right_join_keys, join_names
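- # Editorial sketch, not part of the original module: the tolerance checks above
- # require the tolerance type to match the 'on' key type. Assuming "import pandas
- # as pd" and placeholder frames a/b with an integer "k" column, something like
- #
- # >>> pd.merge_asof(a, b, on="k", tolerance=pd.Timedelta("1s"))
- #
- # is expected to raise MergeError, whereas an integer tolerance such as
- # tolerance=2 would be accepted.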
-
- def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- """return the join indexers"""
-
- def flip(xs) -> np.ndarray:
- """unlike np.transpose, this returns an array of tuples"""
-
- def injection(obj):
- if not is_extension_array_dtype(obj):
- # ndarray
- return obj
- obj = extract_array(obj)
- if isinstance(obj, NDArrayBackedExtensionArray):
- # fastpath for e.g. dt64tz, categorical
- return obj._ndarray
- # FIXME: returning obj._values_for_argsort() here doesn't
- # break in any existing test cases, but i (@jbrockmendel)
- # am pretty sure it should!
- # e.g.
- # arr = pd.array([0, pd.NA, 255], dtype="UInt8")
- # will have values_for_argsort (before GH#45434)
- # np.array([0, 255, 255], dtype=np.uint8)
- # and the non-injectivity should make a difference somehow
- # shouldn't it?
- return np.asarray(obj)
-
- xs = [injection(x) for x in xs]
- labels = list(string.ascii_lowercase[: len(xs)])
- dtypes = [x.dtype for x in xs]
- labeled_dtypes = list(zip(labels, dtypes))
- return np.array(list(zip(*xs)), labeled_dtypes)
-
- # values to compare
- left_values = (
- self.left.index._values if self.left_index else self.left_join_keys[-1]
- )
- right_values = (
- self.right.index._values if self.right_index else self.right_join_keys[-1]
- )
- tolerance = self.tolerance
-
- # we require sortedness and non-null values in the join keys
- if not Index(left_values).is_monotonic_increasing:
- side = "left"
- if isna(left_values).any():
- raise ValueError(f"Merge keys contain null values on {side} side")
- raise ValueError(f"{side} keys must be sorted")
-
- if not Index(right_values).is_monotonic_increasing:
- side = "right"
- if isna(right_values).any():
- raise ValueError(f"Merge keys contain null values on {side} side")
- raise ValueError(f"{side} keys must be sorted")
-
- # initial type conversion as needed
- if needs_i8_conversion(left_values):
- if tolerance is not None:
- tolerance = Timedelta(tolerance)
-
- # TODO: we have no test cases with PeriodDtype here; probably
- # need to adjust tolerance for that case.
- if left_values.dtype.kind in ["m", "M"]:
- # Make sure the i8 representation for tolerance
- # matches that for left_values/right_values.
- lvs = ensure_wrapped_if_datetimelike(left_values)
- tolerance = tolerance.as_unit(lvs.unit)
-
- tolerance = tolerance._value
-
- # TODO: require left_values.dtype == right_values.dtype, or at least
- # comparable for e.g. dt64tz
- left_values = left_values.view("i8")
- right_values = right_values.view("i8")
-
- # a "by" parameter requires special handling
- if self.left_by is not None:
- # remove 'on' parameter from values if one existed
- if self.left_index and self.right_index:
- left_by_values = self.left_join_keys
- right_by_values = self.right_join_keys
- else:
- left_by_values = self.left_join_keys[0:-1]
- right_by_values = self.right_join_keys[0:-1]
-
- # get tuple representation of values if more than one
- if len(left_by_values) == 1:
- lbv = left_by_values[0]
- rbv = right_by_values[0]
- else:
- # We get here with non-ndarrays in test_merge_by_col_tz_aware
- # and test_merge_groupby_multiple_column_with_categorical_column
- lbv = flip(left_by_values)
- rbv = flip(right_by_values)
-
- # upcast 'by' parameter because HashTable is limited
- by_type = _get_cython_type_upcast(lbv.dtype)
- by_type_caster = _type_casters[by_type]
- # error: Incompatible types in assignment (expression has type
- # "ndarray[Any, dtype[generic]]", variable has type
- # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
- left_by_values = by_type_caster(lbv) # type: ignore[assignment]
- # error: Incompatible types in assignment (expression has type
- # "ndarray[Any, dtype[generic]]", variable has type
- # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
- right_by_values = by_type_caster(rbv) # type: ignore[assignment]
-
- # choose appropriate function by type
- func = _asof_by_function(self.direction)
- return func(
- left_values,
- right_values,
- left_by_values,
- right_by_values,
- self.allow_exact_matches,
- tolerance,
- )
- else:
- # choose appropriate function by type
- func = _asof_by_function(self.direction)
- # TODO(cython3):
- # Bug in beta1 preventing Cython from choosing
- # right specialization when one fused memview is None
- # Doesn't matter what type we choose
- # (nothing happens anyways since it is None)
- # GH 51640
- return func[f"{left_values.dtype}_t", object](
- left_values,
- right_values,
- None,
- None,
- self.allow_exact_matches,
- tolerance,
- False,
- )
-
-
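The asof path above requires sorted, non-null keys and supports an optional tolerance and a "by" grouping. A minimal illustrative sketch of how this path is reached through the public pd.merge_asof API (example data is made up, not taken from the deleted file):

import pandas as pd

quotes = pd.DataFrame(
    {
        "time": pd.to_datetime(["2023-01-01 09:30:00", "2023-01-01 09:30:03"]),
        "ticker": ["A", "A"],
        "bid": [99.5, 99.7],
    }
)
trades = pd.DataFrame(
    {
        "time": pd.to_datetime(["2023-01-01 09:30:02", "2023-01-01 09:30:05"]),
        "ticker": ["A", "A"],
        "qty": [100, 200],
    }
)

# keys must be sorted; 'by' matches within each ticker and 'tolerance'
# bounds how far back a match may reach
print(
    pd.merge_asof(
        trades,
        quotes,
        on="time",
        by="ticker",
        tolerance=pd.Timedelta("2s"),
        direction="backward",
    )
)
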
-def _get_multiindex_indexer(
- join_keys, index: MultiIndex, sort: bool
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- # left & right join labels and num. of levels at each location
- mapped = (
- _factorize_keys(index.levels[n], join_keys[n], sort=sort)
- for n in range(index.nlevels)
- )
- zipped = zip(*mapped)
- rcodes, lcodes, shape = (list(x) for x in zipped)
- if sort:
- rcodes = list(map(np.take, rcodes, index.codes))
- else:
- i8copy = lambda a: a.astype("i8", subok=False, copy=True)
- rcodes = list(map(i8copy, index.codes))
-
- # fix right labels if there were any nulls
- for i, join_key in enumerate(join_keys):
- mask = index.codes[i] == -1
- if mask.any():
- # check if there were already any nulls at this location;
- # if there were, they are factorized to `shape[i] - 1`
- a = join_key[lcodes[i] == shape[i] - 1]
- # `a[0] != a[0]` is a NaN check: if no NaN was factorized to that code,
- # add an extra level so the nulls in `index` get a code of their own
- if a.size == 0 or not a[0] != a[0]:
- shape[i] += 1
-
- rcodes[i][mask] = shape[i] - 1
-
- # get flat i8 join keys
- lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort)
-
- # factorize keys to a dense i8 space
- lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
-
- return libjoin.left_outer_join(lkey, rkey, count, sort=sort)
-
-
-def _get_single_indexer(
- join_key, index: Index, sort: bool = False
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort)
-
- return libjoin.left_outer_join(left_key, right_key, count, sort=sort)
-
-
-def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- """Return empty join indexers."""
- return (
- np.array([], dtype=np.intp),
- np.array([], dtype=np.intp),
- )
-
-
-def _get_no_sort_one_missing_indexer(
- n: int, left_missing: bool
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- """
- Return join indexers where all of one side is selected without sorting
- and none of the other side is selected.
-
- Parameters
- ----------
- n : int
- Length of indexers to create.
- left_missing : bool
- If True, the left indexer will contain only -1's.
- If False, the right indexer will contain only -1's.
-
- Returns
- -------
- np.ndarray[np.intp]
- Left indexer
- np.ndarray[np.intp]
- Right indexer
- """
- idx = np.arange(n, dtype=np.intp)
- idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
- if left_missing:
- return idx_missing, idx
- return idx, idx_missing
-
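As a quick illustration of the helper above, a sketch that mirrors its return value for n=3 rather than importing pandas internals:

import numpy as np

# mirrors _get_no_sort_one_missing_indexer(3, left_missing=False)
idx = np.arange(3, dtype=np.intp)                             # array([0, 1, 2])
idx_missing = np.full(shape=3, fill_value=-1, dtype=np.intp)  # array([-1, -1, -1])
left_indexer, right_indexer = idx, idx_missing  # the right side matches nothing
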
-
-def _left_join_on_index(
- left_ax: Index, right_ax: Index, join_keys, sort: bool = False
-) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
- if len(join_keys) > 1:
- if not (
- isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels
- ):
- raise AssertionError(
- "If more than one join key is given then "
- "'right_ax' must be a MultiIndex and the "
- "number of join keys must be the number of levels in right_ax"
- )
-
- left_indexer, right_indexer = _get_multiindex_indexer(
- join_keys, right_ax, sort=sort
- )
- else:
- jkey = join_keys[0]
-
- left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort)
-
- if sort or len(left_ax) != len(left_indexer):
- # if asked to sort or there are 1-to-many matches
- join_index = left_ax.take(left_indexer)
- return join_index, left_indexer, right_indexer
-
- # left frame preserves order & length of its index
- return left_ax, None, right_indexer
-
-
-def _factorize_keys(
- lk: ArrayLike,
- rk: ArrayLike,
- sort: bool = True,
- how: MergeHow | Literal["asof"] = "inner",
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
- """
- Encode left and right keys as enumerated types.
-
- This is used to get the join indexers to be used when merging DataFrames.
-
- Parameters
- ----------
- lk : array-like
- Left key.
- rk : array-like
- Right key.
- sort : bool, defaults to True
- If True, the encoding is done such that the unique elements in the
- keys are sorted.
- how : {'left', 'right', 'outer', 'inner'}, default 'inner'
- Type of merge.
-
- Returns
- -------
- np.ndarray[np.intp]
- Left (resp. right if called with `how="right"`) labels, as enumerated type.
- np.ndarray[np.intp]
- Right (resp. left if called with `how="right"`) labels, as enumerated type.
- int
- Number of unique elements in union of left and right labels.
-
- See Also
- --------
- merge : Merge DataFrame or named Series objects
- with a database-style join.
- algorithms.factorize : Encode the object as an enumerated type
- or categorical variable.
-
- Examples
- --------
- >>> lk = np.array(["a", "c", "b"])
- >>> rk = np.array(["a", "c"])
-
- Here, the unique values are `'a', 'b', 'c'`. With the default
- `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
-
- >>> pd.core.reshape.merge._factorize_keys(lk, rk)
- (array([0, 2, 1]), array([0, 2]), 3)
-
- With `sort=False`, the encoding will correspond to the order
- in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
-
- >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
- (array([0, 1, 2]), array([0, 1]), 3)
- """
- # Some pre-processing for non-ndarray lk / rk
- lk = extract_array(lk, extract_numpy=True, extract_range=True)
- rk = extract_array(rk, extract_numpy=True, extract_range=True)
- # TODO: if either is a RangeIndex, we can likely factorize more efficiently?
-
- if (
- isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
- ) or (
- isinstance(lk.dtype, np.dtype)
- and lk.dtype.kind == "M"
- and isinstance(rk.dtype, np.dtype)
- and rk.dtype.kind == "M"
- ):
- # Extract the ndarray (UTC-localized) values
- # Note: we don't need the dtypes to match, as these can still be compared
- lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
- lk = cast("DatetimeArray", lk)._ndarray
- rk = cast("DatetimeArray", rk)._ndarray
-
- elif (
- is_categorical_dtype(lk.dtype)
- and is_categorical_dtype(rk.dtype)
- and is_dtype_equal(lk.dtype, rk.dtype)
- ):
- assert isinstance(lk, Categorical)
- assert isinstance(rk, Categorical)
- # Cast rk to encoding so we can compare codes with lk
-
- rk = lk._encode_with_my_categories(rk)
-
- lk = ensure_int64(lk.codes)
- rk = ensure_int64(rk.codes)
-
- elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
- if not isinstance(lk, BaseMaskedArray) and not (
- # exclude arrow dtypes that would get cast to object
- isinstance(lk.dtype, ArrowDtype)
- and is_numeric_dtype(lk.dtype.numpy_dtype)
- ):
- lk, _ = lk._values_for_factorize()
-
- # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
- # "_values_for_factorize"
- rk, _ = rk._values_for_factorize() # type: ignore[union-attr]
-
- if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
- # GH#23917 TODO: Needs tests for non-matching dtypes
- # GH#23917 TODO: needs tests for case where lk is integer-dtype
- # and rk is datetime-dtype
- lk = np.asarray(lk, dtype=np.int64)
- rk = np.asarray(rk, dtype=np.int64)
-
- klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)
-
- rizer = klass(max(len(lk), len(rk)))
-
- if isinstance(lk, BaseMaskedArray):
- assert isinstance(rk, BaseMaskedArray)
- llab = rizer.factorize(lk._data, mask=lk._mask)
- rlab = rizer.factorize(rk._data, mask=rk._mask)
- elif isinstance(lk, ArrowExtensionArray):
- assert isinstance(rk, ArrowExtensionArray)
- # we can only get here with numeric dtypes
- # TODO: Remove when we have a Factorizer for Arrow
- llab = rizer.factorize(
- lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
- )
- rlab = rizer.factorize(
- rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
- )
- else:
- # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
- # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
- # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
- llab = rizer.factorize(lk) # type: ignore[arg-type]
- rlab = rizer.factorize(rk) # type: ignore[arg-type]
- assert llab.dtype == np.dtype(np.intp), llab.dtype
- assert rlab.dtype == np.dtype(np.intp), rlab.dtype
-
- count = rizer.get_count()
-
- if sort:
- uniques = rizer.uniques.to_array()
- llab, rlab = _sort_labels(uniques, llab, rlab)
-
- # NA group
- lmask = llab == -1
- lany = lmask.any()
- rmask = rlab == -1
- rany = rmask.any()
-
- if lany or rany:
- if lany:
- np.putmask(llab, lmask, count)
- if rany:
- np.putmask(rlab, rmask, count)
- count += 1
-
- if how == "right":
- return rlab, llab, count
- return llab, rlab, count
-
-
-def _convert_arrays_and_get_rizer_klass(
- lk: ArrayLike, rk: ArrayLike
-) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
- klass: type[libhashtable.Factorizer]
- if is_numeric_dtype(lk.dtype):
- if not is_dtype_equal(lk, rk):
- dtype = find_common_type([lk.dtype, rk.dtype])
- if isinstance(dtype, ExtensionDtype):
- cls = dtype.construct_array_type()
- if not isinstance(lk, ExtensionArray):
- lk = cls._from_sequence(lk, dtype=dtype, copy=False)
- else:
- lk = lk.astype(dtype)
-
- if not isinstance(rk, ExtensionArray):
- rk = cls._from_sequence(rk, dtype=dtype, copy=False)
- else:
- rk = rk.astype(dtype)
- else:
- lk = lk.astype(dtype)
- rk = rk.astype(dtype)
- if isinstance(lk, BaseMaskedArray):
- # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
- # expected type "Type[object]"
- klass = _factorizers[lk.dtype.type] # type: ignore[index]
- elif isinstance(lk.dtype, ArrowDtype):
- klass = _factorizers[lk.dtype.numpy_dtype.type]
- else:
- klass = _factorizers[lk.dtype.type]
-
- else:
- klass = libhashtable.ObjectFactorizer
- lk = ensure_object(lk)
- rk = ensure_object(rk)
- return klass, lk, rk
-
-
-def _sort_labels(
- uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
- llength = len(left)
- labels = np.concatenate([left, right])
-
- _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
- new_left, new_right = new_labels[:llength], new_labels[llength:]
-
- return new_left, new_right
-
-
-def _get_join_keys(
- llab: list[npt.NDArray[np.int64 | np.intp]],
- rlab: list[npt.NDArray[np.int64 | np.intp]],
- shape: Shape,
- sort: bool,
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
- # how many levels can be done without overflow
- nlev = next(
- lev
- for lev in range(len(shape), 0, -1)
- if not is_int64_overflow_possible(shape[:lev])
- )
-
- # get keys for the first `nlev` levels
- stride = np.prod(shape[1:nlev], dtype="i8")
- lkey = stride * llab[0].astype("i8", subok=False, copy=False)
- rkey = stride * rlab[0].astype("i8", subok=False, copy=False)
-
- for i in range(1, nlev):
- with np.errstate(divide="ignore"):
- stride //= shape[i]
- lkey += llab[i] * stride
- rkey += rlab[i] * stride
-
- if nlev == len(shape): # all done!
- return lkey, rkey
-
- # densify current keys to avoid overflow
- lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
-
- llab = [lkey] + llab[nlev:]
- rlab = [rkey] + rlab[nlev:]
- shape = (count,) + shape[nlev:]
-
- return _get_join_keys(llab, rlab, shape, sort)
-
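The stride arithmetic above packs each tuple of level codes into a single integer, treating the codes as digits of a mixed-radix number. A standalone sketch of that encoding for two levels (illustrative values only):

import numpy as np

llab = [np.array([0, 1, 2], dtype="i8"), np.array([3, 0, 1], dtype="i8")]
shape = (3, 4)  # number of unique values per level

# flat key = code0 * stride + code1, where the stride is the product
# of the trailing level sizes
stride = np.prod(shape[1:], dtype="i8")  # 4
lkey = llab[0] * stride + llab[1]        # array([3, 4, 9])
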
-
-def _should_fill(lname, rname) -> bool:
- if not isinstance(lname, str) or not isinstance(rname, str):
- return True
- return lname == rname
-
-
-def _any(x) -> bool:
- return x is not None and com.any_not_none(*x)
-
-
-def _validate_operand(obj: DataFrame | Series) -> DataFrame:
- if isinstance(obj, ABCDataFrame):
- return obj
- elif isinstance(obj, ABCSeries):
- if obj.name is None:
- raise ValueError("Cannot merge a Series without a name")
- return obj.to_frame()
- else:
- raise TypeError(
- f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
- )
-
-
-def _items_overlap_with_suffix(
- left: Index, right: Index, suffixes: Suffixes
-) -> tuple[Index, Index]:
- """
- Suffixes type validation.
-
- If two indices overlap, add suffixes to overlapping entries.
-
- If the corresponding suffix is empty, the entry is simply converted to a string.
-
- """
- if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
- raise TypeError(
- f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
- "Provide 'suffixes' as a tuple instead."
- )
-
- to_rename = left.intersection(right)
- if len(to_rename) == 0:
- return left, right
-
- lsuffix, rsuffix = suffixes
-
- if not lsuffix and not rsuffix:
- raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
-
- def renamer(x, suffix):
- """
- Rename the left and right indices.
-
- If there is overlap, and suffix is not None, add
- suffix, otherwise, leave it as-is.
-
- Parameters
- ----------
- x : original column name
- suffix : str or None
-
- Returns
- -------
- x : renamed column name
- """
- if x in to_rename and suffix is not None:
- return f"{x}{suffix}"
- return x
-
- lrenamer = partial(renamer, suffix=lsuffix)
- rrenamer = partial(renamer, suffix=rsuffix)
-
- llabels = left._transform_index(lrenamer)
- rlabels = right._transform_index(rrenamer)
-
- dups = []
- if not llabels.is_unique:
- # Only warn when duplicates are caused because of suffixes, already duplicated
- # columns in origin should not warn
- dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
- if not rlabels.is_unique:
- dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
- if dups:
- raise MergeError(
- f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
- f"not allowed.",
- )
-
- return llabels, rlabels
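
For reference, a small sketch of the user-facing behaviour handled above: overlapping non-key columns receive the configured suffixes during a merge (illustrative example, not from the deleted sources):

import pandas as pd

left = pd.DataFrame({"key": [1, 2], "value": [10, 20]})
right = pd.DataFrame({"key": [1, 2], "value": [100, 200]})

# 'value' overlaps, so both sides are renamed with the given suffixes
merged = pd.merge(left, right, on="key", suffixes=("_left", "_right"))
print(merged.columns.tolist())  # ['key', 'value_left', 'value_right']
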
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/pivot.py b/contrib/python/pandas/py3/pandas/core/reshape/pivot.py
deleted file mode 100644
index cd5ee1ca1fc..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/pivot.py
+++ /dev/null
@@ -1,885 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- Sequence,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- AggFuncType,
- AggFuncTypeBase,
- AggFuncTypeDict,
- IndexLabel,
-)
-from pandas.util._decorators import (
- Appender,
- Substitution,
-)
-
-from pandas.core.dtypes.cast import maybe_downcast_to_dtype
-from pandas.core.dtypes.common import (
- is_extension_array_dtype,
- is_integer_dtype,
- is_list_like,
- is_nested_list_like,
- is_scalar,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-import pandas.core.common as com
-from pandas.core.frame import _shared_docs
-from pandas.core.groupby import Grouper
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
- get_objs_combined_axis,
-)
-from pandas.core.reshape.concat import concat
-from pandas.core.reshape.util import cartesian_product
-from pandas.core.series import Series
-
-if TYPE_CHECKING:
- from pandas import DataFrame
-
-
-# Note: We need to make sure `frame` is imported before `pivot`, otherwise
-# _shared_docs['pivot_table'] will not yet exist. TODO: Fix this dependency
-@Substitution("\ndata : DataFrame")
-@Appender(_shared_docs["pivot_table"], indents=1)
-def pivot_table(
- data: DataFrame,
- values=None,
- index=None,
- columns=None,
- aggfunc: AggFuncType = "mean",
- fill_value=None,
- margins: bool = False,
- dropna: bool = True,
- margins_name: Hashable = "All",
- observed: bool = False,
- sort: bool = True,
-) -> DataFrame:
- index = _convert_by(index)
- columns = _convert_by(columns)
-
- if isinstance(aggfunc, list):
- pieces: list[DataFrame] = []
- keys = []
- for func in aggfunc:
- _table = __internal_pivot_table(
- data,
- values=values,
- index=index,
- columns=columns,
- fill_value=fill_value,
- aggfunc=func,
- margins=margins,
- dropna=dropna,
- margins_name=margins_name,
- observed=observed,
- sort=sort,
- )
- pieces.append(_table)
- keys.append(getattr(func, "__name__", func))
-
- table = concat(pieces, keys=keys, axis=1)
- return table.__finalize__(data, method="pivot_table")
-
- table = __internal_pivot_table(
- data,
- values,
- index,
- columns,
- aggfunc,
- fill_value,
- margins,
- dropna,
- margins_name,
- observed,
- sort,
- )
- return table.__finalize__(data, method="pivot_table")
-
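A short sketch of the list-of-aggfunc branch above: each function builds its own table and the pieces are concatenated along the columns, keyed by the function names (public-API usage with made-up data):

import pandas as pd

df = pd.DataFrame(
    {"cat": ["a", "a", "b", "b"], "grp": ["x", "y", "x", "y"], "val": [1, 2, 3, 4]}
)

# one column block per aggregation function
print(
    pd.pivot_table(df, values="val", index="cat", columns="grp", aggfunc=["mean", "sum"])
)
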
-
-def __internal_pivot_table(
- data: DataFrame,
- values,
- index,
- columns,
- aggfunc: AggFuncTypeBase | AggFuncTypeDict,
- fill_value,
- margins: bool,
- dropna: bool,
- margins_name: Hashable,
- observed: bool,
- sort: bool,
-) -> DataFrame:
- """
- Helper of :func:`pandas.pivot_table` for any non-list ``aggfunc``.
- """
- keys = index + columns
-
- values_passed = values is not None
- if values_passed:
- if is_list_like(values):
- values_multi = True
- values = list(values)
- else:
- values_multi = False
- values = [values]
-
- # GH14938 Make sure value labels are in data
- for i in values:
- if i not in data:
- raise KeyError(i)
-
- to_filter = []
- for x in keys + values:
- if isinstance(x, Grouper):
- x = x.key
- try:
- if x in data:
- to_filter.append(x)
- except TypeError:
- pass
- if len(to_filter) < len(data.columns):
- data = data[to_filter]
-
- else:
- values = data.columns
- for key in keys:
- try:
- values = values.drop(key)
- except (TypeError, ValueError, KeyError):
- pass
- values = list(values)
-
- grouped = data.groupby(keys, observed=observed, sort=sort)
- agged = grouped.agg(aggfunc)
-
- if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
- agged = agged.dropna(how="all")
-
- # gh-21133
- # we want to downcast if the original values are ints,
- # as grouping with a NaN value and then dropping it
- # coerces the ints to floats
- for v in values:
- if (
- v in data
- and is_integer_dtype(data[v])
- and v in agged
- and not is_integer_dtype(agged[v])
- ):
- if not isinstance(agged[v], ABCDataFrame) and isinstance(
- data[v].dtype, np.dtype
- ):
- # exclude DataFrame case bc maybe_downcast_to_dtype expects
- # ArrayLike
- # e.g. test_pivot_table_multiindex_columns_doctest_case
- # agged.columns is a MultiIndex and 'v' is indexing only
- # on its first level.
- agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)
-
- table = agged
-
- # GH17038, this check should only happen if index is defined (not None)
- if table.index.nlevels > 1 and index:
- # Related GH #17123
- # If index_names are integers, determine whether the integers refer
- # to the level position or name.
- index_names = agged.index.names[: len(index)]
- to_unstack = []
- for i in range(len(index), len(keys)):
- name = agged.index.names[i]
- if name is None or name in index_names:
- to_unstack.append(i)
- else:
- to_unstack.append(name)
- table = agged.unstack(to_unstack)
-
- if not dropna:
- if isinstance(table.index, MultiIndex):
- m = MultiIndex.from_arrays(
- cartesian_product(table.index.levels), names=table.index.names
- )
- table = table.reindex(m, axis=0)
-
- if isinstance(table.columns, MultiIndex):
- m = MultiIndex.from_arrays(
- cartesian_product(table.columns.levels), names=table.columns.names
- )
- table = table.reindex(m, axis=1)
-
- if sort is True and isinstance(table, ABCDataFrame):
- table = table.sort_index(axis=1)
-
- if fill_value is not None:
- table = table.fillna(fill_value, downcast="infer")
-
- if margins:
- if dropna:
- data = data[data.notna().all(axis=1)]
- table = _add_margins(
- table,
- data,
- values,
- rows=index,
- cols=columns,
- aggfunc=aggfunc,
- observed=dropna,
- margins_name=margins_name,
- fill_value=fill_value,
- )
-
- # discard the top level
- if values_passed and not values_multi and table.columns.nlevels > 1:
- table = table.droplevel(0, axis=1)
- if len(index) == 0 and len(columns) > 0:
- table = table.T
-
- # GH 15193 Make sure empty columns are removed if dropna=True
- if isinstance(table, ABCDataFrame) and dropna:
- table = table.dropna(how="all", axis=1)
-
- return table
-
-
-def _add_margins(
- table: DataFrame | Series,
- data: DataFrame,
- values,
- rows,
- cols,
- aggfunc,
- observed=None,
- margins_name: Hashable = "All",
- fill_value=None,
-):
- if not isinstance(margins_name, str):
- raise ValueError("margins_name argument must be a string")
-
- msg = f'Conflicting name "{margins_name}" in margins'
- for level in table.index.names:
- if margins_name in table.index.get_level_values(level):
- raise ValueError(msg)
-
- grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
-
- if table.ndim == 2:
- # i.e. DataFrame
- for level in table.columns.names[1:]:
- if margins_name in table.columns.get_level_values(level):
- raise ValueError(msg)
-
- key: str | tuple[str, ...]
- if len(rows) > 1:
- key = (margins_name,) + ("",) * (len(rows) - 1)
- else:
- key = margins_name
-
- if not values and isinstance(table, ABCSeries):
- # If there are no values and the table is a series, then there is only
- # one column in the data. Compute grand margin and return it.
- return table._append(Series({key: grand_margin[margins_name]}))
-
- elif values:
- marginal_result_set = _generate_marginal_results(
- table, data, values, rows, cols, aggfunc, observed, margins_name
- )
- if not isinstance(marginal_result_set, tuple):
- return marginal_result_set
- result, margin_keys, row_margin = marginal_result_set
- else:
- # no values, and table is a DataFrame
- assert isinstance(table, ABCDataFrame)
- marginal_result_set = _generate_marginal_results_without_values(
- table, data, rows, cols, aggfunc, observed, margins_name
- )
- if not isinstance(marginal_result_set, tuple):
- return marginal_result_set
- result, margin_keys, row_margin = marginal_result_set
-
- row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
- # populate grand margin
- for k in margin_keys:
- if isinstance(k, str):
- row_margin[k] = grand_margin[k]
- else:
- row_margin[k] = grand_margin[k[0]]
-
- from pandas import DataFrame
-
- margin_dummy = DataFrame(row_margin, columns=Index([key])).T
-
- row_names = result.index.names
- # check the result column and leave floats
- for dtype in set(result.dtypes):
- if is_extension_array_dtype(dtype):
- # Can hold NA already
- continue
-
- cols = result.select_dtypes([dtype]).columns
- margin_dummy[cols] = margin_dummy[cols].apply(
- maybe_downcast_to_dtype, args=(dtype,)
- )
- result = result._append(margin_dummy)
- result.index.names = row_names
-
- return result
-
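The margin machinery above backs margins=True in the public API; a brief sketch with the default margins_name of "All" (illustrative data):

import pandas as pd

df = pd.DataFrame(
    {"cat": ["a", "a", "b"], "grp": ["x", "y", "x"], "val": [1.0, 2.0, 3.0]}
)

# adds an "All" row and an "All" column holding the grand margins
print(
    pd.pivot_table(
        df, values="val", index="cat", columns="grp",
        aggfunc="mean", margins=True, margins_name="All",
    )
)
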
-
-def _compute_grand_margin(
- data: DataFrame, values, aggfunc, margins_name: Hashable = "All"
-):
- if values:
- grand_margin = {}
- for k, v in data[values].items():
- try:
- if isinstance(aggfunc, str):
- grand_margin[k] = getattr(v, aggfunc)()
- elif isinstance(aggfunc, dict):
- if isinstance(aggfunc[k], str):
- grand_margin[k] = getattr(v, aggfunc[k])()
- else:
- grand_margin[k] = aggfunc[k](v)
- else:
- grand_margin[k] = aggfunc(v)
- except TypeError:
- pass
- return grand_margin
- else:
- return {margins_name: aggfunc(data.index)}
-
-
-def _generate_marginal_results(
- table, data, values, rows, cols, aggfunc, observed, margins_name: Hashable = "All"
-):
- if len(cols) > 0:
- # need to "interleave" the margins
- table_pieces = []
- margin_keys = []
-
- def _all_key(key):
- return (key, margins_name) + ("",) * (len(cols) - 1)
-
- if len(rows) > 0:
- margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
- cat_axis = 1
-
- for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
- all_key = _all_key(key)
-
- # we are going to mutate this, so need to copy!
- piece = piece.copy()
- piece[all_key] = margin[key]
-
- table_pieces.append(piece)
- margin_keys.append(all_key)
- else:
- from pandas import DataFrame
-
- cat_axis = 0
- for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
- if len(cols) > 1:
- all_key = _all_key(key)
- else:
- all_key = margins_name
- table_pieces.append(piece)
- # GH31016 this is to calculate margin for each group, and assign
- # corresponded key as index
- transformed_piece = DataFrame(piece.apply(aggfunc)).T
- if isinstance(piece.index, MultiIndex):
- # We are adding an empty level
- transformed_piece.index = MultiIndex.from_tuples(
- [all_key], names=piece.index.names + [None]
- )
- else:
- transformed_piece.index = Index([all_key], name=piece.index.name)
-
- # append piece for margin into table_piece
- table_pieces.append(transformed_piece)
- margin_keys.append(all_key)
-
- if not table_pieces:
- # GH 49240
- return table
- else:
- result = concat(table_pieces, axis=cat_axis)
-
- if len(rows) == 0:
- return result
- else:
- result = table
- margin_keys = table.columns
-
- if len(cols) > 0:
- row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
- row_margin = row_margin.stack()
-
- # slight hack
- new_order = [len(cols)] + list(range(len(cols)))
- row_margin.index = row_margin.index.reorder_levels(new_order)
- else:
- row_margin = Series(np.nan, index=result.columns)
-
- return result, margin_keys, row_margin
-
-
-def _generate_marginal_results_without_values(
- table: DataFrame,
- data,
- rows,
- cols,
- aggfunc,
- observed,
- margins_name: Hashable = "All",
-):
- if len(cols) > 0:
- # need to "interleave" the margins
- margin_keys: list | Index = []
-
- def _all_key():
- if len(cols) == 1:
- return margins_name
- return (margins_name,) + ("",) * (len(cols) - 1)
-
- if len(rows) > 0:
- margin = data[rows].groupby(rows, observed=observed).apply(aggfunc)
- all_key = _all_key()
- table[all_key] = margin
- result = table
- margin_keys.append(all_key)
-
- else:
- margin = data.groupby(level=0, axis=0, observed=observed).apply(aggfunc)
- all_key = _all_key()
- table[all_key] = margin
- result = table
- margin_keys.append(all_key)
- return result
- else:
- result = table
- margin_keys = table.columns
-
- if len(cols):
- row_margin = data[cols].groupby(cols, observed=observed).apply(aggfunc)
- else:
- row_margin = Series(np.nan, index=result.columns)
-
- return result, margin_keys, row_margin
-
-
-def _convert_by(by):
- if by is None:
- by = []
- elif (
- is_scalar(by)
- or isinstance(by, (np.ndarray, Index, ABCSeries, Grouper))
- or callable(by)
- ):
- by = [by]
- else:
- by = list(by)
- return by
-
-
-@Substitution("\ndata : DataFrame")
-@Appender(_shared_docs["pivot"], indents=1)
-def pivot(
- data: DataFrame,
- *,
- columns: IndexLabel,
- index: IndexLabel | lib.NoDefault = lib.NoDefault,
- values: IndexLabel | lib.NoDefault = lib.NoDefault,
-) -> DataFrame:
- columns_listlike = com.convert_to_list_like(columns)
-
- # If columns is None we will create a MultiIndex level with None as name
- # which might cause duplicated names because None is the default for
- # level names
- data = data.copy(deep=False)
- data.index = data.index.copy()
- data.index.names = [
- name if name is not None else lib.NoDefault for name in data.index.names
- ]
-
- indexed: DataFrame | Series
- if values is lib.NoDefault:
- if index is not lib.NoDefault:
- cols = com.convert_to_list_like(index)
- else:
- cols = []
-
- append = index is lib.NoDefault
- # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray")
- # error: Unsupported left operand type for + ("ExtensionArray")
- indexed = data.set_index(
- cols + columns_listlike, append=append # type: ignore[operator]
- )
- else:
- if index is lib.NoDefault:
- if isinstance(data.index, MultiIndex):
- # GH 23955
- index_list = [
- data.index.get_level_values(i) for i in range(data.index.nlevels)
- ]
- else:
- index_list = [Series(data.index, name=data.index.name)]
- else:
- index_list = [data[idx] for idx in com.convert_to_list_like(index)]
-
- data_columns = [data[col] for col in columns_listlike]
- index_list.extend(data_columns)
- multiindex = MultiIndex.from_arrays(index_list)
-
- if is_list_like(values) and not isinstance(values, tuple):
- # Exclude tuple because it is seen as a single column name
- values = cast(Sequence[Hashable], values)
- indexed = data._constructor(
- data[values]._values, index=multiindex, columns=values
- )
- else:
- indexed = data._constructor_sliced(data[values]._values, index=multiindex)
- # error: Argument 1 to "unstack" of "DataFrame" has incompatible type "Union
- # [List[Any], ExtensionArray, ndarray[Any, Any], Index, Series]"; expected
- # "Hashable"
- result = indexed.unstack(columns_listlike) # type: ignore[arg-type]
- result.index.names = [
- name if name is not lib.NoDefault else None for name in result.index.names
- ]
-
- return result
-
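For orientation, a minimal sketch of the pivot path above through the public DataFrame.pivot API (illustrative data):

import pandas as pd

df = pd.DataFrame(
    {
        "foo": ["one", "one", "two", "two"],
        "bar": ["A", "B", "A", "B"],
        "baz": [1, 2, 3, 4],
    }
)

# index/columns/values map directly onto the set_index + unstack steps above
print(df.pivot(index="foo", columns="bar", values="baz"))
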
-
-def crosstab(
- index,
- columns,
- values=None,
- rownames=None,
- colnames=None,
- aggfunc=None,
- margins: bool = False,
- margins_name: Hashable = "All",
- dropna: bool = True,
- normalize: bool = False,
-) -> DataFrame:
- """
- Compute a simple cross tabulation of two (or more) factors.
-
- By default, computes a frequency table of the factors unless an
- array of values and an aggregation function are passed.
-
- Parameters
- ----------
- index : array-like, Series, or list of arrays/Series
- Values to group by in the rows.
- columns : array-like, Series, or list of arrays/Series
- Values to group by in the columns.
- values : array-like, optional
- Array of values to aggregate according to the factors.
- Requires `aggfunc` be specified.
- rownames : sequence, default None
- If passed, must match number of row arrays passed.
- colnames : sequence, default None
- If passed, must match number of column arrays passed.
- aggfunc : function, optional
- If specified, requires `values` be specified as well.
- margins : bool, default False
- Add row/column margins (subtotals).
- margins_name : str, default 'All'
- Name of the row/column that will contain the totals
- when margins is True.
- dropna : bool, default True
- Do not include columns whose entries are all NaN.
- normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
- Normalize by dividing all values by the sum of values.
-
- - If passed 'all' or `True`, will normalize over all values.
- - If passed 'index' will normalize over each row.
- - If passed 'columns' will normalize over each column.
- - If margins is `True`, will also normalize margin values.
-
- Returns
- -------
- DataFrame
- Cross tabulation of the data.
-
- See Also
- --------
- DataFrame.pivot : Reshape data based on column values.
- pivot_table : Create a pivot table as a DataFrame.
-
- Notes
- -----
- Any Series passed will have its name attribute used unless row or column
- names for the cross-tabulation are specified.
-
- Any input passed containing Categorical data will have **all** of its
- categories included in the cross-tabulation, even if the actual data does
- not contain any instances of a particular category.
-
- In the event that there aren't overlapping indexes an empty DataFrame will
- be returned.
-
- Reference :ref:`the user guide <reshaping.crosstabulations>` for more examples.
-
- Examples
- --------
- >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
- ... "bar", "bar", "foo", "foo", "foo"], dtype=object)
- >>> b = np.array(["one", "one", "one", "two", "one", "one",
- ... "one", "two", "two", "two", "one"], dtype=object)
- >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
- ... "shiny", "dull", "shiny", "shiny", "shiny"],
- ... dtype=object)
- >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
- b one two
- c dull shiny dull shiny
- a
- bar 1 2 1 0
- foo 2 2 1 2
-
- Here 'c' and 'f' are not represented in the data and will not be
- shown in the output because dropna is True by default. Set
- dropna=False to preserve categories with no data.
-
- >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
- >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
- >>> pd.crosstab(foo, bar)
- col_0 d e
- row_0
- a 1 0
- b 0 1
- >>> pd.crosstab(foo, bar, dropna=False)
- col_0 d e f
- row_0
- a 1 0 0
- b 0 1 0
- c 0 0 0
- """
- if values is None and aggfunc is not None:
- raise ValueError("aggfunc cannot be used without values.")
-
- if values is not None and aggfunc is None:
- raise ValueError("values cannot be used without an aggfunc.")
-
- if not is_nested_list_like(index):
- index = [index]
- if not is_nested_list_like(columns):
- columns = [columns]
-
- common_idx = None
- pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))]
- if pass_objs:
- common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False)
-
- rownames = _get_names(index, rownames, prefix="row")
- colnames = _get_names(columns, colnames, prefix="col")
-
- # duplicate names mapped to unique names for pivot op
- (
- rownames_mapper,
- unique_rownames,
- colnames_mapper,
- unique_colnames,
- ) = _build_names_mapper(rownames, colnames)
-
- from pandas import DataFrame
-
- data = {
- **dict(zip(unique_rownames, index)),
- **dict(zip(unique_colnames, columns)),
- }
- df = DataFrame(data, index=common_idx)
-
- if values is None:
- df["__dummy__"] = 0
- kwargs = {"aggfunc": len, "fill_value": 0}
- else:
- df["__dummy__"] = values
- kwargs = {"aggfunc": aggfunc}
-
- # error: Argument 7 to "pivot_table" of "DataFrame" has incompatible type
- # "**Dict[str, object]"; expected "Union[...]"
- table = df.pivot_table(
- "__dummy__",
- index=unique_rownames,
- columns=unique_colnames,
- margins=margins,
- margins_name=margins_name,
- dropna=dropna,
- **kwargs, # type: ignore[arg-type]
- )
-
- # Post-process
- if normalize is not False:
- table = _normalize(
- table, normalize=normalize, margins=margins, margins_name=margins_name
- )
-
- table = table.rename_axis(index=rownames_mapper, axis=0)
- table = table.rename_axis(columns=colnames_mapper, axis=1)
-
- return table
-
-
-def _normalize(
- table: DataFrame, normalize, margins: bool, margins_name: Hashable = "All"
-) -> DataFrame:
- if not isinstance(normalize, (bool, str)):
- axis_subs = {0: "index", 1: "columns"}
- try:
- normalize = axis_subs[normalize]
- except KeyError as err:
- raise ValueError("Not a valid normalize argument") from err
-
- if margins is False:
- # Actual Normalizations
- normalizers: dict[bool | str, Callable] = {
- "all": lambda x: x / x.sum(axis=1).sum(axis=0),
- "columns": lambda x: x / x.sum(),
- "index": lambda x: x.div(x.sum(axis=1), axis=0),
- }
-
- normalizers[True] = normalizers["all"]
-
- try:
- f = normalizers[normalize]
- except KeyError as err:
- raise ValueError("Not a valid normalize argument") from err
-
- table = f(table)
- table = table.fillna(0)
-
- elif margins is True:
- # keep index and column of pivoted table
- table_index = table.index
- table_columns = table.columns
- last_ind_or_col = table.iloc[-1, :].name
-
- # check if margin name is not in (for MI cases) and not equal to last
- # index/column and save the column and index margin
- if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col):
- raise ValueError(f"{margins_name} not in pivoted DataFrame")
- column_margin = table.iloc[:-1, -1]
- index_margin = table.iloc[-1, :-1]
-
- # keep the core table
- table = table.iloc[:-1, :-1]
-
- # Normalize core
- table = _normalize(table, normalize=normalize, margins=False)
-
- # Fix Margins
- if normalize == "columns":
- column_margin = column_margin / column_margin.sum()
- table = concat([table, column_margin], axis=1)
- table = table.fillna(0)
- table.columns = table_columns
-
- elif normalize == "index":
- index_margin = index_margin / index_margin.sum()
- table = table._append(index_margin)
- table = table.fillna(0)
- table.index = table_index
-
- elif normalize == "all" or normalize is True:
- column_margin = column_margin / column_margin.sum()
- index_margin = index_margin / index_margin.sum()
- index_margin.loc[margins_name] = 1
- table = concat([table, column_margin], axis=1)
- table = table._append(index_margin)
-
- table = table.fillna(0)
- table.index = table_index
- table.columns = table_columns
-
- else:
- raise ValueError("Not a valid normalize argument")
-
- else:
- raise ValueError("Not a valid margins argument")
-
- return table
-
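The branches above implement the normalize argument of pd.crosstab; a short sketch of the 'index' case, where each row is divided by its own total (illustrative data):

import pandas as pd

a = pd.Series(["foo", "foo", "bar", "bar", "foo"])
b = pd.Series(["one", "two", "one", "one", "one"])

# every row of the result sums to 1.0
print(pd.crosstab(a, b, normalize="index"))
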
-
-def _get_names(arrs, names, prefix: str = "row"):
- if names is None:
- names = []
- for i, arr in enumerate(arrs):
- if isinstance(arr, ABCSeries) and arr.name is not None:
- names.append(arr.name)
- else:
- names.append(f"{prefix}_{i}")
- else:
- if len(names) != len(arrs):
- raise AssertionError("arrays and names must have the same length")
- if not isinstance(names, list):
- names = list(names)
-
- return names
-
-
-def _build_names_mapper(
- rownames: list[str], colnames: list[str]
-) -> tuple[dict[str, str], list[str], dict[str, str], list[str]]:
- """
- Given the names of a DataFrame's rows and columns, returns a set of unique row
- and column names and mappers that convert to original names.
-
- A row or column name is replaced if it is duplicated among the rows of the inputs,
- among the columns of the inputs or between the rows and the columns.
-
- Parameters
- ----------
- rownames: list[str]
- colnames: list[str]
-
- Returns
- -------
- Tuple(Dict[str, str], List[str], Dict[str, str], List[str])
-
- rownames_mapper: dict[str, str]
- a dictionary with new row names as keys and original rownames as values
- unique_rownames: list[str]
- a list of rownames with duplicate names replaced by dummy names
- colnames_mapper: dict[str, str]
- a dictionary with new column names as keys and original column names as values
- unique_colnames: list[str]
- a list of column names with duplicate names replaced by dummy names
-
- """
-
- def get_duplicates(names):
- seen: set = set()
- # keep only names that were already seen, i.e. genuine duplicates
- # (set.add returns None, so first occurrences are skipped but recorded)
- return {name for name in names if name in seen or seen.add(name)}
-
- shared_names = set(rownames).intersection(set(colnames))
- dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names
-
- rownames_mapper = {
- f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names
- }
- unique_rownames = [
- f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames)
- ]
-
- colnames_mapper = {
- f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names
- }
- unique_colnames = [
- f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames)
- ]
-
- return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames
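
The mapper above is what lets crosstab cope with clashing groupers; a small sketch through the public API (the internal dummy names are mapped back before the result is returned):

import pandas as pd

s = pd.Series(["x", "y", "x", "x"], name="k")

# row and column groupers share the name "k"; internally they get unique
# dummy names for the pivot step and are renamed back afterwards
print(pd.crosstab(s, s))
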
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/reshape.py b/contrib/python/pandas/py3/pandas/core/reshape/reshape.py
deleted file mode 100644
index e83317ebc74..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/reshape.py
+++ /dev/null
@@ -1,841 +0,0 @@
-from __future__ import annotations
-
-import itertools
-from typing import (
- TYPE_CHECKING,
- cast,
-)
-import warnings
-
-import numpy as np
-
-import pandas._libs.reshape as libreshape
-from pandas._typing import npt
-from pandas.errors import PerformanceWarning
-from pandas.util._decorators import cache_readonly
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.cast import maybe_promote
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- is_1d_only_ea_dtype,
- is_extension_array_dtype,
- is_integer,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.missing import notna
-
-import pandas.core.algorithms as algos
-from pandas.core.arrays.categorical import factorize_from_iterable
-from pandas.core.construction import ensure_wrapped_if_datetimelike
-from pandas.core.frame import DataFrame
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
-)
-from pandas.core.series import Series
-from pandas.core.sorting import (
- compress_group_index,
- decons_obs_group_ids,
- get_compressed_ids,
- get_group_index,
- get_group_index_sorter,
-)
-
-if TYPE_CHECKING:
- from pandas.core.arrays import ExtensionArray
- from pandas.core.indexes.frozen import FrozenList
-
-
-class _Unstacker:
- """
- Helper class to unstack data / pivot with multi-level index
-
- Parameters
- ----------
- index : MultiIndex
- level : int or str, default last level
- Level to "unstack". Accepts a name for the level.
- fill_value : scalar, optional
- Default value to fill in missing values if subgroups do not have the
- same set of labels. By default, missing values will be replaced with
- the default fill value for that data type, NaN for float, NaT for
- datetimelike, etc. For integer types, by default data will be converted to
- float and missing values will be set to NaN.
- constructor : object
- Pandas ``DataFrame`` or subclass used to create unstacked
- response. If None, DataFrame will be used.
-
- Examples
- --------
- >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
- ... ('two', 'a'), ('two', 'b')])
- >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
- >>> s
- one a 1
- b 2
- two a 3
- b 4
- dtype: int64
-
- >>> s.unstack(level=-1)
- a b
- one 1 2
- two 3 4
-
- >>> s.unstack(level=0)
- one two
- a 1 3
- b 2 4
-
- Returns
- -------
- unstacked : DataFrame
- """
-
- def __init__(self, index: MultiIndex, level=-1, constructor=None) -> None:
- if constructor is None:
- constructor = DataFrame
- self.constructor = constructor
-
- self.index = index.remove_unused_levels()
-
- self.level = self.index._get_level_number(level)
-
- # when index includes `nan`, need to lift levels/strides by 1
- self.lift = 1 if -1 in self.index.codes[self.level] else 0
-
- # Note: the "pop" below alters these in-place.
- self.new_index_levels = list(self.index.levels)
- self.new_index_names = list(self.index.names)
-
- self.removed_name = self.new_index_names.pop(self.level)
- self.removed_level = self.new_index_levels.pop(self.level)
- self.removed_level_full = index.levels[self.level]
-
- # Bug fix GH 20601
- # If the data frame is too big, the number of unique index combinations
- # will cause int32 overflow on Windows environments.
- # We want to check and warn (rather than raise, see GH 26314 below)
- # before this happens
- num_rows = np.max([index_level.size for index_level in self.new_index_levels])
- num_columns = self.removed_level.size
-
- # GH20601: This forces an overflow if the number of cells is too high.
- num_cells = num_rows * num_columns
-
- # GH 26314: Previous ValueError raised was too restrictive for many users.
- if num_cells > np.iinfo(np.int32).max:
- warnings.warn(
- f"The following operation may generate {num_cells} cells "
- f"in the resulting pandas object.",
- PerformanceWarning,
- stacklevel=find_stack_level(),
- )
-
- self._make_selectors()
-
- @cache_readonly
- def _indexer_and_to_sort(
- self,
- ) -> tuple[
- npt.NDArray[np.intp],
- list[np.ndarray], # each has _some_ signed integer dtype
- ]:
- v = self.level
-
- codes = list(self.index.codes)
- levs = list(self.index.levels)
- to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
- sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
-
- comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
- ngroups = len(obs_ids)
-
- indexer = get_group_index_sorter(comp_index, ngroups)
- return indexer, to_sort
-
- @cache_readonly
- def sorted_labels(self) -> list[np.ndarray]:
- indexer, to_sort = self._indexer_and_to_sort
- return [line.take(indexer) for line in to_sort]
-
- def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
- indexer, _ = self._indexer_and_to_sort
-
- sorted_values = algos.take_nd(values, indexer, axis=0)
- return sorted_values
-
- def _make_selectors(self):
- new_levels = self.new_index_levels
-
- # make the mask
- remaining_labels = self.sorted_labels[:-1]
- level_sizes = tuple(len(x) for x in new_levels)
-
- comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
- ngroups = len(obs_ids)
-
- comp_index = ensure_platform_int(comp_index)
- stride = self.index.levshape[self.level] + self.lift
- self.full_shape = ngroups, stride
-
- selector = self.sorted_labels[-1] + stride * comp_index + self.lift
- mask = np.zeros(np.prod(self.full_shape), dtype=bool)
- mask.put(selector, True)
-
- if mask.sum() < len(self.index):
- raise ValueError("Index contains duplicate entries, cannot reshape")
-
- self.group_index = comp_index
- self.mask = mask
- self.compressor = comp_index.searchsorted(np.arange(ngroups))
-
- @cache_readonly
- def mask_all(self) -> bool:
- return bool(self.mask.all())
-
- @cache_readonly
- def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
- # We cache this for re-use in ExtensionBlock._unstack
- dummy_arr = np.arange(len(self.index), dtype=np.intp)
- new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
- return new_values, mask.any(0)
- # TODO: in all tests we have mask.any(0).all(); can we rely on that?
-
- def get_result(self, values, value_columns, fill_value) -> DataFrame:
- if values.ndim == 1:
- values = values[:, np.newaxis]
-
- if value_columns is None and values.shape[1] != 1: # pragma: no cover
- raise ValueError("must pass column labels for multi-column data")
-
- values, _ = self.get_new_values(values, fill_value)
- columns = self.get_new_columns(value_columns)
- index = self.new_index
-
- return self.constructor(
- values, index=index, columns=columns, dtype=values.dtype
- )
-
- def get_new_values(self, values, fill_value=None):
- if values.ndim == 1:
- values = values[:, np.newaxis]
-
- sorted_values = self._make_sorted_values(values)
-
- # place the values
- length, width = self.full_shape
- stride = values.shape[1]
- result_width = width * stride
- result_shape = (length, result_width)
- mask = self.mask
- mask_all = self.mask_all
-
- # we can simply reshape if we don't have a mask
- if mask_all and len(values):
- # TODO: Under what circumstances can we rely on sorted_values
- # matching values? When that holds, we can slice instead
- # of take (in particular for EAs)
- new_values = (
- sorted_values.reshape(length, width, stride)
- .swapaxes(1, 2)
- .reshape(result_shape)
- )
- new_mask = np.ones(result_shape, dtype=bool)
- return new_values, new_mask
-
- dtype = values.dtype
-
- # if our mask is all True, then we can use our existing dtype
- if mask_all:
- dtype = values.dtype
- new_values = np.empty(result_shape, dtype=dtype)
- else:
- if isinstance(dtype, ExtensionDtype):
- # GH#41875
- # We are assuming that fill_value can be held by this dtype,
- # unlike the non-EA case that promotes.
- cls = dtype.construct_array_type()
- new_values = cls._empty(result_shape, dtype=dtype)
- new_values[:] = fill_value
- else:
- dtype, fill_value = maybe_promote(dtype, fill_value)
- new_values = np.empty(result_shape, dtype=dtype)
- new_values.fill(fill_value)
-
- name = dtype.name
- new_mask = np.zeros(result_shape, dtype=bool)
-
- # we need to convert to a basic dtype
- # and possibly coerce an input to our output dtype
- # e.g. ints -> floats
- if needs_i8_conversion(values.dtype):
- sorted_values = sorted_values.view("i8")
- new_values = new_values.view("i8")
- else:
- sorted_values = sorted_values.astype(name, copy=False)
-
- # fill in our values & mask
- libreshape.unstack(
- sorted_values,
- mask.view("u1"),
- stride,
- length,
- width,
- new_values,
- new_mask.view("u1"),
- )
-
- # reconstruct dtype if needed
- if needs_i8_conversion(values.dtype):
- # view as datetime64 so we can wrap in DatetimeArray and use
- # DTA's view method
- new_values = new_values.view("M8[ns]")
- new_values = ensure_wrapped_if_datetimelike(new_values)
- new_values = new_values.view(values.dtype)
-
- return new_values, new_mask
-
- def get_new_columns(self, value_columns: Index | None):
- if value_columns is None:
- if self.lift == 0:
- return self.removed_level._rename(name=self.removed_name)
-
- lev = self.removed_level.insert(0, item=self.removed_level._na_value)
- return lev.rename(self.removed_name)
-
- stride = len(self.removed_level) + self.lift
- width = len(value_columns)
- propagator = np.repeat(np.arange(width), stride)
-
- new_levels: FrozenList | list[Index]
-
- if isinstance(value_columns, MultiIndex):
- # error: Cannot determine type of "__add__" [has-type]
- new_levels = value_columns.levels + ( # type: ignore[has-type]
- self.removed_level_full,
- )
- new_names = value_columns.names + (self.removed_name,)
-
- new_codes = [lab.take(propagator) for lab in value_columns.codes]
- else:
- new_levels = [
- value_columns,
- self.removed_level_full,
- ]
- new_names = [value_columns.name, self.removed_name]
- new_codes = [propagator]
-
- repeater = self._repeater
-
- # The entire level is then just a repetition of the single chunk:
- new_codes.append(np.tile(repeater, width))
- return MultiIndex(
- levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
- )
-
- @cache_readonly
- def _repeater(self) -> np.ndarray:
- # The two indices differ only if the unstacked level had unused items:
- if len(self.removed_level_full) != len(self.removed_level):
- # In this case, we remap the new codes to the original level:
- repeater = self.removed_level_full.get_indexer(self.removed_level)
- if self.lift:
- repeater = np.insert(repeater, 0, -1)
- else:
- # Otherwise, we just use each level item exactly once:
- stride = len(self.removed_level) + self.lift
- repeater = np.arange(stride) - self.lift
-
- return repeater
-
- @cache_readonly
- def new_index(self) -> MultiIndex:
- # Does not depend on values or value_columns
- result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
-
- # construct the new index
- if len(self.new_index_levels) == 1:
- level, level_codes = self.new_index_levels[0], result_codes[0]
- if (level_codes == -1).any():
- level = level.insert(len(level), level._na_value)
- return level.take(level_codes).rename(self.new_index_names[0])
-
- return MultiIndex(
- levels=self.new_index_levels,
- codes=result_codes,
- names=self.new_index_names,
- verify_integrity=False,
- )
-
-
-def _unstack_multiple(data, clocs, fill_value=None):
- if len(clocs) == 0:
- return data
-
- # NOTE: This doesn't deal with hierarchical columns yet
-
- index = data.index
-
- # GH 19966 Make sure if MultiIndexed index has tuple name, they will be
- # recognised as a whole
- if clocs in index.names:
- clocs = [clocs]
- clocs = [index._get_level_number(i) for i in clocs]
-
- rlocs = [i for i in range(index.nlevels) if i not in clocs]
-
- clevels = [index.levels[i] for i in clocs]
- ccodes = [index.codes[i] for i in clocs]
- cnames = [index.names[i] for i in clocs]
- rlevels = [index.levels[i] for i in rlocs]
- rcodes = [index.codes[i] for i in rlocs]
- rnames = [index.names[i] for i in rlocs]
-
- shape = tuple(len(x) for x in clevels)
- group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
-
- comp_ids, obs_ids = compress_group_index(group_index, sort=False)
- recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
-
- if not rlocs:
- # Everything is in clocs, so the dummy df has a regular index
- dummy_index = Index(obs_ids, name="__placeholder__")
- else:
- dummy_index = MultiIndex(
- levels=rlevels + [obs_ids],
- codes=rcodes + [comp_ids],
- names=rnames + ["__placeholder__"],
- verify_integrity=False,
- )
-
- if isinstance(data, Series):
- dummy = data.copy()
- dummy.index = dummy_index
-
- unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
- new_levels = clevels
- new_names = cnames
- new_codes = recons_codes
- else:
- if isinstance(data.columns, MultiIndex):
- result = data
- while clocs:
- val = clocs.pop(0)
- result = result.unstack(val, fill_value=fill_value)
- clocs = [v if v < val else v - 1 for v in clocs]
-
- return result
-
- # GH#42579 deep=False to avoid consolidating
- dummy = data.copy(deep=False)
- dummy.index = dummy_index
-
- unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
- if isinstance(unstacked, Series):
- unstcols = unstacked.index
- else:
- unstcols = unstacked.columns
- assert isinstance(unstcols, MultiIndex) # for mypy
- new_levels = [unstcols.levels[0]] + clevels
- new_names = [data.columns.name] + cnames
-
- new_codes = [unstcols.codes[0]]
- for rec in recons_codes:
- new_codes.append(rec.take(unstcols.codes[-1]))
-
- new_columns = MultiIndex(
- levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
- )
-
- if isinstance(unstacked, Series):
- unstacked.index = new_columns
- else:
- unstacked.columns = new_columns
-
- return unstacked
-
-
-def unstack(obj: Series | DataFrame, level, fill_value=None):
- if isinstance(level, (tuple, list)):
- if len(level) != 1:
- # _unstack_multiple only handles MultiIndexes,
- # and isn't needed for a single level
- return _unstack_multiple(obj, level, fill_value=fill_value)
- else:
- level = level[0]
-
- if not is_integer(level) and not level == "__placeholder__":
- # check if level is valid in case of regular index
- obj.index._get_level_number(level)
-
- if isinstance(obj, DataFrame):
- if isinstance(obj.index, MultiIndex):
- return _unstack_frame(obj, level, fill_value=fill_value)
- else:
- return obj.T.stack(dropna=False)
- elif not isinstance(obj.index, MultiIndex):
- # GH 36113
- # Give nicer error messages when unstack a Series whose
- # Index is not a MultiIndex.
- raise ValueError(
- f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
- )
- else:
- if is_1d_only_ea_dtype(obj.dtype):
- return _unstack_extension_series(obj, level, fill_value)
- unstacker = _Unstacker(
- obj.index, level=level, constructor=obj._constructor_expanddim
- )
- return unstacker.get_result(
- obj._values, value_columns=None, fill_value=fill_value
- )
-
-
-def _unstack_frame(obj: DataFrame, level, fill_value=None):
- assert isinstance(obj.index, MultiIndex) # checked by caller
- unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor)
-
- if not obj._can_fast_transpose:
- mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
- return obj._constructor(mgr)
- else:
- return unstacker.get_result(
- obj._values, value_columns=obj.columns, fill_value=fill_value
- )
-
-
-def _unstack_extension_series(series: Series, level, fill_value) -> DataFrame:
- """
- Unstack an ExtensionArray-backed Series.
-
- The ExtensionDtype is preserved.
-
- Parameters
- ----------
- series : Series
- A Series with an ExtensionArray for values
- level : Any
- The level name or number.
- fill_value : Any
- The user-level (not physical storage) fill value to use for
- missing values introduced by the reshape. Passed to
- ``series.values.take``.
-
- Returns
- -------
- DataFrame
- Each column of the DataFrame will have the same dtype as
- the input Series.
- """
- # Defer to the logic in ExtensionBlock._unstack
- df = series.to_frame()
- result = df.unstack(level=level, fill_value=fill_value)
-
- # equiv: result.droplevel(level=0, axis=1)
- # but this avoids an extra copy
- result.columns = result.columns.droplevel(0)
- return result
-
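A brief sketch of the extension-array path above: unstacking a Series backed by a nullable dtype keeps that dtype in every resulting column (assumes a pandas build with the "Int64" nullable dtype):

import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"])
s = pd.Series([1, 2, 3, 4], index=idx, dtype="Int64")

result = s.unstack(level="inner")
print(result.dtypes)  # columns stay Int64 instead of being cast to float
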
-
-def stack(frame: DataFrame, level=-1, dropna: bool = True):
- """
- Convert DataFrame to Series with multi-level Index. Columns become the
- second level of the resulting hierarchical index.
-
- Returns
- -------
- stacked : Series or DataFrame
- """
-
- def factorize(index):
- if index.is_unique:
- return index, np.arange(len(index))
- codes, categories = factorize_from_iterable(index)
- return categories, codes
-
- N, K = frame.shape
-
- # Will also convert negative level numbers and check if out of bounds.
- level_num = frame.columns._get_level_number(level)
-
- if isinstance(frame.columns, MultiIndex):
- return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
- elif isinstance(frame.index, MultiIndex):
- new_levels = list(frame.index.levels)
- new_codes = [lab.repeat(K) for lab in frame.index.codes]
-
- clev, clab = factorize(frame.columns)
- new_levels.append(clev)
- new_codes.append(np.tile(clab, N).ravel())
-
- new_names = list(frame.index.names)
- new_names.append(frame.columns.name)
- new_index = MultiIndex(
- levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
- )
- else:
- levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
- codes = ilab.repeat(K), np.tile(clab, N).ravel()
- new_index = MultiIndex(
- levels=levels,
- codes=codes,
- names=[frame.index.name, frame.columns.name],
- verify_integrity=False,
- )
-
- if not frame.empty and frame._is_homogeneous_type:
- # For homogeneous EAs, frame._values will coerce to object. So
- # we concatenate instead.
- dtypes = list(frame.dtypes._values)
- dtype = dtypes[0]
-
- if is_extension_array_dtype(dtype):
- arr = dtype.construct_array_type()
- new_values = arr._concat_same_type(
- [col._values for _, col in frame.items()]
- )
- new_values = _reorder_for_extension_array_stack(new_values, N, K)
- else:
- # homogeneous, non-EA
- new_values = frame._values.ravel()
-
- else:
- # non-homogeneous
- new_values = frame._values.ravel()
-
- if dropna:
- mask = notna(new_values)
- new_values = new_values[mask]
- new_index = new_index[mask]
-
- return frame._constructor_sliced(new_values, index=new_index)
-
-
-def stack_multiple(frame, level, dropna: bool = True):
- # If all passed levels match up to column names, no
- # ambiguity about what to do
- if all(lev in frame.columns.names for lev in level):
- result = frame
- for lev in level:
- result = stack(result, lev, dropna=dropna)
-
- # Otherwise, level numbers may change as each successive level is stacked
- elif all(isinstance(lev, int) for lev in level):
- # As each stack is done, the level numbers decrease, so we need
- # to account for that when level is a sequence of ints
- result = frame
- # _get_level_number() checks level numbers are in range and converts
- # negative numbers to positive
- level = [frame.columns._get_level_number(lev) for lev in level]
-
- while level:
- lev = level.pop(0)
- result = stack(result, lev, dropna=dropna)
- # Decrement all level numbers greater than current, as these
- # have now shifted down by one
- level = [v if v <= lev else v - 1 for v in level]
-
- else:
- raise ValueError(
- "level should contain all level names or all level "
- "numbers, not a mixture of the two."
- )
-
- return result
-
-
-def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
- """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
- if len(columns.levels) <= 2:
- return columns.levels[0]._rename(name=columns.names[0])
-
- levs = [
- [lev[c] if c >= 0 else None for c in codes]
- for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
- ]
-
- # Remove duplicate tuples in the MultiIndex.
- tuples = zip(*levs)
- unique_tuples = (key for key, _ in itertools.groupby(tuples))
- new_levs = zip(*unique_tuples)
-
- # The dtype of each level must be explicitly set to avoid inferring the wrong type.
- # See GH-36991.
- return MultiIndex.from_arrays(
- [
- # Not all indices can accept None values.
- Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
- for new_lev, lev in zip(new_levs, columns.levels)
- ],
- names=columns.names[:-1],
- )
-
-
-def _stack_multi_columns(
- frame: DataFrame, level_num: int = -1, dropna: bool = True
-) -> DataFrame:
- def _convert_level_number(level_num: int, columns: Index):
- """
- Logic for converting the level number to something we can safely pass
- to swaplevel.
-
- If `level_num` matches a column name return the name from
- position `level_num`, otherwise return `level_num`.
- """
- if level_num in columns.names:
- return columns.names[level_num]
-
- return level_num
-
- this = frame.copy(deep=False)
- mi_cols = this.columns # cast(MultiIndex, this.columns)
- assert isinstance(mi_cols, MultiIndex) # caller is responsible
-
- # this makes life much simpler
- if level_num != mi_cols.nlevels - 1:
- # roll levels to put selected level at end
- roll_columns = mi_cols
- for i in range(level_num, mi_cols.nlevels - 1):
- # Need to check if the ints conflict with level names
- lev1 = _convert_level_number(i, roll_columns)
- lev2 = _convert_level_number(i + 1, roll_columns)
- roll_columns = roll_columns.swaplevel(lev1, lev2)
- this.columns = mi_cols = roll_columns
-
- if not mi_cols._is_lexsorted():
- # Workaround the edge case where 0 is one of the column names,
- # which interferes with trying to sort based on the first
- # level
- level_to_sort = _convert_level_number(0, mi_cols)
- this = this.sort_index(level=level_to_sort, axis=1)
- mi_cols = this.columns
-
- mi_cols = cast(MultiIndex, mi_cols)
- new_columns = _stack_multi_column_index(mi_cols)
-
- # time to ravel the values
- new_data = {}
- level_vals = mi_cols.levels[-1]
- level_codes = sorted(set(mi_cols.codes[-1]))
- level_vals_nan = level_vals.insert(len(level_vals), None)
-
- level_vals_used = np.take(level_vals_nan, level_codes)
- levsize = len(level_codes)
- drop_cols = []
- for key in new_columns:
- try:
- loc = this.columns.get_loc(key)
- except KeyError:
- drop_cols.append(key)
- continue
-
- # can make more efficient?
- # we almost always return a slice
- # but if unsorted can get a boolean
- # indexer
- if not isinstance(loc, slice):
- slice_len = len(loc)
- else:
- slice_len = loc.stop - loc.start
-
- if slice_len != levsize:
- chunk = this.loc[:, this.columns[loc]]
- chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
- value_slice = chunk.reindex(columns=level_vals_used).values
- else:
- if frame._is_homogeneous_type and is_extension_array_dtype(
- frame.dtypes.iloc[0]
- ):
- # TODO(EA2D): won't need special case, can go through .values
- # paths below (might change to ._values)
- dtype = this[this.columns[loc]].dtypes.iloc[0]
- subset = this[this.columns[loc]]
-
- value_slice = dtype.construct_array_type()._concat_same_type(
- [x._values for _, x in subset.items()]
- )
- N, K = subset.shape
- idx = np.arange(N * K).reshape(K, N).T.ravel()
- value_slice = value_slice.take(idx)
-
- elif frame._is_mixed_type:
- value_slice = this[this.columns[loc]].values
- else:
- value_slice = this.values[:, loc]
-
- if value_slice.ndim > 1:
- # i.e. not extension
- value_slice = value_slice.ravel()
-
- new_data[key] = value_slice
-
- if len(drop_cols) > 0:
- new_columns = new_columns.difference(drop_cols)
-
- N = len(this)
-
- if isinstance(this.index, MultiIndex):
- new_levels = list(this.index.levels)
- new_names = list(this.index.names)
- new_codes = [lab.repeat(levsize) for lab in this.index.codes]
- else:
- old_codes, old_levels = factorize_from_iterable(this.index)
- new_levels = [old_levels]
- new_codes = [old_codes.repeat(levsize)]
- new_names = [this.index.name] # something better?
-
- new_levels.append(level_vals)
- new_codes.append(np.tile(level_codes, N))
- new_names.append(frame.columns.names[level_num])
-
- new_index = MultiIndex(
- levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
- )
-
- result = frame._constructor(new_data, index=new_index, columns=new_columns)
-
- # more efficient way to go about this? can do the whole masking biz but
- # will only save a small amount of time...
- if dropna:
- result = result.dropna(axis=0, how="all")
-
- return result
-
-
-def _reorder_for_extension_array_stack(
- arr: ExtensionArray, n_rows: int, n_columns: int
-) -> ExtensionArray:
- """
- Re-orders the values when stacking multiple extension-arrays.
-
- The indirect stacking method used for EAs requires a followup
- take to get the order correct.
-
- Parameters
- ----------
- arr : ExtensionArray
- n_rows, n_columns : int
- The number of rows and columns in the original DataFrame.
-
- Returns
- -------
- taken : ExtensionArray
- The original `arr` with elements re-ordered appropriately
-
- Examples
- --------
- >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
- >>> _reorder_for_extension_array_stack(arr, 2, 3)
- array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')
-
- >>> _reorder_for_extension_array_stack(arr, 3, 2)
- array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
- """
- # final take to get the order correct.
- # idx is an indexer like
- # [c0r0, c1r0, c2r0, ...,
- # c0r1, c1r1, c2r1, ...]
- idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
- return arr.take(idx)
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/tile.py b/contrib/python/pandas/py3/pandas/core/reshape/tile.py
deleted file mode 100644
index 267abdb8d01..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/tile.py
+++ /dev/null
@@ -1,651 +0,0 @@
-"""
-Quantilization functions and related utilities
-"""
-from __future__ import annotations
-
-from typing import (
- Any,
- Callable,
- Literal,
-)
-
-import numpy as np
-
-from pandas._libs import (
- Timedelta,
- Timestamp,
-)
-from pandas._libs.lib import infer_dtype
-from pandas._typing import IntervalLeftRight
-
-from pandas.core.dtypes.common import (
- DT64NS_DTYPE,
- ensure_platform_int,
- is_bool_dtype,
- is_categorical_dtype,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_datetime_or_timedelta_dtype,
- is_extension_array_dtype,
- is_integer,
- is_list_like,
- is_numeric_dtype,
- is_scalar,
- is_timedelta64_dtype,
-)
-from pandas.core.dtypes.generic import ABCSeries
-from pandas.core.dtypes.missing import isna
-
-from pandas import (
- Categorical,
- Index,
- IntervalIndex,
- to_datetime,
- to_timedelta,
-)
-from pandas.core import nanops
-import pandas.core.algorithms as algos
-
-
-def cut(
- x,
- bins,
- right: bool = True,
- labels=None,
- retbins: bool = False,
- precision: int = 3,
- include_lowest: bool = False,
- duplicates: str = "raise",
- ordered: bool = True,
-):
- """
- Bin values into discrete intervals.
-
- Use `cut` when you need to segment and sort data values into bins. This
- function is also useful for going from a continuous variable to a
- categorical variable. For example, `cut` could convert ages to groups of
- age ranges. Supports binning into an equal number of bins, or a
- pre-specified array of bins.
-
- Parameters
- ----------
- x : array-like
- The input array to be binned. Must be 1-dimensional.
- bins : int, sequence of scalars, or IntervalIndex
- The criteria to bin by.
-
- * int : Defines the number of equal-width bins in the range of `x`. The
- range of `x` is extended by .1% on each side to include the minimum
- and maximum values of `x`.
- * sequence of scalars : Defines the bin edges allowing for non-uniform
- width. No extension of the range of `x` is done.
- * IntervalIndex : Defines the exact bins to be used. Note that
- IntervalIndex for `bins` must be non-overlapping.
-
- right : bool, default True
- Indicates whether `bins` includes the rightmost edge or not. If
- ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
- indicate (1,2], (2,3], (3,4]. This argument is ignored when
- `bins` is an IntervalIndex.
- labels : array or False, default None
- Specifies the labels for the returned bins. Must be the same length as
- the resulting bins. If False, returns only integer indicators of the
- bins. This affects the type of the output container (see below).
- This argument is ignored when `bins` is an IntervalIndex. If True,
- raises an error. When `ordered=False`, labels must be provided.
- retbins : bool, default False
- Whether to return the bins or not. Useful when bins is provided
- as a scalar.
- precision : int, default 3
- The precision at which to store and display the bins labels.
- include_lowest : bool, default False
- Whether the first interval should be left-inclusive or not.
-    duplicates : {'raise', 'drop'}, default 'raise'
- If bin edges are not unique, raise ValueError or drop non-uniques.
- ordered : bool, default True
- Whether the labels are ordered or not. Applies to returned types
- Categorical and Series (with Categorical dtype). If True,
- the resulting categorical will be ordered. If False, the resulting
- categorical will be unordered (labels must be provided).
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- out : Categorical, Series, or ndarray
- An array-like object representing the respective bin for each value
- of `x`. The type depends on the value of `labels`.
-
- * None (default) : returns a Series for Series `x` or a
- Categorical for all other inputs. The values stored within
- are Interval dtype.
-
- * sequence of scalars : returns a Series for Series `x` or a
- Categorical for all other inputs. The values stored within
- are whatever the type in the sequence is.
-
- * False : returns an ndarray of integers.
-
- bins : numpy.ndarray or IntervalIndex.
- The computed or specified bins. Only returned when `retbins=True`.
- For scalar or sequence `bins`, this is an ndarray with the computed
-        bins. If `duplicates='drop'` is set, non-unique bin edges are dropped. For
- an IntervalIndex `bins`, this is equal to `bins`.
-
- See Also
- --------
- qcut : Discretize variable into equal-sized buckets based on rank
- or based on sample quantiles.
- Categorical : Array type for storing data that come from a
- fixed set of values.
- Series : One-dimensional array with axis labels (including time series).
- IntervalIndex : Immutable Index implementing an ordered, sliceable set.
-
- Notes
- -----
- Any NA values will be NA in the result. Out of bounds values will be NA in
- the resulting Series or Categorical object.
-
- Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.
-
- Examples
- --------
- Discretize into three equal-sized bins.
-
- >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
- ... # doctest: +ELLIPSIS
- [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
- Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
-
- >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
- ... # doctest: +ELLIPSIS
- ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
- Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
- array([0.994, 3. , 5. , 7. ]))
-
-    Discovers the same bins, but assigns them specific labels. Notice that
-    the returned Categorical's categories are `labels` and that it is ordered.
-
- >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
- ... 3, labels=["bad", "medium", "good"])
- ['bad', 'good', 'medium', 'medium', 'good', 'bad']
- Categories (3, object): ['bad' < 'medium' < 'good']
-
- ``ordered=False`` will result in unordered categories when labels are passed.
- This parameter can be used to allow non-unique labels:
-
- >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
- ... labels=["B", "A", "B"], ordered=False)
- ['B', 'B', 'A', 'A', 'B', 'B']
- Categories (2, object): ['A', 'B']
-
- ``labels=False`` implies you just want the bins back.
-
- >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
- array([0, 1, 1, 3])
-
- Passing a Series as an input returns a Series with categorical dtype:
-
- >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
- ... index=['a', 'b', 'c', 'd', 'e'])
- >>> pd.cut(s, 3)
- ... # doctest: +ELLIPSIS
- a (1.992, 4.667]
- b (1.992, 4.667]
- c (4.667, 7.333]
- d (7.333, 10.0]
- e (7.333, 10.0]
- dtype: category
- Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...
-
-    Passing a Series as input returns a Series with the mapped values.
-    This can be used to map values numerically to intervals based on bins.
-
- >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
- ... index=['a', 'b', 'c', 'd', 'e'])
- >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
- ... # doctest: +ELLIPSIS
- (a 1.0
- b 2.0
- c 3.0
- d 4.0
- e NaN
- dtype: float64,
- array([ 0, 2, 4, 6, 8, 10]))
-
-    Use the `drop` option when the bin edges are not unique.
-
- >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
- ... right=False, duplicates='drop')
- ... # doctest: +ELLIPSIS
- (a 1.0
- b 2.0
- c 3.0
- d 3.0
- e NaN
- dtype: float64,
- array([ 0, 2, 4, 6, 10]))
-
- Passing an IntervalIndex for `bins` results in those categories exactly.
- Notice that values not covered by the IntervalIndex are set to NaN. 0
- is to the left of the first bin (which is closed on the right), and 1.5
- falls between two bins.
-
- >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
- >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
- [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
- Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
- """
- # NOTE: this binning code is changed a bit from histogram for var(x) == 0
-
- original = x
- x = _preprocess_for_cut(x)
- x, dtype = _coerce_to_type(x)
-
- if not np.iterable(bins):
- if is_scalar(bins) and bins < 1:
- raise ValueError("`bins` should be a positive integer.")
-
- try: # for array-like
- sz = x.size
- except AttributeError:
- x = np.asarray(x)
- sz = x.size
-
- if sz == 0:
- raise ValueError("Cannot cut empty array")
-
- rng = (nanops.nanmin(x), nanops.nanmax(x))
- mn, mx = (mi + 0.0 for mi in rng)
-
- if np.isinf(mn) or np.isinf(mx):
- # GH 24314
- raise ValueError(
- "cannot specify integer `bins` when input data contains infinity"
- )
- if mn == mx: # adjust end points before binning
- mn -= 0.001 * abs(mn) if mn != 0 else 0.001
- mx += 0.001 * abs(mx) if mx != 0 else 0.001
- bins = np.linspace(mn, mx, bins + 1, endpoint=True)
- else: # adjust end points after binning
- bins = np.linspace(mn, mx, bins + 1, endpoint=True)
- adj = (mx - mn) * 0.001 # 0.1% of the range
- if right:
- bins[0] -= adj
- else:
- bins[-1] += adj
-
- elif isinstance(bins, IntervalIndex):
- if bins.is_overlapping:
- raise ValueError("Overlapping IntervalIndex is not accepted.")
-
- else:
- if is_datetime64tz_dtype(bins):
- bins = np.asarray(bins, dtype=DT64NS_DTYPE)
- else:
- bins = np.asarray(bins)
- bins = _convert_bin_to_numeric_type(bins, dtype)
-
- # GH 26045: cast to float64 to avoid an overflow
- if (np.diff(bins.astype("float64")) < 0).any():
- raise ValueError("bins must increase monotonically.")
-
- fac, bins = _bins_to_cuts(
- x,
- bins,
- right=right,
- labels=labels,
- precision=precision,
- include_lowest=include_lowest,
- dtype=dtype,
- duplicates=duplicates,
- ordered=ordered,
- )
-
- return _postprocess_for_cut(fac, bins, retbins, dtype, original)
-
-
-def qcut(
- x,
- q,
- labels=None,
- retbins: bool = False,
- precision: int = 3,
- duplicates: str = "raise",
-):
- """
- Quantile-based discretization function.
-
- Discretize variable into equal-sized buckets based on rank or based
- on sample quantiles. For example 1000 values for 10 quantiles would
- produce a Categorical object indicating quantile membership for each data point.
-
- Parameters
- ----------
- x : 1d ndarray or Series
- q : int or list-like of float
-        Number of quantiles: 10 for deciles, 4 for quartiles, etc. Alternatively,
-        an array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
- labels : array or False, default None
- Used as labels for the resulting bins. Must be of the same length as
- the resulting bins. If False, return only integer indicators of the
- bins. If True, raises an error.
- retbins : bool, optional
- Whether to return the (bins, labels) or not. Can be useful if bins
- is given as a scalar.
- precision : int, optional
- The precision at which to store and display the bins labels.
-    duplicates : {'raise', 'drop'}, default 'raise'
- If bin edges are not unique, raise ValueError or drop non-uniques.
-
- Returns
- -------
- out : Categorical or Series or array of integers if labels is False
- The return type (Categorical or Series) depends on the input: a Series
- of type category if input is a Series else Categorical. Bins are
- represented as categories when categorical data is returned.
- bins : ndarray of floats
- Returned only if `retbins` is True.
-
- Notes
- -----
- Out of bounds values will be NA in the resulting Categorical object
-
- Examples
- --------
- >>> pd.qcut(range(5), 4)
- ... # doctest: +ELLIPSIS
- [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
- Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...
-
- >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
- ... # doctest: +SKIP
- [good, good, medium, bad, bad]
- Categories (3, object): [good < medium < bad]
-
- >>> pd.qcut(range(5), 4, labels=False)
- array([0, 0, 1, 2, 3])
- """
- original = x
- x = _preprocess_for_cut(x)
- x, dtype = _coerce_to_type(x)
-
- quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
-
- x_np = np.asarray(x)
- x_np = x_np[~np.isnan(x_np)]
- bins = np.quantile(x_np, quantiles)
-
- fac, bins = _bins_to_cuts(
- x,
- bins,
- labels=labels,
- precision=precision,
- include_lowest=True,
- dtype=dtype,
- duplicates=duplicates,
- )
-
- return _postprocess_for_cut(fac, bins, retbins, dtype, original)
-
-
-def _bins_to_cuts(
- x,
- bins: np.ndarray,
- right: bool = True,
- labels=None,
- precision: int = 3,
- include_lowest: bool = False,
- dtype=None,
- duplicates: str = "raise",
- ordered: bool = True,
-):
- if not ordered and labels is None:
- raise ValueError("'labels' must be provided if 'ordered = False'")
-
- if duplicates not in ["raise", "drop"]:
- raise ValueError(
- "invalid value for 'duplicates' parameter, valid options are: raise, drop"
- )
-
- if isinstance(bins, IntervalIndex):
- # we have a fast-path here
- ids = bins.get_indexer(x)
- result = Categorical.from_codes(ids, categories=bins, ordered=True)
- return result, bins
-
- unique_bins = algos.unique(bins)
- if len(unique_bins) < len(bins) and len(bins) != 2:
- if duplicates == "raise":
- raise ValueError(
- f"Bin edges must be unique: {repr(bins)}.\n"
- f"You can drop duplicate edges by setting the 'duplicates' kwarg"
- )
- bins = unique_bins
-
- side: Literal["left", "right"] = "left" if right else "right"
- ids = ensure_platform_int(bins.searchsorted(x, side=side))
-
- if include_lowest:
- ids[np.asarray(x) == bins[0]] = 1
-
- na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
- has_nas = na_mask.any()
-
- if labels is not False:
- if not (labels is None or is_list_like(labels)):
- raise ValueError(
- "Bin labels must either be False, None or passed in as a "
- "list-like argument"
- )
-
- if labels is None:
- labels = _format_labels(
- bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
- )
- elif ordered and len(set(labels)) != len(labels):
- raise ValueError(
- "labels must be unique if ordered=True; pass ordered=False "
- "for duplicate labels"
- )
- else:
- if len(labels) != len(bins) - 1:
- raise ValueError(
- "Bin labels must be one fewer than the number of bin edges"
- )
- if not is_categorical_dtype(labels):
- labels = Categorical(
- labels,
- categories=labels if len(set(labels)) == len(labels) else None,
- ordered=ordered,
- )
- # TODO: handle mismatch between categorical label order and pandas.cut order.
- np.putmask(ids, na_mask, 0)
- result = algos.take_nd(labels, ids - 1)
-
- else:
- result = ids - 1
- if has_nas:
- result = result.astype(np.float64)
- np.putmask(result, na_mask, np.nan)
-
- return result, bins
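-
-
-# A minimal, illustrative sketch (not from the pandas sources; the helper name
-# is arbitrary) of the ``searchsorted`` core used above: with right-closed
-# bins, values are placed with ``side="left"`` so that a value equal to an
-# edge falls into the interval ending at that edge.
-def _sketch_searchsorted_binning() -> None:
-    import numpy as np
-
-    edges = np.array([0.0, 1.0, 2.0, 3.0])
-    values = np.array([0.5, 1.0, 2.5])
-    ids = edges.searchsorted(values, side="left")
-    # ids are 1-based interval positions: (0, 1], (0, 1] and (2, 3].
-    assert list(ids) == [1, 1, 3]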
-
-
-def _coerce_to_type(x):
- """
-    if the passed data is of datetime/timedelta, bool or nullable integer type,
-    this method converts it to numeric so that the cut or qcut method can
-    handle it
- """
- dtype = None
-
- if is_datetime64tz_dtype(x.dtype):
- dtype = x.dtype
- elif is_datetime64_dtype(x.dtype):
- x = to_datetime(x).astype("datetime64[ns]", copy=False)
- dtype = np.dtype("datetime64[ns]")
- elif is_timedelta64_dtype(x.dtype):
- x = to_timedelta(x)
- dtype = np.dtype("timedelta64[ns]")
- elif is_bool_dtype(x.dtype):
- # GH 20303
- x = x.astype(np.int64)
- # To support cut and qcut for IntegerArray we convert to float dtype.
- # Will properly support in the future.
- # https://github.com/pandas-dev/pandas/pull/31290
- # https://github.com/pandas-dev/pandas/issues/31389
- elif is_extension_array_dtype(x.dtype) and is_numeric_dtype(x.dtype):
- x = x.to_numpy(dtype=np.float64, na_value=np.nan)
-
- if dtype is not None:
- # GH 19768: force NaT to NaN during integer conversion
- x = np.where(x.notna(), x.view(np.int64), np.nan)
-
- return x, dtype
-
-
-def _convert_bin_to_numeric_type(bins, dtype):
- """
-    if the passed bins are of datetime/timedelta type,
-    this method converts them to integers
-
- Parameters
- ----------
- bins : list-like of bins
- dtype : dtype of data
-
- Raises
- ------
-    ValueError if bins are not of a dtype compatible with dtype
- """
- bins_dtype = infer_dtype(bins, skipna=False)
- if is_timedelta64_dtype(dtype):
- if bins_dtype in ["timedelta", "timedelta64"]:
- bins = to_timedelta(bins).view(np.int64)
- else:
- raise ValueError("bins must be of timedelta64 dtype")
- elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
- if bins_dtype in ["datetime", "datetime64"]:
- bins = to_datetime(bins)
- if is_datetime64_dtype(bins):
- # As of 2.0, to_datetime may give non-nano, so we need to convert
- # here until the rest of this file recognizes non-nano
- bins = bins.astype("datetime64[ns]", copy=False)
- bins = bins.view(np.int64)
- else:
- raise ValueError("bins must be of datetime64 dtype")
-
- return bins
-
-
-def _convert_bin_to_datelike_type(bins, dtype):
- """
- Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
- datelike
-
- Parameters
- ----------
- bins : list-like of bins
- dtype : dtype of data
-
- Returns
- -------
- bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
- datelike
- """
- if is_datetime64tz_dtype(dtype):
- bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
- elif is_datetime_or_timedelta_dtype(dtype):
- bins = Index(bins.astype(np.int64), dtype=dtype)
- return bins
-
-
-def _format_labels(
- bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None
-):
- """based on the dtype, return our labels"""
- closed: IntervalLeftRight = "right" if right else "left"
-
- formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
-
- if is_datetime64tz_dtype(dtype):
- formatter = lambda x: Timestamp(x, tz=dtype.tz)
- adjust = lambda x: x - Timedelta("1ns")
- elif is_datetime64_dtype(dtype):
- formatter = Timestamp
- adjust = lambda x: x - Timedelta("1ns")
- elif is_timedelta64_dtype(dtype):
- formatter = Timedelta
- adjust = lambda x: x - Timedelta("1ns")
- else:
- precision = _infer_precision(precision, bins)
- formatter = lambda x: _round_frac(x, precision)
- adjust = lambda x: x - 10 ** (-precision)
-
- breaks = [formatter(b) for b in bins]
- if right and include_lowest:
- # adjust lhs of first interval by precision to account for being right closed
- breaks[0] = adjust(breaks[0])
-
- return IntervalIndex.from_breaks(breaks, closed=closed)
-
-
-def _preprocess_for_cut(x):
- """
- handles preprocessing for cut where we convert passed
- input to array, strip the index information and store it
- separately
- """
- # Check that the passed array is a Pandas or Numpy object
- # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
- ndim = getattr(x, "ndim", None)
- if ndim is None:
- x = np.asarray(x)
- if x.ndim != 1:
- raise ValueError("Input array must be 1 dimensional")
-
- return x
-
-
-def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original):
- """
- handles post processing for the cut method where
- we combine the index information if the originally passed
- datatype was a series
- """
- if isinstance(original, ABCSeries):
- fac = original._constructor(fac, index=original.index, name=original.name)
-
- if not retbins:
- return fac
-
- bins = _convert_bin_to_datelike_type(bins, dtype)
-
- return fac, bins
-
-
-def _round_frac(x, precision: int):
- """
- Round the fractional part of the given number
- """
- if not np.isfinite(x) or x == 0:
- return x
- else:
- frac, whole = np.modf(x)
- if whole == 0:
- digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
- else:
- digits = precision
- return np.around(x, digits)
-
-
-def _infer_precision(base_precision: int, bins) -> int:
- """
- Infer an appropriate precision for _round_frac
- """
- for precision in range(base_precision, 20):
- levels = [_round_frac(b, precision) for b in bins]
- if algos.unique(levels).size == bins.size:
- return precision
- return base_precision # default
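-
-
-# A minimal, illustrative sketch (not from the pandas sources; the helper name
-# is arbitrary, and np.round stands in for _round_frac): keep adding decimal
-# places until the rounded bin edges are all distinct.
-def _sketch_precision_search() -> None:
-    import numpy as np
-
-    bins = np.array([0.1231, 0.1232, 0.5])
-    for precision in range(3, 20):
-        levels = np.round(bins, precision)
-        if np.unique(levels).size == bins.size:
-            break
-    # Three decimals collapse the first two edges; four keep them distinct.
-    assert precision == 4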
diff --git a/contrib/python/pandas/py3/pandas/core/reshape/util.py b/contrib/python/pandas/py3/pandas/core/reshape/util.py
deleted file mode 100644
index a92b439927f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/reshape/util.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._typing import NumpyIndexT
-
-from pandas.core.dtypes.common import is_list_like
-
-
-def cartesian_product(X) -> list[np.ndarray]:
- """
- Numpy version of itertools.product.
- Sometimes faster (for large inputs)...
-
- Parameters
- ----------
- X : list-like of list-likes
-
- Returns
- -------
- product : list of ndarrays
-
- Examples
- --------
- >>> cartesian_product([list('ABC'), [1, 2]])
- [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'), array([1, 2, 1, 2, 1, 2])]
-
- See Also
- --------
- itertools.product : Cartesian product of input iterables. Equivalent to
- nested for-loops.
- """
- msg = "Input must be a list-like of list-likes"
- if not is_list_like(X):
- raise TypeError(msg)
- for x in X:
- if not is_list_like(x):
- raise TypeError(msg)
-
- if len(X) == 0:
- return []
-
- lenX = np.fromiter((len(x) for x in X), dtype=np.intp)
- cumprodX = np.cumprod(lenX)
-
- if np.any(cumprodX < 0):
- raise ValueError("Product space too large to allocate arrays!")
-
- a = np.roll(cumprodX, 1)
- a[0] = 1
-
- if cumprodX[-1] != 0:
- b = cumprodX[-1] / cumprodX
- else:
- # if any factor is empty, the cartesian product is empty
- b = np.zeros_like(cumprodX)
-
- # error: Argument of type "int_" cannot be assigned to parameter "num" of
- # type "int" in function "tile_compat"
- return [
- tile_compat(
- np.repeat(x, b[i]),
- np.prod(a[i]), # pyright: ignore[reportGeneralTypeIssues]
- )
- for i, x in enumerate(X)
- ]
-
-
-def tile_compat(arr: NumpyIndexT, num: int) -> NumpyIndexT:
- """
- Index compat for np.tile.
-
- Notes
- -----
- Does not support multi-dimensional `num`.
- """
- if isinstance(arr, np.ndarray):
- return np.tile(arr, num)
-
- # Otherwise we have an Index
- taker = np.tile(np.arange(len(arr)), num)
- return arr.take(taker)
diff --git a/contrib/python/pandas/py3/pandas/core/roperator.py b/contrib/python/pandas/py3/pandas/core/roperator.py
deleted file mode 100644
index 2f320f4e9c6..00000000000
--- a/contrib/python/pandas/py3/pandas/core/roperator.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""
-Reversed Operations not available in the stdlib operator module.
-Defining these instead of using lambdas allows us to reference them by name.
-"""
-from __future__ import annotations
-
-import operator
-
-
-def radd(left, right):
- return right + left
-
-
-def rsub(left, right):
- return right - left
-
-
-def rmul(left, right):
- return right * left
-
-
-def rdiv(left, right):
- return right / left
-
-
-def rtruediv(left, right):
- return right / left
-
-
-def rfloordiv(left, right):
- return right // left
-
-
-def rmod(left, right):
-    # If right is a string, % would be the string formatting
-    # operation, so raise a TypeError;
-    # otherwise perform the op
- if isinstance(right, str):
- typ = type(left).__name__
- raise TypeError(f"{typ} cannot perform the operation mod")
-
- return right % left
-
-
-def rdivmod(left, right):
- return divmod(right, left)
-
-
-def rpow(left, right):
- return right**left
-
-
-def rand_(left, right):
- return operator.and_(right, left)
-
-
-def ror_(left, right):
- return operator.or_(right, left)
-
-
-def rxor(left, right):
- return operator.xor(right, left)
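-
-
-# A minimal, illustrative sketch (not from the pandas sources; the helper name
-# is arbitrary) using the helpers defined above: the "r" variants simply swap
-# the operands, which matters for non-commutative operations.
-def _sketch_reversed_ops() -> None:
-    assert rsub(2, 10) == 8  # evaluates 10 - 2
-    assert rtruediv(4, 2) == 0.5  # evaluates 2 / 4
-    assert rpow(3, 2) == 8  # evaluates 2 ** 3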
diff --git a/contrib/python/pandas/py3/pandas/core/sample.py b/contrib/python/pandas/py3/pandas/core/sample.py
deleted file mode 100644
index a9b236b58a9..00000000000
--- a/contrib/python/pandas/py3/pandas/core/sample.py
+++ /dev/null
@@ -1,153 +0,0 @@
-"""
-Module containing utilities for NDFrame.sample() and .GroupBy.sample()
-"""
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import AxisInt
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-if TYPE_CHECKING:
- from pandas.core.generic import NDFrame
-
-
-def preprocess_weights(obj: NDFrame, weights, axis: AxisInt) -> np.ndarray:
- """
- Process and validate the `weights` argument to `NDFrame.sample` and
- `.GroupBy.sample`.
-
- Returns `weights` as an ndarray[np.float64], validated except for normalizing
- weights (because that must be done groupwise in groupby sampling).
- """
- # If a series, align with frame
- if isinstance(weights, ABCSeries):
- weights = weights.reindex(obj.axes[axis])
-
- # Strings acceptable if a dataframe and axis = 0
- if isinstance(weights, str):
- if isinstance(obj, ABCDataFrame):
- if axis == 0:
- try:
- weights = obj[weights]
- except KeyError as err:
- raise KeyError(
- "String passed to weights not a valid column"
- ) from err
- else:
- raise ValueError(
- "Strings can only be passed to "
- "weights when sampling from rows on "
- "a DataFrame"
- )
- else:
- raise ValueError(
- "Strings cannot be passed as weights when sampling from a Series."
- )
-
- if isinstance(obj, ABCSeries):
- func = obj._constructor
- else:
- func = obj._constructor_sliced
-
- weights = func(weights, dtype="float64")._values
-
- if len(weights) != obj.shape[axis]:
- raise ValueError("Weights and axis to be sampled must be of same length")
-
- if lib.has_infs(weights):
- raise ValueError("weight vector may not include `inf` values")
-
- if (weights < 0).any():
-        raise ValueError("weight vector may not include negative values")
-
- missing = np.isnan(weights)
- if missing.any():
- # Don't modify weights in place
- weights = weights.copy()
- weights[missing] = 0
- return weights
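-
-
-# A minimal, illustrative sketch (not from the pandas sources; the helper name
-# is arbitrary) of the behaviour above via the public API: a column name
-# passed as ``weights`` is resolved to that column, and missing weights count
-# as zero.
-def _sketch_column_name_weights() -> None:
-    import numpy as np
-    import pandas as pd
-
-    df = pd.DataFrame({"x": [10, 20, 30], "w": [1.0, np.nan, 3.0]})
-    sampled = df.sample(n=2, weights="w", random_state=0)
-    # The NaN weight is treated as zero, so the middle row is never drawn.
-    assert 20 not in sampled["x"].values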
-
-
-def process_sampling_size(
- n: int | None, frac: float | None, replace: bool
-) -> int | None:
- """
- Process and validate the `n` and `frac` arguments to `NDFrame.sample` and
- `.GroupBy.sample`.
-
- Returns None if `frac` should be used (variable sampling sizes), otherwise returns
- the constant sampling size.
- """
- # If no frac or n, default to n=1.
- if n is None and frac is None:
- n = 1
- elif n is not None and frac is not None:
- raise ValueError("Please enter a value for `frac` OR `n`, not both")
- elif n is not None:
- if n < 0:
- raise ValueError(
- "A negative number of rows requested. Please provide `n` >= 0."
- )
- if n % 1 != 0:
- raise ValueError("Only integers accepted as `n` values")
- else:
- assert frac is not None # for mypy
- if frac > 1 and not replace:
- raise ValueError(
- "Replace has to be set to `True` when "
- "upsampling the population `frac` > 1."
- )
- if frac < 0:
- raise ValueError(
- "A negative number of rows requested. Please provide `frac` >= 0."
- )
-
- return n
-
-
-def sample(
- obj_len: int,
- size: int,
- replace: bool,
- weights: np.ndarray | None,
- random_state: np.random.RandomState | np.random.Generator,
-) -> np.ndarray:
- """
- Randomly sample `size` indices in `np.arange(obj_len)`
-
- Parameters
- ----------
- obj_len : int
- The length of the indices being considered
- size : int
- The number of values to choose
- replace : bool
- Allow or disallow sampling of the same row more than once.
- weights : np.ndarray[np.float64] or None
-        If None, equal probability weighting, otherwise sampling weights
-        according to the normalized vector
-    random_state : np.random.RandomState or np.random.Generator
- State used for the random sampling
-
- Returns
- -------
- np.ndarray[np.intp]
- """
- if weights is not None:
- weight_sum = weights.sum()
- if weight_sum != 0:
- weights = weights / weight_sum
- else:
- raise ValueError("Invalid weights: weights sum to zero")
-
- return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(
- np.intp, copy=False
- )
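-
-
-# A minimal, illustrative sketch (not from the pandas sources; the helper name
-# is arbitrary) of the weighted draw above: normalize the weights and pass
-# them to ``Generator.choice`` as probabilities.
-def _sketch_weighted_choice() -> None:
-    import numpy as np
-
-    rng = np.random.default_rng(0)
-    weights = np.array([0.0, 1.0, 3.0])
-    picks = rng.choice(3, size=1000, replace=True, p=weights / weights.sum())
-    # Index 0 has zero weight, so it is never selected.
-    assert (picks != 0).all()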
diff --git a/contrib/python/pandas/py3/pandas/core/series.py b/contrib/python/pandas/py3/pandas/core/series.py
deleted file mode 100644
index 78f4da4e651..00000000000
--- a/contrib/python/pandas/py3/pandas/core/series.py
+++ /dev/null
@@ -1,6118 +0,0 @@
-"""
-Data structure for 1-dimensional cross-sectional and time series data
-"""
-from __future__ import annotations
-
-import sys
-from textwrap import dedent
-from typing import (
- IO,
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Iterable,
- Literal,
- Mapping,
- Sequence,
- Union,
- cast,
- overload,
-)
-import warnings
-import weakref
-
-import numpy as np
-
-from pandas._config import (
- get_option,
- using_copy_on_write,
-)
-
-from pandas._libs import (
- lib,
- properties,
- reshape,
-)
-from pandas._libs.internals import BlockValuesRefs
-from pandas._libs.lib import is_range_indexer
-from pandas._typing import (
- AggFuncType,
- AlignJoin,
- AnyAll,
- AnyArrayLike,
- ArrayLike,
- Axis,
- AxisInt,
- CorrelationMethod,
- DropKeep,
- Dtype,
- DtypeBackend,
- DtypeObj,
- FilePath,
- FillnaOptions,
- Frequency,
- IgnoreRaise,
- IndexKeyFunc,
- IndexLabel,
- Level,
- NaPosition,
- QuantileInterpolation,
- Renamer,
- Scalar,
- SingleManager,
- SortKind,
- StorageOptions,
- TimedeltaConvertibleTypes,
- TimestampConvertibleTypes,
- ValueKeyFunc,
- WriteBuffer,
- npt,
-)
-from pandas.compat import PYPY
-from pandas.compat.numpy import function as nv
-from pandas.errors import (
- ChainedAssignmentError,
- InvalidIndexError,
- _chained_assignment_msg,
-)
-from pandas.util._decorators import (
- Appender,
- Substitution,
- doc,
-)
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import (
- validate_ascending,
- validate_bool_kwarg,
- validate_percentile,
-)
-
-from pandas.core.dtypes.astype import astype_is_view
-from pandas.core.dtypes.cast import (
- LossySetitemError,
- convert_dtypes,
- maybe_box_native,
- maybe_cast_pointwise_result,
-)
-from pandas.core.dtypes.common import (
- ensure_platform_int,
- is_dict_like,
- is_extension_array_dtype,
- is_integer,
- is_iterator,
- is_list_like,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- pandas_dtype,
- validate_all_hashable,
-)
-from pandas.core.dtypes.generic import ABCDataFrame
-from pandas.core.dtypes.inference import is_hashable
-from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
- notna,
- remove_na_arraylike,
-)
-
-from pandas.core import (
- algorithms,
- base,
- common as com,
- missing,
- nanops,
- ops,
-)
-from pandas.core.accessor import CachedAccessor
-from pandas.core.apply import SeriesApply
-from pandas.core.arrays import ExtensionArray
-from pandas.core.arrays.categorical import CategoricalAccessor
-from pandas.core.arrays.sparse import SparseAccessor
-from pandas.core.construction import (
- extract_array,
- sanitize_array,
-)
-from pandas.core.generic import NDFrame
-from pandas.core.indexers import (
- disallow_ndim_indexing,
- unpack_1tuple,
-)
-from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
-from pandas.core.indexes.api import (
- DatetimeIndex,
- Index,
- MultiIndex,
- PeriodIndex,
- default_index,
- ensure_index,
-)
-import pandas.core.indexes.base as ibase
-from pandas.core.indexes.multi import maybe_droplevels
-from pandas.core.indexing import (
- check_bool_indexer,
- check_dict_or_set_indexers,
-)
-from pandas.core.internals import (
- SingleArrayManager,
- SingleBlockManager,
-)
-from pandas.core.methods import selectn
-from pandas.core.shared_docs import _shared_docs
-from pandas.core.sorting import (
- ensure_key_mapped,
- nargsort,
-)
-from pandas.core.strings.accessor import StringMethods
-from pandas.core.tools.datetimes import to_datetime
-
-import pandas.io.formats.format as fmt
-from pandas.io.formats.info import (
- INFO_DOCSTRING,
- SeriesInfo,
- series_sub_kwargs,
-)
-import pandas.plotting
-
-if TYPE_CHECKING:
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- Suffixes,
- )
-
- from pandas.core.frame import DataFrame
- from pandas.core.groupby.generic import SeriesGroupBy
- from pandas.core.resample import Resampler
-
-__all__ = ["Series"]
-
-_shared_doc_kwargs = {
- "axes": "index",
- "klass": "Series",
- "axes_single_arg": "{0 or 'index'}",
- "axis": """axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.""",
- "inplace": """inplace : bool, default False
- If True, performs operation inplace and returns None.""",
- "unique": "np.ndarray",
- "duplicated": "Series",
- "optional_by": "",
- "optional_mapper": "",
- "optional_reindex": """
-index : array-like, optional
- New labels for the index. Preferably an Index object to avoid
- duplicating data.
-axis : int or str, optional
- Unused.""",
- "replace_iloc": """
- This differs from updating with ``.loc`` or ``.iloc``, which require
- you to specify a location to update with some value.""",
-}
-
-
-def _coerce_method(converter):
- """
- Install the scalar coercion methods.
- """
-
- def wrapper(self):
- if len(self) == 1:
- warnings.warn(
- f"Calling {converter.__name__} on a single element Series is "
- "deprecated and will raise a TypeError in the future. "
- f"Use {converter.__name__}(ser.iloc[0]) instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return converter(self.iloc[0])
- raise TypeError(f"cannot convert the series to {converter}")
-
- wrapper.__name__ = f"__{converter.__name__}__"
- return wrapper
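-
-
-# A minimal, illustrative sketch (not from the pandas sources; the helper name
-# is arbitrary) of the coercion behaviour installed above: float()/int() only
-# work on a length-1 Series, and currently emit a FutureWarning.
-def _sketch_scalar_coercion() -> None:
-    import warnings
-
-    import pandas as pd
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", FutureWarning)
-        assert float(pd.Series([2.5])) == 2.5
-    try:
-        int(pd.Series([1, 2]))
-    except TypeError:
-        pass
-    else:
-        raise AssertionError("expected TypeError for a length-2 Series")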
-
-
-# ----------------------------------------------------------------------
-# Series class
-
-
-# error: Definition of "max" in base class "IndexOpsMixin" is incompatible with
-# definition in base class "NDFrame"
-# error: Definition of "min" in base class "IndexOpsMixin" is incompatible with
-# definition in base class "NDFrame"
-class Series(base.IndexOpsMixin, NDFrame): # type: ignore[misc]
- """
- One-dimensional ndarray with axis labels (including time series).
-
- Labels need not be unique but must be a hashable type. The object
- supports both integer- and label-based indexing and provides a host of
- methods for performing operations involving the index. Statistical
- methods from ndarray have been overridden to automatically exclude
- missing data (currently represented as NaN).
-
- Operations between Series (+, -, /, \\*, \\*\\*) align values based on their
-    associated index values; they need not be the same length. The result
- index will be the sorted union of the two indexes.
-
- Parameters
- ----------
- data : array-like, Iterable, dict, or scalar value
- Contains data stored in Series. If data is a dict, argument order is
- maintained.
- index : array-like or Index (1d)
- Values must be hashable and have the same length as `data`.
- Non-unique index values are allowed. Will default to
- RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like
- and index is None, then the keys in the data are used as the index. If the
- index is not None, the resulting Series is reindexed with the index values.
- dtype : str, numpy.dtype, or ExtensionDtype, optional
- Data type for the output Series. If not specified, this will be
- inferred from `data`.
- See the :ref:`user guide <basics.dtypes>` for more usages.
- name : Hashable, default None
- The name to give to the Series.
- copy : bool, default False
- Copy input data. Only affects Series or 1d ndarray input. See examples.
-
- Notes
- -----
- Please reference the :ref:`User Guide <basics.series>` for more information.
-
- Examples
- --------
- Constructing Series from a dictionary with an Index specified
-
- >>> d = {'a': 1, 'b': 2, 'c': 3}
- >>> ser = pd.Series(data=d, index=['a', 'b', 'c'])
- >>> ser
- a 1
- b 2
- c 3
- dtype: int64
-
- The keys of the dictionary match with the Index values, hence the Index
- values have no effect.
-
- >>> d = {'a': 1, 'b': 2, 'c': 3}
- >>> ser = pd.Series(data=d, index=['x', 'y', 'z'])
- >>> ser
- x NaN
- y NaN
- z NaN
- dtype: float64
-
-    Note that the Index is first built with the keys from the dictionary.
- After this the Series is reindexed with the given Index values, hence we
- get all NaN as a result.
-
- Constructing Series from a list with `copy=False`.
-
- >>> r = [1, 2]
- >>> ser = pd.Series(r, copy=False)
- >>> ser.iloc[0] = 999
- >>> r
- [1, 2]
- >>> ser
- 0 999
- 1 2
- dtype: int64
-
-    Due to the input data type, the Series has a `copy` of
-    the original data even though `copy=False`, so
-    the data is unchanged.
-
- Constructing Series from a 1d ndarray with `copy=False`.
-
- >>> r = np.array([1, 2])
- >>> ser = pd.Series(r, copy=False)
- >>> ser.iloc[0] = 999
- >>> r
- array([999, 2])
- >>> ser
- 0 999
- 1 2
- dtype: int64
-
-    Due to the input data type, the Series has a `view` on
-    the original data, so
-    the data is changed as well.
- """
-
- _typ = "series"
- _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
-
- _name: Hashable
- _metadata: list[str] = ["name"]
- _internal_names_set = {"index"} | NDFrame._internal_names_set
- _accessors = {"dt", "cat", "str", "sparse"}
- _hidden_attrs = (
- base.IndexOpsMixin._hidden_attrs | NDFrame._hidden_attrs | frozenset([])
- )
-
- # Override cache_readonly bc Series is mutable
- # error: Incompatible types in assignment (expression has type "property",
- # base class "IndexOpsMixin" defined the type as "Callable[[IndexOpsMixin], bool]")
- hasnans = property( # type: ignore[assignment]
- # error: "Callable[[IndexOpsMixin], bool]" has no attribute "fget"
- base.IndexOpsMixin.hasnans.fget, # type: ignore[attr-defined]
- doc=base.IndexOpsMixin.hasnans.__doc__,
- )
- _mgr: SingleManager
- div: Callable[[Series, Any], Series]
- rdiv: Callable[[Series, Any], Series]
-
- # ----------------------------------------------------------------------
- # Constructors
-
- def __init__(
- self,
- data=None,
- index=None,
- dtype: Dtype | None = None,
- name=None,
- copy: bool | None = None,
- fastpath: bool = False,
- ) -> None:
- if (
- isinstance(data, (SingleBlockManager, SingleArrayManager))
- and index is None
- and dtype is None
- and (copy is False or copy is None)
- ):
- if using_copy_on_write():
- data = data.copy(deep=False)
- # GH#33357 called with just the SingleBlockManager
- NDFrame.__init__(self, data)
- if fastpath:
- # e.g. from _box_col_values, skip validation of name
- object.__setattr__(self, "_name", name)
- else:
- self.name = name
- return
-
- if isinstance(data, (ExtensionArray, np.ndarray)):
- if copy is not False and using_copy_on_write():
- if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
- data = data.copy()
- if copy is None:
- copy = False
-
- # we are called internally, so short-circuit
- if fastpath:
- # data is a ndarray, index is defined
- if not isinstance(data, (SingleBlockManager, SingleArrayManager)):
- manager = get_option("mode.data_manager")
- if manager == "block":
- data = SingleBlockManager.from_array(data, index)
- elif manager == "array":
- data = SingleArrayManager.from_array(data, index)
- elif using_copy_on_write() and not copy:
- data = data.copy(deep=False)
- if copy:
- data = data.copy()
- # skips validation of the name
- object.__setattr__(self, "_name", name)
- NDFrame.__init__(self, data)
- return
-
- if isinstance(data, SingleBlockManager) and using_copy_on_write() and not copy:
- data = data.copy(deep=False)
-
- name = ibase.maybe_extract_name(name, data, type(self))
-
- if index is not None:
- index = ensure_index(index)
-
- if dtype is not None:
- dtype = self._validate_dtype(dtype)
-
- if data is None:
- index = index if index is not None else default_index(0)
- if len(index) or dtype is not None:
- data = na_value_for_dtype(pandas_dtype(dtype), compat=False)
- else:
- data = []
-
- if isinstance(data, MultiIndex):
- raise NotImplementedError(
- "initializing a Series from a MultiIndex is not supported"
- )
-
- refs = None
- if isinstance(data, Index):
- if dtype is not None:
- data = data.astype(dtype, copy=False)
-
- if using_copy_on_write():
- refs = data._references
- data = data._values
- else:
- # GH#24096 we need to ensure the index remains immutable
- data = data._values.copy()
- copy = False
-
- elif isinstance(data, np.ndarray):
- if len(data.dtype):
- # GH#13296 we are dealing with a compound dtype, which
- # should be treated as 2D
- raise ValueError(
- "Cannot construct a Series from an ndarray with "
- "compound dtype. Use DataFrame instead."
- )
- elif isinstance(data, Series):
- if index is None:
- index = data.index
- data = data._mgr.copy(deep=False)
- else:
- data = data.reindex(index, copy=copy)
- copy = False
- data = data._mgr
- elif is_dict_like(data):
- data, index = self._init_dict(data, index, dtype)
- dtype = None
- copy = False
- elif isinstance(data, (SingleBlockManager, SingleArrayManager)):
- if index is None:
- index = data.index
- elif not data.index.equals(index) or copy:
- # GH#19275 SingleBlockManager input should only be called
- # internally
- raise AssertionError(
- "Cannot pass both SingleBlockManager "
- "`data` argument and a different "
- "`index` argument. `copy` must be False."
- )
-
- elif isinstance(data, ExtensionArray):
- pass
- else:
- data = com.maybe_iterable_to_list(data)
- if is_list_like(data) and not len(data) and dtype is None:
- # GH 29405: Pre-2.0, this defaulted to float.
- dtype = np.dtype(object)
-
- if index is None:
- if not is_list_like(data):
- data = [data]
- index = default_index(len(data))
- elif is_list_like(data):
- com.require_length_match(data, index)
-
- # create/copy the manager
- if isinstance(data, (SingleBlockManager, SingleArrayManager)):
- if dtype is not None:
- data = data.astype(dtype=dtype, errors="ignore", copy=copy)
- elif copy:
- data = data.copy()
- else:
- data = sanitize_array(data, index, dtype, copy)
-
- manager = get_option("mode.data_manager")
- if manager == "block":
- data = SingleBlockManager.from_array(data, index, refs=refs)
- elif manager == "array":
- data = SingleArrayManager.from_array(data, index)
-
- NDFrame.__init__(self, data)
- self.name = name
- self._set_axis(0, index)
-
- def _init_dict(
- self, data, index: Index | None = None, dtype: DtypeObj | None = None
- ):
- """
- Derive the "_mgr" and "index" attributes of a new Series from a
- dictionary input.
-
- Parameters
- ----------
- data : dict or dict-like
- Data used to populate the new Series.
- index : Index or None, default None
- Index for the new Series: if None, use dict keys.
- dtype : np.dtype, ExtensionDtype, or None, default None
- The dtype for the new Series: if None, infer from data.
-
- Returns
- -------
- _data : BlockManager for the new Series
- index : index for the new Series
- """
- keys: Index | tuple
-
- # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
- # raises KeyError), so we iterate the entire dict, and align
- if data:
-            # GH#34717: the issue was using zip to extract keys and values from data;
-            # using generators negatively affects performance.
-            # Below is the new way of extracting the keys and values
-
- keys = tuple(data.keys())
- values = list(data.values()) # Generating list of values- faster way
- elif index is not None:
- # fastpath for Series(data=None). Just use broadcasting a scalar
- # instead of reindexing.
- if len(index) or dtype is not None:
- values = na_value_for_dtype(pandas_dtype(dtype), compat=False)
- else:
- values = []
- keys = index
- else:
- keys, values = default_index(0), []
-
- # Input is now list-like, so rely on "standard" construction:
- s = Series(values, index=keys, dtype=dtype)
-
- # Now we just make sure the order is respected, if any
- if data and index is not None:
- s = s.reindex(index, copy=False)
- return s._mgr, s.index
-
- # ----------------------------------------------------------------------
-
- @property
- def _constructor(self) -> Callable[..., Series]:
- return Series
-
- @property
- def _constructor_expanddim(self) -> Callable[..., DataFrame]:
- """
-        Used when a manipulation result has one dimension higher than the
-        original, such as Series.to_frame()
- """
- from pandas.core.frame import DataFrame
-
- return DataFrame
-
- # types
- @property
- def _can_hold_na(self) -> bool:
- return self._mgr._can_hold_na
-
- # ndarray compatibility
- @property
- def dtype(self) -> DtypeObj:
- """
- Return the dtype object of the underlying data.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3])
- >>> s.dtype
- dtype('int64')
- """
- return self._mgr.dtype
-
- @property
- def dtypes(self) -> DtypeObj:
- """
- Return the dtype object of the underlying data.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3])
- >>> s.dtypes
- dtype('int64')
- """
- # DataFrame compatibility
- return self.dtype
-
- @property
- def name(self) -> Hashable:
- """
- Return the name of the Series.
-
- The name of a Series becomes its index or column name if it is used
- to form a DataFrame. It is also used whenever displaying the Series
- using the interpreter.
-
- Returns
- -------
- label (hashable object)
- The name of the Series, also the column name if part of a DataFrame.
-
- See Also
- --------
- Series.rename : Sets the Series name when given a scalar input.
- Index.name : Corresponding Index property.
-
- Examples
- --------
- The Series name can be set initially when calling the constructor.
-
- >>> s = pd.Series([1, 2, 3], dtype=np.int64, name='Numbers')
- >>> s
- 0 1
- 1 2
- 2 3
- Name: Numbers, dtype: int64
- >>> s.name = "Integers"
- >>> s
- 0 1
- 1 2
- 2 3
- Name: Integers, dtype: int64
-
- The name of a Series within a DataFrame is its column name.
-
- >>> df = pd.DataFrame([[1, 2], [3, 4], [5, 6]],
- ... columns=["Odd Numbers", "Even Numbers"])
- >>> df
- Odd Numbers Even Numbers
- 0 1 2
- 1 3 4
- 2 5 6
- >>> df["Even Numbers"].name
- 'Even Numbers'
- """
- return self._name
-
- @name.setter
- def name(self, value: Hashable) -> None:
- validate_all_hashable(value, error_name=f"{type(self).__name__}.name")
- object.__setattr__(self, "_name", value)
-
- @property
- def values(self):
- """
- Return Series as ndarray or ndarray-like depending on the dtype.
-
- .. warning::
-
- We recommend using :attr:`Series.array` or
- :meth:`Series.to_numpy`, depending on whether you need
- a reference to the underlying data or a NumPy array.
-
- Returns
- -------
- numpy.ndarray or ndarray-like
-
- See Also
- --------
- Series.array : Reference to the underlying data.
- Series.to_numpy : A NumPy array representing the underlying data.
-
- Examples
- --------
- >>> pd.Series([1, 2, 3]).values
- array([1, 2, 3])
-
- >>> pd.Series(list('aabc')).values
- array(['a', 'a', 'b', 'c'], dtype=object)
-
- >>> pd.Series(list('aabc')).astype('category').values
- ['a', 'a', 'b', 'c']
- Categories (3, object): ['a', 'b', 'c']
-
- Timezone aware datetime data is converted to UTC:
-
- >>> pd.Series(pd.date_range('20130101', periods=3,
- ... tz='US/Eastern')).values
- array(['2013-01-01T05:00:00.000000000',
- '2013-01-02T05:00:00.000000000',
- '2013-01-03T05:00:00.000000000'], dtype='datetime64[ns]')
- """
- return self._mgr.external_values()
-
- @property
- def _values(self):
- """
- Return the internal repr of this data (defined by Block.interval_values).
-        These are the values as stored in the Block (ndarray or ExtensionArray
- depending on the Block class), with datetime64[ns] and timedelta64[ns]
- wrapped in ExtensionArrays to match Index._values behavior.
-
- Differs from the public ``.values`` for certain data types, because of
- historical backwards compatibility of the public attribute (e.g. period
- returns object ndarray and datetimetz a datetime64[ns] ndarray for
- ``.values`` while it returns an ExtensionArray for ``._values`` in those
- cases).
-
- Differs from ``.array`` in that this still returns the numpy array if
- the Block is backed by a numpy array (except for datetime64 and
- timedelta64 dtypes), while ``.array`` ensures to always return an
- ExtensionArray.
-
- Overview:
-
- dtype | values | _values | array |
- ----------- | ------------- | ------------- | ------------- |
- Numeric | ndarray | ndarray | PandasArray |
- Category | Categorical | Categorical | Categorical |
- dt64[ns] | ndarray[M8ns] | DatetimeArray | DatetimeArray |
- dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray |
- td64[ns] | ndarray[m8ns] | TimedeltaArray| ndarray[m8ns] |
- Period | ndarray[obj] | PeriodArray | PeriodArray |
- Nullable | EA | EA | EA |
-
- """
- return self._mgr.internal_values()
-
- @property
- def _references(self) -> BlockValuesRefs | None:
- if isinstance(self._mgr, SingleArrayManager):
- return None
- return self._mgr._block.refs
-
- # error: Decorated property not supported
- @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc]
- @property
- def array(self) -> ExtensionArray:
- return self._mgr.array_values()
-
- # ops
- def ravel(self, order: str = "C") -> ArrayLike:
- """
- Return the flattened underlying data as an ndarray or ExtensionArray.
-
- Returns
- -------
- numpy.ndarray or ExtensionArray
- Flattened data of the Series.
-
- See Also
- --------
- numpy.ndarray.ravel : Return a flattened array.
- """
- arr = self._values.ravel(order=order)
- if isinstance(arr, np.ndarray) and using_copy_on_write():
- arr.flags.writeable = False
- return arr
-
- def __len__(self) -> int:
- """
- Return the length of the Series.
- """
- return len(self._mgr)
-
- def view(self, dtype: Dtype | None = None) -> Series:
- """
- Create a new view of the Series.
-
- This function will return a new Series with a view of the same
- underlying values in memory, optionally reinterpreted with a new data
-        type. The new data type must preserve the same size in bytes so as not
-        to cause index misalignment.
-
- Parameters
- ----------
- dtype : data type
- Data type object or one of their string representations.
-
- Returns
- -------
- Series
- A new Series object as a view of the same data in memory.
-
- See Also
- --------
- numpy.ndarray.view : Equivalent numpy function to create a new view of
- the same data in memory.
-
- Notes
- -----
- Series are instantiated with ``dtype=float64`` by default. While
- ``numpy.ndarray.view()`` will return a view with the same data type as
- the original array, ``Series.view()`` (without specified dtype)
- will try using ``float64`` and may fail if the original data type size
- in bytes is not the same.
-
- Examples
- --------
- >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8')
- >>> s
- 0 -2
- 1 -1
- 2 0
- 3 1
- 4 2
- dtype: int8
-
- The 8 bit signed integer representation of `-1` is `0b11111111`, but
- the same bytes represent 255 if read as an 8 bit unsigned integer:
-
- >>> us = s.view('uint8')
- >>> us
- 0 254
- 1 255
- 2 0
- 3 1
- 4 2
- dtype: uint8
-
- The views share the same underlying values:
-
- >>> us[0] = 128
- >>> s
- 0 -128
- 1 -1
- 2 0
- 3 1
- 4 2
- dtype: int8
- """
- # self.array instead of self._values so we piggyback on PandasArray
- # implementation
- res_values = self.array.view(dtype)
- res_ser = self._constructor(res_values, index=self.index, copy=False)
- if isinstance(res_ser._mgr, SingleBlockManager) and using_copy_on_write():
- blk = res_ser._mgr._block
- blk.refs = cast("BlockValuesRefs", self._references)
- blk.refs.add_reference(blk) # type: ignore[arg-type]
- return res_ser.__finalize__(self, method="view")
-
- # ----------------------------------------------------------------------
- # NDArray Compat
- _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
-
- def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
- """
- Return the values as a NumPy array.
-
- Users should not call this directly. Rather, it is invoked by
- :func:`numpy.array` and :func:`numpy.asarray`.
-
- Parameters
- ----------
- dtype : str or numpy.dtype, optional
- The dtype to use for the resulting NumPy array. By default,
- the dtype is inferred from the data.
-
- Returns
- -------
- numpy.ndarray
- The values in the series converted to a :class:`numpy.ndarray`
- with the specified `dtype`.
-
- See Also
- --------
- array : Create a new array from data.
- Series.array : Zero-copy view to the array backing the Series.
- Series.to_numpy : Series method for similar behavior.
-
- Examples
- --------
- >>> ser = pd.Series([1, 2, 3])
- >>> np.asarray(ser)
- array([1, 2, 3])
-
- For timezone-aware data, the timezones may be retained with
- ``dtype='object'``
-
- >>> tzser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
- >>> np.asarray(tzser, dtype="object")
- array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
- Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
- dtype=object)
-
- Or the values may be localized to UTC and the tzinfo discarded with
- ``dtype='datetime64[ns]'``
-
- >>> np.asarray(tzser, dtype="datetime64[ns]") # doctest: +ELLIPSIS
- array(['1999-12-31T23:00:00.000000000', ...],
- dtype='datetime64[ns]')
- """
- values = self._values
- arr = np.asarray(values, dtype=dtype)
- if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype):
- arr = arr.view()
- arr.flags.writeable = False
- return arr
-
- # ----------------------------------------------------------------------
- # Unary Methods
-
- # coercion
- __float__ = _coerce_method(float)
- __int__ = _coerce_method(int)
-
- # ----------------------------------------------------------------------
-
- # indexers
- @property
- def axes(self) -> list[Index]:
- """
- Return a list of the row axis labels.
- """
- return [self.index]
-
- # ----------------------------------------------------------------------
- # Indexing Methods
-
- @Appender(NDFrame.take.__doc__)
- def take(self, indices, axis: Axis = 0, **kwargs) -> Series:
- nv.validate_take((), kwargs)
-
- indices = ensure_platform_int(indices)
-
- if (
- indices.ndim == 1
- and using_copy_on_write()
- and is_range_indexer(indices, len(self))
- ):
- return self.copy(deep=None)
-
- new_index = self.index.take(indices)
- new_values = self._values.take(indices)
-
- result = self._constructor(new_values, index=new_index, fastpath=True)
- return result.__finalize__(self, method="take")
-
- def _take_with_is_copy(self, indices, axis: Axis = 0) -> Series:
- """
- Internal version of the `take` method that sets the `_is_copy`
- attribute to keep track of the parent dataframe (used in indexing
- for the SettingWithCopyWarning). For Series this does the same
- as the public take (it never sets `_is_copy`).
-
- See the docstring of `take` for full explanation of the parameters.
- """
- return self.take(indices=indices, axis=axis)
-
- def _ixs(self, i: int, axis: AxisInt = 0) -> Any:
- """
- Return the i-th value or values in the Series by location.
-
- Parameters
- ----------
- i : int
-
- Returns
- -------
- scalar (int) or Series (slice, sequence)
- """
- return self._values[i]
-
- def _slice(self, slobj: slice | np.ndarray, axis: Axis = 0) -> Series:
- # axis kwarg is retained for compat with NDFrame method
- # _slice is *always* positional
- return self._get_values(slobj)
-
- def __getitem__(self, key):
- check_dict_or_set_indexers(key)
- key = com.apply_if_callable(key, self)
-
- if key is Ellipsis:
- return self
-
- key_is_scalar = is_scalar(key)
- if isinstance(key, (list, tuple)):
- key = unpack_1tuple(key)
-
- if is_integer(key) and self.index._should_fallback_to_positional:
- return self._values[key]
-
- elif key_is_scalar:
- return self._get_value(key)
-
- if is_hashable(key):
- # Otherwise index.get_value will raise InvalidIndexError
- try:
- # For labels that don't resolve as scalars like tuples and frozensets
- result = self._get_value(key)
-
- return result
-
- except (KeyError, TypeError, InvalidIndexError):
- # InvalidIndexError for e.g. generator
- # see test_series_getitem_corner_generator
- if isinstance(key, tuple) and isinstance(self.index, MultiIndex):
- # We still have the corner case where a tuple is a key
- # in the first level of our MultiIndex
- return self._get_values_tuple(key)
-
- if is_iterator(key):
- key = list(key)
-
- if com.is_bool_indexer(key):
- key = check_bool_indexer(self.index, key)
- key = np.asarray(key, dtype=bool)
- return self._get_values(key)
-
- return self._get_with(key)
-
- def _get_with(self, key):
- # other: fancy integer or otherwise
- if isinstance(key, slice):
- # _convert_slice_indexer to determine if this slice is positional
- # or label based, and if the latter, convert to positional
- slobj = self.index._convert_slice_indexer(key, kind="getitem")
- return self._slice(slobj)
- elif isinstance(key, ABCDataFrame):
- raise TypeError(
- "Indexing a Series with DataFrame is not "
- "supported, use the appropriate DataFrame column"
- )
- elif isinstance(key, tuple):
- return self._get_values_tuple(key)
-
- elif not is_list_like(key):
- # e.g. scalars that aren't recognized by lib.is_scalar, GH#32684
- return self.loc[key]
-
- if not isinstance(key, (list, np.ndarray, ExtensionArray, Series, Index)):
- key = list(key)
-
- if isinstance(key, Index):
- key_type = key.inferred_type
- else:
- key_type = lib.infer_dtype(key, skipna=False)
-
- # Note: The key_type == "boolean" case should be caught by the
- # com.is_bool_indexer check in __getitem__
- if key_type == "integer":
- # We need to decide whether to treat this as a positional indexer
- # (i.e. self.iloc) or label-based (i.e. self.loc)
- if not self.index._should_fallback_to_positional:
- return self.loc[key]
- else:
- return self.iloc[key]
-
- # handle the dup indexing case GH#4246
- return self.loc[key]
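- # Editorial sketch (illustrative, not part of the removed pandas source):
- # how the label-vs-positional decision above plays out via ``__getitem__``.
- # The integer fallback is version dependent and may warn in newer releases.
- # >>> ser = pd.Series([10, 20, 30], index=["a", "b", "c"])
- # >>> ser[["a", "c"]]        # list of labels -> label-based lookup
- # a    10
- # c    30
- # dtype: int64
- # >>> ser[0]                 # integer key, non-integer index -> positional fallback
- # 10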
-
- def _get_values_tuple(self, key: tuple):
- # mpl hackaround
- if com.any_none(*key):
- # mpl compat if we look up e.g. ser[:, np.newaxis];
- # see tests.series.timeseries.test_mpl_compat_hack
- # the asarray is needed to avoid returning a 2D DatetimeArray
- result = np.asarray(self._values[key])
- disallow_ndim_indexing(result)
- return result
-
- if not isinstance(self.index, MultiIndex):
- raise KeyError("key of type tuple not found and not a MultiIndex")
-
- # If key is contained, would have returned by now
- indexer, new_index = self.index.get_loc_level(key)
- new_ser = self._constructor(self._values[indexer], index=new_index, copy=False)
- if using_copy_on_write() and isinstance(indexer, slice):
- new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type]
- return new_ser.__finalize__(self)
-
- def _get_values(self, indexer: slice | npt.NDArray[np.bool_]) -> Series:
- new_mgr = self._mgr.getitem_mgr(indexer)
- return self._constructor(new_mgr).__finalize__(self)
-
- def _get_value(self, label, takeable: bool = False):
- """
- Quickly retrieve single value at passed index label.
-
- Parameters
- ----------
- label : object
- takeable : interpret the index as indexers, default False
-
- Returns
- -------
- scalar value
- """
- if takeable:
- return self._values[label]
-
- # Similar to Index.get_value, but we do not fall back to positional
- loc = self.index.get_loc(label)
-
- if is_integer(loc):
- return self._values[loc]
-
- if isinstance(self.index, MultiIndex):
- mi = self.index
- new_values = self._values[loc]
- if len(new_values) == 1 and mi.nlevels == 1:
- # If more than one level left, we can not return a scalar
- return new_values[0]
-
- new_index = mi[loc]
- new_index = maybe_droplevels(new_index, label)
- new_ser = self._constructor(
- new_values, index=new_index, name=self.name, copy=False
- )
- if using_copy_on_write() and isinstance(loc, slice):
- new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type]
- return new_ser.__finalize__(self)
-
- else:
- return self.iloc[loc]
-
- def __setitem__(self, key, value) -> None:
- if not PYPY and using_copy_on_write():
- if sys.getrefcount(self) <= 3:
- warnings.warn(
- _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
- )
-
- check_dict_or_set_indexers(key)
- key = com.apply_if_callable(key, self)
- cacher_needs_updating = self._check_is_chained_assignment_possible()
-
- if key is Ellipsis:
- key = slice(None)
-
- if isinstance(key, slice):
- indexer = self.index._convert_slice_indexer(key, kind="getitem")
- return self._set_values(indexer, value)
-
- try:
- self._set_with_engine(key, value)
- except KeyError:
- # We have a scalar (or for MultiIndex or object-dtype, scalar-like)
- # key that is not present in self.index.
- if is_integer(key):
- if not self.index._should_fallback_to_positional:
- # GH#33469
- self.loc[key] = value
- else:
- # positional setter
- # can't use _mgr.setitem_inplace yet bc could have *both*
- # KeyError and then ValueError, xref GH#45070
- self._set_values(key, value)
- else:
- # GH#12862 adding a new key to the Series
- self.loc[key] = value
-
- except (TypeError, ValueError, LossySetitemError):
- # The key was OK, but we cannot set the value losslessly
- indexer = self.index.get_loc(key)
- self._set_values(indexer, value)
-
- except InvalidIndexError as err:
- if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
- # cases with MultiIndex don't get here bc they raise KeyError
- # e.g. test_basic_getitem_setitem_corner
- raise KeyError(
- "key of type tuple not found and not a MultiIndex"
- ) from err
-
- if com.is_bool_indexer(key):
- key = check_bool_indexer(self.index, key)
- key = np.asarray(key, dtype=bool)
-
- if (
- is_list_like(value)
- and len(value) != len(self)
- and not isinstance(value, Series)
- and not is_object_dtype(self.dtype)
- ):
- # Series will be reindexed to have matching length inside
- # _where call below
- # GH#44265
- indexer = key.nonzero()[0]
- self._set_values(indexer, value)
- return
-
- # otherwise with listlike other we interpret series[mask] = other
- # as series[mask] = other[mask]
- try:
- self._where(~key, value, inplace=True)
- except InvalidIndexError:
- # test_where_dups
- self.iloc[key] = value
- return
-
- else:
- self._set_with(key, value)
-
- if cacher_needs_updating:
- self._maybe_update_cacher(inplace=True)
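- # Editorial sketch (illustrative, not part of the removed pandas source):
- # the main setitem paths above, shown on a small Series.
- # >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
- # >>> ser["b"] = 20        # existing label -> in-place set via the index engine
- # >>> ser["d"] = 4         # missing label -> enlargement through .loc
- # >>> ser[ser > 3] = 0     # boolean mask -> routed through the masking path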
-
- def _set_with_engine(self, key, value) -> None:
- loc = self.index.get_loc(key)
-
- # this is equivalent to self._values[key] = value
- self._mgr.setitem_inplace(loc, value)
-
- def _set_with(self, key, value) -> None:
- # We got here via exception-handling off of InvalidIndexError, so
- # key should always be listlike at this point.
- assert not isinstance(key, tuple)
-
- if is_iterator(key):
- # Without this, the call to infer_dtype will consume the generator
- key = list(key)
-
- if not self.index._should_fallback_to_positional:
- # Regardless of the key type, we're treating it as labels
- self._set_labels(key, value)
-
- else:
- # Note: key_type == "boolean" should not occur because that
- # should be caught by the is_bool_indexer check in __setitem__
- key_type = lib.infer_dtype(key, skipna=False)
-
- if key_type == "integer":
- self._set_values(key, value)
- else:
- self._set_labels(key, value)
-
- def _set_labels(self, key, value) -> None:
- key = com.asarray_tuplesafe(key)
- indexer: np.ndarray = self.index.get_indexer(key)
- mask = indexer == -1
- if mask.any():
- raise KeyError(f"{key[mask]} not in index")
- self._set_values(indexer, value)
-
- def _set_values(self, key, value) -> None:
- if isinstance(key, (Index, Series)):
- key = key._values
-
- self._mgr = self._mgr.setitem(indexer=key, value=value)
- self._maybe_update_cacher()
-
- def _set_value(self, label, value, takeable: bool = False) -> None:
- """
- Quickly set single value at passed label.
-
- If label is not contained, a new object is created with the label
- placed at the end of the result index.
-
- Parameters
- ----------
- label : object
- Partial indexing with MultiIndex not allowed.
- value : object
- Scalar value.
- takeable : interpret the index as indexers, default False
- """
- if not takeable:
- try:
- loc = self.index.get_loc(label)
- except KeyError:
- # set using a non-recursive method
- self.loc[label] = value
- return
- else:
- loc = label
-
- self._set_values(loc, value)
-
- # ----------------------------------------------------------------------
- # Lookup Caching
-
- @property
- def _is_cached(self) -> bool:
- """Return boolean indicating if self is cached or not."""
- return getattr(self, "_cacher", None) is not None
-
- def _get_cacher(self):
- """return my cacher or None"""
- cacher = getattr(self, "_cacher", None)
- if cacher is not None:
- cacher = cacher[1]()
- return cacher
-
- def _reset_cacher(self) -> None:
- """
- Reset the cacher.
- """
- if hasattr(self, "_cacher"):
- del self._cacher
-
- def _set_as_cached(self, item, cacher) -> None:
- """
- Set the _cacher attribute on the calling object with a weakref to
- cacher.
- """
- if using_copy_on_write():
- return
- self._cacher = (item, weakref.ref(cacher))
-
- def _clear_item_cache(self) -> None:
- # no-op for Series
- pass
-
- def _check_is_chained_assignment_possible(self) -> bool:
- """
- See NDFrame._check_is_chained_assignment_possible.__doc__
- """
- if self._is_view and self._is_cached:
- ref = self._get_cacher()
- if ref is not None and ref._is_mixed_type:
- self._check_setitem_copy(t="referent", force=True)
- return True
- return super()._check_is_chained_assignment_possible()
-
- def _maybe_update_cacher(
- self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False
- ) -> None:
- """
- See NDFrame._maybe_update_cacher.__doc__
- """
- # for CoW, we never want to update the parent DataFrame cache
- # if the Series changed, but don't keep track of any cacher
- if using_copy_on_write():
- return
- cacher = getattr(self, "_cacher", None)
- if cacher is not None:
- assert self.ndim == 1
- ref: DataFrame = cacher[1]()
-
- # we are trying to reference a dead referent, hence
- # a copy
- if ref is None:
- del self._cacher
- elif len(self) == len(ref) and self.name in ref.columns:
- # GH#42530 self.name must be in ref.columns
- # to ensure column still in dataframe
- # otherwise, either self or ref has swapped in new arrays
- ref._maybe_cache_changed(cacher[0], self, inplace=inplace)
- else:
- # GH#33675 we have swapped in a new array, so parent
- # reference to self is now invalid
- ref._item_cache.pop(cacher[0], None)
-
- super()._maybe_update_cacher(
- clear=clear, verify_is_copy=verify_is_copy, inplace=inplace
- )
-
- # ----------------------------------------------------------------------
- # Unsorted
-
- @property
- def _is_mixed_type(self) -> bool:
- return False
-
- def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series:
- """
- Repeat elements of a Series.
-
- Returns a new Series where each element of the current Series
- is repeated consecutively a given number of times.
-
- Parameters
- ----------
- repeats : int or array of ints
- The number of repetitions for each element. This should be a
- non-negative integer. Repeating 0 times will return an empty
- Series.
- axis : None
- Unused. Parameter needed for compatibility with DataFrame.
-
- Returns
- -------
- Series
- Newly created Series with repeated elements.
-
- See Also
- --------
- Index.repeat : Equivalent function for Index.
- numpy.repeat : Similar method for :class:`numpy.ndarray`.
-
- Examples
- --------
- >>> s = pd.Series(['a', 'b', 'c'])
- >>> s
- 0 a
- 1 b
- 2 c
- dtype: object
- >>> s.repeat(2)
- 0 a
- 0 a
- 1 b
- 1 b
- 2 c
- 2 c
- dtype: object
- >>> s.repeat([1, 2, 3])
- 0 a
- 1 b
- 1 b
- 2 c
- 2 c
- 2 c
- dtype: object
- """
- nv.validate_repeat((), {"axis": axis})
- new_index = self.index.repeat(repeats)
- new_values = self._values.repeat(repeats)
- return self._constructor(new_values, index=new_index, copy=False).__finalize__(
- self, method="repeat"
- )
-
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: Literal[False] = ...,
- name: Level = ...,
- inplace: Literal[False] = ...,
- allow_duplicates: bool = ...,
- ) -> DataFrame:
- ...
-
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: Literal[True],
- name: Level = ...,
- inplace: Literal[False] = ...,
- allow_duplicates: bool = ...,
- ) -> Series:
- ...
-
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- name: Level = ...,
- inplace: Literal[True],
- allow_duplicates: bool = ...,
- ) -> None:
- ...
-
- def reset_index(
- self,
- level: IndexLabel = None,
- *,
- drop: bool = False,
- name: Level = lib.no_default,
- inplace: bool = False,
- allow_duplicates: bool = False,
- ) -> DataFrame | Series | None:
- """
- Generate a new DataFrame or Series with the index reset.
-
- This is useful when the index needs to be treated as a column, or
- when the index is meaningless and needs to be reset to the default
- before another operation.
-
- Parameters
- ----------
- level : int, str, tuple, or list, optional
- For a Series with a MultiIndex, only remove the specified levels
- from the index. Removes all levels by default.
- drop : bool, default False
- Just reset the index, without inserting it as a column in
- the new DataFrame.
- name : object, optional
- The name to use for the column containing the original Series
- values. Uses ``self.name`` by default. This argument is ignored
- when `drop` is True.
- inplace : bool, default False
- Modify the Series in place (do not create a new object).
- allow_duplicates : bool, default False
- Allow duplicate column labels to be created.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- Series or DataFrame or None
- When `drop` is False (the default), a DataFrame is returned.
- The newly created columns will come first in the DataFrame,
- followed by the original Series values.
- When `drop` is True, a `Series` is returned.
- In either case, if ``inplace=True``, no value is returned.
-
- See Also
- --------
- DataFrame.reset_index: Analogous function for DataFrame.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4], name='foo',
- ... index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))
-
- Generate a DataFrame with default index.
-
- >>> s.reset_index()
- idx foo
- 0 a 1
- 1 b 2
- 2 c 3
- 3 d 4
-
- To specify the name of the new column use `name`.
-
- >>> s.reset_index(name='values')
- idx values
- 0 a 1
- 1 b 2
- 2 c 3
- 3 d 4
-
- To generate a new Series with the default index, set `drop` to True.
-
- >>> s.reset_index(drop=True)
- 0 1
- 1 2
- 2 3
- 3 4
- Name: foo, dtype: int64
-
- The `level` parameter is interesting for Series with a multi-level
- index.
-
- >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']),
- ... np.array(['one', 'two', 'one', 'two'])]
- >>> s2 = pd.Series(
- ... range(4), name='foo',
- ... index=pd.MultiIndex.from_arrays(arrays,
- ... names=['a', 'b']))
-
- To remove a specific level from the Index, use `level`.
-
- >>> s2.reset_index(level='a')
- a foo
- b
- one bar 0
- two bar 1
- one baz 2
- two baz 3
-
- If `level` is not set, all levels are removed from the Index.
-
- >>> s2.reset_index()
- a b foo
- 0 bar one 0
- 1 bar two 1
- 2 baz one 2
- 3 baz two 3
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if drop:
- new_index = default_index(len(self))
- if level is not None:
- level_list: Sequence[Hashable]
- if not isinstance(level, (tuple, list)):
- level_list = [level]
- else:
- level_list = level
- level_list = [self.index._get_level_number(lev) for lev in level_list]
- if len(level_list) < self.index.nlevels:
- new_index = self.index.droplevel(level_list)
-
- if inplace:
- self.index = new_index
- elif using_copy_on_write():
- new_ser = self.copy(deep=False)
- new_ser.index = new_index
- return new_ser.__finalize__(self, method="reset_index")
- else:
- return self._constructor(
- self._values.copy(), index=new_index, copy=False
- ).__finalize__(self, method="reset_index")
- elif inplace:
- raise TypeError(
- "Cannot reset_index inplace on a Series to create a DataFrame"
- )
- else:
- if name is lib.no_default:
- # For backwards compatibility, keep columns as [0] instead of
- # [None] when self.name is None
- if self.name is None:
- name = 0
- else:
- name = self.name
-
- df = self.to_frame(name)
- return df.reset_index(
- level=level, drop=drop, allow_duplicates=allow_duplicates
- )
- return None
-
- # ----------------------------------------------------------------------
- # Rendering Methods
-
- def __repr__(self) -> str:
- """
- Return a string representation for a particular Series.
- """
- # pylint: disable=invalid-repr-returned
- repr_params = fmt.get_series_repr_params()
- return self.to_string(**repr_params)
-
- @overload
- def to_string(
- self,
- buf: None = ...,
- na_rep: str = ...,
- float_format: str | None = ...,
- header: bool = ...,
- index: bool = ...,
- length=...,
- dtype=...,
- name=...,
- max_rows: int | None = ...,
- min_rows: int | None = ...,
- ) -> str:
- ...
-
- @overload
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str],
- na_rep: str = ...,
- float_format: str | None = ...,
- header: bool = ...,
- index: bool = ...,
- length=...,
- dtype=...,
- name=...,
- max_rows: int | None = ...,
- min_rows: int | None = ...,
- ) -> None:
- ...
-
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- na_rep: str = "NaN",
- float_format: str | None = None,
- header: bool = True,
- index: bool = True,
- length: bool = False,
- dtype: bool = False,
- name: bool = False,
- max_rows: int | None = None,
- min_rows: int | None = None,
- ) -> str | None:
- """
- Render a string representation of the Series.
-
- Parameters
- ----------
- buf : StringIO-like, optional
- Buffer to write to.
- na_rep : str, optional
- String representation of NaN to use, default 'NaN'.
- float_format : one-parameter function, optional
- Formatter function to apply to columns' elements if they are
- floats, default None.
- header : bool, default True
- Add the Series header (index name).
- index : bool, optional
- Add index (row) labels, default True.
- length : bool, default False
- Add the Series length.
- dtype : bool, default False
- Add the Series dtype.
- name : bool, default False
- Add the Series name if not None.
- max_rows : int, optional
- Maximum number of rows to show before truncating. If None, show
- all.
- min_rows : int, optional
- The number of rows to display in a truncated repr (when number
- of rows is above `max_rows`).
-
- Returns
- -------
- str or None
- String representation of Series if ``buf=None``, otherwise None.
- """
- formatter = fmt.SeriesFormatter(
- self,
- name=name,
- length=length,
- header=header,
- index=index,
- dtype=dtype,
- na_rep=na_rep,
- float_format=float_format,
- min_rows=min_rows,
- max_rows=max_rows,
- )
- result = formatter.to_string()
-
- # catch contract violations
- if not isinstance(result, str):
- raise AssertionError(
- "result must be of type str, type "
- f"of result is {repr(type(result).__name__)}"
- )
-
- if buf is None:
- return result
- else:
- if hasattr(buf, "write"):
- buf.write(result)
- else:
- with open(buf, "w") as f:
- f.write(result)
- return None
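- # Editorial sketch (illustrative, not part of the removed pandas source):
- # ``to_string`` with no buffer returns the rendered text; spacing shown
- # approximately.
- # >>> print(pd.Series([1, 2, 3]).to_string())
- # 0    1
- # 1    2
- # 2    3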
-
- @doc(
- klass=_shared_doc_kwargs["klass"],
- storage_options=_shared_docs["storage_options"],
- examples=dedent(
- """Examples
- --------
- >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal")
- >>> print(s.to_markdown())
- | | animal |
- |---:|:---------|
- | 0 | elk |
- | 1 | pig |
- | 2 | dog |
- | 3 | quetzal |
-
- Output markdown with a tabulate option.
-
- >>> print(s.to_markdown(tablefmt="grid"))
- +----+----------+
- | | animal |
- +====+==========+
- | 0 | elk |
- +----+----------+
- | 1 | pig |
- +----+----------+
- | 2 | dog |
- +----+----------+
- | 3 | quetzal |
- +----+----------+"""
- ),
- )
- def to_markdown(
- self,
- buf: IO[str] | None = None,
- mode: str = "wt",
- index: bool = True,
- storage_options: StorageOptions = None,
- **kwargs,
- ) -> str | None:
- """
- Print {klass} in Markdown-friendly format.
-
- Parameters
- ----------
- buf : str, Path or StringIO-like, optional, default None
- Buffer to write to. If None, the output is returned as a string.
- mode : str, optional
- Mode in which file is opened, "wt" by default.
- index : bool, optional, default True
- Add index (row) labels.
-
- .. versionadded:: 1.1.0
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- **kwargs
- These parameters will be passed to `tabulate \
- <https://pypi.org/project/tabulate>`_.
-
- Returns
- -------
- str
- {klass} in Markdown-friendly format.
-
- Notes
- -----
- Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
-
- {examples}
- """
- return self.to_frame().to_markdown(
- buf, mode, index, storage_options=storage_options, **kwargs
- )
-
- # ----------------------------------------------------------------------
-
- def items(self) -> Iterable[tuple[Hashable, Any]]:
- """
- Lazily iterate over (index, value) tuples.
-
- This method returns an iterable of (index, value) tuples. This is
- convenient if you want to create a lazy iterator.
-
- Returns
- -------
- iterable
- Iterable of tuples containing the (index, value) pairs from a
- Series.
-
- See Also
- --------
- DataFrame.items : Iterate over (column name, Series) pairs.
- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs.
-
- Examples
- --------
- >>> s = pd.Series(['A', 'B', 'C'])
- >>> for index, value in s.items():
- ... print(f"Index : {index}, Value : {value}")
- Index : 0, Value : A
- Index : 1, Value : B
- Index : 2, Value : C
- """
- return zip(iter(self.index), iter(self))
-
- # ----------------------------------------------------------------------
- # Misc public methods
-
- def keys(self) -> Index:
- """
- Return alias for index.
-
- Returns
- -------
- Index
- Index of the Series.
- """
- return self.index
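- # Editorial sketch (illustrative, not part of the removed pandas source):
- # ``keys`` is simply the index, mirroring the dict-like API.
- # >>> pd.Series([1, 2], index=["a", "b"]).keys()
- # Index(['a', 'b'], dtype='object')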
-
- def to_dict(self, into: type[dict] = dict) -> dict:
- """
- Convert Series to {label -> value} dict or dict-like object.
-
- Parameters
- ----------
- into : class, default dict
- The collections.abc.Mapping subclass to use as the return
- object. Can be the actual class or an empty
- instance of the mapping type you want. If you want a
- collections.defaultdict, you must pass it initialized.
-
- Returns
- -------
- collections.abc.Mapping
- Key-value representation of Series.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4])
- >>> s.to_dict()
- {0: 1, 1: 2, 2: 3, 3: 4}
- >>> from collections import OrderedDict, defaultdict
- >>> s.to_dict(OrderedDict)
- OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
- >>> dd = defaultdict(list)
- >>> s.to_dict(dd)
- defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
- """
- # GH16122
- into_c = com.standardize_mapping(into)
-
- if is_object_dtype(self) or is_extension_array_dtype(self):
- return into_c((k, maybe_box_native(v)) for k, v in self.items())
- else:
- # Not an object dtype => all types will be the same so let the default
- # indexer return native python type
- return into_c(self.items())
-
- def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
- """
- Convert Series to DataFrame.
-
- Parameters
- ----------
- name : object, optional
- The passed name should substitute for the series name (if it has
- one).
-
- Returns
- -------
- DataFrame
- DataFrame representation of Series.
-
- Examples
- --------
- >>> s = pd.Series(["a", "b", "c"],
- ... name="vals")
- >>> s.to_frame()
- vals
- 0 a
- 1 b
- 2 c
- """
- columns: Index
- if name is lib.no_default:
- name = self.name
- if name is None:
- # default to [0], same as we would get with DataFrame(self)
- columns = default_index(1)
- else:
- columns = Index([name])
- else:
- columns = Index([name])
-
- mgr = self._mgr.to_2d_mgr(columns)
- df = self._constructor_expanddim(mgr)
- return df.__finalize__(self, method="to_frame")
-
- def _set_name(
- self, name, inplace: bool = False, deep: bool | None = None
- ) -> Series:
- """
- Set the Series name.
-
- Parameters
- ----------
- name : str
- inplace : bool
- Whether to modify `self` directly or return a copy.
- deep : bool | None, default None
- Whether to make a deep copy, a shallow copy, or to use
- Copy-on-Write (when ``None``).
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- ser = self if inplace else self.copy(deep and not using_copy_on_write())
- ser.name = name
- return ser
-
- @Appender(
- """
-Examples
---------
->>> ser = pd.Series([390., 350., 30., 20.],
-... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed")
->>> ser
-Falcon 390.0
-Falcon 350.0
-Parrot 30.0
-Parrot 20.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(["a", "b", "a", "b"]).mean()
-a 210.0
-b 185.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(level=0).mean()
-Falcon 370.0
-Parrot 25.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(ser > 100).mean()
-Max Speed
-False 25.0
-True 370.0
-Name: Max Speed, dtype: float64
-
-**Grouping by Indexes**
-
-We can group by different levels of a hierarchical index
-using the `level` parameter:
-
->>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
-... ['Captive', 'Wild', 'Captive', 'Wild']]
->>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
->>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed")
->>> ser
-Animal Type
-Falcon Captive 390.0
- Wild 350.0
-Parrot Captive 30.0
- Wild 20.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(level=0).mean()
-Animal
-Falcon 370.0
-Parrot 25.0
-Name: Max Speed, dtype: float64
->>> ser.groupby(level="Type").mean()
-Type
-Captive 210.0
-Wild 185.0
-Name: Max Speed, dtype: float64
-
-We can also choose to include `NA` in group keys or not by setting the
-`dropna` parameter; the default setting is `True`.
-
->>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
->>> ser.groupby(level=0).sum()
-a 3
-b 3
-dtype: int64
-
->>> ser.groupby(level=0, dropna=False).sum()
-a 3
-b 3
-NaN 3
-dtype: int64
-
->>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
->>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
->>> ser.groupby(["a", "b", "a", np.nan]).mean()
-a 210.0
-b 350.0
-Name: Max Speed, dtype: float64
-
->>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
-a 210.0
-b 350.0
-NaN 20.0
-Name: Max Speed, dtype: float64
-"""
- )
- @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
- def groupby(
- self,
- by=None,
- axis: Axis = 0,
- level: IndexLabel = None,
- as_index: bool = True,
- sort: bool = True,
- group_keys: bool = True,
- observed: bool = False,
- dropna: bool = True,
- ) -> SeriesGroupBy:
- from pandas.core.groupby.generic import SeriesGroupBy
-
- if level is None and by is None:
- raise TypeError("You have to supply one of 'by' and 'level'")
- if not as_index:
- raise TypeError("as_index=False only valid with DataFrame")
- axis = self._get_axis_number(axis)
-
- return SeriesGroupBy(
- obj=self,
- keys=by,
- axis=axis,
- level=level,
- as_index=as_index,
- sort=sort,
- group_keys=group_keys,
- observed=observed,
- dropna=dropna,
- )
-
- # ----------------------------------------------------------------------
- # Statistics, overridden ndarray methods
-
- # TODO: integrate bottleneck
- def count(self):
- """
- Return number of non-NA/null observations in the Series.
-
- Returns
- -------
- int
- Number of non-null values in the Series.
-
- See Also
- --------
- DataFrame.count : Count non-NA cells for each column or row.
-
- Examples
- --------
- >>> s = pd.Series([0.0, 1.0, np.nan])
- >>> s.count()
- 2
- """
- return notna(self._values).sum().astype("int64")
-
- def mode(self, dropna: bool = True) -> Series:
- """
- Return the mode(s) of the Series.
-
- The mode is the value that appears most often. There can be multiple modes.
-
- Always returns Series even if only one value is returned.
-
- Parameters
- ----------
- dropna : bool, default True
- Don't consider counts of NaN/NaT.
-
- Returns
- -------
- Series
- Modes of the Series in sorted order.
- """
- # TODO: Add option for bins like value_counts()
- values = self._values
- if isinstance(values, np.ndarray):
- res_values = algorithms.mode(values, dropna=dropna)
- else:
- res_values = values._mode(dropna=dropna)
-
- # Ensure index is type stable (should always use int index)
- return self._constructor(
- res_values, index=range(len(res_values)), name=self.name, copy=False
- )
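- # Editorial sketch (illustrative, not part of the removed pandas source):
- # the docstring above has no Examples section, so a minimal one follows.
- # >>> ser = pd.Series([2, 4, None, 2, 4, None])
- # >>> ser.mode()
- # 0    2.0
- # 1    4.0
- # dtype: float64
- # >>> ser.mode(dropna=False)
- # 0    2.0
- # 1    4.0
- # 2    NaN
- # dtype: float64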
-
- def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation
- """
- Return unique values of Series object.
-
- Uniques are returned in order of appearance. The implementation is
- hash table-based and therefore does NOT sort.
-
- Returns
- -------
- ndarray or ExtensionArray
- The unique values, returned as a NumPy array or ExtensionArray
- depending on the dtype. See Notes.
-
- See Also
- --------
- Series.drop_duplicates : Return Series with duplicate values removed.
- unique : Top-level unique method for any 1-d array-like object.
- Index.unique : Return Index with unique values from an Index object.
-
- Notes
- -----
- Returns the unique values as a NumPy array. In case of an
- extension-array backed Series, a new
- :class:`~api.extensions.ExtensionArray` of that type with just
- the unique values is returned. This includes
-
- * Categorical
- * Period
- * Datetime with Timezone
- * Datetime without Timezone
- * Timedelta
- * Interval
- * Sparse
- * IntegerNA
-
- See Examples section.
-
- Examples
- --------
- >>> pd.Series([2, 1, 3, 3], name='A').unique()
- array([2, 1, 3])
-
- >>> pd.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()
- <DatetimeArray>
- ['2016-01-01 00:00:00']
- Length: 1, dtype: datetime64[ns]
-
- >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern')
- ... for _ in range(3)]).unique()
- <DatetimeArray>
- ['2016-01-01 00:00:00-05:00']
- Length: 1, dtype: datetime64[ns, US/Eastern]
-
- A Categorical will return categories in the order of
- appearance and with the same dtype.
-
- >>> pd.Series(pd.Categorical(list('baabc'))).unique()
- ['b', 'a', 'c']
- Categories (3, object): ['a', 'b', 'c']
- >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'),
- ... ordered=True)).unique()
- ['b', 'a', 'c']
- Categories (3, object): ['a' < 'b' < 'c']
- """
- return super().unique()
-
- @overload
- def drop_duplicates(
- self,
- *,
- keep: DropKeep = ...,
- inplace: Literal[False] = ...,
- ignore_index: bool = ...,
- ) -> Series:
- ...
-
- @overload
- def drop_duplicates(
- self, *, keep: DropKeep = ..., inplace: Literal[True], ignore_index: bool = ...
- ) -> None:
- ...
-
- @overload
- def drop_duplicates(
- self, *, keep: DropKeep = ..., inplace: bool = ..., ignore_index: bool = ...
- ) -> Series | None:
- ...
-
- def drop_duplicates(
- self,
- *,
- keep: DropKeep = "first",
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> Series | None:
- """
- Return Series with duplicate values removed.
-
- Parameters
- ----------
- keep : {'first', 'last', ``False``}, default 'first'
- Method to handle dropping duplicates:
-
- - 'first' : Drop duplicates except for the first occurrence.
- - 'last' : Drop duplicates except for the last occurrence.
- - ``False`` : Drop all duplicates.
-
- inplace : bool, default ``False``
- If ``True``, performs operation inplace and returns None.
-
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- Series or None
- Series with duplicates dropped or None if ``inplace=True``.
-
- See Also
- --------
- Index.drop_duplicates : Equivalent method on Index.
- DataFrame.drop_duplicates : Equivalent method on DataFrame.
- Series.duplicated : Related method on Series, indicating duplicate
- Series values.
- Series.unique : Return unique values as an array.
-
- Examples
- --------
- Generate a Series with duplicated entries.
-
- >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'],
- ... name='animal')
- >>> s
- 0 lama
- 1 cow
- 2 lama
- 3 beetle
- 4 lama
- 5 hippo
- Name: animal, dtype: object
-
- With the 'keep' parameter, the selection behaviour of duplicated values
- can be changed. The value 'first' keeps the first occurrence for each
- set of duplicated entries. The default value of keep is 'first'.
-
- >>> s.drop_duplicates()
- 0 lama
- 1 cow
- 3 beetle
- 5 hippo
- Name: animal, dtype: object
-
- The value 'last' for parameter 'keep' keeps the last occurrence for
- each set of duplicated entries.
-
- >>> s.drop_duplicates(keep='last')
- 1 cow
- 3 beetle
- 4 lama
- 5 hippo
- Name: animal, dtype: object
-
- The value ``False`` for parameter 'keep' discards all sets of
- duplicated entries.
-
- >>> s.drop_duplicates(keep=False)
- 1 cow
- 3 beetle
- 5 hippo
- Name: animal, dtype: object
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- result = super().drop_duplicates(keep=keep)
-
- if ignore_index:
- result.index = default_index(len(result))
-
- if inplace:
- self._update_inplace(result)
- return None
- else:
- return result
-
- def duplicated(self, keep: DropKeep = "first") -> Series:
- """
- Indicate duplicate Series values.
-
- Duplicated values are indicated as ``True`` values in the resulting
- Series. Either all duplicates, all except the first or all except the
- last occurrence of duplicates can be indicated.
-
- Parameters
- ----------
- keep : {'first', 'last', False}, default 'first'
- Method to handle dropping duplicates:
-
- - 'first' : Mark duplicates as ``True`` except for the first
- occurrence.
- - 'last' : Mark duplicates as ``True`` except for the last
- occurrence.
- - ``False`` : Mark all duplicates as ``True``.
-
- Returns
- -------
- Series[bool]
- Series indicating whether each value has occurred in the
- preceding values.
-
- See Also
- --------
- Index.duplicated : Equivalent method on pandas.Index.
- DataFrame.duplicated : Equivalent method on pandas.DataFrame.
- Series.drop_duplicates : Remove duplicate values from Series.
-
- Examples
- --------
- By default, for each set of duplicated values, the first occurrence is
- set to False and all others to True:
-
- >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'])
- >>> animals.duplicated()
- 0 False
- 1 False
- 2 True
- 3 False
- 4 True
- dtype: bool
-
- which is equivalent to
-
- >>> animals.duplicated(keep='first')
- 0 False
- 1 False
- 2 True
- 3 False
- 4 True
- dtype: bool
-
- By using 'last', the last occurrence of each set of duplicated values
- is set to False and all others to True:
-
- >>> animals.duplicated(keep='last')
- 0 True
- 1 False
- 2 True
- 3 False
- 4 False
- dtype: bool
-
- By setting keep to ``False``, all duplicates are marked True:
-
- >>> animals.duplicated(keep=False)
- 0 True
- 1 False
- 2 True
- 3 False
- 4 True
- dtype: bool
- """
- res = self._duplicated(keep=keep)
- result = self._constructor(res, index=self.index, copy=False)
- return result.__finalize__(self, method="duplicated")
-
- def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable:
- """
- Return the row label of the minimum value.
-
- If multiple values equal the minimum, the first row label with that
- value is returned.
-
- Parameters
- ----------
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- skipna : bool, default True
- Exclude NA/null values. If the entire Series is NA, the result
- will be NA.
- *args, **kwargs
- Additional arguments and keywords have no effect but might be
- accepted for compatibility with NumPy.
-
- Returns
- -------
- Index
- Label of the minimum value.
-
- Raises
- ------
- ValueError
- If the Series is empty.
-
- See Also
- --------
- numpy.argmin : Return indices of the minimum values
- along the given axis.
- DataFrame.idxmin : Return index of first occurrence of minimum
- over requested axis.
- Series.idxmax : Return index *label* of the first occurrence
- of maximum of values.
-
- Notes
- -----
- This method is the Series version of ``ndarray.argmin``. This method
- returns the label of the minimum, while ``ndarray.argmin`` returns
- the position. To get the position, use ``series.values.argmin()``.
-
- Examples
- --------
- >>> s = pd.Series(data=[1, None, 4, 1],
- ... index=['A', 'B', 'C', 'D'])
- >>> s
- A 1.0
- B NaN
- C 4.0
- D 1.0
- dtype: float64
-
- >>> s.idxmin()
- 'A'
-
- If `skipna` is False and there is an NA value in the data,
- the function returns ``nan``.
-
- >>> s.idxmin(skipna=False)
- nan
- """
- # error: Argument 1 to "argmin" of "IndexOpsMixin" has incompatible type "Union
- # [int, Literal['index', 'columns']]"; expected "Optional[int]"
- i = self.argmin(axis, skipna, *args, **kwargs) # type: ignore[arg-type]
- if i == -1:
- return np.nan
- return self.index[i]
-
- def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable:
- """
- Return the row label of the maximum value.
-
- If multiple values equal the maximum, the first row label with that
- value is returned.
-
- Parameters
- ----------
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- skipna : bool, default True
- Exclude NA/null values. If the entire Series is NA, the result
- will be NA.
- *args, **kwargs
- Additional arguments and keywords have no effect but might be
- accepted for compatibility with NumPy.
-
- Returns
- -------
- Index
- Label of the maximum value.
-
- Raises
- ------
- ValueError
- If the Series is empty.
-
- See Also
- --------
- numpy.argmax : Return indices of the maximum values
- along the given axis.
- DataFrame.idxmax : Return index of first occurrence of maximum
- over requested axis.
- Series.idxmin : Return index *label* of the first occurrence
- of minimum of values.
-
- Notes
- -----
- This method is the Series version of ``ndarray.argmax``. This method
- returns the label of the maximum, while ``ndarray.argmax`` returns
- the position. To get the position, use ``series.values.argmax()``.
-
- Examples
- --------
- >>> s = pd.Series(data=[1, None, 4, 3, 4],
- ... index=['A', 'B', 'C', 'D', 'E'])
- >>> s
- A 1.0
- B NaN
- C 4.0
- D 3.0
- E 4.0
- dtype: float64
-
- >>> s.idxmax()
- 'C'
-
- If `skipna` is False and there is an NA value in the data,
- the function returns ``nan``.
-
- >>> s.idxmax(skipna=False)
- nan
- """
- # error: Argument 1 to "argmax" of "IndexOpsMixin" has incompatible type
- # "Union[int, Literal['index', 'columns']]"; expected "Optional[int]"
- i = self.argmax(axis, skipna, *args, **kwargs) # type: ignore[arg-type]
- if i == -1:
- return np.nan
- return self.index[i]
-
- def round(self, decimals: int = 0, *args, **kwargs) -> Series:
- """
- Round each value in a Series to the given number of decimals.
-
- Parameters
- ----------
- decimals : int, default 0
- Number of decimal places to round to. If decimals is negative,
- it specifies the number of positions to the left of the decimal point.
- *args, **kwargs
- Additional arguments and keywords have no effect but might be
- accepted for compatibility with NumPy.
-
- Returns
- -------
- Series
- Rounded values of the Series.
-
- See Also
- --------
- numpy.around : Round values of an np.array.
- DataFrame.round : Round values of a DataFrame.
-
- Examples
- --------
- >>> s = pd.Series([0.1, 1.3, 2.7])
- >>> s.round()
- 0 0.0
- 1 1.0
- 2 3.0
- dtype: float64
- """
- nv.validate_round(args, kwargs)
- result = self._values.round(decimals)
- result = self._constructor(result, index=self.index, copy=False).__finalize__(
- self, method="round"
- )
-
- return result
-
- @overload
- def quantile(
- self, q: float = ..., interpolation: QuantileInterpolation = ...
- ) -> float:
- ...
-
- @overload
- def quantile(
- self,
- q: Sequence[float] | AnyArrayLike,
- interpolation: QuantileInterpolation = ...,
- ) -> Series:
- ...
-
- @overload
- def quantile(
- self,
- q: float | Sequence[float] | AnyArrayLike = ...,
- interpolation: QuantileInterpolation = ...,
- ) -> float | Series:
- ...
-
- def quantile(
- self,
- q: float | Sequence[float] | AnyArrayLike = 0.5,
- interpolation: QuantileInterpolation = "linear",
- ) -> float | Series:
- """
- Return value at the given quantile.
-
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- The quantile(s) to compute, which can lie in range: 0 <= q <= 1.
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- This optional parameter specifies the interpolation method to use,
- when the desired quantile lies between two data points `i` and `j`:
-
- * linear: `i + (j - i) * fraction`, where `fraction` is the
- fractional part of the index surrounded by `i` and `j`.
- * lower: `i`.
- * higher: `j`.
- * nearest: `i` or `j` whichever is nearest.
- * midpoint: (`i` + `j`) / 2.
-
- Returns
- -------
- float or Series
- If ``q`` is an array, a Series will be returned where the
- index is ``q`` and the values are the quantiles, otherwise
- a float will be returned.
-
- See Also
- --------
- core.window.Rolling.quantile : Calculate the rolling quantile.
- numpy.percentile : Returns the q-th percentile(s) of the array elements.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4])
- >>> s.quantile(.5)
- 2.5
- >>> s.quantile([.25, .5, .75])
- 0.25 1.75
- 0.50 2.50
- 0.75 3.25
- dtype: float64
- """
- validate_percentile(q)
-
- # We dispatch to DataFrame so that core.internals only has to worry
- # about 2D cases.
- df = self.to_frame()
-
- result = df.quantile(q=q, interpolation=interpolation, numeric_only=False)
- if result.ndim == 2:
- result = result.iloc[:, 0]
-
- if is_list_like(q):
- result.name = self.name
- idx = Index(q, dtype=np.float64)
- return self._constructor(result, index=idx, name=self.name)
- else:
- # scalar
- return result.iloc[0]
-
- def corr(
- self,
- other: Series,
- method: CorrelationMethod = "pearson",
- min_periods: int | None = None,
- ) -> float:
- """
- Compute correlation with `other` Series, excluding missing values.
-
- The two `Series` objects are not required to be the same length and will be
- aligned internally before the correlation function is applied.
-
- Parameters
- ----------
- other : Series
- Series with which to compute the correlation.
- method : {'pearson', 'kendall', 'spearman'} or callable
- Method used to compute correlation:
-
- - pearson : Standard correlation coefficient
- - kendall : Kendall Tau correlation coefficient
- - spearman : Spearman rank correlation
- - callable: Callable taking two 1d ndarrays and returning a float.
-
- .. warning::
- Note that the returned matrix from corr will have 1 along the
- diagonals and will be symmetric regardless of the callable's
- behavior.
- min_periods : int, optional
- Minimum number of observations needed to have a valid result.
-
- Returns
- -------
- float
- Correlation with other.
-
- See Also
- --------
- DataFrame.corr : Compute pairwise correlation between columns.
- DataFrame.corrwith : Compute pairwise correlation with another
- DataFrame or Series.
-
- Notes
- -----
- Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
-
- * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
- * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
- * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
-
- Examples
- --------
- >>> def histogram_intersection(a, b):
- ... v = np.minimum(a, b).sum().round(decimals=1)
- ... return v
- >>> s1 = pd.Series([.2, .0, .6, .2])
- >>> s2 = pd.Series([.3, .6, .0, .1])
- >>> s1.corr(s2, method=histogram_intersection)
- 0.3
- """ # noqa:E501
- this, other = self.align(other, join="inner", copy=False)
- if len(this) == 0:
- return np.nan
-
- if method in ["pearson", "spearman", "kendall"] or callable(method):
- return nanops.nancorr(
- this.values, other.values, method=method, min_periods=min_periods
- )
-
- raise ValueError(
- "method must be either 'pearson', "
- "'spearman', 'kendall', or a callable, "
- f"'{method}' was supplied"
- )
-
- def cov(
- self,
- other: Series,
- min_periods: int | None = None,
- ddof: int | None = 1,
- ) -> float:
- """
- Compute covariance with Series, excluding missing values.
-
- The two `Series` objects are not required to be the same length and
- will be aligned internally before the covariance is calculated.
-
- Parameters
- ----------
- other : Series
- Series with which to compute the covariance.
- min_periods : int, optional
- Minimum number of observations needed to have a valid result.
- ddof : int, default 1
- Delta degrees of freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- float
- Covariance between Series and other normalized by N-1
- (unbiased estimator).
-
- See Also
- --------
- DataFrame.cov : Compute pairwise covariance of columns.
-
- Examples
- --------
- >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035])
- >>> s2 = pd.Series([0.12528585, 0.26962463, 0.51111198])
- >>> s1.cov(s2)
- -0.01685762652715874
- """
- this, other = self.align(other, join="inner", copy=False)
- if len(this) == 0:
- return np.nan
- return nanops.nancov(
- this.values, other.values, min_periods=min_periods, ddof=ddof
- )
-
- @doc(
- klass="Series",
- extra_params="",
- other_klass="DataFrame",
- examples=dedent(
- """
- Difference with previous row
-
- >>> s = pd.Series([1, 1, 2, 3, 5, 8])
- >>> s.diff()
- 0 NaN
- 1 0.0
- 2 1.0
- 3 1.0
- 4 2.0
- 5 3.0
- dtype: float64
-
- Difference with 3rd previous row
-
- >>> s.diff(periods=3)
- 0 NaN
- 1 NaN
- 2 NaN
- 3 2.0
- 4 4.0
- 5 6.0
- dtype: float64
-
- Difference with following row
-
- >>> s.diff(periods=-1)
- 0 0.0
- 1 -1.0
- 2 -1.0
- 3 -2.0
- 4 -3.0
- 5 NaN
- dtype: float64
-
- Overflow in input dtype
-
- >>> s = pd.Series([1, 0], dtype=np.uint8)
- >>> s.diff()
- 0 NaN
- 1 255.0
- dtype: float64"""
- ),
- )
- def diff(self, periods: int = 1) -> Series:
- """
- First discrete difference of element.
-
- Calculates the difference of a {klass} element compared with another
- element in the {klass} (default is element in previous row).
-
- Parameters
- ----------
- periods : int, default 1
- Periods to shift for calculating difference, accepts negative
- values.
- {extra_params}
- Returns
- -------
- {klass}
- First differences of the Series.
-
- See Also
- --------
- {klass}.pct_change: Percent change over given number of periods.
- {klass}.shift: Shift index by desired number of periods with an
- optional time freq.
- {other_klass}.diff: First discrete difference of object.
-
- Notes
- -----
- For boolean dtypes, this uses :meth:`operator.xor` rather than
- :meth:`operator.sub`.
- The result is calculated according to the current dtype in {klass},
- but the dtype of the result is always float64.
-
- Examples
- --------
- {examples}
- """
- result = algorithms.diff(self._values, periods)
- return self._constructor(result, index=self.index, copy=False).__finalize__(
- self, method="diff"
- )
-
- def autocorr(self, lag: int = 1) -> float:
- """
- Compute the lag-N autocorrelation.
-
- This method computes the Pearson correlation between
- the Series and its shifted self.
-
- Parameters
- ----------
- lag : int, default 1
- Number of lags to apply before performing autocorrelation.
-
- Returns
- -------
- float
- The Pearson correlation between self and self.shift(lag).
-
- See Also
- --------
- Series.corr : Compute the correlation between two Series.
- Series.shift : Shift index by desired number of periods.
- DataFrame.corr : Compute pairwise correlation of columns.
- DataFrame.corrwith : Compute pairwise correlation between rows or
- columns of two DataFrame objects.
-
- Notes
- -----
- If the Pearson correlation is not well defined, 'NaN' is returned.
-
- Examples
- --------
- >>> s = pd.Series([0.25, 0.5, 0.2, -0.05])
- >>> s.autocorr() # doctest: +ELLIPSIS
- 0.10355...
- >>> s.autocorr(lag=2) # doctest: +ELLIPSIS
- -0.99999...
-
- If the Pearson correlation is not well defined, then 'NaN' is returned.
-
- >>> s = pd.Series([1, 0, 0, 0])
- >>> s.autocorr()
- nan
- """
- return self.corr(self.shift(lag))
-
- def dot(self, other: AnyArrayLike) -> Series | np.ndarray:
- """
- Compute the dot product between the Series and the columns of other.
-
- This method computes the dot product between the Series and another
- one, or between the Series and each column of a DataFrame, or between
- the Series and each column of an array.
-
- It can also be called using `self @ other` in Python >= 3.5.
-
- Parameters
- ----------
- other : Series, DataFrame or array-like
- The other object to compute the dot product with its columns.
-
- Returns
- -------
- scalar, Series or numpy.ndarray
- Return the dot product of the Series and other if other is a
- Series, a Series of the dot products with each column of other if
- other is a DataFrame, or a numpy.ndarray of the dot products with
- each column of the array if other is a numpy.ndarray.
-
- See Also
- --------
- DataFrame.dot: Compute the matrix product with the DataFrame.
- Series.mul: Multiplication of series and other, element-wise.
-
- Notes
- -----
- The Series and other have to share the same index if other is a Series
- or a DataFrame.
-
- Examples
- --------
- >>> s = pd.Series([0, 1, 2, 3])
- >>> other = pd.Series([-1, 2, -3, 4])
- >>> s.dot(other)
- 8
- >>> s @ other
- 8
- >>> df = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
- >>> s.dot(df)
- 0 24
- 1 14
- dtype: int64
- >>> arr = np.array([[0, 1], [-2, 3], [4, -5], [6, 7]])
- >>> s.dot(arr)
- array([24, 14])
- """
- if isinstance(other, (Series, ABCDataFrame)):
- common = self.index.union(other.index)
- if len(common) > len(self.index) or len(common) > len(other.index):
- raise ValueError("matrices are not aligned")
-
- left = self.reindex(index=common, copy=False)
- right = other.reindex(index=common, copy=False)
- lvals = left.values
- rvals = right.values
- else:
- lvals = self.values
- rvals = np.asarray(other)
- if lvals.shape[0] != rvals.shape[0]:
- raise Exception(
- f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
- )
-
- if isinstance(other, ABCDataFrame):
- return self._constructor(
- np.dot(lvals, rvals), index=other.columns, copy=False
- ).__finalize__(self, method="dot")
- elif isinstance(other, Series):
- return np.dot(lvals, rvals)
- elif isinstance(rvals, np.ndarray):
- return np.dot(lvals, rvals)
- else: # pragma: no cover
- raise TypeError(f"unsupported type: {type(other)}")
-
- def __matmul__(self, other):
- """
- Matrix multiplication using binary `@` operator in Python >= 3.5.
- """
- return self.dot(other)
-
- def __rmatmul__(self, other):
- """
- Matrix multiplication using binary `@` operator in Python >= 3.5.
- """
- return self.dot(np.transpose(other))
-
- @doc(base.IndexOpsMixin.searchsorted, klass="Series")
- # Signature of "searchsorted" incompatible with supertype "IndexOpsMixin"
- def searchsorted( # type: ignore[override]
- self,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- return base.IndexOpsMixin.searchsorted(self, value, side=side, sorter=sorter)
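- # Editorial sketch (illustrative, not part of the removed pandas source):
- # ``searchsorted`` assumes the Series is already sorted and returns
- # insertion positions.
- # >>> ser = pd.Series([1, 2, 3])
- # >>> ser.searchsorted(2)
- # 1
- # >>> ser.searchsorted([0, 4])
- # array([0, 3])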
-
- # -------------------------------------------------------------------
- # Combination
-
- def _append(
- self, to_append, ignore_index: bool = False, verify_integrity: bool = False
- ):
- from pandas.core.reshape.concat import concat
-
- if isinstance(to_append, (list, tuple)):
- to_concat = [self]
- to_concat.extend(to_append)
- else:
- to_concat = [self, to_append]
- if any(isinstance(x, (ABCDataFrame,)) for x in to_concat[1:]):
- msg = "to_append should be a Series or list/tuple of Series, got DataFrame"
- raise TypeError(msg)
- return concat(
- to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity
- )
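- # Editorial sketch (illustrative, not part of the removed pandas source):
- # ``_append`` is a thin wrapper over ``pd.concat``; the equivalent public
- # call looks like this.
- # >>> pd.concat([pd.Series([1, 2]), pd.Series([3])], ignore_index=True)
- # 0    1
- # 1    2
- # 2    3
- # dtype: int64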
-
- def _binop(self, other: Series, func, level=None, fill_value=None):
- """
- Perform generic binary operation with optional fill value.
-
- Parameters
- ----------
- other : Series
- func : binary operator
- fill_value : float or object
- Value to substitute for NA/null values. If both Series are NA in a
- location, the result will be NA regardless of the passed fill value.
- level : int or level name, default None
- Broadcast across a level, matching Index values on the
- passed MultiIndex level.
-
- Returns
- -------
- Series
- """
- if not isinstance(other, Series):
- raise AssertionError("Other operand must be Series")
-
- this = self
-
- if not self.index.equals(other.index):
- this, other = self.align(other, level=level, join="outer", copy=False)
-
- this_vals, other_vals = ops.fill_binop(this._values, other._values, fill_value)
-
- with np.errstate(all="ignore"):
- result = func(this_vals, other_vals)
-
- name = ops.get_op_result_name(self, other)
- return this._construct_result(result, name)
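- # _binop is internal; an illustrative sketch of the public flex arithmetic
- # that exercises it, aligning indexes and filling missing values
- # (assumes the public pandas API only).
- import pandas as pd
- a = pd.Series([1.0, 2.0], index=["x", "y"])
- b = pd.Series([10.0], index=["x"])
- print(a.add(b, fill_value=0).to_list())  # [11.0, 2.0]; missing 'y' filled with 0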
-
- def _construct_result(
- self, result: ArrayLike | tuple[ArrayLike, ArrayLike], name: Hashable
- ) -> Series | tuple[Series, Series]:
- """
- Construct an appropriately-labelled Series from the result of an op.
-
- Parameters
- ----------
- result : ndarray or ExtensionArray
- name : Label
-
- Returns
- -------
- Series
- In the case of __divmod__ or __rdivmod__, a 2-tuple of Series.
- """
- if isinstance(result, tuple):
- # produced by divmod or rdivmod
-
- res1 = self._construct_result(result[0], name=name)
- res2 = self._construct_result(result[1], name=name)
-
- # GH#33427 assertions to keep mypy happy
- assert isinstance(res1, Series)
- assert isinstance(res2, Series)
- return (res1, res2)
-
- # TODO: result should always be ArrayLike, but this fails for some
- # JSONArray tests
- dtype = getattr(result, "dtype", None)
- out = self._constructor(result, index=self.index, dtype=dtype)
- out = out.__finalize__(self)
-
- # Set the result's name after __finalize__ is called because __finalize__
- # would set it back to self.name
- out.name = name
- return out
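- # An illustrative sketch of the tuple branch above: divmod on a Series
- # yields a 2-tuple of Series (assumes the public pandas API only).
- import pandas as pd
- quotient, remainder = divmod(pd.Series([7, 8]), 3)
- print(quotient.to_list(), remainder.to_list())  # [2, 2] [1, 2]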
-
- @doc(
- _shared_docs["compare"],
- """
-Returns
--------
-Series or DataFrame
- If axis is 0 or 'index' the result will be a Series.
- The resulting index will be a MultiIndex with 'self' and 'other'
- stacked alternately at the inner level.
-
- If axis is 1 or 'columns' the result will be a DataFrame.
- It will have two columns namely 'self' and 'other'.
-
-See Also
---------
-DataFrame.compare : Compare with another DataFrame and show differences.
-
-Notes
------
-Matching NaNs will not appear as a difference.
-
-Examples
---------
->>> s1 = pd.Series(["a", "b", "c", "d", "e"])
->>> s2 = pd.Series(["a", "a", "c", "b", "e"])
-
-Align the differences on columns
-
->>> s1.compare(s2)
- self other
-1 b a
-3 d b
-
-Stack the differences on indices
-
->>> s1.compare(s2, align_axis=0)
-1 self b
- other a
-3 self d
- other b
-dtype: object
-
-Keep all original rows
-
->>> s1.compare(s2, keep_shape=True)
- self other
-0 NaN NaN
-1 b a
-2 NaN NaN
-3 d b
-4 NaN NaN
-
-Keep all original rows and also all original values
-
->>> s1.compare(s2, keep_shape=True, keep_equal=True)
- self other
-0 a a
-1 b a
-2 c c
-3 d b
-4 e e
-""",
- klass=_shared_doc_kwargs["klass"],
- )
- def compare(
- self,
- other: Series,
- align_axis: Axis = 1,
- keep_shape: bool = False,
- keep_equal: bool = False,
- result_names: Suffixes = ("self", "other"),
- ) -> DataFrame | Series:
- return super().compare(
- other=other,
- align_axis=align_axis,
- keep_shape=keep_shape,
- keep_equal=keep_equal,
- result_names=result_names,
- )
-
- def combine(
- self,
- other: Series | Hashable,
- func: Callable[[Hashable, Hashable], Hashable],
- fill_value: Hashable = None,
- ) -> Series:
- """
- Combine the Series with a Series or scalar according to `func`.
-
- Combine the Series and `other` using `func` to perform elementwise
- selection for combined Series.
- `fill_value` is used when a value is missing at some index
- in one of the two objects being combined.
-
- Parameters
- ----------
- other : Series or scalar
- The value(s) to be combined with the `Series`.
- func : function
- Function that takes two scalars as inputs and returns an element.
- fill_value : scalar, optional
- The value to assume when an index is missing from
- one Series or the other. The default specifies to use the
- appropriate NaN value for the underlying dtype of the Series.
-
- Returns
- -------
- Series
- The result of combining the Series with the other object.
-
- See Also
- --------
- Series.combine_first : Combine Series values, choosing the calling
- Series' values first.
-
- Examples
- --------
- Consider two datasets ``s1`` and ``s2`` containing
- the highest clocked speeds of different birds.
-
- >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
- >>> s1
- falcon 330.0
- eagle 160.0
- dtype: float64
- >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
- >>> s2
- falcon 345.0
- eagle 200.0
- duck 30.0
- dtype: float64
-
- Now, to combine the two datasets and view the highest speeds
- of the birds across the two datasets
-
- >>> s1.combine(s2, max)
- duck NaN
- eagle 200.0
- falcon 345.0
- dtype: float64
-
- In the previous example, the resulting value for duck is missing
- because the maximum of a NaN and a float is NaN.
- By setting ``fill_value=0``, the maximum returned is instead the
- value that is present in one of the two datasets.
-
- >>> s1.combine(s2, max, fill_value=0)
- duck 30.0
- eagle 200.0
- falcon 345.0
- dtype: float64
- """
- if fill_value is None:
- fill_value = na_value_for_dtype(self.dtype, compat=False)
-
- if isinstance(other, Series):
- # If other is a Series, result is based on union of Series,
- # so do this element by element
- new_index = self.index.union(other.index)
- new_name = ops.get_op_result_name(self, other)
- new_values = np.empty(len(new_index), dtype=object)
- for i, idx in enumerate(new_index):
- lv = self.get(idx, fill_value)
- rv = other.get(idx, fill_value)
- with np.errstate(all="ignore"):
- new_values[i] = func(lv, rv)
- else:
- # Assume that other is a scalar, so apply the function for
- # each element in the Series
- new_index = self.index
- new_values = np.empty(len(new_index), dtype=object)
- with np.errstate(all="ignore"):
- new_values[:] = [func(lv, other) for lv in self._values]
- new_name = self.name
-
- # try_float=False is to match agg_series
- npvalues = lib.maybe_convert_objects(new_values, try_float=False)
- res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False)
- return self._constructor(res_values, index=new_index, name=new_name, copy=False)
-
- def combine_first(self, other) -> Series:
- """
- Update null elements with value in the same location in 'other'.
-
- Combine two Series objects by filling null values in one Series with
- non-null values from the other Series. Result index will be the union
- of the two indexes.
-
- Parameters
- ----------
- other : Series
- The value(s) to be used for filling null values.
-
- Returns
- -------
- Series
- The result of combining the provided Series with the other object.
-
- See Also
- --------
- Series.combine : Perform element-wise operation on two Series
- using a given function.
-
- Examples
- --------
- >>> s1 = pd.Series([1, np.nan])
- >>> s2 = pd.Series([3, 4, 5])
- >>> s1.combine_first(s2)
- 0 1.0
- 1 4.0
- 2 5.0
- dtype: float64
-
- Null values still persist if the location of that null value
- does not exist in `other`
-
- >>> s1 = pd.Series({'falcon': np.nan, 'eagle': 160.0})
- >>> s2 = pd.Series({'eagle': 200.0, 'duck': 30.0})
- >>> s1.combine_first(s2)
- duck 30.0
- eagle 160.0
- falcon NaN
- dtype: float64
- """
- new_index = self.index.union(other.index)
- this = self.reindex(new_index, copy=False)
- other = other.reindex(new_index, copy=False)
- if this.dtype.kind == "M" and other.dtype.kind != "M":
- other = to_datetime(other)
-
- return this.where(notna(this), other)
-
- def update(self, other: Series | Sequence | Mapping) -> None:
- """
- Modify Series in place using values from passed Series.
-
- Uses non-NA values from passed Series to make updates. Aligns
- on index.
-
- Parameters
- ----------
- other : Series, or object coercible into Series
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3])
- >>> s.update(pd.Series([4, 5, 6]))
- >>> s
- 0 4
- 1 5
- 2 6
- dtype: int64
-
- >>> s = pd.Series(['a', 'b', 'c'])
- >>> s.update(pd.Series(['d', 'e'], index=[0, 2]))
- >>> s
- 0 d
- 1 b
- 2 e
- dtype: object
-
- >>> s = pd.Series([1, 2, 3])
- >>> s.update(pd.Series([4, 5, 6, 7, 8]))
- >>> s
- 0 4
- 1 5
- 2 6
- dtype: int64
-
- If ``other`` contains NaNs the corresponding values are not updated
- in the original Series.
-
- >>> s = pd.Series([1, 2, 3])
- >>> s.update(pd.Series([4, np.nan, 6]))
- >>> s
- 0 4
- 1 2
- 2 6
- dtype: int64
-
- ``other`` can also be a non-Series object type
- that is coercible into a Series
-
- >>> s = pd.Series([1, 2, 3])
- >>> s.update([4, np.nan, 6])
- >>> s
- 0 4
- 1 2
- 2 6
- dtype: int64
-
- >>> s = pd.Series([1, 2, 3])
- >>> s.update({1: 9})
- >>> s
- 0 1
- 1 9
- 2 3
- dtype: int64
- """
-
- if not isinstance(other, Series):
- other = Series(other)
-
- other = other.reindex_like(self)
- mask = notna(other)
-
- self._mgr = self._mgr.putmask(mask=mask, new=other)
- self._maybe_update_cacher()
-
- # ----------------------------------------------------------------------
- # Reindexing, sorting
-
- @overload
- def sort_values(
- self,
- *,
- axis: Axis = ...,
- ascending: bool | int | Sequence[bool] | Sequence[int] = ...,
- inplace: Literal[False] = ...,
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool = ...,
- key: ValueKeyFunc = ...,
- ) -> Series:
- ...
-
- @overload
- def sort_values(
- self,
- *,
- axis: Axis = ...,
- ascending: bool | int | Sequence[bool] | Sequence[int] = ...,
- inplace: Literal[True],
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool = ...,
- key: ValueKeyFunc = ...,
- ) -> None:
- ...
-
- def sort_values(
- self,
- *,
- axis: Axis = 0,
- ascending: bool | int | Sequence[bool] | Sequence[int] = True,
- inplace: bool = False,
- kind: str = "quicksort",
- na_position: str = "last",
- ignore_index: bool = False,
- key: ValueKeyFunc = None,
- ) -> Series | None:
- """
- Sort by the values.
-
- Sort a Series in ascending or descending order by some
- criterion.
-
- Parameters
- ----------
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- ascending : bool or list of bools, default True
- If True, sort values in ascending order, otherwise descending.
- inplace : bool, default False
- If True, perform operation in-place.
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
- Choice of sorting algorithm. See also :func:`numpy.sort` for more
- information. 'mergesort' and 'stable' are the only stable algorithms.
- na_position : {'first' or 'last'}, default 'last'
- Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
- the end.
- ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- key : callable, optional
- If not None, apply the key function to the series values
- before sorting. This is similar to the `key` argument in the
- builtin :meth:`sorted` function, with the notable difference that
- this `key` function should be *vectorized*. It should expect a
- ``Series`` and return an array-like.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- Series or None
- Series ordered by values or None if ``inplace=True``.
-
- See Also
- --------
- Series.sort_index : Sort by the Series indices.
- DataFrame.sort_values : Sort DataFrame by the values along either axis.
- DataFrame.sort_index : Sort DataFrame by indices.
-
- Examples
- --------
- >>> s = pd.Series([np.nan, 1, 3, 10, 5])
- >>> s
- 0 NaN
- 1 1.0
- 2 3.0
- 3 10.0
- 4 5.0
- dtype: float64
-
- Sort values in ascending order (default behaviour)
-
- >>> s.sort_values(ascending=True)
- 1 1.0
- 2 3.0
- 4 5.0
- 3 10.0
- 0 NaN
- dtype: float64
-
- Sort values in descending order
-
- >>> s.sort_values(ascending=False)
- 3 10.0
- 4 5.0
- 2 3.0
- 1 1.0
- 0 NaN
- dtype: float64
-
- Sort values putting NAs first
-
- >>> s.sort_values(na_position='first')
- 0 NaN
- 1 1.0
- 2 3.0
- 4 5.0
- 3 10.0
- dtype: float64
-
- Sort a series of strings
-
- >>> s = pd.Series(['z', 'b', 'd', 'a', 'c'])
- >>> s
- 0 z
- 1 b
- 2 d
- 3 a
- 4 c
- dtype: object
-
- >>> s.sort_values()
- 3 a
- 1 b
- 4 c
- 2 d
- 0 z
- dtype: object
-
- Sort using a key function. Your `key` function will be
- given the ``Series`` of values and should return an array-like.
-
- >>> s = pd.Series(['a', 'B', 'c', 'D', 'e'])
- >>> s.sort_values()
- 1 B
- 3 D
- 0 a
- 2 c
- 4 e
- dtype: object
- >>> s.sort_values(key=lambda x: x.str.lower())
- 0 a
- 1 B
- 2 c
- 3 D
- 4 e
- dtype: object
-
- NumPy ufuncs work well here. For example, we can
- sort by the ``sin`` of the value
-
- >>> s = pd.Series([-4, -2, 0, 2, 4])
- >>> s.sort_values(key=np.sin)
- 1 -2
- 4 4
- 2 0
- 0 -4
- 3 2
- dtype: int64
-
- More complicated user-defined functions can be used,
- as long as they expect a Series and return an array-like
-
- >>> s.sort_values(key=lambda x: (np.tan(x.cumsum())))
- 0 -4
- 3 2
- 4 4
- 1 -2
- 2 0
- dtype: int64
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- # Validate the axis parameter
- self._get_axis_number(axis)
-
- # GH 5856/5853
- if inplace and self._is_cached:
- raise ValueError(
- "This Series is a view of some other array, to "
- "sort in-place you must create a copy"
- )
-
- if is_list_like(ascending):
- ascending = cast(Sequence[Union[bool, int]], ascending)
- if len(ascending) != 1:
- raise ValueError(
- f"Length of ascending ({len(ascending)}) must be 1 for Series"
- )
- ascending = ascending[0]
-
- ascending = validate_ascending(ascending)
-
- if na_position not in ["first", "last"]:
- raise ValueError(f"invalid na_position: {na_position}")
-
- # GH 35922. Make sorting stable by leveraging nargsort
- values_to_sort = ensure_key_mapped(self, key)._values if key else self._values
- sorted_index = nargsort(values_to_sort, kind, bool(ascending), na_position)
-
- if is_range_indexer(sorted_index, len(sorted_index)):
- if inplace:
- return self._update_inplace(self)
- return self.copy(deep=None)
-
- result = self._constructor(
- self._values[sorted_index], index=self.index[sorted_index], copy=False
- )
-
- if ignore_index:
- result.index = default_index(len(sorted_index))
-
- if not inplace:
- return result.__finalize__(self, method="sort_values")
- self._update_inplace(result)
- return None
-
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: Literal[True],
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> None:
- ...
-
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: Literal[False] = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> Series:
- ...
-
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: bool = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> Series | None:
- ...
-
- def sort_index(
- self,
- *,
- axis: Axis = 0,
- level: IndexLabel = None,
- ascending: bool | Sequence[bool] = True,
- inplace: bool = False,
- kind: SortKind = "quicksort",
- na_position: NaPosition = "last",
- sort_remaining: bool = True,
- ignore_index: bool = False,
- key: IndexKeyFunc = None,
- ) -> Series | None:
- """
- Sort Series by index labels.
-
- Returns a new Series sorted by label if `inplace` argument is
- ``False``, otherwise updates the original series and returns None.
-
- Parameters
- ----------
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- level : int, optional
- If not None, sort on values in specified index level(s).
- ascending : bool or list-like of bools, default True
- Sort ascending vs. descending. When the index is a MultiIndex the
- sort direction can be controlled for each level individually.
- inplace : bool, default False
- If True, perform operation in-place.
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
- Choice of sorting algorithm. See also :func:`numpy.sort` for more
- information. 'mergesort' and 'stable' are the only stable algorithms. For
- DataFrames, this option is only applied when sorting on a single
- column or label.
- na_position : {'first', 'last'}, default 'last'
- If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end.
- Not implemented for MultiIndex.
- sort_remaining : bool, default True
- If True and sorting by level and the index is a MultiIndex, sort by
- other levels too (in order) after sorting by the specified level.
- ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- key : callable, optional
- If not None, apply the key function to the index values
- before sorting. This is similar to the `key` argument in the
- builtin :meth:`sorted` function, with the notable difference that
- this `key` function should be *vectorized*. It should expect an
- ``Index`` and return an ``Index`` of the same shape.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- Series or None
- The original Series sorted by the labels or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.sort_index: Sort DataFrame by the index.
- DataFrame.sort_values: Sort DataFrame by the value.
- Series.sort_values : Sort Series by the value.
-
- Examples
- --------
- >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4])
- >>> s.sort_index()
- 1 c
- 2 b
- 3 a
- 4 d
- dtype: object
-
- Sort Descending
-
- >>> s.sort_index(ascending=False)
- 4 d
- 3 a
- 2 b
- 1 c
- dtype: object
-
- By default NaNs are put at the end, but use `na_position` to place
- them at the beginning
-
- >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, np.nan])
- >>> s.sort_index(na_position='first')
- NaN d
- 1.0 c
- 2.0 b
- 3.0 a
- dtype: object
-
- Specify index level to sort
-
- >>> arrays = [np.array(['qux', 'qux', 'foo', 'foo',
- ... 'baz', 'baz', 'bar', 'bar']),
- ... np.array(['two', 'one', 'two', 'one',
- ... 'two', 'one', 'two', 'one'])]
- >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays)
- >>> s.sort_index(level=1)
- bar one 8
- baz one 6
- foo one 4
- qux one 2
- bar two 7
- baz two 5
- foo two 3
- qux two 1
- dtype: int64
-
- Does not sort by the remaining levels when sorting by a specified level
-
- >>> s.sort_index(level=1, sort_remaining=False)
- qux one 2
- foo one 4
- baz one 6
- bar one 8
- qux two 1
- foo two 3
- baz two 5
- bar two 7
- dtype: int64
-
- Apply a key function before sorting
-
- >>> s = pd.Series([1, 2, 3, 4], index=['A', 'b', 'C', 'd'])
- >>> s.sort_index(key=lambda x : x.str.lower())
- A 1
- b 2
- C 3
- d 4
- dtype: int64
- """
-
- return super().sort_index(
- axis=axis,
- level=level,
- ascending=ascending,
- inplace=inplace,
- kind=kind,
- na_position=na_position,
- sort_remaining=sort_remaining,
- ignore_index=ignore_index,
- key=key,
- )
-
- def argsort(
- self,
- axis: Axis = 0,
- kind: SortKind = "quicksort",
- order: None = None,
- ) -> Series:
- """
- Return the integer indices that would sort the Series values.
-
- Override ndarray.argsort. Argsorts the value, omitting NA/null values,
- and places the result in the same locations as the non-NA values.
-
- Parameters
- ----------
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort'
- Choice of sorting algorithm. See :func:`numpy.sort` for more
- information. 'mergesort' and 'stable' are the only stable algorithms.
- order : None
- Has no effect but is accepted for compatibility with numpy.
-
- Returns
- -------
- Series[np.intp]
- Positions of values within the sort order with -1 indicating
- nan values.
-
- See Also
- --------
- numpy.ndarray.argsort : Returns the indices that would sort this array.
- """
- values = self._values
- mask = isna(values)
-
- if mask.any():
- result = np.full(len(self), -1, dtype=np.intp)
- notmask = ~mask
- result[notmask] = np.argsort(values[notmask], kind=kind)
- else:
- result = np.argsort(values, kind=kind)
-
- res = self._constructor(
- result, index=self.index, name=self.name, dtype=np.intp, copy=False
- )
- return res.__finalize__(self, method="argsort")
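- # A small, illustrative sketch of the -1 placeholder for NaN positions
- # described in the docstring (assumes the public pandas/numpy APIs only).
- import numpy as np
- import pandas as pd
- s = pd.Series([3.0, np.nan, 1.0])
- print(s.argsort().to_list())  # [1, -1, 0]; -1 marks the NaN position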
-
- def nlargest(
- self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
- ) -> Series:
- """
- Return the largest `n` elements.
-
- Parameters
- ----------
- n : int, default 5
- Return this many descending sorted values.
- keep : {'first', 'last', 'all'}, default 'first'
- When there are duplicate values that cannot all fit in a
- Series of `n` elements:
-
- - ``first`` : return the first `n` occurrences in order
- of appearance.
- - ``last`` : return the last `n` occurrences in reverse
- order of appearance.
- - ``all`` : keep all occurrences. This can result in a Series of
- size larger than `n`.
-
- Returns
- -------
- Series
- The `n` largest values in the Series, sorted in decreasing order.
-
- See Also
- --------
- Series.nsmallest: Get the `n` smallest elements.
- Series.sort_values: Sort Series by values.
- Series.head: Return the first `n` rows.
-
- Notes
- -----
- Faster than ``.sort_values(ascending=False).head(n)`` for small `n`
- relative to the size of the ``Series`` object.
-
- Examples
- --------
- >>> countries_population = {"Italy": 59000000, "France": 65000000,
- ... "Malta": 434000, "Maldives": 434000,
- ... "Brunei": 434000, "Iceland": 337000,
- ... "Nauru": 11300, "Tuvalu": 11300,
- ... "Anguilla": 11300, "Montserrat": 5200}
- >>> s = pd.Series(countries_population)
- >>> s
- Italy 59000000
- France 65000000
- Malta 434000
- Maldives 434000
- Brunei 434000
- Iceland 337000
- Nauru 11300
- Tuvalu 11300
- Anguilla 11300
- Montserrat 5200
- dtype: int64
-
- The `n` largest elements where ``n=5`` by default.
-
- >>> s.nlargest()
- France 65000000
- Italy 59000000
- Malta 434000
- Maldives 434000
- Brunei 434000
- dtype: int64
-
- The `n` largest elements where ``n=3``. Default `keep` value is 'first'
- so Malta will be kept.
-
- >>> s.nlargest(3)
- France 65000000
- Italy 59000000
- Malta 434000
- dtype: int64
-
- The `n` largest elements where ``n=3`` and keeping the last duplicates.
- Brunei will be kept since it is the last with value 434000 based on
- the index order.
-
- >>> s.nlargest(3, keep='last')
- France 65000000
- Italy 59000000
- Brunei 434000
- dtype: int64
-
- The `n` largest elements where ``n=3`` with all duplicates kept. Note
- that the returned Series has five elements due to the three duplicates.
-
- >>> s.nlargest(3, keep='all')
- France 65000000
- Italy 59000000
- Malta 434000
- Maldives 434000
- Brunei 434000
- dtype: int64
- """
- return selectn.SelectNSeries(self, n=n, keep=keep).nlargest()
-
- def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
- """
- Return the smallest `n` elements.
-
- Parameters
- ----------
- n : int, default 5
- Return this many ascending sorted values.
- keep : {'first', 'last', 'all'}, default 'first'
- When there are duplicate values that cannot all fit in a
- Series of `n` elements:
-
- - ``first`` : return the first `n` occurrences in order
- of appearance.
- - ``last`` : return the last `n` occurrences in reverse
- order of appearance.
- - ``all`` : keep all occurrences. This can result in a Series of
- size larger than `n`.
-
- Returns
- -------
- Series
- The `n` smallest values in the Series, sorted in increasing order.
-
- See Also
- --------
- Series.nlargest: Get the `n` largest elements.
- Series.sort_values: Sort Series by values.
- Series.head: Return the first `n` rows.
-
- Notes
- -----
- Faster than ``.sort_values().head(n)`` for small `n` relative to
- the size of the ``Series`` object.
-
- Examples
- --------
- >>> countries_population = {"Italy": 59000000, "France": 65000000,
- ... "Brunei": 434000, "Malta": 434000,
- ... "Maldives": 434000, "Iceland": 337000,
- ... "Nauru": 11300, "Tuvalu": 11300,
- ... "Anguilla": 11300, "Montserrat": 5200}
- >>> s = pd.Series(countries_population)
- >>> s
- Italy 59000000
- France 65000000
- Brunei 434000
- Malta 434000
- Maldives 434000
- Iceland 337000
- Nauru 11300
- Tuvalu 11300
- Anguilla 11300
- Montserrat 5200
- dtype: int64
-
- The `n` smallest elements where ``n=5`` by default.
-
- >>> s.nsmallest()
- Montserrat 5200
- Nauru 11300
- Tuvalu 11300
- Anguilla 11300
- Iceland 337000
- dtype: int64
-
- The `n` smallest elements where ``n=3``. Default `keep` value is
- 'first' so Nauru and Tuvalu will be kept.
-
- >>> s.nsmallest(3)
- Montserrat 5200
- Nauru 11300
- Tuvalu 11300
- dtype: int64
-
- The `n` smallest elements where ``n=3`` and keeping the last
- duplicates. Anguilla and Tuvalu will be kept since they are the last
- with value 11300 based on the index order.
-
- >>> s.nsmallest(3, keep='last')
- Montserrat 5200
- Anguilla 11300
- Tuvalu 11300
- dtype: int64
-
- The `n` smallest elements where ``n=3`` with all duplicates kept. Note
- that the returned Series has four elements due to the three duplicates.
-
- >>> s.nsmallest(3, keep='all')
- Montserrat 5200
- Nauru 11300
- Tuvalu 11300
- Anguilla 11300
- dtype: int64
- """
- return selectn.SelectNSeries(self, n=n, keep=keep).nsmallest()
-
- @doc(
- klass=_shared_doc_kwargs["klass"],
- extra_params=dedent(
- """copy : bool, default True
- Whether to copy underlying data."""
- ),
- examples=dedent(
- """\
- Examples
- --------
- >>> s = pd.Series(
- ... ["A", "B", "A", "C"],
- ... index=[
- ... ["Final exam", "Final exam", "Coursework", "Coursework"],
- ... ["History", "Geography", "History", "Geography"],
- ... ["January", "February", "March", "April"],
- ... ],
- ... )
- >>> s
- Final exam History January A
- Geography February B
- Coursework History March A
- Geography April C
- dtype: object
-
- In the following example, we will swap the levels of the index.
- A Series has only the one (row) index, so the swap always applies to
- its levels. By not supplying any arguments for i and j, we swap the
- last and second-to-last levels.
-
- >>> s.swaplevel()
- Final exam January History A
- February Geography B
- Coursework March History A
- April Geography C
- dtype: object
-
- By supplying one argument, we can choose which level to swap the last
- level with. We can for example swap the first level with the last one as
- follows.
-
- >>> s.swaplevel(0)
- January History Final exam A
- February Geography Final exam B
- March History Coursework A
- April Geography Coursework C
- dtype: object
-
- We can also define explicitly which levels we want to swap by supplying
- values for both i and j. Here, for example, we swap the first and second levels.
-
- >>> s.swaplevel(0, 1)
- History Final exam January A
- Geography Final exam February B
- History Coursework March A
- Geography Coursework April C
- dtype: object"""
- ),
- )
- def swaplevel(
- self, i: Level = -2, j: Level = -1, copy: bool | None = None
- ) -> Series:
- """
- Swap levels i and j in a :class:`MultiIndex`.
-
- Default is to swap the two innermost levels of the index.
-
- Parameters
- ----------
- i, j : int or str
- Levels of the indices to be swapped. Can pass level name as string.
- {extra_params}
-
- Returns
- -------
- {klass}
- {klass} with levels swapped in MultiIndex.
-
- {examples}
- """
- assert isinstance(self.index, MultiIndex)
- result = self.copy(deep=copy and not using_copy_on_write())
- result.index = self.index.swaplevel(i, j)
- return result
-
- def reorder_levels(self, order: Sequence[Level]) -> Series:
- """
- Rearrange index levels using input order.
-
- May not drop or duplicate levels.
-
- Parameters
- ----------
- order : list of int representing new level order
- Reference level by number or key.
-
- Returns
- -------
- type of caller (new object)
- """
- if not isinstance(self.index, MultiIndex): # pragma: no cover
- raise Exception("Can only reorder levels on a hierarchical axis.")
-
- result = self.copy(deep=None)
- assert isinstance(result.index, MultiIndex)
- result.index = result.index.reorder_levels(order)
- return result
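- # A minimal, illustrative sketch of reordering MultiIndex levels by
- # position (assumes the public pandas API only).
- import pandas as pd
- midx = pd.MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 1]], names=["outer", "inner"])
- s = pd.Series([10, 20, 30], index=midx)
- print(list(s.reorder_levels([1, 0]).index.names))  # ['inner', 'outer']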
-
- def explode(self, ignore_index: bool = False) -> Series:
- """
- Transform each element of a list-like to a row.
-
- Parameters
- ----------
- ignore_index : bool, default False
- If True, the resulting index will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- Series
- Exploded lists to rows; index will be duplicated for these rows.
-
- See Also
- --------
- Series.str.split : Split string values on specified separator.
- Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex
- to produce DataFrame.
- DataFrame.melt : Unpivot a DataFrame from wide format to long format.
- DataFrame.explode : Explode a DataFrame from list-like
- columns to long format.
-
- Notes
- -----
- This routine will explode list-likes including lists, tuples, sets,
- Series, and np.ndarray. The result dtype of the subset rows will
- be object. Scalars will be returned unchanged, and empty list-likes will
- result in a np.nan for that row. In addition, the ordering of elements in
- the output will be non-deterministic when exploding sets.
-
- Reference :ref:`the user guide <reshaping.explode>` for more examples.
-
- Examples
- --------
- >>> s = pd.Series([[1, 2, 3], 'foo', [], [3, 4]])
- >>> s
- 0 [1, 2, 3]
- 1 foo
- 2 []
- 3 [3, 4]
- dtype: object
-
- >>> s.explode()
- 0 1
- 0 2
- 0 3
- 1 foo
- 2 NaN
- 3 3
- 3 4
- dtype: object
- """
- if not len(self) or not is_object_dtype(self):
- result = self.copy()
- return result.reset_index(drop=True) if ignore_index else result
-
- values, counts = reshape.explode(np.asarray(self._values))
-
- if ignore_index:
- index = default_index(len(values))
- else:
- index = self.index.repeat(counts)
-
- return self._constructor(values, index=index, name=self.name, copy=False)
-
- def unstack(self, level: IndexLabel = -1, fill_value: Hashable = None) -> DataFrame:
- """
- Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
-
- Parameters
- ----------
- level : int, str, or list of these, default last level
- Level(s) to unstack, can pass level name.
- fill_value : scalar value, default None
- Value to use when replacing NaN values.
-
- Returns
- -------
- DataFrame
- Unstacked Series.
-
- Notes
- -----
- Reference :ref:`the user guide <reshaping.stacking>` for more examples.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4],
- ... index=pd.MultiIndex.from_product([['one', 'two'],
- ... ['a', 'b']]))
- >>> s
- one a 1
- b 2
- two a 3
- b 4
- dtype: int64
-
- >>> s.unstack(level=-1)
- a b
- one 1 2
- two 3 4
-
- >>> s.unstack(level=0)
- one two
- a 1 3
- b 2 4
- """
- from pandas.core.reshape.reshape import unstack
-
- return unstack(self, level, fill_value)
-
- # ----------------------------------------------------------------------
- # function application
-
- def map(
- self,
- arg: Callable | Mapping | Series,
- na_action: Literal["ignore"] | None = None,
- ) -> Series:
- """
- Map values of Series according to an input mapping or function.
-
- Used for substituting each value in a Series with another value
- that may be derived from a function, a ``dict`` or
- a :class:`Series`.
-
- Parameters
- ----------
- arg : function, collections.abc.Mapping subclass or Series
- Mapping correspondence.
- na_action : {None, 'ignore'}, default None
- If 'ignore', propagate NaN values, without passing them to the
- mapping correspondence.
-
- Returns
- -------
- Series
- Same index as caller.
-
- See Also
- --------
- Series.apply : For applying more complex functions on a Series.
- DataFrame.apply : Apply a function row-/column-wise.
- DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
-
- Notes
- -----
- When ``arg`` is a dictionary, values in Series that are not in the
- dictionary (as keys) are converted to ``NaN``. However, if the
- dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
- provides a method for default values), then this default is used
- rather than ``NaN``.
-
- Examples
- --------
- >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
- >>> s
- 0 cat
- 1 dog
- 2 NaN
- 3 rabbit
- dtype: object
-
- ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
- in the ``dict`` are converted to ``NaN``, unless the dict has a default
- value (e.g. ``defaultdict``):
-
- >>> s.map({'cat': 'kitten', 'dog': 'puppy'})
- 0 kitten
- 1 puppy
- 2 NaN
- 3 NaN
- dtype: object
-
- It also accepts a function:
-
- >>> s.map('I am a {}'.format)
- 0 I am a cat
- 1 I am a dog
- 2 I am a nan
- 3 I am a rabbit
- dtype: object
-
- To avoid applying the function to missing values (and keep them as
- ``NaN``) ``na_action='ignore'`` can be used:
-
- >>> s.map('I am a {}'.format, na_action='ignore')
- 0 I am a cat
- 1 I am a dog
- 2 NaN
- 3 I am a rabbit
- dtype: object
- """
- new_values = self._map_values(arg, na_action=na_action)
- return self._constructor(new_values, index=self.index, copy=False).__finalize__(
- self, method="map"
- )
-
- def _gotitem(self, key, ndim, subset=None) -> Series:
- """
- Sub-classes to define. Return a sliced object.
-
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- Requested ndim of result.
- subset : object, default None
- Subset to act on.
- """
- return self
-
- _agg_see_also_doc = dedent(
- """
- See Also
- --------
- Series.apply : Invoke function on a Series.
- Series.transform : Transform function producing a Series with like indexes.
- """
- )
-
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4])
- >>> s
- 0 1
- 1 2
- 2 3
- 3 4
- dtype: int64
-
- >>> s.agg('min')
- 1
-
- >>> s.agg(['min', 'max'])
- min 1
- max 4
- dtype: int64
- """
- )
-
- @doc(
- _shared_docs["aggregate"],
- klass=_shared_doc_kwargs["klass"],
- axis=_shared_doc_kwargs["axis"],
- see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- )
- def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
- # Validate the axis parameter
- self._get_axis_number(axis)
-
- # if func is None, will switch to user-provided "named aggregation" kwargs
- if func is None:
- func = dict(kwargs.items())
-
- op = SeriesApply(self, func, convert_dtype=False, args=args, kwargs=kwargs)
- result = op.agg()
- return result
-
- agg = aggregate
-
- # error: Signature of "any" incompatible with supertype "NDFrame" [override]
- @overload # type: ignore[override]
- def any(
- self,
- *,
- axis: Axis = ...,
- bool_only: bool | None = ...,
- skipna: bool = ...,
- level: None = ...,
- **kwargs,
- ) -> bool:
- ...
-
- @overload
- def any(
- self,
- *,
- axis: Axis = ...,
- bool_only: bool | None = ...,
- skipna: bool = ...,
- level: Level,
- **kwargs,
- ) -> Series | bool:
- ...
-
- # error: Missing return statement
- @doc(NDFrame.any, **_shared_doc_kwargs)
- def any( # type: ignore[empty-body]
- self,
- axis: Axis = 0,
- bool_only: bool | None = None,
- skipna: bool = True,
- level: Level | None = None,
- **kwargs,
- ) -> Series | bool:
- ...
-
- @doc(
- _shared_docs["transform"],
- klass=_shared_doc_kwargs["klass"],
- axis=_shared_doc_kwargs["axis"],
- )
- def transform(
- self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
- ) -> DataFrame | Series:
- # Validate axis argument
- self._get_axis_number(axis)
- result = SeriesApply(
- self, func=func, convert_dtype=True, args=args, kwargs=kwargs
- ).transform()
- return result
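- # A brief, illustrative sketch of transform; the callable must return
- # an array-like of the same length (assumes the public pandas API only).
- import pandas as pd
- s = pd.Series([1, 2, 3])
- print(s.transform(lambda x: x * 10).to_list())  # [10, 20, 30]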
-
- def apply(
- self,
- func: AggFuncType,
- convert_dtype: bool = True,
- args: tuple[Any, ...] = (),
- **kwargs,
- ) -> DataFrame | Series:
- """
- Invoke function on values of Series.
-
- Can be ufunc (a NumPy function that applies to the entire Series)
- or a Python function that only works on single values.
-
- Parameters
- ----------
- func : function
- Python function or NumPy ufunc to apply.
- convert_dtype : bool, default True
- Try to find better dtype for elementwise function results. If
- False, leave as dtype=object. Note that the dtype is always
- preserved for some extension array dtypes, such as Categorical.
- args : tuple
- Positional arguments passed to func after the series value.
- **kwargs
- Additional keyword arguments passed to func.
-
- Returns
- -------
- Series or DataFrame
- If func returns a Series object the result will be a DataFrame.
-
- See Also
- --------
- Series.map: For element-wise operations.
- Series.agg: Only perform aggregating type operations.
- Series.transform: Only perform transforming type operations.
-
- Notes
- -----
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
-
- Examples
- --------
- Create a series with typical summer temperatures for each city.
-
- >>> s = pd.Series([20, 21, 12],
- ... index=['London', 'New York', 'Helsinki'])
- >>> s
- London 20
- New York 21
- Helsinki 12
- dtype: int64
-
- Square the values by defining a function and passing it as an
- argument to ``apply()``.
-
- >>> def square(x):
- ... return x ** 2
- >>> s.apply(square)
- London 400
- New York 441
- Helsinki 144
- dtype: int64
-
- Square the values by passing an anonymous function as an
- argument to ``apply()``.
-
- >>> s.apply(lambda x: x ** 2)
- London 400
- New York 441
- Helsinki 144
- dtype: int64
-
- Define a custom function that needs additional positional
- arguments and pass these additional arguments using the
- ``args`` keyword.
-
- >>> def subtract_custom_value(x, custom_value):
- ... return x - custom_value
-
- >>> s.apply(subtract_custom_value, args=(5,))
- London 15
- New York 16
- Helsinki 7
- dtype: int64
-
- Define a custom function that takes keyword arguments
- and pass these arguments to ``apply``.
-
- >>> def add_custom_values(x, **kwargs):
- ... for month in kwargs:
- ... x += kwargs[month]
- ... return x
-
- >>> s.apply(add_custom_values, june=30, july=20, august=25)
- London 95
- New York 96
- Helsinki 87
- dtype: int64
-
- Use a function from the Numpy library.
-
- >>> s.apply(np.log)
- London 2.995732
- New York 3.044522
- Helsinki 2.484907
- dtype: float64
- """
- return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
-
- def _reduce(
- self,
- op,
- name: str,
- *,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- filter_type=None,
- **kwds,
- ):
- """
- Perform a reduction operation.
-
- If we have an ndarray as a value, then simply perform the operation,
- otherwise delegate to the object.
- """
- delegate = self._values
-
- if axis is not None:
- self._get_axis_number(axis)
-
- if isinstance(delegate, ExtensionArray):
- # dispatch to ExtensionArray interface
- return delegate._reduce(name, skipna=skipna, **kwds)
-
- else:
- # dispatch to numpy arrays
- if numeric_only and not is_numeric_dtype(self.dtype):
- kwd_name = "numeric_only"
- if name in ["any", "all"]:
- kwd_name = "bool_only"
- # GH#47500 - change to TypeError to match other methods
- raise TypeError(
- f"Series.{name} does not allow {kwd_name}={numeric_only} "
- "with non-numeric dtypes."
- )
- with np.errstate(all="ignore"):
- return op(delegate, skipna=skipna, **kwds)
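- # An illustrative sketch of the ExtensionArray dispatch branch above:
- # masked dtypes reduce via their ExtensionArray (public pandas API only).
- import pandas as pd
- s = pd.Series([1, 2, None], dtype="Int64")
- print(s.sum())  # 3 -- handled by the Int64 ExtensionArray, skipna=True by default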
-
- def _reindex_indexer(
- self,
- new_index: Index | None,
- indexer: npt.NDArray[np.intp] | None,
- copy: bool | None,
- ) -> Series:
- # Note: new_index is None iff indexer is None
- # if not None, indexer is np.intp
- if indexer is None and (
- new_index is None or new_index.names == self.index.names
- ):
- if using_copy_on_write():
- return self.copy(deep=copy)
- if copy or copy is None:
- return self.copy(deep=copy)
- return self
-
- new_values = algorithms.take_nd(
- self._values, indexer, allow_fill=True, fill_value=None
- )
- return self._constructor(new_values, index=new_index, copy=False)
-
- def _needs_reindex_multi(self, axes, method, level) -> bool:
- """
- Check if we do need a multi reindex; this is for compat with
- higher dims.
- """
- return False
-
- # error: Cannot determine type of 'align'
- @doc(
- NDFrame.align, # type: ignore[has-type]
- klass=_shared_doc_kwargs["klass"],
- axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
- )
- def align(
- self,
- other: Series,
- join: AlignJoin = "outer",
- axis: Axis | None = None,
- level: Level = None,
- copy: bool | None = None,
- fill_value: Hashable = None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- fill_axis: Axis = 0,
- broadcast_axis: Axis | None = None,
- ) -> Series:
- return super().align(
- other,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- broadcast_axis=broadcast_axis,
- )
-
- @overload
- def rename(
- self,
- index: Renamer | Hashable | None = ...,
- *,
- axis: Axis | None = ...,
- copy: bool = ...,
- inplace: Literal[True],
- level: Level | None = ...,
- errors: IgnoreRaise = ...,
- ) -> None:
- ...
-
- @overload
- def rename(
- self,
- index: Renamer | Hashable | None = ...,
- *,
- axis: Axis | None = ...,
- copy: bool = ...,
- inplace: Literal[False] = ...,
- level: Level | None = ...,
- errors: IgnoreRaise = ...,
- ) -> Series:
- ...
-
- @overload
- def rename(
- self,
- index: Renamer | Hashable | None = ...,
- *,
- axis: Axis | None = ...,
- copy: bool = ...,
- inplace: bool = ...,
- level: Level | None = ...,
- errors: IgnoreRaise = ...,
- ) -> Series | None:
- ...
-
- def rename(
- self,
- index: Renamer | Hashable | None = None,
- *,
- axis: Axis | None = None,
- copy: bool | None = None,
- inplace: bool = False,
- level: Level | None = None,
- errors: IgnoreRaise = "ignore",
- ) -> Series | None:
- """
- Alter Series index labels or name.
-
- Function / dict values must be unique (1-to-1). Labels not contained in
- a dict / Series will be left as-is. Extra labels listed don't throw an
- error.
-
- Alternatively, change ``Series.name`` with a scalar value.
-
- See the :ref:`user guide <basics.rename>` for more.
-
- Parameters
- ----------
- index : scalar, hashable sequence, dict-like or function, optional
- Functions or dict-like are transformations to apply to
- the index.
- Scalar or hashable sequence-like will alter the ``Series.name``
- attribute.
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- copy : bool, default True
- Also copy underlying data.
- inplace : bool, default False
- Whether to return a new Series. If True the value of copy is ignored.
- level : int or level name, default None
- In case of MultiIndex, only rename labels in the specified level.
- errors : {'ignore', 'raise'}, default 'ignore'
- If 'raise', raise `KeyError` when a `dict-like mapper` or
- `index` contains labels that are not present in the index being transformed.
- If 'ignore', existing keys will be renamed and extra keys will be ignored.
-
- Returns
- -------
- Series or None
- Series with index labels or name altered or None if ``inplace=True``.
-
- See Also
- --------
- DataFrame.rename : Corresponding DataFrame method.
- Series.rename_axis : Set the name of the axis.
-
- Examples
- --------
- >>> s = pd.Series([1, 2, 3])
- >>> s
- 0 1
- 1 2
- 2 3
- dtype: int64
- >>> s.rename("my_name") # scalar, changes Series.name
- 0 1
- 1 2
- 2 3
- Name: my_name, dtype: int64
- >>> s.rename(lambda x: x ** 2) # function, changes labels
- 0 1
- 1 2
- 4 3
- dtype: int64
- >>> s.rename({1: 3, 2: 5}) # mapping, changes labels
- 0 1
- 3 2
- 5 3
- dtype: int64
- """
- if axis is not None:
- # Make sure we raise if an invalid 'axis' is passed.
- axis = self._get_axis_number(axis)
-
- if callable(index) or is_dict_like(index):
- # error: Argument 1 to "_rename" of "NDFrame" has incompatible
- # type "Union[Union[Mapping[Any, Hashable], Callable[[Any],
- # Hashable]], Hashable, None]"; expected "Union[Mapping[Any,
- # Hashable], Callable[[Any], Hashable], None]"
- return super()._rename(
- index, # type: ignore[arg-type]
- copy=copy,
- inplace=inplace,
- level=level,
- errors=errors,
- )
- else:
- return self._set_name(index, inplace=inplace, deep=copy)
-
- @Appender(
- """
- Examples
- --------
- >>> s = pd.Series([1, 2, 3])
- >>> s
- 0 1
- 1 2
- 2 3
- dtype: int64
-
- >>> s.set_axis(['a', 'b', 'c'], axis=0)
- a 1
- b 2
- c 3
- dtype: int64
- """
- )
- @Substitution(
- **_shared_doc_kwargs,
- extended_summary_sub="",
- axis_description_sub="",
- see_also_sub="",
- )
- @Appender(NDFrame.set_axis.__doc__)
- def set_axis(
- self,
- labels,
- *,
- axis: Axis = 0,
- copy: bool | None = None,
- ) -> Series:
- return super().set_axis(labels, axis=axis, copy=copy)
-
- # error: Cannot determine type of 'reindex'
- @doc(
- NDFrame.reindex, # type: ignore[has-type]
- klass=_shared_doc_kwargs["klass"],
- optional_reindex=_shared_doc_kwargs["optional_reindex"],
- )
- def reindex( # type: ignore[override]
- self,
- index=None,
- *,
- axis: Axis | None = None,
- method: str | None = None,
- copy: bool | None = None,
- level: Level | None = None,
- fill_value: Scalar | None = None,
- limit: int | None = None,
- tolerance=None,
- ) -> Series:
- return super().reindex(
- index=index,
- method=method,
- copy=copy,
- level=level,
- fill_value=fill_value,
- limit=limit,
- tolerance=tolerance,
- )
-
- @doc(NDFrame.rename_axis)
- def rename_axis( # type: ignore[override]
- self: Series,
- mapper: IndexLabel | lib.NoDefault = lib.no_default,
- *,
- index=lib.no_default,
- axis: Axis = 0,
- copy: bool = True,
- inplace: bool = False,
- ) -> Series | None:
- return super().rename_axis(
- mapper=mapper,
- index=index,
- axis=axis,
- copy=copy,
- inplace=inplace,
- )
-
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level | None = ...,
- inplace: Literal[True],
- errors: IgnoreRaise = ...,
- ) -> None:
- ...
-
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level | None = ...,
- inplace: Literal[False] = ...,
- errors: IgnoreRaise = ...,
- ) -> Series:
- ...
-
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level | None = ...,
- inplace: bool = ...,
- errors: IgnoreRaise = ...,
- ) -> Series | None:
- ...
-
- def drop(
- self,
- labels: IndexLabel = None,
- *,
- axis: Axis = 0,
- index: IndexLabel = None,
- columns: IndexLabel = None,
- level: Level | None = None,
- inplace: bool = False,
- errors: IgnoreRaise = "raise",
- ) -> Series | None:
- """
- Return Series with specified index labels removed.
-
- Remove elements of a Series based on specifying the index labels.
- When using a multi-index, labels on different levels can be removed
- by specifying the level.
-
- Parameters
- ----------
- labels : single label or list-like
- Index labels to drop.
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- index : single label or list-like
- Redundant for application on Series, but 'index' can be used instead
- of 'labels'.
- columns : single label or list-like
- No change is made to the Series; use 'index' or 'labels' instead.
- level : int or level name, optional
- For MultiIndex, level for which the labels will be removed.
- inplace : bool, default False
- If True, do operation inplace and return None.
- errors : {'ignore', 'raise'}, default 'raise'
- If 'ignore', suppress error and only existing labels are dropped.
-
- Returns
- -------
- Series or None
- Series with specified index labels removed or None if ``inplace=True``.
-
- Raises
- ------
- KeyError
- If none of the labels are found in the index.
-
- See Also
- --------
- Series.reindex : Return only specified index labels of Series.
- Series.dropna : Return series without null values.
- Series.drop_duplicates : Return Series with duplicate values removed.
- DataFrame.drop : Drop specified labels from rows or columns.
-
- Examples
- --------
- >>> s = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
- >>> s
- A 0
- B 1
- C 2
- dtype: int64
-
- Drop labels B and C
-
- >>> s.drop(labels=['B', 'C'])
- A 0
- dtype: int64
-
- Drop 2nd level label in MultiIndex Series
-
- >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
- ... ['speed', 'weight', 'length']],
- ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
- ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
- >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
- ... index=midx)
- >>> s
- lama speed 45.0
- weight 200.0
- length 1.2
- cow speed 30.0
- weight 250.0
- length 1.5
- falcon speed 320.0
- weight 1.0
- length 0.3
- dtype: float64
-
- >>> s.drop(labels='weight', level=1)
- lama speed 45.0
- length 1.2
- cow speed 30.0
- length 1.5
- falcon speed 320.0
- length 0.3
- dtype: float64
- """
- return super().drop(
- labels=labels,
- axis=axis,
- index=index,
- columns=columns,
- level=level,
- inplace=inplace,
- errors=errors,
- )
-
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> Series:
- ...
-
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[True],
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: bool = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> Series | None:
- ...
-
- @doc(NDFrame.fillna, **_shared_doc_kwargs)
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = None,
- *,
- method: FillnaOptions | None = None,
- axis: Axis | None = None,
- inplace: bool = False,
- limit: int | None = None,
- downcast: dict | None = None,
- ) -> Series | None:
- return super().fillna(
- value=value,
- method=method,
- axis=axis,
- inplace=inplace,
- limit=limit,
- downcast=downcast,
- )
-
- def pop(self, item: Hashable) -> Any:
- """
- Return item and drop it from the series. Raise KeyError if not found.
-
- Parameters
- ----------
- item : label
- Index of the element that needs to be removed.
-
- Returns
- -------
- Value that is popped from series.
-
- Examples
- --------
- >>> ser = pd.Series([1, 2, 3])
-
- >>> ser.pop(0)
- 1
-
- >>> ser
- 1 2
- 2 3
- dtype: int64
- """
- return super().pop(item=item)
-
- @overload
- def replace(
- self,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- regex: bool = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> Series:
- ...
-
- @overload
- def replace(
- self,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[True],
- limit: int | None = ...,
- regex: bool = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> None:
- ...
-
- @doc(
- NDFrame.replace,
- klass=_shared_doc_kwargs["klass"],
- inplace=_shared_doc_kwargs["inplace"],
- replace_iloc=_shared_doc_kwargs["replace_iloc"],
- )
- def replace(
- self,
- to_replace=None,
- value=lib.no_default,
- *,
- inplace: bool = False,
- limit: int | None = None,
- regex: bool = False,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
- ) -> Series | None:
- return super().replace(
- to_replace=to_replace,
- value=value,
- inplace=inplace,
- limit=limit,
- regex=regex,
- method=method,
- )
-
- @doc(INFO_DOCSTRING, **series_sub_kwargs)
- def info(
- self,
- verbose: bool | None = None,
- buf: IO[str] | None = None,
- max_cols: int | None = None,
- memory_usage: bool | str | None = None,
- show_counts: bool = True,
- ) -> None:
- return SeriesInfo(self, memory_usage).render(
- buf=buf,
- max_cols=max_cols,
- verbose=verbose,
- show_counts=show_counts,
- )
-
- def _replace_single(self, to_replace, method: str, inplace: bool, limit):
- """
- Replace values in a Series using the fill method specified when no
- replacement value is given in the replace method.
- """
-
- result = self if inplace else self.copy()
-
- values = result._values
- mask = missing.mask_missing(values, to_replace)
-
- if isinstance(values, ExtensionArray):
- # dispatch to the EA's _pad_mask_inplace method
- values._fill_mask_inplace(method, limit, mask)
- else:
- fill_f = missing.get_fill_func(method)
- fill_f(values, limit=limit, mask=mask)
-
- if inplace:
- return
- return result
-
- # error: Cannot determine type of 'shift'
- @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
- def shift(
- self, periods: int = 1, freq=None, axis: Axis = 0, fill_value: Hashable = None
- ) -> Series:
- return super().shift(
- periods=periods, freq=freq, axis=axis, fill_value=fill_value
- )
-
- def memory_usage(self, index: bool = True, deep: bool = False) -> int:
- """
- Return the memory usage of the Series.
-
- The memory usage can optionally include the contribution of
- the index and of elements of `object` dtype.
-
- Parameters
- ----------
- index : bool, default True
- Specifies whether to include the memory usage of the Series index.
- deep : bool, default False
- If True, introspect the data deeply by interrogating
- `object` dtypes for system-level memory consumption, and include
- it in the returned value.
-
- Returns
- -------
- int
- Bytes of memory consumed.
-
- See Also
- --------
- numpy.ndarray.nbytes : Total bytes consumed by the elements of the
- array.
- DataFrame.memory_usage : Bytes consumed by a DataFrame.
-
- Examples
- --------
- >>> s = pd.Series(range(3))
- >>> s.memory_usage()
- 152
-
- Not including the index gives the size of the rest of the data, which
- is necessarily smaller:
-
- >>> s.memory_usage(index=False)
- 24
-
- The memory footprint of `object` values is ignored by default:
-
- >>> s = pd.Series(["a", "b"])
- >>> s.values
- array(['a', 'b'], dtype=object)
- >>> s.memory_usage()
- 144
- >>> s.memory_usage(deep=True)
- 244
- """
- v = self._memory_usage(deep=deep)
- if index:
- v += self.index.memory_usage(deep=deep)
- return v
-
- def isin(self, values) -> Series:
- """
- Whether elements in Series are contained in `values`.
-
- Return a boolean Series showing whether each element in the Series
- matches an element in the passed sequence of `values` exactly.
-
- Parameters
- ----------
- values : set or list-like
- The sequence of values to test. Passing in a single string will
- raise a ``TypeError``. Instead, turn a single string into a
- list of one element.
-
- Returns
- -------
- Series
- Series of booleans indicating if each element is in values.
-
- Raises
- ------
- TypeError
- * If `values` is a string
-
- See Also
- --------
- DataFrame.isin : Equivalent method on DataFrame.
-
- Examples
- --------
- >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',
- ... 'hippo'], name='animal')
- >>> s.isin(['cow', 'lama'])
- 0 True
- 1 True
- 2 True
- 3 False
- 4 True
- 5 False
- Name: animal, dtype: bool
-
- To invert the boolean values, use the ``~`` operator:
-
- >>> ~s.isin(['cow', 'lama'])
- 0 False
- 1 False
- 2 False
- 3 True
- 4 False
- 5 True
- Name: animal, dtype: bool
-
- Passing a single string as ``s.isin('lama')`` will raise an error. Use
- a list of one element instead:
-
- >>> s.isin(['lama'])
- 0 True
- 1 False
- 2 True
- 3 False
- 4 True
- 5 False
- Name: animal, dtype: bool
-
- Strings and integers are distinct and are therefore not comparable:
-
- >>> pd.Series([1]).isin(['1'])
- 0 False
- dtype: bool
- >>> pd.Series([1.1]).isin(['1.1'])
- 0 False
- dtype: bool
- """
- result = algorithms.isin(self._values, values)
- return self._constructor(result, index=self.index, copy=False).__finalize__(
- self, method="isin"
- )
-
- def between(
- self,
- left,
- right,
- inclusive: Literal["both", "neither", "left", "right"] = "both",
- ) -> Series:
- """
- Return boolean Series equivalent to left <= series <= right.
-
- This function returns a boolean vector containing `True` wherever the
- corresponding Series element is between the boundary values `left` and
- `right`. NA values are treated as `False`.
-
- Parameters
- ----------
- left : scalar or list-like
- Left boundary.
- right : scalar or list-like
- Right boundary.
- inclusive : {"both", "neither", "left", "right"}
- Include boundaries. Whether to set each bound as closed or open.
-
- .. versionchanged:: 1.3.0
-
- Returns
- -------
- Series
- Series representing whether each element is between left and
- right (inclusive).
-
- See Also
- --------
- Series.gt : Greater than of series and other.
- Series.lt : Less than of series and other.
-
- Notes
- -----
- This function is equivalent to ``(left <= ser) & (ser <= right)``
-
- Examples
- --------
- >>> s = pd.Series([2, 0, 4, 8, np.nan])
-
- Boundary values are included by default:
-
- >>> s.between(1, 4)
- 0 True
- 1 False
- 2 True
- 3 False
- 4 False
- dtype: bool
-
- With `inclusive` set to ``"neither"`` boundary values are excluded:
-
- >>> s.between(1, 4, inclusive="neither")
- 0 True
- 1 False
- 2 False
- 3 False
- 4 False
- dtype: bool
-
- `left` and `right` can be any scalar value:
-
- >>> s = pd.Series(['Alice', 'Bob', 'Carol', 'Eve'])
- >>> s.between('Anna', 'Daniel')
- 0 False
- 1 True
- 2 True
- 3 False
- dtype: bool
- """
- if inclusive == "both":
- lmask = self >= left
- rmask = self <= right
- elif inclusive == "left":
- lmask = self >= left
- rmask = self < right
- elif inclusive == "right":
- lmask = self > left
- rmask = self <= right
- elif inclusive == "neither":
- lmask = self > left
- rmask = self < right
- else:
- raise ValueError(
-                "Inclusive has to be either string of 'both', "
- "'left', 'right', or 'neither'."
- )
-
- return lmask & rmask
-
- # ----------------------------------------------------------------------
- # Convert to types that support pd.NA
-
- def _convert_dtypes(
- self,
- infer_objects: bool = True,
- convert_string: bool = True,
- convert_integer: bool = True,
- convert_boolean: bool = True,
- convert_floating: bool = True,
- dtype_backend: DtypeBackend = "numpy_nullable",
- ) -> Series:
- input_series = self
- if infer_objects:
- input_series = input_series.infer_objects()
- if is_object_dtype(input_series):
- input_series = input_series.copy(deep=None)
-
- if convert_string or convert_integer or convert_boolean or convert_floating:
- inferred_dtype = convert_dtypes(
- input_series._values,
- convert_string,
- convert_integer,
- convert_boolean,
- convert_floating,
- infer_objects,
- dtype_backend,
- )
- result = input_series.astype(inferred_dtype)
- else:
- result = input_series.copy(deep=None)
- return result
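# Illustrative sketch of the public API that _convert_dtypes backs, assuming
# pandas >= 2.0 with the "numpy_nullable" backend; not taken from the module above.
# Series.convert_dtypes swaps NumPy dtypes for pd.NA-aware ones.
import numpy as np
import pandas as pd

s = pd.Series([10, np.nan, 20])          # float64, because NaN forces a float dtype
converted = s.convert_dtypes()           # integral floats become nullable Int64
print(s.dtype, "->", converted.dtype)    # float64 -> Int64
# dtype_backend="pyarrow" would request pyarrow-backed dtypes instead
# (only meaningful when the optional pyarrow dependency is installed).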
-
- # error: Cannot determine type of 'isna'
- # error: Return type "Series" of "isna" incompatible with return type "ndarray
- # [Any, dtype[bool_]]" in supertype "IndexOpsMixin"
- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
- def isna(self) -> Series: # type: ignore[override]
- return NDFrame.isna(self)
-
- # error: Cannot determine type of 'isna'
- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
- def isnull(self) -> Series:
- """
- Series.isnull is an alias for Series.isna.
- """
- return super().isnull()
-
- # error: Cannot determine type of 'notna'
- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
- def notna(self) -> Series:
- return super().notna()
-
- # error: Cannot determine type of 'notna'
- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) # type: ignore[has-type]
- def notnull(self) -> Series:
- """
- Series.notnull is an alias for Series.notna.
- """
- return super().notnull()
-
- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- inplace: Literal[False] = ...,
- how: AnyAll | None = ...,
- ignore_index: bool = ...,
- ) -> Series:
- ...
-
- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- inplace: Literal[True],
- how: AnyAll | None = ...,
- ignore_index: bool = ...,
- ) -> None:
- ...
-
- def dropna(
- self,
- *,
- axis: Axis = 0,
- inplace: bool = False,
- how: AnyAll | None = None,
- ignore_index: bool = False,
- ) -> Series | None:
- """
- Return a new Series with missing values removed.
-
- See the :ref:`User Guide <missing_data>` for more on which values are
- considered missing, and how to work with missing data.
-
- Parameters
- ----------
- axis : {0 or 'index'}
- Unused. Parameter needed for compatibility with DataFrame.
- inplace : bool, default False
- If True, do operation inplace and return None.
- how : str, optional
- Not in use. Kept for compatibility.
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- Series or None
- Series with NA entries dropped from it or None if ``inplace=True``.
-
- See Also
- --------
- Series.isna: Indicate missing values.
- Series.notna : Indicate existing (non-missing) values.
- Series.fillna : Replace missing values.
- DataFrame.dropna : Drop rows or columns which contain NA values.
- Index.dropna : Drop missing indices.
-
- Examples
- --------
- >>> ser = pd.Series([1., 2., np.nan])
- >>> ser
- 0 1.0
- 1 2.0
- 2 NaN
- dtype: float64
-
- Drop NA values from a Series.
-
- >>> ser.dropna()
- 0 1.0
- 1 2.0
- dtype: float64
-
- Empty strings are not considered NA values. ``None`` is considered an
- NA value.
-
- >>> ser = pd.Series([np.NaN, 2, pd.NaT, '', None, 'I stay'])
- >>> ser
- 0 NaN
- 1 2
- 2 NaT
- 3
- 4 None
- 5 I stay
- dtype: object
- >>> ser.dropna()
- 1 2
- 3
- 5 I stay
- dtype: object
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
- # Validate the axis parameter
- self._get_axis_number(axis or 0)
-
- if self._can_hold_na:
- result = remove_na_arraylike(self)
- else:
- if not inplace:
- result = self.copy(deep=None)
- else:
- result = self
-
- if ignore_index:
- result.index = default_index(len(result))
-
- if inplace:
- return self._update_inplace(result)
- else:
- return result
-
- # ----------------------------------------------------------------------
- # Time series-oriented methods
-
- # error: Cannot determine type of 'asfreq'
- @doc(NDFrame.asfreq, **_shared_doc_kwargs) # type: ignore[has-type]
- def asfreq(
- self,
- freq: Frequency,
- method: FillnaOptions | None = None,
- how: str | None = None,
- normalize: bool = False,
- fill_value: Hashable = None,
- ) -> Series:
- return super().asfreq(
- freq=freq,
- method=method,
- how=how,
- normalize=normalize,
- fill_value=fill_value,
- )
-
- # error: Cannot determine type of 'resample'
- @doc(NDFrame.resample, **_shared_doc_kwargs) # type: ignore[has-type]
- def resample(
- self,
- rule,
- axis: Axis = 0,
- closed: str | None = None,
- label: str | None = None,
- convention: str = "start",
- kind: str | None = None,
- on: Level = None,
- level: Level = None,
- origin: str | TimestampConvertibleTypes = "start_day",
- offset: TimedeltaConvertibleTypes | None = None,
- group_keys: bool = False,
- ) -> Resampler:
- return super().resample(
- rule=rule,
- axis=axis,
- closed=closed,
- label=label,
- convention=convention,
- kind=kind,
- on=on,
- level=level,
- origin=origin,
- offset=offset,
- group_keys=group_keys,
- )
-
- def to_timestamp(
- self,
- freq=None,
- how: Literal["s", "e", "start", "end"] = "start",
- copy: bool | None = None,
- ) -> Series:
- """
- Cast to DatetimeIndex of Timestamps, at *beginning* of period.
-
- Parameters
- ----------
- freq : str, default frequency of PeriodIndex
- Desired frequency.
- how : {'s', 'e', 'start', 'end'}
- Convention for converting period to timestamp; start of period
- vs. end.
- copy : bool, default True
- Whether or not to return a copy.
-
- Returns
- -------
- Series with DatetimeIndex
-
- Examples
- --------
- >>> idx = pd.PeriodIndex(['2023', '2024', '2025'], freq='Y')
- >>> s1 = pd.Series([1, 2, 3], index=idx)
- >>> s1
- 2023 1
- 2024 2
- 2025 3
- Freq: A-DEC, dtype: int64
-
- The resulting frequency of the Timestamps is `YearBegin`
-
- >>> s1 = s1.to_timestamp()
- >>> s1
- 2023-01-01 1
- 2024-01-01 2
- 2025-01-01 3
- Freq: AS-JAN, dtype: int64
-
- Using `freq` which is the offset that the Timestamps will have
-
- >>> s2 = pd.Series([1, 2, 3], index=idx)
- >>> s2 = s2.to_timestamp(freq='M')
- >>> s2
- 2023-01-31 1
- 2024-01-31 2
- 2025-01-31 3
- Freq: A-JAN, dtype: int64
- """
- if not isinstance(self.index, PeriodIndex):
- raise TypeError(f"unsupported Type {type(self.index).__name__}")
-
- new_obj = self.copy(deep=copy and not using_copy_on_write())
- new_index = self.index.to_timestamp(freq=freq, how=how)
- setattr(new_obj, "index", new_index)
- return new_obj
-
- def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series:
- """
- Convert Series from DatetimeIndex to PeriodIndex.
-
- Parameters
- ----------
- freq : str, default None
- Frequency associated with the PeriodIndex.
- copy : bool, default True
- Whether or not to return a copy.
-
- Returns
- -------
- Series
- Series with index converted to PeriodIndex.
-
- Examples
- --------
- >>> idx = pd.DatetimeIndex(['2023', '2024', '2025'])
- >>> s = pd.Series([1, 2, 3], index=idx)
- >>> s = s.to_period()
- >>> s
- 2023 1
- 2024 2
- 2025 3
- Freq: A-DEC, dtype: int64
-
- Viewing the index
-
- >>> s.index
- PeriodIndex(['2023', '2024', '2025'], dtype='period[A-DEC]')
- """
- if not isinstance(self.index, DatetimeIndex):
- raise TypeError(f"unsupported Type {type(self.index).__name__}")
-
- new_obj = self.copy(deep=copy and not using_copy_on_write())
- new_index = self.index.to_period(freq=freq)
- setattr(new_obj, "index", new_index)
- return new_obj
-
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> Series:
- ...
-
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: bool = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> Series | None:
- ...
-
- def ffill(
- self,
- *,
- axis: None | Axis = None,
- inplace: bool = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> Series | None:
- return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
-
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> Series:
- ...
-
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
-
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: bool = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> Series | None:
- ...
-
- def bfill(
- self,
- *,
- axis: None | Axis = None,
- inplace: bool = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> Series | None:
- return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
-
- def clip(
- self: Series,
- lower=None,
- upper=None,
- *,
- axis: Axis | None = None,
- inplace: bool = False,
- **kwargs,
- ) -> Series | None:
- return super().clip(lower, upper, axis=axis, inplace=inplace, **kwargs)
-
- def interpolate(
- self: Series,
- method: str = "linear",
- *,
- axis: Axis = 0,
- limit: int | None = None,
- inplace: bool = False,
- limit_direction: str | None = None,
- limit_area: str | None = None,
- downcast: str | None = None,
- **kwargs,
- ) -> Series | None:
- return super().interpolate(
- method=method,
- axis=axis,
- limit=limit,
- inplace=inplace,
- limit_direction=limit_direction,
- limit_area=limit_area,
- downcast=downcast,
- **kwargs,
- )
-
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> Series:
- ...
-
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
-
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: bool = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> Series | None:
- ...
-
- def where(
- self,
- cond,
- other=lib.no_default,
- *,
- inplace: bool = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> Series | None:
- return super().where(
- cond,
- other,
- inplace=inplace,
- axis=axis,
- level=level,
- )
-
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> Series:
- ...
-
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
-
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: bool = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> Series | None:
- ...
-
- def mask(
- self,
- cond,
- other=lib.no_default,
- *,
- inplace: bool = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> Series | None:
- return super().mask(
- cond,
- other,
- inplace=inplace,
- axis=axis,
- level=level,
- )
-
- # ----------------------------------------------------------------------
- # Add index
- _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index"]
- _AXIS_LEN = len(_AXIS_ORDERS)
- _info_axis_number: Literal[0] = 0
- _info_axis_name: Literal["index"] = "index"
-
- index = properties.AxisProperty(
- axis=0, doc="The index (axis labels) of the Series."
- )
-
- # ----------------------------------------------------------------------
- # Accessor Methods
- # ----------------------------------------------------------------------
- str = CachedAccessor("str", StringMethods)
- dt = CachedAccessor("dt", CombinedDatetimelikeProperties)
- cat = CachedAccessor("cat", CategoricalAccessor)
- plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
- sparse = CachedAccessor("sparse", SparseAccessor)
-
- # ----------------------------------------------------------------------
- # Add plotting methods to Series
- hist = pandas.plotting.hist_series
-
- # ----------------------------------------------------------------------
- # Template-Based Arithmetic/Comparison Methods
-
- def _cmp_method(self, other, op):
- res_name = ops.get_op_result_name(self, other)
-
- if isinstance(other, Series) and not self._indexed_same(other):
- raise ValueError("Can only compare identically-labeled Series objects")
-
- lvalues = self._values
- rvalues = extract_array(other, extract_numpy=True, extract_range=True)
-
- with np.errstate(all="ignore"):
- res_values = ops.comparison_op(lvalues, rvalues, op)
-
- return self._construct_result(res_values, name=res_name)
-
- def _logical_method(self, other, op):
- res_name = ops.get_op_result_name(self, other)
- self, other = ops.align_method_SERIES(self, other, align_asobject=True)
-
- lvalues = self._values
- rvalues = extract_array(other, extract_numpy=True, extract_range=True)
-
- res_values = ops.logical_op(lvalues, rvalues, op)
- return self._construct_result(res_values, name=res_name)
-
- def _arith_method(self, other, op):
- self, other = ops.align_method_SERIES(self, other)
- return base.IndexOpsMixin._arith_method(self, other, op)
-
-
-Series._add_numeric_operations()
-
-# Add arithmetic!
-ops.add_flex_arithmetic_methods(Series)
diff --git a/contrib/python/pandas/py3/pandas/core/shared_docs.py b/contrib/python/pandas/py3/pandas/core/shared_docs.py
deleted file mode 100644
index ddd59a55b54..00000000000
--- a/contrib/python/pandas/py3/pandas/core/shared_docs.py
+++ /dev/null
@@ -1,894 +0,0 @@
-from __future__ import annotations
-
-_shared_docs: dict[str, str] = {}
-
-_shared_docs[
- "aggregate"
-] = """
-Aggregate using one or more operations over the specified axis.
-
-Parameters
-----------
-func : function, str, list or dict
- Function to use for aggregating the data. If a function, must either
- work when passed a {klass} or when passed to {klass}.apply.
-
- Accepted combinations are:
-
- - function
- - string function name
- - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
- - dict of axis labels -> functions, function names or list of such.
-{axis}
-*args
- Positional arguments to pass to `func`.
-**kwargs
- Keyword arguments to pass to `func`.
-
-Returns
--------
-scalar, Series or DataFrame
-
- The return can be:
-
- * scalar : when Series.agg is called with single function
- * Series : when DataFrame.agg is called with a single function
- * DataFrame : when DataFrame.agg is called with several functions
-
- Return scalar, Series or DataFrame.
-{see_also}
-Notes
------
-`agg` is an alias for `aggregate`. Use the alias.
-
-Functions that mutate the passed object can produce unexpected
-behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
-for more details.
-
-A passed user-defined-function will be passed a Series for evaluation.
-{examples}"""
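# Illustrative sketch of the return-type rules the shared ``aggregate`` docstring
# spells out; a minimal example, not taken from the template above.
import pandas as pd

s = pd.Series([1, 2, 3])
print(s.agg("min"))             # scalar: Series.agg with a single function
print(s.agg(["min", "max"]))    # Series: one row per function

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
print(df.agg("sum"))            # Series: DataFrame.agg with a single function
print(df.agg(["sum", "mean"]))  # DataFrame: DataFrame.agg with several functions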
-
-_shared_docs[
- "compare"
-] = """
-Compare to another {klass} and show the differences.
-
-.. versionadded:: 1.1.0
-
-Parameters
-----------
-other : {klass}
- Object to compare with.
-
-align_axis : {{0 or 'index', 1 or 'columns'}}, default 1
- Determine which axis to align the comparison on.
-
- * 0, or 'index' : Resulting differences are stacked vertically
- with rows drawn alternately from self and other.
- * 1, or 'columns' : Resulting differences are aligned horizontally
- with columns drawn alternately from self and other.
-
-keep_shape : bool, default False
- If true, all rows and columns are kept.
- Otherwise, only the ones with different values are kept.
-
-keep_equal : bool, default False
- If true, the result keeps values that are equal.
- Otherwise, equal values are shown as NaNs.
-
-result_names : tuple, default ('self', 'other')
- Set the dataframes names in the comparison.
-
- .. versionadded:: 1.5.0
-"""
-
-_shared_docs[
- "groupby"
-] = """
-Group %(klass)s using a mapper or by a Series of columns.
-
-A groupby operation involves some combination of splitting the
-object, applying a function, and combining the results. This can be
-used to group large amounts of data and compute operations on these
-groups.
-
-Parameters
-----------
-by : mapping, function, label, pd.Grouper or list of such
- Used to determine the groups for the groupby.
- If ``by`` is a function, it's called on each value of the object's
- index. If a dict or Series is passed, the Series or dict VALUES
- will be used to determine the groups (the Series' values are first
- aligned; see ``.align()`` method). If a list or ndarray of length
- equal to the selected axis is passed (see the `groupby user guide
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
- the values are used as-is to determine the groups. A label or list
- of labels may be passed to group by the columns in ``self``.
- Notice that a tuple is interpreted as a (single) key.
-axis : {0 or 'index', 1 or 'columns'}, default 0
- Split along rows (0) or columns (1). For `Series` this parameter
- is unused and defaults to 0.
-level : int, level name, or sequence of such, default None
- If the axis is a MultiIndex (hierarchical), group by a particular
- level or levels. Do not specify both ``by`` and ``level``.
-as_index : bool, default True
- For aggregated output, return object with group labels as the
- index. Only relevant for DataFrame input. as_index=False is
- effectively "SQL-style" grouped output.
-sort : bool, default True
- Sort group keys. Get better performance by turning this off.
- Note this does not influence the order of observations within each
- group. Groupby preserves the order of rows within each group.
-
- .. versionchanged:: 2.0.0
-
- Specifying ``sort=False`` with an ordered categorical grouper will no
- longer sort the values.
-
-group_keys : bool, default True
- When calling apply and the ``by`` argument produces a like-indexed
- (i.e. :ref:`a transform <groupby.transform>`) result, add group keys to
- index to identify pieces. By default group keys are not included
- when the result's index (and column) labels match the inputs, and
- are included otherwise.
-
- .. versionchanged:: 1.5.0
-
- Warns that ``group_keys`` will no longer be ignored when the
- result from ``apply`` is a like-indexed Series or DataFrame.
- Specify ``group_keys`` explicitly to include the group keys or
- not.
-
- .. versionchanged:: 2.0.0
-
- ``group_keys`` now defaults to ``True``.
-
-observed : bool, default False
- This only applies if any of the groupers are Categoricals.
- If True: only show observed values for categorical groupers.
- If False: show all values for categorical groupers.
-dropna : bool, default True
- If True, and if group keys contain NA values, NA values together
- with row/column will be dropped.
- If False, NA values will also be treated as the key in groups.
-
- .. versionadded:: 1.1.0
-
-Returns
--------
-%(klass)sGroupBy
- Returns a groupby object that contains information about the groups.
-
-See Also
---------
-resample : Convenience method for frequency conversion and resampling
- of time series.
-
-Notes
------
-See the `user guide
-<https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
-detailed usage and examples, including splitting an object into groups,
-iterating through groups, selecting a group, aggregation, and more.
-"""
-
-_shared_docs[
- "melt"
-] = """
-Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
-
-This function is useful to massage a DataFrame into a format where one
-or more columns are identifier variables (`id_vars`), while all other
-columns, considered measured variables (`value_vars`), are "unpivoted" to
-the row axis, leaving just two non-identifier columns, 'variable' and
-'value'.
-
-Parameters
-----------
-id_vars : tuple, list, or ndarray, optional
- Column(s) to use as identifier variables.
-value_vars : tuple, list, or ndarray, optional
- Column(s) to unpivot. If not specified, uses all columns that
- are not set as `id_vars`.
-var_name : scalar
- Name to use for the 'variable' column. If None it uses
- ``frame.columns.name`` or 'variable'.
-value_name : scalar, default 'value'
- Name to use for the 'value' column.
-col_level : int or str, optional
- If columns are a MultiIndex then use this level to melt.
-ignore_index : bool, default True
- If True, original index is ignored. If False, the original index is retained.
- Index labels will be repeated as necessary.
-
- .. versionadded:: 1.1.0
-
-Returns
--------
-DataFrame
- Unpivoted DataFrame.
-
-See Also
---------
-%(other)s : Identical method.
-pivot_table : Create a spreadsheet-style pivot table as a DataFrame.
-DataFrame.pivot : Return reshaped DataFrame organized
- by given index / column values.
-DataFrame.explode : Explode a DataFrame from list-like
- columns to long format.
-
-Notes
------
-Reference :ref:`the user guide <reshaping.melt>` for more examples.
-
-Examples
---------
->>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
-... 'B': {0: 1, 1: 3, 2: 5},
-... 'C': {0: 2, 1: 4, 2: 6}})
->>> df
- A B C
-0 a 1 2
-1 b 3 4
-2 c 5 6
-
->>> %(caller)sid_vars=['A'], value_vars=['B'])
- A variable value
-0 a B 1
-1 b B 3
-2 c B 5
-
->>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
- A variable value
-0 a B 1
-1 b B 3
-2 c B 5
-3 a C 2
-4 b C 4
-5 c C 6
-
-The names of 'variable' and 'value' columns can be customized:
-
->>> %(caller)sid_vars=['A'], value_vars=['B'],
-... var_name='myVarname', value_name='myValname')
- A myVarname myValname
-0 a B 1
-1 b B 3
-2 c B 5
-
-Original index values can be kept around:
-
->>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False)
- A variable value
-0 a B 1
-1 b B 3
-2 c B 5
-0 a C 2
-1 b C 4
-2 c C 6
-
-If you have multi-index columns:
-
->>> df.columns = [list('ABC'), list('DEF')]
->>> df
- A B C
- D E F
-0 a 1 2
-1 b 3 4
-2 c 5 6
-
->>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
- A variable value
-0 a B 1
-1 b B 3
-2 c B 5
-
->>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
- (A, D) variable_0 variable_1 value
-0 a B E 1
-1 b B E 3
-2 c B E 5
-"""
-
-_shared_docs[
- "transform"
-] = """
-Call ``func`` on self producing a {klass} with the same axis shape as self.
-
-Parameters
-----------
-func : function, str, list-like or dict-like
- Function to use for transforming the data. If a function, must either
- work when passed a {klass} or when passed to {klass}.apply. If func
- is both list-like and dict-like, dict-like behavior takes precedence.
-
- Accepted combinations are:
-
- - function
- - string function name
- - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
- - dict-like of axis labels -> functions, function names or list-like of such.
-{axis}
-*args
- Positional arguments to pass to `func`.
-**kwargs
- Keyword arguments to pass to `func`.
-
-Returns
--------
-{klass}
- A {klass} that must have the same length as self.
-
-Raises
-------
-ValueError : If the returned {klass} has a different length than self.
-
-See Also
---------
-{klass}.agg : Only perform aggregating type operations.
-{klass}.apply : Invoke function on a {klass}.
-
-Notes
------
-Functions that mutate the passed object can produce unexpected
-behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
-for more details.
-
-Examples
---------
->>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}})
->>> df
- A B
-0 0 1
-1 1 2
-2 2 3
->>> df.transform(lambda x: x + 1)
- A B
-0 1 2
-1 2 3
-2 3 4
-
-Even though the resulting {klass} must have the same length as the
-input {klass}, it is possible to provide several input functions:
-
->>> s = pd.Series(range(3))
->>> s
-0 0
-1 1
-2 2
-dtype: int64
->>> s.transform([np.sqrt, np.exp])
- sqrt exp
-0 0.000000 1.000000
-1 1.000000 2.718282
-2 1.414214 7.389056
-
-You can call transform on a GroupBy object:
-
->>> df = pd.DataFrame({{
-... "Date": [
-... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05",
-... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"],
-... "Data": [5, 8, 6, 1, 50, 100, 60, 120],
-... }})
->>> df
- Date Data
-0 2015-05-08 5
-1 2015-05-07 8
-2 2015-05-06 6
-3 2015-05-05 1
-4 2015-05-08 50
-5 2015-05-07 100
-6 2015-05-06 60
-7 2015-05-05 120
->>> df.groupby('Date')['Data'].transform('sum')
-0 55
-1 108
-2 66
-3 121
-4 55
-5 108
-6 66
-7 121
-Name: Data, dtype: int64
-
->>> df = pd.DataFrame({{
-... "c": [1, 1, 1, 2, 2, 2, 2],
-... "type": ["m", "n", "o", "m", "m", "n", "n"]
-... }})
->>> df
- c type
-0 1 m
-1 1 n
-2 1 o
-3 2 m
-4 2 m
-5 2 n
-6 2 n
->>> df['size'] = df.groupby('c')['type'].transform(len)
->>> df
- c type size
-0 1 m 3
-1 1 n 3
-2 1 o 3
-3 2 m 4
-4 2 m 4
-5 2 n 4
-6 2 n 4
-"""
-
-_shared_docs[
- "storage_options"
-] = """storage_options : dict, optional
- Extra options that make sense for a particular storage connection, e.g.
- host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
- are forwarded to ``urllib.request.Request`` as header options. For other
- URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
- forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
- details, and for more examples on storage options refer `here
- <https://pandas.pydata.org/docs/user_guide/io.html?
- highlight=storage_options#reading-writing-remote-files>`_."""
-
-_shared_docs[
- "compression_options"
-] = """compression : str or dict, default 'infer'
- For on-the-fly compression of the output data. If 'infer' and '%s' is
- path-like, then detect compression from the following extensions: '.gz',
- '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
- (otherwise no compression).
- Set to ``None`` for no compression.
- Can also be a dict with key ``'method'`` set
- to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
- key-value pairs are forwarded to
- ``zipfile.ZipFile``, ``gzip.GzipFile``,
- ``bz2.BZ2File``, ``zstandard.ZstdCompressor`` or
- ``tarfile.TarFile``, respectively.
- As an example, the following could be passed for faster compression and to create
- a reproducible gzip archive:
- ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
-
- .. versionadded:: 1.5.0
- Added support for `.tar` files."""
-
-_shared_docs[
- "decompression_options"
-] = """compression : str or dict, default 'infer'
- For on-the-fly decompression of on-disk data. If 'infer' and '%s' is
- path-like, then detect compression from the following extensions: '.gz',
- '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
- (otherwise no compression).
- If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
- Set to ``None`` for no decompression.
- Can also be a dict with key ``'method'`` set
- to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
- key-value pairs are forwarded to
- ``zipfile.ZipFile``, ``gzip.GzipFile``,
- ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or
- ``tarfile.TarFile``, respectively.
- As an example, the following could be passed for Zstandard decompression using a
- custom compression dictionary:
- ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
-
- .. versionadded:: 1.5.0
- Added support for `.tar` files."""
-
-_shared_docs[
- "replace"
-] = """
- Replace values given in `to_replace` with `value`.
-
- Values of the {klass} are replaced with other values dynamically.
- {replace_iloc}
-
- Parameters
- ----------
- to_replace : str, regex, list, dict, Series, int, float, or None
- How to find the values that will be replaced.
-
- * numeric, str or regex:
-
- - numeric: numeric values equal to `to_replace` will be
- replaced with `value`
- - str: string exactly matching `to_replace` will be replaced
- with `value`
- - regex: regexs matching `to_replace` will be replaced with
- `value`
-
- * list of str, regex, or numeric:
-
- - First, if `to_replace` and `value` are both lists, they
- **must** be the same length.
- - Second, if ``regex=True`` then all of the strings in **both**
- lists will be interpreted as regexs otherwise they will match
- directly. This doesn't matter much for `value` since there
- are only a few possible substitution regexes you can use.
- - str, regex and numeric rules apply as above.
-
- * dict:
-
- - Dicts can be used to specify different replacement values
- for different existing values. For example,
- ``{{'a': 'b', 'y': 'z'}}`` replaces the value 'a' with 'b' and
- 'y' with 'z'. To use a dict in this way, the optional `value`
- parameter should not be given.
- - For a DataFrame a dict can specify that different values
- should be replaced in different columns. For example,
- ``{{'a': 1, 'b': 'z'}}`` looks for the value 1 in column 'a'
- and the value 'z' in column 'b' and replaces these values
- with whatever is specified in `value`. The `value` parameter
- should not be ``None`` in this case. You can treat this as a
- special case of passing two lists except that you are
- specifying the column to search in.
- - For a DataFrame nested dictionaries, e.g.,
- ``{{'a': {{'b': np.nan}}}}``, are read as follows: look in column
- 'a' for the value 'b' and replace it with NaN. The optional `value`
- parameter should not be specified to use a nested dict in this
- way. You can nest regular expressions as well. Note that
- column names (the top-level dictionary keys in a nested
- dictionary) **cannot** be regular expressions.
-
- * None:
-
- - This means that the `regex` argument must be a string,
- compiled regular expression, or list, dict, ndarray or
- Series of such elements. If `value` is also ``None`` then
- this **must** be a nested dictionary or Series.
-
- See the examples section for examples of each of these.
- value : scalar, dict, list, str, regex, default None
- Value to replace any values matching `to_replace` with.
- For a DataFrame a dict of values can be used to specify which
- value to use for each column (columns not in the dict will not be
- filled). Regular expressions, strings and lists or dicts of such
- objects are also allowed.
- {inplace}
- limit : int, default None
- Maximum size gap to forward or backward fill.
- regex : bool or same types as `to_replace`, default False
- Whether to interpret `to_replace` and/or `value` as regular
- expressions. If this is ``True`` then `to_replace` *must* be a
- string. Alternatively, this could be a regular expression or a
- list, dict, or array of regular expressions in which case
- `to_replace` must be ``None``.
- method : {{'pad', 'ffill', 'bfill'}}
-        The method to use for replacement when `to_replace` is a
-        scalar, list or tuple and `value` is ``None``.
-
- Returns
- -------
- {klass}
- Object after replacement.
-
- Raises
- ------
- AssertionError
- * If `regex` is not a ``bool`` and `to_replace` is not
- ``None``.
-
- TypeError
- * If `to_replace` is not a scalar, array-like, ``dict``, or ``None``
- * If `to_replace` is a ``dict`` and `value` is not a ``list``,
- ``dict``, ``ndarray``, or ``Series``
- * If `to_replace` is ``None`` and `regex` is not compilable
- into a regular expression or is a list, dict, ndarray, or
- Series.
- * When replacing multiple ``bool`` or ``datetime64`` objects and
- the arguments to `to_replace` does not match the type of the
- value being replaced
-
- ValueError
- * If a ``list`` or an ``ndarray`` is passed to `to_replace` and
- `value` but they are not the same length.
-
- See Also
- --------
- {klass}.fillna : Fill NA values.
- {klass}.where : Replace values based on boolean condition.
- Series.str.replace : Simple string replacement.
-
- Notes
- -----
- * Regex substitution is performed under the hood with ``re.sub``. The
- rules for substitution for ``re.sub`` are the same.
- * Regular expressions will only substitute on strings, meaning you
- cannot provide, for example, a regular expression matching floating
- point numbers and expect the columns in your frame that have a
- numeric dtype to be matched. However, if those floating point
- numbers *are* strings, then you can do this.
- * This method has *a lot* of options. You are encouraged to experiment
- and play with this method to gain intuition about how it works.
-    * When a dict is used as the `to_replace` value, the keys of the
-      dict take the role of `to_replace` and the values of the dict
-      take the role of the `value` parameter.
-
- Examples
- --------
-
- **Scalar `to_replace` and `value`**
-
- >>> s = pd.Series([1, 2, 3, 4, 5])
- >>> s.replace(1, 5)
- 0 5
- 1 2
- 2 3
- 3 4
- 4 5
- dtype: int64
-
- >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4],
- ... 'B': [5, 6, 7, 8, 9],
- ... 'C': ['a', 'b', 'c', 'd', 'e']}})
- >>> df.replace(0, 5)
- A B C
- 0 5 5 a
- 1 1 6 b
- 2 2 7 c
- 3 3 8 d
- 4 4 9 e
-
- **List-like `to_replace`**
-
- >>> df.replace([0, 1, 2, 3], 4)
- A B C
- 0 4 5 a
- 1 4 6 b
- 2 4 7 c
- 3 4 8 d
- 4 4 9 e
-
- >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1])
- A B C
- 0 4 5 a
- 1 3 6 b
- 2 2 7 c
- 3 1 8 d
- 4 4 9 e
-
- >>> s.replace([1, 2], method='bfill')
- 0 3
- 1 3
- 2 3
- 3 4
- 4 5
- dtype: int64
-
- **dict-like `to_replace`**
-
- >>> df.replace({{0: 10, 1: 100}})
- A B C
- 0 10 5 a
- 1 100 6 b
- 2 2 7 c
- 3 3 8 d
- 4 4 9 e
-
- >>> df.replace({{'A': 0, 'B': 5}}, 100)
- A B C
- 0 100 100 a
- 1 1 6 b
- 2 2 7 c
- 3 3 8 d
- 4 4 9 e
-
- >>> df.replace({{'A': {{0: 100, 4: 400}}}})
- A B C
- 0 100 5 a
- 1 1 6 b
- 2 2 7 c
- 3 3 8 d
- 4 400 9 e
-
- **Regular expression `to_replace`**
-
- >>> df = pd.DataFrame({{'A': ['bat', 'foo', 'bait'],
- ... 'B': ['abc', 'bar', 'xyz']}})
- >>> df.replace(to_replace=r'^ba.$', value='new', regex=True)
- A B
- 0 new abc
- 1 foo new
- 2 bait xyz
-
- >>> df.replace({{'A': r'^ba.$'}}, {{'A': 'new'}}, regex=True)
- A B
- 0 new abc
- 1 foo bar
- 2 bait xyz
-
- >>> df.replace(regex=r'^ba.$', value='new')
- A B
- 0 new abc
- 1 foo new
- 2 bait xyz
-
- >>> df.replace(regex={{r'^ba.$': 'new', 'foo': 'xyz'}})
- A B
- 0 new abc
- 1 xyz new
- 2 bait xyz
-
- >>> df.replace(regex=[r'^ba.$', 'foo'], value='new')
- A B
- 0 new abc
- 1 new new
- 2 bait xyz
-
- Compare the behavior of ``s.replace({{'a': None}})`` and
- ``s.replace('a', None)`` to understand the peculiarities
- of the `to_replace` parameter:
-
- >>> s = pd.Series([10, 'a', 'a', 'b', 'a'])
-
-    When a dict is used as the `to_replace` value, the dict's values
-    take the role of the `value` parameter.
- ``s.replace({{'a': None}})`` is equivalent to
- ``s.replace(to_replace={{'a': None}}, value=None, method=None)``:
-
- >>> s.replace({{'a': None}})
- 0 10
- 1 None
- 2 None
- 3 b
- 4 None
- dtype: object
-
-    When ``value`` is not explicitly passed and `to_replace` is a scalar, list
-    or tuple, `replace` uses the `method` parameter (default 'pad') to do the
-    replacement. That is why the 'a' values are replaced by 10 in rows 1 and 2,
-    and by 'b' in row 4, in this case.
-
- >>> s.replace('a')
- 0 10
- 1 10
- 2 10
- 3 b
- 4 b
- dtype: object
-
- On the other hand, if ``None`` is explicitly passed for ``value``, it will
- be respected:
-
- >>> s.replace('a', None)
- 0 10
- 1 None
- 2 None
- 3 b
- 4 None
- dtype: object
-
- .. versionchanged:: 1.4.0
- Previously the explicit ``None`` was silently ignored.
-"""
-
-_shared_docs[
- "idxmin"
-] = """
- Return index of first occurrence of minimum over requested axis.
-
- NA/null values are excluded.
-
- Parameters
- ----------
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- numeric_only : bool, default {numeric_only_default}
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- Series
- Indexes of minima along the specified axis.
-
- Raises
- ------
- ValueError
- * If the row/column is empty
-
- See Also
- --------
- Series.idxmin : Return index of the minimum element.
-
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmin``.
-
- Examples
- --------
- Consider a dataset containing food consumption in Argentina.
-
- >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48],
- ... 'co2_emissions': [37.2, 19.66, 1712]}},
- ... index=['Pork', 'Wheat Products', 'Beef'])
-
- >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
-
- By default, it returns the index for the minimum value in each column.
-
- >>> df.idxmin()
- consumption Pork
- co2_emissions Wheat Products
- dtype: object
-
- To return the index for the minimum value in each row, use ``axis="columns"``.
-
- >>> df.idxmin(axis="columns")
- Pork consumption
- Wheat Products co2_emissions
- Beef consumption
- dtype: object
-"""
-
-_shared_docs[
- "idxmax"
-] = """
- Return index of first occurrence of maximum over requested axis.
-
- NA/null values are excluded.
-
- Parameters
- ----------
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- numeric_only : bool, default {numeric_only_default}
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
-
- Returns
- -------
- Series
- Indexes of maxima along the specified axis.
-
- Raises
- ------
- ValueError
- * If the row/column is empty
-
- See Also
- --------
- Series.idxmax : Return index of the maximum element.
-
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmax``.
-
- Examples
- --------
- Consider a dataset containing food consumption in Argentina.
-
- >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48],
- ... 'co2_emissions': [37.2, 19.66, 1712]}},
- ... index=['Pork', 'Wheat Products', 'Beef'])
-
- >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
-
- By default, it returns the index for the maximum value in each column.
-
- >>> df.idxmax()
- consumption Wheat Products
- co2_emissions Beef
- dtype: object
-
- To return the index for the maximum value in each row, use ``axis="columns"``.
-
- >>> df.idxmax(axis="columns")
- Pork co2_emissions
- Wheat Products consumption
- Beef co2_emissions
- dtype: object
-"""
diff --git a/contrib/python/pandas/py3/pandas/core/sorting.py b/contrib/python/pandas/py3/pandas/core/sorting.py
deleted file mode 100644
index 970c9998f5f..00000000000
--- a/contrib/python/pandas/py3/pandas/core/sorting.py
+++ /dev/null
@@ -1,725 +0,0 @@
-""" miscellaneous sorting / groupby utilities """
-from __future__ import annotations
-
-from collections import defaultdict
-from typing import (
- TYPE_CHECKING,
- Callable,
- DefaultDict,
- Hashable,
- Iterable,
- Sequence,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import (
- algos,
- hashtable,
- lib,
-)
-from pandas._libs.hashtable import unique_label_indices
-from pandas._typing import (
- AxisInt,
- IndexKeyFunc,
- Level,
- NaPosition,
- Shape,
- SortKind,
- npt,
-)
-
-from pandas.core.dtypes.common import (
- ensure_int64,
- ensure_platform_int,
- is_extension_array_dtype,
-)
-from pandas.core.dtypes.generic import (
- ABCMultiIndex,
- ABCRangeIndex,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core.construction import extract_array
-
-if TYPE_CHECKING:
- from pandas import MultiIndex
- from pandas.core.arrays import ExtensionArray
- from pandas.core.indexes.base import Index
-
-
-def get_indexer_indexer(
- target: Index,
- level: Level | list[Level] | None,
- ascending: list[bool] | bool,
- kind: SortKind,
- na_position: NaPosition,
- sort_remaining: bool,
- key: IndexKeyFunc,
-) -> npt.NDArray[np.intp] | None:
- """
-    Helper method that returns the indexer according to the input parameters
-    for the sort_index method of DataFrame and Series.
-
- Parameters
- ----------
- target : Index
- level : int or level name or list of ints or list of level names
- ascending : bool or list of bools, default True
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
- na_position : {'first', 'last'}, default 'last'
- sort_remaining : bool, default True
- key : callable, optional
-
- Returns
- -------
- Optional[ndarray[intp]]
- The indexer for the new index.
- """
-
- target = ensure_key_mapped(target, key, levels=level)
- target = target._sort_levels_monotonic()
-
- if level is not None:
- _, indexer = target.sortlevel(
- level, ascending=ascending, sort_remaining=sort_remaining
- )
- elif isinstance(target, ABCMultiIndex):
- indexer = lexsort_indexer(
- target._get_codes_for_sorting(), orders=ascending, na_position=na_position
- )
- else:
- # Check monotonic-ness before sort an index (GH 11080)
- if (ascending and target.is_monotonic_increasing) or (
- not ascending and target.is_monotonic_decreasing
- ):
- return None
-
- # ascending can only be a Sequence for MultiIndex
- indexer = nargsort(
- target,
- kind=kind,
- ascending=cast(bool, ascending),
- na_position=na_position,
- )
- return indexer
-
-
-def get_group_index(
- labels, shape: Shape, sort: bool, xnull: bool
-) -> npt.NDArray[np.int64]:
- """
- For the particular label_list, gets the offsets into the hypothetical list
- representing the totally ordered cartesian product of all possible label
- combinations, *as long as* this space fits within int64 bounds;
- otherwise, though group indices identify unique combinations of
- labels, they cannot be deconstructed.
-    - If `sort`, the ranks of the returned ids preserve the lexical ranks of
-      the labels, i.e. the returned ids can be used to lexically sort the
-      labels.
-    - If `xnull`, nulls (-1 labels) are passed through.
-
- Parameters
- ----------
- labels : sequence of arrays
- Integers identifying levels at each location
- shape : tuple[int, ...]
- Number of unique levels at each location
- sort : bool
- If the ranks of returned ids should match lexical ranks of labels
- xnull : bool
-        If true, nulls are excluded from grouping, i.e. -1 values in the
-        labels are passed through as -1 in the result.
-
- Returns
- -------
- An array of type int64 where two elements are equal if their corresponding
-    labels are equal at all locations.
-
- Notes
- -----
- The length of `labels` and `shape` must be identical.
- """
-
- def _int64_cut_off(shape) -> int:
- acc = 1
- for i, mul in enumerate(shape):
- acc *= int(mul)
- if not acc < lib.i8max:
- return i
- return len(shape)
-
- def maybe_lift(lab, size) -> tuple[np.ndarray, int]:
- # promote nan values (assigned -1 label in lab array)
- # so that all output values are non-negative
- return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
-
- labels = [ensure_int64(x) for x in labels]
- lshape = list(shape)
- if not xnull:
- for i, (lab, size) in enumerate(zip(labels, shape)):
- lab, size = maybe_lift(lab, size)
- labels[i] = lab
- lshape[i] = size
-
- labels = list(labels)
-
- # Iteratively process all the labels in chunks sized so less
- # than lib.i8max unique int ids will be required for each chunk
- while True:
- # how many levels can be done without overflow:
- nlev = _int64_cut_off(lshape)
-
- # compute flat ids for the first `nlev` levels
- stride = np.prod(lshape[1:nlev], dtype="i8")
- out = stride * labels[0].astype("i8", subok=False, copy=False)
-
- for i in range(1, nlev):
- if lshape[i] == 0:
- stride = np.int64(0)
- else:
- stride //= lshape[i]
- out += labels[i] * stride
-
- if xnull: # exclude nulls
- mask = labels[0] == -1
- for lab in labels[1:nlev]:
- mask |= lab == -1
- out[mask] = -1
-
- if nlev == len(lshape): # all levels done!
- break
-
- # compress what has been done so far in order to avoid overflow
- # to retain lexical ranks, obs_ids should be sorted
- comp_ids, obs_ids = compress_group_index(out, sort=sort)
-
- labels = [comp_ids] + labels[nlev:]
- lshape = [len(obs_ids)] + lshape[nlev:]
-
- return out
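# Illustrative sketch of the core idea behind get_group_index, leaving out the
# overflow chunking and null handling: flatten per-level label codes into a single
# id over the cartesian product of levels, row-major like np.ravel_multi_index.
import numpy as np

labels = [np.array([0, 1, 1, 0]),   # codes for level 0 (2 unique values)
          np.array([2, 0, 2, 1])]   # codes for level 1 (3 unique values)
shape = (2, 3)

flat_ids = labels[0] * shape[1] + labels[1]
print(flat_ids)                              # [2 3 5 1]
print(np.ravel_multi_index(labels, shape))   # same result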
-
-
-def get_compressed_ids(
- labels, sizes: Shape
-) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
- """
- Group_index is offsets into cartesian product of all possible labels. This
- space can be huge, so this function compresses it, by computing offsets
- (comp_ids) into the list of unique labels (obs_group_ids).
-
- Parameters
- ----------
- labels : list of label arrays
- sizes : tuple[int] of size of the levels
-
- Returns
- -------
- np.ndarray[np.intp]
- comp_ids
- np.ndarray[np.int64]
- obs_group_ids
- """
- ids = get_group_index(labels, sizes, sort=True, xnull=False)
- return compress_group_index(ids, sort=True)
-
-
-def is_int64_overflow_possible(shape: Shape) -> bool:
- the_prod = 1
- for x in shape:
- the_prod *= int(x)
-
- return the_prod >= lib.i8max
-
-
-def _decons_group_index(
- comp_labels: npt.NDArray[np.intp], shape: Shape
-) -> list[npt.NDArray[np.intp]]:
- # reconstruct labels
- if is_int64_overflow_possible(shape):
- # at some point group indices are factorized,
- # and may not be deconstructed here! wrong path!
- raise ValueError("cannot deconstruct factorized group indices!")
-
- label_list = []
- factor = 1
- y = np.array(0)
- x = comp_labels
- for i in reversed(range(len(shape))):
- labels = (x - y) % (factor * shape[i]) // factor
- np.putmask(labels, comp_labels < 0, -1)
- label_list.append(labels)
- y = labels * factor
- factor *= shape[i]
- return label_list[::-1]
-
-
-def decons_obs_group_ids(
- comp_ids: npt.NDArray[np.intp],
- obs_ids: npt.NDArray[np.intp],
- shape: Shape,
- labels: Sequence[npt.NDArray[np.signedinteger]],
- xnull: bool,
-) -> list[npt.NDArray[np.intp]]:
- """
- Reconstruct labels from observed group ids.
-
- Parameters
- ----------
- comp_ids : np.ndarray[np.intp]
- obs_ids: np.ndarray[np.intp]
- shape : tuple[int]
- labels : Sequence[np.ndarray[np.signedinteger]]
- xnull : bool
- If nulls are excluded; i.e. -1 labels are passed through.
- """
- if not xnull:
- lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp)
- arr_shape = np.asarray(shape, dtype=np.intp) + lift
- shape = tuple(arr_shape)
-
- if not is_int64_overflow_possible(shape):
- # obs ids are deconstructable! take the fast route!
- out = _decons_group_index(obs_ids, shape)
- return out if xnull or not lift.any() else [x - y for x, y in zip(out, lift)]
-
- indexer = unique_label_indices(comp_ids)
- return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
-
-
-def indexer_from_factorized(
- labels, shape: Shape, compress: bool = True
-) -> npt.NDArray[np.intp]:
- ids = get_group_index(labels, shape, sort=True, xnull=False)
-
- if not compress:
- ngroups = (ids.size and ids.max()) + 1
- else:
- ids, obs = compress_group_index(ids, sort=True)
- ngroups = len(obs)
-
- return get_group_index_sorter(ids, ngroups)
-
-
-def lexsort_indexer(
- keys, orders=None, na_position: str = "last", key: Callable | None = None
-) -> npt.NDArray[np.intp]:
- """
- Performs lexical sorting on a set of keys
-
- Parameters
- ----------
- keys : sequence of arrays
- Sequence of ndarrays to be sorted by the indexer
- orders : bool or list of booleans, optional
- Determines the sorting order for each element in keys. If a list,
- it must be the same length as keys. This determines whether the
- corresponding element in keys should be sorted in ascending
-        (True) or descending (False) order. If a bool, it is applied to all
-        elements as above. If None, defaults to True.
- na_position : {'first', 'last'}, default 'last'
- Determines placement of NA elements in the sorted list ("last" or "first")
- key : Callable, optional
- Callable key function applied to every element in keys before sorting
-
- Returns
- -------
- np.ndarray[np.intp]
- """
- from pandas.core.arrays import Categorical
-
- labels = []
- shape = []
- if isinstance(orders, bool):
- orders = [orders] * len(keys)
- elif orders is None:
- orders = [True] * len(keys)
-
- keys = [ensure_key_mapped(k, key) for k in keys]
-
- for k, order in zip(keys, orders):
- cat = Categorical(k, ordered=True)
-
- if na_position not in ["last", "first"]:
- raise ValueError(f"invalid na_position: {na_position}")
-
- n = len(cat.categories)
- codes = cat.codes.copy()
-
- mask = cat.codes == -1
- if order: # ascending
- if na_position == "last":
- codes = np.where(mask, n, codes)
- elif na_position == "first":
- codes += 1
- else: # not order means descending
- if na_position == "last":
- codes = np.where(mask, n, n - codes - 1)
- elif na_position == "first":
- codes = np.where(mask, 0, n - codes)
- if mask.any():
- n += 1
-
- shape.append(n)
- labels.append(codes)
-
- return indexer_from_factorized(labels, tuple(shape))
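# Illustrative sketch: ignoring NaN placement and per-key ascending flags,
# lexsort_indexer orders rows by the first key, then the second, and so on --
# the same ordering np.lexsort produces when given the keys in reverse
# (np.lexsort treats its *last* key as the primary one).
import numpy as np

primary = np.array([2, 1, 2, 1])
secondary = np.array([0, 3, 1, 2])

order = np.lexsort((secondary, primary))   # primary key goes last
print(order)                               # [3 1 0 2]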
-
-
-def nargsort(
- items,
- kind: str = "quicksort",
- ascending: bool = True,
- na_position: str = "last",
- key: Callable | None = None,
- mask: npt.NDArray[np.bool_] | None = None,
-) -> npt.NDArray[np.intp]:
- """
- Intended to be a drop-in replacement for np.argsort which handles NaNs.
-
- Adds ascending, na_position, and key parameters.
-
- (GH #6399, #5231, #27237)
-
- Parameters
- ----------
- kind : str, default 'quicksort'
- ascending : bool, default True
- na_position : {'first', 'last'}, default 'last'
- key : Optional[Callable], default None
- mask : Optional[np.ndarray[bool]], default None
- Passed when called by ExtensionArray.argsort.
-
- Returns
- -------
- np.ndarray[np.intp]
- """
-
- if key is not None:
- items = ensure_key_mapped(items, key)
- return nargsort(
- items,
- kind=kind,
- ascending=ascending,
- na_position=na_position,
- key=None,
- mask=mask,
- )
-
- if isinstance(items, ABCRangeIndex):
- return items.argsort(ascending=ascending) # TODO: test coverage with key?
- elif not isinstance(items, ABCMultiIndex):
- items = extract_array(items)
- if mask is None:
- mask = np.asarray(isna(items)) # TODO: does this exclude MultiIndex too?
-
- if is_extension_array_dtype(items):
- return items.argsort(ascending=ascending, kind=kind, na_position=na_position)
- else:
- items = np.asanyarray(items)
-
- idx = np.arange(len(items))
- non_nans = items[~mask]
- non_nan_idx = idx[~mask]
-
- nan_idx = np.nonzero(mask)[0]
- if not ascending:
- non_nans = non_nans[::-1]
- non_nan_idx = non_nan_idx[::-1]
- indexer = non_nan_idx[non_nans.argsort(kind=kind)]
- if not ascending:
- indexer = indexer[::-1]
- # Finally, place the NaNs at the end or the beginning according to
- # na_position
- if na_position == "last":
- indexer = np.concatenate([indexer, nan_idx])
- elif na_position == "first":
- indexer = np.concatenate([nan_idx, indexer])
- else:
- raise ValueError(f"invalid na_position: {na_position}")
- return ensure_platform_int(indexer)
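# Illustrative sketch of the NaN handling nargsort layers on top of np.argsort:
# sort the non-NaN positions, then append (or prepend) the NaN positions
# according to na_position. A simplified float-only version, not the real helper.
import numpy as np

def nan_aware_argsort(values, ascending=True, na_position="last"):
    values = np.asarray(values, dtype=float)
    mask = np.isnan(values)
    idx = np.arange(len(values))
    order = idx[~mask][np.argsort(values[~mask], kind="stable")]
    if not ascending:
        order = order[::-1]
    nans = idx[mask]
    return np.concatenate([order, nans] if na_position == "last" else [nans, order])

print(nan_aware_argsort([3.0, np.nan, 1.0, 2.0]))   # [2 3 0 1]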
-
-
-def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
- """
- Implementation of np.argmin/argmax but for ExtensionArray and which
- handles missing values.
-
- Parameters
- ----------
- values : ExtensionArray
- method : {"argmax", "argmin"}
- axis : int, default 0
-
- Returns
- -------
- int
- """
- assert method in {"argmax", "argmin"}
- func = np.argmax if method == "argmax" else np.argmin
-
- mask = np.asarray(isna(values))
- arr_values = values._values_for_argsort()
-
- if arr_values.ndim > 1:
- if mask.any():
- if axis == 1:
- zipped = zip(arr_values, mask)
- else:
- zipped = zip(arr_values.T, mask.T)
- return np.array([_nanargminmax(v, m, func) for v, m in zipped])
- return func(arr_values, axis=axis)
-
- return _nanargminmax(arr_values, mask, func)
-
-
-def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int:
- """
-    See nargminmax.__doc__.
- """
- idx = np.arange(values.shape[0])
- non_nans = values[~mask]
- non_nan_idx = idx[~mask]
-
- return non_nan_idx[func(non_nans)]
-
-
-def _ensure_key_mapped_multiindex(
- index: MultiIndex, key: Callable, level=None
-) -> MultiIndex:
- """
- Returns a new MultiIndex in which key has been applied
- to all levels specified in level (or all levels if level
- is None). Used for key sorting for MultiIndex.
-
- Parameters
- ----------
- index : MultiIndex
- Index to which to apply the key function on the
- specified levels.
- key : Callable
- Function that takes an Index and returns an Index of
- the same shape. This key is applied to each level
- separately. The name of the level can be used to
- distinguish different levels for application.
- level : list-like, int or str, default None
- Level or list of levels to apply the key function to.
- If None, key function is applied to all levels. Other
- levels are left unchanged.
-
- Returns
- -------
- labels : MultiIndex
- Resulting MultiIndex with modified levels.
- """
-
- if level is not None:
- if isinstance(level, (str, int)):
- sort_levels = [level]
- else:
- sort_levels = level
-
- sort_levels = [index._get_level_number(lev) for lev in sort_levels]
- else:
- sort_levels = list(range(index.nlevels)) # satisfies mypy
-
- mapped = [
- ensure_key_mapped(index._get_level_values(level), key)
- if level in sort_levels
- else index._get_level_values(level)
- for level in range(index.nlevels)
- ]
-
- return type(index).from_arrays(mapped)
-
-
-def ensure_key_mapped(values, key: Callable | None, levels=None):
-    Applies a callable key function to the values and checks that the
-    resulting value has the same shape. Can be called on Index
-    subclasses, Series, DataFrames, or ndarrays.
- subclasses, Series, DataFrames, or ndarrays.
-
- Parameters
- ----------
- values : Series, DataFrame, Index subclass, or ndarray
- key : Optional[Callable], key to be called on the values array
- levels : Optional[List], if values is a MultiIndex, list of levels to
- apply the key to.
- """
- from pandas.core.indexes.api import Index
-
- if not key:
- return values
-
- if isinstance(values, ABCMultiIndex):
- return _ensure_key_mapped_multiindex(values, key, level=levels)
-
- result = key(values.copy())
- if len(result) != len(values):
- raise ValueError(
- "User-provided `key` function must not change the shape of the array."
- )
-
- try:
- if isinstance(
- values, Index
- ): # convert to a new Index subclass, not necessarily the same
- result = Index(result)
- else:
- type_of_values = type(values)
- result = type_of_values(result) # try to revert to original type otherwise
- except TypeError:
- raise TypeError(
-            "User-provided `key` function returned an invalid type "
-            f"{type(result)} which could not be converted to {type(values)}."
- )
-
- return result
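# Illustrative sketch of what ensure_key_mapped enables at the public level,
# assuming pandas >= 1.1: the key= callable of sort_values/sort_index must return
# something of the same length, and is applied before sorting.
import pandas as pd

s = pd.Series(["banana", "Apple", "Cherry"])
print(s.sort_values())                              # case-sensitive: Apple, Cherry, banana
print(s.sort_values(key=lambda x: x.str.lower()))   # case-insensitive: Apple, banana, Cherry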
-
-
-def get_flattened_list(
- comp_ids: npt.NDArray[np.intp],
- ngroups: int,
- levels: Iterable[Index],
- labels: Iterable[np.ndarray],
-) -> list[tuple]:
- """Map compressed group id -> key tuple."""
- comp_ids = comp_ids.astype(np.int64, copy=False)
- arrays: DefaultDict[int, list[int]] = defaultdict(list)
- for labs, level in zip(labels, levels):
- table = hashtable.Int64HashTable(ngroups)
- table.map_keys_to_values(comp_ids, labs.astype(np.int64, copy=False))
- for i in range(ngroups):
- arrays[i].append(level[table.get_item(i)])
- return [tuple(array) for array in arrays.values()]
-
-
-def get_indexer_dict(
- label_list: list[np.ndarray], keys: list[Index]
-) -> dict[Hashable, npt.NDArray[np.intp]]:
- """
- Returns
- -------
- dict:
- Labels mapped to indexers.
- """
- shape = tuple(len(x) for x in keys)
-
- group_index = get_group_index(label_list, shape, sort=True, xnull=True)
- if np.all(group_index == -1):
- # Short-circuit, lib.indices_fast will return the same
- return {}
- ngroups = (
- ((group_index.size and group_index.max()) + 1)
- if is_int64_overflow_possible(shape)
- else np.prod(shape, dtype="i8")
- )
-
- sorter = get_group_index_sorter(group_index, ngroups)
-
- sorted_labels = [lab.take(sorter) for lab in label_list]
- group_index = group_index.take(sorter)
-
- return lib.indices_fast(sorter, group_index, keys, sorted_labels)
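A rough sketch of the kind of mapping this produces, via the public ``GroupBy.indices`` property (which is built on this label-to-positions machinery); exact dict ordering and array dtypes may vary:

    import pandas as pd

    df = pd.DataFrame({"k1": ["a", "a", "b"], "k2": [1, 2, 1], "v": [10, 20, 30]})
    # maps each observed key tuple to the positional indexer of its rows,
    # e.g. {("a", 1): array([0]), ("a", 2): array([1]), ("b", 1): array([2])}
    positions = df.groupby(["k1", "k2"]).indices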
-
-
-# ----------------------------------------------------------------------
-# sorting levels...cleverly?
-
-
-def get_group_index_sorter(
- group_index: npt.NDArray[np.intp], ngroups: int | None = None
-) -> npt.NDArray[np.intp]:
- """
- algos.groupsort_indexer implements `counting sort`, which is at least
- O(ngroups), where
- ngroups = prod(shape)
- shape = map(len, keys)
- that is, linear in the number of combinations (cartesian product) of unique
- values of the groupby keys. This can be huge when doing a multi-key groupby.
- np.argsort(kind='mergesort') is O(count x log(count)) where count is the
- length of the data-frame.
- Both algorithms are `stable` sorts, which is necessary for the correctness of
- groupby operations, e.g. consider:
- df.groupby(key)[col].transform('first')
-
- Parameters
- ----------
- group_index : np.ndarray[np.intp]
- signed integer dtype
- ngroups : int or None, default None
-
- Returns
- -------
- np.ndarray[np.intp]
- """
- if ngroups is None:
- ngroups = 1 + group_index.max()
- count = len(group_index)
- alpha = 0.0 # taking complexities literally; there may be
- beta = 1.0 # some room for fine-tuning these parameters
- do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
- if do_groupsort:
- sorter, _ = algos.groupsort_indexer(
- ensure_platform_int(group_index),
- ngroups,
- )
- # sorter _should_ already be intp, but mypy is not yet able to verify
- else:
- sorter = group_index.argsort(kind="mergesort")
- return ensure_platform_int(sorter)
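The stability requirement stressed in the docstring can be illustrated with plain numpy: a stable argsort keeps the original relative order of rows within each group, which is what makes a per-group 'first'/'last' well defined after sorting.

    import numpy as np

    group_index = np.array([1, 0, 1, 0, 1], dtype=np.intp)
    sorter = group_index.argsort(kind="mergesort")  # stable: ties keep input order
    print(sorter)  # [1 3 0 2 4] -> group 0 rows stay in order (1, 3), group 1 in order (0, 2, 4)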
-
-
-def compress_group_index(
- group_index: npt.NDArray[np.int64], sort: bool = True
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
- """
- Group_index is offsets into cartesian product of all possible labels. This
- space can be huge, so this function compresses it, by computing offsets
- (comp_ids) into the list of unique labels (obs_group_ids).
- """
- size_hint = len(group_index)
- table = hashtable.Int64HashTable(size_hint)
-
- group_index = ensure_int64(group_index)
-
- # note, group labels come out ascending (ie, 1,2,3 etc)
- comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
-
- if sort and len(obs_group_ids) > 0:
- obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
-
- return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
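Conceptually (ignoring the -1 sentinels and the hashtable fast path), the sorted case behaves like ``np.unique`` with ``return_inverse``; a small numpy sketch:

    import numpy as np

    group_index = np.array([5, 2, 5, 9, 2], dtype=np.int64)
    obs_group_ids, comp_ids = np.unique(group_index, return_inverse=True)
    # obs_group_ids -> array([2, 5, 9])       observed offsets, ascending
    # comp_ids      -> array([1, 0, 1, 2, 0]) compressed ids pointing into obs_group_ids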
-
-
-def _reorder_by_uniques(
- uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
-) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
- """
- Parameters
- ----------
- uniques : np.ndarray[np.int64]
- labels : np.ndarray[np.intp]
-
- Returns
- -------
- np.ndarray[np.int64]
- np.ndarray[np.intp]
- """
- # sorter is index where elements ought to go
- sorter = uniques.argsort()
-
- # reverse_indexer is where elements came from
- reverse_indexer = np.empty(len(sorter), dtype=np.intp)
- reverse_indexer.put(sorter, np.arange(len(sorter)))
-
- mask = labels < 0
-
- # move labels to right locations (ie, unsort ascending labels)
- labels = reverse_indexer.take(labels)
- np.putmask(labels, mask, -1)
-
- # sort observed ids
- uniques = uniques.take(sorter)
-
- return uniques, labels
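Worked through on concrete values (omitting the -1 mask handling), the reorder step does the following:

    import numpy as np

    uniques = np.array([30, 10, 20], dtype=np.int64)
    labels = np.array([0, 1, 2, 1], dtype=np.intp)
    sorter = uniques.argsort()                           # [1, 2, 0]
    reverse_indexer = np.empty(len(sorter), dtype=np.intp)
    reverse_indexer.put(sorter, np.arange(len(sorter)))  # [2, 0, 1]
    labels = reverse_indexer.take(labels)                # [2, 0, 1, 0]
    uniques = uniques.take(sorter)                       # [10, 20, 30]
    # each label still refers to the same underlying value, now via ascending uniques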
diff --git a/contrib/python/pandas/py3/pandas/core/sparse/__init__.py b/contrib/python/pandas/py3/pandas/core/sparse/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/sparse/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/sparse/api.py b/contrib/python/pandas/py3/pandas/core/sparse/api.py
deleted file mode 100644
index 2a324ebf77d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/sparse/api.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from pandas.core.arrays.sparse import (
- SparseArray,
- SparseDtype,
-)
-
-__all__ = ["SparseArray", "SparseDtype"]
diff --git a/contrib/python/pandas/py3/pandas/core/strings/__init__.py b/contrib/python/pandas/py3/pandas/core/strings/__init__.py
deleted file mode 100644
index eb650477c2b..00000000000
--- a/contrib/python/pandas/py3/pandas/core/strings/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-Implementation of pandas.Series.str and its interface.
-
-* strings.accessor.StringMethods : Accessor for Series.str
-* strings.base.BaseStringArrayMethods: Mixin ABC for EAs to implement str methods
-
-Most methods on the StringMethods accessor follow the pattern:
-
- 1. extract the array from the series (or index)
- 2. Call that array's implementation of the string method
- 3. Wrap the result (in a Series, index, or DataFrame)
-
-Pandas extension arrays implementing string methods should inherit from
-pandas.core.strings.base.BaseStringArrayMethods. This is an ABC defining
-the various string methods. To avoid namespace clashes and pollution,
-these are prefixed with `_str_`. So ``Series.str.upper()`` calls
-``Series.array._str_upper()``. The interface isn't currently public
-to other string extension arrays.
-"""
-# Pandas' current implementation is in ObjectStringArrayMixin. This is designed
-# to work on object-dtype ndarrays.
-#
-# BaseStringArrayMethods
-# - ObjectStringArrayMixin
-# - StringArray
-# - PandasArray
-# - Categorical
-# - ArrowStringArray
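The dispatch pattern described above can be observed directly; ``_str_*`` is a private interface, shown here purely for illustration and subject to change between versions:

    >>> import pandas as pd
    >>> s = pd.Series(["a", "b"])
    >>> s.str.upper()                # public accessor ...
    0    A
    1    B
    dtype: object
    >>> raw = s.array._str_upper()   # ... dispatches to the backing array's _str_ method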
diff --git a/contrib/python/pandas/py3/pandas/core/strings/accessor.py b/contrib/python/pandas/py3/pandas/core/strings/accessor.py
deleted file mode 100644
index e544bde16da..00000000000
--- a/contrib/python/pandas/py3/pandas/core/strings/accessor.py
+++ /dev/null
@@ -1,3376 +0,0 @@
-from __future__ import annotations
-
-import codecs
-from functools import wraps
-import re
-from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- Literal,
- cast,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- AlignJoin,
- DtypeObj,
- F,
- Scalar,
-)
-from pandas.util._decorators import Appender
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- ensure_object,
- is_bool_dtype,
- is_categorical_dtype,
- is_integer,
- is_list_like,
- is_object_dtype,
- is_re,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCMultiIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core.arrays.arrow.dtype import ArrowDtype
-from pandas.core.base import NoNewAttributesMixin
-from pandas.core.construction import extract_array
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
-
-_shared_docs: dict[str, str] = {}
-_cpython_optimized_encoders = (
- "utf-8",
- "utf8",
- "latin-1",
- "latin1",
- "iso-8859-1",
- "mbcs",
- "ascii",
-)
-_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
-
-
-def forbid_nonstring_types(
- forbidden: list[str] | None, name: str | None = None
-) -> Callable[[F], F]:
- """
- Decorator to forbid specific types for a method of StringMethods.
-
- For calling `.str.{method}` on a Series or Index, it is necessary to first
- initialize the :class:`StringMethods` object, and then call the method.
- However, different methods allow different input types, and so this can not
- be checked during :meth:`StringMethods.__init__`, but must be done on a
- per-method basis. This decorator exists to facilitate this process, and
- make it explicit which (inferred) types are disallowed by the method.
-
- :meth:`StringMethods.__init__` allows the *union* of types its different
- methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
- namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
-
- The default string types ['string', 'empty'] are allowed for all methods.
- For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
- then needs to forbid the types it is not intended for.
-
- Parameters
- ----------
- forbidden : list-of-str or None
- List of forbidden non-string types, may be one or more of
- `['bytes', 'mixed', 'mixed-integer']`.
- name : str, default None
- Name of the method to use in the error message. By default, this is
- None, in which case the name from the method being wrapped will be
- copied. However, for working with further wrappers (like _pat_wrapper
- and _noarg_wrapper), it is necessary to specify the name.
-
- Returns
- -------
- func : wrapper
- The method to which the decorator is applied, with an added check that
- enforces the inferred type to not be in the list of forbidden types.
-
- Raises
- ------
- TypeError
- If the inferred type of the underlying data is in `forbidden`.
- """
- # deal with None
- forbidden = [] if forbidden is None else forbidden
-
- allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
- forbidden
- )
-
- def _forbid_nonstring_types(func: F) -> F:
- func_name = func.__name__ if name is None else name
-
- @wraps(func)
- def wrapper(self, *args, **kwargs):
- if self._inferred_dtype not in allowed_types:
- msg = (
- f"Cannot use .str.{func_name} with values of "
- f"inferred dtype '{self._inferred_dtype}'."
- )
- raise TypeError(msg)
- return func(self, *args, **kwargs)
-
- wrapper.__name__ = func_name
- return cast(F, wrapper)
-
- return _forbid_nonstring_types
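The observable effect of the decorator, using ``split`` (decorated further below with ``forbid_nonstring_types(["bytes"])``) on byte data:

    >>> import pandas as pd
    >>> pd.Series([b"a_b"]).str.split("_")
    Traceback (most recent call last):
        ...
    TypeError: Cannot use .str.split with values of inferred dtype 'bytes'.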
-
-
-def _map_and_wrap(name, docstring):
- @forbid_nonstring_types(["bytes"], name=name)
- def wrapper(self):
- result = getattr(self._data.array, f"_str_{name}")()
- return self._wrap_result(result)
-
- wrapper.__doc__ = docstring
- return wrapper
-
-
-class StringMethods(NoNewAttributesMixin):
- """
- Vectorized string functions for Series and Index.
-
- NAs stay NA unless handled otherwise by a particular method.
- Patterned after Python's string methods, with some inspiration from
- R's stringr package.
-
- Examples
- --------
- >>> s = pd.Series(["A_Str_Series"])
- >>> s
- 0 A_Str_Series
- dtype: object
-
- >>> s.str.split("_")
- 0 [A, Str, Series]
- dtype: object
-
- >>> s.str.replace("_", "")
- 0 AStrSeries
- dtype: object
- """
-
- # Note: see the docstring in pandas.core.strings.__init__
- # for an explanation of the implementation.
- # TODO: Dispatch all the methods
- # Currently the following are not dispatched to the array
- # * cat
- # * extractall
-
- def __init__(self, data) -> None:
- from pandas.core.arrays.string_ import StringDtype
-
- self._inferred_dtype = self._validate(data)
- self._is_categorical = is_categorical_dtype(data.dtype)
- self._is_string = isinstance(data.dtype, StringDtype)
- self._data = data
-
- self._index = self._name = None
- if isinstance(data, ABCSeries):
- self._index = data.index
- self._name = data.name
-
- # ._values.categories works for both Series/Index
- self._parent = data._values.categories if self._is_categorical else data
- # save orig to blow up categoricals to the right type
- self._orig = data
- self._freeze()
-
- @staticmethod
- def _validate(data):
- """
- Auxiliary function for StringMethods, infers and checks dtype of data.
-
- This is a "first line of defence" at the creation of the StringMethods-
- object, and just checks that the dtype is in the
- *union* of the allowed types over all string methods below; this
- restriction is then refined on a per-method basis using the decorator
- @forbid_nonstring_types (more info in the corresponding docstring).
-
- This really should exclude all series/index with any non-string values,
- but that isn't practical for performance reasons until we have a str
- dtype (GH 9343 / 13877)
-
- Parameters
- ----------
- data : The content of the Series
-
- Returns
- -------
- dtype : inferred dtype of data
- """
- if isinstance(data, ABCMultiIndex):
- raise AttributeError(
- "Can only use .str accessor with Index, not MultiIndex"
- )
-
- # see _libs/lib.pyx for list of inferred types
- allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
-
- data = extract_array(data)
-
- values = getattr(data, "categories", data) # categorical / normal
-
- inferred_dtype = lib.infer_dtype(values, skipna=True)
-
- if inferred_dtype not in allowed_types:
- raise AttributeError("Can only use .str accessor with string values!")
- return inferred_dtype
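In practice this gate is what rejects clearly non-string data as soon as the accessor is requested:

    >>> import pandas as pd
    >>> pd.Series([1, 2, 3]).str
    Traceback (most recent call last):
        ...
    AttributeError: Can only use .str accessor with string values!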
-
- def __getitem__(self, key):
- result = self._data.array._str_getitem(key)
- return self._wrap_result(result)
-
- def _wrap_result(
- self,
- result,
- name=None,
- expand: bool | None = None,
- fill_value=np.nan,
- returns_string: bool = True,
- returns_bool: bool = False,
- ):
- from pandas import (
- Index,
- MultiIndex,
- )
-
- if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
- if isinstance(result, ABCDataFrame):
- result = result.__finalize__(self._orig, name="str")
- return result
- assert result.ndim < 3
-
- # We can be wrapping a string / object / categorical result, in which
- # case we'll want to return the same dtype as the input.
- # Or we can be wrapping a numeric output, in which case we don't want
- # to return a StringArray.
- # Ideally the array method returns the right array type.
- if expand is None:
- # infer from ndim if expand is not specified
- expand = result.ndim != 1
- elif expand is True and not isinstance(self._orig, ABCIndex):
- # required when expand=True is explicitly specified
- # not needed when inferred
- if isinstance(result.dtype, ArrowDtype):
- import pyarrow as pa
-
- from pandas.compat import pa_version_under11p0
-
- from pandas.core.arrays.arrow.array import ArrowExtensionArray
-
- value_lengths = result._data.combine_chunks().value_lengths()
- max_len = pa.compute.max(value_lengths).as_py()
- min_len = pa.compute.min(value_lengths).as_py()
- if result._hasna:
- # ArrowExtensionArray.fillna doesn't work for list scalars
- result = ArrowExtensionArray(
- result._data.fill_null([None] * max_len)
- )
- if min_len < max_len:
- # append nulls to each scalar list element up to max_len
- if not pa_version_under11p0:
- result = ArrowExtensionArray(
- pa.compute.list_slice(
- result._data,
- start=0,
- stop=max_len,
- return_fixed_size_list=True,
- )
- )
- else:
- all_null = np.full(max_len, fill_value=None, dtype=object)
- values = result.to_numpy()
- new_values = []
- for row in values:
- if len(row) < max_len:
- nulls = all_null[: max_len - len(row)]
- row = np.append(row, nulls)
- new_values.append(row)
- pa_type = result._data.type
- result = ArrowExtensionArray(pa.array(new_values, type=pa_type))
- if name is not None:
- labels = name
- else:
- labels = range(max_len)
- result = {
- label: ArrowExtensionArray(pa.array(res))
- for label, res in zip(labels, (zip(*result.tolist())))
- }
- elif is_object_dtype(result):
-
- def cons_row(x):
- if is_list_like(x):
- return x
- else:
- return [x]
-
- result = [cons_row(x) for x in result]
- if result and not self._is_string:
- # propagate nan values to match longest sequence (GH 18450)
- max_len = max(len(x) for x in result)
- result = [
- x * max_len if len(x) == 0 or x[0] is np.nan else x
- for x in result
- ]
-
- if not isinstance(expand, bool):
- raise ValueError("expand must be True or False")
-
- if expand is False:
- # if expand is False, result should have the same name
- # as the original otherwise specified
- if name is None:
- name = getattr(result, "name", None)
- if name is None:
- # do not use logical or, _orig may be a DataFrame
- # which has "name" column
- name = self._orig.name
-
- # Wait until we are sure result is a Series or Index before
- # checking attributes (GH 12180)
- if isinstance(self._orig, ABCIndex):
- # if result is a boolean np.array, return the np.array
- # instead of wrapping it into a boolean Index (GH 8875)
- if is_bool_dtype(result):
- return result
-
- if expand:
- result = list(result)
- out = MultiIndex.from_tuples(result, names=name)
- if out.nlevels == 1:
- # We had all tuples of length-one, which are
- # better represented as a regular Index.
- out = out.get_level_values(0)
- return out
- else:
- return Index(result, name=name)
- else:
- index = self._orig.index
- # This is a mess.
- dtype: DtypeObj | str | None
- vdtype = getattr(result, "dtype", None)
- if self._is_string:
- if is_bool_dtype(vdtype):
- dtype = result.dtype
- elif returns_string:
- dtype = self._orig.dtype
- else:
- dtype = vdtype
- else:
- dtype = vdtype
-
- if expand:
- cons = self._orig._constructor_expanddim
- result = cons(result, columns=name, index=index, dtype=dtype)
- else:
- # Must be a Series
- cons = self._orig._constructor
- result = cons(result, name=name, index=index, dtype=dtype)
- result = result.__finalize__(self._orig, method="str")
- if name is not None and result.ndim == 1:
- # __finalize__ might copy over the original name, but we may
- # want the new name (e.g. str.extract).
- result.name = name
- return result
-
- def _get_series_list(self, others):
- """
- Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
- into a list of Series (elements without an index must match the length
- of the calling Series/Index).
-
- Parameters
- ----------
- others : Series, DataFrame, np.ndarray, list-like or list-like of
- Objects that are either Series, Index or np.ndarray (1-dim).
-
- Returns
- -------
- list of Series
- Others transformed into list of Series.
- """
- from pandas import (
- DataFrame,
- Series,
- )
-
- # self._orig is either Series or Index
- idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
-
- # Generally speaking, all objects without an index inherit the index
- # `idx` of the calling Series/Index - i.e. must have matching length.
- # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
- if isinstance(others, ABCSeries):
- return [others]
- elif isinstance(others, ABCIndex):
- return [Series(others, index=idx, dtype=others.dtype)]
- elif isinstance(others, ABCDataFrame):
- return [others[x] for x in others]
- elif isinstance(others, np.ndarray) and others.ndim == 2:
- others = DataFrame(others, index=idx)
- return [others[x] for x in others]
- elif is_list_like(others, allow_sets=False):
- others = list(others) # ensure iterators do not get read twice etc
-
- # in case of list-like `others`, all elements must be
- # either Series/Index/np.ndarray (1-dim)...
- if all(
- isinstance(x, (ABCSeries, ABCIndex))
- or (isinstance(x, np.ndarray) and x.ndim == 1)
- for x in others
- ):
- los: list[Series] = []
- while others: # iterate through list and append each element
- los = los + self._get_series_list(others.pop(0))
- return los
- # ... or just strings
- elif all(not is_list_like(x) for x in others):
- return [Series(others, index=idx)]
- raise TypeError(
- "others must be Series, Index, DataFrame, np.ndarray "
- "or list-like (either containing only strings or "
- "containing only objects of type Series/Index/"
- "np.ndarray[1-dim])"
- )
-
- @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
- def cat(
- self,
- others=None,
- sep=None,
- na_rep=None,
- join: AlignJoin = "left",
- ) -> str | Series | Index:
- """
- Concatenate strings in the Series/Index with given separator.
-
- If `others` is specified, this function concatenates the Series/Index
- and elements of `others` element-wise.
- If `others` is not passed, then all values in the Series/Index are
- concatenated into a single string with a given `sep`.
-
- Parameters
- ----------
- others : Series, Index, DataFrame, np.ndarray or list-like
- Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
- other list-likes of strings must have the same length as the
- calling Series/Index, with the exception of indexed objects (i.e.
- Series/Index/DataFrame) if `join` is not None.
-
- If others is a list-like that contains a combination of Series,
- Index or np.ndarray (1-dim), then all elements will be unpacked and
- must satisfy the above criteria individually.
-
- If others is None, the method returns the concatenation of all
- strings in the calling Series/Index.
- sep : str, default ''
- The separator between the different elements/columns. By default
- the empty string `''` is used.
- na_rep : str or None, default None
- Representation that is inserted for all missing values:
-
- - If `na_rep` is None, and `others` is None, missing values in the
- Series/Index are omitted from the result.
- - If `na_rep` is None, and `others` is not None, a row containing a
- missing value in any of the columns (before concatenation) will
- have a missing value in the result.
- join : {'left', 'right', 'outer', 'inner'}, default 'left'
- Determines the join-style between the calling Series/Index and any
- Series/Index/DataFrame in `others` (objects without an index need
- to match the length of the calling Series/Index). To disable
- alignment, use `.values` on any Series/Index/DataFrame in `others`.
-
- Returns
- -------
- str, Series or Index
- If `others` is None, `str` is returned, otherwise a `Series/Index`
- (same type as caller) of objects is returned.
-
- See Also
- --------
- split : Split each string in the Series/Index.
- join : Join lists contained as elements in the Series/Index.
-
- Examples
- --------
- When not passing `others`, all values are concatenated into a single
- string:
-
- >>> s = pd.Series(['a', 'b', np.nan, 'd'])
- >>> s.str.cat(sep=' ')
- 'a b d'
-
- By default, NA values in the Series are ignored. Using `na_rep`, they
- can be given a representation:
-
- >>> s.str.cat(sep=' ', na_rep='?')
- 'a b ? d'
-
- If `others` is specified, corresponding values are concatenated with
- the separator. Result will be a Series of strings.
-
- >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
- 0 a,A
- 1 b,B
- 2 NaN
- 3 d,D
- dtype: object
-
- Missing values will remain missing in the result, but can again be
- represented using `na_rep`
-
- >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
- 0 a,A
- 1 b,B
- 2 -,C
- 3 d,D
- dtype: object
-
- If `sep` is not specified, the values are concatenated without
- separation.
-
- >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
- 0 aA
- 1 bB
- 2 -C
- 3 dD
- dtype: object
-
- Series with different indexes can be aligned before concatenation. The
- `join`-keyword works as in other methods.
-
- >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
- >>> s.str.cat(t, join='left', na_rep='-')
- 0 aa
- 1 b-
- 2 -c
- 3 dd
- dtype: object
- >>>
- >>> s.str.cat(t, join='outer', na_rep='-')
- 0 aa
- 1 b-
- 2 -c
- 3 dd
- 4 -e
- dtype: object
- >>>
- >>> s.str.cat(t, join='inner', na_rep='-')
- 0 aa
- 2 -c
- 3 dd
- dtype: object
- >>>
- >>> s.str.cat(t, join='right', na_rep='-')
- 3 dd
- 0 aa
- 4 -e
- 2 -c
- dtype: object
-
- For more examples, see :ref:`here <text.concatenate>`.
- """
- # TODO: dispatch
- from pandas import (
- Index,
- Series,
- concat,
- )
-
- if isinstance(others, str):
- raise ValueError("Did you mean to supply a `sep` keyword?")
- if sep is None:
- sep = ""
-
- if isinstance(self._orig, ABCIndex):
- data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
- else: # Series
- data = self._orig
-
- # concatenate Series/Index with itself if no "others"
- if others is None:
- # error: Incompatible types in assignment (expression has type
- # "ndarray", variable has type "Series")
- data = ensure_object(data) # type: ignore[assignment]
- na_mask = isna(data)
- if na_rep is None and na_mask.any():
- return sep.join(data[~na_mask])
- elif na_rep is not None and na_mask.any():
- return sep.join(np.where(na_mask, na_rep, data))
- else:
- return sep.join(data)
-
- try:
- # turn anything in "others" into lists of Series
- others = self._get_series_list(others)
- except ValueError as err: # do not catch TypeError raised by _get_series_list
- raise ValueError(
- "If `others` contains arrays or lists (or other "
- "list-likes without an index), these must all be "
- "of the same length as the calling Series/Index."
- ) from err
-
- # align if required
- if any(not data.index.equals(x.index) for x in others):
- # Need to add keys for uniqueness in case of duplicate columns
- others = concat(
- others,
- axis=1,
- join=(join if join == "inner" else "outer"),
- keys=range(len(others)),
- sort=False,
- copy=False,
- )
- data, others = data.align(others, join=join)
- others = [others[x] for x in others] # again list of Series
-
- all_cols = [ensure_object(x) for x in [data] + others]
- na_masks = np.array([isna(x) for x in all_cols])
- union_mask = np.logical_or.reduce(na_masks, axis=0)
-
- if na_rep is None and union_mask.any():
- # no na_rep means NaNs for all rows where any column has a NaN
- # only necessary if there are actually any NaNs
- result = np.empty(len(data), dtype=object)
- np.putmask(result, union_mask, np.nan)
-
- not_masked = ~union_mask
- result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
- elif na_rep is not None and union_mask.any():
- # fill NaNs with na_rep in case there are actually any NaNs
- all_cols = [
- np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
- ]
- result = cat_safe(all_cols, sep)
- else:
- # no NaNs - can just concatenate
- result = cat_safe(all_cols, sep)
-
- out: Index | Series
- if isinstance(self._orig, ABCIndex):
- # add dtype for case that result is all-NA
-
- out = Index(result, dtype=object, name=self._orig.name)
- else: # Series
- if is_categorical_dtype(self._orig.dtype):
- # We need to infer the new categories.
- dtype = None
- else:
- dtype = self._orig.dtype
- res_ser = Series(
- result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
- )
- out = res_ser.__finalize__(self._orig, method="str_cat")
- return out
-
- _shared_docs[
- "str_split"
- ] = r"""
- Split strings around given separator/delimiter.
-
- Splits the string in the Series/Index from the %(side)s,
- at the specified delimiter string.
-
- Parameters
- ----------
- pat : str%(pat_regex)s, optional
- %(pat_description)s.
- If not specified, split on whitespace.
- n : int, default -1 (all)
- Limit number of splits in output.
- ``None``, 0 and -1 will be interpreted as return all splits.
- expand : bool, default False
- Expand the split strings into separate columns.
-
- - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
- - If ``False``, return Series/Index, containing lists of strings.
- %(regex_argument)s
- Returns
- -------
- Series, Index, DataFrame or MultiIndex
- Type matches caller unless ``expand=True`` (see Notes).
- %(raises_split)s
- See Also
- --------
- Series.str.split : Split strings around given separator/delimiter.
- Series.str.rsplit : Splits string around given separator/delimiter,
- starting from the right.
- Series.str.join : Join lists contained as elements in the Series/Index
- with passed delimiter.
- str.split : Standard library version for split.
- str.rsplit : Standard library version for rsplit.
-
- Notes
- -----
- The handling of the `n` keyword depends on the number of found splits:
-
- - If found splits > `n`, make first `n` splits only
- - If found splits <= `n`, make all splits
- - If for a certain row the number of found splits < `n`,
- append `None` for padding up to `n` if ``expand=True``
-
- If using ``expand=True``, Series and Index callers return DataFrame and
- MultiIndex objects, respectively.
- %(regex_pat_note)s
- Examples
- --------
- >>> s = pd.Series(
- ... [
- ... "this is a regular sentence",
- ... "https://docs.python.org/3/tutorial/index.html",
- ... np.nan
- ... ]
- ... )
- >>> s
- 0 this is a regular sentence
- 1 https://docs.python.org/3/tutorial/index.html
- 2 NaN
- dtype: object
-
- In the default setting, the string is split by whitespace.
-
- >>> s.str.split()
- 0 [this, is, a, regular, sentence]
- 1 [https://docs.python.org/3/tutorial/index.html]
- 2 NaN
- dtype: object
-
- Without the `n` parameter, the outputs of `rsplit` and `split`
- are identical.
-
- >>> s.str.rsplit()
- 0 [this, is, a, regular, sentence]
- 1 [https://docs.python.org/3/tutorial/index.html]
- 2 NaN
- dtype: object
-
- The `n` parameter can be used to limit the number of splits on the
- delimiter. The outputs of `split` and `rsplit` are different.
-
- >>> s.str.split(n=2)
- 0 [this, is, a regular sentence]
- 1 [https://docs.python.org/3/tutorial/index.html]
- 2 NaN
- dtype: object
-
- >>> s.str.rsplit(n=2)
- 0 [this is a, regular, sentence]
- 1 [https://docs.python.org/3/tutorial/index.html]
- 2 NaN
- dtype: object
-
- The `pat` parameter can be used to split by other characters.
-
- >>> s.str.split(pat="/")
- 0 [this is a regular sentence]
- 1 [https:, , docs.python.org, 3, tutorial, index...
- 2 NaN
- dtype: object
-
- When using ``expand=True``, the split elements will expand out into
- separate columns. If NaN is present, it is propagated throughout
- the columns during the split.
-
- >>> s.str.split(expand=True)
- 0 1 2 3 4
- 0 this is a regular sentence
- 1 https://docs.python.org/3/tutorial/index.html None None None None
- 2 NaN NaN NaN NaN NaN
-
- For slightly more complex use cases like splitting the html document name
- from a url, a combination of parameter settings can be used.
-
- >>> s.str.rsplit("/", n=1, expand=True)
- 0 1
- 0 this is a regular sentence None
- 1 https://docs.python.org/3/tutorial index.html
- 2 NaN NaN
- %(regex_examples)s"""
-
- @Appender(
- _shared_docs["str_split"]
- % {
- "side": "beginning",
- "pat_regex": " or compiled regex",
- "pat_description": "String or regular expression to split on",
- "regex_argument": """
- regex : bool, default None
- Determines if the passed-in pattern is a regular expression:
-
- - If ``True``, assumes the passed-in pattern is a regular expression
- - If ``False``, treats the pattern as a literal string.
- - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
- - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
- - Cannot be set to False if `pat` is a compiled regex
-
- .. versionadded:: 1.4.0
- """,
- "raises_split": """
- Raises
- ------
- ValueError
- * if `regex` is False and `pat` is a compiled regex
- """,
- "regex_pat_note": """
- Use of `regex=False` with a `pat` as a compiled regex will raise an error.
- """,
- "method": "split",
- "regex_examples": r"""
- Remember to escape special characters when explicitly using regular expressions.
-
- >>> s = pd.Series(["foo and bar plus baz"])
- >>> s.str.split(r"and|plus", expand=True)
- 0 1 2
- 0 foo bar baz
-
- Regular expressions can be used to handle urls or file names.
- When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
- as a regex only if ``len(pat) != 1``.
-
- >>> s = pd.Series(['foojpgbar.jpg'])
- >>> s.str.split(r".", expand=True)
- 0 1
- 0 foojpgbar jpg
-
- >>> s.str.split(r"\.jpg", expand=True)
- 0 1
- 0 foojpgbar
-
- When ``regex=True``, `pat` is interpreted as a regex
-
- >>> s.str.split(r"\.jpg", regex=True, expand=True)
- 0 1
- 0 foojpgbar
-
- A compiled regex can be passed as `pat`
-
- >>> import re
- >>> s.str.split(re.compile(r"\.jpg"), expand=True)
- 0 1
- 0 foojpgbar
-
- When ``regex=False``, `pat` is interpreted as the string itself
-
- >>> s.str.split(r"\.jpg", regex=False, expand=True)
- 0
- 0 foojpgbar.jpg
- """,
- }
- )
- @forbid_nonstring_types(["bytes"])
- def split(
- self,
- pat: str | re.Pattern | None = None,
- *,
- n=-1,
- expand: bool = False,
- regex: bool | None = None,
- ):
- if regex is False and is_re(pat):
- raise ValueError(
- "Cannot use a compiled regex as replacement pattern with regex=False"
- )
- if is_re(pat):
- regex = True
- result = self._data.array._str_split(pat, n, expand, regex)
- return self._wrap_result(result, returns_string=expand, expand=expand)
-
- @Appender(
- _shared_docs["str_split"]
- % {
- "side": "end",
- "pat_regex": "",
- "pat_description": "String to split on",
- "regex_argument": "",
- "raises_split": "",
- "regex_pat_note": "",
- "method": "rsplit",
- "regex_examples": "",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def rsplit(self, pat=None, *, n=-1, expand: bool = False):
- result = self._data.array._str_rsplit(pat, n=n)
- return self._wrap_result(result, expand=expand, returns_string=expand)
-
- _shared_docs[
- "str_partition"
- ] = """
- Split the string at the %(side)s occurrence of `sep`.
-
- This method splits the string at the %(side)s occurrence of `sep`,
- and returns 3 elements containing the part before the separator,
- the separator itself, and the part after the separator.
- If the separator is not found, return %(return)s.
-
- Parameters
- ----------
- sep : str, default whitespace
- String to split on.
- expand : bool, default True
- If True, return DataFrame/MultiIndex expanding dimensionality.
- If False, return Series/Index.
-
- Returns
- -------
- DataFrame/MultiIndex or Series/Index of objects
-
- See Also
- --------
- %(also)s
- Series.str.split : Split strings around given separators.
- str.partition : Standard library version.
-
- Examples
- --------
-
- >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
- >>> s
- 0 Linda van der Berg
- 1 George Pitt-Rivers
- dtype: object
-
- >>> s.str.partition()
- 0 1 2
- 0 Linda van der Berg
- 1 George Pitt-Rivers
-
- To partition by the last space instead of the first one:
-
- >>> s.str.rpartition()
- 0 1 2
- 0 Linda van der Berg
- 1 George Pitt-Rivers
-
- To partition by something different than a space:
-
- >>> s.str.partition('-')
- 0 1 2
- 0 Linda van der Berg
- 1 George Pitt - Rivers
-
- To return a Series containing tuples instead of a DataFrame:
-
- >>> s.str.partition('-', expand=False)
- 0 (Linda van der Berg, , )
- 1 (George Pitt, -, Rivers)
- dtype: object
-
- Also available on indices:
-
- >>> idx = pd.Index(['X 123', 'Y 999'])
- >>> idx
- Index(['X 123', 'Y 999'], dtype='object')
-
- Which will create a MultiIndex:
-
- >>> idx.str.partition()
- MultiIndex([('X', ' ', '123'),
- ('Y', ' ', '999')],
- )
-
- Or an index with tuples with ``expand=False``:
-
- >>> idx.str.partition(expand=False)
- Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
- """
-
- @Appender(
- _shared_docs["str_partition"]
- % {
- "side": "first",
- "return": "3 elements containing the string itself, followed by two "
- "empty strings",
- "also": "rpartition : Split the string at the last occurrence of `sep`.",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def partition(self, sep: str = " ", expand: bool = True):
- result = self._data.array._str_partition(sep, expand)
- return self._wrap_result(result, expand=expand, returns_string=expand)
-
- @Appender(
- _shared_docs["str_partition"]
- % {
- "side": "last",
- "return": "3 elements containing two empty strings, followed by the "
- "string itself",
- "also": "partition : Split the string at the first occurrence of `sep`.",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def rpartition(self, sep: str = " ", expand: bool = True):
- result = self._data.array._str_rpartition(sep, expand)
- return self._wrap_result(result, expand=expand, returns_string=expand)
-
- def get(self, i):
- """
- Extract element from each component at specified position or with specified key.
-
- Extract element from lists, tuples, dict, or strings in each element in the
- Series/Index.
-
- Parameters
- ----------
- i : int or hashable dict label
- Position or key of element to extract.
-
- Returns
- -------
- Series or Index
-
- Examples
- --------
- >>> s = pd.Series(["String",
- ... (1, 2, 3),
- ... ["a", "b", "c"],
- ... 123,
- ... -456,
- ... {1: "Hello", "2": "World"}])
- >>> s
- 0 String
- 1 (1, 2, 3)
- 2 [a, b, c]
- 3 123
- 4 -456
- 5 {1: 'Hello', '2': 'World'}
- dtype: object
-
- >>> s.str.get(1)
- 0 t
- 1 2
- 2 b
- 3 NaN
- 4 NaN
- 5 Hello
- dtype: object
-
- >>> s.str.get(-1)
- 0 g
- 1 3
- 2 c
- 3 NaN
- 4 NaN
- 5 None
- dtype: object
-
- Return element with given key
-
- >>> s = pd.Series([{"name": "Hello", "value": "World"},
- ... {"name": "Goodbye", "value": "Planet"}])
- >>> s.str.get('name')
- 0 Hello
- 1 Goodbye
- dtype: object
- """
- result = self._data.array._str_get(i)
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def join(self, sep):
- """
- Join lists contained as elements in the Series/Index with passed delimiter.
-
- If the elements of a Series are lists themselves, join the content of these
- lists using the delimiter passed to the function.
- This function is an equivalent to :meth:`str.join`.
-
- Parameters
- ----------
- sep : str
- Delimiter to use between list entries.
-
- Returns
- -------
- Series/Index: object
- The list entries concatenated by intervening occurrences of the
- delimiter.
-
- Raises
- ------
- AttributeError
- If the supplied Series contains neither strings nor lists.
-
- See Also
- --------
- str.join : Standard library version of this method.
- Series.str.split : Split strings around given separator/delimiter.
-
- Notes
- -----
- If any of the list items is not a string object, the result of the join
- will be `NaN`.
-
- Examples
- --------
- Example with a list that contains non-string elements.
-
- >>> s = pd.Series([['lion', 'elephant', 'zebra'],
- ... [1.1, 2.2, 3.3],
- ... ['cat', np.nan, 'dog'],
- ... ['cow', 4.5, 'goat'],
- ... ['duck', ['swan', 'fish'], 'guppy']])
- >>> s
- 0 [lion, elephant, zebra]
- 1 [1.1, 2.2, 3.3]
- 2 [cat, nan, dog]
- 3 [cow, 4.5, goat]
- 4 [duck, [swan, fish], guppy]
- dtype: object
-
- Join all lists using a '-'. The lists containing object(s) of types other
- than str will produce a NaN.
-
- >>> s.str.join('-')
- 0 lion-elephant-zebra
- 1 NaN
- 2 NaN
- 3 NaN
- 4 NaN
- dtype: object
- """
- result = self._data.array._str_join(sep)
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def contains(
- self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
- ):
- r"""
- Test if pattern or regex is contained within a string of a Series or Index.
-
- Return boolean Series or Index based on whether a given pattern or regex is
- contained within a string of a Series or Index.
-
- Parameters
- ----------
- pat : str
- Character sequence or regular expression.
- case : bool, default True
- If True, case sensitive.
- flags : int, default 0 (no flags)
- Flags to pass through to the re module, e.g. re.IGNORECASE.
- na : scalar, optional
- Fill value for missing values. The default depends on dtype of the
- array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
- ``pandas.NA`` is used.
- regex : bool, default True
- If True, assumes the pat is a regular expression.
-
- If False, treats the pat as a literal string.
-
- Returns
- -------
- Series or Index of boolean values
- A Series or Index of boolean values indicating whether the
- given pattern is contained within the string of each element
- of the Series or Index.
-
- See Also
- --------
- match : Analogous, but stricter, relying on re.match instead of re.search.
- Series.str.startswith : Test if the start of each string element matches a
- pattern.
- Series.str.endswith : Same as startswith, but tests the end of string.
-
- Examples
- --------
- Returning a Series of booleans using only a literal pattern.
-
- >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
- >>> s1.str.contains('og', regex=False)
- 0 False
- 1 True
- 2 False
- 3 False
- 4 NaN
- dtype: object
-
- Returning an Index of booleans using only a literal pattern.
-
- >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
- >>> ind.str.contains('23', regex=False)
- Index([False, False, False, True, nan], dtype='object')
-
- Specifying case sensitivity using `case`.
-
- >>> s1.str.contains('oG', case=True, regex=True)
- 0 False
- 1 False
- 2 False
- 3 False
- 4 NaN
- dtype: object
-
- Specifying `na` to be `False` instead of `NaN` replaces NaN values
- with `False`. If Series or Index does not contain NaN values
- the resultant dtype will be `bool`, otherwise, an `object` dtype.
-
- >>> s1.str.contains('og', na=False, regex=True)
- 0 False
- 1 True
- 2 False
- 3 False
- 4 False
- dtype: bool
-
- Returning 'house' or 'dog' when either expression occurs in a string.
-
- >>> s1.str.contains('house|dog', regex=True)
- 0 False
- 1 True
- 2 True
- 3 False
- 4 NaN
- dtype: object
-
- Ignoring case sensitivity using `flags` with regex.
-
- >>> import re
- >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
- 0 False
- 1 False
- 2 True
- 3 False
- 4 NaN
- dtype: object
-
- Returning any digit using regular expression.
-
- >>> s1.str.contains('\\d', regex=True)
- 0 False
- 1 False
- 2 False
- 3 True
- 4 NaN
- dtype: object
-
- Ensure `pat` is not a literal pattern when `regex` is set to True.
- Note in the following example one might expect only `s2[1]` and `s2[3]` to
- return `True`. However, '.0' as a regex matches any character
- followed by a 0.
-
- >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
- >>> s2.str.contains('.0', regex=True)
- 0 True
- 1 True
- 2 False
- 3 True
- 4 False
- dtype: bool
- """
- if regex and re.compile(pat).groups:
- warnings.warn(
- "This pattern is interpreted as a regular expression, and has "
- "match groups. To actually get the groups, use str.extract.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
-
- result = self._data.array._str_contains(pat, case, flags, na, regex)
- return self._wrap_result(result, fill_value=na, returns_string=False)
-
- @forbid_nonstring_types(["bytes"])
- def match(self, pat, case: bool = True, flags: int = 0, na=None):
- """
- Determine if each string starts with a match of a regular expression.
-
- Parameters
- ----------
- pat : str
- Character sequence or regular expression.
- case : bool, default True
- If True, case sensitive.
- flags : int, default 0 (no flags)
- Regex module flags, e.g. re.IGNORECASE.
- na : scalar, optional
- Fill value for missing values. The default depends on dtype of the
- array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
- ``pandas.NA`` is used.
-
- Returns
- -------
- Series/Index/array of boolean values
-
- See Also
- --------
- fullmatch : Stricter matching that requires the entire string to match.
- contains : Analogous, but less strict, relying on re.search instead of
- re.match.
- extract : Extract matched groups.
- """
- result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
- return self._wrap_result(result, fill_value=na, returns_string=False)
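``match`` carries no Examples section; a short doctest-style illustration of the start-anchored behaviour:

    >>> import pandas as pd
    >>> s = pd.Series(["cat", "category", "dog"])
    >>> s.str.match("cat")
    0     True
    1     True
    2    False
    dtype: bool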
-
- @forbid_nonstring_types(["bytes"])
- def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None):
- """
- Determine if each string entirely matches a regular expression.
-
- .. versionadded:: 1.1.0
-
- Parameters
- ----------
- pat : str
- Character sequence or regular expression.
- case : bool, default True
- If True, case sensitive.
- flags : int, default 0 (no flags)
- Regex module flags, e.g. re.IGNORECASE.
- na : scalar, optional
- Fill value for missing values. The default depends on dtype of the
- array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
- ``pandas.NA`` is used.
-
- Returns
- -------
- Series/Index/array of boolean values
-
- See Also
- --------
- match : Similar, but also returns `True` when only a *prefix* of the string
- matches the regular expression.
- extract : Extract matched groups.
- """
- result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
- return self._wrap_result(result, fill_value=na, returns_string=False)
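For contrast with ``match`` above, ``fullmatch`` requires the whole string to match:

    >>> import pandas as pd
    >>> s = pd.Series(["cat", "category", "dog"])
    >>> s.str.fullmatch("cat")
    0     True
    1    False
    2    False
    dtype: bool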
-
- @forbid_nonstring_types(["bytes"])
- def replace(
- self,
- pat: str | re.Pattern,
- repl: str | Callable,
- n: int = -1,
- case: bool | None = None,
- flags: int = 0,
- regex: bool = False,
- ):
- r"""
- Replace each occurrence of pattern/regex in the Series/Index.
-
- Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
- the regex value.
-
- Parameters
- ----------
- pat : str or compiled regex
- String can be a character sequence or regular expression.
- repl : str or callable
- Replacement string or a callable. The callable is passed the regex
- match object and must return a replacement string to be used.
- See :func:`re.sub`.
- n : int, default -1 (all)
- Number of replacements to make from start.
- case : bool, default None
- Determines if replace is case sensitive:
-
- - If True, case sensitive (the default if `pat` is a string)
- - Set to False for case insensitive
- - Cannot be set if `pat` is a compiled regex.
-
- flags : int, default 0 (no flags)
- Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
- regex.
- regex : bool, default False
- Determines if the passed-in pattern is a regular expression:
-
- - If True, assumes the passed-in pattern is a regular expression.
- - If False, treats the pattern as a literal string
- - Cannot be set to False if `pat` is a compiled regex or `repl` is
- a callable.
-
- Returns
- -------
- Series or Index of object
- A copy of the object with all matching occurrences of `pat` replaced by
- `repl`.
-
- Raises
- ------
- ValueError
- * if `regex` is False and `repl` is a callable or `pat` is a compiled
- regex
- * if `pat` is a compiled regex and `case` or `flags` is set
-
- Notes
- -----
- When `pat` is a compiled regex, all flags should be included in the
- compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
- regex will raise an error.
-
- Examples
- --------
- When `pat` is a string and `regex` is True (the default), the given `pat`
- is compiled as a regex. When `repl` is a string, it replaces matching
- regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
- left as is:
-
- >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
- 0 bao
- 1 baz
- 2 NaN
- dtype: object
-
- When `pat` is a string and `regex` is False, every occurrence of `pat` is
- replaced with `repl` as with :meth:`str.replace`:
-
- >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
- 0 bao
- 1 fuz
- 2 NaN
- dtype: object
-
- When `repl` is a callable, it is called on every `pat` using
- :func:`re.sub`. The callable should expect one positional argument
- (a regex object) and return a string.
-
- To get the idea:
-
- >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
- 0 <re.Match object; span=(0, 1), match='f'>oo
- 1 <re.Match object; span=(0, 1), match='f'>uz
- 2 NaN
- dtype: object
-
- Reverse every lowercase alphabetic word:
-
- >>> repl = lambda m: m.group(0)[::-1]
- >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
- >>> ser.str.replace(r'[a-z]+', repl, regex=True)
- 0 oof 123
- 1 rab zab
- 2 NaN
- dtype: object
-
- Using regex groups (extract second group and swap case):
-
- >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
- >>> repl = lambda m: m.group('two').swapcase()
- >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
- >>> ser.str.replace(pat, repl, regex=True)
- 0 tWO
- 1 bAR
- dtype: object
-
- Using a compiled regex with flags
-
- >>> import re
- >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
- >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
- 0 foo
- 1 bar
- 2 NaN
- dtype: object
- """
- # Check whether repl is valid (GH 13438, GH 15055)
- if not (isinstance(repl, str) or callable(repl)):
- raise TypeError("repl must be a string or callable")
-
- is_compiled_re = is_re(pat)
- if regex or regex is None:
- if is_compiled_re and (case is not None or flags != 0):
- raise ValueError(
- "case and flags cannot be set when pat is a compiled regex"
- )
-
- elif is_compiled_re:
- raise ValueError(
- "Cannot use a compiled regex as replacement pattern with regex=False"
- )
- elif callable(repl):
- raise ValueError("Cannot use a callable replacement when regex=False")
-
- if case is None:
- case = True
-
- result = self._data.array._str_replace(
- pat, repl, n=n, case=case, flags=flags, regex=regex
- )
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def repeat(self, repeats):
- """
- Duplicate each string in the Series or Index.
-
- Parameters
- ----------
- repeats : int or sequence of int
- Same value for all (int) or different value per (sequence).
-
- Returns
- -------
- Series or pandas.Index
- Series or Index of repeated string objects specified by
- input parameter repeats.
-
- Examples
- --------
- >>> s = pd.Series(['a', 'b', 'c'])
- >>> s
- 0 a
- 1 b
- 2 c
- dtype: object
-
- Single int repeats string in Series
-
- >>> s.str.repeat(repeats=2)
- 0 aa
- 1 bb
- 2 cc
- dtype: object
-
- Sequence of int repeats corresponding string in Series
-
- >>> s.str.repeat(repeats=[1, 2, 3])
- 0 a
- 1 bb
- 2 ccc
- dtype: object
- """
- result = self._data.array._str_repeat(repeats)
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def pad(
- self,
- width,
- side: Literal["left", "right", "both"] = "left",
- fillchar: str = " ",
- ):
- """
- Pad strings in the Series/Index up to width.
-
- Parameters
- ----------
- width : int
- Minimum width of resulting string; additional characters will be filled
- with character defined in `fillchar`.
- side : {'left', 'right', 'both'}, default 'left'
- Side from which to fill resulting string.
- fillchar : str, default ' '
- Additional character for filling, default is whitespace.
-
- Returns
- -------
- Series or Index of object
- Returns Series or Index with each string padded to at least `width` characters.
-
- See Also
- --------
- Series.str.rjust : Fills the left side of strings with an arbitrary
- character. Equivalent to ``Series.str.pad(side='left')``.
- Series.str.ljust : Fills the right side of strings with an arbitrary
- character. Equivalent to ``Series.str.pad(side='right')``.
- Series.str.center : Fills both sides of strings with an arbitrary
- character. Equivalent to ``Series.str.pad(side='both')``.
- Series.str.zfill : Pad strings in the Series/Index by prepending '0'
- character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
-
- Examples
- --------
- >>> s = pd.Series(["caribou", "tiger"])
- >>> s
- 0 caribou
- 1 tiger
- dtype: object
-
- >>> s.str.pad(width=10)
- 0 caribou
- 1 tiger
- dtype: object
-
- >>> s.str.pad(width=10, side='right', fillchar='-')
- 0 caribou---
- 1 tiger-----
- dtype: object
-
- >>> s.str.pad(width=10, side='both', fillchar='-')
- 0 -caribou--
- 1 --tiger---
- dtype: object
- """
- if not isinstance(fillchar, str):
- msg = f"fillchar must be a character, not {type(fillchar).__name__}"
- raise TypeError(msg)
-
- if len(fillchar) != 1:
- raise TypeError("fillchar must be a character, not str")
-
- if not is_integer(width):
- msg = f"width must be of integer type, not {type(width).__name__}"
- raise TypeError(msg)
-
- result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
- return self._wrap_result(result)
-
- _shared_docs[
- "str_pad"
- ] = """
- Pad %(side)s side of strings in the Series/Index.
-
- Equivalent to :meth:`str.%(method)s`.
-
- Parameters
- ----------
- width : int
- Minimum width of resulting string; additional characters will be filled
- with ``fillchar``.
- fillchar : str
- Additional character for filling, default is whitespace.
-
- Returns
- -------
- Series/Index of objects.
- """
-
- @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
- @forbid_nonstring_types(["bytes"])
- def center(self, width, fillchar: str = " "):
- return self.pad(width, side="both", fillchar=fillchar)
-
- @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
- @forbid_nonstring_types(["bytes"])
- def ljust(self, width, fillchar: str = " "):
- return self.pad(width, side="right", fillchar=fillchar)
-
- @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
- @forbid_nonstring_types(["bytes"])
- def rjust(self, width, fillchar: str = " "):
- return self.pad(width, side="left", fillchar=fillchar)
-
- @forbid_nonstring_types(["bytes"])
- def zfill(self, width):
- """
- Pad strings in the Series/Index by prepending '0' characters.
-
- Strings in the Series/Index are padded with '0' characters on the
- left of the string to reach a total string length `width`. Strings
- in the Series/Index with length greater or equal to `width` are
- unchanged.
-
- Parameters
- ----------
- width : int
- Minimum length of resulting string; strings with length less
- than `width` will be prepended with '0' characters.
-
- Returns
- -------
- Series/Index of objects.
-
- See Also
- --------
- Series.str.rjust : Fills the left side of strings with an arbitrary
- character.
- Series.str.ljust : Fills the right side of strings with an arbitrary
- character.
- Series.str.pad : Fills the specified sides of strings with an arbitrary
- character.
- Series.str.center : Fills both sides of strings with an arbitrary
- character.
-
- Notes
- -----
- Differs from :meth:`str.zfill` which has special handling
- for '+'/'-' in the string.
-
- Examples
- --------
- >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
- >>> s
- 0 -1
- 1 1
- 2 1000
- 3 10
- 4 NaN
- dtype: object
-
- Note that ``10`` and ``NaN`` are not strings, therefore they are
- converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
- special character and the zeros are inserted after it. ``1000``
- remains unchanged as it is longer than `width`.
-
- >>> s.str.zfill(3)
- 0 -01
- 1 001
- 2 1000
- 3 NaN
- 4 NaN
- dtype: object
- """
- if not is_integer(width):
- msg = f"width must be of integer type, not {type(width).__name__}"
- raise TypeError(msg)
- f = lambda x: x.zfill(width)
- result = self._data.array._str_map(f)
- return self._wrap_result(result)
-
- def slice(self, start=None, stop=None, step=None):
- """
- Slice substrings from each element in the Series or Index.
-
- Parameters
- ----------
- start : int, optional
- Start position for slice operation.
- stop : int, optional
- Stop position for slice operation.
- step : int, optional
- Step size for slice operation.
-
- Returns
- -------
- Series or Index of object
- Series or Index from sliced substring from original string object.
-
- See Also
- --------
- Series.str.slice_replace : Replace a slice with a string.
- Series.str.get : Return element at position.
- Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
- being the position.
-
- Examples
- --------
- >>> s = pd.Series(["koala", "dog", "chameleon"])
- >>> s
- 0 koala
- 1 dog
- 2 chameleon
- dtype: object
-
- >>> s.str.slice(start=1)
- 0 oala
- 1 og
- 2 hameleon
- dtype: object
-
- >>> s.str.slice(start=-1)
- 0 a
- 1 g
- 2 n
- dtype: object
-
- >>> s.str.slice(stop=2)
- 0 ko
- 1 do
- 2 ch
- dtype: object
-
- >>> s.str.slice(step=2)
- 0 kaa
- 1 dg
- 2 caeen
- dtype: object
-
- >>> s.str.slice(start=0, stop=5, step=3)
- 0 kl
- 1 d
- 2 cm
- dtype: object
-
- Equivalent behaviour to:
-
- >>> s.str[0:5:3]
- 0 kl
- 1 d
- 2 cm
- dtype: object
- """
- result = self._data.array._str_slice(start, stop, step)
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def slice_replace(self, start=None, stop=None, repl=None):
- """
- Replace a positional slice of a string with another value.
-
- Parameters
- ----------
- start : int, optional
- Left index position to use for the slice. If not specified (None),
- the slice is unbounded on the left, i.e. slice from the start
- of the string.
- stop : int, optional
- Right index position to use for the slice. If not specified (None),
- the slice is unbounded on the right, i.e. slice until the
- end of the string.
- repl : str, optional
- String for replacement. If not specified (None), the sliced region
- is replaced with an empty string.
-
- Returns
- -------
- Series or Index
- Same type as the original object.
-
- See Also
- --------
- Series.str.slice : Just slicing without replacement.
-
- Examples
- --------
- >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
- >>> s
- 0 a
- 1 ab
- 2 abc
- 3 abdc
- 4 abcde
- dtype: object
-
- Specify just `start`, meaning replace `start` until the end of the
- string with `repl`.
-
- >>> s.str.slice_replace(1, repl='X')
- 0 aX
- 1 aX
- 2 aX
- 3 aX
- 4 aX
- dtype: object
-
- Specify just `stop`, meaning the start of the string to `stop` is replaced
- with `repl`, and the rest of the string is included.
-
- >>> s.str.slice_replace(stop=2, repl='X')
- 0 X
- 1 X
- 2 Xc
- 3 Xdc
- 4 Xcde
- dtype: object
-
- Specify `start` and `stop`, meaning the slice from `start` to `stop` is
- replaced with `repl`. Everything before or after `start` and `stop` is
- included as is.
-
- >>> s.str.slice_replace(start=1, stop=3, repl='X')
- 0 aX
- 1 aX
- 2 aX
- 3 aXc
- 4 aXde
- dtype: object
- """
- result = self._data.array._str_slice_replace(start, stop, repl)
- return self._wrap_result(result)
-
- def decode(self, encoding, errors: str = "strict"):
- """
- Decode character string in the Series/Index using indicated encoding.
-
- Equivalent to :meth:`str.decode` in Python 2 and :meth:`bytes.decode` in
- Python 3.
-
- Parameters
- ----------
- encoding : str
- errors : str, optional
-
- Returns
- -------
- Series or Index
- """
- # TODO: Add a similar _bytes interface.
- if encoding in _cpython_optimized_decoders:
- # CPython optimized implementation
- f = lambda x: x.decode(encoding, errors)
- else:
- decoder = codecs.getdecoder(encoding)
- f = lambda x: decoder(x, errors)[0]
- arr = self._data.array
- # assert isinstance(arr, (StringArray,))
- result = arr._str_map(f)
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def encode(self, encoding, errors: str = "strict"):
- """
- Encode character string in the Series/Index using indicated encoding.
-
- Equivalent to :meth:`str.encode`.
-
- Parameters
- ----------
- encoding : str
- errors : str, optional
-
- Returns
- -------
- Series/Index of objects
- """
- result = self._data.array._str_encode(encoding, errors)
- return self._wrap_result(result, returns_string=False)
-
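
A minimal usage sketch of the encode/decode round trip implemented above (the sample values are illustrative):

import pandas as pd

s = pd.Series(["cow", "123", "()"])          # object dtype holding str
encoded = s.str.encode("utf-8")              # Series of bytes objects
decoded = encoded.str.decode("utf-8")        # back to str
print(encoded.tolist())                      # [b'cow', b'123', b'()']
print(decoded.tolist())                      # ['cow', '123', '()']
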
- _shared_docs[
- "str_strip"
- ] = r"""
- Remove %(position)s characters.
-
- Strip whitespaces (including newlines) or a set of specified characters
- from each string in the Series/Index from %(side)s.
- Replaces any non-strings in Series with NaNs.
- Equivalent to :meth:`str.%(method)s`.
-
- Parameters
- ----------
- to_strip : str or None, default None
- Specifying the set of characters to be removed.
- All combinations of this set of characters will be stripped.
- If None then whitespaces are removed.
-
- Returns
- -------
- Series or Index of object
-
- See Also
- --------
- Series.str.strip : Remove leading and trailing characters in Series/Index.
- Series.str.lstrip : Remove leading characters in Series/Index.
- Series.str.rstrip : Remove trailing characters in Series/Index.
-
- Examples
- --------
- >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
- >>> s
- 0 1. Ant.
- 1 2. Bee!\n
- 2 3. Cat?\t
- 3 NaN
- 4 10
- 5 True
- dtype: object
-
- >>> s.str.strip()
- 0 1. Ant.
- 1 2. Bee!
- 2 3. Cat?
- 3 NaN
- 4 NaN
- 5 NaN
- dtype: object
-
- >>> s.str.lstrip('123.')
- 0 Ant.
- 1 Bee!\n
- 2 Cat?\t
- 3 NaN
- 4 NaN
- 5 NaN
- dtype: object
-
- >>> s.str.rstrip('.!? \n\t')
- 0 1. Ant
- 1 2. Bee
- 2 3. Cat
- 3 NaN
- 4 NaN
- 5 NaN
- dtype: object
-
- >>> s.str.strip('123.!? \n\t')
- 0 Ant
- 1 Bee
- 2 Cat
- 3 NaN
- 4 NaN
- 5 NaN
- dtype: object
- """
-
- @Appender(
- _shared_docs["str_strip"]
- % {
- "side": "left and right sides",
- "method": "strip",
- "position": "leading and trailing",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def strip(self, to_strip=None):
- result = self._data.array._str_strip(to_strip)
- return self._wrap_result(result)
-
- @Appender(
- _shared_docs["str_strip"]
- % {"side": "left side", "method": "lstrip", "position": "leading"}
- )
- @forbid_nonstring_types(["bytes"])
- def lstrip(self, to_strip=None):
- result = self._data.array._str_lstrip(to_strip)
- return self._wrap_result(result)
-
- @Appender(
- _shared_docs["str_strip"]
- % {"side": "right side", "method": "rstrip", "position": "trailing"}
- )
- @forbid_nonstring_types(["bytes"])
- def rstrip(self, to_strip=None):
- result = self._data.array._str_rstrip(to_strip)
- return self._wrap_result(result)
-
- _shared_docs[
- "str_removefix"
- ] = r"""
- Remove a %(side)s from an object series.
-
- If the %(side)s is not present, the original string will be returned.
-
- Parameters
- ----------
- %(side)s : str
- Remove the %(side)s of the string.
-
- Returns
- -------
- Series/Index: object
- The Series or Index with given %(side)s removed.
-
- See Also
- --------
- Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.
-
- Examples
- --------
- >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
- >>> s
- 0 str_foo
- 1 str_bar
- 2 no_prefix
- dtype: object
- >>> s.str.removeprefix("str_")
- 0 foo
- 1 bar
- 2 no_prefix
- dtype: object
-
- >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
- >>> s
- 0 foo_str
- 1 bar_str
- 2 no_suffix
- dtype: object
- >>> s.str.removesuffix("_str")
- 0 foo
- 1 bar
- 2 no_suffix
- dtype: object
- """
-
- @Appender(
- _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
- )
- @forbid_nonstring_types(["bytes"])
- def removeprefix(self, prefix):
- result = self._data.array._str_removeprefix(prefix)
- return self._wrap_result(result)
-
- @Appender(
- _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
- )
- @forbid_nonstring_types(["bytes"])
- def removesuffix(self, suffix):
- result = self._data.array._str_removesuffix(suffix)
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def wrap(self, width, **kwargs):
- r"""
- Wrap strings in Series/Index at specified line width.
-
- This method has the same keyword parameters and defaults as
- :class:`textwrap.TextWrapper`.
-
- Parameters
- ----------
- width : int
- Maximum line width.
- expand_tabs : bool, optional
- If True, tab characters will be expanded to spaces (default: True).
- replace_whitespace : bool, optional
- If True, each whitespace character (as defined by string.whitespace)
- remaining after tab expansion will be replaced by a single space
- (default: True).
- drop_whitespace : bool, optional
- If True, whitespace that, after wrapping, happens to end up at the
- beginning or end of a line is dropped (default: True).
- break_long_words : bool, optional
- If True, then words longer than width will be broken in order to ensure
- that no lines are longer than width. If it is false, long words will
- not be broken, and some lines may be longer than width (default: True).
- break_on_hyphens : bool, optional
- If True, wrapping will occur preferably on whitespace and right after
- hyphens in compound words, as it is customary in English. If false,
- only whitespaces will be considered as potentially good places for line
- breaks, but you need to set break_long_words to false if you want truly
- insecable words (default: True).
-
- Returns
- -------
- Series or Index
-
- Notes
- -----
- Internally, this method uses a :class:`textwrap.TextWrapper` instance with
- default settings. To achieve behavior matching R's stringr library str_wrap
- function, use the arguments:
-
- - expand_tabs = False
- - replace_whitespace = True
- - drop_whitespace = True
- - break_long_words = False
- - break_on_hyphens = False
-
- Examples
- --------
- >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
- >>> s.str.wrap(12)
- 0 line to be\nwrapped
- 1 another line\nto be\nwrapped
- dtype: object
- """
- result = self._data.array._str_wrap(width, **kwargs)
- return self._wrap_result(result)
-
- @forbid_nonstring_types(["bytes"])
- def get_dummies(self, sep: str = "|"):
- """
- Return DataFrame of dummy/indicator variables for Series.
-
- Each string in Series is split by sep and returned as a DataFrame
- of dummy/indicator variables.
-
- Parameters
- ----------
- sep : str, default "|"
- String to split on.
-
- Returns
- -------
- DataFrame
- Dummy variables corresponding to values of the Series.
-
- See Also
- --------
- get_dummies : Convert categorical variable into dummy/indicator
- variables.
-
- Examples
- --------
- >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
- a b c
- 0 1 1 0
- 1 1 0 0
- 2 1 0 1
-
- >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
- a b c
- 0 1 1 0
- 1 0 0 0
- 2 1 0 1
- """
- # we need to cast to Series of strings as only that has all
- # methods available for making the dummies...
- result, name = self._data.array._str_get_dummies(sep)
- return self._wrap_result(
- result,
- name=name,
- expand=True,
- returns_string=False,
- )
-
- @forbid_nonstring_types(["bytes"])
- def translate(self, table):
- """
- Map all characters in the string through the given mapping table.
-
- Equivalent to standard :meth:`str.translate`.
-
- Parameters
- ----------
- table : dict
- Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
- None. Unmapped characters are left untouched.
- Characters mapped to None are deleted. :meth:`str.maketrans` is a
- helper function for making translation tables.
-
- Returns
- -------
- Series or Index
- """
- result = self._data.array._str_translate(table)
- return self._wrap_result(result)
-
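
A short sketch of building a translation table with :meth:`str.maketrans` and applying it through the accessor (the mapping below is illustrative):

import pandas as pd

table = str.maketrans({"a": "X", "b": None})   # map 'a' -> 'X', delete 'b'
s = pd.Series(["abc", "bab"])
print(s.str.translate(table).tolist())         # ['Xc', 'X']
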
- @forbid_nonstring_types(["bytes"])
- def count(self, pat, flags: int = 0):
- r"""
- Count occurrences of pattern in each string of the Series/Index.
-
- This function is used to count the number of times a particular regex
- pattern is repeated in each of the string elements of the
- :class:`~pandas.Series`.
-
- Parameters
- ----------
- pat : str
- Valid regular expression.
- flags : int, default 0, meaning no flags
- Flags for the `re` module. For a complete list, `see here
- <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
-
- Returns
- -------
- Series or Index
- Same type as the calling object containing the integer counts.
-
- See Also
- --------
- re : Standard library module for regular expressions.
- str.count : Standard library version, without regular expression support.
-
- Notes
- -----
- Some characters need to be escaped when passing in `pat`.
- e.g. ``'$'`` has a special meaning in regex and must be escaped when
- finding this literal character.
-
- Examples
- --------
- >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
- >>> s.str.count('a')
- 0 0.0
- 1 0.0
- 2 2.0
- 3 2.0
- 4 NaN
- 5 0.0
- 6 1.0
- dtype: float64
-
- Escape ``'$'`` to find the literal dollar sign.
-
- >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
- >>> s.str.count('\\$')
- 0 1
- 1 0
- 2 1
- 3 2
- 4 2
- 5 0
- dtype: int64
-
- This is also available on Index
-
- >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
- Index([0, 0, 2, 1], dtype='int64')
- """
- result = self._data.array._str_count(pat, flags)
- return self._wrap_result(result, returns_string=False)
-
- @forbid_nonstring_types(["bytes"])
- def startswith(
- self, pat: str | tuple[str, ...], na: Scalar | None = None
- ) -> Series | Index:
- """
- Test if the start of each string element matches a pattern.
-
- Equivalent to :meth:`str.startswith`.
-
- Parameters
- ----------
- pat : str or tuple[str, ...]
- Character sequence or tuple of strings. Regular expressions are not
- accepted.
- na : object, default NaN
- Object shown if element tested is not a string. The default depends
- on dtype of the array. For object-dtype, ``numpy.nan`` is used.
- For ``StringDtype``, ``pandas.NA`` is used.
-
- Returns
- -------
- Series or Index of bool
- A Series of booleans indicating whether the given pattern matches
- the start of each string element.
-
- See Also
- --------
- str.startswith : Python standard library string method.
- Series.str.endswith : Same as startswith, but tests the end of string.
- Series.str.contains : Tests if string element contains a pattern.
-
- Examples
- --------
- >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
- >>> s
- 0 bat
- 1 Bear
- 2 cat
- 3 NaN
- dtype: object
-
- >>> s.str.startswith('b')
- 0 True
- 1 False
- 2 False
- 3 NaN
- dtype: object
-
- >>> s.str.startswith(('b', 'B'))
- 0 True
- 1 True
- 2 False
- 3 NaN
- dtype: object
-
- Specifying `na` to be `False` instead of `NaN`.
-
- >>> s.str.startswith('b', na=False)
- 0 True
- 1 False
- 2 False
- 3 False
- dtype: bool
- """
- if not isinstance(pat, (str, tuple)):
- msg = f"expected a string or tuple, not {type(pat).__name__}"
- raise TypeError(msg)
- result = self._data.array._str_startswith(pat, na=na)
- return self._wrap_result(result, returns_string=False)
-
- @forbid_nonstring_types(["bytes"])
- def endswith(
- self, pat: str | tuple[str, ...], na: Scalar | None = None
- ) -> Series | Index:
- """
- Test if the end of each string element matches a pattern.
-
- Equivalent to :meth:`str.endswith`.
-
- Parameters
- ----------
- pat : str or tuple[str, ...]
- Character sequence or tuple of strings. Regular expressions are not
- accepted.
- na : object, default NaN
- Object shown if element tested is not a string. The default depends
- on dtype of the array. For object-dtype, ``numpy.nan`` is used.
- For ``StringDtype``, ``pandas.NA`` is used.
-
- Returns
- -------
- Series or Index of bool
- A Series of booleans indicating whether the given pattern matches
- the end of each string element.
-
- See Also
- --------
- str.endswith : Python standard library string method.
- Series.str.startswith : Same as endswith, but tests the start of string.
- Series.str.contains : Tests if string element contains a pattern.
-
- Examples
- --------
- >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
- >>> s
- 0 bat
- 1 bear
- 2 caT
- 3 NaN
- dtype: object
-
- >>> s.str.endswith('t')
- 0 True
- 1 False
- 2 False
- 3 NaN
- dtype: object
-
- >>> s.str.endswith(('t', 'T'))
- 0 True
- 1 False
- 2 True
- 3 NaN
- dtype: object
-
- Specifying `na` to be `False` instead of `NaN`.
-
- >>> s.str.endswith('t', na=False)
- 0 True
- 1 False
- 2 False
- 3 False
- dtype: bool
- """
- if not isinstance(pat, (str, tuple)):
- msg = f"expected a string or tuple, not {type(pat).__name__}"
- raise TypeError(msg)
- result = self._data.array._str_endswith(pat, na=na)
- return self._wrap_result(result, returns_string=False)
-
- @forbid_nonstring_types(["bytes"])
- def findall(self, pat, flags: int = 0):
- """
- Find all occurrences of pattern or regular expression in the Series/Index.
-
- Equivalent to applying :func:`re.findall` to all the elements in the
- Series/Index.
-
- Parameters
- ----------
- pat : str
- Pattern or regular expression.
- flags : int, default 0
- Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
- means no flags).
-
- Returns
- -------
- Series/Index of lists of strings
- All non-overlapping matches of pattern or regular expression in each
- string of this Series/Index.
-
- See Also
- --------
- count : Count occurrences of pattern or regular expression in each string
- of the Series/Index.
- extractall : For each string in the Series, extract groups from all matches
- of regular expression and return a DataFrame with one row for each
- match and one column for each group.
- re.findall : The equivalent ``re`` function to all non-overlapping matches
- of pattern or regular expression in string, as a list of strings.
-
- Examples
- --------
- >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
-
- The search for the pattern 'Monkey' returns one match:
-
- >>> s.str.findall('Monkey')
- 0 []
- 1 [Monkey]
- 2 []
- dtype: object
-
- On the other hand, the search for the pattern 'MONKEY' doesn't return any
- match:
-
- >>> s.str.findall('MONKEY')
- 0 []
- 1 []
- 2 []
- dtype: object
-
- Flags can be added to the pattern or regular expression. For instance,
- to find the pattern 'MONKEY' ignoring the case:
-
- >>> import re
- >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
- 0 []
- 1 [Monkey]
- 2 []
- dtype: object
-
- When the pattern matches more than one string in the Series, all matches
- are returned:
-
- >>> s.str.findall('on')
- 0 [on]
- 1 [on]
- 2 []
- dtype: object
-
- Regular expressions are supported too. For instance, the search for all the
- strings ending with the word 'on' is shown next:
-
- >>> s.str.findall('on$')
- 0 [on]
- 1 []
- 2 []
- dtype: object
-
- If the pattern is found more than once in the same string, then a list of
- multiple strings is returned:
-
- >>> s.str.findall('b')
- 0 []
- 1 []
- 2 [b, b]
- dtype: object
- """
- result = self._data.array._str_findall(pat, flags)
- return self._wrap_result(result, returns_string=False)
-
- @forbid_nonstring_types(["bytes"])
- def extract(
- self, pat: str, flags: int = 0, expand: bool = True
- ) -> DataFrame | Series | Index:
- r"""
- Extract capture groups in the regex `pat` as columns in a DataFrame.
-
- For each subject string in the Series, extract groups from the
- first match of regular expression `pat`.
-
- Parameters
- ----------
- pat : str
- Regular expression pattern with capturing groups.
- flags : int, default 0 (no flags)
- Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
- modify regular expression matching for things like case,
- spaces, etc. For more details, see :mod:`re`.
- expand : bool, default True
- If True, return DataFrame with one column per capture group.
- If False, return a Series/Index if there is one capture group
- or DataFrame if there are multiple capture groups.
-
- Returns
- -------
- DataFrame or Series or Index
- A DataFrame with one row for each subject string, and one
- column for each group. Any capture group names in regular
- expression pat will be used for column names; otherwise
- capture group numbers will be used. The dtype of each result
- column is always object, even when no match is found. If
- ``expand=False`` and pat has only one capture group, then
- return a Series (if subject is a Series) or Index (if subject
- is an Index).
-
- See Also
- --------
- extractall : Returns all matches (not just the first match).
-
- Examples
- --------
- A pattern with two groups will return a DataFrame with two columns.
- Non-matches will be NaN.
-
- >>> s = pd.Series(['a1', 'b2', 'c3'])
- >>> s.str.extract(r'([ab])(\d)')
- 0 1
- 0 a 1
- 1 b 2
- 2 NaN NaN
-
- A pattern may contain optional groups.
-
- >>> s.str.extract(r'([ab])?(\d)')
- 0 1
- 0 a 1
- 1 b 2
- 2 NaN 3
-
- Named groups will become column names in the result.
-
- >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
- letter digit
- 0 a 1
- 1 b 2
- 2 NaN NaN
-
- A pattern with one group will return a DataFrame with one column
- if expand=True.
-
- >>> s.str.extract(r'[ab](\d)', expand=True)
- 0
- 0 1
- 1 2
- 2 NaN
-
- A pattern with one group will return a Series if expand=False.
-
- >>> s.str.extract(r'[ab](\d)', expand=False)
- 0 1
- 1 2
- 2 NaN
- dtype: object
- """
- from pandas import DataFrame
-
- if not isinstance(expand, bool):
- raise ValueError("expand must be True or False")
-
- regex = re.compile(pat, flags=flags)
- if regex.groups == 0:
- raise ValueError("pattern contains no capture groups")
-
- if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
- raise ValueError("only one regex group is supported with Index")
-
- obj = self._data
- result_dtype = _result_dtype(obj)
-
- returns_df = regex.groups > 1 or expand
-
- if returns_df:
- name = None
- columns = _get_group_names(regex)
-
- if obj.array.size == 0:
- result = DataFrame(columns=columns, dtype=result_dtype)
-
- else:
- result_list = self._data.array._str_extract(
- pat, flags=flags, expand=returns_df
- )
-
- result_index: Index | None
- if isinstance(obj, ABCSeries):
- result_index = obj.index
- else:
- result_index = None
-
- result = DataFrame(
- result_list, columns=columns, index=result_index, dtype=result_dtype
- )
-
- else:
- name = _get_single_group_name(regex)
- result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
- return self._wrap_result(result, name=name)
-
- @forbid_nonstring_types(["bytes"])
- def extractall(self, pat, flags: int = 0):
- r"""
- Extract capture groups in the regex `pat` as columns in DataFrame.
-
- For each subject string in the Series, extract groups from all
- matches of regular expression pat. When each subject string in the
- Series has exactly one match, extractall(pat).xs(0, level='match')
- is the same as extract(pat).
-
- Parameters
- ----------
- pat : str
- Regular expression pattern with capturing groups.
- flags : int, default 0 (no flags)
- A ``re`` module flag, for example ``re.IGNORECASE``. These allow
- to modify regular expression matching for things like case, spaces,
- etc. Multiple flags can be combined with the bitwise OR operator,
- for example ``re.IGNORECASE | re.MULTILINE``.
-
- Returns
- -------
- DataFrame
- A ``DataFrame`` with one row for each match, and one column for each
- group. Its rows have a ``MultiIndex`` with first levels that come from
- the subject ``Series``. The last level is named 'match' and indexes the
- matches in each item of the ``Series``. Any capture group names in
- regular expression pat will be used for column names; otherwise capture
- group numbers will be used.
-
- See Also
- --------
- extract : Returns first match only (not all matches).
-
- Examples
- --------
- A pattern with one group will return a DataFrame with one column.
- Indices with no matches will not appear in the result.
-
- >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
- >>> s.str.extractall(r"[ab](\d)")
- 0
- match
- A 0 1
- 1 2
- B 0 1
-
- Capture group names are used for column names of the result.
-
- >>> s.str.extractall(r"[ab](?P<digit>\d)")
- digit
- match
- A 0 1
- 1 2
- B 0 1
-
- A pattern with two groups will return a DataFrame with two columns.
-
- >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
- letter digit
- match
- A 0 a 1
- 1 a 2
- B 0 b 1
-
- Optional groups that do not match are NaN in the result.
-
- >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
- letter digit
- match
- A 0 a 1
- 1 a 2
- B 0 b 1
- C 0 NaN 1
- """
- # TODO: dispatch
- return str_extractall(self._orig, pat, flags)
-
- _shared_docs[
- "find"
- ] = """
- Return %(side)s indexes in each string in the Series/Index.
-
- Each of the returned indexes corresponds to the position where the
- substring is fully contained between [start:end]. Return -1 on
- failure. Equivalent to standard :meth:`str.%(method)s`.
-
- Parameters
- ----------
- sub : str
- Substring being searched.
- start : int
- Left edge index.
- end : int
- Right edge index.
-
- Returns
- -------
- Series or Index of int.
-
- See Also
- --------
- %(also)s
- """
-
- @Appender(
- _shared_docs["find"]
- % {
- "side": "lowest",
- "method": "find",
- "also": "rfind : Return highest indexes in each strings.",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def find(self, sub, start: int = 0, end=None):
- if not isinstance(sub, str):
- msg = f"expected a string object, not {type(sub).__name__}"
- raise TypeError(msg)
-
- result = self._data.array._str_find(sub, start, end)
- return self._wrap_result(result, returns_string=False)
-
- @Appender(
- _shared_docs["find"]
- % {
- "side": "highest",
- "method": "rfind",
- "also": "find : Return lowest indexes in each strings.",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def rfind(self, sub, start: int = 0, end=None):
- if not isinstance(sub, str):
- msg = f"expected a string object, not {type(sub).__name__}"
- raise TypeError(msg)
-
- result = self._data.array._str_rfind(sub, start=start, end=end)
- return self._wrap_result(result, returns_string=False)
-
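
A brief usage sketch contrasting ``find`` and ``rfind`` on the same data (values are illustrative):

import pandas as pd

s = pd.Series(["banana", "apple"])
print(s.str.find("an").tolist())    # [1, -1]  lowest index, -1 on failure
print(s.str.rfind("an").tolist())   # [3, -1]  highest index
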
- @forbid_nonstring_types(["bytes"])
- def normalize(self, form):
- """
- Return the Unicode normal form for the strings in the Series/Index.
-
- For more information on the forms, see
- :func:`unicodedata.normalize`.
-
- Parameters
- ----------
- form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
- Unicode form.
-
- Returns
- -------
- Series/Index of objects
- """
- result = self._data.array._str_normalize(form)
- return self._wrap_result(result)
-
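
A minimal sketch of Unicode normalization through the accessor, mirroring :func:`unicodedata.normalize`; the example contrasts a precomposed and a decomposed 'é':

import unicodedata
import pandas as pd

s = pd.Series(["caf\u00e9", "cafe\u0301"])    # precomposed vs. combining accent
print(s.str.len().tolist())                   # [4, 5]
nfc = s.str.normalize("NFC")
print(nfc.str.len().tolist())                 # [4, 4]
print(unicodedata.normalize("NFC", "cafe\u0301") == "caf\u00e9")  # True
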
- _shared_docs[
- "index"
- ] = """
- Return %(side)s indexes in each string in Series/Index.
-
- Each of the returned indexes corresponds to the position where the
- substring is fully contained between [start:end]. This is the same
- as ``str.%(similar)s`` except instead of returning -1, it raises a
- ValueError when the substring is not found. Equivalent to standard
- ``str.%(method)s``.
-
- Parameters
- ----------
- sub : str
- Substring being searched.
- start : int
- Left edge index.
- end : int
- Right edge index.
-
- Returns
- -------
- Series or Index of object
-
- See Also
- --------
- %(also)s
- """
-
- @Appender(
- _shared_docs["index"]
- % {
- "side": "lowest",
- "similar": "find",
- "method": "index",
- "also": "rindex : Return highest indexes in each strings.",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def index(self, sub, start: int = 0, end=None):
- if not isinstance(sub, str):
- msg = f"expected a string object, not {type(sub).__name__}"
- raise TypeError(msg)
-
- result = self._data.array._str_index(sub, start=start, end=end)
- return self._wrap_result(result, returns_string=False)
-
- @Appender(
- _shared_docs["index"]
- % {
- "side": "highest",
- "similar": "rfind",
- "method": "rindex",
- "also": "index : Return lowest indexes in each strings.",
- }
- )
- @forbid_nonstring_types(["bytes"])
- def rindex(self, sub, start: int = 0, end=None):
- if not isinstance(sub, str):
- msg = f"expected a string object, not {type(sub).__name__}"
- raise TypeError(msg)
-
- result = self._data.array._str_rindex(sub, start=start, end=end)
- return self._wrap_result(result, returns_string=False)
-
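
The difference from ``find``/``rfind`` is the error behaviour when the substring is missing; a small sketch:

import pandas as pd

s = pd.Series(["banana"])
print(s.str.index("an").tolist())    # [1]
try:
    s.str.index("zz")                # no match raises ValueError, unlike find's -1
except ValueError as err:
    print("raised:", err)
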
- def len(self):
- """
- Compute the length of each element in the Series/Index.
-
- The element may be a sequence (such as a string, tuple or list) or a collection
- (such as a dictionary).
-
- Returns
- -------
- Series or Index of int
- A Series or Index of integer values indicating the length of each
- element in the Series or Index.
-
- See Also
- --------
- str.len : Python built-in function returning the length of an object.
- Series.size : Returns the length of the Series.
-
- Examples
- --------
- Returns the length (number of characters) in a string. Returns the
- number of entries for dictionaries, lists or tuples.
-
- >>> s = pd.Series(['dog',
- ... '',
- ... 5,
- ... {'foo' : 'bar'},
- ... [2, 3, 5, 7],
- ... ('one', 'two', 'three')])
- >>> s
- 0 dog
- 1
- 2 5
- 3 {'foo': 'bar'}
- 4 [2, 3, 5, 7]
- 5 (one, two, three)
- dtype: object
- >>> s.str.len()
- 0 3.0
- 1 0.0
- 2 NaN
- 3 1.0
- 4 4.0
- 5 3.0
- dtype: float64
- """
- result = self._data.array._str_len()
- return self._wrap_result(result, returns_string=False)
-
- _shared_docs[
- "casemethods"
- ] = """
- Convert strings in the Series/Index to %(type)s.
- %(version)s
- Equivalent to :meth:`str.%(method)s`.
-
- Returns
- -------
- Series or Index of object
-
- See Also
- --------
- Series.str.lower : Converts all characters to lowercase.
- Series.str.upper : Converts all characters to uppercase.
- Series.str.title : Converts first character of each word to uppercase and
- remaining to lowercase.
- Series.str.capitalize : Converts first character to uppercase and
- remaining to lowercase.
- Series.str.swapcase : Converts uppercase to lowercase and lowercase to
- uppercase.
- Series.str.casefold: Removes all case distinctions in the string.
-
- Examples
- --------
- >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
- >>> s
- 0 lower
- 1 CAPITALS
- 2 this is a sentence
- 3 SwApCaSe
- dtype: object
-
- >>> s.str.lower()
- 0 lower
- 1 capitals
- 2 this is a sentence
- 3 swapcase
- dtype: object
-
- >>> s.str.upper()
- 0 LOWER
- 1 CAPITALS
- 2 THIS IS A SENTENCE
- 3 SWAPCASE
- dtype: object
-
- >>> s.str.title()
- 0 Lower
- 1 Capitals
- 2 This Is A Sentence
- 3 Swapcase
- dtype: object
-
- >>> s.str.capitalize()
- 0 Lower
- 1 Capitals
- 2 This is a sentence
- 3 Swapcase
- dtype: object
-
- >>> s.str.swapcase()
- 0 LOWER
- 1 capitals
- 2 THIS IS A SENTENCE
- 3 sWaPcAsE
- dtype: object
- """
- # Types:
- # cases:
- # upper, lower, title, capitalize, swapcase, casefold
- # boolean:
- # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
- # _doc_args holds dict of strings to use in substituting casemethod docs
- _doc_args: dict[str, dict[str, str]] = {}
- _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
- _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
- _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
- _doc_args["capitalize"] = {
- "type": "be capitalized",
- "method": "capitalize",
- "version": "",
- }
- _doc_args["swapcase"] = {
- "type": "be swapcased",
- "method": "swapcase",
- "version": "",
- }
- _doc_args["casefold"] = {
- "type": "be casefolded",
- "method": "casefold",
- "version": "",
- }
-
- @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
- @forbid_nonstring_types(["bytes"])
- def lower(self):
- result = self._data.array._str_lower()
- return self._wrap_result(result)
-
- @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
- @forbid_nonstring_types(["bytes"])
- def upper(self):
- result = self._data.array._str_upper()
- return self._wrap_result(result)
-
- @Appender(_shared_docs["casemethods"] % _doc_args["title"])
- @forbid_nonstring_types(["bytes"])
- def title(self):
- result = self._data.array._str_title()
- return self._wrap_result(result)
-
- @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
- @forbid_nonstring_types(["bytes"])
- def capitalize(self):
- result = self._data.array._str_capitalize()
- return self._wrap_result(result)
-
- @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
- @forbid_nonstring_types(["bytes"])
- def swapcase(self):
- result = self._data.array._str_swapcase()
- return self._wrap_result(result)
-
- @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
- @forbid_nonstring_types(["bytes"])
- def casefold(self):
- result = self._data.array._str_casefold()
- return self._wrap_result(result)
-
- _shared_docs[
- "ismethods"
- ] = """
- Check whether all characters in each string are %(type)s.
-
- This is equivalent to running the Python string method
- :meth:`str.%(method)s` for each element of the Series/Index. If a string
- has zero characters, ``False`` is returned for that check.
-
- Returns
- -------
- Series or Index of bool
- Series or Index of boolean values with the same length as the original
- Series/Index.
-
- See Also
- --------
- Series.str.isalpha : Check whether all characters are alphabetic.
- Series.str.isnumeric : Check whether all characters are numeric.
- Series.str.isalnum : Check whether all characters are alphanumeric.
- Series.str.isdigit : Check whether all characters are digits.
- Series.str.isdecimal : Check whether all characters are decimal.
- Series.str.isspace : Check whether all characters are whitespace.
- Series.str.islower : Check whether all characters are lowercase.
- Series.str.isupper : Check whether all characters are uppercase.
- Series.str.istitle : Check whether all characters are titlecase.
-
- Examples
- --------
- **Checks for Alphabetic and Numeric Characters**
-
- >>> s1 = pd.Series(['one', 'one1', '1', ''])
-
- >>> s1.str.isalpha()
- 0 True
- 1 False
- 2 False
- 3 False
- dtype: bool
-
- >>> s1.str.isnumeric()
- 0 False
- 1 False
- 2 True
- 3 False
- dtype: bool
-
- >>> s1.str.isalnum()
- 0 True
- 1 True
- 2 True
- 3 False
- dtype: bool
-
- Note that checks against characters mixed with any additional punctuation
- or whitespace will evaluate to false for an alphanumeric check.
-
- >>> s2 = pd.Series(['A B', '1.5', '3,000'])
- >>> s2.str.isalnum()
- 0 False
- 1 False
- 2 False
- dtype: bool
-
- **More Detailed Checks for Numeric Characters**
-
- There are several different but overlapping sets of numeric characters that
- can be checked for.
-
- >>> s3 = pd.Series(['23', '³', '⅕', ''])
-
- The ``s3.str.isdecimal`` method checks for characters used to form numbers
- in base 10.
-
- >>> s3.str.isdecimal()
- 0 True
- 1 False
- 2 False
- 3 False
- dtype: bool
-
- The ``s3.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
- includes special digits, like superscripted and subscripted digits in
- unicode.
-
- >>> s3.str.isdigit()
- 0 True
- 1 True
- 2 False
- 3 False
- dtype: bool
-
- The ``s3.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
- includes other characters that can represent quantities such as unicode
- fractions.
-
- >>> s3.str.isnumeric()
- 0 True
- 1 True
- 2 True
- 3 False
- dtype: bool
-
- **Checks for Whitespace**
-
- >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
- >>> s4.str.isspace()
- 0 True
- 1 True
- 2 False
- dtype: bool
-
- **Checks for Character Case**
-
- >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
-
- >>> s5.str.islower()
- 0 True
- 1 False
- 2 False
- 3 False
- dtype: bool
-
- >>> s5.str.isupper()
- 0 False
- 1 False
- 2 True
- 3 False
- dtype: bool
-
- The ``s5.str.istitle`` method checks whether all words are in title
- case (whether only the first letter of each word is capitalized). Words are
- assumed to be any sequence of non-numeric characters separated by
- whitespace characters.
-
- >>> s5.str.istitle()
- 0 False
- 1 True
- 2 False
- 3 False
- dtype: bool
- """
- _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
- _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
- _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
- _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
- _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
- _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
- _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
- _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
- _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
- # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)
-
- isalnum = _map_and_wrap(
- "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
- )
- isalpha = _map_and_wrap(
- "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
- )
- isdigit = _map_and_wrap(
- "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
- )
- isspace = _map_and_wrap(
- "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
- )
- islower = _map_and_wrap(
- "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
- )
- isupper = _map_and_wrap(
- "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
- )
- istitle = _map_and_wrap(
- "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
- )
- isnumeric = _map_and_wrap(
- "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
- )
- isdecimal = _map_and_wrap(
- "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
- )
-
-
-def cat_safe(list_of_columns: list, sep: str):
- """
- Auxiliary function for :meth:`str.cat`.
-
- Same signature as cat_core, but handles TypeErrors in concatenation, which
- happen if the arrays in list_of_columns have the wrong dtypes or content.
-
- Parameters
- ----------
- list_of_columns : list of numpy arrays
- List of arrays to be concatenated with sep;
- these arrays may not contain NaNs!
- sep : string
- The separator string for concatenating the columns.
-
- Returns
- -------
- np.ndarray
- The concatenation of list_of_columns with sep.
- """
- try:
- result = cat_core(list_of_columns, sep)
- except TypeError:
- # if there are any non-string values (wrong dtype or hidden behind
- # object dtype), np.sum will fail; catch and return with better message
- for column in list_of_columns:
- dtype = lib.infer_dtype(column, skipna=True)
- if dtype not in ["string", "empty"]:
- raise TypeError(
- "Concatenation requires list-likes containing only "
- "strings (or missing values). Offending values found in "
- f"column {dtype}"
- ) from None
- return result
-
-
-def cat_core(list_of_columns: list, sep: str):
- """
- Auxiliary function for :meth:`str.cat`
-
- Parameters
- ----------
- list_of_columns : list of numpy arrays
- List of arrays to be concatenated with sep;
- these arrays may not contain NaNs!
- sep : string
- The separator string for concatenating the columns.
-
- Returns
- -------
- np.ndarray
- The concatenation of list_of_columns with sep.
- """
- if sep == "":
- # no need to interleave sep if it is empty
- arr_of_cols = np.asarray(list_of_columns, dtype=object)
- return np.sum(arr_of_cols, axis=0)
- list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
- list_with_sep[::2] = list_of_columns
- arr_with_sep = np.asarray(list_with_sep, dtype=object)
- return np.sum(arr_with_sep, axis=0)
-
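
``cat_core`` backs :meth:`Series.str.cat`; a minimal usage sketch of the behaviour it implements, interleaving the separator between the columns (values are illustrative):

import pandas as pd

s = pd.Series(["a", "b"])
t = pd.Series(["1", "2"])
print(s.str.cat(t, sep="-").tolist())   # ['a-1', 'b-2']
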
-
-def _result_dtype(arr):
- # workaround #27953
- # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
- # when the list of values is empty.
- from pandas.core.arrays.string_ import StringDtype
-
- if isinstance(arr.dtype, StringDtype):
- return arr.dtype
- else:
- return object
-
-
-def _get_single_group_name(regex: re.Pattern) -> Hashable:
- if regex.groupindex:
- return next(iter(regex.groupindex))
- else:
- return None
-
-
-def _get_group_names(regex: re.Pattern) -> list[Hashable]:
- """
- Get named groups from compiled regex.
-
- Unnamed groups are numbered.
-
- Parameters
- ----------
- regex : compiled regex
-
- Returns
- -------
- list of column labels
- """
- names = {v: k for k, v in regex.groupindex.items()}
- return [names.get(1 + i, i) for i in range(regex.groups)]
-
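
A small sketch of the column labels this helper derives from a compiled pattern mixing named and unnamed groups (the pattern is illustrative):

import re

regex = re.compile(r"(?P<letter>[ab])(\d)")
names = {v: k for k, v in regex.groupindex.items()}     # {1: 'letter'}
labels = [names.get(1 + i, i) for i in range(regex.groups)]
print(labels)   # ['letter', 1] -- named groups keep their name, unnamed ones are numbered
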
-
-def str_extractall(arr, pat, flags: int = 0):
- regex = re.compile(pat, flags=flags)
- # the regex must contain capture groups.
- if regex.groups == 0:
- raise ValueError("pattern contains no capture groups")
-
- if isinstance(arr, ABCIndex):
- arr = arr.to_series().reset_index(drop=True)
-
- columns = _get_group_names(regex)
- match_list = []
- index_list = []
- is_mi = arr.index.nlevels > 1
-
- for subject_key, subject in arr.items():
- if isinstance(subject, str):
- if not is_mi:
- subject_key = (subject_key,)
-
- for match_i, match_tuple in enumerate(regex.findall(subject)):
- if isinstance(match_tuple, str):
- match_tuple = (match_tuple,)
- na_tuple = [np.NaN if group == "" else group for group in match_tuple]
- match_list.append(na_tuple)
- result_key = tuple(subject_key + (match_i,))
- index_list.append(result_key)
-
- from pandas import MultiIndex
-
- index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
- dtype = _result_dtype(arr)
-
- result = arr._constructor_expanddim(
- match_list, index=index, columns=columns, dtype=dtype
- )
- return result
diff --git a/contrib/python/pandas/py3/pandas/core/strings/base.py b/contrib/python/pandas/py3/pandas/core/strings/base.py
deleted file mode 100644
index f1e716b6464..00000000000
--- a/contrib/python/pandas/py3/pandas/core/strings/base.py
+++ /dev/null
@@ -1,260 +0,0 @@
-from __future__ import annotations
-
-import abc
-import re
-from typing import (
- TYPE_CHECKING,
- Callable,
- Literal,
-)
-
-import numpy as np
-
-from pandas._typing import Scalar
-
-if TYPE_CHECKING:
- from pandas import Series
-
-
-class BaseStringArrayMethods(abc.ABC):
- """
- Base class for extension arrays implementing string methods.
-
- This is where our ExtensionArrays can override the implementation of
- Series.str.<method>. We don't expect this to work with
- 3rd-party extension arrays.
-
- * User calls Series.str.<method>
- * pandas extracts the extension array from the Series
- * pandas calls ``extension_array._str_<method>(*args, **kwargs)``
- * pandas wraps the result, to return to the user.
-
- See :ref:`Series.str` for the docstring of each method.
- """
-
- def _str_getitem(self, key):
- if isinstance(key, slice):
- return self._str_slice(start=key.start, stop=key.stop, step=key.step)
- else:
- return self._str_get(key)
-
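
The dispatch described in the class docstring can be illustrated with a standalone sketch of how ``_str_getitem`` routes a key; the helper functions below are hypothetical stand-ins, not the real extension-array implementations:

def _str_slice(values, start=None, stop=None, step=None):
    # stand-in for ExtensionArray._str_slice
    return [v[slice(start, stop, step)] for v in values]

def _str_get(values, i):
    # stand-in for ExtensionArray._str_get
    return [v[i] for v in values]

def str_getitem(values, key):
    # mirrors BaseStringArrayMethods._str_getitem: slices go one way, scalars the other
    if isinstance(key, slice):
        return _str_slice(values, start=key.start, stop=key.stop, step=key.step)
    return _str_get(values, key)

print(str_getitem(["koala", "dog"], slice(0, 2)))   # ['ko', 'do']
print(str_getitem(["koala", "dog"], 0))             # ['k', 'd']
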
- @abc.abstractmethod
- def _str_count(self, pat, flags: int = 0):
- pass
-
- @abc.abstractmethod
- def _str_pad(
- self,
- width,
- side: Literal["left", "right", "both"] = "left",
- fillchar: str = " ",
- ):
- pass
-
- @abc.abstractmethod
- def _str_contains(
- self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
- ):
- pass
-
- @abc.abstractmethod
- def _str_startswith(self, pat, na=None):
- pass
-
- @abc.abstractmethod
- def _str_endswith(self, pat, na=None):
- pass
-
- @abc.abstractmethod
- def _str_replace(
- self,
- pat: str | re.Pattern,
- repl: str | Callable,
- n: int = -1,
- case: bool = True,
- flags: int = 0,
- regex: bool = True,
- ):
- pass
-
- @abc.abstractmethod
- def _str_repeat(self, repeats):
- pass
-
- @abc.abstractmethod
- def _str_match(
- self, pat: str, case: bool = True, flags: int = 0, na: Scalar = np.nan
- ):
- pass
-
- @abc.abstractmethod
- def _str_fullmatch(
- self,
- pat: str | re.Pattern,
- case: bool = True,
- flags: int = 0,
- na: Scalar = np.nan,
- ):
- pass
-
- @abc.abstractmethod
- def _str_encode(self, encoding, errors: str = "strict"):
- pass
-
- @abc.abstractmethod
- def _str_find(self, sub, start: int = 0, end=None):
- pass
-
- @abc.abstractmethod
- def _str_rfind(self, sub, start: int = 0, end=None):
- pass
-
- @abc.abstractmethod
- def _str_findall(self, pat, flags: int = 0):
- pass
-
- @abc.abstractmethod
- def _str_get(self, i):
- pass
-
- @abc.abstractmethod
- def _str_index(self, sub, start: int = 0, end=None):
- pass
-
- @abc.abstractmethod
- def _str_rindex(self, sub, start: int = 0, end=None):
- pass
-
- @abc.abstractmethod
- def _str_join(self, sep):
- pass
-
- @abc.abstractmethod
- def _str_partition(self, sep, expand):
- pass
-
- @abc.abstractmethod
- def _str_rpartition(self, sep, expand):
- pass
-
- @abc.abstractmethod
- def _str_len(self):
- pass
-
- @abc.abstractmethod
- def _str_slice(self, start=None, stop=None, step=None):
- pass
-
- @abc.abstractmethod
- def _str_slice_replace(self, start=None, stop=None, repl=None):
- pass
-
- @abc.abstractmethod
- def _str_translate(self, table):
- pass
-
- @abc.abstractmethod
- def _str_wrap(self, width, **kwargs):
- pass
-
- @abc.abstractmethod
- def _str_get_dummies(self, sep: str = "|"):
- pass
-
- @abc.abstractmethod
- def _str_isalnum(self):
- pass
-
- @abc.abstractmethod
- def _str_isalpha(self):
- pass
-
- @abc.abstractmethod
- def _str_isdecimal(self):
- pass
-
- @abc.abstractmethod
- def _str_isdigit(self):
- pass
-
- @abc.abstractmethod
- def _str_islower(self):
- pass
-
- @abc.abstractmethod
- def _str_isnumeric(self):
- pass
-
- @abc.abstractmethod
- def _str_isspace(self):
- pass
-
- @abc.abstractmethod
- def _str_istitle(self):
- pass
-
- @abc.abstractmethod
- def _str_isupper(self):
- pass
-
- @abc.abstractmethod
- def _str_capitalize(self):
- pass
-
- @abc.abstractmethod
- def _str_casefold(self):
- pass
-
- @abc.abstractmethod
- def _str_title(self):
- pass
-
- @abc.abstractmethod
- def _str_swapcase(self):
- pass
-
- @abc.abstractmethod
- def _str_lower(self):
- pass
-
- @abc.abstractmethod
- def _str_upper(self):
- pass
-
- @abc.abstractmethod
- def _str_normalize(self, form):
- pass
-
- @abc.abstractmethod
- def _str_strip(self, to_strip=None):
- pass
-
- @abc.abstractmethod
- def _str_lstrip(self, to_strip=None):
- pass
-
- @abc.abstractmethod
- def _str_rstrip(self, to_strip=None):
- pass
-
- @abc.abstractmethod
- def _str_removeprefix(self, prefix: str) -> Series:
- pass
-
- @abc.abstractmethod
- def _str_removesuffix(self, suffix: str) -> Series:
- pass
-
- @abc.abstractmethod
- def _str_split(
- self, pat=None, n=-1, expand: bool = False, regex: bool | None = None
- ):
- pass
-
- @abc.abstractmethod
- def _str_rsplit(self, pat=None, n=-1):
- pass
-
- @abc.abstractmethod
- def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
- pass
diff --git a/contrib/python/pandas/py3/pandas/core/strings/object_array.py b/contrib/python/pandas/py3/pandas/core/strings/object_array.py
deleted file mode 100644
index 4055092b823..00000000000
--- a/contrib/python/pandas/py3/pandas/core/strings/object_array.py
+++ /dev/null
@@ -1,498 +0,0 @@
-from __future__ import annotations
-
-import functools
-import re
-import sys
-import textwrap
-from typing import (
- TYPE_CHECKING,
- Callable,
- Literal,
-)
-import unicodedata
-
-import numpy as np
-
-from pandas._libs import lib
-import pandas._libs.missing as libmissing
-import pandas._libs.ops as libops
-from pandas._typing import (
- NpDtype,
- Scalar,
-)
-
-from pandas.core.dtypes.common import is_scalar
-from pandas.core.dtypes.missing import isna
-
-from pandas.core.strings.base import BaseStringArrayMethods
-
-if TYPE_CHECKING:
- from pandas import Series
-
-
-class ObjectStringArrayMixin(BaseStringArrayMethods):
- """
- String Methods operating on object-dtype ndarrays.
- """
-
- _str_na_value = np.nan
-
- def __len__(self) -> int:
- # For typing, _str_map relies on the object being sized.
- raise NotImplementedError
-
- def _str_map(
- self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
- ):
- """
- Map a callable over valid elements of the array.
-
- Parameters
- ----------
- f : Callable
- A function to call on each non-NA element.
- na_value : Scalar, optional
- The value to set for NA values. Might also be used for the
- fill value if the callable `f` raises an exception.
- This defaults to ``self._str_na_value`` which is ``np.nan``
- for object-dtype and Categorical and ``pd.NA`` for StringArray.
- dtype : Dtype, optional
- The dtype of the result array.
- convert : bool, default True
- Whether to call `maybe_convert_objects` on the resulting ndarray
- """
- if dtype is None:
- dtype = np.dtype("object")
- if na_value is None:
- na_value = self._str_na_value
-
- if not len(self):
- return np.array([], dtype=dtype)
-
- arr = np.asarray(self, dtype=object)
- mask = isna(arr)
- map_convert = convert and not np.all(mask)
- try:
- result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
- except (TypeError, AttributeError) as err:
- # Reraise the exception if callable `f` got wrong number of args.
- # The user may want to be warned by this, instead of getting NaN
- p_err = (
- r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
- r"(?(3)required )positional arguments?"
- )
-
- if len(err.args) >= 1 and re.search(p_err, err.args[0]):
- # FIXME: this should be totally avoidable
- raise err
-
- def g(x):
- # This type of fallback behavior can be removed once
- # we remove object-dtype .str accessor.
- try:
- return f(x)
- except (TypeError, AttributeError):
- return na_value
-
- return self._str_map(g, na_value=na_value, dtype=dtype)
- if not isinstance(result, np.ndarray):
- return result
- if na_value is not np.nan:
- np.putmask(result, mask, na_value)
- if convert and result.dtype == object:
- result = lib.maybe_convert_objects(result)
- return result
-
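
A plain-NumPy sketch of what ``_str_map`` does conceptually (apply ``f`` only where the element is a valid string, fill ``na_value`` elsewhere); the real implementation goes through ``lib.map_infer_mask`` and handles more cases:

import numpy as np

def simple_str_map(values, f, na_value=np.nan):
    arr = np.asarray(values, dtype=object)
    mask = np.array([not isinstance(x, str) for x in arr])   # crude NA/non-str mask
    result = np.empty(len(arr), dtype=object)
    result[~mask] = [f(x) for x in arr[~mask]]
    result[mask] = na_value
    return result

print(simple_str_map(["dog", None, "cat"], str.upper))   # ['DOG' nan 'CAT']
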
- def _str_count(self, pat, flags: int = 0):
- regex = re.compile(pat, flags=flags)
- f = lambda x: len(regex.findall(x))
- return self._str_map(f, dtype="int64")
-
- def _str_pad(
- self,
- width,
- side: Literal["left", "right", "both"] = "left",
- fillchar: str = " ",
- ):
- if side == "left":
- f = lambda x: x.rjust(width, fillchar)
- elif side == "right":
- f = lambda x: x.ljust(width, fillchar)
- elif side == "both":
- f = lambda x: x.center(width, fillchar)
- else: # pragma: no cover
- raise ValueError("Invalid side")
- return self._str_map(f)
-
- def _str_contains(
- self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
- ):
- if regex:
- if not case:
- flags |= re.IGNORECASE
-
- pat = re.compile(pat, flags=flags)
-
- f = lambda x: pat.search(x) is not None
- else:
- if case:
- f = lambda x: pat in x
- else:
- upper_pat = pat.upper()
- f = lambda x: upper_pat in x.upper()
- return self._str_map(f, na, dtype=np.dtype("bool"))
-
- def _str_startswith(self, pat, na=None):
- f = lambda x: x.startswith(pat)
- return self._str_map(f, na_value=na, dtype=np.dtype(bool))
-
- def _str_endswith(self, pat, na=None):
- f = lambda x: x.endswith(pat)
- return self._str_map(f, na_value=na, dtype=np.dtype(bool))
-
- def _str_replace(
- self,
- pat: str | re.Pattern,
- repl: str | Callable,
- n: int = -1,
- case: bool = True,
- flags: int = 0,
- regex: bool = True,
- ):
- if case is False:
- # add case flag, if provided
- flags |= re.IGNORECASE
-
- if regex or flags or callable(repl):
- if not isinstance(pat, re.Pattern):
- if regex is False:
- pat = re.escape(pat)
- pat = re.compile(pat, flags=flags)
-
- n = n if n >= 0 else 0
- f = lambda x: pat.sub(repl=repl, string=x, count=n)
- else:
- f = lambda x: x.replace(pat, repl, n)
-
- return self._str_map(f, dtype=str)
-
- def _str_repeat(self, repeats):
- if is_scalar(repeats):
-
- def scalar_rep(x):
- try:
- return bytes.__mul__(x, repeats)
- except TypeError:
- return str.__mul__(x, repeats)
-
- return self._str_map(scalar_rep, dtype=str)
- else:
- from pandas.core.arrays.string_ import BaseStringArray
-
- def rep(x, r):
- if x is libmissing.NA:
- return x
- try:
- return bytes.__mul__(x, r)
- except TypeError:
- return str.__mul__(x, r)
-
- repeats = np.asarray(repeats, dtype=object)
- result = libops.vec_binop(np.asarray(self), repeats, rep)
- if isinstance(self, BaseStringArray):
- # Not going through map, so we have to do this here.
- result = type(self)._from_sequence(result)
- return result
-
- def _str_match(
- self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not case:
- flags |= re.IGNORECASE
-
- regex = re.compile(pat, flags=flags)
-
- f = lambda x: regex.match(x) is not None
- return self._str_map(f, na_value=na, dtype=np.dtype(bool))
-
- def _str_fullmatch(
- self,
- pat: str | re.Pattern,
- case: bool = True,
- flags: int = 0,
- na: Scalar | None = None,
- ):
- if not case:
- flags |= re.IGNORECASE
-
- regex = re.compile(pat, flags=flags)
-
- f = lambda x: regex.fullmatch(x) is not None
- return self._str_map(f, na_value=na, dtype=np.dtype(bool))
-
- def _str_encode(self, encoding, errors: str = "strict"):
- f = lambda x: x.encode(encoding, errors=errors)
- return self._str_map(f, dtype=object)
-
- def _str_find(self, sub, start: int = 0, end=None):
- return self._str_find_(sub, start, end, side="left")
-
- def _str_rfind(self, sub, start: int = 0, end=None):
- return self._str_find_(sub, start, end, side="right")
-
- def _str_find_(self, sub, start, end, side):
- if side == "left":
- method = "find"
- elif side == "right":
- method = "rfind"
- else: # pragma: no cover
- raise ValueError("Invalid side")
-
- if end is None:
- f = lambda x: getattr(x, method)(sub, start)
- else:
- f = lambda x: getattr(x, method)(sub, start, end)
- return self._str_map(f, dtype="int64")
-
- def _str_findall(self, pat, flags: int = 0):
- regex = re.compile(pat, flags=flags)
- return self._str_map(regex.findall, dtype="object")
-
- def _str_get(self, i):
- def f(x):
- if isinstance(x, dict):
- return x.get(i)
- elif len(x) > i >= -len(x):
- return x[i]
- return self._str_na_value
-
- return self._str_map(f)
-
- def _str_index(self, sub, start: int = 0, end=None):
- if end:
- f = lambda x: x.index(sub, start, end)
- else:
- f = lambda x: x.index(sub, start)
- return self._str_map(f, dtype="int64")
-
- def _str_rindex(self, sub, start: int = 0, end=None):
- if end:
- f = lambda x: x.rindex(sub, start, end)
- else:
- f = lambda x: x.rindex(sub, start)
- return self._str_map(f, dtype="int64")
-
- def _str_join(self, sep):
- return self._str_map(sep.join)
-
- def _str_partition(self, sep, expand):
- result = self._str_map(lambda x: x.partition(sep), dtype="object")
- return result
-
- def _str_rpartition(self, sep, expand):
- return self._str_map(lambda x: x.rpartition(sep), dtype="object")
-
- def _str_len(self):
- return self._str_map(len, dtype="int64")
-
- def _str_slice(self, start=None, stop=None, step=None):
- obj = slice(start, stop, step)
- return self._str_map(lambda x: x[obj])
-
- def _str_slice_replace(self, start=None, stop=None, repl=None):
- if repl is None:
- repl = ""
-
- def f(x):
- if x[start:stop] == "":
- local_stop = start
- else:
- local_stop = stop
- y = ""
- if start is not None:
- y += x[:start]
- y += repl
- if stop is not None:
- y += x[local_stop:]
- return y
-
- return self._str_map(f)
-
- def _str_split(
- self,
- pat: str | re.Pattern | None = None,
- n=-1,
- expand: bool = False,
- regex: bool | None = None,
- ):
- if pat is None:
- if n is None or n == 0:
- n = -1
- f = lambda x: x.split(pat, n)
- else:
- new_pat: str | re.Pattern
- if regex is True or isinstance(pat, re.Pattern):
- new_pat = re.compile(pat)
- elif regex is False:
- new_pat = pat
- # regex is None so link to old behavior #43563
- else:
- if len(pat) == 1:
- new_pat = pat
- else:
- new_pat = re.compile(pat)
-
- if isinstance(new_pat, re.Pattern):
- if n is None or n == -1:
- n = 0
- f = lambda x: new_pat.split(x, maxsplit=n)
- else:
- if n is None or n == 0:
- n = -1
- f = lambda x: x.split(pat, n)
- return self._str_map(f, dtype=object)
-
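
A short standard-library sketch of the ``regex=None`` dispatch above: a single-character pattern is treated literally, while a longer pattern is compiled as a regular expression:

import re

text = "a.b.c"
# regex=None and len(pat) == 1 -> plain str.split, '.' is a literal separator
print(text.split(".", -1))                        # ['a', 'b', 'c']
# regex=None and len(pat) > 1 -> compiled as a regex, '.' matches any character
print(re.compile("a.").split(text, maxsplit=0))   # ['', 'b.c']
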
- def _str_rsplit(self, pat=None, n=-1):
- if n is None or n == 0:
- n = -1
- f = lambda x: x.rsplit(pat, n)
- return self._str_map(f, dtype="object")
-
- def _str_translate(self, table):
- return self._str_map(lambda x: x.translate(table))
-
- def _str_wrap(self, width, **kwargs):
- kwargs["width"] = width
- tw = textwrap.TextWrapper(**kwargs)
- return self._str_map(lambda s: "\n".join(tw.wrap(s)))
-
- def _str_get_dummies(self, sep: str = "|"):
- from pandas import Series
-
- arr = Series(self).fillna("")
- try:
- arr = sep + arr + sep
- except (TypeError, NotImplementedError):
- arr = sep + arr.astype(str) + sep
-
- tags: set[str] = set()
- for ts in Series(arr, copy=False).str.split(sep):
- tags.update(ts)
- tags2 = sorted(tags - {""})
-
- dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)
-
- def _isin(test_elements: str, element: str) -> bool:
- return element in test_elements
-
- for i, t in enumerate(tags2):
- pat = sep + t + sep
- dummies[:, i] = lib.map_infer(
- arr.to_numpy(), functools.partial(_isin, element=pat)
- )
- return dummies, tags2
-
- def _str_upper(self):
- return self._str_map(lambda x: x.upper())
-
- def _str_isalnum(self):
- return self._str_map(str.isalnum, dtype="bool")
-
- def _str_isalpha(self):
- return self._str_map(str.isalpha, dtype="bool")
-
- def _str_isdecimal(self):
- return self._str_map(str.isdecimal, dtype="bool")
-
- def _str_isdigit(self):
- return self._str_map(str.isdigit, dtype="bool")
-
- def _str_islower(self):
- return self._str_map(str.islower, dtype="bool")
-
- def _str_isnumeric(self):
- return self._str_map(str.isnumeric, dtype="bool")
-
- def _str_isspace(self):
- return self._str_map(str.isspace, dtype="bool")
-
- def _str_istitle(self):
- return self._str_map(str.istitle, dtype="bool")
-
- def _str_isupper(self):
- return self._str_map(str.isupper, dtype="bool")
-
- def _str_capitalize(self):
- return self._str_map(str.capitalize)
-
- def _str_casefold(self):
- return self._str_map(str.casefold)
-
- def _str_title(self):
- return self._str_map(str.title)
-
- def _str_swapcase(self):
- return self._str_map(str.swapcase)
-
- def _str_lower(self):
- return self._str_map(str.lower)
-
- def _str_normalize(self, form):
- f = lambda x: unicodedata.normalize(form, x)
- return self._str_map(f)
-
- def _str_strip(self, to_strip=None):
- return self._str_map(lambda x: x.strip(to_strip))
-
- def _str_lstrip(self, to_strip=None):
- return self._str_map(lambda x: x.lstrip(to_strip))
-
- def _str_rstrip(self, to_strip=None):
- return self._str_map(lambda x: x.rstrip(to_strip))
-
- def _str_removeprefix(self, prefix: str) -> Series:
- # outstanding question on whether to use native methods for users on Python 3.9+
- # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
- # in which case we could do return self._str_map(str.removeprefix)
-
- def removeprefix(text: str) -> str:
- if text.startswith(prefix):
- return text[len(prefix) :]
- return text
-
- return self._str_map(removeprefix)
-
- def _str_removesuffix(self, suffix: str) -> Series:
- if sys.version_info < (3, 9):
- # NOTE pyupgrade will remove this when we run it with --py39-plus
- # so don't remove the unnecessary `else` statement below
- from pandas.util._str_methods import removesuffix
-
- return self._str_map(functools.partial(removesuffix, suffix=suffix))
- else:
- return self._str_map(lambda x: x.removesuffix(suffix))
-
- def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
- regex = re.compile(pat, flags=flags)
- na_value = self._str_na_value
-
- if not expand:
-
- def g(x):
- m = regex.search(x)
- return m.groups()[0] if m else na_value
-
- return self._str_map(g, convert=False)
-
- empty_row = [na_value] * regex.groups
-
- def f(x):
- if not isinstance(x, str):
- return empty_row
- m = regex.search(x)
- if m:
- return [na_value if item is None else item for item in m.groups()]
- else:
- return empty_row
-
- return [f(val) for val in np.asarray(self)]
diff --git a/contrib/python/pandas/py3/pandas/core/tools/__init__.py b/contrib/python/pandas/py3/pandas/core/tools/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/tools/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/tools/datetimes.py b/contrib/python/pandas/py3/pandas/core/tools/datetimes.py
deleted file mode 100644
index f74758b87f3..00000000000
--- a/contrib/python/pandas/py3/pandas/core/tools/datetimes.py
+++ /dev/null
@@ -1,1272 +0,0 @@
-from __future__ import annotations
-
-from collections import abc
-from datetime import datetime
-from functools import partial
-from itertools import islice
-from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- List,
- Tuple,
- TypedDict,
- Union,
- cast,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- tslib,
-)
-from pandas._libs.tslibs import (
- OutOfBoundsDatetime,
- Timedelta,
- Timestamp,
- astype_overflowsafe,
- get_unit_from_dtype,
- iNaT,
- is_supported_unit,
- nat_strings,
- parsing,
- timezones as libtimezones,
-)
-from pandas._libs.tslibs.conversion import precision_from_unit
-from pandas._libs.tslibs.parsing import (
- DateParseError,
- guess_datetime_format,
-)
-from pandas._libs.tslibs.strptime import array_strptime
-from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- DateTimeErrorChoices,
- npt,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- ensure_object,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_float,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_numeric_dtype,
- is_scalar,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import notna
-
-from pandas.arrays import (
- DatetimeArray,
- IntegerArray,
- PandasArray,
-)
-from pandas.core import algorithms
-from pandas.core.algorithms import unique
-from pandas.core.arrays.base import ExtensionArray
-from pandas.core.arrays.datetimes import (
- maybe_convert_dtype,
- objects_to_datetime64ns,
- tz_to_dtype,
-)
-from pandas.core.construction import extract_array
-from pandas.core.indexes.base import Index
-from pandas.core.indexes.datetimes import DatetimeIndex
-
-if TYPE_CHECKING:
- from pandas._libs.tslibs.nattype import NaTType
- from pandas._libs.tslibs.timedeltas import UnitChoices
-
- from pandas import (
- DataFrame,
- Series,
- )
-
-# ---------------------------------------------------------------------
-# types used in annotations
-
-ArrayConvertible = Union[List, Tuple, AnyArrayLike]
-Scalar = Union[float, str]
-DatetimeScalar = Union[Scalar, datetime]
-
-DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible]
-
-DatetimeDictArg = Union[List[Scalar], Tuple[Scalar, ...], AnyArrayLike]
-
-
-class YearMonthDayDict(TypedDict, total=True):
- year: DatetimeDictArg
- month: DatetimeDictArg
- day: DatetimeDictArg
-
-
-class FulldatetimeDict(YearMonthDayDict, total=False):
- hour: DatetimeDictArg
- hours: DatetimeDictArg
- minute: DatetimeDictArg
- minutes: DatetimeDictArg
- second: DatetimeDictArg
- seconds: DatetimeDictArg
- ms: DatetimeDictArg
- us: DatetimeDictArg
- ns: DatetimeDictArg
-
-
-DictConvertible = Union[FulldatetimeDict, "DataFrame"]
-start_caching_at = 50
-
-
-# ---------------------------------------------------------------------
-
-
-def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str | None:
- # Try to guess the format based on the first non-NaN element, return None if can't
- if (first_non_null := tslib.first_non_null(arr)) != -1:
- if type(first_non_nan_element := arr[first_non_null]) is str:
- # GH#32264 np.str_ object
- guessed_format = guess_datetime_format(
- first_non_nan_element, dayfirst=dayfirst
- )
- if guessed_format is not None:
- return guessed_format
- # If there are multiple non-null elements, warn about
- # how parsing might not be consistent
- if tslib.first_non_null(arr[first_non_null + 1 :]) != -1:
- warnings.warn(
- "Could not infer format, so each element will be parsed "
- "individually, falling back to `dateutil`. To ensure parsing is "
- "consistent and as-expected, please specify a format.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- return None
-
-
-def should_cache(
- arg: ArrayConvertible, unique_share: float = 0.7, check_count: int | None = None
-) -> bool:
- """
- Decides whether to do caching.
-
- If the percentage of unique elements among `check_count` elements is less
- than `unique_share * 100`, then we can do caching.
-
- Parameters
- ----------
- arg: listlike, tuple, 1-d array, Series
- unique_share: float, default=0.7, optional
- 0 < unique_share < 1
- check_count: int, optional
- 0 <= check_count <= len(arg)
-
- Returns
- -------
- do_caching: bool
-
- Notes
- -----
- By default, for a sequence of fewer than 50 items we don't do caching;
- for sequences of up to 5000 elements we check ten percent of the elements
- for their uniqueness share; if the sequence is longer than 5000 elements,
- we check only the first 500 elements.
- All constants were chosen empirically.
- """
- do_caching = True
-
- # default realization
- if check_count is None:
- # in this case, the gain from caching is negligible
- if len(arg) <= start_caching_at:
- return False
-
- if len(arg) <= 5000:
- check_count = len(arg) // 10
- else:
- check_count = 500
- else:
- assert (
- 0 <= check_count <= len(arg)
- ), "check_count must be in next bounds: [0; len(arg)]"
- if check_count == 0:
- return False
-
- assert 0 < unique_share < 1, "unique_share must be in next bounds: (0; 1)"
-
- try:
- # We can't cache if the items are not hashable.
- unique_elements = set(islice(arg, check_count))
- except TypeError:
- return False
- if len(unique_elements) > check_count * unique_share:
- do_caching = False
- return do_caching
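# A small worked example (not part of the file above) restating the caching
# heuristic in plain Python; the function name and sample data are illustrative.
from itertools import islice

def worth_caching(values, unique_share=0.7, check_count=500, min_size=50):
    # Skip caching for tiny inputs; otherwise sample a prefix and cache only
    # when the share of unique items in the sample is low enough.
    if len(values) <= min_size:
        return False
    sample = list(islice(values, check_count))
    return len(set(sample)) <= len(sample) * unique_share

dates = ["2024-01-01", "2024-01-02"] * 100  # many repeats -> caching pays off
print(worth_caching(dates))                 # True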
-
-
-def _maybe_cache(
- arg: ArrayConvertible,
- format: str | None,
- cache: bool,
- convert_listlike: Callable,
-) -> Series:
- """
- Create a cache of unique dates from an array of dates
-
- Parameters
- ----------
- arg : listlike, tuple, 1-d array, Series
- format : string
- Strftime format to parse time
- cache : bool
- True attempts to create a cache of converted values
- convert_listlike : function
- Conversion function to apply on dates
-
- Returns
- -------
- cache_array : Series
- Cache of converted, unique dates. Can be empty
- """
- from pandas import Series
-
- cache_array = Series(dtype=object)
-
- if cache:
- # Perform a quicker unique check
- if not should_cache(arg):
- return cache_array
-
- unique_dates = unique(arg)
- if len(unique_dates) < len(arg):
- cache_dates = convert_listlike(unique_dates, format)
- # GH#45319
- try:
- cache_array = Series(cache_dates, index=unique_dates, copy=False)
- except OutOfBoundsDatetime:
- return cache_array
- # GH#39882 and GH#35888 in case of None and NaT we get duplicates
- if not cache_array.index.is_unique:
- cache_array = cache_array[~cache_array.index.duplicated()]
- return cache_array
-
-
-def _box_as_indexlike(
- dt_array: ArrayLike, utc: bool = False, name: Hashable = None
-) -> Index:
- """
- Properly box the ndarray of datetimes as a DatetimeIndex
- if possible, otherwise as a generic Index.
-
- Parameters
- ----------
- dt_array: 1-d array
- Array of datetimes to be wrapped in an Index.
- utc : bool
- Whether to convert/localize timestamps to UTC.
- name : string, default None
- Name for a resulting index
-
- Returns
- -------
- result : Index-like of converted dates
- - DatetimeIndex if convertible to sole datetime64 type
- - general Index otherwise
- """
-
- if is_datetime64_dtype(dt_array):
- tz = "utc" if utc else None
- return DatetimeIndex(dt_array, tz=tz, name=name)
- return Index(dt_array, name=name, dtype=dt_array.dtype)
-
-
-def _convert_and_box_cache(
- arg: DatetimeScalarOrArrayConvertible,
- cache_array: Series,
- name: Hashable | None = None,
-) -> Index:
- """
- Convert array of dates with a cache and wrap the result in an Index.
-
- Parameters
- ----------
- arg : integer, float, string, datetime, list, tuple, 1-d array, Series
- cache_array : Series
- Cache of converted, unique dates
- name : string, default None
- Name for a DatetimeIndex
-
- Returns
- -------
- result : Index-like of converted dates
- """
- from pandas import Series
-
- result = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
- return _box_as_indexlike(result._values, utc=False, name=name)
-
-
-def _return_parsed_timezone_results(
- result: np.ndarray, timezones, utc: bool, name
-) -> Index:
- """
- Return results from array_strptime if a %z or %Z directive was passed.
-
- Parameters
- ----------
- result : ndarray[int64]
- int64 date representations of the dates
- timezones : ndarray
- pytz timezone objects
- utc : bool
- Whether to convert/localize timestamps to UTC.
- name : string, default None
- Name for a DatetimeIndex
-
- Returns
- -------
- tz_result : Index-like of parsed dates with timezone
- """
- tz_results = np.empty(len(result), dtype=object)
- for zone in unique(timezones):
- mask = timezones == zone
- dta = DatetimeArray(result[mask]).tz_localize(zone)
- if utc:
- if dta.tzinfo is None:
- dta = dta.tz_localize("utc")
- else:
- dta = dta.tz_convert("utc")
- tz_results[mask] = dta
-
- return Index(tz_results, name=name)
-
-
-def _convert_listlike_datetimes(
- arg,
- format: str | None,
- name: Hashable = None,
- utc: bool = False,
- unit: str | None = None,
- errors: DateTimeErrorChoices = "raise",
- dayfirst: bool | None = None,
- yearfirst: bool | None = None,
- exact: bool = True,
-):
- """
- Helper function for to_datetime. Performs the conversion of a 1D listlike
- of dates.
-
- Parameters
- ----------
- arg : list, tuple, ndarray, Series, Index
- dates to be parsed
- name : object
- None or string for the Index name
- utc : bool
- Whether to convert/localize timestamps to UTC.
- unit : str
- None or string of the frequency of the passed data
- errors : str
- error handling behaviors from to_datetime: 'raise', 'coerce', 'ignore'
- dayfirst : bool
- dayfirst parsing behavior from to_datetime
- yearfirst : bool
- yearfirst parsing behavior from to_datetime
- exact : bool, default True
- exact format matching behavior from to_datetime
-
- Returns
- -------
- Index-like of parsed dates
- """
- if isinstance(arg, (list, tuple)):
- arg = np.array(arg, dtype="O")
- elif isinstance(arg, PandasArray):
- arg = np.array(arg)
-
- arg_dtype = getattr(arg, "dtype", None)
- # these are shortcutable
- tz = "utc" if utc else None
- if is_datetime64tz_dtype(arg_dtype):
- if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
- return DatetimeIndex(arg, tz=tz, name=name)
- if utc:
- arg = arg.tz_convert(None).tz_localize("utc")
- return arg
-
- elif is_datetime64_dtype(arg_dtype):
- arg_dtype = cast(np.dtype, arg_dtype)
- if not is_supported_unit(get_unit_from_dtype(arg_dtype)):
- # We go to closest supported reso, i.e. "s"
- arg = astype_overflowsafe(
- # TODO: looks like we incorrectly raise with errors=="ignore"
- np.asarray(arg),
- np.dtype("M8[s]"),
- is_coerce=errors == "coerce",
- )
-
- if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
- return DatetimeIndex(arg, tz=tz, name=name)
- elif utc:
- # DatetimeArray, DatetimeIndex
- return arg.tz_localize("utc")
-
- return arg
-
- elif unit is not None:
- if format is not None:
- raise ValueError("cannot specify both format and unit")
- return _to_datetime_with_unit(arg, unit, name, utc, errors)
- elif getattr(arg, "ndim", 1) > 1:
- raise TypeError(
- "arg must be a string, datetime, list, tuple, 1-d array, or Series"
- )
-
- # warn if passing timedelta64, raise for PeriodDtype
- # NB: this must come after unit transformation
- try:
- arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
- except TypeError:
- if errors == "coerce":
- npvalues = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg))
- return DatetimeIndex(npvalues, name=name)
- elif errors == "ignore":
- idx = Index(arg, name=name)
- return idx
- raise
-
- arg = ensure_object(arg)
-
- if format is None:
- format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
-
- # `format` could be inferred, or user didn't ask for mixed-format parsing.
- if format is not None and format != "mixed":
- return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
-
- result, tz_parsed = objects_to_datetime64ns(
- arg,
- dayfirst=dayfirst,
- yearfirst=yearfirst,
- utc=utc,
- errors=errors,
- allow_object=True,
- )
-
- if tz_parsed is not None:
- # We can take a shortcut since the datetime64 numpy array
- # is in UTC
- dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed))
- return DatetimeIndex._simple_new(dta, name=name)
-
- return _box_as_indexlike(result, utc=utc, name=name)
-
-
-def _array_strptime_with_fallback(
- arg,
- name,
- utc: bool,
- fmt: str,
- exact: bool,
- errors: str,
-) -> Index:
- """
- Call array_strptime, with fallback behavior depending on 'errors'.
- """
- result, timezones = array_strptime(arg, fmt, exact=exact, errors=errors, utc=utc)
- if any(tz is not None for tz in timezones):
- return _return_parsed_timezone_results(result, timezones, utc, name)
-
- return _box_as_indexlike(result, utc=utc, name=name)
-
-
-def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
- """
- to_datetime specialized to the case where a 'unit' is passed.
- """
- arg = extract_array(arg, extract_numpy=True)
-
- # GH#30050 pass an ndarray to tslib.array_with_unit_to_datetime
- # because it expects an ndarray argument
- if isinstance(arg, IntegerArray):
- arr = arg.astype(f"datetime64[{unit}]")
- tz_parsed = None
- else:
- arg = np.asarray(arg)
-
- if arg.dtype.kind in ["i", "u"]:
- # Note we can't do "f" here because that could induce unwanted
- # rounding GH#14156, GH#20445
- arr = arg.astype(f"datetime64[{unit}]", copy=False)
- try:
- arr = astype_overflowsafe(arr, np.dtype("M8[ns]"), copy=False)
- except OutOfBoundsDatetime:
- if errors == "raise":
- raise
- arg = arg.astype(object)
- return _to_datetime_with_unit(arg, unit, name, utc, errors)
- tz_parsed = None
-
- elif arg.dtype.kind == "f":
- mult, _ = precision_from_unit(unit)
-
- mask = np.isnan(arg) | (arg == iNaT)
- fvalues = (arg * mult).astype("f8", copy=False)
- fvalues[mask] = 0
-
- if (fvalues < Timestamp.min._value).any() or (
- fvalues > Timestamp.max._value
- ).any():
- if errors != "raise":
- arg = arg.astype(object)
- return _to_datetime_with_unit(arg, unit, name, utc, errors)
- raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
-
- arr = fvalues.astype("M8[ns]", copy=False)
- arr[mask] = np.datetime64("NaT", "ns")
-
- tz_parsed = None
- else:
- arg = arg.astype(object, copy=False)
- arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
-
- if errors == "ignore":
- # Index constructor _may_ infer to DatetimeIndex
- result = Index._with_infer(arr, name=name)
- else:
- result = DatetimeIndex(arr, name=name)
-
- if not isinstance(result, DatetimeIndex):
- return result
-
- # GH#23758: We may still need to localize the result with tz
- # GH#25546: Apply tz_parsed first (from arg), then tz (from caller)
- # result will be naive but in UTC
- result = result.tz_localize("UTC").tz_convert(tz_parsed)
-
- if utc:
- if result.tz is None:
- result = result.tz_localize("utc")
- else:
- result = result.tz_convert("utc")
- return result
-
-
-def _adjust_to_origin(arg, origin, unit):
- """
- Helper function for to_datetime.
- Adjust input argument to the specified origin
-
- Parameters
- ----------
- arg : list, tuple, ndarray, Series, Index
- date to be adjusted
- origin : 'julian' or Timestamp
- origin offset for the arg
- unit : str
- passed unit from to_datetime, must be 'D'
-
- Returns
- -------
- ndarray or scalar of adjusted date(s)
- """
- if origin == "julian":
- original = arg
- j0 = Timestamp(0).to_julian_date()
- if unit != "D":
- raise ValueError("unit must be 'D' for origin='julian'")
- try:
- arg = arg - j0
- except TypeError as err:
- raise ValueError(
- "incompatible 'arg' type for given 'origin'='julian'"
- ) from err
-
- # preemptively check this for a nice range
- j_max = Timestamp.max.to_julian_date() - j0
- j_min = Timestamp.min.to_julian_date() - j0
- if np.any(arg > j_max) or np.any(arg < j_min):
- raise OutOfBoundsDatetime(
- f"{original} is Out of Bounds for origin='julian'"
- )
- else:
- # arg must be numeric
- if not (
- (is_scalar(arg) and (is_integer(arg) or is_float(arg)))
- or is_numeric_dtype(np.asarray(arg))
- ):
- raise ValueError(
- f"'{arg}' is not compatible with origin='{origin}'; "
- "it must be numeric with a unit specified"
- )
-
- # we are going to offset back to unix / epoch time
- try:
- offset = Timestamp(origin, unit=unit)
- except OutOfBoundsDatetime as err:
- raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err
- except ValueError as err:
- raise ValueError(
- f"origin {origin} cannot be converted to a Timestamp"
- ) from err
-
- if offset.tz is not None:
- raise ValueError(f"origin offset {offset} must be tz-naive")
- td_offset = offset - Timestamp(0)
-
- # convert the offset to the unit of the arg
- # this should be lossless in terms of precision
- ioffset = td_offset // Timedelta(1, unit=unit)
-
- # scalars & ndarray-like can handle the addition
- if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)):
- arg = np.asarray(arg)
- arg = arg + ioffset
- return arg
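# A worked example (not part of the file above) of the epoch-offset arithmetic
# that _adjust_to_origin performs for a Timestamp origin, using only public
# pandas objects; the chosen origin and unit are illustrative.
import pandas as pd

origin, unit = pd.Timestamp("1960-01-01"), "D"
td_offset = origin - pd.Timestamp(0)               # offset back to the unix epoch
ioffset = td_offset // pd.Timedelta(1, unit=unit)  # offset expressed in `unit`
print(ioffset)  # -3653: 1960-01-01 is 3653 days before the epoch

# to_datetime then adds this offset to the numeric input:
print(pd.to_datetime([1, 2, 3], unit="D", origin="1960-01-01"))
# DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], ...)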
-
-
-@overload
-def to_datetime(
- arg: DatetimeScalar,
- errors: DateTimeErrorChoices = ...,
- dayfirst: bool = ...,
- yearfirst: bool = ...,
- utc: bool = ...,
- format: str | None = ...,
- exact: bool = ...,
- unit: str | None = ...,
- infer_datetime_format: bool = ...,
- origin=...,
- cache: bool = ...,
-) -> Timestamp:
- ...
-
-
-@overload
-def to_datetime(
- arg: Series | DictConvertible,
- errors: DateTimeErrorChoices = ...,
- dayfirst: bool = ...,
- yearfirst: bool = ...,
- utc: bool = ...,
- format: str | None = ...,
- exact: bool = ...,
- unit: str | None = ...,
- infer_datetime_format: bool = ...,
- origin=...,
- cache: bool = ...,
-) -> Series:
- ...
-
-
-@overload
-def to_datetime(
- arg: list | tuple | Index | ArrayLike,
- errors: DateTimeErrorChoices = ...,
- dayfirst: bool = ...,
- yearfirst: bool = ...,
- utc: bool = ...,
- format: str | None = ...,
- exact: bool = ...,
- unit: str | None = ...,
- infer_datetime_format: bool = ...,
- origin=...,
- cache: bool = ...,
-) -> DatetimeIndex:
- ...
-
-
-def to_datetime(
- arg: DatetimeScalarOrArrayConvertible | DictConvertible,
- errors: DateTimeErrorChoices = "raise",
- dayfirst: bool = False,
- yearfirst: bool = False,
- utc: bool = False,
- format: str | None = None,
- exact: bool | lib.NoDefault = lib.no_default,
- unit: str | None = None,
- infer_datetime_format: lib.NoDefault | bool = lib.no_default,
- origin: str = "unix",
- cache: bool = True,
-) -> DatetimeIndex | Series | DatetimeScalar | NaTType | None:
- """
- Convert argument to datetime.
-
- This function converts a scalar, array-like, :class:`Series` or
- :class:`DataFrame`/dict-like to a pandas datetime object.
-
- Parameters
- ----------
- arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like
- The object to convert to a datetime. If a :class:`DataFrame` is provided, the
- method expects minimally the following columns: :const:`"year"`,
- :const:`"month"`, :const:`"day"`.
- errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- - If :const:`'raise'`, then invalid parsing will raise an exception.
- - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`.
- - If :const:`'ignore'`, then invalid parsing will return the input.
- dayfirst : bool, default False
- Specify a date parse order if `arg` is str or is list-like.
- If :const:`True`, parses dates with the day first, e.g. :const:`"10/11/12"`
- is parsed as :const:`2012-11-10`.
-
- .. warning::
-
- ``dayfirst=True`` is not strict, but will prefer to parse
- with day first.
-
- yearfirst : bool, default False
- Specify a date parse order if `arg` is str or is list-like.
-
- - If :const:`True` parses dates with the year first, e.g.
- :const:`"10/11/12"` is parsed as :const:`2010-11-12`.
- - If both `dayfirst` and `yearfirst` are :const:`True`, `yearfirst` takes
- precedence (same as :mod:`dateutil`).
-
- .. warning::
-
- ``yearfirst=True`` is not strict, but will prefer to parse
- with year first.
-
- utc : bool, default False
- Control timezone-related parsing, localization and conversion.
-
- - If :const:`True`, the function *always* returns a timezone-aware
- UTC-localized :class:`Timestamp`, :class:`Series` or
- :class:`DatetimeIndex`. To do this, timezone-naive inputs are
- *localized* as UTC, while timezone-aware inputs are *converted* to UTC.
-
- - If :const:`False` (default), inputs will not be coerced to UTC.
- Timezone-naive inputs will remain naive, while timezone-aware ones
- will keep their time offsets. Limitations exist for mixed
- offsets (typically, daylight savings), see :ref:`Examples
- <to_datetime_tz_examples>` section for details.
-
- See also: pandas general documentation about `timezone conversion and
- localization
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
- #time-zone-handling>`_.
-
- format : str, default None
- The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See
- `strftime documentation
- <https://docs.python.org/3/library/datetime.html
- #strftime-and-strptime-behavior>`_ for more information on choices, though
- note that :const:`"%f"` will parse all the way up to nanoseconds.
- You can also pass:
-
- - "ISO8601", to parse any `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_
- time string (not necessarily in exactly the same format);
- - "mixed", to infer the format for each element individually. This is risky,
- and you should probably use it along with `dayfirst`.
- exact : bool, default True
- Control how `format` is used:
-
- - If :const:`True`, require an exact `format` match.
- - If :const:`False`, allow the `format` to match anywhere in the target
- string.
-
- Cannot be used alongside ``format='ISO8601'`` or ``format='mixed'``.
- unit : str, default 'ns'
- The unit of the arg (D, s, ms, us, ns); the arg is expected to be an
- integer or float number. This will be based on the origin.
- Example, with ``unit='ms'`` and ``origin='unix'``, this would calculate
- the number of milliseconds to the unix epoch start.
- infer_datetime_format : bool, default False
- If :const:`True` and no `format` is given, attempt to infer the format
- of the datetime strings based on the first non-NaN element,
- and if it can be inferred, switch to a faster method of parsing them.
- In some cases this can increase the parsing speed by ~5-10x.
-
- .. deprecated:: 2.0.0
- A strict version of this argument is now the default, passing it has
- no effect.
-
- origin : scalar, default 'unix'
- Define the reference date. The numeric values would be parsed as number
- of units (defined by `unit`) since this reference date.
-
- - If :const:`'unix'` (or POSIX) time; origin is set to 1970-01-01.
- - If :const:`'julian'`, unit must be :const:`'D'`, and origin is set to
- beginning of Julian Calendar. Julian day number :const:`0` is assigned
- to the day starting at noon on January 1, 4713 BC.
- - If Timestamp convertible (Timestamp, dt.datetime, np.datetime64 or date
- string), origin is set to Timestamp identified by origin.
- - If a float or integer, origin is the millisecond difference
- relative to 1970-01-01.
- cache : bool, default True
- If :const:`True`, use a cache of unique, converted dates to apply the
- datetime conversion. May produce significant speed-up when parsing
- duplicate date strings, especially ones with timezone offsets. The cache
- is only used when there are at least 50 values. The presence of
- out-of-bounds values will render the cache unusable and may slow down
- parsing.
-
- Returns
- -------
- datetime
- If parsing succeeded.
- Return type depends on input (types in parentheses correspond to
- fallback in case of unsuccessful timezone or out-of-range timestamp
- parsing):
-
- - scalar: :class:`Timestamp` (or :class:`datetime.datetime`)
- - array-like: :class:`DatetimeIndex` (or :class:`Series` with
- :class:`object` dtype containing :class:`datetime.datetime`)
- - Series: :class:`Series` of :class:`datetime64` dtype (or
- :class:`Series` of :class:`object` dtype containing
- :class:`datetime.datetime`)
- - DataFrame: :class:`Series` of :class:`datetime64` dtype (or
- :class:`Series` of :class:`object` dtype containing
- :class:`datetime.datetime`)
-
- Raises
- ------
- ParserError
- When parsing a date from string fails.
- ValueError
- When another datetime conversion error happens. For example when one
- of 'year', 'month', day' columns is missing in a :class:`DataFrame`, or
- when a Timezone-aware :class:`datetime.datetime` is found in an array-like
- of mixed time offsets, and ``utc=False``.
-
- See Also
- --------
- DataFrame.astype : Cast argument to a specified dtype.
- to_timedelta : Convert argument to timedelta.
- convert_dtypes : Convert dtypes.
-
- Notes
- -----
-
- Many input types are supported, and lead to different output types:
-
- - **scalars** can be int, float, str, datetime object (from stdlib :mod:`datetime`
- module or :mod:`numpy`). They are converted to :class:`Timestamp` when
- possible, otherwise they are converted to :class:`datetime.datetime`.
- None/NaN/null scalars are converted to :const:`NaT`.
-
- - **array-like** can contain int, float, str, datetime objects. They are
- converted to :class:`DatetimeIndex` when possible, otherwise they are
- converted to :class:`Index` with :class:`object` dtype, containing
- :class:`datetime.datetime`. None/NaN/null entries are converted to
- :const:`NaT` in both cases.
-
- - **Series** are converted to :class:`Series` with :class:`datetime64`
- dtype when possible, otherwise they are converted to :class:`Series` with
- :class:`object` dtype, containing :class:`datetime.datetime`. None/NaN/null
- entries are converted to :const:`NaT` in both cases.
-
- - **DataFrame/dict-like** are converted to :class:`Series` with
- :class:`datetime64` dtype. For each row a datetime is created from assembling
- the various dataframe columns. Column keys can be common abbreviations
- like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns'] or
- plurals of the same.
-
- The following causes are responsible for :class:`datetime.datetime` objects
- being returned (possibly inside an :class:`Index` or a :class:`Series` with
- :class:`object` dtype) instead of a proper pandas designated type
- (:class:`Timestamp`, :class:`DatetimeIndex` or :class:`Series`
- with :class:`datetime64` dtype):
-
- - when any input element is before :const:`Timestamp.min` or after
- :const:`Timestamp.max`, see `timestamp limitations
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
- #timeseries-timestamp-limits>`_.
-
- - when ``utc=False`` (default) and the input is an array-like or
- :class:`Series` containing mixed naive/aware datetime, or aware with mixed
- time offsets. Note that this happens in the (quite frequent) situation when
- the timezone has a daylight savings policy. In that case you may wish to
- use ``utc=True``.
-
- Examples
- --------
-
- **Handling various input formats**
-
- Assembling a datetime from multiple columns of a :class:`DataFrame`. The keys
- can be common abbreviations like ['year', 'month', 'day', 'minute', 'second',
- 'ms', 'us', 'ns'] or plurals of the same.
-
- >>> df = pd.DataFrame({'year': [2015, 2016],
- ... 'month': [2, 3],
- ... 'day': [4, 5]})
- >>> pd.to_datetime(df)
- 0 2015-02-04
- 1 2016-03-05
- dtype: datetime64[ns]
-
- Using a unix epoch time
-
- >>> pd.to_datetime(1490195805, unit='s')
- Timestamp('2017-03-22 15:16:45')
- >>> pd.to_datetime(1490195805433502912, unit='ns')
- Timestamp('2017-03-22 15:16:45.433502912')
-
- .. warning:: For float arg, precision rounding might happen. To prevent
- unexpected behavior use a fixed-width exact type.
-
- Using a non-unix epoch origin
-
- >>> pd.to_datetime([1, 2, 3], unit='D',
- ... origin=pd.Timestamp('1960-01-01'))
- DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'],
- dtype='datetime64[ns]', freq=None)
-
- **Differences with strptime behavior**
-
- :const:`"%f"` will parse all the way up to nanoseconds.
-
- >>> pd.to_datetime('2018-10-26 12:00:00.0000000011',
- ... format='%Y-%m-%d %H:%M:%S.%f')
- Timestamp('2018-10-26 12:00:00.000000001')
-
- **Non-convertible date/times**
-
- If a date does not meet the `timestamp limitations
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
- #timeseries-timestamp-limits>`_, passing ``errors='ignore'``
- will return the original input instead of raising any exception.
-
- Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`,
- in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
-
- >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
- '13000101'
- >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
- NaT
-
- .. _to_datetime_tz_examples:
-
- **Timezones and time offsets**
-
- The default behaviour (``utc=False``) is as follows:
-
- - Timezone-naive inputs are converted to timezone-naive :class:`DatetimeIndex`:
-
- >>> pd.to_datetime(['2018-10-26 12:00:00', '2018-10-26 13:00:15'])
- DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'],
- dtype='datetime64[ns]', freq=None)
-
- - Timezone-aware inputs *with constant time offset* are converted to
- timezone-aware :class:`DatetimeIndex`:
-
- >>> pd.to_datetime(['2018-10-26 12:00 -0500', '2018-10-26 13:00 -0500'])
- DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'],
- dtype='datetime64[ns, UTC-05:00]', freq=None)
-
- - However, timezone-aware inputs *with mixed time offsets* (for example
- issued from a timezone with daylight savings, such as Europe/Paris)
- are **not successfully converted** to a :class:`DatetimeIndex`. Instead a
- simple :class:`Index` containing :class:`datetime.datetime` objects is
- returned:
-
- >>> pd.to_datetime(['2020-10-25 02:00 +0200', '2020-10-25 04:00 +0100'])
- Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
- dtype='object')
-
- - A mix of timezone-aware and timezone-naive inputs is also converted to
- a simple :class:`Index` containing :class:`datetime.datetime` objects:
-
- >>> from datetime import datetime
- >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)])
- Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
-
- |
-
- Setting ``utc=True`` solves most of the above issues:
-
- - Timezone-naive inputs are *localized* as UTC
-
- >>> pd.to_datetime(['2018-10-26 12:00', '2018-10-26 13:00'], utc=True)
- DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'],
- dtype='datetime64[ns, UTC]', freq=None)
-
- - Timezone-aware inputs are *converted* to UTC (the output represents the
- exact same datetime, but viewed from the UTC time offset `+00:00`).
-
- >>> pd.to_datetime(['2018-10-26 12:00 -0530', '2018-10-26 12:00 -0500'],
- ... utc=True)
- DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'],
- dtype='datetime64[ns, UTC]', freq=None)
-
- - Inputs can contain both string or datetime, the above
- rules still apply
-
- >>> pd.to_datetime(['2018-10-26 12:00', datetime(2020, 1, 1, 18)], utc=True)
- DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'],
- dtype='datetime64[ns, UTC]', freq=None)
- """
- if exact is not lib.no_default and format in {"mixed", "ISO8601"}:
- raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'")
- if infer_datetime_format is not lib.no_default:
- warnings.warn(
- "The argument 'infer_datetime_format' is deprecated and will "
- "be removed in a future version. "
- "A strict version of it is now the default, see "
- "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
- "You can safely remove this argument.",
- stacklevel=find_stack_level(),
- )
- if arg is None:
- return None
-
- if origin != "unix":
- arg = _adjust_to_origin(arg, origin, unit)
-
- convert_listlike = partial(
- _convert_listlike_datetimes,
- utc=utc,
- unit=unit,
- dayfirst=dayfirst,
- yearfirst=yearfirst,
- errors=errors,
- exact=exact,
- )
- # pylint: disable-next=used-before-assignment
- result: Timestamp | NaTType | Series | Index
-
- if isinstance(arg, Timestamp):
- result = arg
- if utc:
- if arg.tz is not None:
- result = arg.tz_convert("utc")
- else:
- result = arg.tz_localize("utc")
- elif isinstance(arg, ABCSeries):
- cache_array = _maybe_cache(arg, format, cache, convert_listlike)
- if not cache_array.empty:
- result = arg.map(cache_array)
- else:
- values = convert_listlike(arg._values, format)
- result = arg._constructor(values, index=arg.index, name=arg.name)
- elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):
- result = _assemble_from_unit_mappings(arg, errors, utc)
- elif isinstance(arg, Index):
- cache_array = _maybe_cache(arg, format, cache, convert_listlike)
- if not cache_array.empty:
- result = _convert_and_box_cache(arg, cache_array, name=arg.name)
- else:
- result = convert_listlike(arg, format, name=arg.name)
- elif is_list_like(arg):
- try:
- # error: Argument 1 to "_maybe_cache" has incompatible type
- # "Union[float, str, datetime, List[Any], Tuple[Any, ...], ExtensionArray,
- # ndarray[Any, Any], Series]"; expected "Union[List[Any], Tuple[Any, ...],
- # Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series], Series]"
- argc = cast(
- Union[list, tuple, ExtensionArray, np.ndarray, "Series", Index], arg
- )
- cache_array = _maybe_cache(argc, format, cache, convert_listlike)
- except OutOfBoundsDatetime:
- # caching attempts to create a DatetimeIndex, which may raise
- # an OOB. If that's the desired behavior, then just reraise...
- if errors == "raise":
- raise
- # ... otherwise, continue without the cache.
- from pandas import Series
-
- cache_array = Series([], dtype=object) # just an empty array
- if not cache_array.empty:
- result = _convert_and_box_cache(argc, cache_array)
- else:
- result = convert_listlike(argc, format)
- else:
- result = convert_listlike(np.array([arg]), format)[0]
- if isinstance(arg, bool) and isinstance(result, np.bool_):
- result = bool(result) # TODO: avoid this kludge.
-
- # error: Incompatible return value type (got "Union[Timestamp, NaTType,
- # Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
- # NaTType, None]")
- return result # type: ignore[return-value]
-
-
-# mappings for assembling units
-_unit_map = {
- "year": "year",
- "years": "year",
- "month": "month",
- "months": "month",
- "day": "day",
- "days": "day",
- "hour": "h",
- "hours": "h",
- "minute": "m",
- "minutes": "m",
- "second": "s",
- "seconds": "s",
- "ms": "ms",
- "millisecond": "ms",
- "milliseconds": "ms",
- "us": "us",
- "microsecond": "us",
- "microseconds": "us",
- "ns": "ns",
- "nanosecond": "ns",
- "nanoseconds": "ns",
-}
-
-
-def _assemble_from_unit_mappings(arg, errors: DateTimeErrorChoices, utc: bool):
- """
- Assemble the unit-specified fields from the arg (DataFrame)
- and return a Series for actual parsing.
-
- Parameters
- ----------
- arg : DataFrame
- errors : {'ignore', 'raise', 'coerce'}, default 'raise'
-
- - If :const:`'raise'`, then invalid parsing will raise an exception
- - If :const:`'coerce'`, then invalid parsing will be set as :const:`NaT`
- - If :const:`'ignore'`, then invalid parsing will return the input
- utc : bool
- Whether to convert/localize timestamps to UTC.
-
- Returns
- -------
- Series
- """
- from pandas import (
- DataFrame,
- to_numeric,
- to_timedelta,
- )
-
- arg = DataFrame(arg)
- if not arg.columns.is_unique:
- raise ValueError("cannot assemble with duplicate keys")
-
- # replace passed unit with _unit_map
- def f(value):
- if value in _unit_map:
- return _unit_map[value]
-
- # m is case significant
- if value.lower() in _unit_map:
- return _unit_map[value.lower()]
-
- return value
-
- unit = {k: f(k) for k in arg.keys()}
- unit_rev = {v: k for k, v in unit.items()}
-
- # we require at least Ymd
- required = ["year", "month", "day"]
- req = sorted(set(required) - set(unit_rev.keys()))
- if len(req):
- _required = ",".join(req)
- raise ValueError(
- "to assemble mappings requires at least that "
- f"[year, month, day] be specified: [{_required}] is missing"
- )
-
- # keys we don't recognize
- excess = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
- if len(excess):
- _excess = ",".join(excess)
- raise ValueError(
- f"extra keys have been passed to the datetime assemblage: [{_excess}]"
- )
-
- def coerce(values):
- # we allow coercion if errors allows it
- values = to_numeric(values, errors=errors)
-
- # prevent overflow in case of int8 or int16
- if is_integer_dtype(values):
- values = values.astype("int64", copy=False)
- return values
-
- values = (
- coerce(arg[unit_rev["year"]]) * 10000
- + coerce(arg[unit_rev["month"]]) * 100
- + coerce(arg[unit_rev["day"]])
- )
- try:
- values = to_datetime(values, format="%Y%m%d", errors=errors, utc=utc)
- except (TypeError, ValueError) as err:
- raise ValueError(f"cannot assemble the datetimes: {err}") from err
-
- units: list[UnitChoices] = ["h", "m", "s", "ms", "us", "ns"]
- for u in units:
- value = unit_rev.get(u)
- if value is not None and value in arg:
- try:
- values += to_timedelta(coerce(arg[value]), unit=u, errors=errors)
- except (TypeError, ValueError) as err:
- raise ValueError(
- f"cannot assemble the datetimes [{value}]: {err}"
- ) from err
- return values
-
-
-def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
- """
- Try to parse the YYYYMMDD/%Y%m%d format while dealing with NaT-like values.
- arg is passed in as an object dtype, but could really be ints/strings
- with nan-likes or floats (e.g. with nan).
-
- Parameters
- ----------
- arg : np.ndarray[object]
- errors : {'raise','ignore','coerce'}
- """
-
- def calc(carg):
- # calculate the actual result
- carg = carg.astype(object, copy=False)
- parsed = parsing.try_parse_year_month_day(
- carg / 10000, carg / 100 % 100, carg % 100
- )
- return tslib.array_to_datetime(parsed, errors=errors)[0]
-
- def calc_with_mask(carg, mask):
- result = np.empty(carg.shape, dtype="M8[ns]")
- iresult = result.view("i8")
- iresult[~mask] = iNaT
-
- masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
- result[mask] = masked_result.astype("M8[ns]")
- return result
-
- # try intlike / strings that are ints
- try:
- return calc(arg.astype(np.int64))
- except (ValueError, OverflowError, TypeError):
- pass
-
- # a float with actual np.nan
- try:
- carg = arg.astype(np.float64)
- return calc_with_mask(carg, notna(carg))
- except (ValueError, OverflowError, TypeError):
- pass
-
- # string with NaN-like
- try:
- # error: Argument 2 to "isin" has incompatible type "List[Any]"; expected
- # "Union[Union[ExtensionArray, ndarray], Index, Series]"
- mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type]
- return calc_with_mask(arg, mask)
- except (ValueError, OverflowError, TypeError):
- pass
-
- return None
-
-
-__all__ = [
- "DateParseError",
- "should_cache",
- "to_datetime",
-]
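# A short usage sketch (not part of the diff hunks above) of the column-assembly
# path implemented by _assemble_from_unit_mappings, exercised through the public
# pd.to_datetime entry point; the sample frame is illustrative.
import pandas as pd

frame = pd.DataFrame(
    {"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hours": [10, 11]}
)
# Plural keys such as "hours" are normalized via _unit_map before assembly.
print(pd.to_datetime(frame))
# 0   2015-02-04 10:00:00
# 1   2016-03-05 11:00:00
# dtype: datetime64[ns]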
diff --git a/contrib/python/pandas/py3/pandas/core/tools/numeric.py b/contrib/python/pandas/py3/pandas/core/tools/numeric.py
deleted file mode 100644
index 5289753d194..00000000000
--- a/contrib/python/pandas/py3/pandas/core/tools/numeric.py
+++ /dev/null
@@ -1,310 +0,0 @@
-from __future__ import annotations
-
-from typing import Literal
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- DateTimeErrorChoices,
- DtypeBackend,
- npt,
-)
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.cast import maybe_downcast_numeric
-from pandas.core.dtypes.common import (
- ensure_object,
- is_bool_dtype,
- is_datetime_or_timedelta_dtype,
- is_decimal,
- is_integer_dtype,
- is_number,
- is_numeric_dtype,
- is_scalar,
- is_string_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
-)
-
-import pandas as pd
-from pandas.core.arrays import BaseMaskedArray
-from pandas.core.arrays.string_ import StringDtype
-
-
-def to_numeric(
- arg,
- errors: DateTimeErrorChoices = "raise",
- downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-):
- """
- Convert argument to a numeric type.
-
- The default return dtype is `float64` or `int64`
- depending on the data supplied. Use the `downcast` parameter
- to obtain other dtypes.
-
- Please note that precision loss may occur if really large numbers
- are passed in. Due to the internal limitations of `ndarray`, if
- numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
- or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
- passed in, it is very likely they will be converted to float so that
- they can be stored in an `ndarray`. These warnings apply similarly to
- `Series` since it internally leverages `ndarray`.
-
- Parameters
- ----------
- arg : scalar, list, tuple, 1-d array, or Series
- Argument to be converted.
- errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- - If 'raise', then invalid parsing will raise an exception.
- - If 'coerce', then invalid parsing will be set as NaN.
- - If 'ignore', then invalid parsing will return the input.
- downcast : str, default None
- Can be 'integer', 'signed', 'unsigned', or 'float'.
- If not None, and if the data has been successfully cast to a
- numerical dtype (or if the data was numeric to begin with),
- downcast that resulting data to the smallest numerical dtype
- possible according to the following rules:
-
- - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
- - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
- - 'float': smallest float dtype (min.: np.float32)
-
- As this behaviour is separate from the core conversion to
- numeric values, any errors raised during the downcasting
- will be surfaced regardless of the value of the 'errors' input.
-
- In addition, downcasting will only occur if the size
- of the resulting data's dtype is strictly larger than
- the dtype it is to be cast to, so if none of the dtypes
- checked satisfy that specification, no downcasting will be
- performed on the data.
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy-backed data
- Which dtype_backend to use: nullable dtypes are used for all dtypes that
- have a nullable implementation when "numpy_nullable" is set, and
- pyarrow-backed dtypes are used for all dtypes if "pyarrow" is set.
-
- The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- ret
- Numeric if parsing succeeded.
- Return type depends on input. Series if Series, otherwise ndarray.
-
- See Also
- --------
- DataFrame.astype : Cast argument to a specified dtype.
- to_datetime : Convert argument to datetime.
- to_timedelta : Convert argument to timedelta.
- numpy.ndarray.astype : Cast a numpy array to a specified type.
- DataFrame.convert_dtypes : Convert dtypes.
-
- Examples
- --------
- Take separate series and convert to numeric, coercing when told to
-
- >>> s = pd.Series(['1.0', '2', -3])
- >>> pd.to_numeric(s)
- 0 1.0
- 1 2.0
- 2 -3.0
- dtype: float64
- >>> pd.to_numeric(s, downcast='float')
- 0 1.0
- 1 2.0
- 2 -3.0
- dtype: float32
- >>> pd.to_numeric(s, downcast='signed')
- 0 1
- 1 2
- 2 -3
- dtype: int8
- >>> s = pd.Series(['apple', '1.0', '2', -3])
- >>> pd.to_numeric(s, errors='ignore')
- 0 apple
- 1 1.0
- 2 2
- 3 -3
- dtype: object
- >>> pd.to_numeric(s, errors='coerce')
- 0 NaN
- 1 1.0
- 2 2.0
- 3 -3.0
- dtype: float64
-
- Downcasting of nullable integer and floating dtypes is supported:
-
- >>> s = pd.Series([1, 2, 3], dtype="Int64")
- >>> pd.to_numeric(s, downcast="integer")
- 0 1
- 1 2
- 2 3
- dtype: Int8
- >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
- >>> pd.to_numeric(s, downcast="float")
- 0 1.0
- 1 2.1
- 2 3.0
- dtype: Float32
- """
- if downcast not in (None, "integer", "signed", "unsigned", "float"):
- raise ValueError("invalid downcasting method provided")
-
- if errors not in ("ignore", "raise", "coerce"):
- raise ValueError("invalid error value specified")
-
- check_dtype_backend(dtype_backend)
-
- is_series = False
- is_index = False
- is_scalars = False
-
- if isinstance(arg, ABCSeries):
- is_series = True
- values = arg.values
- elif isinstance(arg, ABCIndex):
- is_index = True
- if needs_i8_conversion(arg.dtype):
- values = arg.view("i8")
- else:
- values = arg.values
- elif isinstance(arg, (list, tuple)):
- values = np.array(arg, dtype="O")
- elif is_scalar(arg):
- if is_decimal(arg):
- return float(arg)
- if is_number(arg):
- return arg
- is_scalars = True
- values = np.array([arg], dtype="O")
- elif getattr(arg, "ndim", 1) > 1:
- raise TypeError("arg must be a list, tuple, 1-d array, or Series")
- else:
- values = arg
-
- orig_values = values
-
- # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
- # save mask to reconstruct the full array after casting
- mask: npt.NDArray[np.bool_] | None = None
- if isinstance(values, BaseMaskedArray):
- mask = values._mask
- values = values._data[~mask]
-
- values_dtype = getattr(values, "dtype", None)
- if isinstance(values_dtype, pd.ArrowDtype):
- mask = values.isna()
- values = values.dropna().to_numpy()
- new_mask: np.ndarray | None = None
- if is_numeric_dtype(values_dtype):
- pass
- elif is_datetime_or_timedelta_dtype(values_dtype):
- values = values.view(np.int64)
- else:
- values = ensure_object(values)
- coerce_numeric = errors not in ("ignore", "raise")
- try:
- values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] # noqa
- values,
- set(),
- coerce_numeric=coerce_numeric,
- convert_to_masked_nullable=dtype_backend is not lib.no_default
- or isinstance(values_dtype, StringDtype),
- )
- except (ValueError, TypeError):
- if errors == "raise":
- raise
- values = orig_values
-
- if new_mask is not None:
- # Remove the masked-out values; the mask is kept to reconstruct the
- # full array later, and dropping them here enables downcasting
- values = values[~new_mask]
- elif (
- dtype_backend is not lib.no_default
- and new_mask is None
- or isinstance(values_dtype, StringDtype)
- ):
- new_mask = np.zeros(values.shape, dtype=np.bool_)
-
- # attempt downcast only if the data has been successfully converted
- # to a numerical dtype and if a downcast method has been specified
- if downcast is not None and is_numeric_dtype(values.dtype):
- typecodes: str | None = None
-
- if downcast in ("integer", "signed"):
- typecodes = np.typecodes["Integer"]
- elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
- typecodes = np.typecodes["UnsignedInteger"]
- elif downcast == "float":
- typecodes = np.typecodes["Float"]
-
- # pandas support goes only to np.float32,
- # as float dtypes smaller than that are
- # extremely rare and not well supported
- float_32_char = np.dtype(np.float32).char
- float_32_ind = typecodes.index(float_32_char)
- typecodes = typecodes[float_32_ind:]
-
- if typecodes is not None:
- # from smallest to largest
- for typecode in typecodes:
- dtype = np.dtype(typecode)
- if dtype.itemsize <= values.dtype.itemsize:
- values = maybe_downcast_numeric(values, dtype)
-
- # successful conversion
- if values.dtype == dtype:
- break
-
- # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
- # masked array
- if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
- if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
- # GH 52588
- mask = new_mask
- else:
- mask = mask.copy()
- assert isinstance(mask, np.ndarray)
- data = np.zeros(mask.shape, dtype=values.dtype)
- data[~mask] = values
-
- from pandas.core.arrays import (
- ArrowExtensionArray,
- BooleanArray,
- FloatingArray,
- IntegerArray,
- )
-
- klass: type[IntegerArray] | type[BooleanArray] | type[FloatingArray]
- if is_integer_dtype(data.dtype):
- klass = IntegerArray
- elif is_bool_dtype(data.dtype):
- klass = BooleanArray
- else:
- klass = FloatingArray
- values = klass(data, mask)
-
- if dtype_backend == "pyarrow" or isinstance(values_dtype, pd.ArrowDtype):
- values = ArrowExtensionArray(values.__arrow_array__())
-
- if is_series:
- return arg._constructor(values, index=arg.index, name=arg.name)
- elif is_index:
- # because we want to coerce to numeric if possible,
- # do not use _shallow_copy
- return pd.Index(values, name=arg.name)
- elif is_scalars:
- return values[0]
- else:
- return values
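# A numpy-only sketch (not part of the file above) of the downcast scan: walk
# the integer typecodes from smallest to largest and stop at the first dtype
# that preserves the data; the helper name is illustrative.
import numpy as np

def smallest_int_dtype(values: np.ndarray) -> np.dtype:
    for typecode in np.typecodes["Integer"]:       # int8, int16, int32, ...
        dtype = np.dtype(typecode)
        if dtype.itemsize <= values.dtype.itemsize:
            candidate = values.astype(dtype)
            if np.array_equal(candidate, values):  # lossless cast -> done
                return dtype
    return values.dtype

print(smallest_int_dtype(np.array([1, 2, -3], dtype=np.int64)))  # int8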
diff --git a/contrib/python/pandas/py3/pandas/core/tools/timedeltas.py b/contrib/python/pandas/py3/pandas/core/tools/timedeltas.py
deleted file mode 100644
index 42cf92c6b2a..00000000000
--- a/contrib/python/pandas/py3/pandas/core/tools/timedeltas.py
+++ /dev/null
@@ -1,265 +0,0 @@
-"""
-timedelta support tools
-"""
-from __future__ import annotations
-
-from datetime import timedelta
-from typing import (
- TYPE_CHECKING,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.tslibs import (
- NaT,
- NaTType,
-)
-from pandas._libs.tslibs.timedeltas import (
- Timedelta,
- parse_timedelta_unit,
-)
-
-from pandas.core.dtypes.common import is_list_like
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
-)
-
-from pandas.core.arrays.timedeltas import sequence_to_td64ns
-
-if TYPE_CHECKING:
- from pandas._libs.tslibs.timedeltas import UnitChoices
- from pandas._typing import (
- ArrayLike,
- DateTimeErrorChoices,
- )
-
- from pandas import (
- Index,
- Series,
- TimedeltaIndex,
- )
-
-
-@overload
-def to_timedelta(
- arg: str | float | timedelta,
- unit: UnitChoices | None = ...,
- errors: DateTimeErrorChoices = ...,
-) -> Timedelta:
- ...
-
-
-@overload
-def to_timedelta(
- arg: Series,
- unit: UnitChoices | None = ...,
- errors: DateTimeErrorChoices = ...,
-) -> Series:
- ...
-
-
-@overload
-def to_timedelta(
- arg: list | tuple | range | ArrayLike | Index,
- unit: UnitChoices | None = ...,
- errors: DateTimeErrorChoices = ...,
-) -> TimedeltaIndex:
- ...
-
-
-def to_timedelta(
- arg: str
- | int
- | float
- | timedelta
- | list
- | tuple
- | range
- | ArrayLike
- | Index
- | Series,
- unit: UnitChoices | None = None,
- errors: DateTimeErrorChoices = "raise",
-) -> Timedelta | TimedeltaIndex | Series:
- """
- Convert argument to timedelta.
-
- Timedeltas are absolute differences in times, expressed in different
- units (e.g. days, hours, minutes, seconds). This method converts
- an argument from a recognized timedelta format / value into
- a Timedelta type.
-
- Parameters
- ----------
- arg : str, timedelta, list-like or Series
- The data to be converted to timedelta.
-
- .. versionchanged:: 2.0
- Strings with units 'M', 'Y' and 'y' do not represent
- unambiguous timedelta values and will raise an exception.
-
- unit : str, optional
- Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``.
-
- Possible values:
-
- * 'W'
- * 'D' / 'days' / 'day'
- * 'hours' / 'hour' / 'hr' / 'h'
- * 'm' / 'minute' / 'min' / 'minutes' / 'T'
- * 'S' / 'seconds' / 'sec' / 'second'
- * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L'
- * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U'
- * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N'
-
- .. versionchanged:: 1.1.0
-
- Must not be specified when `arg` contains strings and
- ``errors="raise"``.
-
- errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- - If 'raise', then invalid parsing will raise an exception.
- - If 'coerce', then invalid parsing will be set as NaT.
- - If 'ignore', then invalid parsing will return the input.
-
- Returns
- -------
- timedelta
- If parsing succeeded.
- Return type depends on input:
-
- - list-like: TimedeltaIndex of timedelta64 dtype
- - Series: Series of timedelta64 dtype
- - scalar: Timedelta
-
- See Also
- --------
- DataFrame.astype : Cast argument to a specified dtype.
- to_datetime : Convert argument to datetime.
- convert_dtypes : Convert dtypes.
-
- Notes
- -----
- If the precision is higher than nanoseconds, the precision of the duration is
- truncated to nanoseconds for string inputs.
-
- Examples
- --------
- Parsing a single string to a Timedelta:
-
- >>> pd.to_timedelta('1 days 06:05:01.00003')
- Timedelta('1 days 06:05:01.000030')
- >>> pd.to_timedelta('15.5us')
- Timedelta('0 days 00:00:00.000015500')
-
- Parsing a list or array of strings:
-
- >>> pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan'])
- TimedeltaIndex(['1 days 06:05:01.000030', '0 days 00:00:00.000015500', NaT],
- dtype='timedelta64[ns]', freq=None)
-
- Converting numbers by specifying the `unit` keyword argument:
-
- >>> pd.to_timedelta(np.arange(5), unit='s')
- TimedeltaIndex(['0 days 00:00:00', '0 days 00:00:01', '0 days 00:00:02',
- '0 days 00:00:03', '0 days 00:00:04'],
- dtype='timedelta64[ns]', freq=None)
- >>> pd.to_timedelta(np.arange(5), unit='d')
- TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
- dtype='timedelta64[ns]', freq=None)
- """
- if unit is not None:
- unit = parse_timedelta_unit(unit)
-
- if errors not in ("ignore", "raise", "coerce"):
- raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.")
-
- if unit in {"Y", "y", "M"}:
- raise ValueError(
- "Units 'M', 'Y', and 'y' are no longer supported, as they do not "
- "represent unambiguous timedelta values durations."
- )
-
- if arg is None:
- return arg
- elif isinstance(arg, ABCSeries):
- values = _convert_listlike(arg._values, unit=unit, errors=errors)
- return arg._constructor(values, index=arg.index, name=arg.name)
- elif isinstance(arg, ABCIndex):
- return _convert_listlike(arg, unit=unit, errors=errors, name=arg.name)
- elif isinstance(arg, np.ndarray) and arg.ndim == 0:
- # extract array scalar and process below
- # error: Incompatible types in assignment (expression has type "object",
- # variable has type "Union[str, int, float, timedelta, List[Any],
- # Tuple[Any, ...], Union[Union[ExtensionArray, ndarray[Any, Any]], Index,
- # Series]]") [assignment]
- arg = lib.item_from_zerodim(arg) # type: ignore[assignment]
- elif is_list_like(arg) and getattr(arg, "ndim", 1) == 1:
- return _convert_listlike(arg, unit=unit, errors=errors)
- elif getattr(arg, "ndim", 1) > 1:
- raise TypeError(
- "arg must be a string, timedelta, list, tuple, 1-d array, or Series"
- )
-
- if isinstance(arg, str) and unit is not None:
- raise ValueError("unit must not be specified if the input is/contains a str")
-
- # ...so it must be a scalar value. Return scalar.
- return _coerce_scalar_to_timedelta_type(arg, unit=unit, errors=errors)
-
-
-def _coerce_scalar_to_timedelta_type(
- r, unit: UnitChoices | None = "ns", errors: DateTimeErrorChoices = "raise"
-):
- """Convert string 'r' to a timedelta object."""
- result: Timedelta | NaTType
-
- try:
- result = Timedelta(r, unit)
- except ValueError:
- if errors == "raise":
- raise
- if errors == "ignore":
- return r
-
- # coerce
- result = NaT
-
- return result
-
-
-def _convert_listlike(
- arg, unit=None, errors: DateTimeErrorChoices = "raise", name=None
-):
- """Convert a list of objects to a timedelta index object."""
- if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"):
- # This is needed only to ensure that in the case where we end up
- # returning arg (errors == "ignore"), and where the input is a
- # generator, we return a useful list-like instead of a
- # used-up generator
- if not hasattr(arg, "__array__"):
- arg = list(arg)
- arg = np.array(arg, dtype=object)
-
- try:
- td64arr = sequence_to_td64ns(arg, unit=unit, errors=errors, copy=False)[0]
- except ValueError:
- if errors == "ignore":
- return arg
- else:
- # This else-block accounts for the cases when errors='raise'
- # and errors='coerce'. If errors == 'raise', these errors
- # should be raised. If errors == 'coerce', we shouldn't
- # expect any errors to be raised, since all parsing errors
- # cause coercion to pd.NaT. However, if an error / bug is
- # introduced that causes an Exception to be raised, we would
- # like to surface it.
- raise
-
- from pandas import TimedeltaIndex
-
- value = TimedeltaIndex(td64arr, unit="ns", name=name)
- return value
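# A short sketch (not part of the file above) of the error-handling behaviour:
# with errors="coerce", unparseable entries become NaT instead of raising.
# The sample strings are illustrative.
import pandas as pd

result = pd.to_timedelta(["1 days", "00:00:05", "not-a-duration"], errors="coerce")
print(result.isna().tolist())  # [False, False, True] -- only the bad entry is NaT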
diff --git a/contrib/python/pandas/py3/pandas/core/tools/times.py b/contrib/python/pandas/py3/pandas/core/tools/times.py
deleted file mode 100644
index cb178926123..00000000000
--- a/contrib/python/pandas/py3/pandas/core/tools/times.py
+++ /dev/null
@@ -1,154 +0,0 @@
-from __future__ import annotations
-
-from datetime import (
- datetime,
- time,
-)
-
-import numpy as np
-
-from pandas._libs.lib import is_list_like
-from pandas._typing import DateTimeErrorChoices
-
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import notna
-
-
-def to_time(
- arg,
- format=None,
- infer_time_format: bool = False,
- errors: DateTimeErrorChoices = "raise",
-):
- """
- Parse time strings to time objects using fixed strptime formats ("%H:%M",
- "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p",
- "%I%M%S%p")
-
- Use infer_time_format if all the strings are in the same format to speed
- up conversion.
-
- Parameters
- ----------
- arg : string in time format, datetime.time, list, tuple, 1-d array, Series
- format : str, default None
- Format used to convert arg into a time object. If None, fixed formats
- are used.
- infer_time_format: bool, default False
- Infer the time format based on the first non-NaN element. If all
- strings are in the same format, this will speed up conversion.
- errors : {'ignore', 'raise', 'coerce'}, default 'raise'
- - If 'raise', then invalid parsing will raise an exception
- - If 'coerce', then invalid parsing will be set as None
- - If 'ignore', then invalid parsing will return the input
-
- Returns
- -------
- datetime.time
- """
-
- def _convert_listlike(arg, format):
- if isinstance(arg, (list, tuple)):
- arg = np.array(arg, dtype="O")
-
- elif getattr(arg, "ndim", 1) > 1:
- raise TypeError(
- "arg must be a string, datetime, list, tuple, 1-d array, or Series"
- )
-
- arg = np.asarray(arg, dtype="O")
-
- if infer_time_format and format is None:
- format = _guess_time_format_for_array(arg)
-
- times: list[time | None] = []
- if format is not None:
- for element in arg:
- try:
- times.append(datetime.strptime(element, format).time())
- except (ValueError, TypeError) as err:
- if errors == "raise":
- msg = (
- f"Cannot convert {element} to a time with given "
- f"format {format}"
- )
- raise ValueError(msg) from err
- if errors == "ignore":
- return arg
- else:
- times.append(None)
- else:
- formats = _time_formats[:]
- format_found = False
- for element in arg:
- time_object = None
- try:
- time_object = time.fromisoformat(element)
- except (ValueError, TypeError):
- for time_format in formats:
- try:
- time_object = datetime.strptime(element, time_format).time()
- if not format_found:
- # Put the found format in front
- fmt = formats.pop(formats.index(time_format))
- formats.insert(0, fmt)
- format_found = True
- break
- except (ValueError, TypeError):
- continue
-
- if time_object is not None:
- times.append(time_object)
- elif errors == "raise":
- raise ValueError(f"Cannot convert arg {arg} to a time")
- elif errors == "ignore":
- return arg
- else:
- times.append(None)
-
- return times
-
- if arg is None:
- return arg
- elif isinstance(arg, time):
- return arg
- elif isinstance(arg, ABCSeries):
- values = _convert_listlike(arg._values, format)
- return arg._constructor(values, index=arg.index, name=arg.name)
- elif isinstance(arg, ABCIndex):
- return _convert_listlike(arg, format)
- elif is_list_like(arg):
- return _convert_listlike(arg, format)
-
- return _convert_listlike(np.array([arg]), format)[0]
-
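# A standard-library-only sketch of the parsing order used above: try ISO first,
# then fall back to the fixed strptime formats mirrored from `_time_formats` below.
# `parse_time` is an illustrative name, not a pandas API:
from datetime import datetime, time

def parse_time(value, formats=("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S")):
    try:
        return time.fromisoformat(value)
    except (ValueError, TypeError):
        pass
    for fmt in formats:
        try:
            return datetime.strptime(value, fmt).time()
        except (ValueError, TypeError):
            continue
    return None  # mirrors errors="coerce"

print(parse_time("14:30"))   # 14:30:00
print(parse_time("0230PM"))  # 14:30:00
print(parse_time("bogus"))   # None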
-
-# Fixed time formats for time parsing
-_time_formats = [
- "%H:%M",
- "%H%M",
- "%I:%M%p",
- "%I%M%p",
- "%H:%M:%S",
- "%H%M%S",
- "%I:%M:%S%p",
- "%I%M%S%p",
-]
-
-
-def _guess_time_format_for_array(arr):
- # Try to guess the format based on the first non-NaN element
- non_nan_elements = notna(arr).nonzero()[0]
- if len(non_nan_elements):
- element = arr[non_nan_elements[0]]
- for time_format in _time_formats:
- try:
- datetime.strptime(element, time_format)
- return time_format
- except ValueError:
- pass
-
- return None
diff --git a/contrib/python/pandas/py3/pandas/core/util/__init__.py b/contrib/python/pandas/py3/pandas/core/util/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/util/__init__.py
+++ /dev/null
diff --git a/contrib/python/pandas/py3/pandas/core/util/hashing.py b/contrib/python/pandas/py3/pandas/core/util/hashing.py
deleted file mode 100644
index 350914cc505..00000000000
--- a/contrib/python/pandas/py3/pandas/core/util/hashing.py
+++ /dev/null
@@ -1,366 +0,0 @@
-"""
-data hash pandas / numpy objects
-"""
-from __future__ import annotations
-
-import itertools
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Iterable,
- Iterator,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.hashing import hash_object_array
-from pandas._typing import (
- ArrayLike,
- npt,
-)
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_list_like,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCExtensionArray,
- ABCIndex,
- ABCMultiIndex,
- ABCSeries,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- Categorical,
- DataFrame,
- Index,
- MultiIndex,
- Series,
- )
-
-
-# 16-byte hashing key
-_default_hash_key = "0123456789123456"
-
-
-def combine_hash_arrays(
- arrays: Iterator[np.ndarray], num_items: int
-) -> npt.NDArray[np.uint64]:
- """
- Parameters
- ----------
- arrays : Iterator[np.ndarray]
- num_items : int
-
- Returns
- -------
- np.ndarray[uint64]
-
- Should be the same as CPython's tupleobject.c
- """
- try:
- first = next(arrays)
- except StopIteration:
- return np.array([], dtype=np.uint64)
-
- arrays = itertools.chain([first], arrays)
-
- mult = np.uint64(1000003)
- out = np.zeros_like(first) + np.uint64(0x345678)
- last_i = 0
- for i, a in enumerate(arrays):
- inverse_i = num_items - i
- out ^= a
- out *= mult
- mult += np.uint64(82520 + inverse_i + inverse_i)
- last_i = i
- assert last_i + 1 == num_items, "Fed in wrong num_items"
- out += np.uint64(97531)
- return out
-
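# A hedged numeric sketch of `combine_hash_arrays` as defined above: given an
# iterator of equal-length uint64 arrays and their count, it folds them
# elementwise, CPython-tuple-hash style, into a single uint64 array:
import numpy as np

a = np.array([1, 2, 3], dtype=np.uint64)
b = np.array([10, 20, 30], dtype=np.uint64)
combined = combine_hash_arrays(iter([a, b]), num_items=2)
print(combined.dtype, combined.shape)  # uint64 (3,)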
-
-def hash_pandas_object(
- obj: Index | DataFrame | Series,
- index: bool = True,
- encoding: str = "utf8",
- hash_key: str | None = _default_hash_key,
- categorize: bool = True,
-) -> Series:
- """
- Return a data hash of the Index/Series/DataFrame.
-
- Parameters
- ----------
- obj : Index, Series, or DataFrame
- index : bool, default True
- Include the index in the hash (if Series/DataFrame).
- encoding : str, default 'utf8'
- Encoding for data & key when strings.
- hash_key : str, default _default_hash_key
-        Hash key used to encode string values.
- categorize : bool, default True
- Whether to first categorize object arrays before hashing. This is more
- efficient when the array contains duplicate values.
-
- Returns
- -------
- Series of uint64, same length as the object
- """
- from pandas import Series
-
- if hash_key is None:
- hash_key = _default_hash_key
-
- if isinstance(obj, ABCMultiIndex):
- return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)
-
- elif isinstance(obj, ABCIndex):
- h = hash_array(obj._values, encoding, hash_key, categorize).astype(
- "uint64", copy=False
- )
- ser = Series(h, index=obj, dtype="uint64", copy=False)
-
- elif isinstance(obj, ABCSeries):
- h = hash_array(obj._values, encoding, hash_key, categorize).astype(
- "uint64", copy=False
- )
- if index:
- index_iter = (
- hash_pandas_object(
- obj.index,
- index=False,
- encoding=encoding,
- hash_key=hash_key,
- categorize=categorize,
- )._values
- for _ in [None]
- )
- arrays = itertools.chain([h], index_iter)
- h = combine_hash_arrays(arrays, 2)
-
- ser = Series(h, index=obj.index, dtype="uint64", copy=False)
-
- elif isinstance(obj, ABCDataFrame):
- hashes = (
- hash_array(series._values, encoding, hash_key, categorize)
- for _, series in obj.items()
- )
- num_items = len(obj.columns)
- if index:
- index_hash_generator = (
- hash_pandas_object(
- obj.index,
- index=False,
- encoding=encoding,
- hash_key=hash_key,
- categorize=categorize,
- )._values
- for _ in [None]
- )
- num_items += 1
-
- # keep `hashes` specifically a generator to keep mypy happy
- _hashes = itertools.chain(hashes, index_hash_generator)
- hashes = (x for x in _hashes)
- h = combine_hash_arrays(hashes, num_items)
-
- ser = Series(h, index=obj.index, dtype="uint64", copy=False)
- else:
- raise TypeError(f"Unexpected type for hashing {type(obj)}")
-
- return ser
-
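# A minimal usage sketch through the public entry point (assuming pandas is
# installed); this function is also exposed as `pandas.util.hash_pandas_object`:
import pandas as pd

s = pd.Series(["a", "b", "c"])
h_full = pd.util.hash_pandas_object(s)               # combines value and index hashes
h_vals = pd.util.hash_pandas_object(s, index=False)  # hashes the values only
print(h_full.dtype, len(h_vals) == len(s))           # uint64 True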
-
-def hash_tuples(
- vals: MultiIndex | Iterable[tuple[Hashable, ...]],
- encoding: str = "utf8",
- hash_key: str = _default_hash_key,
-) -> npt.NDArray[np.uint64]:
- """
-    Hash a MultiIndex / listlike-of-tuples efficiently.
-
- Parameters
- ----------
- vals : MultiIndex or listlike-of-tuples
- encoding : str, default 'utf8'
- hash_key : str, default _default_hash_key
-
- Returns
- -------
- ndarray[np.uint64] of hashed values
- """
- if not is_list_like(vals):
- raise TypeError("must be convertible to a list-of-tuples")
-
- from pandas import (
- Categorical,
- MultiIndex,
- )
-
- if not isinstance(vals, ABCMultiIndex):
- mi = MultiIndex.from_tuples(vals)
- else:
- mi = vals
-
- # create a list-of-Categoricals
- cat_vals = [
- Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True)
- for level in range(mi.nlevels)
- ]
-
- # hash the list-of-ndarrays
- hashes = (
- _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
- )
- h = combine_hash_arrays(hashes, len(cat_vals))
-
- return h
-
-
-def _hash_categorical(
- cat: Categorical, encoding: str, hash_key: str
-) -> npt.NDArray[np.uint64]:
- """
- Hash a Categorical by hashing its categories, and then mapping the codes
- to the hashes
-
- Parameters
- ----------
- cat : Categorical
- encoding : str
- hash_key : str
-
- Returns
- -------
-    ndarray[np.uint64] of hashed values, same size as len(cat)
- """
- # Convert ExtensionArrays to ndarrays
- values = np.asarray(cat.categories._values)
- hashed = hash_array(values, encoding, hash_key, categorize=False)
-
- # we have uint64, as we don't directly support missing values
- # we don't want to use take_nd which will coerce to float
- # instead, directly construct the result with a
- # max(np.uint64) as the missing value indicator
- #
- # TODO: GH 15362
-
- mask = cat.isna()
- if len(hashed):
- result = hashed.take(cat.codes)
- else:
- result = np.zeros(len(mask), dtype="uint64")
-
- if mask.any():
- result[mask] = lib.u8max
-
- return result
-
-
-def hash_array(
- vals: ArrayLike,
- encoding: str = "utf8",
- hash_key: str = _default_hash_key,
- categorize: bool = True,
-) -> npt.NDArray[np.uint64]:
- """
- Given a 1d array, return an array of deterministic integers.
-
- Parameters
- ----------
- vals : ndarray or ExtensionArray
- encoding : str, default 'utf8'
- Encoding for data & key when strings.
- hash_key : str, default _default_hash_key
-        Hash key used to encode string values.
- categorize : bool, default True
- Whether to first categorize object arrays before hashing. This is more
- efficient when the array contains duplicate values.
-
- Returns
- -------
- ndarray[np.uint64, ndim=1]
-        Hashed values, same length as vals.
- """
- if not hasattr(vals, "dtype"):
- raise TypeError("must pass a ndarray-like")
- dtype = vals.dtype
-
- # For categoricals, we hash the categories, then remap the codes to the
- # hash values. (This check is above the complex check so that we don't ask
- # numpy if categorical is a subdtype of complex, as it will choke).
- if is_categorical_dtype(dtype):
- vals = cast("Categorical", vals)
- return _hash_categorical(vals, encoding, hash_key)
-
- elif isinstance(vals, ABCExtensionArray):
- vals, _ = vals._values_for_factorize()
-
- elif not isinstance(vals, np.ndarray):
- # GH#42003
- raise TypeError(
- "hash_array requires np.ndarray or ExtensionArray, not "
- f"{type(vals).__name__}. Use hash_pandas_object instead."
- )
-
- return _hash_ndarray(vals, encoding, hash_key, categorize)
-
-
-def _hash_ndarray(
- vals: np.ndarray,
- encoding: str = "utf8",
- hash_key: str = _default_hash_key,
- categorize: bool = True,
-) -> npt.NDArray[np.uint64]:
- """
- See hash_array.__doc__.
- """
- dtype = vals.dtype
-
- # we'll be working with everything as 64-bit values, so handle this
- # 128-bit value early
- if np.issubdtype(dtype, np.complex128):
- return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))
-
- # First, turn whatever array this is into unsigned 64-bit ints, if we can
- # manage it.
- elif dtype == bool:
- vals = vals.astype("u8")
- elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
- vals = vals.view("i8").astype("u8", copy=False)
- elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
- vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
- else:
-        # With repeated values, it's MUCH faster to categorize object dtypes,
- # then hash and rename categories. We allow skipping the categorization
- # when the values are known/likely to be unique.
- if categorize:
- from pandas import (
- Categorical,
- Index,
- factorize,
- )
-
- codes, categories = factorize(vals, sort=False)
- cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
- return _hash_categorical(cat, encoding, hash_key)
-
- try:
- vals = hash_object_array(vals, hash_key, encoding)
- except TypeError:
- # we have mixed types
- vals = hash_object_array(
- vals.astype(str).astype(object), hash_key, encoding
- )
-
- # Then, redistribute these 64-bit ints within the space of 64-bit ints
- vals ^= vals >> 30
- vals *= np.uint64(0xBF58476D1CE4E5B9)
- vals ^= vals >> 27
- vals *= np.uint64(0x94D049BB133111EB)
- vals ^= vals >> 31
- return vals
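# A hedged standalone sketch of the final bit-mixing step above (a splitmix64-style
# finalizer) applied to a plain numpy array; `mix64` is an illustrative name and the
# constants are copied from the code above:
import numpy as np

def mix64(vals):
    out = np.asarray(vals).astype("u8", copy=True)
    out ^= out >> 30
    out *= np.uint64(0xBF58476D1CE4E5B9)
    out ^= out >> 27
    out *= np.uint64(0x94D049BB133111EB)
    out ^= out >> 31
    return out

print(mix64(np.arange(4)))  # consecutive small ints spread across the uint64 range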
diff --git a/contrib/python/pandas/py3/pandas/core/util/numba_.py b/contrib/python/pandas/py3/pandas/core/util/numba_.py
deleted file mode 100644
index be798e022ac..00000000000
--- a/contrib/python/pandas/py3/pandas/core/util/numba_.py
+++ /dev/null
@@ -1,112 +0,0 @@
-"""Common utilities for Numba operations"""
-from __future__ import annotations
-
-import types
-from typing import (
- TYPE_CHECKING,
- Callable,
-)
-
-import numpy as np
-
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import NumbaUtilError
-
-GLOBAL_USE_NUMBA: bool = False
-
-
-def maybe_use_numba(engine: str | None) -> bool:
- """Signal whether to use numba routines."""
- return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA)
-
-
-def set_use_numba(enable: bool = False) -> None:
- global GLOBAL_USE_NUMBA
- if enable:
- import_optional_dependency("numba")
- GLOBAL_USE_NUMBA = enable
-
-
-def get_jit_arguments(
- engine_kwargs: dict[str, bool] | None = None, kwargs: dict | None = None
-) -> dict[str, bool]:
- """
- Return arguments to pass to numba.JIT, falling back on pandas default JIT settings.
-
- Parameters
- ----------
- engine_kwargs : dict, default None
- user passed keyword arguments for numba.JIT
- kwargs : dict, default None
- user passed keyword arguments to pass into the JITed function
-
- Returns
- -------
- dict[str, bool]
- nopython, nogil, parallel
-
- Raises
- ------
- NumbaUtilError
- """
- if engine_kwargs is None:
- engine_kwargs = {}
-
- nopython = engine_kwargs.get("nopython", True)
- if kwargs and nopython:
- raise NumbaUtilError(
- "numba does not support kwargs with nopython=True: "
- "https://github.com/numba/numba/issues/2916"
- )
- nogil = engine_kwargs.get("nogil", False)
- parallel = engine_kwargs.get("parallel", False)
- return {"nopython": nopython, "nogil": nogil, "parallel": parallel}
-
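# A short sketch of the defaulting behaviour above (numba itself is not needed just
# to build the keyword dict):
print(get_jit_arguments())                    # {'nopython': True, 'nogil': False, 'parallel': False}
print(get_jit_arguments({"parallel": True}))  # {'nopython': True, 'nogil': False, 'parallel': True}
# get_jit_arguments({}, kwargs={"x": 1}) raises NumbaUtilError, since user kwargs are
# not supported together with nopython=True.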
-
-def jit_user_function(
- func: Callable, nopython: bool, nogil: bool, parallel: bool
-) -> Callable:
- """
- JIT the user's function given the configurable arguments.
-
- Parameters
- ----------
- func : function
- user defined function
- nopython : bool
- nopython parameter for numba.JIT
- nogil : bool
- nogil parameter for numba.JIT
- parallel : bool
- parallel parameter for numba.JIT
-
- Returns
- -------
- function
- Numba JITed function
- """
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- if numba.extending.is_jitted(func):
- # Don't jit a user passed jitted function
- numba_func = func
- else:
-
- @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def numba_func(data, *_args):
- if getattr(np, func.__name__, False) is func or isinstance(
- func, types.BuiltinFunctionType
- ):
- jf = func
- else:
- jf = numba.jit(func, nopython=nopython, nogil=nogil)
-
- def impl(data, *_args):
- return jf(data, *_args)
-
- return impl
-
- return numba_func
diff --git a/contrib/python/pandas/py3/pandas/core/window/__init__.py b/contrib/python/pandas/py3/pandas/core/window/__init__.py
deleted file mode 100644
index 857e12e5467..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from pandas.core.window.ewm import (
- ExponentialMovingWindow,
- ExponentialMovingWindowGroupby,
-)
-from pandas.core.window.expanding import (
- Expanding,
- ExpandingGroupby,
-)
-from pandas.core.window.rolling import (
- Rolling,
- RollingGroupby,
- Window,
-)
-
-__all__ = [
- "Expanding",
- "ExpandingGroupby",
- "ExponentialMovingWindow",
- "ExponentialMovingWindowGroupby",
- "Rolling",
- "RollingGroupby",
- "Window",
-]
diff --git a/contrib/python/pandas/py3/pandas/core/window/common.py b/contrib/python/pandas/py3/pandas/core/window/common.py
deleted file mode 100644
index b6c7bc5684d..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/common.py
+++ /dev/null
@@ -1,168 +0,0 @@
-"""Common utility functions for rolling operations"""
-from __future__ import annotations
-
-from collections import defaultdict
-from typing import cast
-
-import numpy as np
-
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-from pandas.core.indexes.api import MultiIndex
-
-
-def flex_binary_moment(arg1, arg2, f, pairwise: bool = False):
- if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries):
- X, Y = prep_binary(arg1, arg2)
- return f(X, Y)
-
- elif isinstance(arg1, ABCDataFrame):
- from pandas import DataFrame
-
- def dataframe_from_int_dict(data, frame_template) -> DataFrame:
- result = DataFrame(data, index=frame_template.index)
- if len(result.columns) > 0:
- result.columns = frame_template.columns[result.columns]
- else:
- result.columns = frame_template.columns.copy()
- return result
-
- results = {}
- if isinstance(arg2, ABCDataFrame):
- if pairwise is False:
- if arg1 is arg2:
- # special case in order to handle duplicate column names
- for i in range(len(arg1.columns)):
- results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i])
- return dataframe_from_int_dict(results, arg1)
- else:
- if not arg1.columns.is_unique:
- raise ValueError("'arg1' columns are not unique")
- if not arg2.columns.is_unique:
- raise ValueError("'arg2' columns are not unique")
- X, Y = arg1.align(arg2, join="outer")
- X, Y = prep_binary(X, Y)
- res_columns = arg1.columns.union(arg2.columns)
- for col in res_columns:
- if col in X and col in Y:
- results[col] = f(X[col], Y[col])
- return DataFrame(results, index=X.index, columns=res_columns)
- elif pairwise is True:
- results = defaultdict(dict)
- for i in range(len(arg1.columns)):
- for j in range(len(arg2.columns)):
- if j < i and arg2 is arg1:
- # Symmetric case
- results[i][j] = results[j][i]
- else:
- results[i][j] = f(
- *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])
- )
-
- from pandas import concat
-
- result_index = arg1.index.union(arg2.index)
- if len(result_index):
- # construct result frame
- result = concat(
- [
- concat(
- [results[i][j] for j in range(len(arg2.columns))],
- ignore_index=True,
- )
- for i in range(len(arg1.columns))
- ],
- ignore_index=True,
- axis=1,
- )
- result.columns = arg1.columns
-
- # set the index and reorder
- if arg2.columns.nlevels > 1:
- # mypy needs to know columns is a MultiIndex, Index doesn't
- # have levels attribute
- arg2.columns = cast(MultiIndex, arg2.columns)
- # GH 21157: Equivalent to MultiIndex.from_product(
- # [result_index], <unique combinations of arg2.columns.levels>,
- # )
- # A normal MultiIndex.from_product will produce too many
- # combinations.
- result_level = np.tile(
- result_index, len(result) // len(result_index)
- )
- arg2_levels = (
- np.repeat(
- arg2.columns.get_level_values(i),
- len(result) // len(arg2.columns),
- )
- for i in range(arg2.columns.nlevels)
- )
- result_names = list(arg2.columns.names) + [result_index.name]
- result.index = MultiIndex.from_arrays(
- [*arg2_levels, result_level], names=result_names
- )
- # GH 34440
- num_levels = len(result.index.levels)
- new_order = [num_levels - 1] + list(range(num_levels - 1))
- result = result.reorder_levels(new_order).sort_index()
- else:
- result.index = MultiIndex.from_product(
- [range(len(arg2.columns)), range(len(result_index))]
- )
- result = result.swaplevel(1, 0).sort_index()
- result.index = MultiIndex.from_product(
- [result_index] + [arg2.columns]
- )
- else:
- # empty result
- result = DataFrame(
- index=MultiIndex(
- levels=[arg1.index, arg2.columns], codes=[[], []]
- ),
- columns=arg2.columns,
- dtype="float64",
- )
-
- # reset our index names to arg1 names
- # reset our column names to arg2 names
- # careful not to mutate the original names
- result.columns = result.columns.set_names(arg1.columns.names)
- result.index = result.index.set_names(
- result_index.names + arg2.columns.names
- )
-
- return result
- else:
- results = {
- i: f(*prep_binary(arg1.iloc[:, i], arg2))
- for i in range(len(arg1.columns))
- }
- return dataframe_from_int_dict(results, arg1)
-
- else:
- return flex_binary_moment(arg2, arg1, f)
-
-
-def zsqrt(x):
- with np.errstate(all="ignore"):
- result = np.sqrt(x)
- mask = x < 0
-
- if isinstance(x, ABCDataFrame):
- if mask._values.any():
- result[mask] = 0
- else:
- if mask.any():
- result[mask] = 0
-
- return result
-
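# A tiny sketch of the clamping above: small negative inputs, which typically arise
# as floating point noise in variance formulas, become 0.0 instead of NaN:
import pandas as pd

x = pd.Series([4.0, -1e-12, 0.0])
print(zsqrt(x).tolist())  # [2.0, 0.0, 0.0]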
-
-def prep_binary(arg1, arg2):
- # mask out values, this also makes a common index...
- X = arg1 + 0 * arg2
- Y = arg2 + 0 * arg1
- return X, Y
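# A short sketch of the alignment trick above: adding ``0 * other`` reindexes both
# Series onto the union of their indexes and turns labels missing from either side
# into NaN, which is what the pairwise moment functions need:
import pandas as pd

a = pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"])
b = pd.Series([10.0, 20.0], index=["y", "z"])
X, Y = prep_binary(a, b)
print(X.index.tolist(), X.isna().tolist())  # ['x', 'y', 'z'] [True, False, False]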
diff --git a/contrib/python/pandas/py3/pandas/core/window/doc.py b/contrib/python/pandas/py3/pandas/core/window/doc.py
deleted file mode 100644
index 2a5cbc04921..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/doc.py
+++ /dev/null
@@ -1,116 +0,0 @@
-"""Any shareable docstring components for rolling/expanding/ewm"""
-from __future__ import annotations
-
-from textwrap import dedent
-
-from pandas.core.shared_docs import _shared_docs
-
-_shared_docs = dict(**_shared_docs)
-
-
-def create_section_header(header: str) -> str:
- """Create numpydoc section header"""
- return f"{header}\n{'-' * len(header)}\n"
-
-
-template_header = "\nCalculate the {window_method} {aggregation_description}.\n\n"
-
-template_returns = dedent(
- """
- Series or DataFrame
- Return type is the same as the original object with ``np.float64`` dtype.\n
- """
-).replace("\n", "", 1)
-
-template_see_also = dedent(
- """
- pandas.Series.{window_method} : Calling {window_method} with Series data.
- pandas.DataFrame.{window_method} : Calling {window_method} with DataFrames.
- pandas.Series.{agg_method} : Aggregating {agg_method} for Series.
- pandas.DataFrame.{agg_method} : Aggregating {agg_method} for DataFrame.\n
- """
-).replace("\n", "", 1)
-
-kwargs_numeric_only = dedent(
- """
- numeric_only : bool, default False
- Include only float, int, boolean columns.
-
- .. versionadded:: 1.5.0\n
- """
-).replace("\n", "", 1)
-
-kwargs_scipy = dedent(
- """
- **kwargs
- Keyword arguments to configure the ``SciPy`` weighted window type.\n
- """
-).replace("\n", "", 1)
-
-window_apply_parameters = dedent(
- """
- func : function
- Must produce a single value from an ndarray input if ``raw=True``
- or a single value from a Series if ``raw=False``. Can also accept a
- Numba JIT function with ``engine='numba'`` specified.
-
- raw : bool, default False
- * ``False`` : passes each row or column as a Series to the
- function.
- * ``True`` : the passed function will receive ndarray
- objects instead.
- If you are just applying a NumPy reduction function this will
- achieve much better performance.
-
- engine : str, default None
- * ``'cython'`` : Runs rolling apply through C-extensions from cython.
- * ``'numba'`` : Runs rolling apply through JIT compiled code from numba.
- Only available when ``raw`` is set to ``True``.
- * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
-
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
- applied to both the ``func`` and the ``apply`` rolling aggregation.
-
- args : tuple, default None
- Positional arguments to be passed into func.
-
- kwargs : dict, default None
- Keyword arguments to be passed into func.\n
- """
-).replace("\n", "", 1)
-
-numba_notes = (
- "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for "
- "extended documentation and performance considerations for the Numba engine.\n\n"
-)
-
-
-def window_agg_numba_parameters(version: str = "1.3") -> str:
- return (
- dedent(
- """
- engine : str, default None
- * ``'cython'`` : Runs the operation through C-extensions from cython.
- * ``'numba'`` : Runs the operation through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
-
- .. versionadded:: {version}.0
-
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
-
- .. versionadded:: {version}.0\n
- """
- )
- .replace("\n", "", 1)
- .replace("{version}", version)
- )
diff --git a/contrib/python/pandas/py3/pandas/core/window/ewm.py b/contrib/python/pandas/py3/pandas/core/window/ewm.py
deleted file mode 100644
index 4dcbb4ed804..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/ewm.py
+++ /dev/null
@@ -1,1012 +0,0 @@
-from __future__ import annotations
-
-import datetime
-from functools import partial
-from textwrap import dedent
-from typing import TYPE_CHECKING
-
-import numpy as np
-
-from pandas._libs.tslibs import Timedelta
-import pandas._libs.window.aggregations as window_aggregations
-from pandas._typing import (
- Axis,
- TimedeltaConvertibleTypes,
-)
-
-if TYPE_CHECKING:
- from pandas import DataFrame, Series
- from pandas.core.generic import NDFrame
-
-from pandas.util._decorators import doc
-
-from pandas.core.dtypes.common import (
- is_datetime64_ns_dtype,
- is_numeric_dtype,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas.core import common
-from pandas.core.indexers.objects import (
- BaseIndexer,
- ExponentialMovingWindowIndexer,
- GroupbyIndexer,
-)
-from pandas.core.util.numba_ import (
- get_jit_arguments,
- maybe_use_numba,
-)
-from pandas.core.window.common import zsqrt
-from pandas.core.window.doc import (
- _shared_docs,
- create_section_header,
- kwargs_numeric_only,
- numba_notes,
- template_header,
- template_returns,
- template_see_also,
- window_agg_numba_parameters,
-)
-from pandas.core.window.numba_ import (
- generate_numba_ewm_func,
- generate_numba_ewm_table_func,
-)
-from pandas.core.window.online import (
- EWMMeanState,
- generate_online_numba_ewma_func,
-)
-from pandas.core.window.rolling import (
- BaseWindow,
- BaseWindowGroupby,
-)
-
-
-def get_center_of_mass(
- comass: float | None,
- span: float | None,
- halflife: float | None,
- alpha: float | None,
-) -> float:
- valid_count = common.count_not_none(comass, span, halflife, alpha)
- if valid_count > 1:
- raise ValueError("comass, span, halflife, and alpha are mutually exclusive")
-
- # Convert to center of mass; domain checks ensure 0 < alpha <= 1
- if comass is not None:
- if comass < 0:
- raise ValueError("comass must satisfy: comass >= 0")
- elif span is not None:
- if span < 1:
- raise ValueError("span must satisfy: span >= 1")
- comass = (span - 1) / 2
- elif halflife is not None:
- if halflife <= 0:
- raise ValueError("halflife must satisfy: halflife > 0")
- decay = 1 - np.exp(np.log(0.5) / halflife)
- comass = 1 / decay - 1
- elif alpha is not None:
- if alpha <= 0 or alpha > 1:
- raise ValueError("alpha must satisfy: 0 < alpha <= 1")
- comass = (1 - alpha) / alpha
- else:
- raise ValueError("Must pass one of comass, span, halflife, or alpha")
-
- return float(comass)
-
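# A worked sketch of the equivalences implemented above; each of these inputs
# describes the same decay (alpha = 0.5), so all four calls return 1.0:
print(get_center_of_mass(1.0, None, None, None))  # com given directly
print(get_center_of_mass(None, 3.0, None, None))  # span=3 -> (3 - 1) / 2
print(get_center_of_mass(None, None, 1.0, None))  # halflife=1 -> 1 / (1 - 0.5) - 1
print(get_center_of_mass(None, None, None, 0.5))  # alpha=0.5 -> (1 - 0.5) / 0.5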
-
-def _calculate_deltas(
- times: np.ndarray | NDFrame,
- halflife: float | TimedeltaConvertibleTypes | None,
-) -> np.ndarray:
- """
- Return the diff of the times divided by the half-life. These values are used in
- the calculation of the ewm mean.
-
- Parameters
- ----------
- times : np.ndarray, Series
- Times corresponding to the observations. Must be monotonically increasing
- and ``datetime64[ns]`` dtype.
- halflife : float, str, timedelta, optional
- Half-life specifying the decay
-
- Returns
- -------
- np.ndarray
- Diff of the times divided by the half-life
- """
- _times = np.asarray(times.view(np.int64), dtype=np.float64)
- # TODO: generalize to non-nano?
- _halflife = float(Timedelta(halflife).as_unit("ns")._value)
- return np.diff(_times) / _halflife
-
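# A hedged numeric sketch of what these deltas represent (gaps between successive
# observations, measured in units of the half-life), using only public pandas API:
import numpy as np
import pandas as pd

times = pd.DatetimeIndex(["2020-01-01", "2020-01-03", "2020-01-10"])
deltas = (times[1:] - times[:-1]) / pd.Timedelta("4 days")
print(np.asarray(deltas))  # [0.5  1.75], the same quantity _calculate_deltas computes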
-
-class ExponentialMovingWindow(BaseWindow):
- r"""
- Provide exponentially weighted (EW) calculations.
-
- Exactly one of ``com``, ``span``, ``halflife``, or ``alpha`` must be
- provided if ``times`` is not provided. If ``times`` is provided,
- ``halflife`` and one of ``com``, ``span`` or ``alpha`` may be provided.
-
- Parameters
- ----------
- com : float, optional
- Specify decay in terms of center of mass
-
- :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`.
-
- span : float, optional
- Specify decay in terms of span
-
- :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`.
-
- halflife : float, str, timedelta, optional
- Specify decay in terms of half-life
-
- :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for
- :math:`halflife > 0`.
-
- If ``times`` is specified, a timedelta convertible unit over which an
- observation decays to half its value. Only applicable to ``mean()``,
-        and the halflife value will not apply to the other functions.
-
- .. versionadded:: 1.1.0
-
- alpha : float, optional
- Specify smoothing factor :math:`\alpha` directly
-
- :math:`0 < \alpha \leq 1`.
-
- min_periods : int, default 0
- Minimum number of observations in window required to have a value;
- otherwise, result is ``np.nan``.
-
- adjust : bool, default True
- Divide by decaying adjustment factor in beginning periods to account
- for imbalance in relative weightings (viewing EWMA as a moving average).
-
- - When ``adjust=True`` (default), the EW function is calculated using weights
- :math:`w_i = (1 - \alpha)^i`. For example, the EW moving average of the series
- [:math:`x_0, x_1, ..., x_t`] would be:
-
- .. math::
- y_t = \frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ... + (1 -
- \alpha)^t x_0}{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + (1 - \alpha)^t}
-
- - When ``adjust=False``, the exponentially weighted function is calculated
- recursively:
-
- .. math::
- \begin{split}
- y_0 &= x_0\\
- y_t &= (1 - \alpha) y_{t-1} + \alpha x_t,
- \end{split}
- ignore_na : bool, default False
- Ignore missing values when calculating weights.
-
- - When ``ignore_na=False`` (default), weights are based on absolute positions.
- For example, the weights of :math:`x_0` and :math:`x_2` used in calculating
- the final weighted average of [:math:`x_0`, None, :math:`x_2`] are
- :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and
- :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.
-
- - When ``ignore_na=True``, weights are based
- on relative positions. For example, the weights of :math:`x_0` and :math:`x_2`
- used in calculating the final weighted average of
- [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if
- ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.
-
- axis : {0, 1}, default 0
- If ``0`` or ``'index'``, calculate across the rows.
-
- If ``1`` or ``'columns'``, calculate across the columns.
-
- For `Series` this parameter is unused and defaults to 0.
-
- times : np.ndarray, Series, default None
-
- .. versionadded:: 1.1.0
-
- Only applicable to ``mean()``.
-
- Times corresponding to the observations. Must be monotonically increasing and
- ``datetime64[ns]`` dtype.
-
- If 1-D array like, a sequence with the same shape as the observations.
-
- method : str {'single', 'table'}, default 'single'
- .. versionadded:: 1.4.0
-
- Execute the rolling operation per single column or row (``'single'``)
- or over the entire object (``'table'``).
-
- This argument is only implemented when specifying ``engine='numba'``
- in the method call.
-
-        Only applicable to ``mean()``.
-
- Returns
- -------
- ``ExponentialMovingWindow`` subclass
-
- See Also
- --------
- rolling : Provides rolling window calculations.
- expanding : Provides expanding transformations.
-
- Notes
- -----
- See :ref:`Windowing Operations <window.exponentially_weighted>`
- for further usage details and examples.
-
- Examples
- --------
- >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
- >>> df
- B
- 0 0.0
- 1 1.0
- 2 2.0
- 3 NaN
- 4 4.0
-
- >>> df.ewm(com=0.5).mean()
- B
- 0 0.000000
- 1 0.750000
- 2 1.615385
- 3 1.615385
- 4 3.670213
- >>> df.ewm(alpha=2 / 3).mean()
- B
- 0 0.000000
- 1 0.750000
- 2 1.615385
- 3 1.615385
- 4 3.670213
-
- **adjust**
-
- >>> df.ewm(com=0.5, adjust=True).mean()
- B
- 0 0.000000
- 1 0.750000
- 2 1.615385
- 3 1.615385
- 4 3.670213
- >>> df.ewm(com=0.5, adjust=False).mean()
- B
- 0 0.000000
- 1 0.666667
- 2 1.555556
- 3 1.555556
- 4 3.650794
-
- **ignore_na**
-
- >>> df.ewm(com=0.5, ignore_na=True).mean()
- B
- 0 0.000000
- 1 0.750000
- 2 1.615385
- 3 1.615385
- 4 3.225000
- >>> df.ewm(com=0.5, ignore_na=False).mean()
- B
- 0 0.000000
- 1 0.750000
- 2 1.615385
- 3 1.615385
- 4 3.670213
-
- **times**
-
- Exponentially weighted mean with weights calculated with a timedelta ``halflife``
- relative to ``times``.
-
- >>> times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17']
- >>> df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean()
- B
- 0 0.000000
- 1 0.585786
- 2 1.523889
- 3 1.523889
- 4 3.233686
- """
-
- _attributes = [
- "com",
- "span",
- "halflife",
- "alpha",
- "min_periods",
- "adjust",
- "ignore_na",
- "axis",
- "times",
- "method",
- ]
-
- def __init__(
- self,
- obj: NDFrame,
- com: float | None = None,
- span: float | None = None,
- halflife: float | TimedeltaConvertibleTypes | None = None,
- alpha: float | None = None,
- min_periods: int | None = 0,
- adjust: bool = True,
- ignore_na: bool = False,
- axis: Axis = 0,
- times: np.ndarray | NDFrame | None = None,
- method: str = "single",
- *,
- selection=None,
- ) -> None:
- super().__init__(
- obj=obj,
- min_periods=1 if min_periods is None else max(int(min_periods), 1),
- on=None,
- center=False,
- closed=None,
- method=method,
- axis=axis,
- selection=selection,
- )
- self.com = com
- self.span = span
- self.halflife = halflife
- self.alpha = alpha
- self.adjust = adjust
- self.ignore_na = ignore_na
- self.times = times
- if self.times is not None:
- if not self.adjust:
- raise NotImplementedError("times is not supported with adjust=False.")
- if not is_datetime64_ns_dtype(self.times):
- raise ValueError("times must be datetime64[ns] dtype.")
- if len(self.times) != len(obj):
- raise ValueError("times must be the same length as the object.")
- if not isinstance(self.halflife, (str, datetime.timedelta, np.timedelta64)):
- raise ValueError("halflife must be a timedelta convertible object")
- if isna(self.times).any():
- raise ValueError("Cannot convert NaT values to integer")
- self._deltas = _calculate_deltas(self.times, self.halflife)
- # Halflife is no longer applicable when calculating COM
- # But allow COM to still be calculated if the user passes other decay args
- if common.count_not_none(self.com, self.span, self.alpha) > 0:
- self._com = get_center_of_mass(self.com, self.span, None, self.alpha)
- else:
- self._com = 1.0
- else:
- if self.halflife is not None and isinstance(
- self.halflife, (str, datetime.timedelta, np.timedelta64)
- ):
- raise ValueError(
- "halflife can only be a timedelta convertible argument if "
- "times is not None."
- )
- # Without times, points are equally spaced
- self._deltas = np.ones(
- max(self.obj.shape[self.axis] - 1, 0), dtype=np.float64
- )
- self._com = get_center_of_mass(
- # error: Argument 3 to "get_center_of_mass" has incompatible type
- # "Union[float, Any, None, timedelta64, signedinteger[_64Bit]]";
- # expected "Optional[float]"
- self.com,
- self.span,
- self.halflife, # type: ignore[arg-type]
- self.alpha,
- )
-
- def _check_window_bounds(
- self, start: np.ndarray, end: np.ndarray, num_vals: int
- ) -> None:
- # emw algorithms are iterative with each point
- # ExponentialMovingWindowIndexer "bounds" are the entire window
- pass
-
- def _get_window_indexer(self) -> BaseIndexer:
- """
- Return an indexer class that will compute the window start and end bounds
- """
- return ExponentialMovingWindowIndexer()
-
- def online(
- self, engine: str = "numba", engine_kwargs=None
- ) -> OnlineExponentialMovingWindow:
- """
- Return an ``OnlineExponentialMovingWindow`` object to calculate
- exponentially moving window aggregations in an online method.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
-        engine : str, default ``'numba'``
- Execution engine to calculate online aggregations.
- Applies to all supported aggregation methods.
-
- engine_kwargs : dict, default None
- Applies to all supported aggregation methods.
-
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
- applied to the function
-
- Returns
- -------
- OnlineExponentialMovingWindow
- """
- return OnlineExponentialMovingWindow(
- obj=self.obj,
- com=self.com,
- span=self.span,
- halflife=self.halflife,
- alpha=self.alpha,
- min_periods=self.min_periods,
- adjust=self.adjust,
- ignore_na=self.ignore_na,
- axis=self.axis,
- times=self.times,
- engine=engine,
- engine_kwargs=engine_kwargs,
- selection=self._selection,
- )
-
- @doc(
- _shared_docs["aggregate"],
- see_also=dedent(
- """
- See Also
- --------
- pandas.DataFrame.rolling.aggregate
- """
- ),
- examples=dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
- >>> df
- A B C
- 0 1 4 7
- 1 2 5 8
- 2 3 6 9
-
- >>> df.ewm(alpha=0.5).mean()
- A B C
- 0 1.000000 4.000000 7.000000
- 1 1.666667 4.666667 7.666667
- 2 2.428571 5.428571 8.428571
- """
- ),
- klass="Series/Dataframe",
- axis="",
- )
- def aggregate(self, func, *args, **kwargs):
- return super().aggregate(func, *args, **kwargs)
-
- agg = aggregate
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes.replace("\n", "", 1),
- window_method="ewm",
- aggregation_description="(exponential weighted moment) mean",
- agg_method="mean",
- )
- def mean(
- self,
- numeric_only: bool = False,
- engine=None,
- engine_kwargs=None,
- ):
- if maybe_use_numba(engine):
- if self.method == "single":
- func = generate_numba_ewm_func
- else:
- func = generate_numba_ewm_table_func
- ewm_func = func(
- **get_jit_arguments(engine_kwargs),
- com=self._com,
- adjust=self.adjust,
- ignore_na=self.ignore_na,
- deltas=tuple(self._deltas),
- normalize=True,
- )
- return self._apply(ewm_func, name="mean")
- elif engine in ("cython", None):
- if engine_kwargs is not None:
- raise ValueError("cython engine does not accept engine_kwargs")
-
- deltas = None if self.times is None else self._deltas
- window_func = partial(
- window_aggregations.ewm,
- com=self._com,
- adjust=self.adjust,
- ignore_na=self.ignore_na,
- deltas=deltas,
- normalize=True,
- )
- return self._apply(window_func, name="mean", numeric_only=numeric_only)
- else:
- raise ValueError("engine must be either 'numba' or 'cython'")
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes.replace("\n", "", 1),
- window_method="ewm",
- aggregation_description="(exponential weighted moment) sum",
- agg_method="sum",
- )
- def sum(
- self,
- numeric_only: bool = False,
- engine=None,
- engine_kwargs=None,
- ):
- if not self.adjust:
- raise NotImplementedError("sum is not implemented with adjust=False")
- if maybe_use_numba(engine):
- if self.method == "single":
- func = generate_numba_ewm_func
- else:
- func = generate_numba_ewm_table_func
- ewm_func = func(
- **get_jit_arguments(engine_kwargs),
- com=self._com,
- adjust=self.adjust,
- ignore_na=self.ignore_na,
- deltas=tuple(self._deltas),
- normalize=False,
- )
- return self._apply(ewm_func, name="sum")
- elif engine in ("cython", None):
- if engine_kwargs is not None:
- raise ValueError("cython engine does not accept engine_kwargs")
-
- deltas = None if self.times is None else self._deltas
- window_func = partial(
- window_aggregations.ewm,
- com=self._com,
- adjust=self.adjust,
- ignore_na=self.ignore_na,
- deltas=deltas,
- normalize=False,
- )
- return self._apply(window_func, name="sum", numeric_only=numeric_only)
- else:
- raise ValueError("engine must be either 'numba' or 'cython'")
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- bias : bool, default False
- Use a standard estimation bias correction.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="ewm",
- aggregation_description="(exponential weighted moment) standard deviation",
- agg_method="std",
- )
- def std(self, bias: bool = False, numeric_only: bool = False):
- if (
- numeric_only
- and self._selected_obj.ndim == 1
- and not is_numeric_dtype(self._selected_obj.dtype)
- ):
- # Raise directly so error message says std instead of var
- raise NotImplementedError(
- f"{type(self).__name__}.std does not implement numeric_only"
- )
- return zsqrt(self.var(bias=bias, numeric_only=numeric_only))
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- bias : bool, default False
- Use a standard estimation bias correction.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="ewm",
- aggregation_description="(exponential weighted moment) variance",
- agg_method="var",
- )
- def var(self, bias: bool = False, numeric_only: bool = False):
- window_func = window_aggregations.ewmcov
- wfunc = partial(
- window_func,
- com=self._com,
- adjust=self.adjust,
- ignore_na=self.ignore_na,
- bias=bias,
- )
-
- def var_func(values, begin, end, min_periods):
- return wfunc(values, begin, end, min_periods, values)
-
- return self._apply(var_func, name="var", numeric_only=numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
-        other : Series or DataFrame, optional
- If not supplied then will default to self and produce pairwise
- output.
- pairwise : bool, default None
- If False then only matching columns between self and other will be
- used and the output will be a DataFrame.
- If True then all pairwise combinations will be calculated and the
- output will be a MultiIndex DataFrame in the case of DataFrame
- inputs. In the case of missing elements, only complete pairwise
- observations will be used.
- bias : bool, default False
- Use a standard estimation bias correction.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="ewm",
- aggregation_description="(exponential weighted moment) sample covariance",
- agg_method="cov",
- )
- def cov(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- bias: bool = False,
- numeric_only: bool = False,
- ):
- from pandas import Series
-
- self._validate_numeric_only("cov", numeric_only)
-
- def cov_func(x, y):
- x_array = self._prep_values(x)
- y_array = self._prep_values(y)
- window_indexer = self._get_window_indexer()
- min_periods = (
- self.min_periods
- if self.min_periods is not None
- else window_indexer.window_size
- )
- start, end = window_indexer.get_window_bounds(
- num_values=len(x_array),
- min_periods=min_periods,
- center=self.center,
- closed=self.closed,
- step=self.step,
- )
- result = window_aggregations.ewmcov(
- x_array,
- start,
- end,
- # error: Argument 4 to "ewmcov" has incompatible type
- # "Optional[int]"; expected "int"
- self.min_periods, # type: ignore[arg-type]
- y_array,
- self._com,
- self.adjust,
- self.ignore_na,
- bias,
- )
- return Series(result, index=x.index, name=x.name, copy=False)
-
- return self._apply_pairwise(
- self._selected_obj, other, pairwise, cov_func, numeric_only
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- other : Series or DataFrame, optional
- If not supplied then will default to self and produce pairwise
- output.
- pairwise : bool, default None
- If False then only matching columns between self and other will be
- used and the output will be a DataFrame.
- If True then all pairwise combinations will be calculated and the
- output will be a MultiIndex DataFrame in the case of DataFrame
- inputs. In the case of missing elements, only complete pairwise
- observations will be used.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="ewm",
- aggregation_description="(exponential weighted moment) sample correlation",
- agg_method="corr",
- )
- def corr(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- numeric_only: bool = False,
- ):
- from pandas import Series
-
- self._validate_numeric_only("corr", numeric_only)
-
- def cov_func(x, y):
- x_array = self._prep_values(x)
- y_array = self._prep_values(y)
- window_indexer = self._get_window_indexer()
- min_periods = (
- self.min_periods
- if self.min_periods is not None
- else window_indexer.window_size
- )
- start, end = window_indexer.get_window_bounds(
- num_values=len(x_array),
- min_periods=min_periods,
- center=self.center,
- closed=self.closed,
- step=self.step,
- )
-
- def _cov(X, Y):
- return window_aggregations.ewmcov(
- X,
- start,
- end,
- min_periods,
- Y,
- self._com,
- self.adjust,
- self.ignore_na,
- True,
- )
-
- with np.errstate(all="ignore"):
- cov = _cov(x_array, y_array)
- x_var = _cov(x_array, x_array)
- y_var = _cov(y_array, y_array)
- result = cov / zsqrt(x_var * y_var)
- return Series(result, index=x.index, name=x.name, copy=False)
-
- return self._apply_pairwise(
- self._selected_obj, other, pairwise, cov_func, numeric_only
- )
-
-
-class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow):
- """
- Provide an exponential moving window groupby implementation.
- """
-
- _attributes = ExponentialMovingWindow._attributes + BaseWindowGroupby._attributes
-
- def __init__(self, obj, *args, _grouper=None, **kwargs) -> None:
- super().__init__(obj, *args, _grouper=_grouper, **kwargs)
-
- if not obj.empty and self.times is not None:
- # sort the times and recalculate the deltas according to the groups
- groupby_order = np.concatenate(list(self._grouper.indices.values()))
- self._deltas = _calculate_deltas(
- self.times.take(groupby_order),
- self.halflife,
- )
-
- def _get_window_indexer(self) -> GroupbyIndexer:
- """
- Return an indexer class that will compute the window start and end bounds
-
- Returns
- -------
- GroupbyIndexer
- """
- window_indexer = GroupbyIndexer(
- groupby_indices=self._grouper.indices,
- window_indexer=ExponentialMovingWindowIndexer,
- )
- return window_indexer
-
-
-class OnlineExponentialMovingWindow(ExponentialMovingWindow):
- def __init__(
- self,
- obj: NDFrame,
- com: float | None = None,
- span: float | None = None,
- halflife: float | TimedeltaConvertibleTypes | None = None,
- alpha: float | None = None,
- min_periods: int | None = 0,
- adjust: bool = True,
- ignore_na: bool = False,
- axis: Axis = 0,
- times: np.ndarray | NDFrame | None = None,
- engine: str = "numba",
- engine_kwargs: dict[str, bool] | None = None,
- *,
- selection=None,
- ) -> None:
- if times is not None:
- raise NotImplementedError(
- "times is not implemented with online operations."
- )
- super().__init__(
- obj=obj,
- com=com,
- span=span,
- halflife=halflife,
- alpha=alpha,
- min_periods=min_periods,
- adjust=adjust,
- ignore_na=ignore_na,
- axis=axis,
- times=times,
- selection=selection,
- )
- self._mean = EWMMeanState(
- self._com, self.adjust, self.ignore_na, self.axis, obj.shape
- )
- if maybe_use_numba(engine):
- self.engine = engine
- self.engine_kwargs = engine_kwargs
- else:
- raise ValueError("'numba' is the only supported engine")
-
- def reset(self) -> None:
- """
- Reset the state captured by `update` calls.
- """
- self._mean.reset()
-
- def aggregate(self, func, *args, **kwargs):
- raise NotImplementedError("aggregate is not implemented.")
-
- def std(self, bias: bool = False, *args, **kwargs):
- raise NotImplementedError("std is not implemented.")
-
- def corr(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- numeric_only: bool = False,
- ):
- raise NotImplementedError("corr is not implemented.")
-
- def cov(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- bias: bool = False,
- numeric_only: bool = False,
- ):
- raise NotImplementedError("cov is not implemented.")
-
- def var(self, bias: bool = False, numeric_only: bool = False):
- raise NotImplementedError("var is not implemented.")
-
- def mean(self, *args, update=None, update_times=None, **kwargs):
- """
- Calculate an online exponentially weighted mean.
-
- Parameters
- ----------
-        update : DataFrame or Series, default None
- New values to continue calculating the
- exponentially weighted mean from the last values and weights.
- Values should be float64 dtype.
-
- ``update`` needs to be ``None`` the first time the
- exponentially weighted mean is calculated.
-
-        update_times : Series or 1-D np.ndarray, default None
- New times to continue calculating the
- exponentially weighted mean from the last values and weights.
- If ``None``, values are assumed to be evenly spaced
- in time.
- This feature is currently unsupported.
-
- Returns
- -------
- DataFrame or Series
-
- Examples
- --------
- >>> df = pd.DataFrame({"a": range(5), "b": range(5, 10)})
- >>> online_ewm = df.head(2).ewm(0.5).online()
- >>> online_ewm.mean()
- a b
- 0 0.00 5.00
- 1 0.75 5.75
- >>> online_ewm.mean(update=df.tail(3))
- a b
- 2 1.615385 6.615385
- 3 2.550000 7.550000
- 4 3.520661 8.520661
- >>> online_ewm.reset()
- >>> online_ewm.mean()
- a b
- 0 0.00 5.00
- 1 0.75 5.75
- """
- result_kwargs = {}
- is_frame = self._selected_obj.ndim == 2
- if update_times is not None:
- raise NotImplementedError("update_times is not implemented.")
- update_deltas = np.ones(
- max(self._selected_obj.shape[self.axis - 1] - 1, 0), dtype=np.float64
- )
- if update is not None:
- if self._mean.last_ewm is None:
- raise ValueError(
- "Must call mean with update=None first before passing update"
- )
- result_from = 1
- result_kwargs["index"] = update.index
- if is_frame:
- last_value = self._mean.last_ewm[np.newaxis, :]
- result_kwargs["columns"] = update.columns
- else:
- last_value = self._mean.last_ewm
- result_kwargs["name"] = update.name
- np_array = np.concatenate((last_value, update.to_numpy()))
- else:
- result_from = 0
- result_kwargs["index"] = self._selected_obj.index
- if is_frame:
- result_kwargs["columns"] = self._selected_obj.columns
- else:
- result_kwargs["name"] = self._selected_obj.name
- np_array = self._selected_obj.astype(np.float64).to_numpy()
- ewma_func = generate_online_numba_ewma_func(
- **get_jit_arguments(self.engine_kwargs)
- )
- result = self._mean.run_ewm(
- np_array if is_frame else np_array[:, np.newaxis],
- update_deltas,
- self.min_periods,
- ewma_func,
- )
- if not is_frame:
- result = result.squeeze()
- result = result[result_from:]
- result = self._selected_obj._constructor(result, **result_kwargs)
- return result
diff --git a/contrib/python/pandas/py3/pandas/core/window/expanding.py b/contrib/python/pandas/py3/pandas/core/window/expanding.py
deleted file mode 100644
index 6147f0f43c5..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/expanding.py
+++ /dev/null
@@ -1,816 +0,0 @@
-from __future__ import annotations
-
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
-)
-
-from pandas._typing import (
- Axis,
- QuantileInterpolation,
- WindowingRankType,
-)
-
-if TYPE_CHECKING:
- from pandas import DataFrame, Series
- from pandas.core.generic import NDFrame
-
-from pandas.util._decorators import doc
-
-from pandas.core.indexers.objects import (
- BaseIndexer,
- ExpandingIndexer,
- GroupbyIndexer,
-)
-from pandas.core.window.doc import (
- _shared_docs,
- create_section_header,
- kwargs_numeric_only,
- numba_notes,
- template_header,
- template_returns,
- template_see_also,
- window_agg_numba_parameters,
- window_apply_parameters,
-)
-from pandas.core.window.rolling import (
- BaseWindowGroupby,
- RollingAndExpandingMixin,
-)
-
-
-class Expanding(RollingAndExpandingMixin):
- """
- Provide expanding window calculations.
-
- Parameters
- ----------
- min_periods : int, default 1
- Minimum number of observations in window required to have a value;
- otherwise, result is ``np.nan``.
-
- axis : int or str, default 0
- If ``0`` or ``'index'``, roll across the rows.
-
- If ``1`` or ``'columns'``, roll across the columns.
-
- For `Series` this parameter is unused and defaults to 0.
-
- method : str {'single', 'table'}, default 'single'
- Execute the rolling operation per single column or row (``'single'``)
- or over the entire object (``'table'``).
-
- This argument is only implemented when specifying ``engine='numba'``
- in the method call.
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- ``Expanding`` subclass
-
- See Also
- --------
- rolling : Provides rolling window calculations.
- ewm : Provides exponential weighted functions.
-
- Notes
- -----
- See :ref:`Windowing Operations <window.expanding>` for further usage details
- and examples.
-
- Examples
- --------
- >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
- >>> df
- B
- 0 0.0
- 1 1.0
- 2 2.0
- 3 NaN
- 4 4.0
-
- **min_periods**
-
- Expanding sum with 1 vs 3 observations needed to calculate a value.
-
- >>> df.expanding(1).sum()
- B
- 0 0.0
- 1 1.0
- 2 3.0
- 3 3.0
- 4 7.0
- >>> df.expanding(3).sum()
- B
- 0 NaN
- 1 NaN
- 2 3.0
- 3 3.0
- 4 7.0
- """
-
- _attributes: list[str] = ["min_periods", "axis", "method"]
-
- def __init__(
- self,
- obj: NDFrame,
- min_periods: int = 1,
- axis: Axis = 0,
- method: str = "single",
- selection=None,
- ) -> None:
- super().__init__(
- obj=obj,
- min_periods=min_periods,
- axis=axis,
- method=method,
- selection=selection,
- )
-
- def _get_window_indexer(self) -> BaseIndexer:
- """
- Return an indexer class that will compute the window start and end bounds
- """
- return ExpandingIndexer()
-
- @doc(
- _shared_docs["aggregate"],
- see_also=dedent(
- """
- See Also
- --------
- pandas.DataFrame.aggregate : Similar DataFrame method.
- pandas.Series.aggregate : Similar Series method.
- """
- ),
- examples=dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
- >>> df
- A B C
- 0 1 4 7
- 1 2 5 8
- 2 3 6 9
-
-        >>> df.expanding().mean()
-             A    B    C
-        0  1.0  4.0  7.0
-        1  1.5  4.5  7.5
-        2  2.0  5.0  8.0
- """
- ),
- klass="Series/Dataframe",
- axis="",
- )
- def aggregate(self, func, *args, **kwargs):
- return super().aggregate(func, *args, **kwargs)
-
- agg = aggregate
-
- @doc(
- template_header,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="expanding",
- aggregation_description="count of non NaN observations",
- agg_method="count",
- )
- def count(self, numeric_only: bool = False):
- return super().count(numeric_only=numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- window_apply_parameters,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="expanding",
- aggregation_description="custom aggregation function",
- agg_method="apply",
- )
- def apply(
- self,
- func: Callable[..., Any],
- raw: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- args: tuple[Any, ...] | None = None,
- kwargs: dict[str, Any] | None = None,
- ):
- return super().apply(
- func,
- raw=raw,
- engine=engine,
- engine_kwargs=engine_kwargs,
- args=args,
- kwargs=kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes[:-1],
- window_method="expanding",
- aggregation_description="sum",
- agg_method="sum",
- )
- def sum(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().sum(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes[:-1],
- window_method="expanding",
- aggregation_description="maximum",
- agg_method="max",
- )
- def max(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().max(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes[:-1],
- window_method="expanding",
- aggregation_description="minimum",
- agg_method="min",
- )
- def min(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().min(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes[:-1],
- window_method="expanding",
- aggregation_description="mean",
- agg_method="mean",
- )
- def mean(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().mean(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes[:-1],
- window_method="expanding",
- aggregation_description="median",
- agg_method="median",
- )
- def median(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().median(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.\n
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- window_agg_numba_parameters("1.4"),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "numpy.std : Equivalent method for NumPy array.\n",
- template_see_also,
- create_section_header("Notes"),
- dedent(
- """
- The default ``ddof`` of 1 used in :meth:`Series.std` is different
- than the default ``ddof`` of 0 in :func:`numpy.std`.
-
- A minimum of one period is required for the calculation.\n
- """
- ).replace("\n", "", 1),
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
-
- >>> s.expanding(3).std()
- 0 NaN
- 1 NaN
- 2 0.577350
- 3 0.957427
- 4 0.894427
- 5 0.836660
- 6 0.786796
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="expanding",
- aggregation_description="standard deviation",
- agg_method="std",
- )
- def std(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().std(
- ddof=ddof,
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.\n
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- window_agg_numba_parameters("1.4"),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "numpy.var : Equivalent method for NumPy array.\n",
- template_see_also,
- create_section_header("Notes"),
- dedent(
- """
- The default ``ddof`` of 1 used in :meth:`Series.var` is different
- than the default ``ddof`` of 0 in :func:`numpy.var`.
-
- A minimum of one period is required for the calculation.\n
- """
- ).replace("\n", "", 1),
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
-
- >>> s.expanding(3).var()
- 0 NaN
- 1 NaN
- 2 0.333333
- 3 0.916667
- 4 0.800000
- 5 0.700000
- 6 0.619048
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="expanding",
- aggregation_description="variance",
- agg_method="var",
- )
- def var(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().var(
- ddof=ddof,
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.\n
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- "A minimum of one period is required for the calculation.\n\n",
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([0, 1, 2, 3])
-
- >>> s.expanding().sem()
- 0 NaN
- 1 0.707107
- 2 0.707107
- 3 0.745356
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="expanding",
- aggregation_description="standard error of mean",
- agg_method="sem",
- )
- def sem(self, ddof: int = 1, numeric_only: bool = False):
- return super().sem(ddof=ddof, numeric_only=numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "scipy.stats.skew : Third moment of a probability density.\n",
- template_see_also,
- create_section_header("Notes"),
- "A minimum of three periods is required for the rolling calculation.\n",
- window_method="expanding",
- aggregation_description="unbiased skewness",
- agg_method="skew",
- )
- def skew(self, numeric_only: bool = False):
- return super().skew(numeric_only=numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "scipy.stats.kurtosis : Reference SciPy method.\n",
- template_see_also,
- create_section_header("Notes"),
- "A minimum of four periods is required for the calculation.\n\n",
- create_section_header("Examples"),
- dedent(
- """
- The example below will show an expanding calculation with a minimum of
- four periods matching the equivalent function call using `scipy.stats`.
-
- >>> arr = [1, 2, 3, 4, 999]
- >>> import scipy.stats
- >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}")
- -1.200000
- >>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}")
- 4.999874
- >>> s = pd.Series(arr)
- >>> s.expanding(4).kurt()
- 0 NaN
- 1 NaN
- 2 NaN
- 3 -1.200000
- 4 4.999874
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="expanding",
- aggregation_description="Fisher's definition of kurtosis without bias",
- agg_method="kurt",
- )
- def kurt(self, numeric_only: bool = False):
- return super().kurt(numeric_only=numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- quantile : float
- Quantile to compute. 0 <= quantile <= 1.
- interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}}
- This optional parameter specifies the interpolation method to use,
- when the desired quantile lies between two data points `i` and `j`:
-
- * linear: `i + (j - i) * fraction`, where `fraction` is the
- fractional part of the index surrounded by `i` and `j`.
- * lower: `i`.
- * higher: `j`.
- * nearest: `i` or `j` whichever is nearest.
- * midpoint: (`i` + `j`) / 2.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="expanding",
- aggregation_description="quantile",
- agg_method="quantile",
- )
- def quantile(
- self,
- quantile: float,
- interpolation: QuantileInterpolation = "linear",
- numeric_only: bool = False,
- ):
- return super().quantile(
- quantile=quantile,
- interpolation=interpolation,
- numeric_only=numeric_only,
- )
-
- @doc(
- template_header,
- ".. versionadded:: 1.4.0 \n\n",
- create_section_header("Parameters"),
- dedent(
- """
- method : {{'average', 'min', 'max'}}, default 'average'
- How to rank the group of records that have the same value (i.e. ties):
-
- * average: average rank of the group
- * min: lowest rank in the group
- * max: highest rank in the group
-
- ascending : bool, default True
- Whether or not the elements should be ranked in ascending order.
- pct : bool, default False
- Whether or not to display the returned rankings in percentile
- form.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([1, 4, 2, 3, 5, 3])
- >>> s.expanding().rank()
- 0 1.0
- 1 2.0
- 2 2.0
- 3 3.0
- 4 5.0
- 5 3.5
- dtype: float64
-
- >>> s.expanding().rank(method="max")
- 0 1.0
- 1 2.0
- 2 2.0
- 3 3.0
- 4 5.0
- 5 4.0
- dtype: float64
-
- >>> s.expanding().rank(method="min")
- 0 1.0
- 1 2.0
- 2 2.0
- 3 3.0
- 4 5.0
- 5 3.0
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="expanding",
- aggregation_description="rank",
- agg_method="rank",
- )
- def rank(
- self,
- method: WindowingRankType = "average",
- ascending: bool = True,
- pct: bool = False,
- numeric_only: bool = False,
- ):
- return super().rank(
- method=method,
- ascending=ascending,
- pct=pct,
- numeric_only=numeric_only,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- other : Series or DataFrame, optional
- If not supplied, defaults to self and produces pairwise
- output.
- pairwise : bool, default None
- If False then only matching columns between self and other will be
- used and the output will be a DataFrame.
- If True then all pairwise combinations will be calculated and the
- output will be a MultiIndexed DataFrame in the case of DataFrame
- inputs. In the case of missing elements, only complete pairwise
- observations will be used.
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="expanding",
- aggregation_description="sample covariance",
- agg_method="cov",
- )
- def cov(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- ddof: int = 1,
- numeric_only: bool = False,
- ):
- return super().cov(
- other=other,
- pairwise=pairwise,
- ddof=ddof,
- numeric_only=numeric_only,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- other : Series or DataFrame, optional
- If not supplied, defaults to self and produces pairwise
- output.
- pairwise : bool, default None
- If False then only matching columns between self and other will be
- used and the output will be a DataFrame.
- If True then all pairwise combinations will be calculated and the
- output will be a MultiIndexed DataFrame in the case of DataFrame
- inputs. In the case of missing elements, only complete pairwise
- observations will be used.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- dedent(
- """
- cov : Similar method to calculate covariance.
- numpy.corrcoef : NumPy Pearson's correlation calculation.
- """
- ).replace("\n", "", 1),
- template_see_also,
- create_section_header("Notes"),
- dedent(
- """
- This function uses Pearson's definition of correlation
- (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).
-
- When `other` is not specified, the output will be self correlation (e.g.
- all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
- set to `True`.
-
- Function will return ``NaN`` for correlations of equal valued sequences;
- this is the result of a 0/0 division error.
-
- When `pairwise` is set to `False`, only matching columns between `self` and
- `other` will be used.
-
- When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
- with the original index on the first level, and the `other` DataFrame
- columns on the second level.
-
- In the case of missing elements, only complete pairwise observations
- will be used.
- """
- ).replace("\n", "", 1),
- window_method="expanding",
- aggregation_description="correlation",
- agg_method="corr",
- )
- def corr(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- ddof: int = 1,
- numeric_only: bool = False,
- ):
- return super().corr(
- other=other,
- pairwise=pairwise,
- ddof=ddof,
- numeric_only=numeric_only,
- )
-
-
-class ExpandingGroupby(BaseWindowGroupby, Expanding):
- """
- Provide an expanding groupby implementation.
- """
-
- _attributes = Expanding._attributes + BaseWindowGroupby._attributes
-
- def _get_window_indexer(self) -> GroupbyIndexer:
- """
- Return an indexer class that will compute the window start and end bounds
-
- Returns
- -------
- GroupbyIndexer
- """
- window_indexer = GroupbyIndexer(
- groupby_indices=self._grouper.indices,
- window_indexer=ExpandingIndexer,
- )
- return window_indexer
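
A minimal usage sketch of the expanding API implemented by the file removed above; these are plain pandas calls, and the frame and column names are illustrative (not taken from the diff):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})

    # Expanding aggregations use every observation seen so far;
    # min_periods sets how many non-NaN values are required for a result.
    print(df.expanding(min_periods=3).sum())

    # The groupby variant is routed through ExpandingGroupby above.
    g = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, 2.0, 3.0, 4.0]})
    print(g.groupby("key")["val"].expanding().mean())
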
diff --git a/contrib/python/pandas/py3/pandas/core/window/numba_.py b/contrib/python/pandas/py3/pandas/core/window/numba_.py
deleted file mode 100644
index 756f8e3a1cf..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/numba_.py
+++ /dev/null
@@ -1,349 +0,0 @@
-from __future__ import annotations
-
-import functools
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
-)
-
-import numpy as np
-
-from pandas._typing import Scalar
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.util.numba_ import jit_user_function
-
-
-@functools.lru_cache(maxsize=None)
-def generate_numba_apply_func(
- func: Callable[..., Scalar],
- nopython: bool,
- nogil: bool,
- parallel: bool,
-):
- """
- Generate a numba jitted apply function specified by values from engine_kwargs.
-
- 1. jit the user's function
- 2. Return a rolling apply function with the jitted function inline
-
- Configurations specified in engine_kwargs apply to both the user's
- function _AND_ the rolling apply function.
-
- Parameters
- ----------
- func : function
- function to be applied to each window and will be JITed
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
-
- Returns
- -------
- Numba function
- """
- numba_func = jit_user_function(func, nopython, nogil, parallel)
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def roll_apply(
- values: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- minimum_periods: int,
- *args: Any,
- ) -> np.ndarray:
- result = np.empty(len(begin))
- for i in numba.prange(len(result)):
- start = begin[i]
- stop = end[i]
- window = values[start:stop]
- count_nan = np.sum(np.isnan(window))
- if len(window) - count_nan >= minimum_periods:
- result[i] = numba_func(window, *args)
- else:
- result[i] = np.nan
- return result
-
- return roll_apply
-
-
-@functools.lru_cache(maxsize=None)
-def generate_numba_ewm_func(
- nopython: bool,
- nogil: bool,
- parallel: bool,
- com: float,
- adjust: bool,
- ignore_na: bool,
- deltas: tuple,
- normalize: bool,
-):
- """
- Generate a numba jitted ewm mean or sum function specified by values
- from engine_kwargs.
-
- Parameters
- ----------
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
- com : float
- adjust : bool
- ignore_na : bool
- deltas : tuple
- normalize : bool
-
- Returns
- -------
- Numba function
- """
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def ewm(
- values: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- minimum_periods: int,
- ) -> np.ndarray:
- result = np.empty(len(values))
- alpha = 1.0 / (1.0 + com)
- old_wt_factor = 1.0 - alpha
- new_wt = 1.0 if adjust else alpha
-
- for i in numba.prange(len(begin)):
- start = begin[i]
- stop = end[i]
- window = values[start:stop]
- sub_result = np.empty(len(window))
-
- weighted = window[0]
- nobs = int(not np.isnan(weighted))
- sub_result[0] = weighted if nobs >= minimum_periods else np.nan
- old_wt = 1.0
-
- for j in range(1, len(window)):
- cur = window[j]
- is_observation = not np.isnan(cur)
- nobs += is_observation
- if not np.isnan(weighted):
- if is_observation or not ignore_na:
- if normalize:
- # note that len(deltas) = len(vals) - 1 and deltas[i]
- # is to be used in conjunction with vals[i+1]
- old_wt *= old_wt_factor ** deltas[start + j - 1]
- else:
- weighted = old_wt_factor * weighted
- if is_observation:
- if normalize:
- # avoid numerical errors on constant series
- if weighted != cur:
- weighted = old_wt * weighted + new_wt * cur
- if normalize:
- weighted = weighted / (old_wt + new_wt)
- if adjust:
- old_wt += new_wt
- else:
- old_wt = 1.0
- else:
- weighted += cur
- elif is_observation:
- weighted = cur
-
- sub_result[j] = weighted if nobs >= minimum_periods else np.nan
-
- result[start:stop] = sub_result
-
- return result
-
- return ewm
-
-
-@functools.lru_cache(maxsize=None)
-def generate_numba_table_func(
- func: Callable[..., np.ndarray],
- nopython: bool,
- nogil: bool,
- parallel: bool,
-):
- """
- Generate a numba jitted function to apply window calculations table-wise.
-
- Func will be passed an M (window size) x N (number of columns) array and
- must return a 1 x N array. Func is intended to operate
- row-wise, but the result will be transposed for axis=1.
-
- 1. jit the user's function
- 2. Return a rolling apply function with the jitted function inline
-
- Parameters
- ----------
- func : function
- function to be applied to each window and will be JITed
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
-
- Returns
- -------
- Numba function
- """
- numba_func = jit_user_function(func, nopython, nogil, parallel)
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def roll_table(
- values: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- minimum_periods: int,
- *args: Any,
- ):
- result = np.empty((len(begin), values.shape[1]))
- min_periods_mask = np.empty(result.shape)
- for i in numba.prange(len(result)):
- start = begin[i]
- stop = end[i]
- window = values[start:stop]
- count_nan = np.sum(np.isnan(window), axis=0)
- sub_result = numba_func(window, *args)
- nan_mask = len(window) - count_nan >= minimum_periods
- min_periods_mask[i, :] = nan_mask
- result[i, :] = sub_result
- result = np.where(min_periods_mask, result, np.nan)
- return result
-
- return roll_table
-
-
-# This function will no longer be needed once numba supports
-# axis for all np.nan* agg functions
-# https://github.com/numba/numba/issues/1269
-@functools.lru_cache(maxsize=None)
-def generate_manual_numpy_nan_agg_with_axis(nan_func):
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=True, nogil=True, parallel=True)
- def nan_agg_with_axis(table):
- result = np.empty(table.shape[1])
- for i in numba.prange(table.shape[1]):
- partition = table[:, i]
- result[i] = nan_func(partition)
- return result
-
- return nan_agg_with_axis
-
-
-@functools.lru_cache(maxsize=None)
-def generate_numba_ewm_table_func(
- nopython: bool,
- nogil: bool,
- parallel: bool,
- com: float,
- adjust: bool,
- ignore_na: bool,
- deltas: tuple,
- normalize: bool,
-):
- """
- Generate a numba jitted ewm mean or sum function, applied table-wise, specified
- by values from engine_kwargs.
-
- Parameters
- ----------
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
- com : float
- adjust : bool
- ignore_na : bool
- deltas : tuple
- normalize : bool
-
- Returns
- -------
- Numba function
- """
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def ewm_table(
- values: np.ndarray,
- begin: np.ndarray,
- end: np.ndarray,
- minimum_periods: int,
- ) -> np.ndarray:
- alpha = 1.0 / (1.0 + com)
- old_wt_factor = 1.0 - alpha
- new_wt = 1.0 if adjust else alpha
- old_wt = np.ones(values.shape[1])
-
- result = np.empty(values.shape)
- weighted = values[0].copy()
- nobs = (~np.isnan(weighted)).astype(np.int64)
- result[0] = np.where(nobs >= minimum_periods, weighted, np.nan)
- for i in range(1, len(values)):
- cur = values[i]
- is_observations = ~np.isnan(cur)
- nobs += is_observations.astype(np.int64)
- for j in numba.prange(len(cur)):
- if not np.isnan(weighted[j]):
- if is_observations[j] or not ignore_na:
- if normalize:
- # note that len(deltas) = len(vals) - 1 and deltas[i]
- # is to be used in conjunction with vals[i+1]
- old_wt[j] *= old_wt_factor ** deltas[i - 1]
- else:
- weighted[j] = old_wt_factor * weighted[j]
- if is_observations[j]:
- if normalize:
- # avoid numerical errors on constant series
- if weighted[j] != cur[j]:
- weighted[j] = (
- old_wt[j] * weighted[j] + new_wt * cur[j]
- )
- if normalize:
- weighted[j] = weighted[j] / (old_wt[j] + new_wt)
- if adjust:
- old_wt[j] += new_wt
- else:
- old_wt[j] = 1.0
- else:
- weighted[j] += cur[j]
- elif is_observations[j]:
- weighted[j] = cur[j]
-
- result[i] = np.where(nobs >= minimum_periods, weighted, np.nan)
-
- return result
-
- return ewm_table
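
A plain-NumPy restatement of the normalized EWM mean recurrence that the jitted ``ewm`` kernel above implements, specialized to the NaN-free, evenly spaced case (``normalize=True``, ``ignore_na=False``); a sketch for cross-checking, not the library's implementation:

    import numpy as np
    import pandas as pd

    def ewm_mean_reference(values: np.ndarray, com: float, adjust: bool = True) -> np.ndarray:
        alpha = 1.0 / (1.0 + com)
        old_wt_factor = 1.0 - alpha
        new_wt = 1.0 if adjust else alpha
        out = np.empty(len(values))
        weighted = values[0]
        old_wt = 1.0
        out[0] = weighted
        for i in range(1, len(values)):
            cur = values[i]
            old_wt *= old_wt_factor
            if weighted != cur:  # skipping the update on constant runs avoids rounding noise
                weighted = (old_wt * weighted + new_wt * cur) / (old_wt + new_wt)
            old_wt = old_wt + new_wt if adjust else 1.0
            out[i] = weighted
        return out

    vals = np.array([1.0, 2.0, 3.0, 4.0])
    print(ewm_mean_reference(vals, com=0.5))
    # Expected to agree with pandas' own result for this simple input:
    print(pd.Series(vals).ewm(com=0.5, adjust=True).mean().to_numpy())
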
diff --git a/contrib/python/pandas/py3/pandas/core/window/online.py b/contrib/python/pandas/py3/pandas/core/window/online.py
deleted file mode 100644
index f9e3122b304..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/online.py
+++ /dev/null
@@ -1,118 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import numpy as np
-
-from pandas.compat._optional import import_optional_dependency
-
-
-def generate_online_numba_ewma_func(
- nopython: bool,
- nogil: bool,
- parallel: bool,
-):
- """
- Generate a numba jitted online ewma function specified by values
- from engine_kwargs.
-
- Parameters
- ----------
- nopython : bool
- nopython to be passed into numba.jit
- nogil : bool
- nogil to be passed into numba.jit
- parallel : bool
- parallel to be passed into numba.jit
-
- Returns
- -------
- Numba function
- """
- if TYPE_CHECKING:
- import numba
- else:
- numba = import_optional_dependency("numba")
-
- @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
- def online_ewma(
- values: np.ndarray,
- deltas: np.ndarray,
- minimum_periods: int,
- old_wt_factor: float,
- new_wt: float,
- old_wt: np.ndarray,
- adjust: bool,
- ignore_na: bool,
- ):
- """
- Compute online exponentially weighted mean per column over 2D values.
-
- Takes the first observation as is, then computes the subsequent
- exponentially weighted mean, accounting for minimum periods.
- """
- result = np.empty(values.shape)
- weighted_avg = values[0]
- nobs = (~np.isnan(weighted_avg)).astype(np.int64)
- result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
-
- for i in range(1, len(values)):
- cur = values[i]
- is_observations = ~np.isnan(cur)
- nobs += is_observations.astype(np.int64)
- for j in numba.prange(len(cur)):
- if not np.isnan(weighted_avg[j]):
- if is_observations[j] or not ignore_na:
- # note that len(deltas) = len(vals) - 1 and deltas[i] is to be
- # used in conjunction with vals[i+1]
- old_wt[j] *= old_wt_factor ** deltas[j - 1]
- if is_observations[j]:
- # avoid numerical errors on constant series
- if weighted_avg[j] != cur[j]:
- weighted_avg[j] = (
- (old_wt[j] * weighted_avg[j]) + (new_wt * cur[j])
- ) / (old_wt[j] + new_wt)
- if adjust:
- old_wt[j] += new_wt
- else:
- old_wt[j] = 1.0
- elif is_observations[j]:
- weighted_avg[j] = cur[j]
-
- result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)
-
- return result, old_wt
-
- return online_ewma
-
-
-class EWMMeanState:
- def __init__(self, com, adjust, ignore_na, axis, shape) -> None:
- alpha = 1.0 / (1.0 + com)
- self.axis = axis
- self.shape = shape
- self.adjust = adjust
- self.ignore_na = ignore_na
- self.new_wt = 1.0 if adjust else alpha
- self.old_wt_factor = 1.0 - alpha
- self.old_wt = np.ones(self.shape[self.axis - 1])
- self.last_ewm = None
-
- def run_ewm(self, weighted_avg, deltas, min_periods, ewm_func):
- result, old_wt = ewm_func(
- weighted_avg,
- deltas,
- min_periods,
- self.old_wt_factor,
- self.new_wt,
- self.old_wt,
- self.adjust,
- self.ignore_na,
- )
- self.old_wt = old_wt
- self.last_ewm = result[-1]
- return result
-
- def reset(self) -> None:
- self.old_wt = np.ones(self.shape[self.axis - 1])
- self.last_ewm = None
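
EWMMeanState above carries the running weights for the incremental ("online") EWM interface; a hedged usage sketch follows (the ``.online()`` accessor requires numba to be installed at runtime, and the frame here is illustrative):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": np.arange(5, dtype="float64")})

    # Initialize the online state from a first batch, then feed new rows in later.
    online_ewm = df.head(3).ewm(com=0.5).online()
    print(online_ewm.mean())                   # mean over the first batch
    print(online_ewm.mean(update=df.tail(2)))  # continues from the stored state
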
diff --git a/contrib/python/pandas/py3/pandas/core/window/rolling.py b/contrib/python/pandas/py3/pandas/core/window/rolling.py
deleted file mode 100644
index a0e4d003e45..00000000000
--- a/contrib/python/pandas/py3/pandas/core/window/rolling.py
+++ /dev/null
@@ -1,2744 +0,0 @@
-"""
-Provide a generic structure to support window functions,
-similar to how we have a Groupby object.
-"""
-from __future__ import annotations
-
-import copy
-from datetime import timedelta
-from functools import partial
-import inspect
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Iterator,
- Sized,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs import (
- BaseOffset,
- to_offset,
-)
-import pandas._libs.window.aggregations as window_aggregations
-from pandas._typing import (
- ArrayLike,
- Axis,
- NDFrameT,
- QuantileInterpolation,
- WindowingRankType,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import DataError
-from pandas.util._decorators import doc
-
-from pandas.core.dtypes.common import (
- ensure_float64,
- is_bool,
- is_integer,
- is_list_like,
- is_numeric_dtype,
- is_scalar,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import notna
-
-from pandas.core._numba import executor
-from pandas.core.algorithms import factorize
-from pandas.core.apply import ResamplerWindowApply
-from pandas.core.arrays import ExtensionArray
-from pandas.core.base import SelectionMixin
-import pandas.core.common as com
-from pandas.core.indexers.objects import (
- BaseIndexer,
- FixedWindowIndexer,
- GroupbyIndexer,
- VariableWindowIndexer,
-)
-from pandas.core.indexes.api import (
- DatetimeIndex,
- Index,
- MultiIndex,
- PeriodIndex,
- TimedeltaIndex,
-)
-from pandas.core.reshape.concat import concat
-from pandas.core.util.numba_ import (
- get_jit_arguments,
- maybe_use_numba,
-)
-from pandas.core.window.common import (
- flex_binary_moment,
- zsqrt,
-)
-from pandas.core.window.doc import (
- _shared_docs,
- create_section_header,
- kwargs_numeric_only,
- kwargs_scipy,
- numba_notes,
- template_header,
- template_returns,
- template_see_also,
- window_agg_numba_parameters,
- window_apply_parameters,
-)
-from pandas.core.window.numba_ import (
- generate_manual_numpy_nan_agg_with_axis,
- generate_numba_apply_func,
- generate_numba_table_func,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
- from pandas.core.generic import NDFrame
- from pandas.core.groupby.ops import BaseGrouper
-
-
-class BaseWindow(SelectionMixin):
- """Provides utilities for performing windowing operations."""
-
- _attributes: list[str] = []
- exclusions: frozenset[Hashable] = frozenset()
- _on: Index
-
- def __init__(
- self,
- obj: NDFrame,
- window=None,
- min_periods: int | None = None,
- center: bool | None = False,
- win_type: str | None = None,
- axis: Axis = 0,
- on: str | Index | None = None,
- closed: str | None = None,
- step: int | None = None,
- method: str = "single",
- *,
- selection=None,
- ) -> None:
- self.obj = obj
- self.on = on
- self.closed = closed
- self.step = step
- self.window = window
- self.min_periods = min_periods
- self.center = center
- self.win_type = win_type
- self.axis = obj._get_axis_number(axis) if axis is not None else None
- self.method = method
- self._win_freq_i8: int | None = None
- if self.on is None:
- if self.axis == 0:
- self._on = self.obj.index
- else:
- # i.e. self.axis == 1
- self._on = self.obj.columns
- elif isinstance(self.on, Index):
- self._on = self.on
- elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns:
- self._on = Index(self.obj[self.on])
- else:
- raise ValueError(
- f"invalid on specified as {self.on}, "
- "must be a column (of DataFrame), an Index or None"
- )
-
- self._selection = selection
- self._validate()
-
- def _validate(self) -> None:
- if self.center is not None and not is_bool(self.center):
- raise ValueError("center must be a boolean")
- if self.min_periods is not None:
- if not is_integer(self.min_periods):
- raise ValueError("min_periods must be an integer")
- if self.min_periods < 0:
- raise ValueError("min_periods must be >= 0")
- if is_integer(self.window) and self.min_periods > self.window:
- raise ValueError(
- f"min_periods {self.min_periods} must be <= window {self.window}"
- )
- if self.closed is not None and self.closed not in [
- "right",
- "both",
- "left",
- "neither",
- ]:
- raise ValueError("closed must be 'right', 'left', 'both' or 'neither'")
- if not isinstance(self.obj, (ABCSeries, ABCDataFrame)):
- raise TypeError(f"invalid type: {type(self)}")
- if isinstance(self.window, BaseIndexer):
- # Validate that the passed BaseIndexer subclass has
- # a get_window_bounds with the correct signature.
- get_window_bounds_signature = inspect.signature(
- self.window.get_window_bounds
- ).parameters.keys()
- expected_signature = inspect.signature(
- BaseIndexer().get_window_bounds
- ).parameters.keys()
- if get_window_bounds_signature != expected_signature:
- raise ValueError(
- f"{type(self.window).__name__} does not implement "
- f"the correct signature for get_window_bounds"
- )
- if self.method not in ["table", "single"]:
- raise ValueError("method must be 'table' or 'single'")
- if self.step is not None:
- if not is_integer(self.step):
- raise ValueError("step must be an integer")
- if self.step < 0:
- raise ValueError("step must be >= 0")
-
- def _check_window_bounds(
- self, start: np.ndarray, end: np.ndarray, num_vals: int
- ) -> None:
- if len(start) != len(end):
- raise ValueError(
- f"start ({len(start)}) and end ({len(end)}) bounds must be the "
- f"same length"
- )
- if len(start) != (num_vals + (self.step or 1) - 1) // (self.step or 1):
- raise ValueError(
- f"start and end bounds ({len(start)}) must be the same length "
- f"as the object ({num_vals}) divided by the step ({self.step}) "
- f"if given and rounded up"
- )
-
- def _slice_axis_for_step(self, index: Index, result: Sized | None = None) -> Index:
- """
- Slices the index for a given result and the preset step.
- """
- return (
- index
- if result is None or len(result) == len(index)
- else index[:: self.step]
- )
-
- def _validate_numeric_only(self, name: str, numeric_only: bool) -> None:
- """
- Validate numeric_only argument, raising if invalid for the input.
-
- Parameters
- ----------
- name : str
- Name of the operator (kernel).
- numeric_only : bool
- Value passed by user.
- """
- if (
- self._selected_obj.ndim == 1
- and numeric_only
- and not is_numeric_dtype(self._selected_obj.dtype)
- ):
- raise NotImplementedError(
- f"{type(self).__name__}.{name} does not implement numeric_only"
- )
-
- def _make_numeric_only(self, obj: NDFrameT) -> NDFrameT:
- """Subset DataFrame to numeric columns.
-
- Parameters
- ----------
- obj : DataFrame
-
- Returns
- -------
- obj subset to numeric-only columns.
- """
- result = obj.select_dtypes(include=["number"], exclude=["timedelta"])
- return result
-
- def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT:
- """
- Split data into blocks & return conformed data.
- """
- # filter out the on from the object
- if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2:
- obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False)
- if obj.ndim > 1 and (numeric_only or self.axis == 1):
- # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything
- # to float to calculate the complete row at once. We exclude all non-numeric
- # dtypes.
- obj = self._make_numeric_only(obj)
- if self.axis == 1:
- obj = obj.astype("float64", copy=False)
- obj._mgr = obj._mgr.consolidate()
- return obj
-
- def _gotitem(self, key, ndim, subset=None):
- """
- Sub-classes to define. Return a sliced object.
-
- Parameters
- ----------
- key : str / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- # create a new object to prevent aliasing
- if subset is None:
- subset = self.obj
-
- # we need to make a shallow copy of ourselves
- # with the same groupby
- kwargs = {attr: getattr(self, attr) for attr in self._attributes}
-
- selection = None
- if subset.ndim == 2 and (
- (is_scalar(key) and key in subset) or is_list_like(key)
- ):
- selection = key
- elif subset.ndim == 1 and is_scalar(key) and key == subset.name:
- selection = key
-
- new_win = type(self)(subset, selection=selection, **kwargs)
- return new_win
-
- def __getattr__(self, attr: str):
- if attr in self._internal_names_set:
- return object.__getattribute__(self, attr)
- if attr in self.obj:
- return self[attr]
-
- raise AttributeError(
- f"'{type(self).__name__}' object has no attribute '{attr}'"
- )
-
- def _dir_additions(self):
- return self.obj._dir_additions()
-
- def __repr__(self) -> str:
- """
- Provide a nice str repr of our rolling object.
- """
- attrs_list = (
- f"{attr_name}={getattr(self, attr_name)}"
- for attr_name in self._attributes
- if getattr(self, attr_name, None) is not None and attr_name[0] != "_"
- )
- attrs = ",".join(attrs_list)
- return f"{type(self).__name__} [{attrs}]"
-
- def __iter__(self) -> Iterator:
- obj = self._selected_obj.set_axis(self._on)
- obj = self._create_data(obj)
- indexer = self._get_window_indexer()
-
- start, end = indexer.get_window_bounds(
- num_values=len(obj),
- min_periods=self.min_periods,
- center=self.center,
- closed=self.closed,
- step=self.step,
- )
- self._check_window_bounds(start, end, len(obj))
-
- for s, e in zip(start, end):
- result = obj.iloc[slice(s, e)]
- yield result
-
- def _prep_values(self, values: ArrayLike) -> np.ndarray:
- """Convert input to numpy arrays for Cython routines"""
- if needs_i8_conversion(values.dtype):
- raise NotImplementedError(
- f"ops for {type(self).__name__} for this "
- f"dtype {values.dtype} are not implemented"
- )
- # GH #12373 : rolling functions error on float32 data
- # make sure the data is coerced to float64
- try:
- if isinstance(values, ExtensionArray):
- values = values.to_numpy(np.float64, na_value=np.nan)
- else:
- values = ensure_float64(values)
- except (ValueError, TypeError) as err:
- raise TypeError(f"cannot handle this type -> {values.dtype}") from err
-
- # Convert inf to nan for C funcs
- inf = np.isinf(values)
- if inf.any():
- values = np.where(inf, np.nan, values)
-
- return values
-
- def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None:
- # if we have an 'on' column we want to put it back into
- # the results in the same location
- from pandas import Series
-
- if self.on is not None and not self._on.equals(obj.index):
- name = self._on.name
- extra_col = Series(self._on, index=self.obj.index, name=name, copy=False)
- if name in result.columns:
- # TODO: sure we want to overwrite results?
- result[name] = extra_col
- elif name in result.index.names:
- pass
- elif name in self._selected_obj.columns:
- # insert in the same location as we had in _selected_obj
- old_cols = self._selected_obj.columns
- new_cols = result.columns
- old_loc = old_cols.get_loc(name)
- overlap = new_cols.intersection(old_cols[:old_loc])
- new_loc = len(overlap)
- result.insert(new_loc, name, extra_col)
- else:
- # insert at the end
- result[name] = extra_col
-
- @property
- def _index_array(self):
- # TODO: why do we get here with e.g. MultiIndex?
- if needs_i8_conversion(self._on.dtype):
- idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on)
- return idx.asi8
- return None
-
- def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame:
- """Validate and finalize result."""
- if out.shape[1] == 0 and obj.shape[1] > 0:
- raise DataError("No numeric types to aggregate")
- if out.shape[1] == 0:
- return obj.astype("float64")
-
- self._insert_on_column(out, obj)
- return out
-
- def _get_window_indexer(self) -> BaseIndexer:
- """
- Return an indexer class that will compute the window start and end bounds
- """
- if isinstance(self.window, BaseIndexer):
- return self.window
- if self._win_freq_i8 is not None:
- return VariableWindowIndexer(
- index_array=self._index_array,
- window_size=self._win_freq_i8,
- center=self.center,
- )
- return FixedWindowIndexer(window_size=self.window)
-
- def _apply_series(
- self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None
- ) -> Series:
- """
- Series version of _apply_blockwise
- """
- obj = self._create_data(self._selected_obj)
-
- if name == "count":
- # GH 12541: Special case for count where we support date-like types
- obj = notna(obj).astype(int)
- try:
- values = self._prep_values(obj._values)
- except (TypeError, NotImplementedError) as err:
- raise DataError("No numeric types to aggregate") from err
-
- result = homogeneous_func(values)
- index = self._slice_axis_for_step(obj.index, result)
- return obj._constructor(result, index=index, name=obj.name)
-
- def _apply_blockwise(
- self,
- homogeneous_func: Callable[..., ArrayLike],
- name: str,
- numeric_only: bool = False,
- ) -> DataFrame | Series:
- """
- Apply the given function to the DataFrame broken down into homogeneous
- sub-frames.
- """
- self._validate_numeric_only(name, numeric_only)
- if self._selected_obj.ndim == 1:
- return self._apply_series(homogeneous_func, name)
-
- obj = self._create_data(self._selected_obj, numeric_only)
- if name == "count":
- # GH 12541: Special case for count where we support date-like types
- obj = notna(obj).astype(int)
- obj._mgr = obj._mgr.consolidate()
-
- if self.axis == 1:
- obj = obj.T
-
- taker = []
- res_values = []
- for i, arr in enumerate(obj._iter_column_arrays()):
- # GH#42736 operate column-wise instead of block-wise
- # As of 2.0, hfunc will raise for nuisance columns
- try:
- arr = self._prep_values(arr)
- except (TypeError, NotImplementedError) as err:
- raise DataError(
- f"Cannot aggregate non-numeric type: {arr.dtype}"
- ) from err
- res = homogeneous_func(arr)
- res_values.append(res)
- taker.append(i)
-
- index = self._slice_axis_for_step(
- obj.index, res_values[0] if len(res_values) > 0 else None
- )
- df = type(obj)._from_arrays(
- res_values,
- index=index,
- columns=obj.columns.take(taker),
- verify_integrity=False,
- )
-
- if self.axis == 1:
- df = df.T
-
- return self._resolve_output(df, obj)
-
- def _apply_tablewise(
- self,
- homogeneous_func: Callable[..., ArrayLike],
- name: str | None = None,
- numeric_only: bool = False,
- ) -> DataFrame | Series:
- """
- Apply the given function to the DataFrame across the entire object
- """
- if self._selected_obj.ndim == 1:
- raise ValueError("method='table' not applicable for Series objects.")
- obj = self._create_data(self._selected_obj, numeric_only)
- values = self._prep_values(obj.to_numpy())
- values = values.T if self.axis == 1 else values
- result = homogeneous_func(values)
- result = result.T if self.axis == 1 else result
- index = self._slice_axis_for_step(obj.index, result)
- columns = (
- obj.columns
- if result.shape[1] == len(obj.columns)
- else obj.columns[:: self.step]
- )
- out = obj._constructor(result, index=index, columns=columns)
-
- return self._resolve_output(out, obj)
-
- def _apply_pairwise(
- self,
- target: DataFrame | Series,
- other: DataFrame | Series | None,
- pairwise: bool | None,
- func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series],
- numeric_only: bool,
- ) -> DataFrame | Series:
- """
- Apply the given pairwise function given 2 pandas objects (DataFrame/Series)
- """
- target = self._create_data(target, numeric_only)
- if other is None:
- other = target
- # only default unset
- pairwise = True if pairwise is None else pairwise
- elif not isinstance(other, (ABCDataFrame, ABCSeries)):
- raise ValueError("other must be a DataFrame or Series")
- elif other.ndim == 2 and numeric_only:
- other = self._make_numeric_only(other)
-
- return flex_binary_moment(target, other, func, pairwise=bool(pairwise))
-
- def _apply(
- self,
- func: Callable[..., Any],
- name: str,
- numeric_only: bool = False,
- numba_args: tuple[Any, ...] = (),
- **kwargs,
- ):
- """
- Rolling statistical measure using supplied function.
-
- Designed to be used with passed-in Cython array-based functions.
-
- Parameters
- ----------
- func : callable function to apply
- name : str,
- numba_args : tuple
- args to be passed when func is a numba func
- **kwargs
- additional arguments for rolling function and window function
-
- Returns
- -------
- y : type of input
- """
- window_indexer = self._get_window_indexer()
- min_periods = (
- self.min_periods
- if self.min_periods is not None
- else window_indexer.window_size
- )
-
- def homogeneous_func(values: np.ndarray):
- # calculation function
-
- if values.size == 0:
- return values.copy()
-
- def calc(x):
- start, end = window_indexer.get_window_bounds(
- num_values=len(x),
- min_periods=min_periods,
- center=self.center,
- closed=self.closed,
- step=self.step,
- )
- self._check_window_bounds(start, end, len(x))
-
- return func(x, start, end, min_periods, *numba_args)
-
- with np.errstate(all="ignore"):
- result = calc(values)
-
- return result
-
- if self.method == "single":
- return self._apply_blockwise(homogeneous_func, name, numeric_only)
- else:
- return self._apply_tablewise(homogeneous_func, name, numeric_only)
-
- def _numba_apply(
- self,
- func: Callable[..., Any],
- engine_kwargs: dict[str, bool] | None = None,
- *func_args,
- ):
- window_indexer = self._get_window_indexer()
- min_periods = (
- self.min_periods
- if self.min_periods is not None
- else window_indexer.window_size
- )
- obj = self._create_data(self._selected_obj)
- if self.axis == 1:
- obj = obj.T
- values = self._prep_values(obj.to_numpy())
- if values.ndim == 1:
- values = values.reshape(-1, 1)
- start, end = window_indexer.get_window_bounds(
- num_values=len(values),
- min_periods=min_periods,
- center=self.center,
- closed=self.closed,
- step=self.step,
- )
- self._check_window_bounds(start, end, len(values))
- aggregator = executor.generate_shared_aggregator(
- func, **get_jit_arguments(engine_kwargs)
- )
- result = aggregator(values, start, end, min_periods, *func_args)
- result = result.T if self.axis == 1 else result
- index = self._slice_axis_for_step(obj.index, result)
- if obj.ndim == 1:
- result = result.squeeze()
- out = obj._constructor(result, index=index, name=obj.name)
- return out
- else:
- columns = self._slice_axis_for_step(obj.columns, result.T)
- out = obj._constructor(result, index=index, columns=columns)
- return self._resolve_output(out, obj)
-
- def aggregate(self, func, *args, **kwargs):
- result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
- if result is None:
- return self.apply(func, raw=False, args=args, kwargs=kwargs)
- return result
-
- agg = aggregate
-
-
-class BaseWindowGroupby(BaseWindow):
- """
- Provide the groupby windowing facilities.
- """
-
- _grouper: BaseGrouper
- _as_index: bool
- _attributes: list[str] = ["_grouper"]
-
- def __init__(
- self,
- obj: DataFrame | Series,
- *args,
- _grouper: BaseGrouper,
- _as_index: bool = True,
- **kwargs,
- ) -> None:
- from pandas.core.groupby.ops import BaseGrouper
-
- if not isinstance(_grouper, BaseGrouper):
- raise ValueError("Must pass a BaseGrouper object.")
- self._grouper = _grouper
- self._as_index = _as_index
- # GH 32262: It's convention to keep the grouping column in
- # groupby.<agg_func>, but unexpected to users in
- # groupby.rolling.<agg_func>
- obj = obj.drop(columns=self._grouper.names, errors="ignore")
- # GH 15354
- if kwargs.get("step") is not None:
- raise NotImplementedError("step not implemented for groupby")
- super().__init__(obj, *args, **kwargs)
-
- def _apply(
- self,
- func: Callable[..., Any],
- name: str,
- numeric_only: bool = False,
- numba_args: tuple[Any, ...] = (),
- **kwargs,
- ) -> DataFrame | Series:
- result = super()._apply(
- func,
- name,
- numeric_only,
- numba_args,
- **kwargs,
- )
- # Reconstruct the resulting MultiIndex
- # 1st set of levels = group by labels
- # 2nd set of levels = original DataFrame/Series index
- grouped_object_index = self.obj.index
- grouped_index_name = [*grouped_object_index.names]
- groupby_keys = copy.copy(self._grouper.names)
- result_index_names = groupby_keys + grouped_index_name
-
- drop_columns = [
- key
- for key in self._grouper.names
- if key not in self.obj.index.names or key is None
- ]
-
- if len(drop_columns) != len(groupby_keys):
- # Our result will have still kept the column in the result
- result = result.drop(columns=drop_columns, errors="ignore")
-
- codes = self._grouper.codes
- levels = copy.copy(self._grouper.levels)
-
- group_indices = self._grouper.indices.values()
- if group_indices:
- indexer = np.concatenate(list(group_indices))
- else:
- indexer = np.array([], dtype=np.intp)
- codes = [c.take(indexer) for c in codes]
-
- # if the index of the original dataframe needs to be preserved, append
- # this index (but reordered) to the codes/levels from the groupby
- if grouped_object_index is not None:
- idx = grouped_object_index.take(indexer)
- if not isinstance(idx, MultiIndex):
- idx = MultiIndex.from_arrays([idx])
- codes.extend(list(idx.codes))
- levels.extend(list(idx.levels))
-
- result_index = MultiIndex(
- levels, codes, names=result_index_names, verify_integrity=False
- )
-
- result.index = result_index
- if not self._as_index:
- result = result.reset_index(level=list(range(len(groupby_keys))))
- return result
-
- def _apply_pairwise(
- self,
- target: DataFrame | Series,
- other: DataFrame | Series | None,
- pairwise: bool | None,
- func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series],
- numeric_only: bool,
- ) -> DataFrame | Series:
- """
- Apply the given pairwise function given 2 pandas objects (DataFrame/Series)
- """
- # Manually drop the grouping column first
- target = target.drop(columns=self._grouper.names, errors="ignore")
- result = super()._apply_pairwise(target, other, pairwise, func, numeric_only)
- # 1) Determine the levels + codes of the groupby levels
- if other is not None and not all(
- len(group) == len(other) for group in self._grouper.indices.values()
- ):
- # GH 42915
- # len(other) != len(any group), so must reindex (expand) the result
- # from flex_binary_moment to a "transform"-like result
- # per groupby combination
- old_result_len = len(result)
- result = concat(
- [
- result.take(gb_indices).reindex(result.index)
- for gb_indices in self._grouper.indices.values()
- ]
- )
-
- gb_pairs = (
- com.maybe_make_list(pair) for pair in self._grouper.indices.keys()
- )
- groupby_codes = []
- groupby_levels = []
- # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]]
- for gb_level_pair in map(list, zip(*gb_pairs)):
- labels = np.repeat(np.array(gb_level_pair), old_result_len)
- codes, levels = factorize(labels)
- groupby_codes.append(codes)
- groupby_levels.append(levels)
- else:
- # pairwise=True or len(other) == len(each group), so repeat
- # the groupby labels by the number of columns in the original object
- groupby_codes = self._grouper.codes
- # error: Incompatible types in assignment (expression has type
- # "List[Index]", variable has type "List[Union[ndarray, Index]]")
- groupby_levels = self._grouper.levels # type: ignore[assignment]
-
- group_indices = self._grouper.indices.values()
- if group_indices:
- indexer = np.concatenate(list(group_indices))
- else:
- indexer = np.array([], dtype=np.intp)
-
- if target.ndim == 1:
- repeat_by = 1
- else:
- repeat_by = len(target.columns)
- groupby_codes = [
- np.repeat(c.take(indexer), repeat_by) for c in groupby_codes
- ]
- # 2) Determine the levels + codes of the result from super()._apply_pairwise
- if isinstance(result.index, MultiIndex):
- result_codes = list(result.index.codes)
- result_levels = list(result.index.levels)
- result_names = list(result.index.names)
- else:
- idx_codes, idx_levels = factorize(result.index)
- result_codes = [idx_codes]
- result_levels = [idx_levels]
- result_names = [result.index.name]
-
- # 3) Create the resulting index by combining 1) + 2)
- result_codes = groupby_codes + result_codes
- result_levels = groupby_levels + result_levels
- result_names = self._grouper.names + result_names
-
- result_index = MultiIndex(
- result_levels, result_codes, names=result_names, verify_integrity=False
- )
- result.index = result_index
- return result
-
- def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT:
- """
- Split data into blocks & return conformed data.
- """
- # Ensure the object we're rolling over is monotonically sorted relative
- # to the groups
- # GH 36197
- if not obj.empty:
- groupby_order = np.concatenate(list(self._grouper.indices.values())).astype(
- np.int64
- )
- obj = obj.take(groupby_order)
- return super()._create_data(obj, numeric_only)
-
- def _gotitem(self, key, ndim, subset=None):
- # we are setting the index on the actual object
- # here so our index is carried through to the selected obj
- # when we do the splitting for the groupby
- if self.on is not None:
- # GH 43355
- subset = self.obj.set_index(self._on)
- return super()._gotitem(key, ndim, subset=subset)
-
-
-class Window(BaseWindow):
- """
- Provide rolling window calculations.
-
- Parameters
- ----------
- window : int, timedelta, str, offset, or BaseIndexer subclass
- Size of the moving window.
-
- If an integer, the fixed number of observations used for
- each window.
-
- If a timedelta, str, or offset, the time period of each window. Each
- window will be variable-sized, based on the observations included in
- the time period. This is only valid for datetimelike indexes.
- To learn more about the offsets & frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
-
- If a BaseIndexer subclass, the window boundaries
- based on the defined ``get_window_bounds`` method. Additional rolling
- keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
- ``step`` will be passed to ``get_window_bounds``.
-
- min_periods : int, default None
- Minimum number of observations in window required to have a value;
- otherwise, result is ``np.nan``.
-
- For a window that is specified by an offset, ``min_periods`` will default to 1.
-
- For a window that is specified by an integer, ``min_periods`` will default
- to the size of the window.
-
- center : bool, default False
- If False, set the window labels as the right edge of the window index.
-
- If True, set the window labels as the center of the window index.
-
- win_type : str, default None
- If ``None``, all points are evenly weighted.
-
- If a string, it must be a valid `scipy.signal window function
- <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
-
- Certain Scipy window types require additional parameters to be passed
- in the aggregation function. The additional parameters must match
- the keywords specified in the Scipy window type method signature.
-
- on : str, optional
- For a DataFrame, a column label or Index level on which
- to calculate the rolling window, rather than the DataFrame's index.
-
- A provided integer column is ignored and excluded from the result, since
- an integer index is not used to calculate the rolling window.
-
- axis : int or str, default 0
- If ``0`` or ``'index'``, roll across the rows.
-
- If ``1`` or ``'columns'``, roll across the columns.
-
- For `Series` this parameter is unused and defaults to 0.
-
- closed : str, default None
- If ``'right'``, the first point in the window is excluded from calculations.
-
- If ``'left'``, the last point in the window is excluded from calculations.
-
- If ``'both'``, no points in the window are excluded from calculations.
-
- If ``'neither'``, the first and last points in the window are excluded
- from calculations.
-
- Default ``None`` (``'right'``).
-
- .. versionchanged:: 1.2.0
-
- The closed parameter with fixed windows is now supported.
-
- step : int, default None
-
- .. versionadded:: 1.5.0
-
- Evaluate the window at every ``step`` result, equivalent to slicing as
- ``[::step]``. ``window`` must be an integer. Using a step argument other
- than None or 1 will produce a result with a different shape than the input.
-
- method : str {'single', 'table'}, default 'single'
-
- .. versionadded:: 1.3.0
-
- Execute the rolling operation per single column or row (``'single'``)
- or over the entire object (``'table'``).
-
- This argument is only implemented when specifying ``engine='numba'``
- in the method call.
-
- Returns
- -------
- ``Window`` subclass if a ``win_type`` is passed
-
- ``Rolling`` subclass if ``win_type`` is not passed
-
- See Also
- --------
- expanding : Provides expanding transformations.
- ewm : Provides exponential weighted functions.
-
- Notes
- -----
- See :ref:`Windowing Operations <window.generic>` for further usage details
- and examples.
-
- Examples
- --------
- >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]})
- >>> df
- B
- 0 0.0
- 1 1.0
- 2 2.0
- 3 NaN
- 4 4.0
-
- **window**
-
- Rolling sum with a window length of 2 observations.
-
- >>> df.rolling(2).sum()
- B
- 0 NaN
- 1 1.0
- 2 3.0
- 3 NaN
- 4 NaN
-
- Rolling sum with a window span of 2 seconds.
-
- >>> df_time = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
- ... index = [pd.Timestamp('20130101 09:00:00'),
- ... pd.Timestamp('20130101 09:00:02'),
- ... pd.Timestamp('20130101 09:00:03'),
- ... pd.Timestamp('20130101 09:00:05'),
- ... pd.Timestamp('20130101 09:00:06')])
-
- >>> df_time
- B
- 2013-01-01 09:00:00 0.0
- 2013-01-01 09:00:02 1.0
- 2013-01-01 09:00:03 2.0
- 2013-01-01 09:00:05 NaN
- 2013-01-01 09:00:06 4.0
-
- >>> df_time.rolling('2s').sum()
- B
- 2013-01-01 09:00:00 0.0
- 2013-01-01 09:00:02 1.0
- 2013-01-01 09:00:03 3.0
- 2013-01-01 09:00:05 NaN
- 2013-01-01 09:00:06 4.0
-
- Rolling sum with forward looking windows with 2 observations.
-
- >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
- >>> df.rolling(window=indexer, min_periods=1).sum()
- B
- 0 1.0
- 1 3.0
- 2 2.0
- 3 4.0
- 4 4.0
-
- **min_periods**
-
- Rolling sum with a window length of 2 observations, but only needs a minimum of 1
- observation to calculate a value.
-
- >>> df.rolling(2, min_periods=1).sum()
- B
- 0 0.0
- 1 1.0
- 2 3.0
- 3 2.0
- 4 4.0
-
- **center**
-
- Rolling sum with the result assigned to the center of the window index.
-
- >>> df.rolling(3, min_periods=1, center=True).sum()
- B
- 0 1.0
- 1 3.0
- 2 3.0
- 3 6.0
- 4 4.0
-
- >>> df.rolling(3, min_periods=1, center=False).sum()
- B
- 0 0.0
- 1 1.0
- 2 3.0
- 3 3.0
- 4 6.0
-
- **step**
-
- Rolling sum with a window length of 2 observations, minimum of 1 observation to
- calculate a value, and a step of 2.
-
- >>> df.rolling(2, min_periods=1, step=2).sum()
- B
- 0 0.0
- 2 3.0
- 4 4.0
-
- **win_type**
-
- Rolling sum with a window length of 2, using the Scipy ``'gaussian'``
- window type. ``std`` is required in the aggregation function.
-
- >>> df.rolling(2, win_type='gaussian').sum(std=3)
- B
- 0 NaN
- 1 0.986207
- 2 2.958621
- 3 NaN
- 4 NaN
-
- **on**
-
- Rolling sum with a window length of 2 days.
-
- >>> df = pd.DataFrame({
- ... 'A': [pd.to_datetime('2020-01-01'),
- ... pd.to_datetime('2020-01-01'),
- ... pd.to_datetime('2020-01-02'),],
- ... 'B': [1, 2, 3], },
- ... index=pd.date_range('2020', periods=3))
-
- >>> df
- A B
- 2020-01-01 2020-01-01 1
- 2020-01-02 2020-01-01 2
- 2020-01-03 2020-01-02 3
-
- >>> df.rolling('2D', on='A').sum()
- A B
- 2020-01-01 2020-01-01 1.0
- 2020-01-02 2020-01-01 3.0
- 2020-01-03 2020-01-02 6.0
- """
-
- _attributes = [
- "window",
- "min_periods",
- "center",
- "win_type",
- "axis",
- "on",
- "closed",
- "step",
- "method",
- ]
-
- def _validate(self):
- super()._validate()
-
- if not isinstance(self.win_type, str):
- raise ValueError(f"Invalid win_type {self.win_type}")
- signal = import_optional_dependency(
- "scipy.signal.windows", extra="Scipy is required to generate window weight."
- )
- self._scipy_weight_generator = getattr(signal, self.win_type, None)
- if self._scipy_weight_generator is None:
- raise ValueError(f"Invalid win_type {self.win_type}")
-
- if isinstance(self.window, BaseIndexer):
- raise NotImplementedError(
- "BaseIndexer subclasses not implemented with win_types."
- )
- if not is_integer(self.window) or self.window < 0:
- raise ValueError("window must be an integer 0 or greater")
-
- if self.method != "single":
- raise NotImplementedError("'single' is the only supported method type.")
-
- def _center_window(self, result: np.ndarray, offset: int) -> np.ndarray:
- """
- Center the result in the window for weighted rolling aggregations.
- """
- if offset > 0:
- lead_indexer = [slice(offset, None)]
- result = np.copy(result[tuple(lead_indexer)])
- return result
-
- def _apply(
- self,
- func: Callable[[np.ndarray, int, int], np.ndarray],
- name: str,
- numeric_only: bool = False,
- numba_args: tuple[Any, ...] = (),
- **kwargs,
- ):
- """
- Rolling with weights statistical measure using supplied function.
-
- Designed to be used with passed-in Cython array-based functions.
-
- Parameters
- ----------
- func : callable function to apply
- name : str,
- numeric_only : bool, default False
- Whether to only operate on bool, int, and float columns
- numba_args : tuple
- unused
- **kwargs
- additional arguments for scipy windows if necessary
-
- Returns
- -------
- y : type of input
- """
- # "None" not callable [misc]
- window = self._scipy_weight_generator( # type: ignore[misc]
- self.window, **kwargs
- )
- offset = (len(window) - 1) // 2 if self.center else 0
-
- def homogeneous_func(values: np.ndarray):
- # calculation function
-
- if values.size == 0:
- return values.copy()
-
- def calc(x):
- additional_nans = np.array([np.nan] * offset)
- x = np.concatenate((x, additional_nans))
- return func(x, window, self.min_periods or len(window))
-
- with np.errstate(all="ignore"):
- # Our weighted aggregations return memoryviews
- result = np.asarray(calc(values))
-
- if self.center:
- result = self._center_window(result, offset)
-
- return result
-
- return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step]
-
- @doc(
- _shared_docs["aggregate"],
- see_also=dedent(
- """
- See Also
- --------
- pandas.DataFrame.aggregate : Similar DataFrame method.
- pandas.Series.aggregate : Similar Series method.
- """
- ),
- examples=dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
- >>> df
- A B C
- 0 1 4 7
- 1 2 5 8
- 2 3 6 9
-
- >>> df.rolling(2, win_type="boxcar").agg("mean")
- A B C
- 0 NaN NaN NaN
- 1 1.5 4.5 7.5
- 2 2.5 5.5 8.5
- """
- ),
- klass="Series/DataFrame",
- axis="",
- )
- def aggregate(self, func, *args, **kwargs):
- result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
- if result is None:
- # these must apply directly
- result = func(self)
-
- return result
-
- agg = aggregate
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- kwargs_scipy,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="rolling",
- aggregation_description="weighted window sum",
- agg_method="sum",
- )
- def sum(self, numeric_only: bool = False, **kwargs):
- window_func = window_aggregations.roll_weighted_sum
- # error: Argument 1 to "_apply" of "Window" has incompatible type
- # "Callable[[ndarray, ndarray, int], ndarray]"; expected
- # "Callable[[ndarray, int, int], ndarray]"
- return self._apply(
- window_func, # type: ignore[arg-type]
- name="sum",
- numeric_only=numeric_only,
- **kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- kwargs_scipy,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="rolling",
- aggregation_description="weighted window mean",
- agg_method="mean",
- )
- def mean(self, numeric_only: bool = False, **kwargs):
- window_func = window_aggregations.roll_weighted_mean
- # error: Argument 1 to "_apply" of "Window" has incompatible type
- # "Callable[[ndarray, ndarray, int], ndarray]"; expected
- # "Callable[[ndarray, int, int], ndarray]"
- return self._apply(
- window_func, # type: ignore[arg-type]
- name="mean",
- numeric_only=numeric_only,
- **kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- kwargs_scipy,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="rolling",
- aggregation_description="weighted window variance",
- agg_method="var",
- )
- def var(self, ddof: int = 1, numeric_only: bool = False, **kwargs):
- window_func = partial(window_aggregations.roll_weighted_var, ddof=ddof)
- kwargs.pop("name", None)
- return self._apply(window_func, name="var", numeric_only=numeric_only, **kwargs)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- kwargs_scipy,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="rolling",
- aggregation_description="weighted window standard deviation",
- agg_method="std",
- )
- def std(self, ddof: int = 1, numeric_only: bool = False, **kwargs):
- return zsqrt(
- self.var(ddof=ddof, name="std", numeric_only=numeric_only, **kwargs)
- )
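
A minimal sketch (not part of the original file, scipy assumed installed) of what the weighted ``Window`` path above computes: the weights come from ``scipy.signal.windows`` and each output value is the dot product of those weights with the raw window values.

import numpy as np
from scipy.signal.windows import gaussian

values = np.array([0.0, 1.0, 2.0, np.nan, 4.0])
weights = gaussian(2, std=3)           # same weights used for win_type='gaussian'
manual = np.dot(weights, values[1:3])  # window covering rows 1 and 2
# manual is ~2.958621, matching df.rolling(2, win_type='gaussian').sum(std=3) at row 2
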
-
-
-class RollingAndExpandingMixin(BaseWindow):
- def count(self, numeric_only: bool = False):
- window_func = window_aggregations.roll_sum
- return self._apply(window_func, name="count", numeric_only=numeric_only)
-
- def apply(
- self,
- func: Callable[..., Any],
- raw: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- args: tuple[Any, ...] | None = None,
- kwargs: dict[str, Any] | None = None,
- ):
- if args is None:
- args = ()
- if kwargs is None:
- kwargs = {}
-
- if not is_bool(raw):
- raise ValueError("raw parameter must be `True` or `False`")
-
- numba_args: tuple[Any, ...] = ()
- if maybe_use_numba(engine):
- if raw is False:
- raise ValueError("raw must be `True` when using the numba engine")
- numba_args = args
- if self.method == "single":
- apply_func = generate_numba_apply_func(
- func, **get_jit_arguments(engine_kwargs, kwargs)
- )
- else:
- apply_func = generate_numba_table_func(
- func, **get_jit_arguments(engine_kwargs, kwargs)
- )
- elif engine in ("cython", None):
- if engine_kwargs is not None:
- raise ValueError("cython engine does not accept engine_kwargs")
- apply_func = self._generate_cython_apply_func(args, kwargs, raw, func)
- else:
- raise ValueError("engine must be either 'numba' or 'cython'")
-
- return self._apply(
- apply_func,
- name="apply",
- numba_args=numba_args,
- )
-
- def _generate_cython_apply_func(
- self,
- args: tuple[Any, ...],
- kwargs: dict[str, Any],
- raw: bool,
- function: Callable[..., Any],
- ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, int], np.ndarray]:
- from pandas import Series
-
- window_func = partial(
- window_aggregations.roll_apply,
- args=args,
- kwargs=kwargs,
- raw=raw,
- function=function,
- )
-
- def apply_func(values, begin, end, min_periods, raw=raw):
- if not raw:
- # GH 45912
- values = Series(values, index=self._on, copy=False)
- return window_func(values, begin, end, min_periods)
-
- return apply_func
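
A brief illustration (not taken from the source above) of the ``raw`` switch handled by ``_generate_cython_apply_func``: with ``raw=False`` each window is wrapped in a Series before the user function runs, while with ``raw=True`` it stays a plain ndarray.

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])
s.rolling(2).apply(lambda x: x.iloc[-1], raw=False)  # x is a Series, so .iloc works
s.rolling(2).apply(lambda x: x[-1], raw=True)        # x is a numpy array
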
-
- def sum(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- if self.method == "table":
- func = generate_manual_numpy_nan_agg_with_axis(np.nansum)
- return self.apply(
- func,
- raw=True,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
- else:
- from pandas.core._numba.kernels import sliding_sum
-
- return self._numba_apply(sliding_sum, engine_kwargs)
- window_func = window_aggregations.roll_sum
- return self._apply(window_func, name="sum", numeric_only=numeric_only)
-
- def max(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- if self.method == "table":
- func = generate_manual_numpy_nan_agg_with_axis(np.nanmax)
- return self.apply(
- func,
- raw=True,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
- else:
- from pandas.core._numba.kernels import sliding_min_max
-
- return self._numba_apply(sliding_min_max, engine_kwargs, True)
- window_func = window_aggregations.roll_max
- return self._apply(window_func, name="max", numeric_only=numeric_only)
-
- def min(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- if self.method == "table":
- func = generate_manual_numpy_nan_agg_with_axis(np.nanmin)
- return self.apply(
- func,
- raw=True,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
- else:
- from pandas.core._numba.kernels import sliding_min_max
-
- return self._numba_apply(sliding_min_max, engine_kwargs, False)
- window_func = window_aggregations.roll_min
- return self._apply(window_func, name="min", numeric_only=numeric_only)
-
- def mean(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- if self.method == "table":
- func = generate_manual_numpy_nan_agg_with_axis(np.nanmean)
- return self.apply(
- func,
- raw=True,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
- else:
- from pandas.core._numba.kernels import sliding_mean
-
- return self._numba_apply(sliding_mean, engine_kwargs)
- window_func = window_aggregations.roll_mean
- return self._apply(window_func, name="mean", numeric_only=numeric_only)
-
- def median(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- if self.method == "table":
- func = generate_manual_numpy_nan_agg_with_axis(np.nanmedian)
- else:
- func = np.nanmedian
-
- return self.apply(
- func,
- raw=True,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
- window_func = window_aggregations.roll_median_c
- return self._apply(window_func, name="median", numeric_only=numeric_only)
-
- def std(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- if self.method == "table":
- raise NotImplementedError("std not supported with method='table'")
- from pandas.core._numba.kernels import sliding_var
-
- return zsqrt(self._numba_apply(sliding_var, engine_kwargs, ddof))
- window_func = window_aggregations.roll_var
-
- def zsqrt_func(values, begin, end, min_periods):
- return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof))
-
- return self._apply(
- zsqrt_func,
- name="std",
- numeric_only=numeric_only,
- )
-
- def var(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- if self.method == "table":
- raise NotImplementedError("var not supported with method='table'")
- from pandas.core._numba.kernels import sliding_var
-
- return self._numba_apply(sliding_var, engine_kwargs, ddof)
- window_func = partial(window_aggregations.roll_var, ddof=ddof)
- return self._apply(
- window_func,
- name="var",
- numeric_only=numeric_only,
- )
-
- def skew(self, numeric_only: bool = False):
- window_func = window_aggregations.roll_skew
- return self._apply(
- window_func,
- name="skew",
- numeric_only=numeric_only,
- )
-
- def sem(self, ddof: int = 1, numeric_only: bool = False):
- # Raise here so error message says sem instead of std
- self._validate_numeric_only("sem", numeric_only)
- return self.std(numeric_only=numeric_only) / (
- self.count(numeric_only=numeric_only) - ddof
- ).pow(0.5)
-
- def kurt(self, numeric_only: bool = False):
- window_func = window_aggregations.roll_kurt
- return self._apply(
- window_func,
- name="kurt",
- numeric_only=numeric_only,
- )
-
- def quantile(
- self,
- quantile: float,
- interpolation: QuantileInterpolation = "linear",
- numeric_only: bool = False,
- ):
- if quantile == 1.0:
- window_func = window_aggregations.roll_max
- elif quantile == 0.0:
- window_func = window_aggregations.roll_min
- else:
- window_func = partial(
- window_aggregations.roll_quantile,
- quantile=quantile,
- interpolation=interpolation,
- )
-
- return self._apply(window_func, name="quantile", numeric_only=numeric_only)
-
- def rank(
- self,
- method: WindowingRankType = "average",
- ascending: bool = True,
- pct: bool = False,
- numeric_only: bool = False,
- ):
- window_func = partial(
- window_aggregations.roll_rank,
- method=method,
- ascending=ascending,
- percentile=pct,
- )
-
- return self._apply(window_func, name="rank", numeric_only=numeric_only)
-
- def cov(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- ddof: int = 1,
- numeric_only: bool = False,
- ):
- if self.step is not None:
- raise NotImplementedError("step not implemented for cov")
- self._validate_numeric_only("cov", numeric_only)
-
- from pandas import Series
-
- def cov_func(x, y):
- x_array = self._prep_values(x)
- y_array = self._prep_values(y)
- window_indexer = self._get_window_indexer()
- min_periods = (
- self.min_periods
- if self.min_periods is not None
- else window_indexer.window_size
- )
- start, end = window_indexer.get_window_bounds(
- num_values=len(x_array),
- min_periods=min_periods,
- center=self.center,
- closed=self.closed,
- step=self.step,
- )
- self._check_window_bounds(start, end, len(x_array))
-
- with np.errstate(all="ignore"):
- mean_x_y = window_aggregations.roll_mean(
- x_array * y_array, start, end, min_periods
- )
- mean_x = window_aggregations.roll_mean(x_array, start, end, min_periods)
- mean_y = window_aggregations.roll_mean(y_array, start, end, min_periods)
- count_x_y = window_aggregations.roll_sum(
- notna(x_array + y_array).astype(np.float64), start, end, 0
- )
- result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof))
- return Series(result, index=x.index, name=x.name, copy=False)
-
- return self._apply_pairwise(
- self._selected_obj, other, pairwise, cov_func, numeric_only
- )
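
A small numpy check, illustrative only, of the covariance formula used in ``cov_func`` above: ``cov = (E[x*y] - E[x]*E[y]) * n / (n - ddof)``.

import numpy as np

x = np.array([1.0, 2.0, 4.0])
y = np.array([1.0, 3.0, 9.0])
n = len(x)
manual = (np.mean(x * y) - np.mean(x) * np.mean(y)) * (n / (n - 1))
assert np.isclose(manual, np.cov(x, y, ddof=1)[0, 1])  # both give 19/3
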
-
- def corr(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- ddof: int = 1,
- numeric_only: bool = False,
- ):
- if self.step is not None:
- raise NotImplementedError("step not implemented for corr")
- self._validate_numeric_only("corr", numeric_only)
-
- from pandas import Series
-
- def corr_func(x, y):
- x_array = self._prep_values(x)
- y_array = self._prep_values(y)
- window_indexer = self._get_window_indexer()
- min_periods = (
- self.min_periods
- if self.min_periods is not None
- else window_indexer.window_size
- )
- start, end = window_indexer.get_window_bounds(
- num_values=len(x_array),
- min_periods=min_periods,
- center=self.center,
- closed=self.closed,
- step=self.step,
- )
- self._check_window_bounds(start, end, len(x_array))
-
- with np.errstate(all="ignore"):
- mean_x_y = window_aggregations.roll_mean(
- x_array * y_array, start, end, min_periods
- )
- mean_x = window_aggregations.roll_mean(x_array, start, end, min_periods)
- mean_y = window_aggregations.roll_mean(y_array, start, end, min_periods)
- count_x_y = window_aggregations.roll_sum(
- notna(x_array + y_array).astype(np.float64), start, end, 0
- )
- x_var = window_aggregations.roll_var(
- x_array, start, end, min_periods, ddof
- )
- y_var = window_aggregations.roll_var(
- y_array, start, end, min_periods, ddof
- )
- numerator = (mean_x_y - mean_x * mean_y) * (
- count_x_y / (count_x_y - ddof)
- )
- denominator = (x_var * y_var) ** 0.5
- result = numerator / denominator
- return Series(result, index=x.index, name=x.name, copy=False)
-
- return self._apply_pairwise(
- self._selected_obj, other, pairwise, corr_func, numeric_only
- )
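
Likewise, an illustrative check of ``corr_func``: the same covariance numerator divided by ``sqrt(var(x) * var(y))`` reproduces Pearson's r.

import numpy as np

x = np.array([3.0, 3.0, 3.0, 5.0])
y = np.array([3.0, 4.0, 4.0, 4.0])
n = len(x)
num = (np.mean(x * y) - np.mean(x) * np.mean(y)) * (n / (n - 1))
den = (np.var(x, ddof=1) * np.var(y, ddof=1)) ** 0.5
assert np.isclose(num / den, np.corrcoef(x, y)[0, 1])  # 0.333333...
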
-
-
-class Rolling(RollingAndExpandingMixin):
- _attributes: list[str] = [
- "window",
- "min_periods",
- "center",
- "win_type",
- "axis",
- "on",
- "closed",
- "step",
- "method",
- ]
-
- def _validate(self):
- super()._validate()
-
- # we allow rolling on a datetimelike index
- if (
- self.obj.empty
- or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex))
- ) and isinstance(self.window, (str, BaseOffset, timedelta)):
- self._validate_datetimelike_monotonic()
-
- # this will raise ValueError on non-fixed freqs
- try:
- freq = to_offset(self.window)
- except (TypeError, ValueError) as err:
- raise ValueError(
- f"passed window {self.window} is not "
- "compatible with a datetimelike index"
- ) from err
- if isinstance(self._on, PeriodIndex):
- # error: Incompatible types in assignment (expression has type
- # "float", variable has type "Optional[int]")
- self._win_freq_i8 = freq.nanos / ( # type: ignore[assignment]
- self._on.freq.nanos / self._on.freq.n
- )
- else:
- self._win_freq_i8 = freq.nanos
-
- # min_periods must be an integer
- if self.min_periods is None:
- self.min_periods = 1
-
- if self.step is not None:
- raise NotImplementedError(
- "step is not supported with frequency windows"
- )
-
- elif isinstance(self.window, BaseIndexer):
- # Passed BaseIndexer subclass should handle all other rolling kwargs
- pass
- elif not is_integer(self.window) or self.window < 0:
- raise ValueError("window must be an integer 0 or greater")
-
- def _validate_datetimelike_monotonic(self) -> None:
- """
- Validate self._on is monotonic (increasing or decreasing) and has
- no NaT values for frequency windows.
- """
- if self._on.hasnans:
- self._raise_monotonic_error("values must not have NaT")
- if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing):
- self._raise_monotonic_error("values must be monotonic")
-
- def _raise_monotonic_error(self, msg: str):
- on = self.on
- if on is None:
- if self.axis == 0:
- on = "index"
- else:
- on = "column"
- raise ValueError(f"{on} {msg}")
-
- @doc(
- _shared_docs["aggregate"],
- see_also=dedent(
- """
- See Also
- --------
- pandas.Series.rolling : Calling object with Series data.
- pandas.DataFrame.rolling : Calling object with DataFrame data.
- """
- ),
- examples=dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
- >>> df
- A B C
- 0 1 4 7
- 1 2 5 8
- 2 3 6 9
-
- >>> df.rolling(2).sum()
- A B C
- 0 NaN NaN NaN
- 1 3.0 9.0 15.0
- 2 5.0 11.0 17.0
-
- >>> df.rolling(2).agg({"A": "sum", "B": "min"})
- A B
- 0 NaN NaN
- 1 3.0 4.0
- 2 5.0 5.0
- """
- ),
- klass="Series/Dataframe",
- axis="",
- )
- def aggregate(self, func, *args, **kwargs):
- return super().aggregate(func, *args, **kwargs)
-
- agg = aggregate
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([2, 3, np.nan, 10])
- >>> s.rolling(2).count()
- 0 NaN
- 1 2.0
- 2 1.0
- 3 1.0
- dtype: float64
- >>> s.rolling(3).count()
- 0 NaN
- 1 NaN
- 2 2.0
- 3 2.0
- dtype: float64
- >>> s.rolling(4).count()
- 0 NaN
- 1 NaN
- 2 NaN
- 3 3.0
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="count of non NaN observations",
- agg_method="count",
- )
- def count(self, numeric_only: bool = False):
- return super().count(numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- window_apply_parameters,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="rolling",
- aggregation_description="custom aggregation function",
- agg_method="apply",
- )
- def apply(
- self,
- func: Callable[..., Any],
- raw: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- args: tuple[Any, ...] | None = None,
- kwargs: dict[str, Any] | None = None,
- ):
- return super().apply(
- func,
- raw=raw,
- engine=engine,
- engine_kwargs=engine_kwargs,
- args=args,
- kwargs=kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes,
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([1, 2, 3, 4, 5])
- >>> s
- 0 1
- 1 2
- 2 3
- 3 4
- 4 5
- dtype: int64
-
- >>> s.rolling(3).sum()
- 0 NaN
- 1 NaN
- 2 6.0
- 3 9.0
- 4 12.0
- dtype: float64
-
- >>> s.rolling(3, center=True).sum()
- 0 NaN
- 1 6.0
- 2 9.0
- 3 12.0
- 4 NaN
- dtype: float64
-
- For DataFrame, each sum is computed column-wise.
-
- >>> df = pd.DataFrame({{"A": s, "B": s ** 2}})
- >>> df
- A B
- 0 1 1
- 1 2 4
- 2 3 9
- 3 4 16
- 4 5 25
-
- >>> df.rolling(3).sum()
- A B
- 0 NaN NaN
- 1 NaN NaN
- 2 6.0 14.0
- 3 9.0 29.0
- 4 12.0 50.0
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="sum",
- agg_method="sum",
- )
- def sum(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().sum(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes[:-1],
- window_method="rolling",
- aggregation_description="maximum",
- agg_method="max",
- )
- def max(
- self,
- numeric_only: bool = False,
- *args,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- **kwargs,
- ):
- return super().max(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes,
- create_section_header("Examples"),
- dedent(
- """
- Performing a rolling minimum with a window size of 3.
-
- >>> s = pd.Series([4, 3, 5, 2, 6])
- >>> s.rolling(3).min()
- 0 NaN
- 1 NaN
- 2 3.0
- 3 2.0
- 4 2.0
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="minimum",
- agg_method="min",
- )
- def min(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().min(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes,
- create_section_header("Examples"),
- dedent(
- """
- The below examples will show rolling mean calculations with window sizes of
- two and three, respectively.
-
- >>> s = pd.Series([1, 2, 3, 4])
- >>> s.rolling(2).mean()
- 0 NaN
- 1 1.5
- 2 2.5
- 3 3.5
- dtype: float64
-
- >>> s.rolling(3).mean()
- 0 NaN
- 1 NaN
- 2 2.0
- 3 3.0
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="mean",
- agg_method="mean",
- )
- def mean(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().mean(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- window_agg_numba_parameters(),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- numba_notes,
- create_section_header("Examples"),
- dedent(
- """
- Compute the rolling median of a series with a window size of 3.
-
- >>> s = pd.Series([0, 1, 2, 3, 4])
- >>> s.rolling(3).median()
- 0 NaN
- 1 NaN
- 2 1.0
- 3 2.0
- 4 3.0
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="median",
- agg_method="median",
- )
- def median(
- self,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().median(
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- window_agg_numba_parameters("1.4"),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "numpy.std : Equivalent method for NumPy array.\n",
- template_see_also,
- create_section_header("Notes"),
- dedent(
- """
- The default ``ddof`` of 1 used in :meth:`Series.std` is different
- than the default ``ddof`` of 0 in :func:`numpy.std`.
-
- A minimum of one period is required for the rolling calculation.\n
- """
- ).replace("\n", "", 1),
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
- >>> s.rolling(3).std()
- 0 NaN
- 1 NaN
- 2 0.577350
- 3 1.000000
- 4 1.000000
- 5 1.154701
- 6 0.000000
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="standard deviation",
- agg_method="std",
- )
- def std(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().std(
- ddof=ddof,
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- window_agg_numba_parameters("1.4"),
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "numpy.var : Equivalent method for NumPy array.\n",
- template_see_also,
- create_section_header("Notes"),
- dedent(
- """
- The default ``ddof`` of 1 used in :meth:`Series.var` is different
- than the default ``ddof`` of 0 in :func:`numpy.var`.
-
- A minimum of one period is required for the rolling calculation.\n
- """
- ).replace("\n", "", 1),
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
- >>> s.rolling(3).var()
- 0 NaN
- 1 NaN
- 2 0.333333
- 3 1.000000
- 4 1.000000
- 5 1.333333
- 6 0.000000
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="variance",
- agg_method="var",
- )
- def var(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- return super().var(
- ddof=ddof,
- numeric_only=numeric_only,
- engine=engine,
- engine_kwargs=engine_kwargs,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "scipy.stats.skew : Third moment of a probability density.\n",
- template_see_also,
- create_section_header("Notes"),
- "A minimum of three periods is required for the rolling calculation.\n",
- window_method="rolling",
- aggregation_description="unbiased skewness",
- agg_method="skew",
- )
- def skew(self, numeric_only: bool = False):
- return super().skew(numeric_only=numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Notes"),
- "A minimum of one period is required for the calculation.\n\n",
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([0, 1, 2, 3])
- >>> s.rolling(2, min_periods=1).sem()
- 0 NaN
- 1 0.707107
- 2 0.707107
- 3 0.707107
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="standard error of mean",
- agg_method="sem",
- )
- def sem(self, ddof: int = 1, numeric_only: bool = False):
- # Raise here so error message says sem instead of std
- self._validate_numeric_only("sem", numeric_only)
- return self.std(numeric_only=numeric_only) / (
- self.count(numeric_only) - ddof
- ).pow(0.5)
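
An illustrative check, not part of the source, of the formula in ``sem`` above: the rolling standard deviation (``ddof=1``) divided by ``sqrt(count - ddof)``.

import numpy as np

window = np.array([0.0, 1.0])
sem = np.std(window, ddof=1) / np.sqrt(len(window) - 1)
# sem == 0.7071..., matching the rolling(2, min_periods=1).sem() docstring example
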
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- "scipy.stats.kurtosis : Reference SciPy method.\n",
- template_see_also,
- create_section_header("Notes"),
- "A minimum of four periods is required for the calculation.\n\n",
- create_section_header("Examples"),
- dedent(
- """
- The example below will show a rolling calculation with a window size of
- four matching the equivalent function call using `scipy.stats`.
-
- >>> arr = [1, 2, 3, 4, 999]
- >>> import scipy.stats
- >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}")
- -1.200000
- >>> print(f"{{scipy.stats.kurtosis(arr[1:], bias=False):.6f}}")
- 3.999946
- >>> s = pd.Series(arr)
- >>> s.rolling(4).kurt()
- 0 NaN
- 1 NaN
- 2 NaN
- 3 -1.200000
- 4 3.999946
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="Fisher's definition of kurtosis without bias",
- agg_method="kurt",
- )
- def kurt(self, numeric_only: bool = False):
- return super().kurt(numeric_only=numeric_only)
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- quantile : float
- Quantile to compute. 0 <= quantile <= 1.
- interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}}
- This optional parameter specifies the interpolation method to use,
- when the desired quantile lies between two data points `i` and `j`:
-
- * linear: `i + (j - i) * fraction`, where `fraction` is the
- fractional part of the index surrounded by `i` and `j`.
- * lower: `i`.
- * higher: `j`.
- * nearest: `i` or `j` whichever is nearest.
- * midpoint: (`i` + `j`) / 2.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([1, 2, 3, 4])
- >>> s.rolling(2).quantile(.4, interpolation='lower')
- 0 NaN
- 1 1.0
- 2 2.0
- 3 3.0
- dtype: float64
-
- >>> s.rolling(2).quantile(.4, interpolation='midpoint')
- 0 NaN
- 1 1.5
- 2 2.5
- 3 3.5
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="quantile",
- agg_method="quantile",
- )
- def quantile(
- self,
- quantile: float,
- interpolation: QuantileInterpolation = "linear",
- numeric_only: bool = False,
- ):
- return super().quantile(
- quantile=quantile,
- interpolation=interpolation,
- numeric_only=numeric_only,
- )
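
Illustrative only: the interpolation options documented above behave like NumPy's quantile methods (the ``method`` keyword assumes NumPy >= 1.22).

import numpy as np

window = np.array([1.0, 2.0])
np.quantile(window, 0.4, method="linear")    # 1.4 -> i + (j - i) * fraction
np.quantile(window, 0.4, method="lower")     # 1.0
np.quantile(window, 0.4, method="midpoint")  # 1.5
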
-
- @doc(
- template_header,
- ".. versionadded:: 1.4.0 \n\n",
- create_section_header("Parameters"),
- dedent(
- """
- method : {{'average', 'min', 'max'}}, default 'average'
- How to rank the group of records that have the same value (i.e. ties):
-
- * average: average rank of the group
- * min: lowest rank in the group
- * max: highest rank in the group
-
- ascending : bool, default True
- Whether or not the elements should be ranked in ascending order.
- pct : bool, default False
- Whether or not to display the returned rankings in percentile
- form.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also,
- create_section_header("Examples"),
- dedent(
- """
- >>> s = pd.Series([1, 4, 2, 3, 5, 3])
- >>> s.rolling(3).rank()
- 0 NaN
- 1 NaN
- 2 2.0
- 3 2.0
- 4 3.0
- 5 1.5
- dtype: float64
-
- >>> s.rolling(3).rank(method="max")
- 0 NaN
- 1 NaN
- 2 2.0
- 3 2.0
- 4 3.0
- 5 2.0
- dtype: float64
-
- >>> s.rolling(3).rank(method="min")
- 0 NaN
- 1 NaN
- 2 2.0
- 3 2.0
- 4 3.0
- 5 1.0
- dtype: float64
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="rank",
- agg_method="rank",
- )
- def rank(
- self,
- method: WindowingRankType = "average",
- ascending: bool = True,
- pct: bool = False,
- numeric_only: bool = False,
- ):
- return super().rank(
- method=method,
- ascending=ascending,
- pct=pct,
- numeric_only=numeric_only,
- )
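
Illustrative sketch: within a single window, the tie-handling options mirror ``scipy.stats.rankdata`` (scipy assumed available).

from scipy.stats import rankdata

window = [3.0, 5.0, 3.0]                # last window of the docstring example above
rankdata(window, method="average")[-1]  # 1.5
rankdata(window, method="max")[-1]      # 2.0
rankdata(window, method="min")[-1]      # 1.0
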
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- other : Series or DataFrame, optional
- If not supplied then will default to self and produce pairwise
- output.
- pairwise : bool, default None
- If False then only matching columns between self and other will be
- used and the output will be a DataFrame.
- If True then all pairwise combinations will be calculated and the
- output will be a MultiIndexed DataFrame in the case of DataFrame
- inputs. In the case of missing elements, only complete pairwise
- observations will be used.
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- template_see_also[:-1],
- window_method="rolling",
- aggregation_description="sample covariance",
- agg_method="cov",
- )
- def cov(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- ddof: int = 1,
- numeric_only: bool = False,
- ):
- return super().cov(
- other=other,
- pairwise=pairwise,
- ddof=ddof,
- numeric_only=numeric_only,
- )
-
- @doc(
- template_header,
- create_section_header("Parameters"),
- dedent(
- """
- other : Series or DataFrame, optional
- If not supplied then will default to self and produce pairwise
- output.
- pairwise : bool, default None
- If False then only matching columns between self and other will be
- used and the output will be a DataFrame.
- If True then all pairwise combinations will be calculated and the
- output will be a MultiIndexed DataFrame in the case of DataFrame
- inputs. In the case of missing elements, only complete pairwise
- observations will be used.
- ddof : int, default 1
- Delta Degrees of Freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
- """
- ).replace("\n", "", 1),
- kwargs_numeric_only,
- create_section_header("Returns"),
- template_returns,
- create_section_header("See Also"),
- dedent(
- """
- cov : Similar method to calculate covariance.
- numpy.corrcoef : NumPy Pearson's correlation calculation.
- """
- ).replace("\n", "", 1),
- template_see_also,
- create_section_header("Notes"),
- dedent(
- """
- This function uses Pearson's definition of correlation
- (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).
-
- When `other` is not specified, the output will be self correlation (e.g.
- all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
- set to `True`.
-
- Function will return ``NaN`` for correlations of equal valued sequences;
- this is the result of a 0/0 division error.
-
- When `pairwise` is set to `False`, only matching columns between `self` and
- `other` will be used.
-
- When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
- with the original index on the first level, and the `other` DataFrame
- columns on the second level.
-
- In the case of missing elements, only complete pairwise observations
- will be used.\n
- """
- ).replace("\n", "", 1),
- create_section_header("Examples"),
- dedent(
- """
- The below example shows a rolling calculation with a window size of
- four matching the equivalent function call using :meth:`numpy.corrcoef`.
-
- >>> v1 = [3, 3, 3, 5, 8]
- >>> v2 = [3, 4, 4, 4, 8]
- >>> # numpy returns a 2X2 array, the correlation coefficient
- >>> # is the number at entry [0][1]
- >>> print(f"{{np.corrcoef(v1[:-1], v2[:-1])[0][1]:.6f}}")
- 0.333333
- >>> print(f"{{np.corrcoef(v1[1:], v2[1:])[0][1]:.6f}}")
- 0.916949
- >>> s1 = pd.Series(v1)
- >>> s2 = pd.Series(v2)
- >>> s1.rolling(4).corr(s2)
- 0 NaN
- 1 NaN
- 2 NaN
- 3 0.333333
- 4 0.916949
- dtype: float64
-
- The below example shows a similar rolling calculation on a
- DataFrame using the pairwise option.
-
- >>> matrix = np.array([[51., 35.], [49., 30.], [47., 32.],\
- [46., 31.], [50., 36.]])
- >>> print(np.corrcoef(matrix[:-1,0], matrix[:-1,1]).round(7))
- [[1. 0.6263001]
- [0.6263001 1. ]]
- >>> print(np.corrcoef(matrix[1:,0], matrix[1:,1]).round(7))
- [[1. 0.5553681]
- [0.5553681 1. ]]
- >>> df = pd.DataFrame(matrix, columns=['X','Y'])
- >>> df
- X Y
- 0 51.0 35.0
- 1 49.0 30.0
- 2 47.0 32.0
- 3 46.0 31.0
- 4 50.0 36.0
- >>> df.rolling(4).corr(pairwise=True)
- X Y
- 0 X NaN NaN
- Y NaN NaN
- 1 X NaN NaN
- Y NaN NaN
- 2 X NaN NaN
- Y NaN NaN
- 3 X 1.000000 0.626300
- Y 0.626300 1.000000
- 4 X 1.000000 0.555368
- Y 0.555368 1.000000
- """
- ).replace("\n", "", 1),
- window_method="rolling",
- aggregation_description="correlation",
- agg_method="corr",
- )
- def corr(
- self,
- other: DataFrame | Series | None = None,
- pairwise: bool | None = None,
- ddof: int = 1,
- numeric_only: bool = False,
- ):
- return super().corr(
- other=other,
- pairwise=pairwise,
- ddof=ddof,
- numeric_only=numeric_only,
- )
-
-
-Rolling.__doc__ = Window.__doc__
-
-
-class RollingGroupby(BaseWindowGroupby, Rolling):
- """
- Provide a rolling groupby implementation.
- """
-
- _attributes = Rolling._attributes + BaseWindowGroupby._attributes
-
- def _get_window_indexer(self) -> GroupbyIndexer:
- """
- Return an indexer class that will compute the window start and end bounds
-
- Returns
- -------
- GroupbyIndexer
- """
- rolling_indexer: type[BaseIndexer]
- indexer_kwargs: dict[str, Any] | None = None
- index_array = self._index_array
- if isinstance(self.window, BaseIndexer):
- rolling_indexer = type(self.window)
- indexer_kwargs = self.window.__dict__.copy()
- assert isinstance(indexer_kwargs, dict) # for mypy
- # We'll be using the index of each group later
- indexer_kwargs.pop("index_array", None)
- window = self.window
- elif self._win_freq_i8 is not None:
- rolling_indexer = VariableWindowIndexer
- # error: Incompatible types in assignment (expression has type
- # "int", variable has type "BaseIndexer")
- window = self._win_freq_i8 # type: ignore[assignment]
- else:
- rolling_indexer = FixedWindowIndexer
- window = self.window
- window_indexer = GroupbyIndexer(
- index_array=index_array,
- window_size=window,
- groupby_indices=self._grouper.indices,
- window_indexer=rolling_indexer,
- indexer_kwargs=indexer_kwargs,
- )
- return window_indexer
-
- def _validate_datetimelike_monotonic(self):
- """
- Validate that each group in self._on is monotonic
- """
- # GH 46061
- if self._on.hasnans:
- self._raise_monotonic_error("values must not have NaT")
- for group_indices in self._grouper.indices.values():
- group_on = self._on.take(group_indices)
- if not (
- group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing
- ):
- on = "index" if self.on is None else self.on
- raise ValueError(
- f"Each group within {on} must be monotonic. "
- f"Sort the values in {on} first."
- )
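
A minimal usage sketch (the column names are illustrative, not from the file above) for ``RollingGroupby`` with a frequency window; each group's ``on`` column must be monotonic, as enforced by ``_validate_datetimelike_monotonic``.

import pandas as pd

df = pd.DataFrame({
    "g": ["a", "a", "b", "b"],
    "t": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-01", "2020-01-03"]),
    "v": [1.0, 2.0, 3.0, 4.0],
})
out = df.groupby("g").rolling("2D", on="t").sum()  # '2D' window evaluated per group
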
diff --git a/contrib/python/pandas/py3/pandas/errors/__init__.py b/contrib/python/pandas/py3/pandas/errors/__init__.py
deleted file mode 100644
index e9c0047711f..00000000000
--- a/contrib/python/pandas/py3/pandas/errors/__init__.py
+++ /dev/null
@@ -1,637 +0,0 @@
-"""
-Expose public exceptions & warnings
-"""
-from __future__ import annotations
-
-import ctypes
-
-from pandas._config.config import OptionError
-
-from pandas._libs.tslibs import (
- OutOfBoundsDatetime,
- OutOfBoundsTimedelta,
-)
-
-from pandas.util.version import InvalidVersion
-
-
-class IntCastingNaNError(ValueError):
- """
- Exception raised when converting (``astype``) an array with NaN to an integer type.
- """
-
-
-class NullFrequencyError(ValueError):
- """
- Exception raised when a ``freq`` cannot be null.
-
- Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``,
- ``PeriodIndex.shift``.
- """
-
-
-class PerformanceWarning(Warning):
- """
- Warning raised when there is a possible performance impact.
- """
-
-
-class UnsupportedFunctionCall(ValueError):
- """
- Exception raised when attempting to call an unsupported numpy function.
-
- For example, ``np.cumsum(groupby_object)``.
- """
-
-
-class UnsortedIndexError(KeyError):
- """
- Error raised when slicing a MultiIndex which has not been lexsorted.
-
- Subclass of `KeyError`.
- """
-
-
-class ParserError(ValueError):
- """
- Exception that is raised by an error encountered in parsing file contents.
-
- This is a generic error raised for errors encountered when functions like
- `read_csv` or `read_html` are parsing contents of a file.
-
- See Also
- --------
- read_csv : Read CSV (comma-separated) file into a DataFrame.
- read_html : Read HTML table into a DataFrame.
- """
-
-
-class DtypeWarning(Warning):
- """
- Warning raised when reading different dtypes in a column from a file.
-
- Raised for a dtype incompatibility. This can happen whenever `read_csv`
- or `read_table` encounter non-uniform dtypes in a column(s) of a given
- CSV file.
-
- See Also
- --------
- read_csv : Read CSV (comma-separated) file into a DataFrame.
- read_table : Read general delimited file into a DataFrame.
-
- Notes
- -----
- This warning is issued when dealing with larger files because the dtype
- checking happens per chunk read.
-
- Despite the warning, the CSV file is read with mixed types in a single
- column which will be an object type. See the examples below to better
- understand this issue.
-
- Examples
- --------
- This example creates and reads a large CSV file with a column that contains
- `int` and `str`.
-
- >>> df = pd.DataFrame({'a': (['1'] * 100000 + ['X'] * 100000 +
- ... ['1'] * 100000),
- ... 'b': ['b'] * 300000}) # doctest: +SKIP
- >>> df.to_csv('test.csv', index=False) # doctest: +SKIP
- >>> df2 = pd.read_csv('test.csv') # doctest: +SKIP
- ... # DtypeWarning: Columns (0) have mixed types
-
- It is important to notice that ``df2`` will contain both `str` and `int` for the
- same input, '1'.
-
- >>> df2.iloc[262140, 0] # doctest: +SKIP
- '1'
- >>> type(df2.iloc[262140, 0]) # doctest: +SKIP
- <class 'str'>
- >>> df2.iloc[262150, 0] # doctest: +SKIP
- 1
- >>> type(df2.iloc[262150, 0]) # doctest: +SKIP
- <class 'int'>
-
- One way to solve this issue is using the `dtype` parameter in the
- `read_csv` and `read_table` functions to make the conversion explicit:
-
- >>> df2 = pd.read_csv('test.csv', sep=',', dtype={'a': str}) # doctest: +SKIP
-
- No warning was issued.
- """
-
-
-class EmptyDataError(ValueError):
- """
- Exception raised in ``pd.read_csv`` when empty data or header is encountered.
- """
-
-
-class ParserWarning(Warning):
- """
- Warning raised when reading a file that doesn't use the default 'c' parser.
-
- Raised by `pd.read_csv` and `pd.read_table` when it is necessary to change
- parsers, generally from the default 'c' parser to 'python'.
-
- It happens due to a lack of support or functionality for parsing a
- particular attribute of a CSV file with the requested engine.
-
- Currently, 'c' unsupported options include the following parameters:
-
- 1. `sep` other than a single character (e.g. regex separators)
- 2. `skipfooter` higher than 0
- 3. `sep=None` with `delim_whitespace=False`
-
- The warning can be avoided by adding `engine='python'` as a parameter in
- `pd.read_csv` and `pd.read_table` methods.
-
- See Also
- --------
- pd.read_csv : Read CSV (comma-separated) file into DataFrame.
- pd.read_table : Read general delimited file into DataFrame.
-
- Examples
- --------
- Using a `sep` in `pd.read_csv` other than a single character:
-
- >>> import io
- >>> csv = '''a;b;c
- ... 1;1,8
- ... 1;2,1'''
- >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]') # doctest: +SKIP
- ... # ParserWarning: Falling back to the 'python' engine...
-
- Adding `engine='python'` to `pd.read_csv` removes the Warning:
-
- >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]', engine='python')
- """
-
-
-class MergeError(ValueError):
- """
- Exception raised when merging data.
-
- Subclass of ``ValueError``.
- """
-
-
-class AccessorRegistrationWarning(Warning):
- """
- Warning for attribute conflicts in accessor registration.
- """
-
-
-class AbstractMethodError(NotImplementedError):
- """
- Raise this error instead of NotImplementedError for abstract methods.
- """
-
- def __init__(self, class_instance, methodtype: str = "method") -> None:
- types = {"method", "classmethod", "staticmethod", "property"}
- if methodtype not in types:
- raise ValueError(
- f"methodtype must be one of {types}, got {methodtype} instead."
- )
- self.methodtype = methodtype
- self.class_instance = class_instance
-
- def __str__(self) -> str:
- if self.methodtype == "classmethod":
- name = self.class_instance.__name__
- else:
- name = type(self.class_instance).__name__
- return f"This {self.methodtype} must be defined in the concrete class {name}"
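
A hypothetical usage sketch for ``AbstractMethodError`` (the class names below are illustrative, not from pandas):

from pandas.errors import AbstractMethodError

class _Base:
    def compute(self):
        raise AbstractMethodError(self)

class _Concrete(_Base):
    pass

# _Concrete().compute() raises:
# AbstractMethodError: This method must be defined in the concrete class _Concrete
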
-
-
-class NumbaUtilError(Exception):
- """
- Error raised for unsupported Numba engine routines.
- """
-
-
-class DuplicateLabelError(ValueError):
- """
- Error raised when an operation would introduce duplicate labels.
-
- .. versionadded:: 1.2.0
-
- Examples
- --------
- >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags(
- ... allows_duplicate_labels=False
- ... )
- >>> s.reindex(['a', 'a', 'b'])
- Traceback (most recent call last):
- ...
- DuplicateLabelError: Index has duplicates.
- positions
- label
- a [0, 1]
- """
-
-
-class InvalidIndexError(Exception):
- """
- Exception raised when attempting to use an invalid index key.
-
- .. versionadded:: 1.1.0
- """
-
-
-class DataError(Exception):
- """
- Exception raised when performing an operation on non-numerical data.
-
- For example, calling ``ohlc`` on a non-numerical column or a function
- on a rolling window.
- """
-
-
-class SpecificationError(Exception):
- """
- Exception raised by ``agg`` when the functions are ill-specified.
-
- The exception is raised in two scenarios.
-
- The first is calling ``agg`` on a
- DataFrame or Series using a nested renamer (dict-of-dict).
-
- The second is calling ``agg`` on a DataFrame with duplicated function
- names without assigning column names.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2],
- ... 'B': range(5),
- ... 'C': range(5)})
- >>> df.groupby('A').B.agg({'foo': 'count'}) # doctest: +SKIP
- ... # SpecificationError: nested renamer is not supported
-
- >>> df.groupby('A').agg({'B': {'foo': ['sum', 'max']}}) # doctest: +SKIP
- ... # SpecificationError: nested renamer is not supported
-
- >>> df.groupby('A').agg(['min', 'min']) # doctest: +SKIP
- ... # SpecificationError: nested renamer is not supported
- """
-
-
-class SettingWithCopyError(ValueError):
- """
- Exception raised when trying to set on a copied slice from a ``DataFrame``.
-
- The ``mode.chained_assignment`` option needs to be set to 'raise'. This can
- happen unintentionally when using chained indexing.
-
- For more information on evaluation order,
- see :ref:`the user guide<indexing.evaluation_order>`.
-
- For more information on view vs. copy,
- see :ref:`the user guide<indexing.view_versus_copy>`.
-
- Examples
- --------
- >>> pd.options.mode.chained_assignment = 'raise'
- >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A'])
- >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP
- ... # SettingWithCopyError: A value is trying to be set on a copy of a...
- """
-
-
-class SettingWithCopyWarning(Warning):
- """
- Warning raised when trying to set on a copied slice from a ``DataFrame``.
-
- The ``mode.chained_assignment`` option needs to be set to 'warn',
- which is the default. This can happen unintentionally when using
- chained indexing.
-
- For more information on evaluation order,
- see :ref:`the user guide<indexing.evaluation_order>`.
-
- For more information on view vs. copy,
- see :ref:`the user guide<indexing.view_versus_copy>`.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A'])
- >>> df.loc[0:3]['A'] = 'a' # doctest: +SKIP
- ... # SettingWithCopyWarning: A value is trying to be set on a copy of a...
- """
-
-
-class ChainedAssignmentError(Warning):
- """
- Warning raised when trying to set using chained assignment.
-
- When the ``mode.copy_on_write`` option is enabled, chained assignment can
- never work. In such a situation, we are always setting into a temporary
- object that is the result of an indexing operation (getitem), which under
- Copy-on-Write always behaves as a copy. Thus, assigning through a chain
- can never update the original Series or DataFrame.
-
- For more information on view vs. copy,
- see :ref:`the user guide<indexing.view_versus_copy>`.
-
- Examples
- --------
- >>> pd.options.mode.copy_on_write = True
- >>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A'])
- >>> df["A"][0:3] = 10 # doctest: +SKIP
- ... # ChainedAssignmentError: ...
- >>> pd.options.mode.copy_on_write = False
- """
-
-
-_chained_assignment_msg = (
- "A value is trying to be set on a copy of a DataFrame or Series "
- "through chained assignment.\n"
- "When using the Copy-on-Write mode, such chained assignment never works "
- "to update the original DataFrame or Series, because the intermediate "
- "object on which we are setting values always behaves as a copy.\n\n"
- "Try using '.loc[row_indexer, col_indexer] = value' instead, to perform "
- "the assignment in a single step.\n\n"
- "See the caveats in the documentation: "
- "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
- "indexing.html#returning-a-view-versus-a-copy"
-)
-
-
-class NumExprClobberingError(NameError):
- """
- Exception raised when trying to use a built-in numexpr name as a variable name.
-
- ``eval`` or ``query`` will throw the error if the engine is set
- to 'numexpr'. 'numexpr' is the default engine value for these methods if the
- numexpr package is installed.
-
- Examples
- --------
- >>> df = pd.DataFrame({'abs': [1, 1, 1]})
- >>> df.query("abs > 2") # doctest: +SKIP
- ... # NumExprClobberingError: Variables in expression "(abs) > (2)" overlap...
- >>> sin, a = 1, 2
- >>> pd.eval("sin + a", engine='numexpr') # doctest: +SKIP
- ... # NumExprClobberingError: Variables in expression "(sin) + (a)" overlap...
- """
-
-
-class UndefinedVariableError(NameError):
- """
- Exception raised by ``query`` or ``eval`` when using an undefined variable name.
-
- It will also specify whether the undefined variable is local or not.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 1]})
- >>> df.query("A > x") # doctest: +SKIP
- ... # UndefinedVariableError: name 'x' is not defined
- >>> df.query("A > @y") # doctest: +SKIP
- ... # UndefinedVariableError: local variable 'y' is not defined
- >>> pd.eval('x + 1') # doctest: +SKIP
- ... # UndefinedVariableError: name 'x' is not defined
- """
-
- def __init__(self, name: str, is_local: bool | None = None) -> None:
- base_msg = f"{repr(name)} is not defined"
- if is_local:
- msg = f"local variable {base_msg}"
- else:
- msg = f"name {base_msg}"
- super().__init__(msg)
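
Illustrative only: the two message shapes produced by ``__init__`` above.

from pandas.errors import UndefinedVariableError

str(UndefinedVariableError("x"))        # "name 'x' is not defined"
str(UndefinedVariableError("y", True))  # "local variable 'y' is not defined"
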
-
-
-class IndexingError(Exception):
- """
- Exception is raised when trying to index and there is a mismatch in dimensions.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 1]})
- >>> df.loc[..., ..., 'A'] # doctest: +SKIP
- ... # IndexingError: indexer may only contain one '...' entry
- >>> df = pd.DataFrame({'A': [1, 1, 1]})
- >>> df.loc[1, ..., ...] # doctest: +SKIP
- ... # IndexingError: Too many indexers
- >>> df[pd.Series([True], dtype=bool)] # doctest: +SKIP
- ... # IndexingError: Unalignable boolean Series provided as indexer...
- >>> s = pd.Series(range(2),
- ... index = pd.MultiIndex.from_product([["a", "b"], ["c"]]))
- >>> s.loc["a", "c", "d"] # doctest: +SKIP
- ... # IndexingError: Too many indexers
- """
-
-
-class PyperclipException(RuntimeError):
- """
- Exception raised when clipboard functionality is unsupported.
-
- Raised by ``to_clipboard()`` and ``read_clipboard()``.
- """
-
-
-class PyperclipWindowsException(PyperclipException):
- """
- Exception raised when clipboard functionality is unsupported by Windows.
-
- Access to the clipboard handle is denied because another
- window process is accessing it.
- """
-
- def __init__(self, message: str) -> None:
- # attr only exists on Windows, so typing fails on other platforms
- message += f" ({ctypes.WinError()})" # type: ignore[attr-defined]
- super().__init__(message)
-
-
-class CSSWarning(UserWarning):
- """
- Warning is raised when converting css styling fails.
-
- This can be due to the styling not having an equivalent value or because the
- styling isn't properly formatted.
-
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 1]})
- >>> (df.style.applymap(lambda x: 'background-color: blueGreenRed;')
- ... .to_excel('styled.xlsx')) # doctest: +SKIP
- ... # CSSWarning: Unhandled color format: 'blueGreenRed'
- >>> (df.style.applymap(lambda x: 'border: 1px solid red red;')
- ... .to_excel('styled.xlsx')) # doctest: +SKIP
- ... # CSSWarning: Too many tokens provided to "border" (expected 1-3)
- """
-
-
-class PossibleDataLossError(Exception):
- """
- Exception raised when trying to open an HDFStore file that is already open.
-
- Examples
- --------
- >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP
- >>> store.open("w") # doctest: +SKIP
- ... # PossibleDataLossError: Re-opening the file [my-store] with mode [a]...
- """
-
-
-class ClosedFileError(Exception):
- """
- Exception is raised when trying to perform an operation on a closed HDFStore file.
-
- Examples
- --------
- >>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP
- >>> store.close() # doctest: +SKIP
- >>> store.keys() # doctest: +SKIP
- ... # ClosedFileError: my-store file is not open!
- """
-
-
-class IncompatibilityWarning(Warning):
- """
- Warning raised when trying to use where criteria on an incompatible HDF5 file.
- """
-
-
-class AttributeConflictWarning(Warning):
- """
- Warning raised when index attributes conflict when using HDFStore.
-
- Occurs when attempting to append an index whose name or frequency
- differs from that of the existing index in an HDFStore.
- """
-
-
-class DatabaseError(OSError):
- """
- Error raised when executing SQL with bad syntax or SQL that throws an error.
-
- Examples
- --------
- >>> from sqlite3 import connect
- >>> conn = connect(':memory:')
- >>> pd.read_sql('select * test', conn) # doctest: +SKIP
- ... # DatabaseError: Execution failed on sql 'test': near "test": syntax error
- """
-
-
-class PossiblePrecisionLoss(Warning):
- """
- Warning raised by to_stata on a column with a value outside the int64 range.
-
- When a column contains a value outside (or at the boundary of) the int64
- range, the column is converted to a float64 dtype.
-
- Examples
- --------
- >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)})
- >>> df.to_stata('test') # doctest: +SKIP
- ... # PossiblePrecisionLoss: Column converted from int64 to float64...
- """
-
-
-class ValueLabelTypeMismatch(Warning):
- """
- Warning raised by to_stata on a category column that contains non-string values.
-
- Examples
- --------
- >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")})
- >>> df.to_stata('test') # doctest: +SKIP
- ... # ValueLabelTypeMismatch: Stata value labels (pandas categories) must be str...
- """
-
-
-class InvalidColumnName(Warning):
- """
- Warning raised by to_stata when a column contains an invalid Stata name.
-
- Because the column name is not a valid Stata variable name, it needs to be
- converted.
-
- Examples
- --------
- >>> df = pd.DataFrame({"0categories": pd.Series([2, 2])})
- >>> df.to_stata('test') # doctest: +SKIP
- ... # InvalidColumnName: Not all pandas column names were valid Stata variable...
- """
-
-
-class CategoricalConversionWarning(Warning):
- """
- Warning is raised when reading a partially labeled Stata file using an iterator.
-
- Examples
- --------
- >>> from pandas.io.stata import StataReader
- >>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP
- ... for i, block in enumerate(reader):
- ... print(i, block)
- ... # CategoricalConversionWarning: One or more series with value labels...
- """
-
-
-class LossySetitemError(Exception):
- """
- Raised when a __setitem__ on an np.ndarray would not be lossless.
- """
-
-
-class NoBufferPresent(Exception):
- """
- Exception is raised in _get_data_buffer to signal that there is no requested buffer.
- """
-
-
-class InvalidComparison(Exception):
- """
- Exception is raised by _validate_comparison_value to indicate an invalid comparison.
- """
-
-
-__all__ = [
- "AbstractMethodError",
- "AccessorRegistrationWarning",
- "AttributeConflictWarning",
- "CategoricalConversionWarning",
- "ClosedFileError",
- "CSSWarning",
- "DatabaseError",
- "DataError",
- "DtypeWarning",
- "DuplicateLabelError",
- "EmptyDataError",
- "IncompatibilityWarning",
- "IntCastingNaNError",
- "InvalidColumnName",
- "InvalidComparison",
- "InvalidIndexError",
- "InvalidVersion",
- "IndexingError",
- "LossySetitemError",
- "MergeError",
- "NoBufferPresent",
- "NullFrequencyError",
- "NumbaUtilError",
- "NumExprClobberingError",
- "OptionError",
- "OutOfBoundsDatetime",
- "OutOfBoundsTimedelta",
- "ParserError",
- "ParserWarning",
- "PerformanceWarning",
- "PossibleDataLossError",
- "PossiblePrecisionLoss",
- "PyperclipException",
- "PyperclipWindowsException",
- "SettingWithCopyError",
- "SettingWithCopyWarning",
- "SpecificationError",
- "UndefinedVariableError",
- "UnsortedIndexError",
- "UnsupportedFunctionCall",
- "ValueLabelTypeMismatch",
-]
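For reference, the exceptions deleted above are part of the public pandas.errors namespace. A minimal usage sketch, assuming a regular pandas install, of how caller code typically catches one of them:

    import pandas as pd
    from pandas.errors import UndefinedVariableError

    df = pd.DataFrame({"A": [1, 2, 3]})
    try:
        # "x" is neither a column nor a resolvable local, so query() raises
        df.query("A > x")
    except UndefinedVariableError as err:
        print(f"query failed: {err}")  # name 'x' is not defined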
diff --git a/contrib/python/pandas/py3/pandas/io/__init__.py b/contrib/python/pandas/py3/pandas/io/__init__.py
deleted file mode 100644
index bd3ddc09393..00000000000
--- a/contrib/python/pandas/py3/pandas/io/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- # import modules that have public classes/functions
- from pandas.io import (
- formats,
- json,
- stata,
- )
-
- # and mark only those modules as public
- __all__ = ["formats", "json", "stata"]
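Aside: the TYPE_CHECKING guard above advertises submodules to type checkers without importing them at runtime. A minimal sketch of the same pattern, with hypothetical module names:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Seen by type checkers only; nothing is imported when the module runs.
        from mypackage import heavy_submodule  # hypothetical names

    # Runtime consumers import the submodule explicitly when they need it,
    # which keeps package import time low.
    __all__ = ["heavy_submodule"]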
diff --git a/contrib/python/pandas/py3/pandas/io/_util.py b/contrib/python/pandas/py3/pandas/io/_util.py
deleted file mode 100644
index d2a001f0cf9..00000000000
--- a/contrib/python/pandas/py3/pandas/io/_util.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from __future__ import annotations
-
-from pandas.compat._optional import import_optional_dependency
-
-import pandas as pd
-
-
-def _arrow_dtype_mapping() -> dict:
- pa = import_optional_dependency("pyarrow")
- return {
- pa.int8(): pd.Int8Dtype(),
- pa.int16(): pd.Int16Dtype(),
- pa.int32(): pd.Int32Dtype(),
- pa.int64(): pd.Int64Dtype(),
- pa.uint8(): pd.UInt8Dtype(),
- pa.uint16(): pd.UInt16Dtype(),
- pa.uint32(): pd.UInt32Dtype(),
- pa.uint64(): pd.UInt64Dtype(),
- pa.bool_(): pd.BooleanDtype(),
- pa.string(): pd.StringDtype(),
- pa.float32(): pd.Float32Dtype(),
- pa.float64(): pd.Float64Dtype(),
- }
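The mapping above is shaped for pyarrow's to_pandas(types_mapper=...) hook. A minimal sketch, assuming pyarrow is installed, of routing Arrow columns to pandas' nullable dtypes the same way:

    import pandas as pd
    import pyarrow as pa

    # Same idea as _arrow_dtype_mapping above, shortened to two entries.
    mapping = {pa.int64(): pd.Int64Dtype(), pa.bool_(): pd.BooleanDtype()}

    table = pa.table({"a": [1, None, 3], "b": [True, False, None]})
    df = table.to_pandas(types_mapper=mapping.get)
    print(df.dtypes)  # a -> Int64, b -> boolean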
diff --git a/contrib/python/pandas/py3/pandas/io/api.py b/contrib/python/pandas/py3/pandas/io/api.py
deleted file mode 100644
index 4e8b34a61df..00000000000
--- a/contrib/python/pandas/py3/pandas/io/api.py
+++ /dev/null
@@ -1,65 +0,0 @@
-"""
-Data IO api
-"""
-
-from pandas.io.clipboards import read_clipboard
-from pandas.io.excel import (
- ExcelFile,
- ExcelWriter,
- read_excel,
-)
-from pandas.io.feather_format import read_feather
-from pandas.io.gbq import read_gbq
-from pandas.io.html import read_html
-from pandas.io.json import read_json
-from pandas.io.orc import read_orc
-from pandas.io.parquet import read_parquet
-from pandas.io.parsers import (
- read_csv,
- read_fwf,
- read_table,
-)
-from pandas.io.pickle import (
- read_pickle,
- to_pickle,
-)
-from pandas.io.pytables import (
- HDFStore,
- read_hdf,
-)
-from pandas.io.sas import read_sas
-from pandas.io.spss import read_spss
-from pandas.io.sql import (
- read_sql,
- read_sql_query,
- read_sql_table,
-)
-from pandas.io.stata import read_stata
-from pandas.io.xml import read_xml
-
-__all__ = [
- "ExcelFile",
- "ExcelWriter",
- "HDFStore",
- "read_clipboard",
- "read_csv",
- "read_excel",
- "read_feather",
- "read_fwf",
- "read_gbq",
- "read_hdf",
- "read_html",
- "read_json",
- "read_orc",
- "read_parquet",
- "read_pickle",
- "read_sas",
- "read_spss",
- "read_sql",
- "read_sql_query",
- "read_sql_table",
- "read_stata",
- "read_table",
- "read_xml",
- "to_pickle",
-]
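These are the same readers and writers re-exported at the top level of pandas. A minimal round-trip sketch using two of them:

    import io
    import pandas as pd

    csv_buf = io.StringIO("a,b\n1,2\n3,4\n")
    df = pd.read_csv(csv_buf)         # read_csv comes from pandas.io.parsers

    pd.to_pickle(df, "frame.pkl")     # paired writer from pandas.io.pickle
    assert pd.read_pickle("frame.pkl").equals(df)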
diff --git a/contrib/python/pandas/py3/pandas/io/clipboard/__init__.py b/contrib/python/pandas/py3/pandas/io/clipboard/__init__.py
deleted file mode 100644
index e574ed2c805..00000000000
--- a/contrib/python/pandas/py3/pandas/io/clipboard/__init__.py
+++ /dev/null
@@ -1,678 +0,0 @@
-"""
-Pyperclip
-
-A cross-platform clipboard module for Python,
-with copy & paste functions for plain text.
-By Al Sweigart al@inventwithpython.com
-BSD License
-
-Usage:
- import pyperclip
- pyperclip.copy('The text to be copied to the clipboard.')
- spam = pyperclip.paste()
-
- if not pyperclip.is_available():
- print("Copy functionality unavailable!")
-
-On Windows, no additional modules are needed.
-On Mac, the pyobjc module is used, falling back to the pbcopy and pbpaste CLI
- commands. (These commands should come with OS X.)
-On Linux, install xclip or xsel via package manager. For example, in Debian:
- sudo apt-get install xclip
- sudo apt-get install xsel
-
-Otherwise on Linux, you will need the PyQt5 modules installed.
-
-This module does not work with PyGObject yet.
-
-Cygwin is currently not supported.
-
-Security Note: This module runs programs with these names:
- - which
- - where
- - pbcopy
- - pbpaste
- - xclip
- - xsel
- - klipper
- - qdbus
-A malicious user could rename or add programs with these names, tricking
-Pyperclip into running them with whatever permissions the Python process has.
-
-"""
-
-__version__ = "1.7.0"
-
-
-import contextlib
-import ctypes
-from ctypes import (
- c_size_t,
- c_wchar,
- c_wchar_p,
- get_errno,
- sizeof,
-)
-import os
-import platform
-from shutil import which
-import subprocess
-import time
-import warnings
-
-from pandas.errors import (
- PyperclipException,
- PyperclipWindowsException,
-)
-from pandas.util._exceptions import find_stack_level
-
-# `import PyQt4` sys.exit()s if DISPLAY is not in the environment.
-# Thus, we need to detect the presence of $DISPLAY manually
-# and not load PyQt4 if it is absent.
-HAS_DISPLAY = os.getenv("DISPLAY")
-
-EXCEPT_MSG = """
- Pyperclip could not find a copy/paste mechanism for your system.
- For more information, please visit
- https://pyperclip.readthedocs.io/en/latest/#not-implemented-error
- """
-
-ENCODING = "utf-8"
-
-# The "which" unix command finds where a command is.
-if platform.system() == "Windows":
- WHICH_CMD = "where"
-else:
- WHICH_CMD = "which"
-
-
-def _executable_exists(name):
- return (
- subprocess.call(
- [WHICH_CMD, name], stdout=subprocess.PIPE, stderr=subprocess.PIPE
- )
- == 0
- )
-
-
-def _stringifyText(text) -> str:
- acceptedTypes = (str, int, float, bool)
- if not isinstance(text, acceptedTypes):
- raise PyperclipException(
- f"only str, int, float, and bool values "
- f"can be copied to the clipboard, not {type(text).__name__}"
- )
- return str(text)
-
-
-def init_osx_pbcopy_clipboard():
- def copy_osx_pbcopy(text):
- text = _stringifyText(text) # Converts non-str values to str.
- with subprocess.Popen(
- ["pbcopy", "w"], stdin=subprocess.PIPE, close_fds=True
- ) as p:
- p.communicate(input=text.encode(ENCODING))
-
- def paste_osx_pbcopy():
- with subprocess.Popen(
- ["pbpaste", "r"], stdout=subprocess.PIPE, close_fds=True
- ) as p:
- stdout = p.communicate()[0]
- return stdout.decode(ENCODING)
-
- return copy_osx_pbcopy, paste_osx_pbcopy
-
-
-def init_osx_pyobjc_clipboard():
- def copy_osx_pyobjc(text):
- """Copy string argument to clipboard"""
- text = _stringifyText(text) # Converts non-str values to str.
- newStr = Foundation.NSString.stringWithString_(text).nsstring()
- newData = newStr.dataUsingEncoding_(Foundation.NSUTF8StringEncoding)
- board = AppKit.NSPasteboard.generalPasteboard()
- board.declareTypes_owner_([AppKit.NSStringPboardType], None)
- board.setData_forType_(newData, AppKit.NSStringPboardType)
-
- def paste_osx_pyobjc():
- """Returns contents of clipboard"""
- board = AppKit.NSPasteboard.generalPasteboard()
- content = board.stringForType_(AppKit.NSStringPboardType)
- return content
-
- return copy_osx_pyobjc, paste_osx_pyobjc
-
-
-def init_qt_clipboard():
- global QApplication
- # $DISPLAY should exist
-
- # Try to import from qtpy, but if that fails try PyQt5 then PyQt4
- try:
- from qtpy.QtWidgets import QApplication
- except ImportError:
- try:
- from PyQt5.QtWidgets import QApplication
- except ImportError:
- from PyQt4.QtGui import QApplication
-
- app = QApplication.instance()
- if app is None:
- app = QApplication([])
-
- def copy_qt(text):
- text = _stringifyText(text) # Converts non-str values to str.
- cb = app.clipboard()
- cb.setText(text)
-
- def paste_qt() -> str:
- cb = app.clipboard()
- return str(cb.text())
-
- return copy_qt, paste_qt
-
-
-def init_xclip_clipboard():
- DEFAULT_SELECTION = "c"
- PRIMARY_SELECTION = "p"
-
- def copy_xclip(text, primary=False):
- text = _stringifyText(text) # Converts non-str values to str.
- selection = DEFAULT_SELECTION
- if primary:
- selection = PRIMARY_SELECTION
- with subprocess.Popen(
- ["xclip", "-selection", selection], stdin=subprocess.PIPE, close_fds=True
- ) as p:
- p.communicate(input=text.encode(ENCODING))
-
- def paste_xclip(primary=False):
- selection = DEFAULT_SELECTION
- if primary:
- selection = PRIMARY_SELECTION
- with subprocess.Popen(
- ["xclip", "-selection", selection, "-o"],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- close_fds=True,
- ) as p:
- stdout = p.communicate()[0]
- # Intentionally ignore extraneous output on stderr when clipboard is empty
- return stdout.decode(ENCODING)
-
- return copy_xclip, paste_xclip
-
-
-def init_xsel_clipboard():
- DEFAULT_SELECTION = "-b"
- PRIMARY_SELECTION = "-p"
-
- def copy_xsel(text, primary=False):
- text = _stringifyText(text) # Converts non-str values to str.
- selection_flag = DEFAULT_SELECTION
- if primary:
- selection_flag = PRIMARY_SELECTION
- with subprocess.Popen(
- ["xsel", selection_flag, "-i"], stdin=subprocess.PIPE, close_fds=True
- ) as p:
- p.communicate(input=text.encode(ENCODING))
-
- def paste_xsel(primary=False):
- selection_flag = DEFAULT_SELECTION
- if primary:
- selection_flag = PRIMARY_SELECTION
- with subprocess.Popen(
- ["xsel", selection_flag, "-o"], stdout=subprocess.PIPE, close_fds=True
- ) as p:
- stdout = p.communicate()[0]
- return stdout.decode(ENCODING)
-
- return copy_xsel, paste_xsel
-
-
-def init_klipper_clipboard():
- def copy_klipper(text):
- text = _stringifyText(text) # Converts non-str values to str.
- with subprocess.Popen(
- [
- "qdbus",
- "org.kde.klipper",
- "/klipper",
- "setClipboardContents",
- text.encode(ENCODING),
- ],
- stdin=subprocess.PIPE,
- close_fds=True,
- ) as p:
- p.communicate(input=None)
-
- def paste_klipper():
- with subprocess.Popen(
- ["qdbus", "org.kde.klipper", "/klipper", "getClipboardContents"],
- stdout=subprocess.PIPE,
- close_fds=True,
- ) as p:
- stdout = p.communicate()[0]
-
- # Workaround for https://bugs.kde.org/show_bug.cgi?id=342874
- # TODO: https://github.com/asweigart/pyperclip/issues/43
- clipboardContents = stdout.decode(ENCODING)
- # even if blank, Klipper will append a newline at the end
- assert len(clipboardContents) > 0
- # make sure that newline is there
- assert clipboardContents.endswith("\n")
- if clipboardContents.endswith("\n"):
- clipboardContents = clipboardContents[:-1]
- return clipboardContents
-
- return copy_klipper, paste_klipper
-
-
-def init_dev_clipboard_clipboard():
- def copy_dev_clipboard(text):
- text = _stringifyText(text) # Converts non-str values to str.
- if text == "":
- warnings.warn(
- "Pyperclip cannot copy a blank string to the clipboard on Cygwin. "
- "This is effectively a no-op.",
- stacklevel=find_stack_level(),
- )
- if "\r" in text:
- warnings.warn(
- "Pyperclip cannot handle \\r characters on Cygwin.",
- stacklevel=find_stack_level(),
- )
-
- with open("/dev/clipboard", "w") as fd:
- fd.write(text)
-
- def paste_dev_clipboard() -> str:
- with open("/dev/clipboard") as fd:
- content = fd.read()
- return content
-
- return copy_dev_clipboard, paste_dev_clipboard
-
-
-def init_no_clipboard():
- class ClipboardUnavailable:
- def __call__(self, *args, **kwargs):
- raise PyperclipException(EXCEPT_MSG)
-
- def __bool__(self) -> bool:
- return False
-
- return ClipboardUnavailable(), ClipboardUnavailable()
-
-
-# Windows-related clipboard functions:
-class CheckedCall:
- def __init__(self, f) -> None:
- super().__setattr__("f", f)
-
- def __call__(self, *args):
- ret = self.f(*args)
- if not ret and get_errno():
- raise PyperclipWindowsException("Error calling " + self.f.__name__)
- return ret
-
- def __setattr__(self, key, value):
- setattr(self.f, key, value)
-
-
-def init_windows_clipboard():
- global HGLOBAL, LPVOID, DWORD, LPCSTR, INT
- global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE
- from ctypes.wintypes import (
- BOOL,
- DWORD,
- HANDLE,
- HGLOBAL,
- HINSTANCE,
- HMENU,
- HWND,
- INT,
- LPCSTR,
- LPVOID,
- UINT,
- )
-
- windll = ctypes.windll
- msvcrt = ctypes.CDLL("msvcrt")
-
- safeCreateWindowExA = CheckedCall(windll.user32.CreateWindowExA)
- safeCreateWindowExA.argtypes = [
- DWORD,
- LPCSTR,
- LPCSTR,
- DWORD,
- INT,
- INT,
- INT,
- INT,
- HWND,
- HMENU,
- HINSTANCE,
- LPVOID,
- ]
- safeCreateWindowExA.restype = HWND
-
- safeDestroyWindow = CheckedCall(windll.user32.DestroyWindow)
- safeDestroyWindow.argtypes = [HWND]
- safeDestroyWindow.restype = BOOL
-
- OpenClipboard = windll.user32.OpenClipboard
- OpenClipboard.argtypes = [HWND]
- OpenClipboard.restype = BOOL
-
- safeCloseClipboard = CheckedCall(windll.user32.CloseClipboard)
- safeCloseClipboard.argtypes = []
- safeCloseClipboard.restype = BOOL
-
- safeEmptyClipboard = CheckedCall(windll.user32.EmptyClipboard)
- safeEmptyClipboard.argtypes = []
- safeEmptyClipboard.restype = BOOL
-
- safeGetClipboardData = CheckedCall(windll.user32.GetClipboardData)
- safeGetClipboardData.argtypes = [UINT]
- safeGetClipboardData.restype = HANDLE
-
- safeSetClipboardData = CheckedCall(windll.user32.SetClipboardData)
- safeSetClipboardData.argtypes = [UINT, HANDLE]
- safeSetClipboardData.restype = HANDLE
-
- safeGlobalAlloc = CheckedCall(windll.kernel32.GlobalAlloc)
- safeGlobalAlloc.argtypes = [UINT, c_size_t]
- safeGlobalAlloc.restype = HGLOBAL
-
- safeGlobalLock = CheckedCall(windll.kernel32.GlobalLock)
- safeGlobalLock.argtypes = [HGLOBAL]
- safeGlobalLock.restype = LPVOID
-
- safeGlobalUnlock = CheckedCall(windll.kernel32.GlobalUnlock)
- safeGlobalUnlock.argtypes = [HGLOBAL]
- safeGlobalUnlock.restype = BOOL
-
- wcslen = CheckedCall(msvcrt.wcslen)
- wcslen.argtypes = [c_wchar_p]
- wcslen.restype = UINT
-
- GMEM_MOVEABLE = 0x0002
- CF_UNICODETEXT = 13
-
- @contextlib.contextmanager
- def window():
- """
- Context that provides a valid Windows hwnd.
- """
- # we really just need the hwnd, so setting "STATIC"
- # as predefined lpClass is just fine.
- hwnd = safeCreateWindowExA(
- 0, b"STATIC", None, 0, 0, 0, 0, 0, None, None, None, None
- )
- try:
- yield hwnd
- finally:
- safeDestroyWindow(hwnd)
-
- @contextlib.contextmanager
- def clipboard(hwnd):
- """
- Context manager that opens the clipboard and prevents
- other applications from modifying the clipboard content.
- """
- # We may not get the clipboard handle immediately because
- # some other application is accessing it (?)
- # We try for at least 500ms to get the clipboard.
- t = time.time() + 0.5
- success = False
- while time.time() < t:
- success = OpenClipboard(hwnd)
- if success:
- break
- time.sleep(0.01)
- if not success:
- raise PyperclipWindowsException("Error calling OpenClipboard")
-
- try:
- yield
- finally:
- safeCloseClipboard()
-
- def copy_windows(text):
- # This function is heavily based on
- # http://msdn.com/ms649016#_win32_Copying_Information_to_the_Clipboard
-
- text = _stringifyText(text) # Converts non-str values to str.
-
- with window() as hwnd:
- # http://msdn.com/ms649048
- # If an application calls OpenClipboard with hwnd set to NULL,
- # EmptyClipboard sets the clipboard owner to NULL;
- # this causes SetClipboardData to fail.
- # => We need a valid hwnd to copy something.
- with clipboard(hwnd):
- safeEmptyClipboard()
-
- if text:
- # http://msdn.com/ms649051
- # If the hMem parameter identifies a memory object,
- # the object must have been allocated using the
- # function with the GMEM_MOVEABLE flag.
- count = wcslen(text) + 1
- handle = safeGlobalAlloc(GMEM_MOVEABLE, count * sizeof(c_wchar))
- locked_handle = safeGlobalLock(handle)
-
- ctypes.memmove(
- c_wchar_p(locked_handle),
- c_wchar_p(text),
- count * sizeof(c_wchar),
- )
-
- safeGlobalUnlock(handle)
- safeSetClipboardData(CF_UNICODETEXT, handle)
-
- def paste_windows():
- with clipboard(None):
- handle = safeGetClipboardData(CF_UNICODETEXT)
- if not handle:
- # GetClipboardData may return NULL with errno == NO_ERROR
- # if the clipboard is empty.
- # (Also, it may return a handle to an empty buffer,
- # but technically that's not empty)
- return ""
- return c_wchar_p(handle).value
-
- return copy_windows, paste_windows
-
-
-def init_wsl_clipboard():
- def copy_wsl(text):
- text = _stringifyText(text) # Converts non-str values to str.
- with subprocess.Popen(["clip.exe"], stdin=subprocess.PIPE, close_fds=True) as p:
- p.communicate(input=text.encode(ENCODING))
-
- def paste_wsl():
- with subprocess.Popen(
- ["powershell.exe", "-command", "Get-Clipboard"],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- close_fds=True,
- ) as p:
- stdout = p.communicate()[0]
- # WSL appends "\r\n" to the contents.
- return stdout[:-2].decode(ENCODING)
-
- return copy_wsl, paste_wsl
-
-
-# Automatic detection of clipboard mechanisms
-# and importing is done in determine_clipboard():
-def determine_clipboard():
- """
- Determine the OS/platform and set the copy() and paste() functions
- accordingly.
- """
- global Foundation, AppKit, qtpy, PyQt4, PyQt5
-
- # Setup for the CYGWIN platform:
- if (
- "cygwin" in platform.system().lower()
- ): # Cygwin has a variety of values returned by platform.system(),
- # such as 'CYGWIN_NT-6.1'
- # FIXME(pyperclip#55): pyperclip currently does not support Cygwin,
- # see https://github.com/asweigart/pyperclip/issues/55
- if os.path.exists("/dev/clipboard"):
- warnings.warn(
- "Pyperclip's support for Cygwin is not perfect, "
- "see https://github.com/asweigart/pyperclip/issues/55",
- stacklevel=find_stack_level(),
- )
- return init_dev_clipboard_clipboard()
-
- # Setup for the WINDOWS platform:
- elif os.name == "nt" or platform.system() == "Windows":
- return init_windows_clipboard()
-
- if platform.system() == "Linux":
- if which("wslconfig.exe"):
- return init_wsl_clipboard()
-
- # Setup for the macOS platform:
- if os.name == "mac" or platform.system() == "Darwin":
- try:
- import AppKit
- import Foundation # check if pyobjc is installed
- except ImportError:
- return init_osx_pbcopy_clipboard()
- else:
- return init_osx_pyobjc_clipboard()
-
- # Setup for the LINUX platform:
- if HAS_DISPLAY:
- if _executable_exists("xsel"):
- return init_xsel_clipboard()
- if _executable_exists("xclip"):
- return init_xclip_clipboard()
- if _executable_exists("klipper") and _executable_exists("qdbus"):
- return init_klipper_clipboard()
-
- try:
- # qtpy is a small abstraction layer that lets you write applications
- # using a single api call to either PyQt or PySide.
- # https://pypi.python.org/project/QtPy
- import qtpy # check if qtpy is installed
- except ImportError:
- # If qtpy isn't installed, fall back on importing PyQt4.
- try:
- import PyQt5 # check if PyQt5 is installed
- except ImportError:
- try:
- import PyQt4 # check if PyQt4 is installed
- except ImportError:
- pass # We want to fail fast for all non-ImportError exceptions.
- else:
- return init_qt_clipboard()
- else:
- return init_qt_clipboard()
- else:
- return init_qt_clipboard()
-
- return init_no_clipboard()
-
-
-def set_clipboard(clipboard):
- """
- Explicitly sets the clipboard mechanism. The "clipboard mechanism" is how
- the copy() and paste() functions interact with the operating system to
- implement the copy/paste feature. The clipboard parameter must be one of:
- - pbcopy
- - pyobjc (default on macOS)
- - qt
- - xclip
- - xsel
- - klipper
- - windows (default on Windows)
- - no (this is what is set when no clipboard mechanism can be found)
- """
- global copy, paste
-
- clipboard_types = {
- "pbcopy": init_osx_pbcopy_clipboard,
- "pyobjc": init_osx_pyobjc_clipboard,
- "qt": init_qt_clipboard, # TODO - split this into 'qtpy', 'pyqt4', and 'pyqt5'
- "xclip": init_xclip_clipboard,
- "xsel": init_xsel_clipboard,
- "klipper": init_klipper_clipboard,
- "windows": init_windows_clipboard,
- "no": init_no_clipboard,
- }
-
- if clipboard not in clipboard_types:
- allowed_clipboard_types = [repr(_) for _ in clipboard_types]
- raise ValueError(
- f"Argument must be one of {', '.join(allowed_clipboard_types)}"
- )
-
- # Sets pyperclip's copy() and paste() functions:
- copy, paste = clipboard_types[clipboard]()
-
-
-def lazy_load_stub_copy(text):
- """
- A stub function for copy(), which will load the real copy() function when
- called so that the real copy() function is used for later calls.
-
- This allows users to import pyperclip without having determine_clipboard()
- automatically run, which will automatically select a clipboard mechanism.
- This could be a problem if it selects, say, the memory-heavy PyQt4 module
- but the user was just going to immediately call set_clipboard() to use a
- different clipboard mechanism.
-
- The lazy loading this stub function implements gives the user a chance to
- call set_clipboard() to pick another clipboard mechanism. Or, if the user
- simply calls copy() or paste() without calling set_clipboard() first, it
- will fall back on whichever clipboard mechanism determine_clipboard()
- automatically chooses.
- """
- global copy, paste
- copy, paste = determine_clipboard()
- return copy(text)
-
-
-def lazy_load_stub_paste():
- """
- A stub function for paste(), which will load the real paste() function when
- called so that the real paste() function is used for later calls.
-
- This allows users to import pyperclip without having determine_clipboard()
- automatically run, which will automatically select a clipboard mechanism.
- This could be a problem if it selects, say, the memory-heavy PyQt4 module
- but the user was just going to immediately call set_clipboard() to use a
- different clipboard mechanism.
-
- The lazy loading this stub function implements gives the user a chance to
- call set_clipboard() to pick another clipboard mechanism. Or, if the user
- simply calls copy() or paste() without calling set_clipboard() first, it
- will fall back on whichever clipboard mechanism determine_clipboard()
- automatically chooses.
- """
- global copy, paste
- copy, paste = determine_clipboard()
- return paste()
-
-
-def is_available() -> bool:
- return copy != lazy_load_stub_copy and paste != lazy_load_stub_paste
-
-
-# Initially, copy() and paste() are set to lazy loading wrappers which will
-# set `copy` and `paste` to real functions the first time they're used, unless
-# set_clipboard() or determine_clipboard() is called first.
-copy, paste = lazy_load_stub_copy, lazy_load_stub_paste
-
-
-__all__ = ["copy", "paste", "set_clipboard", "determine_clipboard"]
-
-# pandas aliases
-clipboard_get = paste
-clipboard_set = copy
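Before this removal, the vendored module could be used directly through the pandas aliases defined at the end of the file. A minimal sketch, assuming a platform clipboard mechanism is available:

    from pandas.errors import PyperclipException
    from pandas.io.clipboard import clipboard_get, clipboard_set

    try:
        clipboard_set("copied via pandas' vendored pyperclip")
        print(clipboard_get())
    except PyperclipException:
        # e.g. headless Linux without xclip or xsel installed
        print("no copy/paste mechanism available on this system")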
diff --git a/contrib/python/pandas/py3/pandas/io/clipboards.py b/contrib/python/pandas/py3/pandas/io/clipboards.py
deleted file mode 100644
index e5981e8d15e..00000000000
--- a/contrib/python/pandas/py3/pandas/io/clipboards.py
+++ /dev/null
@@ -1,178 +0,0 @@
-""" io on the clipboard """
-from __future__ import annotations
-
-from io import StringIO
-from typing import TYPE_CHECKING
-import warnings
-
-from pandas._libs import lib
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.generic import ABCDataFrame
-
-from pandas import (
- get_option,
- option_context,
-)
-
-if TYPE_CHECKING:
- from pandas._typing import DtypeBackend
-
-
-def read_clipboard(
- sep: str = r"\s+",
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwargs,
-): # pragma: no cover
- r"""
- Read text from clipboard and pass to read_csv.
-
- Parameters
- ----------
- sep : str, default '\s+'
- A string or regex delimiter. The default of '\s+' denotes
- one or more whitespace characters.
-
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
- Which dtype_backend to use. With "numpy_nullable", nullable dtypes are
- used for all dtypes that have a nullable implementation (and NumPy
- arrays otherwise); with "pyarrow", pyarrow-backed dtypes are used for
- all columns.
-
- The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- **kwargs
- See read_csv for the full argument list.
-
- Returns
- -------
- DataFrame
- A parsed DataFrame object.
- """
- encoding = kwargs.pop("encoding", "utf-8")
-
- # only utf-8 is valid for passed value because that's what clipboard
- # supports
- if encoding is not None and encoding.lower().replace("-", "") != "utf8":
- raise NotImplementedError("reading from clipboard only supports utf-8 encoding")
-
- check_dtype_backend(dtype_backend)
-
- from pandas.io.clipboard import clipboard_get
- from pandas.io.parsers import read_csv
-
- text = clipboard_get()
-
- # Try to decode (if needed, as "text" might already be a string here).
- try:
- text = text.decode(kwargs.get("encoding") or get_option("display.encoding"))
- except AttributeError:
- pass
-
- # Excel copies into clipboard with \t separation
- # inspect no more then the 10 first lines, if they
- # all contain an equal number (>0) of tabs, infer
- # that this came from excel and set 'sep' accordingly
- lines = text[:10000].split("\n")[:-1][:10]
-
- # Need to remove leading white space, since read_csv
- # accepts:
- # a b
- # 0 1 2
- # 1 3 4
-
- counts = {x.lstrip(" ").count("\t") for x in lines}
- if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
- sep = "\t"
- # check the number of leading tabs in the first line
- # to account for index columns
- index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
- if index_length != 0:
- kwargs.setdefault("index_col", list(range(index_length)))
-
- # Edge case where sep is specified to be None, return to default
- if sep is None and kwargs.get("delim_whitespace") is None:
- sep = r"\s+"
-
- # Regex separator currently only works with python engine.
- # Default to python if separator is multi-character (regex)
- if len(sep) > 1 and kwargs.get("engine") is None:
- kwargs["engine"] = "python"
- elif len(sep) > 1 and kwargs.get("engine") == "c":
- warnings.warn(
- "read_clipboard with regex separator does not work properly with c engine.",
- stacklevel=find_stack_level(),
- )
-
- return read_csv(StringIO(text), sep=sep, dtype_backend=dtype_backend, **kwargs)
-
-
-def to_clipboard(
- obj, excel: bool | None = True, sep: str | None = None, **kwargs
-) -> None: # pragma: no cover
- """
- Attempt to write a text representation of the object to the system clipboard.
- The clipboard contents can then be pasted into Excel, for example.
-
- Parameters
- ----------
- obj : the object to write to the clipboard
- excel : bool, defaults to True
- If True, use the provided separator and write in a CSV
- format to allow easy pasting into Excel.
- If False, write a string representation of the object
- to the clipboard.
- sep : optional, defaults to tab
- other keywords are passed to to_csv
-
- Notes
- -----
- Requirements for your platform
- - Linux: xclip, or xsel (with PyQt4 modules)
- - Windows: none
- - OS X: none
- """
- encoding = kwargs.pop("encoding", "utf-8")
-
- # testing if an invalid encoding is passed to clipboard
- if encoding is not None and encoding.lower().replace("-", "") != "utf8":
- raise ValueError("clipboard only supports utf-8 encoding")
-
- from pandas.io.clipboard import clipboard_set
-
- if excel is None:
- excel = True
-
- if excel:
- try:
- if sep is None:
- sep = "\t"
- buf = StringIO()
-
- # clipboard_set (pyperclip) expects unicode
- obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)
- text = buf.getvalue()
-
- clipboard_set(text)
- return
- except TypeError:
- warnings.warn(
- "to_clipboard in excel mode requires a single character separator.",
- stacklevel=find_stack_level(),
- )
- elif sep is not None:
- warnings.warn(
- "to_clipboard with excel=False ignores the sep argument.",
- stacklevel=find_stack_level(),
- )
-
- if isinstance(obj, ABCDataFrame):
- # str(df) has various unhelpful defaults, like truncation
- with option_context("display.max_colwidth", None):
- objstr = obj.to_string(**kwargs)
- else:
- objstr = str(obj)
- clipboard_set(objstr)
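These two functions back DataFrame.to_clipboard and pd.read_clipboard. A minimal round-trip sketch, assuming a working system clipboard:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    # Tab-separated output so it pastes cleanly into Excel.
    df.to_clipboard(excel=True, index=False)

    # Parsed back through read_csv; the tab heuristic above picks sep="\t".
    restored = pd.read_clipboard()
    print(restored)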
diff --git a/contrib/python/pandas/py3/pandas/io/common.py b/contrib/python/pandas/py3/pandas/io/common.py
deleted file mode 100644
index 13185603c7b..00000000000
--- a/contrib/python/pandas/py3/pandas/io/common.py
+++ /dev/null
@@ -1,1253 +0,0 @@
-"""Common IO api utilities"""
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-import codecs
-from collections import defaultdict
-import dataclasses
-import functools
-import gzip
-from io import (
- BufferedIOBase,
- BytesIO,
- RawIOBase,
- StringIO,
- TextIOBase,
- TextIOWrapper,
-)
-import mmap
-import os
-from pathlib import Path
-import re
-import tarfile
-from typing import (
- IO,
- Any,
- AnyStr,
- DefaultDict,
- Generic,
- Hashable,
- Literal,
- Mapping,
- Sequence,
- TypeVar,
- cast,
- overload,
-)
-from urllib.parse import (
- urljoin,
- urlparse as parse_url,
- uses_netloc,
- uses_params,
- uses_relative,
-)
-import warnings
-import zipfile
-
-from pandas._typing import (
- BaseBuffer,
- CompressionDict,
- CompressionOptions,
- FilePath,
- ReadBuffer,
- ReadCsvBuffer,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.compat import get_lzma_file
-from pandas.compat._optional import import_optional_dependency
-from pandas.compat.compressors import BZ2File as _BZ2File
-from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- is_bool,
- is_file_like,
- is_integer,
- is_list_like,
-)
-
-from pandas.core.indexes.api import MultiIndex
-from pandas.core.shared_docs import _shared_docs
-
-_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
-_VALID_URLS.discard("")
-_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")
-
-BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)
-
-
-@dataclasses.dataclass
-class IOArgs:
- """
- Return value of io/common.py:_get_filepath_or_buffer.
- """
-
- filepath_or_buffer: str | BaseBuffer
- encoding: str
- mode: str
- compression: CompressionDict
- should_close: bool = False
-
-
-@dataclasses.dataclass
-class IOHandles(Generic[AnyStr]):
- """
- Return value of io/common.py:get_handle
-
- Can be used as a context manager.
-
- This is used to easily close created buffers and to handle corner cases when
- TextIOWrapper is inserted.
-
- handle: The file handle to be used.
- created_handles: All file handles that are created by get_handle
- is_wrapped: Whether a TextIOWrapper needs to be detached.
- """
-
- # handle might not implement the IO-interface
- handle: IO[AnyStr]
- compression: CompressionDict
- created_handles: list[IO[bytes] | IO[str]] = dataclasses.field(default_factory=list)
- is_wrapped: bool = False
-
- def close(self) -> None:
- """
- Close all created buffers.
-
- Note: If a TextIOWrapper was inserted, it is flushed and detached to
- avoid closing the potentially user-created buffer.
- """
- if self.is_wrapped:
- assert isinstance(self.handle, TextIOWrapper)
- self.handle.flush()
- self.handle.detach()
- self.created_handles.remove(self.handle)
- for handle in self.created_handles:
- handle.close()
- self.created_handles = []
- self.is_wrapped = False
-
- def __enter__(self) -> IOHandles[AnyStr]:
- return self
-
- def __exit__(self, *args: Any) -> None:
- self.close()
-
-
-def is_url(url: object) -> bool:
- """
- Check to see if a URL has a valid protocol.
-
- Parameters
- ----------
- url : str or unicode
-
- Returns
- -------
- isurl : bool
- If `url` has a valid protocol return True otherwise False.
- """
- if not isinstance(url, str):
- return False
- return parse_url(url).scheme in _VALID_URLS
-
-
-@overload
-def _expand_user(filepath_or_buffer: str) -> str:
- ...
-
-
-@overload
-def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT:
- ...
-
-
-def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:
- """
- Return the argument with an initial component of ~ or ~user
- replaced by that user's home directory.
-
- Parameters
- ----------
- filepath_or_buffer : object to be converted if possible
-
- Returns
- -------
- expanded_filepath_or_buffer : an expanded filepath or the
- input if not expandable
- """
- if isinstance(filepath_or_buffer, str):
- return os.path.expanduser(filepath_or_buffer)
- return filepath_or_buffer
-
-
-def validate_header_arg(header: object) -> None:
- if header is None:
- return
- if is_integer(header):
- header = cast(int, header)
- if header < 0:
- # GH 27779
- raise ValueError(
- "Passing negative integer to header is invalid. "
- "For no header, use header=None instead"
- )
- return
- if is_list_like(header, allow_sets=False):
- header = cast(Sequence, header)
- if not all(map(is_integer, header)):
- raise ValueError("header must be integer or list of integers")
- if any(i < 0 for i in header):
- raise ValueError("cannot specify multi-index header with negative integers")
- return
- if is_bool(header):
- raise TypeError(
- "Passing a bool to header is invalid. Use header=None for no header or "
- "header=int or list-like of ints to specify "
- "the row(s) making up the column names"
- )
- # GH 16338
- raise ValueError("header must be integer or list of integers")
-
-
-@overload
-def stringify_path(filepath_or_buffer: FilePath, convert_file_like: bool = ...) -> str:
- ...
-
-
-@overload
-def stringify_path(
- filepath_or_buffer: BaseBufferT, convert_file_like: bool = ...
-) -> BaseBufferT:
- ...
-
-
-def stringify_path(
- filepath_or_buffer: FilePath | BaseBufferT,
- convert_file_like: bool = False,
-) -> str | BaseBufferT:
- """
- Attempt to convert a path-like object to a string.
-
- Parameters
- ----------
- filepath_or_buffer : object to be converted
-
- Returns
- -------
- str_filepath_or_buffer : maybe a string version of the object
-
- Notes
- -----
- Objects supporting the fspath protocol (python 3.6+) are coerced
- according to their __fspath__ method.
-
- Any other object is passed through unchanged, which includes bytes,
- strings, buffers, or anything else that's not even path-like.
- """
- if not convert_file_like and is_file_like(filepath_or_buffer):
- # GH 38125: some fsspec objects implement os.PathLike but have already opened a
- # file. This prevents opening the file a second time. infer_compression calls
- # this function with convert_file_like=True to infer the compression.
- return cast(BaseBufferT, filepath_or_buffer)
-
- if isinstance(filepath_or_buffer, os.PathLike):
- filepath_or_buffer = filepath_or_buffer.__fspath__()
- return _expand_user(filepath_or_buffer)
-
-
-def urlopen(*args, **kwargs):
- """
- Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
- the stdlib.
- """
- import urllib.request
-
- return urllib.request.urlopen(*args, **kwargs)
-
-
-def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
- """
- Returns true if the given URL looks like
- something fsspec can handle
- """
- return (
- isinstance(url, str)
- and bool(_RFC_3986_PATTERN.match(url))
- and not url.startswith(("http://", "https://"))
- )
-
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "filepath_or_buffer",
-)
-def _get_filepath_or_buffer(
- filepath_or_buffer: FilePath | BaseBuffer,
- encoding: str = "utf-8",
- compression: CompressionOptions = None,
- mode: str = "r",
- storage_options: StorageOptions = None,
-) -> IOArgs:
- """
- If the filepath_or_buffer is a url, translate and return the buffer.
- Otherwise passthrough.
-
- Parameters
- ----------
- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
- or buffer
- {compression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- encoding : the encoding to use to decode bytes, default is 'utf-8'
- mode : str, optional
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- .. versionchanged:: 1.2.0
-
- Returns the dataclass IOArgs.
- """
- filepath_or_buffer = stringify_path(filepath_or_buffer)
-
- # handle compression dict
- compression_method, compression = get_compression_method(compression)
- compression_method = infer_compression(filepath_or_buffer, compression_method)
-
- # GH21227 internal compression is not used for non-binary handles.
- if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode:
- warnings.warn(
- "compression has no effect when passing a non-binary object as input.",
- RuntimeWarning,
- stacklevel=find_stack_level(),
- )
- compression_method = None
-
- compression = dict(compression, method=compression_method)
-
- # bz2 and xz do not write the byte order mark for utf-16 and utf-32
- # print a warning when writing such files
- if (
- "w" in mode
- and compression_method in ["bz2", "xz"]
- and encoding in ["utf-16", "utf-32"]
- ):
- warnings.warn(
- f"{compression} will not write the byte order mark for {encoding}",
- UnicodeWarning,
- stacklevel=find_stack_level(),
- )
-
- # Use binary mode when converting path-like objects to file-like objects (fsspec)
- # except when text mode is explicitly requested. The original mode is returned if
- # fsspec is not used.
- fsspec_mode = mode
- if "t" not in fsspec_mode and "b" not in fsspec_mode:
- fsspec_mode += "b"
-
- if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
- # TODO: fsspec can also handle HTTP via requests, but leaving this
- # unchanged. using fsspec appears to break the ability to infer if the
- # server responded with gzipped data
- storage_options = storage_options or {}
-
- # waiting until now for importing to match intended lazy logic of
- # urlopen function defined elsewhere in this module
- import urllib.request
-
- # assuming storage_options is to be interpreted as headers
- req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
- with urlopen(req_info) as req:
- content_encoding = req.headers.get("Content-Encoding", None)
- if content_encoding == "gzip":
- # Override compression based on Content-Encoding header
- compression = {"method": "gzip"}
- reader = BytesIO(req.read())
- return IOArgs(
- filepath_or_buffer=reader,
- encoding=encoding,
- compression=compression,
- should_close=True,
- mode=fsspec_mode,
- )
-
- if is_fsspec_url(filepath_or_buffer):
- assert isinstance(
- filepath_or_buffer, str
- ) # just to appease mypy for this branch
- # two special-case s3-like protocols; these have special meaning in Hadoop,
- # but are equivalent to just "s3" from fsspec's point of view
- # cc #11071
- if filepath_or_buffer.startswith("s3a://"):
- filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
- if filepath_or_buffer.startswith("s3n://"):
- filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
- fsspec = import_optional_dependency("fsspec")
-
- # If botocore is installed we fallback to reading with anon=True
- # to allow reads from public buckets
- err_types_to_retry_with_anon: list[Any] = []
- try:
- import_optional_dependency("botocore")
- from botocore.exceptions import (
- ClientError,
- NoCredentialsError,
- )
-
- err_types_to_retry_with_anon = [
- ClientError,
- NoCredentialsError,
- PermissionError,
- ]
- except ImportError:
- pass
-
- try:
- file_obj = fsspec.open(
- filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
- ).open()
- # GH 34626 Reads from Public Buckets without Credentials needs anon=True
- except tuple(err_types_to_retry_with_anon):
- if storage_options is None:
- storage_options = {"anon": True}
- else:
- # don't mutate user input.
- storage_options = dict(storage_options)
- storage_options["anon"] = True
- file_obj = fsspec.open(
- filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
- ).open()
-
- return IOArgs(
- filepath_or_buffer=file_obj,
- encoding=encoding,
- compression=compression,
- should_close=True,
- mode=fsspec_mode,
- )
- elif storage_options:
- raise ValueError(
- "storage_options passed with file object or non-fsspec file path"
- )
-
- if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
- return IOArgs(
- filepath_or_buffer=_expand_user(filepath_or_buffer),
- encoding=encoding,
- compression=compression,
- should_close=False,
- mode=mode,
- )
-
- # is_file_like requires (read | write) & __iter__ but __iter__ is only
- # needed for read_csv(engine=python)
- if not (
- hasattr(filepath_or_buffer, "read") or hasattr(filepath_or_buffer, "write")
- ):
- msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
- raise ValueError(msg)
-
- return IOArgs(
- filepath_or_buffer=filepath_or_buffer,
- encoding=encoding,
- compression=compression,
- should_close=False,
- mode=mode,
- )
-
-
-def file_path_to_url(path: str) -> str:
- """
- converts an absolute native path to a FILE URL.
-
- Parameters
- ----------
- path : a path in native format
-
- Returns
- -------
- a valid FILE URL
- """
- # lazify expensive import (~30ms)
- from urllib.request import pathname2url
-
- return urljoin("file:", pathname2url(path))
-
-
-extension_to_compression = {
- ".tar": "tar",
- ".tar.gz": "tar",
- ".tar.bz2": "tar",
- ".tar.xz": "tar",
- ".gz": "gzip",
- ".bz2": "bz2",
- ".zip": "zip",
- ".xz": "xz",
- ".zst": "zstd",
-}
-_supported_compressions = set(extension_to_compression.values())
-
-
-def get_compression_method(
- compression: CompressionOptions,
-) -> tuple[str | None, CompressionDict]:
- """
- Simplifies a compression argument to a compression method string and
- a mapping containing additional arguments.
-
- Parameters
- ----------
- compression : str or mapping
- If string, specifies the compression method. If mapping, value at key
- 'method' specifies compression method.
-
- Returns
- -------
- tuple of (compression method : Optional[str],
- compression arguments : Dict[str, Any])
-
- Raises
- ------
- ValueError on mapping missing 'method' key
- """
- compression_method: str | None
- if isinstance(compression, Mapping):
- compression_args = dict(compression)
- try:
- compression_method = compression_args.pop("method")
- except KeyError as err:
- raise ValueError("If mapping, compression must have key 'method'") from err
- else:
- compression_args = {}
- compression_method = compression
- return compression_method, compression_args
-
-
-@doc(compression_options=_shared_docs["compression_options"] % "filepath_or_buffer")
-def infer_compression(
- filepath_or_buffer: FilePath | BaseBuffer, compression: str | None
-) -> str | None:
- """
- Get the compression method for filepath_or_buffer. If compression='infer',
- the inferred compression method is returned. Otherwise, the input
- compression method is returned unchanged, unless it's invalid, in which
- case an error is raised.
-
- Parameters
- ----------
- filepath_or_buffer : str or file handle
- File path or object.
- {compression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- Returns
- -------
- string or None
-
- Raises
- ------
- ValueError on invalid compression specified.
- """
- if compression is None:
- return None
-
- # Infer compression
- if compression == "infer":
- # Convert all path types (e.g. pathlib.Path) to strings
- filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True)
- if not isinstance(filepath_or_buffer, str):
- # Cannot infer compression of a buffer, assume no compression
- return None
-
- # Infer compression from the filename/URL extension
- for extension, compression in extension_to_compression.items():
- if filepath_or_buffer.lower().endswith(extension):
- return compression
- return None
-
- # Compression has been specified. Check that it's valid
- if compression in _supported_compressions:
- return compression
-
- valid = ["infer", None] + sorted(_supported_compressions)
- msg = (
- f"Unrecognized compression type: {compression}\n"
- f"Valid compression types are {valid}"
- )
- raise ValueError(msg)
-
-
-def check_parent_directory(path: Path | str) -> None:
- """
- Check if parent directory of a file exists, raise OSError if it does not
-
- Parameters
- ----------
- path: Path or str
- Path to check parent directory of
- """
- parent = Path(path).parent
- if not parent.is_dir():
- raise OSError(rf"Cannot save file into a non-existent directory: '{parent}'")
-
-
-@overload
-def get_handle(
- path_or_buf: FilePath | BaseBuffer,
- mode: str,
- *,
- encoding: str | None = ...,
- compression: CompressionOptions = ...,
- memory_map: bool = ...,
- is_text: Literal[False],
- errors: str | None = ...,
- storage_options: StorageOptions = ...,
-) -> IOHandles[bytes]:
- ...
-
-
-@overload
-def get_handle(
- path_or_buf: FilePath | BaseBuffer,
- mode: str,
- *,
- encoding: str | None = ...,
- compression: CompressionOptions = ...,
- memory_map: bool = ...,
- is_text: Literal[True] = ...,
- errors: str | None = ...,
- storage_options: StorageOptions = ...,
-) -> IOHandles[str]:
- ...
-
-
-@overload
-def get_handle(
- path_or_buf: FilePath | BaseBuffer,
- mode: str,
- *,
- encoding: str | None = ...,
- compression: CompressionOptions = ...,
- memory_map: bool = ...,
- is_text: bool = ...,
- errors: str | None = ...,
- storage_options: StorageOptions = ...,
-) -> IOHandles[str] | IOHandles[bytes]:
- ...
-
-
-@doc(compression_options=_shared_docs["compression_options"] % "path_or_buf")
-def get_handle(
- path_or_buf: FilePath | BaseBuffer,
- mode: str,
- *,
- encoding: str | None = None,
- compression: CompressionOptions = None,
- memory_map: bool = False,
- is_text: bool = True,
- errors: str | None = None,
- storage_options: StorageOptions = None,
-) -> IOHandles[str] | IOHandles[bytes]:
- """
- Get file handle for given path/buffer and mode.
-
- Parameters
- ----------
- path_or_buf : str or file handle
- File path or object.
- mode : str
- Mode to open path_or_buf with.
- encoding : str or None
- Encoding to use.
- {compression_options}
-
- .. versionchanged:: 1.0.0
- May now be a dict with key 'method' as compression mode
- and other keys as compression options if compression
- mode is 'zip'.
-
- .. versionchanged:: 1.1.0
- Passing compression options as keys in dict is now
- supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'.
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- memory_map : bool, default False
- See parsers._parser_params for more information. Only used by read_csv.
- is_text : bool, default True
- Whether the type of the content passed to the file/buffer is string or
- bytes. This is not the same as `"b" not in mode`. If a string content is
- passed to a binary file/buffer, a wrapper is inserted.
- errors : str, default 'strict'
- Specifies how encoding and decoding errors are to be handled.
- See the errors argument for :func:`open` for a full list
- of options.
- storage_options: StorageOptions = None
- Passed to _get_filepath_or_buffer
-
- .. versionchanged:: 1.2.0
-
- Returns the dataclass IOHandles
- """
- # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
- encoding = encoding or "utf-8"
-
- errors = errors or "strict"
-
- # read_csv does not know whether the buffer is opened in binary/text mode
- if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
- mode += "b"
-
- # validate encoding and errors
- codecs.lookup(encoding)
- if isinstance(errors, str):
- codecs.lookup_error(errors)
-
- # open URLs
- ioargs = _get_filepath_or_buffer(
- path_or_buf,
- encoding=encoding,
- compression=compression,
- mode=mode,
- storage_options=storage_options,
- )
-
- handle = ioargs.filepath_or_buffer
- handles: list[BaseBuffer]
-
- # memory mapping needs to be the first step
- # only used for read_csv
- handle, memory_map, handles = _maybe_memory_map(handle, memory_map)
-
- is_path = isinstance(handle, str)
- compression_args = dict(ioargs.compression)
- compression = compression_args.pop("method")
-
- # Only for write methods
- if "r" not in mode and is_path:
- check_parent_directory(str(handle))
-
- if compression:
- if compression != "zstd":
- # compression libraries do not like an explicit text-mode
- ioargs.mode = ioargs.mode.replace("t", "")
- elif compression == "zstd" and "b" not in ioargs.mode:
- # python-zstandard defaults to text mode, but we always expect
- # compression libraries to use binary mode.
- ioargs.mode += "b"
-
- # GZ Compression
- if compression == "gzip":
- if isinstance(handle, str):
- # error: Incompatible types in assignment (expression has type
- # "GzipFile", variable has type "Union[str, BaseBuffer]")
- handle = gzip.GzipFile( # type: ignore[assignment]
- filename=handle,
- mode=ioargs.mode,
- **compression_args,
- )
- else:
- handle = gzip.GzipFile(
- # No overload variant of "GzipFile" matches argument types
- # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
- fileobj=handle, # type: ignore[call-overload]
- mode=ioargs.mode,
- **compression_args,
- )
-
- # BZ Compression
- elif compression == "bz2":
- # Overload of "BZ2File" to handle pickle protocol 5
- # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
- handle = _BZ2File( # type: ignore[call-overload]
- handle,
- mode=ioargs.mode,
- **compression_args,
- )
-
- # ZIP Compression
- elif compression == "zip":
- # error: Argument 1 to "_BytesZipFile" has incompatible type
- # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
- # ReadBuffer[bytes], WriteBuffer[bytes]]"
- handle = _BytesZipFile(
- handle, ioargs.mode, **compression_args # type: ignore[arg-type]
- )
- if handle.buffer.mode == "r":
- handles.append(handle)
- zip_names = handle.buffer.namelist()
- if len(zip_names) == 1:
- handle = handle.buffer.open(zip_names.pop())
- elif not zip_names:
- raise ValueError(f"Zero files found in ZIP file {path_or_buf}")
- else:
- raise ValueError(
- "Multiple files found in ZIP file. "
- f"Only one file per ZIP: {zip_names}"
- )
-
- # TAR Encoding
- elif compression == "tar":
- compression_args.setdefault("mode", ioargs.mode)
- if isinstance(handle, str):
- handle = _BytesTarFile(name=handle, **compression_args)
- else:
- # error: Argument "fileobj" to "_BytesTarFile" has incompatible
- # type "BaseBuffer"; expected "Union[ReadBuffer[bytes],
- # WriteBuffer[bytes], None]"
- handle = _BytesTarFile(
- fileobj=handle, **compression_args # type: ignore[arg-type]
- )
- assert isinstance(handle, _BytesTarFile)
- if "r" in handle.buffer.mode:
- handles.append(handle)
- files = handle.buffer.getnames()
- if len(files) == 1:
- file = handle.buffer.extractfile(files[0])
- assert file is not None
- handle = file
- elif not files:
- raise ValueError(f"Zero files found in TAR archive {path_or_buf}")
- else:
- raise ValueError(
- "Multiple files found in TAR archive. "
- f"Only one file per TAR archive: {files}"
- )
-
- # XZ Compression
- elif compression == "xz":
- # error: Argument 1 to "LZMAFile" has incompatible type "Union[str,
- # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str],
- # PathLike[bytes]], IO[bytes]]]"
- handle = get_lzma_file()(handle, ioargs.mode) # type: ignore[arg-type]
-
- # Zstd Compression
- elif compression == "zstd":
- zstd = import_optional_dependency("zstandard")
- if "r" in ioargs.mode:
- open_args = {"dctx": zstd.ZstdDecompressor(**compression_args)}
- else:
- open_args = {"cctx": zstd.ZstdCompressor(**compression_args)}
- handle = zstd.open(
- handle,
- mode=ioargs.mode,
- **open_args,
- )
-
- # Unrecognized Compression
- else:
- msg = f"Unrecognized compression type: {compression}"
- raise ValueError(msg)
-
- assert not isinstance(handle, str)
- handles.append(handle)
-
- elif isinstance(handle, str):
- # Check whether the filename is to be opened in binary mode.
- # Binary mode does not support 'encoding' and 'newline'.
- if ioargs.encoding and "b" not in ioargs.mode:
- # Encoding
- handle = open(
- handle,
- ioargs.mode,
- encoding=ioargs.encoding,
- errors=errors,
- newline="",
- )
- else:
- # Binary mode
- handle = open(handle, ioargs.mode)
- handles.append(handle)
-
- # Convert BytesIO or file objects passed with an encoding
- is_wrapped = False
- if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
- # not added to handles as it does not open/buffer resources
- handle = _BytesIOWrapper(
- handle,
- encoding=ioargs.encoding,
- )
- elif is_text and (
- compression or memory_map or _is_binary_mode(handle, ioargs.mode)
- ):
- if (
- not hasattr(handle, "readable")
- or not hasattr(handle, "writable")
- or not hasattr(handle, "seekable")
- ):
- handle = _IOWrapper(handle)
- # error: Argument 1 to "TextIOWrapper" has incompatible type
- # "_IOWrapper"; expected "IO[bytes]"
- handle = TextIOWrapper(
- handle, # type: ignore[arg-type]
- encoding=ioargs.encoding,
- errors=errors,
- newline="",
- )
- handles.append(handle)
- # only marked as wrapped when the caller provided a handle
- is_wrapped = not (
- isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
- )
-
- if "r" in ioargs.mode and not hasattr(handle, "read"):
- raise TypeError(
- "Expected file path name or file-like object, "
- f"got {type(ioargs.filepath_or_buffer)} type"
- )
-
- handles.reverse() # close the most recently added buffer first
- if ioargs.should_close:
- assert not isinstance(ioargs.filepath_or_buffer, str)
- handles.append(ioargs.filepath_or_buffer)
-
- return IOHandles(
- # error: Argument "handle" to "IOHandles" has incompatible type
- # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
- # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
- handle=handle, # type: ignore[arg-type]
- # error: Argument "created_handles" to "IOHandles" has incompatible type
- # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
- created_handles=handles, # type: ignore[arg-type]
- is_wrapped=is_wrapped,
- compression=ioargs.compression,
- )
-
-
-# error: Definition of "__enter__" in base class "IOBase" is incompatible
-# with definition in base class "BinaryIO"
-class _BufferedWriter(BytesIO, ABC): # type: ignore[misc]
- """
- Some objects do not support multiple .write() calls (TarFile and ZipFile).
- This wrapper writes to the underlying buffer on close.
- """
-
- @abstractmethod
- def write_to_buffer(self) -> None:
- ...
-
- def close(self) -> None:
- if self.closed:
- # already closed
- return
- if self.getvalue():
- # write to buffer
- self.seek(0)
- # error: "_BufferedWriter" has no attribute "buffer"
- with self.buffer: # type: ignore[attr-defined]
- self.write_to_buffer()
- else:
- # error: "_BufferedWriter" has no attribute "buffer"
- self.buffer.close() # type: ignore[attr-defined]
- super().close()
-
-
-class _BytesTarFile(_BufferedWriter):
- def __init__(
- self,
- name: str | None = None,
- mode: Literal["r", "a", "w", "x"] = "r",
- fileobj: ReadBuffer[bytes] | WriteBuffer[bytes] | None = None,
- archive_name: str | None = None,
- **kwargs,
- ) -> None:
- super().__init__()
- self.archive_name = archive_name
- self.name = name
- # error: Argument "fileobj" to "open" of "TarFile" has incompatible
- # type "Union[ReadBuffer[bytes], WriteBuffer[bytes], None]"; expected
- # "Optional[IO[bytes]]"
- self.buffer = tarfile.TarFile.open(
- name=name,
- mode=self.extend_mode(mode),
- fileobj=fileobj, # type: ignore[arg-type]
- **kwargs,
- )
-
- def extend_mode(self, mode: str) -> str:
- mode = mode.replace("b", "")
- if mode != "w":
- return mode
- if self.name is not None:
- suffix = Path(self.name).suffix
- if suffix in (".gz", ".xz", ".bz2"):
- mode = f"{mode}:{suffix[1:]}"
- return mode
-
- def infer_filename(self) -> str | None:
- """
- If an explicit archive_name is not given, we still want the file inside the tar
- archive not to be named something.tar, because that causes confusion (GH39465).
- """
- if self.name is None:
- return None
-
- filename = Path(self.name)
- if filename.suffix == ".tar":
- return filename.with_suffix("").name
- elif filename.suffix in (".tar.gz", ".tar.bz2", ".tar.xz"):
- return filename.with_suffix("").with_suffix("").name
- return filename.name
-
- def write_to_buffer(self) -> None:
- # TarFile needs a non-empty string
- archive_name = self.archive_name or self.infer_filename() or "tar"
- tarinfo = tarfile.TarInfo(name=archive_name)
- tarinfo.size = len(self.getvalue())
- self.buffer.addfile(tarinfo, self)
-
-
-class _BytesZipFile(_BufferedWriter):
- def __init__(
- self,
- file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
- mode: str,
- archive_name: str | None = None,
- **kwargs,
- ) -> None:
- super().__init__()
- mode = mode.replace("b", "")
- self.archive_name = archive_name
-
- kwargs.setdefault("compression", zipfile.ZIP_DEFLATED)
- # error: Argument 1 to "ZipFile" has incompatible type "Union[
- # Union[str, PathLike[str]], ReadBuffer[bytes], WriteBuffer[bytes]]";
- # expected "Union[Union[str, PathLike[str]], IO[bytes]]"
- self.buffer = zipfile.ZipFile(file, mode, **kwargs) # type: ignore[arg-type]
-
- def infer_filename(self) -> str | None:
- """
- If an explicit archive_name is not given, we still want the file inside the zip
- file not to be named something.zip, because that causes confusion (GH39465).
- """
- if isinstance(self.buffer.filename, (os.PathLike, str)):
- filename = Path(self.buffer.filename)
- if filename.suffix == ".zip":
- return filename.with_suffix("").name
- return filename.name
- return None
-
- def write_to_buffer(self) -> None:
- # ZipFile needs a non-empty string
- archive_name = self.archive_name or self.infer_filename() or "zip"
- self.buffer.writestr(archive_name, self.getvalue())
-
-
-class _IOWrapper:
- # TextIOWrapper is overly strict: it requests that the buffer be seekable, readable,
- # and writable. If we have a read-only buffer, we shouldn't need writable and vice
- # versa. Some buffers are seek/read/writ-able but do not have the "-able"
- # methods, e.g., tempfile.SpooledTemporaryFile.
- # If a buffer does not have the above "-able" methods, we simply assume it is
- # seek/read/writ-able.
- def __init__(self, buffer: BaseBuffer) -> None:
- self.buffer = buffer
-
- def __getattr__(self, name: str):
- return getattr(self.buffer, name)
-
- def readable(self) -> bool:
- if hasattr(self.buffer, "readable"):
- return self.buffer.readable()
- return True
-
- def seekable(self) -> bool:
- if hasattr(self.buffer, "seekable"):
- return self.buffer.seekable()
- return True
-
- def writable(self) -> bool:
- if hasattr(self.buffer, "writable"):
- return self.buffer.writable()
- return True
-
-
-class _BytesIOWrapper:
- # Wraps a StringIO/TextIOBase buffer so that bytes can be read from it
- # Created for compat with pyarrow read_csv
- def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8") -> None:
- self.buffer = buffer
- self.encoding = encoding
- # Because a character can be represented by more than 1 byte,
- # it is possible that reading will produce more bytes than n
- # We store the extra bytes in this overflow variable, and append the
- # overflow to the front of the bytestring the next time reading is performed
- self.overflow = b""
-
- def __getattr__(self, attr: str):
- return getattr(self.buffer, attr)
-
- def read(self, n: int | None = -1) -> bytes:
- assert self.buffer is not None
- bytestring = self.buffer.read(n).encode(self.encoding)
- # When n is -1 or greater than the remaining bytes: read the entire file/rest of file
- combined_bytestring = self.overflow + bytestring
- if n is None or n < 0 or n >= len(combined_bytestring):
- self.overflow = b""
- return combined_bytestring
- else:
- to_return = combined_bytestring[:n]
- self.overflow = combined_bytestring[n:]
- return to_return
-
-
-def _maybe_memory_map(
- handle: str | BaseBuffer, memory_map: bool
-) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]:
- """Try to memory map file/buffer."""
- handles: list[BaseBuffer] = []
- memory_map &= hasattr(handle, "fileno") or isinstance(handle, str)
- if not memory_map:
- return handle, memory_map, handles
-
- # mmap is used only by read_csv
- handle = cast(ReadCsvBuffer, handle)
-
- # need to open the file first
- if isinstance(handle, str):
- handle = open(handle, "rb")
- handles.append(handle)
-
- try:
- # open the mmap and add the *-able methods
- # error: Argument 1 to "_IOWrapper" has incompatible type "mmap";
- # expected "BaseBuffer"
- wrapped = _IOWrapper(
- mmap.mmap(
- handle.fileno(), 0, access=mmap.ACCESS_READ # type: ignore[arg-type]
- )
- )
- finally:
- for handle in reversed(handles):
- # error: "BaseBuffer" has no attribute "close"
- handle.close() # type: ignore[attr-defined]
-
- return wrapped, memory_map, [wrapped]
-
-
-def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool:
- """Test whether file exists."""
- exists = False
- filepath_or_buffer = stringify_path(filepath_or_buffer)
- if not isinstance(filepath_or_buffer, str):
- return exists
- try:
- exists = os.path.exists(filepath_or_buffer)
- # gh-5874: if the filepath is too long, os.path.exists will raise here
- except (TypeError, ValueError):
- pass
- return exists
-
-
-def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool:
- """Whether the handle is opened in binary mode"""
- # specified by user
- if "t" in mode or "b" in mode:
- return "b" in mode
-
- # exceptions
- text_classes = (
- # classes that expect string but have 'b' in mode
- codecs.StreamWriter,
- codecs.StreamReader,
- codecs.StreamReaderWriter,
- )
- if issubclass(type(handle), text_classes):
- return False
-
- return isinstance(handle, _get_binary_io_classes()) or "b" in getattr(
- handle, "mode", mode
- )
-
-
-@functools.lru_cache
-def _get_binary_io_classes() -> tuple[type, ...]:
- """IO classes that that expect bytes"""
- binary_classes: tuple[type, ...] = (BufferedIOBase, RawIOBase)
-
- # python-zstandard doesn't use any of the builtin base classes; instead we
- # have to use the `zstd.ZstdDecompressionReader` class for isinstance checks.
- # Unfortunately `zstd.ZstdDecompressionReader` isn't exposed by python-zstandard
- # so we have to get it from a `zstd.ZstdDecompressor` instance.
- # See also https://github.com/indygreg/python-zstandard/pull/165.
- zstd = import_optional_dependency("zstandard", errors="ignore")
- if zstd is not None:
- with zstd.ZstdDecompressor().stream_reader(b"") as reader:
- binary_classes += (type(reader),)
-
- return binary_classes
-
-
-def is_potential_multi_index(
- columns: Sequence[Hashable] | MultiIndex,
- index_col: bool | Sequence[int] | None = None,
-) -> bool:
- """
- Check whether or not the `columns` parameter
- could be converted into a MultiIndex.
-
- Parameters
- ----------
- columns : array-like
- Object which may or may not be convertible into a MultiIndex
- index_col : None, bool or list, optional
- Column or columns to use as the (possibly hierarchical) index
-
- Returns
- -------
- bool : Whether or not columns could become a MultiIndex
- """
- if index_col is None or isinstance(index_col, bool):
- index_col = []
-
- return bool(
- len(columns)
- and not isinstance(columns, MultiIndex)
- and all(isinstance(c, tuple) for c in columns if c not in list(index_col))
- )
-
-
-def dedup_names(
- names: Sequence[Hashable], is_potential_multiindex: bool
-) -> Sequence[Hashable]:
- """
- Rename column names if duplicates exist.
-
- Currently the renaming is done by appending a period and an auto-incrementing number,
- but a custom pattern may be supported in the future.
-
- Examples
- --------
- >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
- ['x', 'y', 'x.1', 'x.2']
- """
- names = list(names) # so we can index
- counts: DefaultDict[Hashable, int] = defaultdict(int)
-
- for i, col in enumerate(names):
- cur_count = counts[col]
-
- while cur_count > 0:
- counts[col] = cur_count + 1
-
- if is_potential_multiindex:
- # for mypy
- assert isinstance(col, tuple)
- col = col[:-1] + (f"{col[-1]}.{cur_count}",)
- else:
- col = f"{col}.{cur_count}"
- cur_count = counts[col]
-
- names[i] = col
- counts[col] = cur_count + 1
-
- return names
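Editorial note on the helper above: _BytesIOWrapper holds "overflow" bytes back so that byte-sized reads stay consistent when a character encodes to more than one byte. A minimal sketch of that behaviour, assuming an upstream pandas installation that still ships this private helper (shown purely for illustration, not a public API):

    from io import StringIO
    from pandas.io.common import _BytesIOWrapper  # private helper, defined in the hunk above

    buf = StringIO("héllo")                       # 'é' encodes to two bytes in UTF-8
    wrapper = _BytesIOWrapper(buf, encoding="utf-8")
    assert wrapper.read(2) == b"h\xc3"            # second byte of 'é' is held back...
    assert wrapper.read(2) == b"\xa9l"            # ...and prepended to the next read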
diff --git a/contrib/python/pandas/py3/pandas/io/excel/__init__.py b/contrib/python/pandas/py3/pandas/io/excel/__init__.py
deleted file mode 100644
index 275cbf0148f..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from pandas.io.excel._base import (
- ExcelFile,
- ExcelWriter,
- read_excel,
-)
-from pandas.io.excel._odswriter import ODSWriter as _ODSWriter
-from pandas.io.excel._openpyxl import OpenpyxlWriter as _OpenpyxlWriter
-from pandas.io.excel._util import register_writer
-from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter
-
-__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
-
-
-register_writer(_OpenpyxlWriter)
-
-register_writer(_XlsxWriter)
-
-
-register_writer(_ODSWriter)
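The deleted __init__ wires the bundled writer engines into register_writer, which is what lets ExcelWriter resolve an engine by name later. A minimal usage sketch of selecting one of those registered engines explicitly (assumes pandas and openpyxl are installed; the output file name is illustrative):

    import pandas as pd

    # "openpyxl" resolves only because register_writer(...) registered it at import time
    with pd.ExcelWriter("out.xlsx", engine="openpyxl") as writer:
        pd.DataFrame({"a": [1, 2]}).to_excel(writer, sheet_name="Sheet1")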
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_base.py b/contrib/python/pandas/py3/pandas/io/excel/_base.py
deleted file mode 100644
index 6d9eb2c1ee5..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_base.py
+++ /dev/null
@@ -1,1594 +0,0 @@
-from __future__ import annotations
-
-import abc
-import datetime
-from functools import partial
-from io import BytesIO
-import os
-from textwrap import fill
-from types import TracebackType
-from typing import (
- IO,
- Any,
- Callable,
- Hashable,
- Iterable,
- List,
- Literal,
- Mapping,
- Sequence,
- Union,
- cast,
- overload,
-)
-import zipfile
-
-from pandas._config import config
-
-from pandas._libs import lib
-from pandas._libs.parsers import STR_NA_VALUES
-from pandas._typing import (
- DtypeArg,
- DtypeBackend,
- FilePath,
- IntStrT,
- ReadBuffer,
- StorageOptions,
- WriteExcelBuffer,
-)
-from pandas.compat._optional import (
- get_version,
- import_optional_dependency,
-)
-from pandas.errors import EmptyDataError
-from pandas.util._decorators import (
- Appender,
- doc,
-)
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.common import (
- is_bool,
- is_float,
- is_integer,
- is_list_like,
-)
-
-from pandas.core.frame import DataFrame
-from pandas.core.shared_docs import _shared_docs
-from pandas.util.version import Version
-
-from pandas.io.common import (
- IOHandles,
- get_handle,
- stringify_path,
- validate_header_arg,
-)
-from pandas.io.excel._util import (
- fill_mi_header,
- get_default_engine,
- get_writer,
- maybe_convert_usecols,
- pop_header_name,
-)
-from pandas.io.parsers import TextParser
-from pandas.io.parsers.readers import validate_integer
-
-_read_excel_doc = (
- """
-Read an Excel file into a pandas DataFrame.
-
-Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions
-read from a local filesystem or URL. Supports an option to read
-a single sheet or a list of sheets.
-
-Parameters
-----------
-io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be: ``file://localhost/path/to/table.xlsx``.
-
- If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
-sheet_name : str, int, list, or None, default 0
- Strings are used for sheet names. Integers are used in zero-indexed
- sheet positions (chart sheets do not count as a sheet position).
- Lists of strings/integers are used to request multiple sheets.
- Specify None to get all worksheets.
-
- Available cases:
-
- * Defaults to ``0``: 1st sheet as a `DataFrame`
- * ``1``: 2nd sheet as a `DataFrame`
- * ``"Sheet1"``: Load sheet with name "Sheet1"
- * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5"
- as a dict of `DataFrame`
- * None: All worksheets.
-
-header : int, list of int, default 0
- Row (0-indexed) to use for the column labels of the parsed
- DataFrame. If a list of integers is passed those row positions will
- be combined into a ``MultiIndex``. Use None if there is no header.
-names : array-like, default None
- List of column names to use. If file contains no header row,
- then you should explicitly pass header=None.
-index_col : int, list of int, default None
- Column (0-indexed) to use as the row labels of the DataFrame.
- Pass None if there is no such column. If a list is passed,
- those columns will be combined into a ``MultiIndex``. If a
- subset of data is selected with ``usecols``, index_col
- is based on the subset.
-
- Missing values will be forward filled to allow roundtripping with
- ``to_excel`` for ``merged_cells=True``. To avoid forward filling the
- missing values use ``set_index`` after reading the data instead of
- ``index_col``.
-usecols : str, list-like, or callable, default None
- * If None, then parse all columns.
- * If str, then indicates comma separated list of Excel column letters
- and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of
- both sides.
- * If list of int, then indicates list of column numbers to be parsed
- (0-indexed).
- * If list of string, then indicates list of column names to be parsed.
- * If callable, then evaluate each column name against it and parse the
- column if the callable returns ``True``.
-
- Returns a subset of the columns according to behavior above.
-dtype : Type name or dict of column -> type, default None
- Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
- Use `object` to preserve data as stored in Excel and not interpret dtype.
- If converters are specified, they will be applied INSTEAD
- of dtype conversion.
-engine : str, default None
- If io is not a buffer or path, this must be set to identify io.
- Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
- Engine compatibility :
-
- - "xlrd" supports old-style Excel files (.xls).
- - "openpyxl" supports newer Excel file formats.
- - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
- - "pyxlsb" supports Binary Excel files.
-
- .. versionchanged:: 1.2.0
- The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
- now only supports old-style ``.xls`` files.
- When ``engine=None``, the following logic will be
- used to determine the engine:
-
- - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
- then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- - Otherwise if ``path_or_buffer`` is an xls format,
- ``xlrd`` will be used.
- - Otherwise if ``path_or_buffer`` is in xlsb format,
- ``pyxlsb`` will be used.
-
- .. versionadded:: 1.3.0
- - Otherwise ``openpyxl`` will be used.
-
- .. versionchanged:: 1.3.0
-
-converters : dict, default None
- Dict of functions for converting values in certain columns. Keys can
- either be integers or column labels, values are functions that take one
- input argument, the Excel cell content, and return the transformed
- content.
-true_values : list, default None
- Values to consider as True.
-false_values : list, default None
- Values to consider as False.
-skiprows : list-like, int, or callable, optional
- Line numbers to skip (0-indexed) or number of lines to skip (int) at the
- start of the file. If callable, the callable function will be evaluated
- against the row indices, returning True if the row should be skipped and
- False otherwise. An example of a valid callable argument would be ``lambda
- x: x in [0, 2]``.
-nrows : int, default None
- Number of rows to parse.
-na_values : scalar, str, list-like, or dict, default None
- Additional strings to recognize as NA/NaN. If dict passed, specific
- per-column NA values. By default the following values are interpreted
- as NaN: '"""
- + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
- + """'.
-keep_default_na : bool, default True
- Whether or not to include the default NaN values when parsing the data.
- Depending on whether `na_values` is passed in, the behavior is as follows:
-
- * If `keep_default_na` is True, and `na_values` are specified, `na_values`
- is appended to the default NaN values used for parsing.
- * If `keep_default_na` is True, and `na_values` are not specified, only
- the default NaN values are used for parsing.
- * If `keep_default_na` is False, and `na_values` are specified, only
- the NaN values specified `na_values` are used for parsing.
- * If `keep_default_na` is False, and `na_values` are not specified, no
- strings will be parsed as NaN.
-
- Note that if `na_filter` is passed in as False, the `keep_default_na` and
- `na_values` parameters will be ignored.
-na_filter : bool, default True
- Detect missing value markers (empty strings and the value of na_values). In
- data without any NAs, passing na_filter=False can improve the performance
- of reading a large file.
-verbose : bool, default False
- Indicate number of NA values placed in non-numeric columns.
-parse_dates : bool, list-like, or dict, default False
- The behavior is as follows:
-
- * bool. If True -> try parsing the index.
- * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
- each as a separate date column.
- * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
- a single date column.
- * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
- result 'foo'
-
- If a column or index contains an unparsable date, the entire column or
- index will be returned unaltered as an object data type. If you don't want to
- parse some cells as dates, just change their type in Excel to "Text".
- For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``.
-
- Note: A fast-path exists for iso8601-formatted dates.
-date_parser : function, optional
- Function to use for converting a sequence of string columns to an array of
- datetime instances. The default uses ``dateutil.parser.parser`` to do the
- conversion. Pandas will try to call `date_parser` in three different ways,
- advancing to the next if an exception occurs: 1) Pass one or more arrays
- (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
- string values from the columns defined by `parse_dates` into a single array
- and pass that; and 3) call `date_parser` once for each row using one or
- more strings (corresponding to the columns defined by `parse_dates`) as
- arguments.
-
- .. deprecated:: 2.0.0
- Use ``date_format`` instead, or read in as ``object`` and then apply
- :func:`to_datetime` as-needed.
-date_format : str or dict of column -> format, default ``None``
- If used in conjunction with ``parse_dates``, will parse dates according to this
- format. For anything more complex,
- please read in as ``object`` and then apply :func:`to_datetime` as-needed.
-
- .. versionadded:: 2.0.0
-thousands : str, default None
- Thousands separator for parsing string columns to numeric. Note that
- this parameter is only necessary for columns stored as TEXT in Excel;
- any numeric columns will automatically be parsed, regardless of display
- format.
-decimal : str, default '.'
- Character to recognize as decimal point for parsing string columns to numeric.
- Note that this parameter is only necessary for columns stored as TEXT in Excel;
- any numeric columns will automatically be parsed, regardless of display
- format (e.g. use ',' for European data).
-
- .. versionadded:: 1.4.0
-
-comment : str, default None
- Comments out remainder of line. Pass a character or characters to this
- argument to indicate comments in the input file. Any data between the
- comment string and the end of the current line is ignored.
-skipfooter : int, default 0
- Rows at the end to skip (0-indexed).
-{storage_options}
-
- .. versionadded:: 1.2.0
-
-dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays. Nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, and pyarrow is used for all
- dtypes if "pyarrow" is set.
-
- The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
-Returns
--------
-DataFrame or dict of DataFrames
- DataFrame from the passed in Excel file. See notes in sheet_name
- argument for more information on when a dict of DataFrames is returned.
-
-See Also
---------
-DataFrame.to_excel : Write DataFrame to an Excel file.
-DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
-read_csv : Read a comma-separated values (csv) file into DataFrame.
-read_fwf : Read a table of fixed-width formatted lines into DataFrame.
-
-Examples
---------
-The file can be read using the file name as string or an open file object:
-
->>> pd.read_excel('tmp.xlsx', index_col=0) # doctest: +SKIP
- Name Value
-0 string1 1
-1 string2 2
-2 #Comment 3
-
->>> pd.read_excel(open('tmp.xlsx', 'rb'),
-... sheet_name='Sheet3') # doctest: +SKIP
- Unnamed: 0 Name Value
-0 0 string1 1
-1 1 string2 2
-2 2 #Comment 3
-
-Index and header can be specified via the `index_col` and `header` arguments
-
->>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP
- 0 1 2
-0 NaN Name Value
-1 0.0 string1 1
-2 1.0 string2 2
-3 2.0 #Comment 3
-
-Column types are inferred but can be explicitly specified
-
->>> pd.read_excel('tmp.xlsx', index_col=0,
-... dtype={{'Name': str, 'Value': float}}) # doctest: +SKIP
- Name Value
-0 string1 1.0
-1 string2 2.0
-2 #Comment 3.0
-
-True, False, and NA values, and thousands separators have defaults,
-but can be explicitly specified, too. Supply the values you would like
-as strings or lists of strings!
-
->>> pd.read_excel('tmp.xlsx', index_col=0,
-... na_values=['string1', 'string2']) # doctest: +SKIP
- Name Value
-0 NaN 1
-1 NaN 2
-2 #Comment 3
-
-Comment lines in the excel input file can be skipped using the `comment` kwarg
-
->>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP
- Name Value
-0 string1 1.0
-1 string2 2.0
-2 None NaN
-"""
-)
-
-
-@overload
-def read_excel(
- io,
- # sheet name is str or int -> DataFrame
- sheet_name: str | int = ...,
- *,
- header: int | Sequence[int] | None = ...,
- names: list[str] | None = ...,
- index_col: int | Sequence[int] | None = ...,
- usecols: int
- | str
- | Sequence[int]
- | Sequence[str]
- | Callable[[str], bool]
- | None = ...,
- dtype: DtypeArg | None = ...,
- engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
- converters: dict[str, Callable] | dict[int, Callable] | None = ...,
- true_values: Iterable[Hashable] | None = ...,
- false_values: Iterable[Hashable] | None = ...,
- skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- parse_dates: list | dict | bool = ...,
- date_parser: Callable | lib.NoDefault = ...,
- date_format: dict[Hashable, str] | str | None = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- comment: str | None = ...,
- skipfooter: int = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> DataFrame:
- ...
-
-
-@overload
-def read_excel(
- io,
- # sheet name is list or None -> dict[IntStrT, DataFrame]
- sheet_name: list[IntStrT] | None,
- *,
- header: int | Sequence[int] | None = ...,
- names: list[str] | None = ...,
- index_col: int | Sequence[int] | None = ...,
- usecols: int
- | str
- | Sequence[int]
- | Sequence[str]
- | Callable[[str], bool]
- | None = ...,
- dtype: DtypeArg | None = ...,
- engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = ...,
- converters: dict[str, Callable] | dict[int, Callable] | None = ...,
- true_values: Iterable[Hashable] | None = ...,
- false_values: Iterable[Hashable] | None = ...,
- skiprows: Sequence[int] | int | Callable[[int], object] | None = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- parse_dates: list | dict | bool = ...,
- date_parser: Callable | lib.NoDefault = ...,
- date_format: dict[Hashable, str] | str | None = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- comment: str | None = ...,
- skipfooter: int = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> dict[IntStrT, DataFrame]:
- ...
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-@Appender(_read_excel_doc)
-def read_excel(
- io,
- sheet_name: str | int | list[IntStrT] | None = 0,
- *,
- header: int | Sequence[int] | None = 0,
- names: list[str] | None = None,
- index_col: int | Sequence[int] | None = None,
- usecols: int
- | str
- | Sequence[int]
- | Sequence[str]
- | Callable[[str], bool]
- | None = None,
- dtype: DtypeArg | None = None,
- engine: Literal["xlrd", "openpyxl", "odf", "pyxlsb"] | None = None,
- converters: dict[str, Callable] | dict[int, Callable] | None = None,
- true_values: Iterable[Hashable] | None = None,
- false_values: Iterable[Hashable] | None = None,
- skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
- nrows: int | None = None,
- na_values=None,
- keep_default_na: bool = True,
- na_filter: bool = True,
- verbose: bool = False,
- parse_dates: list | dict | bool = False,
- date_parser: Callable | lib.NoDefault = lib.no_default,
- date_format: dict[Hashable, str] | str | None = None,
- thousands: str | None = None,
- decimal: str = ".",
- comment: str | None = None,
- skipfooter: int = 0,
- storage_options: StorageOptions = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame | dict[IntStrT, DataFrame]:
- check_dtype_backend(dtype_backend)
-
- should_close = False
- if not isinstance(io, ExcelFile):
- should_close = True
- io = ExcelFile(io, storage_options=storage_options, engine=engine)
- elif engine and engine != io.engine:
- raise ValueError(
- "Engine should not be specified when passing "
- "an ExcelFile - ExcelFile already has the engine set"
- )
-
- try:
- data = io.parse(
- sheet_name=sheet_name,
- header=header,
- names=names,
- index_col=index_col,
- usecols=usecols,
- dtype=dtype,
- converters=converters,
- true_values=true_values,
- false_values=false_values,
- skiprows=skiprows,
- nrows=nrows,
- na_values=na_values,
- keep_default_na=keep_default_na,
- na_filter=na_filter,
- verbose=verbose,
- parse_dates=parse_dates,
- date_parser=date_parser,
- date_format=date_format,
- thousands=thousands,
- decimal=decimal,
- comment=comment,
- skipfooter=skipfooter,
- dtype_backend=dtype_backend,
- )
- finally:
- # make sure to close opened file handles
- if should_close:
- io.close()
- return data
-
-
-class BaseExcelReader(metaclass=abc.ABCMeta):
- def __init__(
- self, filepath_or_buffer, storage_options: StorageOptions = None
- ) -> None:
- # First argument can also be bytes, so create a buffer
- if isinstance(filepath_or_buffer, bytes):
- filepath_or_buffer = BytesIO(filepath_or_buffer)
-
- self.handles = IOHandles(
- handle=filepath_or_buffer, compression={"method": None}
- )
- if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
- self.handles = get_handle(
- filepath_or_buffer, "rb", storage_options=storage_options, is_text=False
- )
-
- if isinstance(self.handles.handle, self._workbook_class):
- self.book = self.handles.handle
- elif hasattr(self.handles.handle, "read"):
- # N.B. xlrd.Book has a read attribute too
- self.handles.handle.seek(0)
- try:
- self.book = self.load_workbook(self.handles.handle)
- except Exception:
- self.close()
- raise
- else:
- raise ValueError(
- "Must explicitly set engine if not passing in buffer or path for io."
- )
-
- @property
- @abc.abstractmethod
- def _workbook_class(self):
- pass
-
- @abc.abstractmethod
- def load_workbook(self, filepath_or_buffer):
- pass
-
- def close(self) -> None:
- if hasattr(self, "book"):
- if hasattr(self.book, "close"):
- # pyxlsb: opens a TemporaryFile
- # openpyxl: https://stackoverflow.com/questions/31416842/
- # openpyxl-does-not-close-excel-workbook-in-read-only-mode
- self.book.close()
- elif hasattr(self.book, "release_resources"):
- # xlrd
- # https://github.com/python-excel/xlrd/blob/2.0.1/xlrd/book.py#L548
- self.book.release_resources()
- self.handles.close()
-
- @property
- @abc.abstractmethod
- def sheet_names(self) -> list[str]:
- pass
-
- @abc.abstractmethod
- def get_sheet_by_name(self, name: str):
- pass
-
- @abc.abstractmethod
- def get_sheet_by_index(self, index: int):
- pass
-
- @abc.abstractmethod
- def get_sheet_data(self, sheet, rows: int | None = None):
- pass
-
- def raise_if_bad_sheet_by_index(self, index: int) -> None:
- n_sheets = len(self.sheet_names)
- if index >= n_sheets:
- raise ValueError(
- f"Worksheet index {index} is invalid, {n_sheets} worksheets found"
- )
-
- def raise_if_bad_sheet_by_name(self, name: str) -> None:
- if name not in self.sheet_names:
- raise ValueError(f"Worksheet named '{name}' not found")
-
- def _check_skiprows_func(
- self,
- skiprows: Callable,
- rows_to_use: int,
- ) -> int:
- """
- Determine how many file rows are required to obtain `nrows` data
- rows when `skiprows` is a function.
-
- Parameters
- ----------
- skiprows : function
- The function passed to read_excel by the user.
- rows_to_use : int
- The number of rows that will be needed for the header and
- the data.
-
- Returns
- -------
- int
- """
- i = 0
- rows_used_so_far = 0
- while rows_used_so_far < rows_to_use:
- if not skiprows(i):
- rows_used_so_far += 1
- i += 1
- return i
-
- def _calc_rows(
- self,
- header: int | Sequence[int] | None,
- index_col: int | Sequence[int] | None,
- skiprows: Sequence[int] | int | Callable[[int], object] | None,
- nrows: int | None,
- ) -> int | None:
- """
- If nrows is specified, find the number of rows needed from the
- file, otherwise return None.
-
- Parameters
- ----------
- header : int, list of int, or None
- See read_excel docstring.
- index_col : int, list of int, or None
- See read_excel docstring.
- skiprows : list-like, int, callable, or None
- See read_excel docstring.
- nrows : int or None
- See read_excel docstring.
-
- Returns
- -------
- int or None
- """
- if nrows is None:
- return None
- if header is None:
- header_rows = 1
- elif is_integer(header):
- header = cast(int, header)
- header_rows = 1 + header
- else:
- header = cast(Sequence, header)
- header_rows = 1 + header[-1]
- # If there is a MultiIndex header and an index then there is also
- # a row containing just the index name(s)
- if is_list_like(header) and index_col is not None:
- header = cast(Sequence, header)
- if len(header) > 1:
- header_rows += 1
- if skiprows is None:
- return header_rows + nrows
- if is_integer(skiprows):
- skiprows = cast(int, skiprows)
- return header_rows + nrows + skiprows
- if is_list_like(skiprows):
-
- def f(skiprows: Sequence, x: int) -> bool:
- return x in skiprows
-
- skiprows = cast(Sequence, skiprows)
- return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
- if callable(skiprows):
- return self._check_skiprows_func(
- skiprows,
- header_rows + nrows,
- )
- # else unexpected skiprows type: read_excel will not optimize
- # the number of rows read from file
- return None
-
- def parse(
- self,
- sheet_name: str | int | list[int] | list[str] | None = 0,
- header: int | Sequence[int] | None = 0,
- names=None,
- index_col: int | Sequence[int] | None = None,
- usecols=None,
- dtype: DtypeArg | None = None,
- true_values: Iterable[Hashable] | None = None,
- false_values: Iterable[Hashable] | None = None,
- skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
- nrows: int | None = None,
- na_values=None,
- verbose: bool = False,
- parse_dates: list | dict | bool = False,
- date_parser: Callable | lib.NoDefault = lib.no_default,
- date_format: dict[Hashable, str] | str | None = None,
- thousands: str | None = None,
- decimal: str = ".",
- comment: str | None = None,
- skipfooter: int = 0,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwds,
- ):
- validate_header_arg(header)
- validate_integer("nrows", nrows)
-
- ret_dict = False
-
- # Keep sheetname to maintain backwards compatibility.
- sheets: list[int] | list[str]
- if isinstance(sheet_name, list):
- sheets = sheet_name
- ret_dict = True
- elif sheet_name is None:
- sheets = self.sheet_names
- ret_dict = True
- elif isinstance(sheet_name, str):
- sheets = [sheet_name]
- else:
- sheets = [sheet_name]
-
- # handle same-type duplicates.
- sheets = cast(Union[List[int], List[str]], list(dict.fromkeys(sheets).keys()))
-
- output = {}
-
- last_sheetname = None
- for asheetname in sheets:
- last_sheetname = asheetname
- if verbose:
- print(f"Reading sheet {asheetname}")
-
- if isinstance(asheetname, str):
- sheet = self.get_sheet_by_name(asheetname)
- else: # assume an integer if not a string
- sheet = self.get_sheet_by_index(asheetname)
-
- file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
- data = self.get_sheet_data(sheet, file_rows_needed)
- if hasattr(sheet, "close"):
- # pyxlsb opens two TemporaryFiles
- sheet.close()
- usecols = maybe_convert_usecols(usecols)
-
- if not data:
- output[asheetname] = DataFrame()
- continue
-
- is_list_header = False
- is_len_one_list_header = False
- if is_list_like(header):
- assert isinstance(header, Sequence)
- is_list_header = True
- if len(header) == 1:
- is_len_one_list_header = True
-
- if is_len_one_list_header:
- header = cast(Sequence[int], header)[0]
-
- # forward fill and pull out names for MultiIndex column
- header_names = None
- if header is not None and is_list_like(header):
- assert isinstance(header, Sequence)
-
- header_names = []
- control_row = [True] * len(data[0])
-
- for row in header:
- if is_integer(skiprows):
- assert isinstance(skiprows, int)
- row += skiprows
-
- if row > len(data) - 1:
- raise ValueError(
- f"header index {row} exceeds maximum index "
- f"{len(data) - 1} of data.",
- )
-
- data[row], control_row = fill_mi_header(data[row], control_row)
-
- if index_col is not None:
- header_name, _ = pop_header_name(data[row], index_col)
- header_names.append(header_name)
-
- # If there is a MultiIndex header and an index then there is also
- # a row containing just the index name(s)
- has_index_names = False
- if is_list_header and not is_len_one_list_header and index_col is not None:
- index_col_list: Sequence[int]
- if isinstance(index_col, int):
- index_col_list = [index_col]
- else:
- assert isinstance(index_col, Sequence)
- index_col_list = index_col
-
- # We have to handle a MultiIndex without names. If any of the entries in the data
- # columns are not empty, this is a regular row
- assert isinstance(header, Sequence)
- if len(header) < len(data):
- potential_index_names = data[len(header)]
- potential_data = [
- x
- for i, x in enumerate(potential_index_names)
- if not control_row[i] and i not in index_col_list
- ]
- has_index_names = all(x == "" or x is None for x in potential_data)
-
- if is_list_like(index_col):
- # Forward fill values for MultiIndex index.
- if header is None:
- offset = 0
- elif isinstance(header, int):
- offset = 1 + header
- else:
- offset = 1 + max(header)
-
- # GH34673: if MultiIndex names present and not defined in the header,
- # offset needs to be incremented so that forward filling starts
- # from the first MI value instead of the name
- if has_index_names:
- offset += 1
-
- # Check if we have an empty dataset
- # before trying to collect data.
- if offset < len(data):
- assert isinstance(index_col, Sequence)
-
- for col in index_col:
- last = data[offset][col]
-
- for row in range(offset + 1, len(data)):
- if data[row][col] == "" or data[row][col] is None:
- data[row][col] = last
- else:
- last = data[row][col]
-
- # GH 12292 : error when read one empty column from excel file
- try:
- parser = TextParser(
- data,
- names=names,
- header=header,
- index_col=index_col,
- has_index_names=has_index_names,
- dtype=dtype,
- true_values=true_values,
- false_values=false_values,
- skiprows=skiprows,
- nrows=nrows,
- na_values=na_values,
- skip_blank_lines=False, # GH 39808
- parse_dates=parse_dates,
- date_parser=date_parser,
- date_format=date_format,
- thousands=thousands,
- decimal=decimal,
- comment=comment,
- skipfooter=skipfooter,
- usecols=usecols,
- dtype_backend=dtype_backend,
- **kwds,
- )
-
- output[asheetname] = parser.read(nrows=nrows)
-
- if header_names:
- output[asheetname].columns = output[asheetname].columns.set_names(
- header_names
- )
-
- except EmptyDataError:
- # No Data, return an empty DataFrame
- output[asheetname] = DataFrame()
-
- except Exception as err:
- err.args = (f"{err.args[0]} (sheet: {asheetname})", *err.args[1:])
- raise err
-
- if last_sheetname is None:
- raise ValueError("Sheet name is an empty list")
-
- if ret_dict:
- return output
- else:
- return output[last_sheetname]
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-class ExcelWriter(metaclass=abc.ABCMeta):
- """
- Class for writing DataFrame objects into excel sheets.
-
- Default is to use:
-
- * `xlsxwriter <https://pypi.org/project/XlsxWriter/>`__ for xlsx files if xlsxwriter
- is installed, otherwise `openpyxl <https://pypi.org/project/openpyxl/>`__
- * `odswriter <https://pypi.org/project/odswriter/>`__ for ods files
-
- See ``DataFrame.to_excel`` for typical usage.
-
- The writer should be used as a context manager. Otherwise, call `close()` to save
- and close any opened file handles.
-
- Parameters
- ----------
- path : str or typing.BinaryIO
- Path to xls or xlsx or ods file.
- engine : str (optional)
- Engine to use for writing. If None, defaults to
- ``io.excel.<extension>.writer``. NOTE: can only be passed as a keyword
- argument.
- date_format : str, default None
- Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
- datetime_format : str, default None
- Format string for datetime objects written into Excel files.
- (e.g. 'YYYY-MM-DD HH:MM:SS').
- mode : {{'w', 'a'}}, default 'w'
- File mode to use (write or append). Append does not work with fsspec URLs.
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- if_sheet_exists : {{'error', 'new', 'replace', 'overlay'}}, default 'error'
- How to behave when trying to write to a sheet that already
- exists (append mode only).
-
- * error: raise a ValueError.
- * new: Create a new sheet, with a name determined by the engine.
- * replace: Delete the contents of the sheet before writing to it.
- * overlay: Write contents to the existing sheet without removing the old
- contents.
-
- .. versionadded:: 1.3.0
-
- .. versionchanged:: 1.4.0
-
- Added ``overlay`` option
-
- engine_kwargs : dict, optional
- Keyword arguments to be passed into the engine. These will be passed to
- the following functions of the respective engines:
-
- * xlsxwriter: ``xlsxwriter.Workbook(file, **engine_kwargs)``
- * openpyxl (write mode): ``openpyxl.Workbook(**engine_kwargs)``
- * openpyxl (append mode): ``openpyxl.load_workbook(file, **engine_kwargs)``
- * odswriter: ``odf.opendocument.OpenDocumentSpreadsheet(**engine_kwargs)``
-
- .. versionadded:: 1.3.0
-
- Notes
- -----
- For compatibility with CSV writers, ExcelWriter serializes lists
- and dicts to strings before writing.
-
- Examples
- --------
- Default usage:
-
- >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
- >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
- ... df.to_excel(writer) # doctest: +SKIP
-
- To write to separate sheets in a single file:
-
- >>> df1 = pd.DataFrame([["AAA", "BBB"]], columns=["Spam", "Egg"]) # doctest: +SKIP
- >>> df2 = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
- >>> with pd.ExcelWriter("path_to_file.xlsx") as writer:
- ... df1.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
- ... df2.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
-
- You can set the date format or datetime format:
-
- >>> from datetime import date, datetime # doctest: +SKIP
- >>> df = pd.DataFrame(
- ... [
- ... [date(2014, 1, 31), date(1999, 9, 24)],
- ... [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)],
- ... ],
- ... index=["Date", "Datetime"],
- ... columns=["X", "Y"],
- ... ) # doctest: +SKIP
- >>> with pd.ExcelWriter(
- ... "path_to_file.xlsx",
- ... date_format="YYYY-MM-DD",
- ... datetime_format="YYYY-MM-DD HH:MM:SS"
- ... ) as writer:
- ... df.to_excel(writer) # doctest: +SKIP
-
- You can also append to an existing Excel file:
-
- >>> with pd.ExcelWriter("path_to_file.xlsx", mode="a", engine="openpyxl") as writer:
- ... df.to_excel(writer, sheet_name="Sheet3") # doctest: +SKIP
-
- Here, the `if_sheet_exists` parameter can be set to replace a sheet if it
- already exists:
-
- >>> with ExcelWriter(
- ... "path_to_file.xlsx",
- ... mode="a",
- ... engine="openpyxl",
- ... if_sheet_exists="replace",
- ... ) as writer:
- ... df.to_excel(writer, sheet_name="Sheet1") # doctest: +SKIP
-
- You can also write multiple DataFrames to a single sheet. Note that the
- ``if_sheet_exists`` parameter needs to be set to ``overlay``:
-
- >>> with ExcelWriter("path_to_file.xlsx",
- ... mode="a",
- ... engine="openpyxl",
- ... if_sheet_exists="overlay",
- ... ) as writer:
- ... df1.to_excel(writer, sheet_name="Sheet1")
- ... df2.to_excel(writer, sheet_name="Sheet1", startcol=3) # doctest: +SKIP
-
- You can store Excel file in RAM:
-
- >>> import io
- >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"])
- >>> buffer = io.BytesIO()
- >>> with pd.ExcelWriter(buffer) as writer:
- ... df.to_excel(writer)
-
- You can pack Excel file into zip archive:
-
- >>> import zipfile # doctest: +SKIP
- >>> df = pd.DataFrame([["ABC", "XYZ"]], columns=["Foo", "Bar"]) # doctest: +SKIP
- >>> with zipfile.ZipFile("path_to_file.zip", "w") as zf:
- ... with zf.open("filename.xlsx", "w") as buffer:
- ... with pd.ExcelWriter(buffer) as writer:
- ... df.to_excel(writer) # doctest: +SKIP
-
- You can specify additional arguments to the underlying engine:
-
- >>> with pd.ExcelWriter(
- ... "path_to_file.xlsx",
- ... engine="xlsxwriter",
- ... engine_kwargs={{"options": {{"nan_inf_to_errors": True}}}}
- ... ) as writer:
- ... df.to_excel(writer) # doctest: +SKIP
-
- In append mode, ``engine_kwargs`` are passed through to
- openpyxl's ``load_workbook``:
-
- >>> with pd.ExcelWriter(
- ... "path_to_file.xlsx",
- ... engine="openpyxl",
- ... mode="a",
- ... engine_kwargs={{"keep_vba": True}}
- ... ) as writer:
- ... df.to_excel(writer, sheet_name="Sheet2") # doctest: +SKIP
- """
-
- # Defining an ExcelWriter implementation (see abstract methods for more...)
-
- # - Mandatory
- # - ``write_cells(self, cells, sheet_name=None, startrow=0, startcol=0)``
- # --> called to write additional DataFrames to disk
- # - ``_supported_extensions`` (tuple of supported extensions), used to
- # check that engine supports the given extension.
- # - ``_engine`` - string that gives the engine name. Necessary to
- # instantiate the class directly and bypass the engine lookup in
- # ``ExcelWriter.__new__``.
- # - ``save(self)`` --> called to save file to disk
- # - Mostly mandatory (i.e. should at least exist)
- # - book, cur_sheet, path
-
- # - Optional:
- # - ``__init__(self, path, engine=None, **kwargs)`` --> always called
- # with path as first argument.
-
- # You also need to register the class with ``register_writer()``.
- # Technically, ExcelWriter implementations don't need to subclass
- # ExcelWriter.
-
- _engine: str
- _supported_extensions: tuple[str, ...]
-
- def __new__(
- cls: type[ExcelWriter],
- path: FilePath | WriteExcelBuffer | ExcelWriter,
- engine: str | None = None,
- date_format: str | None = None,
- datetime_format: str | None = None,
- mode: str = "w",
- storage_options: StorageOptions = None,
- if_sheet_exists: Literal["error", "new", "replace", "overlay"] | None = None,
- engine_kwargs: dict | None = None,
- ) -> ExcelWriter:
- # only switch class if generic(ExcelWriter)
- if cls is ExcelWriter:
- if engine is None or (isinstance(engine, str) and engine == "auto"):
- if isinstance(path, str):
- ext = os.path.splitext(path)[-1][1:]
- else:
- ext = "xlsx"
-
- try:
- engine = config.get_option(f"io.excel.{ext}.writer", silent=True)
- if engine == "auto":
- engine = get_default_engine(ext, mode="writer")
- except KeyError as err:
- raise ValueError(f"No engine for filetype: '{ext}'") from err
-
- # for mypy
- assert engine is not None
- cls = get_writer(engine)
-
- return object.__new__(cls)
-
- # declare external properties you can count on
- _path = None
-
- @property
- def supported_extensions(self) -> tuple[str, ...]:
- """Extensions that writer engine supports."""
- return self._supported_extensions
-
- @property
- def engine(self) -> str:
- """Name of engine."""
- return self._engine
-
- @property
- @abc.abstractmethod
- def sheets(self) -> dict[str, Any]:
- """Mapping of sheet names to sheet objects."""
-
- @property
- @abc.abstractmethod
- def book(self):
- """
- Book instance. Class type will depend on the engine used.
-
- This attribute can be used to access engine-specific features.
- """
-
- @abc.abstractmethod
- def _write_cells(
- self,
- cells,
- sheet_name: str | None = None,
- startrow: int = 0,
- startcol: int = 0,
- freeze_panes: tuple[int, int] | None = None,
- ) -> None:
- """
- Write given formatted cells into an Excel sheet
-
- Parameters
- ----------
- cells : generator
- cells of formatted data to save to the Excel sheet
- sheet_name : str, default None
- Name of Excel sheet, if None, then use self.cur_sheet
- startrow : upper left cell row to dump data frame
- startcol : upper left cell column to dump data frame
- freeze_panes : int tuple of length 2
- contains the bottom-most row and right-most column to freeze
- """
-
- @abc.abstractmethod
- def _save(self) -> None:
- """
- Save workbook to disk.
- """
-
- def __init__(
- self,
- path: FilePath | WriteExcelBuffer | ExcelWriter,
- engine: str | None = None,
- date_format: str | None = None,
- datetime_format: str | None = None,
- mode: str = "w",
- storage_options: StorageOptions = None,
- if_sheet_exists: str | None = None,
- engine_kwargs: dict[str, Any] | None = None,
- ) -> None:
- # validate that this engine can handle the extension
- if isinstance(path, str):
- ext = os.path.splitext(path)[-1]
- self.check_extension(ext)
-
- # use mode to open the file
- if "b" not in mode:
- mode += "b"
- # use "a" for the user to append data to excel but internally use "r+" to let
- # the excel backend first read the existing file and then write any data to it
- mode = mode.replace("a", "r+")
-
- if if_sheet_exists not in (None, "error", "new", "replace", "overlay"):
- raise ValueError(
- f"'{if_sheet_exists}' is not valid for if_sheet_exists. "
- "Valid options are 'error', 'new', 'replace' and 'overlay'."
- )
- if if_sheet_exists and "r+" not in mode:
- raise ValueError("if_sheet_exists is only valid in append mode (mode='a')")
- if if_sheet_exists is None:
- if_sheet_exists = "error"
- self._if_sheet_exists = if_sheet_exists
-
- # cast ExcelWriter to avoid adding 'if self._handles is not None'
- self._handles = IOHandles(
- cast(IO[bytes], path), compression={"compression": None}
- )
- if not isinstance(path, ExcelWriter):
- self._handles = get_handle(
- path, mode, storage_options=storage_options, is_text=False
- )
- self._cur_sheet = None
-
- if date_format is None:
- self._date_format = "YYYY-MM-DD"
- else:
- self._date_format = date_format
- if datetime_format is None:
- self._datetime_format = "YYYY-MM-DD HH:MM:SS"
- else:
- self._datetime_format = datetime_format
-
- self._mode = mode
-
- @property
- def date_format(self) -> str:
- """
- Format string for dates written into Excel files (e.g. 'YYYY-MM-DD').
- """
- return self._date_format
-
- @property
- def datetime_format(self) -> str:
- """
- Format string for datetime objects written into Excel files (e.g. 'YYYY-MM-DD HH:MM:SS').
- """
- return self._datetime_format
-
- @property
- def if_sheet_exists(self) -> str:
- """
- How to behave when writing to a sheet that already exists in append mode.
- """
- return self._if_sheet_exists
-
- def __fspath__(self) -> str:
- return getattr(self._handles.handle, "name", "")
-
- def _get_sheet_name(self, sheet_name: str | None) -> str:
- if sheet_name is None:
- sheet_name = self._cur_sheet
- if sheet_name is None: # pragma: no cover
- raise ValueError("Must pass explicit sheet_name or set _cur_sheet property")
- return sheet_name
-
- def _value_with_fmt(self, val) -> tuple[object, str | None]:
- """
- Convert numpy types to Python types for the Excel writers.
-
- Parameters
- ----------
- val : object
- Value to be written into cells
-
- Returns
- -------
- Tuple with the first element being the converted value and the second
- being an optional format
- """
- fmt = None
-
- if is_integer(val):
- val = int(val)
- elif is_float(val):
- val = float(val)
- elif is_bool(val):
- val = bool(val)
- elif isinstance(val, datetime.datetime):
- fmt = self._datetime_format
- elif isinstance(val, datetime.date):
- fmt = self._date_format
- elif isinstance(val, datetime.timedelta):
- val = val.total_seconds() / 86400
- fmt = "0"
- else:
- val = str(val)
-
- return val, fmt
-
- @classmethod
- def check_extension(cls, ext: str) -> Literal[True]:
- """
- Check the path's extension against the writer's supported
- extensions. If it isn't supported, raise a ValueError.
- """
- if ext.startswith("."):
- ext = ext[1:]
- if not any(ext in extension for extension in cls._supported_extensions):
- raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'")
- return True
-
- # Allow use as a contextmanager
- def __enter__(self) -> ExcelWriter:
- return self
-
- def __exit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- traceback: TracebackType | None,
- ) -> None:
- self.close()
-
- def close(self) -> None:
- """synonym for save, to make it more file-like"""
- self._save()
- self._handles.close()
-
-
-XLS_SIGNATURES = (
- b"\x09\x00\x04\x00\x07\x00\x10\x00", # BIFF2
- b"\x09\x02\x06\x00\x00\x00\x10\x00", # BIFF3
- b"\x09\x04\x06\x00\x00\x00\x10\x00", # BIFF4
- b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", # Compound File Binary
-)
-ZIP_SIGNATURE = b"PK\x03\x04"
-PEEK_SIZE = max(map(len, XLS_SIGNATURES + (ZIP_SIGNATURE,)))
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-def inspect_excel_format(
- content_or_path: FilePath | ReadBuffer[bytes],
- storage_options: StorageOptions = None,
-) -> str | None:
- """
- Inspect the path or content of an Excel file and get its format.
-
- Adopted from xlrd: https://github.com/python-excel/xlrd.
-
- Parameters
- ----------
- content_or_path : str or file-like object
- Path to file or content of file to inspect. May be a URL.
- {storage_options}
-
- Returns
- -------
- str or None
- Format of file if it can be determined.
-
- Raises
- ------
- ValueError
- If resulting stream is empty.
- BadZipFile
- If resulting stream does not have an XLS signature and is not a valid zipfile.
- """
- if isinstance(content_or_path, bytes):
- content_or_path = BytesIO(content_or_path)
-
- with get_handle(
- content_or_path, "rb", storage_options=storage_options, is_text=False
- ) as handle:
- stream = handle.handle
- stream.seek(0)
- buf = stream.read(PEEK_SIZE)
- if buf is None:
- raise ValueError("stream is empty")
- assert isinstance(buf, bytes)
- peek = buf
- stream.seek(0)
-
- if any(peek.startswith(sig) for sig in XLS_SIGNATURES):
- return "xls"
- elif not peek.startswith(ZIP_SIGNATURE):
- return None
-
- with zipfile.ZipFile(stream) as zf:
- # Workaround for some third party files that use forward slashes and
- # lower case names.
- component_names = [
- name.replace("\\", "/").lower() for name in zf.namelist()
- ]
-
- if "xl/workbook.xml" in component_names:
- return "xlsx"
- if "xl/workbook.bin" in component_names:
- return "xlsb"
- if "content.xml" in component_names:
- return "ods"
- return "zip"
-
-
-class ExcelFile:
- """
- Class for parsing tabular Excel sheets into DataFrame objects.
-
- See read_excel for more documentation.
-
- Parameters
- ----------
- path_or_buffer : str, bytes, path object (pathlib.Path or py._path.local.LocalPath),
- a file-like object, xlrd workbook or openpyxl workbook.
- If a string or path object, expected to be a path to a
- .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
- engine : str, default None
- If io is not a buffer or path, this must be set to identify io.
- Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``
- Engine compatibility :
-
- - ``xlrd`` supports old-style Excel files (.xls).
- - ``openpyxl`` supports newer Excel file formats.
- - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
- - ``pyxlsb`` supports Binary Excel files.
-
- .. versionchanged:: 1.2.0
-
- The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
- now only supports old-style ``.xls`` files.
- When ``engine=None``, the following logic will be
- used to determine the engine:
-
- - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
- then `odf <https://pypi.org/project/odfpy/>`_ will be used.
- - Otherwise if ``path_or_buffer`` is an xls format,
- ``xlrd`` will be used.
- - Otherwise if ``path_or_buffer`` is in xlsb format,
- `pyxlsb <https://pypi.org/project/pyxlsb/>`_ will be used.
-
- .. versionadded:: 1.3.0
-
- - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
- then ``openpyxl`` will be used.
- - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
-
- .. warning::
-
- Please do not report issues when using ``xlrd`` to read ``.xlsx`` files.
- This is not supported, switch to using ``openpyxl`` instead.
- """
-
- from pandas.io.excel._odfreader import ODFReader
- from pandas.io.excel._openpyxl import OpenpyxlReader
- from pandas.io.excel._pyxlsb import PyxlsbReader
- from pandas.io.excel._xlrd import XlrdReader
-
- _engines: Mapping[str, Any] = {
- "xlrd": XlrdReader,
- "openpyxl": OpenpyxlReader,
- "odf": ODFReader,
- "pyxlsb": PyxlsbReader,
- }
-
- def __init__(
- self,
- path_or_buffer,
- engine: str | None = None,
- storage_options: StorageOptions = None,
- ) -> None:
- if engine is not None and engine not in self._engines:
- raise ValueError(f"Unknown engine: {engine}")
-
- # First argument can also be bytes, so create a buffer
- if isinstance(path_or_buffer, bytes):
- path_or_buffer = BytesIO(path_or_buffer)
-
- # Could be a str, ExcelFile, Book, etc.
- self.io = path_or_buffer
- # Always a string
- self._io = stringify_path(path_or_buffer)
-
- # Determine xlrd version if installed
- if import_optional_dependency("xlrd", errors="ignore") is None:
- xlrd_version = None
- else:
- import xlrd
-
- xlrd_version = Version(get_version(xlrd))
-
- if engine is None:
- # Only determine ext if it is needed
- ext: str | None
- if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
- ext = "xls"
- else:
- ext = inspect_excel_format(
- content_or_path=path_or_buffer, storage_options=storage_options
- )
- if ext is None:
- raise ValueError(
- "Excel file format cannot be determined, you must specify "
- "an engine manually."
- )
-
- engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
- if engine == "auto":
- engine = get_default_engine(ext, mode="reader")
-
- assert engine is not None
- self.engine = engine
- self.storage_options = storage_options
-
- self._reader = self._engines[engine](self._io, storage_options=storage_options)
-
- def __fspath__(self):
- return self._io
-
- def parse(
- self,
- sheet_name: str | int | list[int] | list[str] | None = 0,
- header: int | Sequence[int] | None = 0,
- names=None,
- index_col: int | Sequence[int] | None = None,
- usecols=None,
- converters=None,
- true_values: Iterable[Hashable] | None = None,
- false_values: Iterable[Hashable] | None = None,
- skiprows: Sequence[int] | int | Callable[[int], object] | None = None,
- nrows: int | None = None,
- na_values=None,
- parse_dates: list | dict | bool = False,
- date_parser: Callable | lib.NoDefault = lib.no_default,
- date_format: str | dict[Hashable, str] | None = None,
- thousands: str | None = None,
- comment: str | None = None,
- skipfooter: int = 0,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwds,
- ) -> DataFrame | dict[str, DataFrame] | dict[int, DataFrame]:
- """
- Parse specified sheet(s) into a DataFrame.
-
-        Equivalent to read_excel(ExcelFile, ...). See the read_excel
- docstring for more info on accepted parameters.
-
- Returns
- -------
- DataFrame or dict of DataFrames
- DataFrame from the passed in Excel file.
- """
- return self._reader.parse(
- sheet_name=sheet_name,
- header=header,
- names=names,
- index_col=index_col,
- usecols=usecols,
- converters=converters,
- true_values=true_values,
- false_values=false_values,
- skiprows=skiprows,
- nrows=nrows,
- na_values=na_values,
- parse_dates=parse_dates,
- date_parser=date_parser,
- date_format=date_format,
- thousands=thousands,
- comment=comment,
- skipfooter=skipfooter,
- dtype_backend=dtype_backend,
- **kwds,
- )
-
- @property
- def book(self):
- return self._reader.book
-
- @property
- def sheet_names(self):
- return self._reader.sheet_names
-
- def close(self) -> None:
- """close io if necessary"""
- self._reader.close()
-
- def __enter__(self) -> ExcelFile:
- return self
-
- def __exit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- traceback: TracebackType | None,
- ) -> None:
- self.close()
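
For reference, a minimal usage sketch of the ExcelFile API deleted above; the file name is a placeholder and openpyxl is assumed to be installed. With engine=None the format sniffing in inspect_excel_format decides the reader.

    import pandas as pd

    # Explicit engine; the context manager calls close() on exit.
    with pd.ExcelFile("report.xlsx", engine="openpyxl") as xls:
        print(xls.sheet_names)        # sheet names exposed by the reader
        df = xls.parse(sheet_name=0)  # first sheet as a DataFrame
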
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_odfreader.py b/contrib/python/pandas/py3/pandas/io/excel/_odfreader.py
deleted file mode 100644
index c3d7cb5df71..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_odfreader.py
+++ /dev/null
@@ -1,249 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- cast,
-)
-
-import numpy as np
-
-from pandas._typing import (
- FilePath,
- ReadBuffer,
- Scalar,
- StorageOptions,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._decorators import doc
-
-import pandas as pd
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.excel._base import BaseExcelReader
-
-if TYPE_CHECKING:
- from pandas._libs.tslibs.nattype import NaTType
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-class ODFReader(BaseExcelReader):
- def __init__(
- self,
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- storage_options: StorageOptions = None,
- ) -> None:
- """
- Read tables out of OpenDocument formatted files.
-
- Parameters
- ----------
- filepath_or_buffer : str, path to be parsed or
- an open readable stream.
- {storage_options}
- """
- import_optional_dependency("odf")
- super().__init__(filepath_or_buffer, storage_options=storage_options)
-
- @property
- def _workbook_class(self):
- from odf.opendocument import OpenDocument
-
- return OpenDocument
-
- def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
- from odf.opendocument import load
-
- return load(filepath_or_buffer)
-
- @property
- def empty_value(self) -> str:
- """Property for compat with other readers."""
- return ""
-
- @property
- def sheet_names(self) -> list[str]:
- """Return a list of sheet names present in the document"""
- from odf.table import Table
-
- tables = self.book.getElementsByType(Table)
- return [t.getAttribute("name") for t in tables]
-
- def get_sheet_by_index(self, index: int):
- from odf.table import Table
-
- self.raise_if_bad_sheet_by_index(index)
- tables = self.book.getElementsByType(Table)
- return tables[index]
-
- def get_sheet_by_name(self, name: str):
- from odf.table import Table
-
- self.raise_if_bad_sheet_by_name(name)
- tables = self.book.getElementsByType(Table)
-
- for table in tables:
- if table.getAttribute("name") == name:
- return table
-
- self.close()
- raise ValueError(f"sheet {name} not found")
-
- def get_sheet_data(
- self, sheet, file_rows_needed: int | None = None
- ) -> list[list[Scalar | NaTType]]:
- """
- Parse an ODF Table into a list of lists
- """
- from odf.table import (
- CoveredTableCell,
- TableCell,
- TableRow,
- )
-
- covered_cell_name = CoveredTableCell().qname
- table_cell_name = TableCell().qname
- cell_names = {covered_cell_name, table_cell_name}
-
- sheet_rows = sheet.getElementsByType(TableRow)
- empty_rows = 0
- max_row_len = 0
-
- table: list[list[Scalar | NaTType]] = []
-
- for sheet_row in sheet_rows:
- sheet_cells = [
- x
- for x in sheet_row.childNodes
- if hasattr(x, "qname") and x.qname in cell_names
- ]
- empty_cells = 0
- table_row: list[Scalar | NaTType] = []
-
- for sheet_cell in sheet_cells:
- if sheet_cell.qname == table_cell_name:
- value = self._get_cell_value(sheet_cell)
- else:
- value = self.empty_value
-
- column_repeat = self._get_column_repeat(sheet_cell)
-
- # Queue up empty values, writing only if content succeeds them
- if value == self.empty_value:
- empty_cells += column_repeat
- else:
- table_row.extend([self.empty_value] * empty_cells)
- empty_cells = 0
- table_row.extend([value] * column_repeat)
-
- if max_row_len < len(table_row):
- max_row_len = len(table_row)
-
- row_repeat = self._get_row_repeat(sheet_row)
- if self._is_empty_row(sheet_row):
- empty_rows += row_repeat
- else:
- # add blank rows to our table
- table.extend([[self.empty_value]] * empty_rows)
- empty_rows = 0
- for _ in range(row_repeat):
- table.append(table_row)
- if file_rows_needed is not None and len(table) >= file_rows_needed:
- break
-
- # Make our table square
- for row in table:
- if len(row) < max_row_len:
- row.extend([self.empty_value] * (max_row_len - len(row)))
-
- return table
-
- def _get_row_repeat(self, row) -> int:
- """
- Return number of times this row was repeated
- Repeating an empty row appeared to be a common way
- of representing sparse rows in the table.
- """
- from odf.namespaces import TABLENS
-
- return int(row.attributes.get((TABLENS, "number-rows-repeated"), 1))
-
- def _get_column_repeat(self, cell) -> int:
- from odf.namespaces import TABLENS
-
- return int(cell.attributes.get((TABLENS, "number-columns-repeated"), 1))
-
- def _is_empty_row(self, row) -> bool:
- """
- Helper function to find empty rows
- """
- for column in row.childNodes:
- if len(column.childNodes) > 0:
- return False
-
- return True
-
- def _get_cell_value(self, cell) -> Scalar | NaTType:
- from odf.namespaces import OFFICENS
-
- if str(cell) == "#N/A":
- return np.nan
-
- cell_type = cell.attributes.get((OFFICENS, "value-type"))
- if cell_type == "boolean":
- if str(cell) == "TRUE":
- return True
- return False
- if cell_type is None:
- return self.empty_value
- elif cell_type == "float":
- # GH5394
- cell_value = float(cell.attributes.get((OFFICENS, "value")))
- val = int(cell_value)
- if val == cell_value:
- return val
- return cell_value
- elif cell_type == "percentage":
- cell_value = cell.attributes.get((OFFICENS, "value"))
- return float(cell_value)
- elif cell_type == "string":
- return self._get_cell_string_value(cell)
- elif cell_type == "currency":
- cell_value = cell.attributes.get((OFFICENS, "value"))
- return float(cell_value)
- elif cell_type == "date":
- cell_value = cell.attributes.get((OFFICENS, "date-value"))
- return pd.Timestamp(cell_value)
- elif cell_type == "time":
- stamp = pd.Timestamp(str(cell))
- # cast needed here because Scalar doesn't include datetime.time
- return cast(Scalar, stamp.time())
- else:
- self.close()
- raise ValueError(f"Unrecognized type {cell_type}")
-
- def _get_cell_string_value(self, cell) -> str:
- """
- Find and decode OpenDocument text:s tags that represent
- a run length encoded sequence of space characters.
- """
- from odf.element import Element
- from odf.namespaces import TEXTNS
- from odf.text import S
-
- text_s = S().qname
-
- value = []
-
- for fragment in cell.childNodes:
- if isinstance(fragment, Element):
- if fragment.qname == text_s:
- spaces = int(fragment.attributes.get((TEXTNS, "c"), 1))
- value.append(" " * spaces)
- else:
- # recursive impl needed in case of nested fragments
- # with multiple spaces
- # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704
- value.append(self._get_cell_string_value(fragment))
- else:
- value.append(str(fragment).strip("\n"))
- return "".join(value)
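
A read-side sketch for the ODFReader deleted above (placeholder path; the optional odfpy dependency is assumed). Booleans, floats, dates, times and percentages are converted as in _get_cell_value.

    import pandas as pd

    # engine="odf" routes read_excel through ODFReader.
    df = pd.read_excel("data.ods", engine="odf")
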
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_odswriter.py b/contrib/python/pandas/py3/pandas/io/excel/_odswriter.py
deleted file mode 100644
index 6f1d62111e5..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_odswriter.py
+++ /dev/null
@@ -1,337 +0,0 @@
-from __future__ import annotations
-
-from collections import defaultdict
-import datetime
-from typing import (
- TYPE_CHECKING,
- Any,
- DefaultDict,
- Tuple,
- cast,
-)
-
-from pandas._libs import json
-from pandas._typing import (
- FilePath,
- StorageOptions,
- WriteExcelBuffer,
-)
-
-from pandas.io.excel._base import ExcelWriter
-from pandas.io.excel._util import (
- combine_kwargs,
- validate_freeze_panes,
-)
-
-if TYPE_CHECKING:
- from pandas.io.formats.excel import ExcelCell
-
-
-class ODSWriter(ExcelWriter):
- _engine = "odf"
- _supported_extensions = (".ods",)
-
- def __init__(
- self,
- path: FilePath | WriteExcelBuffer | ExcelWriter,
- engine: str | None = None,
- date_format: str | None = None,
- datetime_format=None,
- mode: str = "w",
- storage_options: StorageOptions = None,
- if_sheet_exists: str | None = None,
- engine_kwargs: dict[str, Any] | None = None,
- **kwargs,
- ) -> None:
- from odf.opendocument import OpenDocumentSpreadsheet
-
- if mode == "a":
- raise ValueError("Append mode is not supported with odf!")
-
- engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
- self._book = OpenDocumentSpreadsheet(**engine_kwargs)
-
- super().__init__(
- path,
- mode=mode,
- storage_options=storage_options,
- if_sheet_exists=if_sheet_exists,
- engine_kwargs=engine_kwargs,
- )
-
- self._style_dict: dict[str, str] = {}
-
- @property
- def book(self):
- """
- Book instance of class odf.opendocument.OpenDocumentSpreadsheet.
-
- This attribute can be used to access engine-specific features.
- """
- return self._book
-
- @property
- def sheets(self) -> dict[str, Any]:
- """Mapping of sheet names to sheet objects."""
- from odf.table import Table
-
- result = {
- sheet.getAttribute("name"): sheet
- for sheet in self.book.getElementsByType(Table)
- }
- return result
-
- def _save(self) -> None:
- """
- Save workbook to disk.
- """
- for sheet in self.sheets.values():
- self.book.spreadsheet.addElement(sheet)
- self.book.save(self._handles.handle)
-
- def _write_cells(
- self,
- cells: list[ExcelCell],
- sheet_name: str | None = None,
- startrow: int = 0,
- startcol: int = 0,
- freeze_panes: tuple[int, int] | None = None,
- ) -> None:
- """
- Write the frame cells using odf
- """
- from odf.table import (
- Table,
- TableCell,
- TableRow,
- )
- from odf.text import P
-
- sheet_name = self._get_sheet_name(sheet_name)
- assert sheet_name is not None
-
- if sheet_name in self.sheets:
- wks = self.sheets[sheet_name]
- else:
- wks = Table(name=sheet_name)
- self.book.spreadsheet.addElement(wks)
-
- if validate_freeze_panes(freeze_panes):
- freeze_panes = cast(Tuple[int, int], freeze_panes)
- self._create_freeze_panes(sheet_name, freeze_panes)
-
- for _ in range(startrow):
- wks.addElement(TableRow())
-
- rows: DefaultDict = defaultdict(TableRow)
- col_count: DefaultDict = defaultdict(int)
-
- for cell in sorted(cells, key=lambda cell: (cell.row, cell.col)):
- # only add empty cells if the row is still empty
- if not col_count[cell.row]:
- for _ in range(startcol):
- rows[cell.row].addElement(TableCell())
-
- # fill with empty cells if needed
- for _ in range(cell.col - col_count[cell.row]):
- rows[cell.row].addElement(TableCell())
- col_count[cell.row] += 1
-
- pvalue, tc = self._make_table_cell(cell)
- rows[cell.row].addElement(tc)
- col_count[cell.row] += 1
- p = P(text=pvalue)
- tc.addElement(p)
-
- # add all rows to the sheet
- if len(rows) > 0:
- for row_nr in range(max(rows.keys()) + 1):
- wks.addElement(rows[row_nr])
-
- def _make_table_cell_attributes(self, cell) -> dict[str, int | str]:
- """Convert cell attributes to OpenDocument attributes
-
- Parameters
- ----------
- cell : ExcelCell
- Spreadsheet cell data
-
- Returns
- -------
- attributes : Dict[str, Union[int, str]]
- Dictionary with attributes and attribute values
- """
- attributes: dict[str, int | str] = {}
- style_name = self._process_style(cell.style)
- if style_name is not None:
- attributes["stylename"] = style_name
- if cell.mergestart is not None and cell.mergeend is not None:
- attributes["numberrowsspanned"] = max(1, cell.mergestart)
- attributes["numbercolumnsspanned"] = cell.mergeend
- return attributes
-
- def _make_table_cell(self, cell) -> tuple[object, Any]:
- """Convert cell data to an OpenDocument spreadsheet cell
-
- Parameters
- ----------
- cell : ExcelCell
- Spreadsheet cell data
-
- Returns
- -------
- pvalue, cell : Tuple[str, TableCell]
- Display value, Cell value
- """
- from odf.table import TableCell
-
- attributes = self._make_table_cell_attributes(cell)
- val, fmt = self._value_with_fmt(cell.val)
- pvalue = value = val
- if isinstance(val, bool):
- value = str(val).lower()
- pvalue = str(val).upper()
- if isinstance(val, datetime.datetime):
- # Fast formatting
- value = val.isoformat()
- # Slow but locale-dependent
- pvalue = val.strftime("%c")
- return (
- pvalue,
- TableCell(valuetype="date", datevalue=value, attributes=attributes),
- )
- elif isinstance(val, datetime.date):
- # Fast formatting
- value = f"{val.year}-{val.month:02d}-{val.day:02d}"
- # Slow but locale-dependent
- pvalue = val.strftime("%x")
- return (
- pvalue,
- TableCell(valuetype="date", datevalue=value, attributes=attributes),
- )
- else:
- class_to_cell_type = {
- str: "string",
- int: "float",
- float: "float",
- bool: "boolean",
- }
- return (
- pvalue,
- TableCell(
- valuetype=class_to_cell_type[type(val)],
- value=value,
- attributes=attributes,
- ),
- )
-
- def _process_style(self, style: dict[str, Any]) -> str:
- """Convert a style dictionary to a OpenDocument style sheet
-
- Parameters
- ----------
- style : Dict
- Style dictionary
-
- Returns
- -------
- style_key : str
- Unique style key for later reference in sheet
- """
- from odf.style import (
- ParagraphProperties,
- Style,
- TableCellProperties,
- TextProperties,
- )
-
- if style is None:
- return None
- style_key = json.dumps(style)
- if style_key in self._style_dict:
- return self._style_dict[style_key]
- name = f"pd{len(self._style_dict)+1}"
- self._style_dict[style_key] = name
- odf_style = Style(name=name, family="table-cell")
- if "font" in style:
- font = style["font"]
- if font.get("bold", False):
- odf_style.addElement(TextProperties(fontweight="bold"))
- if "borders" in style:
- borders = style["borders"]
- for side, thickness in borders.items():
- thickness_translation = {"thin": "0.75pt solid #000000"}
- odf_style.addElement(
- TableCellProperties(
- attributes={f"border{side}": thickness_translation[thickness]}
- )
- )
- if "alignment" in style:
- alignment = style["alignment"]
- horizontal = alignment.get("horizontal")
- if horizontal:
- odf_style.addElement(ParagraphProperties(textalign=horizontal))
- vertical = alignment.get("vertical")
- if vertical:
- odf_style.addElement(TableCellProperties(verticalalign=vertical))
- self.book.styles.addElement(odf_style)
- return name
-
- def _create_freeze_panes(
- self, sheet_name: str, freeze_panes: tuple[int, int]
- ) -> None:
- """
- Create freeze panes in the sheet.
-
- Parameters
- ----------
- sheet_name : str
- Name of the spreadsheet
- freeze_panes : tuple of (int, int)
- Freeze pane location x and y
- """
- from odf.config import (
- ConfigItem,
- ConfigItemMapEntry,
- ConfigItemMapIndexed,
- ConfigItemMapNamed,
- ConfigItemSet,
- )
-
- config_item_set = ConfigItemSet(name="ooo:view-settings")
- self.book.settings.addElement(config_item_set)
-
- config_item_map_indexed = ConfigItemMapIndexed(name="Views")
- config_item_set.addElement(config_item_map_indexed)
-
- config_item_map_entry = ConfigItemMapEntry()
- config_item_map_indexed.addElement(config_item_map_entry)
-
- config_item_map_named = ConfigItemMapNamed(name="Tables")
- config_item_map_entry.addElement(config_item_map_named)
-
- config_item_map_entry = ConfigItemMapEntry(name=sheet_name)
- config_item_map_named.addElement(config_item_map_entry)
-
- config_item_map_entry.addElement(
- ConfigItem(name="HorizontalSplitMode", type="short", text="2")
- )
- config_item_map_entry.addElement(
- ConfigItem(name="VerticalSplitMode", type="short", text="2")
- )
- config_item_map_entry.addElement(
- ConfigItem(
- name="HorizontalSplitPosition", type="int", text=str(freeze_panes[0])
- )
- )
- config_item_map_entry.addElement(
- ConfigItem(
- name="VerticalSplitPosition", type="int", text=str(freeze_panes[1])
- )
- )
- config_item_map_entry.addElement(
- ConfigItem(name="PositionRight", type="int", text=str(freeze_panes[0]))
- )
- config_item_map_entry.addElement(
- ConfigItem(name="PositionBottom", type="int", text=str(freeze_panes[1]))
- )
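
A write-side sketch for the ODS writer deleted above; the output name is a placeholder. ODSWriter rejects append mode, so the target file is always rewritten.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    # mode="a" would raise ValueError("Append mode is not supported with odf!")
    df.to_excel("out.ods", engine="odf", index=False)
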
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_openpyxl.py b/contrib/python/pandas/py3/pandas/io/excel/_openpyxl.py
deleted file mode 100644
index 594813fe0c1..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_openpyxl.py
+++ /dev/null
@@ -1,626 +0,0 @@
-from __future__ import annotations
-
-import mmap
-from typing import (
- TYPE_CHECKING,
- Any,
- Tuple,
- cast,
-)
-
-import numpy as np
-
-from pandas._typing import (
- FilePath,
- ReadBuffer,
- Scalar,
- StorageOptions,
- WriteExcelBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._decorators import doc
-
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.excel._base import (
- BaseExcelReader,
- ExcelWriter,
-)
-from pandas.io.excel._util import (
- combine_kwargs,
- validate_freeze_panes,
-)
-
-if TYPE_CHECKING:
- from openpyxl.descriptors.serialisable import Serialisable
- from openpyxl.workbook import Workbook
-
-
-class OpenpyxlWriter(ExcelWriter):
- _engine = "openpyxl"
- _supported_extensions = (".xlsx", ".xlsm")
-
- def __init__(
- self,
- path: FilePath | WriteExcelBuffer | ExcelWriter,
- engine: str | None = None,
- date_format: str | None = None,
- datetime_format: str | None = None,
- mode: str = "w",
- storage_options: StorageOptions = None,
- if_sheet_exists: str | None = None,
- engine_kwargs: dict[str, Any] | None = None,
- **kwargs,
- ) -> None:
- # Use the openpyxl module as the Excel writer.
- from openpyxl.workbook import Workbook
-
- engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
-
- super().__init__(
- path,
- mode=mode,
- storage_options=storage_options,
- if_sheet_exists=if_sheet_exists,
- engine_kwargs=engine_kwargs,
- )
-
- # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from
- # the file and later write to it
- if "r+" in self._mode: # Load from existing workbook
- from openpyxl import load_workbook
-
- try:
- self._book = load_workbook(self._handles.handle, **engine_kwargs)
- except TypeError:
- self._handles.handle.close()
- raise
- self._handles.handle.seek(0)
- else:
- # Create workbook object with default optimized_write=True.
- try:
- self._book = Workbook(**engine_kwargs)
- except TypeError:
- self._handles.handle.close()
- raise
-
- if self.book.worksheets:
- self.book.remove(self.book.worksheets[0])
-
- @property
- def book(self) -> Workbook:
- """
- Book instance of class openpyxl.workbook.Workbook.
-
- This attribute can be used to access engine-specific features.
- """
- return self._book
-
- @property
- def sheets(self) -> dict[str, Any]:
- """Mapping of sheet names to sheet objects."""
- result = {name: self.book[name] for name in self.book.sheetnames}
- return result
-
- def _save(self) -> None:
- """
- Save workbook to disk.
- """
- self.book.save(self._handles.handle)
- if "r+" in self._mode and not isinstance(self._handles.handle, mmap.mmap):
- # truncate file to the written content
- self._handles.handle.truncate()
-
- @classmethod
- def _convert_to_style_kwargs(cls, style_dict: dict) -> dict[str, Serialisable]:
- """
- Convert a style_dict to a set of kwargs suitable for initializing
- or updating-on-copy an openpyxl v2 style object.
-
- Parameters
- ----------
- style_dict : dict
- A dict with zero or more of the following keys (or their synonyms).
- 'font'
- 'fill'
- 'border' ('borders')
- 'alignment'
- 'number_format'
- 'protection'
-
- Returns
- -------
- style_kwargs : dict
- A dict with the same, normalized keys as ``style_dict`` but each
- value has been replaced with a native openpyxl style object of the
- appropriate class.
- """
- _style_key_map = {"borders": "border"}
-
- style_kwargs: dict[str, Serialisable] = {}
- for k, v in style_dict.items():
- k = _style_key_map.get(k, k)
- _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None)
- new_v = _conv_to_x(v)
- if new_v:
- style_kwargs[k] = new_v
-
- return style_kwargs
-
- @classmethod
- def _convert_to_color(cls, color_spec):
- """
- Convert ``color_spec`` to an openpyxl v2 Color object.
-
- Parameters
- ----------
- color_spec : str, dict
- A 32-bit ARGB hex string, or a dict with zero or more of the
- following keys.
- 'rgb'
- 'indexed'
- 'auto'
- 'theme'
- 'tint'
- 'index'
- 'type'
-
- Returns
- -------
- color : openpyxl.styles.Color
- """
- from openpyxl.styles import Color
-
- if isinstance(color_spec, str):
- return Color(color_spec)
- else:
- return Color(**color_spec)
-
- @classmethod
- def _convert_to_font(cls, font_dict):
- """
- Convert ``font_dict`` to an openpyxl v2 Font object.
-
- Parameters
- ----------
- font_dict : dict
- A dict with zero or more of the following keys (or their synonyms).
- 'name'
- 'size' ('sz')
- 'bold' ('b')
- 'italic' ('i')
- 'underline' ('u')
- 'strikethrough' ('strike')
- 'color'
- 'vertAlign' ('vertalign')
- 'charset'
- 'scheme'
- 'family'
- 'outline'
- 'shadow'
- 'condense'
-
- Returns
- -------
- font : openpyxl.styles.Font
- """
- from openpyxl.styles import Font
-
- _font_key_map = {
- "sz": "size",
- "b": "bold",
- "i": "italic",
- "u": "underline",
- "strike": "strikethrough",
- "vertalign": "vertAlign",
- }
-
- font_kwargs = {}
- for k, v in font_dict.items():
- k = _font_key_map.get(k, k)
- if k == "color":
- v = cls._convert_to_color(v)
- font_kwargs[k] = v
-
- return Font(**font_kwargs)
-
- @classmethod
- def _convert_to_stop(cls, stop_seq):
- """
- Convert ``stop_seq`` to a list of openpyxl v2 Color objects,
- suitable for initializing the ``GradientFill`` ``stop`` parameter.
-
- Parameters
- ----------
- stop_seq : iterable
- An iterable that yields objects suitable for consumption by
- ``_convert_to_color``.
-
- Returns
- -------
- stop : list of openpyxl.styles.Color
- """
- return map(cls._convert_to_color, stop_seq)
-
- @classmethod
- def _convert_to_fill(cls, fill_dict: dict[str, Any]):
- """
- Convert ``fill_dict`` to an openpyxl v2 Fill object.
-
- Parameters
- ----------
- fill_dict : dict
- A dict with one or more of the following keys (or their synonyms),
- 'fill_type' ('patternType', 'patterntype')
- 'start_color' ('fgColor', 'fgcolor')
- 'end_color' ('bgColor', 'bgcolor')
- or one or more of the following keys (or their synonyms).
- 'type' ('fill_type')
- 'degree'
- 'left'
- 'right'
- 'top'
- 'bottom'
- 'stop'
-
- Returns
- -------
- fill : openpyxl.styles.Fill
- """
- from openpyxl.styles import (
- GradientFill,
- PatternFill,
- )
-
- _pattern_fill_key_map = {
- "patternType": "fill_type",
- "patterntype": "fill_type",
- "fgColor": "start_color",
- "fgcolor": "start_color",
- "bgColor": "end_color",
- "bgcolor": "end_color",
- }
-
- _gradient_fill_key_map = {"fill_type": "type"}
-
- pfill_kwargs = {}
- gfill_kwargs = {}
- for k, v in fill_dict.items():
- pk = _pattern_fill_key_map.get(k)
- gk = _gradient_fill_key_map.get(k)
- if pk in ["start_color", "end_color"]:
- v = cls._convert_to_color(v)
- if gk == "stop":
- v = cls._convert_to_stop(v)
- if pk:
- pfill_kwargs[pk] = v
- elif gk:
- gfill_kwargs[gk] = v
- else:
- pfill_kwargs[k] = v
- gfill_kwargs[k] = v
-
- try:
- return PatternFill(**pfill_kwargs)
- except TypeError:
- return GradientFill(**gfill_kwargs)
-
- @classmethod
- def _convert_to_side(cls, side_spec):
- """
- Convert ``side_spec`` to an openpyxl v2 Side object.
-
- Parameters
- ----------
- side_spec : str, dict
- A string specifying the border style, or a dict with zero or more
- of the following keys (or their synonyms).
- 'style' ('border_style')
- 'color'
-
- Returns
- -------
- side : openpyxl.styles.Side
- """
- from openpyxl.styles import Side
-
- _side_key_map = {"border_style": "style"}
-
- if isinstance(side_spec, str):
- return Side(style=side_spec)
-
- side_kwargs = {}
- for k, v in side_spec.items():
- k = _side_key_map.get(k, k)
- if k == "color":
- v = cls._convert_to_color(v)
- side_kwargs[k] = v
-
- return Side(**side_kwargs)
-
- @classmethod
- def _convert_to_border(cls, border_dict):
- """
- Convert ``border_dict`` to an openpyxl v2 Border object.
-
- Parameters
- ----------
- border_dict : dict
- A dict with zero or more of the following keys (or their synonyms).
- 'left'
- 'right'
- 'top'
- 'bottom'
- 'diagonal'
- 'diagonal_direction'
- 'vertical'
- 'horizontal'
- 'diagonalUp' ('diagonalup')
- 'diagonalDown' ('diagonaldown')
- 'outline'
-
- Returns
- -------
- border : openpyxl.styles.Border
- """
- from openpyxl.styles import Border
-
- _border_key_map = {"diagonalup": "diagonalUp", "diagonaldown": "diagonalDown"}
-
- border_kwargs = {}
- for k, v in border_dict.items():
- k = _border_key_map.get(k, k)
- if k == "color":
- v = cls._convert_to_color(v)
- if k in ["left", "right", "top", "bottom", "diagonal"]:
- v = cls._convert_to_side(v)
- border_kwargs[k] = v
-
- return Border(**border_kwargs)
-
- @classmethod
- def _convert_to_alignment(cls, alignment_dict):
- """
- Convert ``alignment_dict`` to an openpyxl v2 Alignment object.
-
- Parameters
- ----------
- alignment_dict : dict
- A dict with zero or more of the following keys (or their synonyms).
- 'horizontal'
- 'vertical'
- 'text_rotation'
- 'wrap_text'
- 'shrink_to_fit'
- 'indent'
- Returns
- -------
- alignment : openpyxl.styles.Alignment
- """
- from openpyxl.styles import Alignment
-
- return Alignment(**alignment_dict)
-
- @classmethod
- def _convert_to_number_format(cls, number_format_dict):
- """
- Convert ``number_format_dict`` to an openpyxl v2.1.0 number format
- initializer.
-
- Parameters
- ----------
- number_format_dict : dict
- A dict with zero or more of the following keys.
- 'format_code' : str
-
- Returns
- -------
- number_format : str
- """
- return number_format_dict["format_code"]
-
- @classmethod
- def _convert_to_protection(cls, protection_dict):
- """
- Convert ``protection_dict`` to an openpyxl v2 Protection object.
-
- Parameters
- ----------
- protection_dict : dict
- A dict with zero or more of the following keys.
- 'locked'
- 'hidden'
-
- Returns
- -------
- """
- from openpyxl.styles import Protection
-
- return Protection(**protection_dict)
-
- def _write_cells(
- self,
- cells,
- sheet_name: str | None = None,
- startrow: int = 0,
- startcol: int = 0,
- freeze_panes: tuple[int, int] | None = None,
- ) -> None:
- # Write the frame cells using openpyxl.
- sheet_name = self._get_sheet_name(sheet_name)
-
- _style_cache: dict[str, dict[str, Serialisable]] = {}
-
- if sheet_name in self.sheets and self._if_sheet_exists != "new":
- if "r+" in self._mode:
- if self._if_sheet_exists == "replace":
- old_wks = self.sheets[sheet_name]
- target_index = self.book.index(old_wks)
- del self.book[sheet_name]
- wks = self.book.create_sheet(sheet_name, target_index)
- elif self._if_sheet_exists == "error":
- raise ValueError(
- f"Sheet '{sheet_name}' already exists and "
- f"if_sheet_exists is set to 'error'."
- )
- elif self._if_sheet_exists == "overlay":
- wks = self.sheets[sheet_name]
- else:
- raise ValueError(
- f"'{self._if_sheet_exists}' is not valid for if_sheet_exists. "
- "Valid options are 'error', 'new', 'replace' and 'overlay'."
- )
- else:
- wks = self.sheets[sheet_name]
- else:
- wks = self.book.create_sheet()
- wks.title = sheet_name
-
- if validate_freeze_panes(freeze_panes):
- freeze_panes = cast(Tuple[int, int], freeze_panes)
- wks.freeze_panes = wks.cell(
- row=freeze_panes[0] + 1, column=freeze_panes[1] + 1
- )
-
- for cell in cells:
- xcell = wks.cell(
- row=startrow + cell.row + 1, column=startcol + cell.col + 1
- )
- xcell.value, fmt = self._value_with_fmt(cell.val)
- if fmt:
- xcell.number_format = fmt
-
- style_kwargs: dict[str, Serialisable] | None = {}
- if cell.style:
- key = str(cell.style)
- style_kwargs = _style_cache.get(key)
- if style_kwargs is None:
- style_kwargs = self._convert_to_style_kwargs(cell.style)
- _style_cache[key] = style_kwargs
-
- if style_kwargs:
- for k, v in style_kwargs.items():
- setattr(xcell, k, v)
-
- if cell.mergestart is not None and cell.mergeend is not None:
- wks.merge_cells(
- start_row=startrow + cell.row + 1,
- start_column=startcol + cell.col + 1,
- end_column=startcol + cell.mergeend + 1,
- end_row=startrow + cell.mergestart + 1,
- )
-
- # When cells are merged only the top-left cell is preserved
- # The behaviour of the other cells in a merged range is
- # undefined
- if style_kwargs:
- first_row = startrow + cell.row + 1
- last_row = startrow + cell.mergestart + 1
- first_col = startcol + cell.col + 1
- last_col = startcol + cell.mergeend + 1
-
- for row in range(first_row, last_row + 1):
- for col in range(first_col, last_col + 1):
- if row == first_row and col == first_col:
- # Ignore first cell. It is already handled.
- continue
- xcell = wks.cell(column=col, row=row)
- for k, v in style_kwargs.items():
- setattr(xcell, k, v)
-
-
-class OpenpyxlReader(BaseExcelReader):
- @doc(storage_options=_shared_docs["storage_options"])
- def __init__(
- self,
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- storage_options: StorageOptions = None,
- ) -> None:
- """
- Reader using openpyxl engine.
-
- Parameters
- ----------
- filepath_or_buffer : str, path object or Workbook
- Object to be parsed.
- {storage_options}
- """
- import_optional_dependency("openpyxl")
- super().__init__(filepath_or_buffer, storage_options=storage_options)
-
- @property
- def _workbook_class(self):
- from openpyxl import Workbook
-
- return Workbook
-
- def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
- from openpyxl import load_workbook
-
- return load_workbook(
- filepath_or_buffer, read_only=True, data_only=True, keep_links=False
- )
-
- @property
- def sheet_names(self) -> list[str]:
- return [sheet.title for sheet in self.book.worksheets]
-
- def get_sheet_by_name(self, name: str):
- self.raise_if_bad_sheet_by_name(name)
- return self.book[name]
-
- def get_sheet_by_index(self, index: int):
- self.raise_if_bad_sheet_by_index(index)
- return self.book.worksheets[index]
-
- def _convert_cell(self, cell) -> Scalar:
- from openpyxl.cell.cell import (
- TYPE_ERROR,
- TYPE_NUMERIC,
- )
-
- if cell.value is None:
- return "" # compat with xlrd
- elif cell.data_type == TYPE_ERROR:
- return np.nan
- elif cell.data_type == TYPE_NUMERIC:
- val = int(cell.value)
- if val == cell.value:
- return val
- return float(cell.value)
-
- return cell.value
-
- def get_sheet_data(
- self, sheet, file_rows_needed: int | None = None
- ) -> list[list[Scalar]]:
- if self.book.read_only:
- sheet.reset_dimensions()
-
- data: list[list[Scalar]] = []
- last_row_with_data = -1
- for row_number, row in enumerate(sheet.rows):
- converted_row = [self._convert_cell(cell) for cell in row]
- while converted_row and converted_row[-1] == "":
- # trim trailing empty elements
- converted_row.pop()
- if converted_row:
- last_row_with_data = row_number
- data.append(converted_row)
- if file_rows_needed is not None and len(data) >= file_rows_needed:
- break
-
- # Trim trailing empty rows
- data = data[: last_row_with_data + 1]
-
- if len(data) > 0:
- # extend rows to max width
- max_width = max(len(data_row) for data_row in data)
- if min(len(data_row) for data_row in data) < max_width:
- empty_cell: list[Scalar] = [""]
- data = [
- data_row + (max_width - len(data_row)) * empty_cell
- for data_row in data
- ]
-
- return data
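
A hedged sketch of the append path handled by OpenpyxlWriter above, assuming the placeholder file report.xlsx already exists and openpyxl is installed.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    # mode="a" is mapped to "r+" internally; if_sheet_exists picks the
    # collision strategy ("error", "new", "replace" or "overlay").
    with pd.ExcelWriter(
        "report.xlsx", engine="openpyxl", mode="a", if_sheet_exists="replace"
    ) as writer:
        df.to_excel(writer, sheet_name="Sheet1", index=False)
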
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_pyxlsb.py b/contrib/python/pandas/py3/pandas/io/excel/_pyxlsb.py
deleted file mode 100644
index 634baee6313..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_pyxlsb.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# pyright: reportMissingImports=false
-from __future__ import annotations
-
-from pandas._typing import (
- FilePath,
- ReadBuffer,
- Scalar,
- StorageOptions,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._decorators import doc
-
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.excel._base import BaseExcelReader
-
-
-class PyxlsbReader(BaseExcelReader):
- @doc(storage_options=_shared_docs["storage_options"])
- def __init__(
- self,
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- storage_options: StorageOptions = None,
- ) -> None:
- """
- Reader using pyxlsb engine.
-
- Parameters
- ----------
- filepath_or_buffer : str, path object, or Workbook
- Object to be parsed.
- {storage_options}
- """
- import_optional_dependency("pyxlsb")
- # This will call load_workbook on the filepath or buffer
- # And set the result to the book-attribute
- super().__init__(filepath_or_buffer, storage_options=storage_options)
-
- @property
- def _workbook_class(self):
- from pyxlsb import Workbook
-
- return Workbook
-
- def load_workbook(self, filepath_or_buffer: FilePath | ReadBuffer[bytes]):
- from pyxlsb import open_workbook
-
- # TODO: hack in buffer capability
- # This might need some modifications to the Pyxlsb library
- # Actual work for opening it is in xlsbpackage.py, line 20-ish
-
- return open_workbook(filepath_or_buffer)
-
- @property
- def sheet_names(self) -> list[str]:
- return self.book.sheets
-
- def get_sheet_by_name(self, name: str):
- self.raise_if_bad_sheet_by_name(name)
- return self.book.get_sheet(name)
-
- def get_sheet_by_index(self, index: int):
- self.raise_if_bad_sheet_by_index(index)
- # pyxlsb sheets are indexed from 1 onwards
- # There's a fix for this in the source, but the pypi package doesn't have it
- return self.book.get_sheet(index + 1)
-
- def _convert_cell(self, cell) -> Scalar:
- # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
- # This means that there is no way to read datetime types from an xlsb file yet
- if cell.v is None:
-            return ""  # use "" so non-named header cells still become "Unnamed: i"
- if isinstance(cell.v, float):
- val = int(cell.v)
- if val == cell.v:
- return val
- else:
- return float(cell.v)
-
- return cell.v
-
- def get_sheet_data(
- self,
- sheet,
- file_rows_needed: int | None = None,
- ) -> list[list[Scalar]]:
- data: list[list[Scalar]] = []
-        previous_row_number = -1
- # When sparse=True the rows can have different lengths and empty rows are
- # not returned. The cells are namedtuples of row, col, value (r, c, v).
- for row in sheet.rows(sparse=True):
- row_number = row[0].r
- converted_row = [self._convert_cell(cell) for cell in row]
- while converted_row and converted_row[-1] == "":
- # trim trailing empty elements
- converted_row.pop()
- if converted_row:
-                data.extend([[]] * (row_number - previous_row_number - 1))
-                data.append(converted_row)
-                previous_row_number = row_number
- if file_rows_needed is not None and len(data) >= file_rows_needed:
- break
- if data:
- # extend rows to max_width
- max_width = max(len(data_row) for data_row in data)
- if min(len(data_row) for data_row in data) < max_width:
- empty_cell: list[Scalar] = [""]
- data = [
- data_row + (max_width - len(data_row)) * empty_cell
- for data_row in data
- ]
- return data
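
A usage sketch for the pyxlsb reader deleted above (placeholder path; the optional pyxlsb dependency is assumed). As noted in _convert_cell, datetimes come back as floats because the binary format does not distinguish them.

    import pandas as pd

    df = pd.read_excel("workbook.xlsb", engine="pyxlsb")
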
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_util.py b/contrib/python/pandas/py3/pandas/io/excel/_util.py
deleted file mode 100644
index 72c64c5ec89..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_util.py
+++ /dev/null
@@ -1,332 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Iterable,
- Literal,
- MutableMapping,
- Sequence,
- TypeVar,
- overload,
-)
-
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.dtypes.common import (
- is_integer,
- is_list_like,
-)
-
-if TYPE_CHECKING:
- from pandas.io.excel._base import ExcelWriter
-
- ExcelWriter_t = type[ExcelWriter]
- usecols_func = TypeVar("usecols_func", bound=Callable[[Hashable], object])
-
-_writers: MutableMapping[str, ExcelWriter_t] = {}
-
-
-def register_writer(klass: ExcelWriter_t) -> None:
- """
-    Add an engine to the excel writer registry (``io.excel``).
-
- You must use this method to integrate with ``to_excel``.
-
- Parameters
- ----------
- klass : ExcelWriter
- """
- if not callable(klass):
- raise ValueError("Can only register callables as engines")
- engine_name = klass._engine
- _writers[engine_name] = klass
-
-
-def get_default_engine(ext: str, mode: Literal["reader", "writer"] = "reader") -> str:
- """
- Return the default reader/writer for the given extension.
-
- Parameters
- ----------
- ext : str
- The excel file extension for which to get the default engine.
- mode : str {'reader', 'writer'}
- Whether to get the default engine for reading or writing.
- Either 'reader' or 'writer'
-
- Returns
- -------
- str
- The default engine for the extension.
- """
- _default_readers = {
- "xlsx": "openpyxl",
- "xlsm": "openpyxl",
- "xlsb": "pyxlsb",
- "xls": "xlrd",
- "ods": "odf",
- }
- _default_writers = {
- "xlsx": "openpyxl",
- "xlsm": "openpyxl",
- "xlsb": "pyxlsb",
- "ods": "odf",
- }
- assert mode in ["reader", "writer"]
- if mode == "writer":
- # Prefer xlsxwriter over openpyxl if installed
- xlsxwriter = import_optional_dependency("xlsxwriter", errors="warn")
- if xlsxwriter:
- _default_writers["xlsx"] = "xlsxwriter"
- return _default_writers[ext]
- else:
- return _default_readers[ext]
-
-
-def get_writer(engine_name: str) -> ExcelWriter_t:
- try:
- return _writers[engine_name]
- except KeyError as err:
- raise ValueError(f"No Excel writer '{engine_name}'") from err
-
-
-def _excel2num(x: str) -> int:
- """
- Convert Excel column name like 'AB' to 0-based column index.
-
- Parameters
- ----------
- x : str
- The Excel column name to convert to a 0-based column index.
-
- Returns
- -------
- num : int
- The column index corresponding to the name.
-
- Raises
- ------
- ValueError
- Part of the Excel column name was invalid.
- """
- index = 0
-
- for c in x.upper().strip():
- cp = ord(c)
-
- if cp < ord("A") or cp > ord("Z"):
- raise ValueError(f"Invalid column name: {x}")
-
- index = index * 26 + cp - ord("A") + 1
-
- return index - 1
-
-
-def _range2cols(areas: str) -> list[int]:
- """
- Convert comma separated list of column names and ranges to indices.
-
- Parameters
- ----------
- areas : str
- A string containing a sequence of column ranges (or areas).
-
- Returns
- -------
- cols : list
- A list of 0-based column indices.
-
- Examples
- --------
- >>> _range2cols('A:E')
- [0, 1, 2, 3, 4]
- >>> _range2cols('A,C,Z:AB')
- [0, 2, 25, 26, 27]
- """
- cols: list[int] = []
-
- for rng in areas.split(","):
- if ":" in rng:
- rngs = rng.split(":")
- cols.extend(range(_excel2num(rngs[0]), _excel2num(rngs[1]) + 1))
- else:
- cols.append(_excel2num(rng))
-
- return cols
-
-
-@overload
-def maybe_convert_usecols(usecols: str | list[int]) -> list[int]:
- ...
-
-
-@overload
-def maybe_convert_usecols(usecols: list[str]) -> list[str]:
- ...
-
-
-@overload
-def maybe_convert_usecols(usecols: usecols_func) -> usecols_func:
- ...
-
-
-@overload
-def maybe_convert_usecols(usecols: None) -> None:
- ...
-
-
-def maybe_convert_usecols(
- usecols: str | list[int] | list[str] | usecols_func | None,
-) -> None | list[int] | list[str] | usecols_func:
- """
- Convert `usecols` into a compatible format for parsing in `parsers.py`.
-
- Parameters
- ----------
- usecols : object
- The use-columns object to potentially convert.
-
- Returns
- -------
- converted : object
- The compatible format of `usecols`.
- """
- if usecols is None:
- return usecols
-
- if is_integer(usecols):
- raise ValueError(
- "Passing an integer for `usecols` is no longer supported. "
- "Please pass in a list of int from 0 to `usecols` inclusive instead."
- )
-
- if isinstance(usecols, str):
- return _range2cols(usecols)
-
- return usecols
-
-
-@overload
-def validate_freeze_panes(freeze_panes: tuple[int, int]) -> Literal[True]:
- ...
-
-
-@overload
-def validate_freeze_panes(freeze_panes: None) -> Literal[False]:
- ...
-
-
-def validate_freeze_panes(freeze_panes: tuple[int, int] | None) -> bool:
- if freeze_panes is not None:
- if len(freeze_panes) == 2 and all(
- isinstance(item, int) for item in freeze_panes
- ):
- return True
-
- raise ValueError(
- "freeze_panes must be of form (row, column) "
- "where row and column are integers"
- )
-
- # freeze_panes wasn't specified, return False so it won't be applied
- # to output sheet
- return False
-
-
-def fill_mi_header(
- row: list[Hashable], control_row: list[bool]
-) -> tuple[list[Hashable], list[bool]]:
- """
- Forward fill blank entries in row but only inside the same parent index.
-
- Used for creating headers in Multiindex.
-
- Parameters
- ----------
- row : list
- List of items in a single row.
- control_row : list of bool
-        Helps to determine if a particular column is in the same parent index as the
- previous value. Used to stop propagation of empty cells between
- different indexes.
-
- Returns
- -------
- Returns changed row and control_row
- """
- last = row[0]
- for i in range(1, len(row)):
- if not control_row[i]:
- last = row[i]
-
- if row[i] == "" or row[i] is None:
- row[i] = last
- else:
- control_row[i] = False
- last = row[i]
-
- return row, control_row
-
-
-def pop_header_name(
- row: list[Hashable], index_col: int | Sequence[int]
-) -> tuple[Hashable | None, list[Hashable]]:
- """
- Pop the header name for MultiIndex parsing.
-
- Parameters
- ----------
- row : list
- The data row to parse for the header name.
- index_col : int, list
- The index columns for our data. Assumed to be non-null.
-
- Returns
- -------
- header_name : str
- The extracted header name.
- trimmed_row : list
- The original data row with the header name removed.
- """
- # Pop out header name and fill w/blank.
- if is_list_like(index_col):
- assert isinstance(index_col, Iterable)
- i = max(index_col)
- else:
- assert not isinstance(index_col, Iterable)
- i = index_col
-
- header_name = row[i]
- header_name = None if header_name == "" else header_name
-
- return header_name, row[:i] + [""] + row[i + 1 :]
-
-
-def combine_kwargs(engine_kwargs: dict[str, Any] | None, kwargs: dict) -> dict:
- """
- Used to combine two sources of kwargs for the backend engine.
-
-    Use of kwargs is deprecated; this function is solely for use in 1.3 and should
- be removed in 1.4/2.0. Also _base.ExcelWriter.__new__ ensures either engine_kwargs
- or kwargs must be None or empty respectively.
-
- Parameters
- ----------
- engine_kwargs: dict
- kwargs to be passed through to the engine.
- kwargs: dict
-        kwargs to be passed through to the engine (deprecated)
-
- Returns
- -------
- engine_kwargs combined with kwargs
- """
- if engine_kwargs is None:
- result = {}
- else:
- result = engine_kwargs.copy()
- result.update(kwargs)
- return result
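
To illustrate the column-selection helpers above, a sketch of the usecols string form that _range2cols expands; the file name is a placeholder.

    import pandas as pd

    # "A,C,Z:AB" expands to the 0-based column indices [0, 2, 25, 26, 27].
    df = pd.read_excel("data.xlsx", usecols="A,C,Z:AB")
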
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_xlrd.py b/contrib/python/pandas/py3/pandas/io/excel/_xlrd.py
deleted file mode 100644
index 37bd4c1ba5b..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_xlrd.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from __future__ import annotations
-
-from datetime import time
-
-import numpy as np
-
-from pandas._typing import (
- Scalar,
- StorageOptions,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._decorators import doc
-
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.excel._base import BaseExcelReader
-
-
-class XlrdReader(BaseExcelReader):
- @doc(storage_options=_shared_docs["storage_options"])
- def __init__(
- self, filepath_or_buffer, storage_options: StorageOptions = None
- ) -> None:
- """
- Reader using xlrd engine.
-
- Parameters
- ----------
- filepath_or_buffer : str, path object or Workbook
- Object to be parsed.
- {storage_options}
- """
- err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
- import_optional_dependency("xlrd", extra=err_msg)
- super().__init__(filepath_or_buffer, storage_options=storage_options)
-
- @property
- def _workbook_class(self):
- from xlrd import Book
-
- return Book
-
- def load_workbook(self, filepath_or_buffer):
- from xlrd import open_workbook
-
- if hasattr(filepath_or_buffer, "read"):
- data = filepath_or_buffer.read()
- return open_workbook(file_contents=data)
- else:
- return open_workbook(filepath_or_buffer)
-
- @property
- def sheet_names(self):
- return self.book.sheet_names()
-
- def get_sheet_by_name(self, name):
- self.raise_if_bad_sheet_by_name(name)
- return self.book.sheet_by_name(name)
-
- def get_sheet_by_index(self, index):
- self.raise_if_bad_sheet_by_index(index)
- return self.book.sheet_by_index(index)
-
- def get_sheet_data(
- self, sheet, file_rows_needed: int | None = None
- ) -> list[list[Scalar]]:
- from xlrd import (
- XL_CELL_BOOLEAN,
- XL_CELL_DATE,
- XL_CELL_ERROR,
- XL_CELL_NUMBER,
- xldate,
- )
-
- epoch1904 = self.book.datemode
-
- def _parse_cell(cell_contents, cell_typ):
- """
- converts the contents of the cell into a pandas appropriate object
- """
- if cell_typ == XL_CELL_DATE:
- # Use the newer xlrd datetime handling.
- try:
- cell_contents = xldate.xldate_as_datetime(cell_contents, epoch1904)
- except OverflowError:
- return cell_contents
-
- # Excel doesn't distinguish between dates and time,
- # so we treat dates on the epoch as times only.
- # Also, Excel supports 1900 and 1904 epochs.
- year = (cell_contents.timetuple())[0:3]
- if (not epoch1904 and year == (1899, 12, 31)) or (
- epoch1904 and year == (1904, 1, 1)
- ):
- cell_contents = time(
- cell_contents.hour,
- cell_contents.minute,
- cell_contents.second,
- cell_contents.microsecond,
- )
-
- elif cell_typ == XL_CELL_ERROR:
- cell_contents = np.nan
- elif cell_typ == XL_CELL_BOOLEAN:
- cell_contents = bool(cell_contents)
- elif cell_typ == XL_CELL_NUMBER:
- # GH5394 - Excel 'numbers' are always floats
- # it's a minimal perf hit and less surprising
- val = int(cell_contents)
- if val == cell_contents:
- cell_contents = val
- return cell_contents
-
- data = []
-
- nrows = sheet.nrows
- if file_rows_needed is not None:
- nrows = min(nrows, file_rows_needed)
- for i in range(nrows):
- row = [
- _parse_cell(value, typ)
- for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
- ]
- data.append(row)
-
- return data
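
A sketch of the legacy path served by XlrdReader above; xlrd >= 2.0 only handles old binary .xls files, and the file name here is a placeholder.

    import pandas as pd

    df = pd.read_excel("legacy.xls", engine="xlrd")
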
diff --git a/contrib/python/pandas/py3/pandas/io/excel/_xlsxwriter.py b/contrib/python/pandas/py3/pandas/io/excel/_xlsxwriter.py
deleted file mode 100644
index 1800d3d87f7..00000000000
--- a/contrib/python/pandas/py3/pandas/io/excel/_xlsxwriter.py
+++ /dev/null
@@ -1,275 +0,0 @@
-from __future__ import annotations
-
-from typing import Any
-
-from pandas._libs import json
-from pandas._typing import (
- FilePath,
- StorageOptions,
- WriteExcelBuffer,
-)
-
-from pandas.io.excel._base import ExcelWriter
-from pandas.io.excel._util import (
- combine_kwargs,
- validate_freeze_panes,
-)
-
-
-class _XlsxStyler:
- # Map from openpyxl-oriented styles to flatter xlsxwriter representation
- # Ordering necessary for both determinism and because some are keyed by
- # prefixes of others.
- STYLE_MAPPING: dict[str, list[tuple[tuple[str, ...], str]]] = {
- "font": [
- (("name",), "font_name"),
- (("sz",), "font_size"),
- (("size",), "font_size"),
- (("color", "rgb"), "font_color"),
- (("color",), "font_color"),
- (("b",), "bold"),
- (("bold",), "bold"),
- (("i",), "italic"),
- (("italic",), "italic"),
- (("u",), "underline"),
- (("underline",), "underline"),
- (("strike",), "font_strikeout"),
- (("vertAlign",), "font_script"),
- (("vertalign",), "font_script"),
- ],
- "number_format": [(("format_code",), "num_format"), ((), "num_format")],
- "protection": [(("locked",), "locked"), (("hidden",), "hidden")],
- "alignment": [
- (("horizontal",), "align"),
- (("vertical",), "valign"),
- (("text_rotation",), "rotation"),
- (("wrap_text",), "text_wrap"),
- (("indent",), "indent"),
- (("shrink_to_fit",), "shrink"),
- ],
- "fill": [
- (("patternType",), "pattern"),
- (("patterntype",), "pattern"),
- (("fill_type",), "pattern"),
- (("start_color", "rgb"), "fg_color"),
- (("fgColor", "rgb"), "fg_color"),
- (("fgcolor", "rgb"), "fg_color"),
- (("start_color",), "fg_color"),
- (("fgColor",), "fg_color"),
- (("fgcolor",), "fg_color"),
- (("end_color", "rgb"), "bg_color"),
- (("bgColor", "rgb"), "bg_color"),
- (("bgcolor", "rgb"), "bg_color"),
- (("end_color",), "bg_color"),
- (("bgColor",), "bg_color"),
- (("bgcolor",), "bg_color"),
- ],
- "border": [
- (("color", "rgb"), "border_color"),
- (("color",), "border_color"),
- (("style",), "border"),
- (("top", "color", "rgb"), "top_color"),
- (("top", "color"), "top_color"),
- (("top", "style"), "top"),
- (("top",), "top"),
- (("right", "color", "rgb"), "right_color"),
- (("right", "color"), "right_color"),
- (("right", "style"), "right"),
- (("right",), "right"),
- (("bottom", "color", "rgb"), "bottom_color"),
- (("bottom", "color"), "bottom_color"),
- (("bottom", "style"), "bottom"),
- (("bottom",), "bottom"),
- (("left", "color", "rgb"), "left_color"),
- (("left", "color"), "left_color"),
- (("left", "style"), "left"),
- (("left",), "left"),
- ],
- }
-
- @classmethod
- def convert(cls, style_dict, num_format_str=None):
- """
- converts a style_dict to an xlsxwriter format dict
-
- Parameters
- ----------
- style_dict : style dictionary to convert
- num_format_str : optional number format string
- """
- # Create a XlsxWriter format object.
- props = {}
-
- if num_format_str is not None:
- props["num_format"] = num_format_str
-
- if style_dict is None:
- return props
-
- if "borders" in style_dict:
- style_dict = style_dict.copy()
- style_dict["border"] = style_dict.pop("borders")
-
- for style_group_key, style_group in style_dict.items():
- for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):
- # src is a sequence of keys into a nested dict
- # dst is a flat key
- if dst in props:
- continue
- v = style_group
- for k in src:
- try:
- v = v[k]
- except (KeyError, TypeError):
- break
- else:
- props[dst] = v
-
- if isinstance(props.get("pattern"), str):
- # TODO: support other fill patterns
- props["pattern"] = 0 if props["pattern"] == "none" else 1
-
- for k in ["border", "top", "right", "bottom", "left"]:
- if isinstance(props.get(k), str):
- try:
- props[k] = [
- "none",
- "thin",
- "medium",
- "dashed",
- "dotted",
- "thick",
- "double",
- "hair",
- "mediumDashed",
- "dashDot",
- "mediumDashDot",
- "dashDotDot",
- "mediumDashDotDot",
- "slantDashDot",
- ].index(props[k])
- except ValueError:
- props[k] = 2
-
- if isinstance(props.get("font_script"), str):
- props["font_script"] = ["baseline", "superscript", "subscript"].index(
- props["font_script"]
- )
-
- if isinstance(props.get("underline"), str):
- props["underline"] = {
- "none": 0,
- "single": 1,
- "double": 2,
- "singleAccounting": 33,
- "doubleAccounting": 34,
- }[props["underline"]]
-
- # GH 30107 - xlsxwriter uses different name
- if props.get("valign") == "center":
- props["valign"] = "vcenter"
-
- return props
-
-
-class XlsxWriter(ExcelWriter):
- _engine = "xlsxwriter"
- _supported_extensions = (".xlsx",)
-
- def __init__(
- self,
- path: FilePath | WriteExcelBuffer | ExcelWriter,
- engine: str | None = None,
- date_format: str | None = None,
- datetime_format: str | None = None,
- mode: str = "w",
- storage_options: StorageOptions = None,
- if_sheet_exists: str | None = None,
- engine_kwargs: dict[str, Any] | None = None,
- **kwargs,
- ) -> None:
- # Use the xlsxwriter module as the Excel writer.
- from xlsxwriter import Workbook
-
- engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
-
- if mode == "a":
- raise ValueError("Append mode is not supported with xlsxwriter!")
-
- super().__init__(
- path,
- engine=engine,
- date_format=date_format,
- datetime_format=datetime_format,
- mode=mode,
- storage_options=storage_options,
- if_sheet_exists=if_sheet_exists,
- engine_kwargs=engine_kwargs,
- )
-
- self._book = Workbook(self._handles.handle, **engine_kwargs)
-
- @property
- def book(self):
- """
- Book instance of class xlsxwriter.Workbook.
-
- This attribute can be used to access engine-specific features.
- """
- return self._book
-
- @property
- def sheets(self) -> dict[str, Any]:
- result = self.book.sheetnames
- return result
-
- def _save(self) -> None:
- """
- Save workbook to disk.
- """
- self.book.close()
-
- def _write_cells(
- self,
- cells,
- sheet_name: str | None = None,
- startrow: int = 0,
- startcol: int = 0,
- freeze_panes: tuple[int, int] | None = None,
- ) -> None:
- # Write the frame cells using xlsxwriter.
- sheet_name = self._get_sheet_name(sheet_name)
-
- wks = self.book.get_worksheet_by_name(sheet_name)
- if wks is None:
- wks = self.book.add_worksheet(sheet_name)
-
- style_dict = {"null": None}
-
- if validate_freeze_panes(freeze_panes):
- wks.freeze_panes(*(freeze_panes))
-
- for cell in cells:
- val, fmt = self._value_with_fmt(cell.val)
-
- stylekey = json.dumps(cell.style)
- if fmt:
- stylekey += fmt
-
- if stylekey in style_dict:
- style = style_dict[stylekey]
- else:
- style = self.book.add_format(_XlsxStyler.convert(cell.style, fmt))
- style_dict[stylekey] = style
-
- if cell.mergestart is not None and cell.mergeend is not None:
- wks.merge_range(
- startrow + cell.row,
- startcol + cell.col,
- startrow + cell.mergestart,
- startcol + cell.mergeend,
- val,
- style,
- )
- else:
- wks.write(startrow + cell.row, startcol + cell.col, val, style)
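
A write-side sketch for the xlsxwriter engine above, showing the freeze_panes tuple that validate_freeze_panes checks; the output name is a placeholder and xlsxwriter is assumed to be installed.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    with pd.ExcelWriter("styled.xlsx", engine="xlsxwriter") as writer:
        # (1, 0) keeps the header row visible while scrolling.
        df.to_excel(writer, sheet_name="Sheet1", freeze_panes=(1, 0))
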
diff --git a/contrib/python/pandas/py3/pandas/io/feather_format.py b/contrib/python/pandas/py3/pandas/io/feather_format.py
deleted file mode 100644
index f45f5f104fd..00000000000
--- a/contrib/python/pandas/py3/pandas/io/feather_format.py
+++ /dev/null
@@ -1,162 +0,0 @@
-""" feather-format compat """
-from __future__ import annotations
-
-from typing import (
- Hashable,
- Sequence,
-)
-
-from pandas._libs import lib
-from pandas._typing import (
- DtypeBackend,
- FilePath,
- ReadBuffer,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._decorators import doc
-from pandas.util._validators import check_dtype_backend
-
-import pandas as pd
-from pandas.core.api import (
- DataFrame,
- RangeIndex,
-)
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import get_handle
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-def to_feather(
- df: DataFrame,
- path: FilePath | WriteBuffer[bytes],
- storage_options: StorageOptions = None,
- **kwargs,
-) -> None:
- """
- Write a DataFrame to the binary Feather format.
-
- Parameters
- ----------
- df : DataFrame
- path : str, path object, or file-like object
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- **kwargs :
- Additional keywords passed to `pyarrow.feather.write_feather`.
-
- .. versionadded:: 1.1.0
- """
- import_optional_dependency("pyarrow")
- from pyarrow import feather
-
- if not isinstance(df, DataFrame):
- raise ValueError("feather only support IO with DataFrames")
-
- valid_types = {"string", "unicode"}
-
- # validate index
- # --------------
-
- # validate that we have only a default index
- # raise on anything else as we don't serialize the index
-
- if not df.index.dtype == "int64":
- typ = type(df.index)
- raise ValueError(
- f"feather does not support serializing {typ} "
- "for the index; you can .reset_index() to make the index into column(s)"
- )
-
- if not df.index.equals(RangeIndex.from_range(range(len(df)))):
- raise ValueError(
- "feather does not support serializing a non-default index for the index; "
- "you can .reset_index() to make the index into column(s)"
- )
-
- if df.index.name is not None:
- raise ValueError(
- "feather does not serialize index meta-data on a default index"
- )
-
- # validate columns
- # ----------------
-
- # must have value column names (strings only)
- if df.columns.inferred_type not in valid_types:
- raise ValueError("feather must have string column names")
-
- with get_handle(
- path, "wb", storage_options=storage_options, is_text=False
- ) as handles:
- feather.write_feather(df, handles.handle, **kwargs)
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-def read_feather(
- path: FilePath | ReadBuffer[bytes],
- columns: Sequence[Hashable] | None = None,
- use_threads: bool = True,
- storage_options: StorageOptions = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-):
- """
- Load a feather-format object from the file path.
-
- Parameters
- ----------
- path : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``read()`` function. The string could be a URL.
- Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be: ``file://localhost/path/to/table.feather``.
- columns : sequence, default None
- If not provided, all columns are read.
- use_threads : bool, default True
- Whether to parallelize reading using multiple threads.
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
-        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
-        arrays: nullable dtypes are used for all dtypes that have a nullable
-        implementation when "numpy_nullable" is set, and pyarrow is used for
-        all dtypes if "pyarrow" is set.
-
-        The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- type of object stored in file
- """
- import_optional_dependency("pyarrow")
- from pyarrow import feather
-
- check_dtype_backend(dtype_backend)
-
- with get_handle(
- path, "rb", storage_options=storage_options, is_text=False
- ) as handles:
- if dtype_backend is lib.no_default:
- return feather.read_feather(
- handles.handle, columns=columns, use_threads=bool(use_threads)
- )
-
- pa_table = feather.read_table(
- handles.handle, columns=columns, use_threads=bool(use_threads)
- )
-
- if dtype_backend == "numpy_nullable":
- from pandas.io._util import _arrow_dtype_mapping
-
- return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
-
- elif dtype_backend == "pyarrow":
- return pa_table.to_pandas(types_mapper=pd.ArrowDtype)
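For reference, a minimal usage sketch (editorial note, not part of the diff): the to_feather/read_feather helpers above back the public DataFrame.to_feather and pandas.read_feather entry points. Requires pyarrow; the file name is illustrative.

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})

# A non-default index would raise, per the validation in to_feather() above.
df.to_feather("frame.feather")

# dtype_backend="numpy_nullable" takes the feather.read_table branch above.
back = pd.read_feather("frame.feather", dtype_backend="numpy_nullable")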
diff --git a/contrib/python/pandas/py3/pandas/io/formats/__init__.py b/contrib/python/pandas/py3/pandas/io/formats/__init__.py
deleted file mode 100644
index 8a3486a4d71..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/__init__.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- # import modules that have public classes/functions
- from pandas.io.formats import style
-
- # and mark only those modules as public
- __all__ = ["style"]
diff --git a/contrib/python/pandas/py3/pandas/io/formats/_color_data.py b/contrib/python/pandas/py3/pandas/io/formats/_color_data.py
deleted file mode 100644
index 2e7cb7f2964..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/_color_data.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# GH37967: Enable the use of CSS named colors, as defined in
-# matplotlib.colors.CSS4_COLORS, when exporting to Excel.
-# This data has been copied here, instead of being imported from matplotlib,
-# not to have ``to_excel`` methods require matplotlib.
-# source: matplotlib._color_data (3.3.3)
-from __future__ import annotations
-
-CSS4_COLORS = {
- "aliceblue": "F0F8FF",
- "antiquewhite": "FAEBD7",
- "aqua": "00FFFF",
- "aquamarine": "7FFFD4",
- "azure": "F0FFFF",
- "beige": "F5F5DC",
- "bisque": "FFE4C4",
- "black": "000000",
- "blanchedalmond": "FFEBCD",
- "blue": "0000FF",
- "blueviolet": "8A2BE2",
- "brown": "A52A2A",
- "burlywood": "DEB887",
- "cadetblue": "5F9EA0",
- "chartreuse": "7FFF00",
- "chocolate": "D2691E",
- "coral": "FF7F50",
- "cornflowerblue": "6495ED",
- "cornsilk": "FFF8DC",
- "crimson": "DC143C",
- "cyan": "00FFFF",
- "darkblue": "00008B",
- "darkcyan": "008B8B",
- "darkgoldenrod": "B8860B",
- "darkgray": "A9A9A9",
- "darkgreen": "006400",
- "darkgrey": "A9A9A9",
- "darkkhaki": "BDB76B",
- "darkmagenta": "8B008B",
- "darkolivegreen": "556B2F",
- "darkorange": "FF8C00",
- "darkorchid": "9932CC",
- "darkred": "8B0000",
- "darksalmon": "E9967A",
- "darkseagreen": "8FBC8F",
- "darkslateblue": "483D8B",
- "darkslategray": "2F4F4F",
- "darkslategrey": "2F4F4F",
- "darkturquoise": "00CED1",
- "darkviolet": "9400D3",
- "deeppink": "FF1493",
- "deepskyblue": "00BFFF",
- "dimgray": "696969",
- "dimgrey": "696969",
- "dodgerblue": "1E90FF",
- "firebrick": "B22222",
- "floralwhite": "FFFAF0",
- "forestgreen": "228B22",
- "fuchsia": "FF00FF",
- "gainsboro": "DCDCDC",
- "ghostwhite": "F8F8FF",
- "gold": "FFD700",
- "goldenrod": "DAA520",
- "gray": "808080",
- "green": "008000",
- "greenyellow": "ADFF2F",
- "grey": "808080",
- "honeydew": "F0FFF0",
- "hotpink": "FF69B4",
- "indianred": "CD5C5C",
- "indigo": "4B0082",
- "ivory": "FFFFF0",
- "khaki": "F0E68C",
- "lavender": "E6E6FA",
- "lavenderblush": "FFF0F5",
- "lawngreen": "7CFC00",
- "lemonchiffon": "FFFACD",
- "lightblue": "ADD8E6",
- "lightcoral": "F08080",
- "lightcyan": "E0FFFF",
- "lightgoldenrodyellow": "FAFAD2",
- "lightgray": "D3D3D3",
- "lightgreen": "90EE90",
- "lightgrey": "D3D3D3",
- "lightpink": "FFB6C1",
- "lightsalmon": "FFA07A",
- "lightseagreen": "20B2AA",
- "lightskyblue": "87CEFA",
- "lightslategray": "778899",
- "lightslategrey": "778899",
- "lightsteelblue": "B0C4DE",
- "lightyellow": "FFFFE0",
- "lime": "00FF00",
- "limegreen": "32CD32",
- "linen": "FAF0E6",
- "magenta": "FF00FF",
- "maroon": "800000",
- "mediumaquamarine": "66CDAA",
- "mediumblue": "0000CD",
- "mediumorchid": "BA55D3",
- "mediumpurple": "9370DB",
- "mediumseagreen": "3CB371",
- "mediumslateblue": "7B68EE",
- "mediumspringgreen": "00FA9A",
- "mediumturquoise": "48D1CC",
- "mediumvioletred": "C71585",
- "midnightblue": "191970",
- "mintcream": "F5FFFA",
- "mistyrose": "FFE4E1",
- "moccasin": "FFE4B5",
- "navajowhite": "FFDEAD",
- "navy": "000080",
- "oldlace": "FDF5E6",
- "olive": "808000",
- "olivedrab": "6B8E23",
- "orange": "FFA500",
- "orangered": "FF4500",
- "orchid": "DA70D6",
- "palegoldenrod": "EEE8AA",
- "palegreen": "98FB98",
- "paleturquoise": "AFEEEE",
- "palevioletred": "DB7093",
- "papayawhip": "FFEFD5",
- "peachpuff": "FFDAB9",
- "peru": "CD853F",
- "pink": "FFC0CB",
- "plum": "DDA0DD",
- "powderblue": "B0E0E6",
- "purple": "800080",
- "rebeccapurple": "663399",
- "red": "FF0000",
- "rosybrown": "BC8F8F",
- "royalblue": "4169E1",
- "saddlebrown": "8B4513",
- "salmon": "FA8072",
- "sandybrown": "F4A460",
- "seagreen": "2E8B57",
- "seashell": "FFF5EE",
- "sienna": "A0522D",
- "silver": "C0C0C0",
- "skyblue": "87CEEB",
- "slateblue": "6A5ACD",
- "slategray": "708090",
- "slategrey": "708090",
- "snow": "FFFAFA",
- "springgreen": "00FF7F",
- "steelblue": "4682B4",
- "tan": "D2B48C",
- "teal": "008080",
- "thistle": "D8BFD8",
- "tomato": "FF6347",
- "turquoise": "40E0D0",
- "violet": "EE82EE",
- "wheat": "F5DEB3",
- "white": "FFFFFF",
- "whitesmoke": "F5F5F5",
- "yellow": "FFFF00",
- "yellowgreen": "9ACD32",
-}
diff --git a/contrib/python/pandas/py3/pandas/io/formats/console.py b/contrib/python/pandas/py3/pandas/io/formats/console.py
deleted file mode 100644
index 2a6cbe07629..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/console.py
+++ /dev/null
@@ -1,94 +0,0 @@
-"""
-Internal module for console introspection
-"""
-from __future__ import annotations
-
-from shutil import get_terminal_size
-
-
-def get_console_size() -> tuple[int | None, int | None]:
- """
-    Return console size as a tuple (width, height).
-
-    Returns (None, None) in a non-interactive session.
- """
- from pandas import get_option
-
- display_width = get_option("display.width")
- display_height = get_option("display.max_rows")
-
-    # Consider:
-    # - interactive shell terminal: can detect terminal size
-    # - interactive non-shell terminal (ipnb/ipqtconsole): cannot detect
-    #   terminal size
-    # - non-interactive script: should disregard terminal size
-
-    # In addition, width and height have default values, but setting them to
-    # 'None' signals that auto-detection should be used, though only in an
-    # interactive shell terminal.
-
- if in_interactive_session():
- if in_ipython_frontend():
- # sane defaults for interactive non-shell terminal
- # match default for width,height in config_init
- from pandas._config.config import get_default_val
-
- terminal_width = get_default_val("display.width")
- terminal_height = get_default_val("display.max_rows")
- else:
- # pure terminal
- terminal_width, terminal_height = get_terminal_size()
- else:
- terminal_width, terminal_height = None, None
-
-    # Note: if the user sets width/height to None (auto-detection) and we are
-    # in a script (non-interactive), this returns (None, None) and the caller
-    # needs to handle it.
- return display_width or terminal_width, display_height or terminal_height
-
-
-# ----------------------------------------------------------------------
-# Detect our environment
-
-
-def in_interactive_session() -> bool:
- """
- Check if we're running in an interactive shell.
-
- Returns
- -------
- bool
- True if running under python/ipython interactive shell.
- """
- from pandas import get_option
-
- def check_main():
- try:
- import __main__ as main
- except ModuleNotFoundError:
- return get_option("mode.sim_interactive")
- return not hasattr(main, "__file__") or get_option("mode.sim_interactive")
-
- try:
- # error: Name '__IPYTHON__' is not defined
- return __IPYTHON__ or check_main() # type: ignore[name-defined]
- except NameError:
- return check_main()
-
-
-def in_ipython_frontend() -> bool:
- """
- Check if we're inside an IPython zmq frontend.
-
- Returns
- -------
- bool
- """
- try:
- # error: Name 'get_ipython' is not defined
- ip = get_ipython() # type: ignore[name-defined]
- return "zmq" in str(type(ip)).lower()
- except NameError:
- pass
-
- return False
diff --git a/contrib/python/pandas/py3/pandas/io/formats/css.py b/contrib/python/pandas/py3/pandas/io/formats/css.py
deleted file mode 100644
index f2f808a6e20..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/css.py
+++ /dev/null
@@ -1,418 +0,0 @@
-"""
-Utilities for interpreting CSS from Stylers for formatting non-HTML outputs.
-"""
-from __future__ import annotations
-
-import re
-from typing import (
- Callable,
- Generator,
- Iterable,
- Iterator,
-)
-import warnings
-
-from pandas.errors import CSSWarning
-from pandas.util._exceptions import find_stack_level
-
-
-def _side_expander(prop_fmt: str) -> Callable:
- """
- Wrapper to expand shorthand property into top, right, bottom, left properties
-
- Parameters
- ----------
-    prop_fmt : str
-        Format string used to build the side-specific property names,
-        e.g. "margin-{:s}"
-
-    Returns
-    -------
-    function: Function to call when a shorthand '{prop}: {value}' string is encountered
- """
-
- def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]:
- """
- Expand shorthand property into side-specific property (top, right, bottom, left)
-
- Parameters
- ----------
- prop (str): CSS property name
- value (str): String token for property
-
- Yields
- ------
- Tuple (str, str): Expanded property, value
- """
- tokens = value.split()
- try:
- mapping = self.SIDE_SHORTHANDS[len(tokens)]
- except KeyError:
- warnings.warn(
- f'Could not expand "{prop}: {value}"',
- CSSWarning,
- stacklevel=find_stack_level(),
- )
- return
- for key, idx in zip(self.SIDES, mapping):
- yield prop_fmt.format(key), tokens[idx]
-
- return expand
-
-
-def _border_expander(side: str = "") -> Callable:
- """
- Wrapper to expand 'border' property into border color, style, and width properties
-
- Parameters
- ----------
- side : str
- The border side to expand into properties
-
- Returns
- -------
-    function: Function to call when a 'border(-{side}): {value}' string is encountered
- """
- if side != "":
- side = f"-{side}"
-
- def expand(self, prop, value: str) -> Generator[tuple[str, str], None, None]:
- """
- Expand border into color, style, and width tuples
-
- Parameters
- ----------
- prop : str
- CSS property name passed to styler
- value : str
- Value passed to styler for property
-
- Yields
- ------
- Tuple (str, str): Expanded property, value
- """
- tokens = value.split()
- if len(tokens) == 0 or len(tokens) > 3:
- warnings.warn(
-                f'Wrong number of tokens provided to "{prop}" (expected 1-3)',
- CSSWarning,
- stacklevel=find_stack_level(),
- )
-
- # TODO: Can we use current color as initial value to comply with CSS standards?
- border_declarations = {
- f"border{side}-color": "black",
- f"border{side}-style": "none",
- f"border{side}-width": "medium",
- }
- for token in tokens:
- if token.lower() in self.BORDER_STYLES:
- border_declarations[f"border{side}-style"] = token
- elif any(ratio in token.lower() for ratio in self.BORDER_WIDTH_RATIOS):
- border_declarations[f"border{side}-width"] = token
- else:
- border_declarations[f"border{side}-color"] = token
- # TODO: Warn user if item entered more than once (e.g. "border: red green")
-
- # Per CSS, "border" will reset previous "border-*" definitions
- yield from self.atomize(border_declarations.items())
-
- return expand
-
-
-class CSSResolver:
- """
- A callable for parsing and resolving CSS to atomic properties.
- """
-
- UNIT_RATIOS = {
- "pt": ("pt", 1),
- "em": ("em", 1),
- "rem": ("pt", 12),
- "ex": ("em", 0.5),
- # 'ch':
- "px": ("pt", 0.75),
- "pc": ("pt", 12),
- "in": ("pt", 72),
- "cm": ("in", 1 / 2.54),
- "mm": ("in", 1 / 25.4),
- "q": ("mm", 0.25),
- "!!default": ("em", 0),
- }
-
- FONT_SIZE_RATIOS = UNIT_RATIOS.copy()
- FONT_SIZE_RATIOS.update(
- {
- "%": ("em", 0.01),
- "xx-small": ("rem", 0.5),
- "x-small": ("rem", 0.625),
- "small": ("rem", 0.8),
- "medium": ("rem", 1),
- "large": ("rem", 1.125),
- "x-large": ("rem", 1.5),
- "xx-large": ("rem", 2),
- "smaller": ("em", 1 / 1.2),
- "larger": ("em", 1.2),
- "!!default": ("em", 1),
- }
- )
-
- MARGIN_RATIOS = UNIT_RATIOS.copy()
- MARGIN_RATIOS.update({"none": ("pt", 0)})
-
- BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy()
- BORDER_WIDTH_RATIOS.update(
- {
- "none": ("pt", 0),
- "thick": ("px", 4),
- "medium": ("px", 2),
- "thin": ("px", 1),
- # Default: medium only if solid
- }
- )
-
- BORDER_STYLES = [
- "none",
- "hidden",
- "dotted",
- "dashed",
- "solid",
- "double",
- "groove",
- "ridge",
- "inset",
- "outset",
- "mediumdashdot",
- "dashdotdot",
- "hair",
- "mediumdashdotdot",
- "dashdot",
- "slantdashdot",
- "mediumdashed",
- ]
-
- SIDE_SHORTHANDS = {
- 1: [0, 0, 0, 0],
- 2: [0, 1, 0, 1],
- 3: [0, 1, 2, 1],
- 4: [0, 1, 2, 3],
- }
-
- SIDES = ("top", "right", "bottom", "left")
-
- CSS_EXPANSIONS = {
- **{
- (f"border-{prop}" if prop else "border"): _border_expander(prop)
- for prop in ["", "top", "right", "bottom", "left"]
- },
- **{
- f"border-{prop}": _side_expander(f"border-{{:s}}-{prop}")
- for prop in ["color", "style", "width"]
- },
- **{
- "margin": _side_expander("margin-{:s}"),
- "padding": _side_expander("padding-{:s}"),
- },
- }
-
- def __call__(
- self,
- declarations: str | Iterable[tuple[str, str]],
- inherited: dict[str, str] | None = None,
- ) -> dict[str, str]:
- """
-        Resolve the given declarations to atomic properties.
-
- Parameters
- ----------
-        declarations : str | Iterable[tuple[str, str]]
-            A CSS string or a set of CSS declaration tuples,
-            e.g. "font-weight: bold; background: blue" or
-            {("font-weight", "bold"), ("background", "blue")}
-        inherited : dict, optional
-            Atomic properties indicating the inherited style context in which
-            ``declarations`` is to be resolved. ``inherited`` should already
-            be resolved, i.e. be valid output of this method.
-
- Returns
- -------
- dict
- Atomic CSS 2.2 properties.
-
- Examples
- --------
- >>> resolve = CSSResolver()
- >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'}
- >>> out = resolve('''
- ... border-color: BLUE RED;
- ... font-size: 1em;
- ... font-size: 2em;
- ... font-weight: normal;
- ... font-weight: inherit;
- ... ''', inherited)
- >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE
- [('border-bottom-color', 'blue'),
- ('border-left-color', 'red'),
- ('border-right-color', 'red'),
- ('border-top-color', 'blue'),
- ('font-family', 'serif'),
- ('font-size', '24pt'),
- ('font-weight', 'bold')]
- """
- if isinstance(declarations, str):
- declarations = self.parse(declarations)
- props = dict(self.atomize(declarations))
- if inherited is None:
- inherited = {}
-
- props = self._update_initial(props, inherited)
- props = self._update_font_size(props, inherited)
- return self._update_other_units(props)
-
- def _update_initial(
- self,
- props: dict[str, str],
- inherited: dict[str, str],
- ) -> dict[str, str]:
- # 1. resolve inherited, initial
- for prop, val in inherited.items():
- if prop not in props:
- props[prop] = val
-
- new_props = props.copy()
- for prop, val in props.items():
- if val == "inherit":
- val = inherited.get(prop, "initial")
-
- if val in ("initial", None):
- # we do not define a complete initial stylesheet
- del new_props[prop]
- else:
- new_props[prop] = val
- return new_props
-
- def _update_font_size(
- self,
- props: dict[str, str],
- inherited: dict[str, str],
- ) -> dict[str, str]:
- # 2. resolve relative font size
- if props.get("font-size"):
- props["font-size"] = self.size_to_pt(
- props["font-size"],
- self._get_font_size(inherited),
- conversions=self.FONT_SIZE_RATIOS,
- )
- return props
-
- def _get_font_size(self, props: dict[str, str]) -> float | None:
- if props.get("font-size"):
- font_size_string = props["font-size"]
- return self._get_float_font_size_from_pt(font_size_string)
- return None
-
- def _get_float_font_size_from_pt(self, font_size_string: str) -> float:
- assert font_size_string.endswith("pt")
- return float(font_size_string.rstrip("pt"))
-
- def _update_other_units(self, props: dict[str, str]) -> dict[str, str]:
- font_size = self._get_font_size(props)
- # 3. TODO: resolve other font-relative units
- for side in self.SIDES:
- prop = f"border-{side}-width"
- if prop in props:
- props[prop] = self.size_to_pt(
- props[prop],
- em_pt=font_size,
- conversions=self.BORDER_WIDTH_RATIOS,
- )
-
- for prop in [f"margin-{side}", f"padding-{side}"]:
- if prop in props:
- # TODO: support %
- props[prop] = self.size_to_pt(
- props[prop],
- em_pt=font_size,
- conversions=self.MARGIN_RATIOS,
- )
- return props
-
- def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS):
- def _error():
- warnings.warn(
- f"Unhandled size: {repr(in_val)}",
- CSSWarning,
- stacklevel=find_stack_level(),
- )
- return self.size_to_pt("1!!default", conversions=conversions)
-
- match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val)
- if match is None:
- return _error()
-
- val, unit = match.groups()
- if val == "":
- # hack for 'large' etc.
- val = 1
- else:
- try:
- val = float(val)
- except ValueError:
- return _error()
-
- while unit != "pt":
- if unit == "em":
- if em_pt is None:
- unit = "rem"
- else:
- val *= em_pt
- unit = "pt"
- continue
-
- try:
- unit, mul = conversions[unit]
- except KeyError:
- return _error()
- val *= mul
-
- val = round(val, 5)
- if int(val) == val:
- size_fmt = f"{int(val):d}pt"
- else:
- size_fmt = f"{val:f}pt"
- return size_fmt
-
- def atomize(self, declarations: Iterable) -> Generator[tuple[str, str], None, None]:
- for prop, value in declarations:
- prop = prop.lower()
- value = value.lower()
- if prop in self.CSS_EXPANSIONS:
- expand = self.CSS_EXPANSIONS[prop]
- yield from expand(self, prop, value)
- else:
- yield prop, value
-
- def parse(self, declarations_str: str) -> Iterator[tuple[str, str]]:
- """
- Generates (prop, value) pairs from declarations.
-
- In a future version may generate parsed tokens from tinycss/tinycss2
-
- Parameters
- ----------
- declarations_str : str
- """
- for decl in declarations_str.split(";"):
- if not decl.strip():
- continue
- prop, sep, val = decl.partition(":")
- prop = prop.strip().lower()
- # TODO: don't lowercase case sensitive parts of values (strings)
- val = val.strip().lower()
- if sep:
- yield prop, val
- else:
- warnings.warn(
- f"Ill-formatted attribute: expected a colon in {repr(decl)}",
- CSSWarning,
- stacklevel=find_stack_level(),
- )
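For reference, a minimal usage sketch (editorial note, not part of the diff) of the CSSResolver defined above; it is an internal helper, and the import path shown matches the deleted module.

from pandas.io.formats.css import CSSResolver

resolve = CSSResolver()
props = resolve("border: 1px solid red; margin: 1em 2em")
# "border" expands into border-{top,right,bottom,left}-{color,style,width},
# "margin" into margin-{top,right,bottom,left}, and relative units are
# normalized to points (e.g. 1em -> 12pt when no inherited font size is given).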
diff --git a/contrib/python/pandas/py3/pandas/io/formats/csvs.py b/contrib/python/pandas/py3/pandas/io/formats/csvs.py
deleted file mode 100644
index bda8de2de88..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/csvs.py
+++ /dev/null
@@ -1,319 +0,0 @@
-"""
-Module for formatting output data into CSV files.
-"""
-
-from __future__ import annotations
-
-import csv as csvlib
-import os
-from typing import (
- TYPE_CHECKING,
- Any,
- Hashable,
- Iterator,
- Sequence,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import writers as libwriters
-from pandas._typing import (
- CompressionOptions,
- FilePath,
- FloatFormatType,
- IndexLabel,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.generic import (
- ABCDatetimeIndex,
- ABCIndex,
- ABCMultiIndex,
- ABCPeriodIndex,
-)
-from pandas.core.dtypes.missing import notna
-
-from pandas.core.indexes.api import Index
-
-from pandas.io.common import get_handle
-
-if TYPE_CHECKING:
- from pandas.io.formats.format import DataFrameFormatter
-
-
-class CSVFormatter:
- cols: np.ndarray
-
- def __init__(
- self,
- formatter: DataFrameFormatter,
- path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
- sep: str = ",",
- cols: Sequence[Hashable] | None = None,
- index_label: IndexLabel | None = None,
- mode: str = "w",
- encoding: str | None = None,
- errors: str = "strict",
- compression: CompressionOptions = "infer",
- quoting: int | None = None,
- lineterminator: str | None = "\n",
- chunksize: int | None = None,
- quotechar: str | None = '"',
- date_format: str | None = None,
- doublequote: bool = True,
- escapechar: str | None = None,
- storage_options: StorageOptions = None,
- ) -> None:
- self.fmt = formatter
-
- self.obj = self.fmt.frame
-
- self.filepath_or_buffer = path_or_buf
- self.encoding = encoding
- self.compression: CompressionOptions = compression
- self.mode = mode
- self.storage_options = storage_options
-
- self.sep = sep
- self.index_label = self._initialize_index_label(index_label)
- self.errors = errors
- self.quoting = quoting or csvlib.QUOTE_MINIMAL
- self.quotechar = self._initialize_quotechar(quotechar)
- self.doublequote = doublequote
- self.escapechar = escapechar
- self.lineterminator = lineterminator or os.linesep
- self.date_format = date_format
- self.cols = self._initialize_columns(cols)
- self.chunksize = self._initialize_chunksize(chunksize)
-
- @property
- def na_rep(self) -> str:
- return self.fmt.na_rep
-
- @property
- def float_format(self) -> FloatFormatType | None:
- return self.fmt.float_format
-
- @property
- def decimal(self) -> str:
- return self.fmt.decimal
-
- @property
- def header(self) -> bool | Sequence[str]:
- return self.fmt.header
-
- @property
- def index(self) -> bool:
- return self.fmt.index
-
- def _initialize_index_label(self, index_label: IndexLabel | None) -> IndexLabel:
- if index_label is not False:
- if index_label is None:
- return self._get_index_label_from_obj()
- elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndex)):
- # given a string for a DF with Index
- return [index_label]
- return index_label
-
- def _get_index_label_from_obj(self) -> Sequence[Hashable]:
- if isinstance(self.obj.index, ABCMultiIndex):
- return self._get_index_label_multiindex()
- else:
- return self._get_index_label_flat()
-
- def _get_index_label_multiindex(self) -> Sequence[Hashable]:
- return [name or "" for name in self.obj.index.names]
-
- def _get_index_label_flat(self) -> Sequence[Hashable]:
- index_label = self.obj.index.name
- return [""] if index_label is None else [index_label]
-
- def _initialize_quotechar(self, quotechar: str | None) -> str | None:
- if self.quoting != csvlib.QUOTE_NONE:
- # prevents crash in _csv
- return quotechar
- return None
-
- @property
- def has_mi_columns(self) -> bool:
- return bool(isinstance(self.obj.columns, ABCMultiIndex))
-
- def _initialize_columns(self, cols: Sequence[Hashable] | None) -> np.ndarray:
- # validate mi options
- if self.has_mi_columns:
- if cols is not None:
- msg = "cannot specify cols with a MultiIndex on the columns"
- raise TypeError(msg)
-
- if cols is not None:
- if isinstance(cols, ABCIndex):
- cols = cols._format_native_types(**self._number_format)
- else:
- cols = list(cols)
- self.obj = self.obj.loc[:, cols]
-
- # update columns to include possible multiplicity of dupes
- # and make sure cols is just a list of labels
- new_cols = self.obj.columns
- return new_cols._format_native_types(**self._number_format)
-
- def _initialize_chunksize(self, chunksize: int | None) -> int:
- if chunksize is None:
- return (100000 // (len(self.cols) or 1)) or 1
- return int(chunksize)
-
- @property
- def _number_format(self) -> dict[str, Any]:
- """Dictionary used for storing number formatting settings."""
- return {
- "na_rep": self.na_rep,
- "float_format": self.float_format,
- "date_format": self.date_format,
- "quoting": self.quoting,
- "decimal": self.decimal,
- }
-
- @cache_readonly
- def data_index(self) -> Index:
- data_index = self.obj.index
- if (
- isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex))
- and self.date_format is not None
- ):
- data_index = Index(
- [x.strftime(self.date_format) if notna(x) else "" for x in data_index]
- )
- elif isinstance(data_index, ABCMultiIndex):
- data_index = data_index.remove_unused_levels()
- return data_index
-
- @property
- def nlevels(self) -> int:
- if self.index:
- return getattr(self.data_index, "nlevels", 1)
- else:
- return 0
-
- @property
- def _has_aliases(self) -> bool:
- return isinstance(self.header, (tuple, list, np.ndarray, ABCIndex))
-
- @property
- def _need_to_save_header(self) -> bool:
- return bool(self._has_aliases or self.header)
-
- @property
- def write_cols(self) -> Sequence[Hashable]:
- if self._has_aliases:
- assert not isinstance(self.header, bool)
- if len(self.header) != len(self.cols):
- raise ValueError(
- f"Writing {len(self.cols)} cols but got {len(self.header)} aliases"
- )
- return self.header
- else:
- # self.cols is an ndarray derived from Index._format_native_types,
- # so its entries are strings, i.e. hashable
- return cast(Sequence[Hashable], self.cols)
-
- @property
- def encoded_labels(self) -> list[Hashable]:
- encoded_labels: list[Hashable] = []
-
- if self.index and self.index_label:
- assert isinstance(self.index_label, Sequence)
- encoded_labels = list(self.index_label)
-
- if not self.has_mi_columns or self._has_aliases:
- encoded_labels += list(self.write_cols)
-
- return encoded_labels
-
- def save(self) -> None:
- """
- Create the writer & save.
- """
- # apply compression and byte/text conversion
- with get_handle(
- self.filepath_or_buffer,
- self.mode,
- encoding=self.encoding,
- errors=self.errors,
- compression=self.compression,
- storage_options=self.storage_options,
- ) as handles:
- # Note: self.encoding is irrelevant here
- self.writer = csvlib.writer(
- handles.handle,
- lineterminator=self.lineterminator,
- delimiter=self.sep,
- quoting=self.quoting,
- doublequote=self.doublequote,
- escapechar=self.escapechar,
- quotechar=self.quotechar,
- )
-
- self._save()
-
- def _save(self) -> None:
- if self._need_to_save_header:
- self._save_header()
- self._save_body()
-
- def _save_header(self) -> None:
- if not self.has_mi_columns or self._has_aliases:
- self.writer.writerow(self.encoded_labels)
- else:
- for row in self._generate_multiindex_header_rows():
- self.writer.writerow(row)
-
- def _generate_multiindex_header_rows(self) -> Iterator[list[Hashable]]:
- columns = self.obj.columns
- for i in range(columns.nlevels):
- # we need at least 1 index column to write our col names
- col_line = []
- if self.index:
- # name is the first column
- col_line.append(columns.names[i])
-
- if isinstance(self.index_label, list) and len(self.index_label) > 1:
- col_line.extend([""] * (len(self.index_label) - 1))
-
- col_line.extend(columns._get_level_values(i))
- yield col_line
-
- # Write out the index line if it's not empty.
- # Otherwise, we will print out an extraneous
- # blank line between the mi and the data rows.
- if self.encoded_labels and set(self.encoded_labels) != {""}:
- yield self.encoded_labels + [""] * len(columns)
-
- def _save_body(self) -> None:
- nrows = len(self.data_index)
- chunks = (nrows // self.chunksize) + 1
- for i in range(chunks):
- start_i = i * self.chunksize
- end_i = min(start_i + self.chunksize, nrows)
- if start_i >= end_i:
- break
- self._save_chunk(start_i, end_i)
-
- def _save_chunk(self, start_i: int, end_i: int) -> None:
- # create the data for a chunk
- slicer = slice(start_i, end_i)
- df = self.obj.iloc[slicer]
-
- res = df._mgr.to_native_types(**self._number_format)
- data = [res.iget_values(i) for i in range(len(res.items))]
-
- ix = self.data_index[slicer]._format_native_types(**self._number_format)
- libwriters.write_csv_rows(
- data,
- ix,
- self.nlevels,
- self.cols,
- self.writer,
- )
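For reference, a minimal usage sketch (editorial note, not part of the diff): CSVFormatter above is internal and is driven by DataFrame.to_csv; the file name and keyword values are illustrative.

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, float("nan")]})

# na_rep, float_format and chunksize map onto the CSVFormatter attributes
# above; compression is resolved inside get_handle() when save() runs.
df.to_csv("out.csv.gz", na_rep="NA", float_format="%.2f",
          compression="gzip", chunksize=100_000)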
diff --git a/contrib/python/pandas/py3/pandas/io/formats/excel.py b/contrib/python/pandas/py3/pandas/io/formats/excel.py
deleted file mode 100644
index 34c4d330761..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/excel.py
+++ /dev/null
@@ -1,950 +0,0 @@
-"""
-Utilities for conversion to writer-agnostic Excel representation.
-"""
-from __future__ import annotations
-
-from functools import (
- lru_cache,
- reduce,
-)
-import itertools
-import re
-from typing import (
- Any,
- Callable,
- Hashable,
- Iterable,
- Mapping,
- Sequence,
- cast,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs.lib import is_list_like
-from pandas._typing import (
- IndexLabel,
- StorageOptions,
-)
-from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes import missing
-from pandas.core.dtypes.common import (
- is_float,
- is_scalar,
-)
-
-from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- PeriodIndex,
-)
-import pandas.core.common as com
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.formats._color_data import CSS4_COLORS
-from pandas.io.formats.css import (
- CSSResolver,
- CSSWarning,
-)
-from pandas.io.formats.format import get_level_lengths
-from pandas.io.formats.printing import pprint_thing
-
-
-class ExcelCell:
- __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend")
- __slots__ = __fields__
-
- def __init__(
- self,
- row: int,
- col: int,
- val,
- style=None,
- mergestart: int | None = None,
- mergeend: int | None = None,
- ) -> None:
- self.row = row
- self.col = col
- self.val = val
- self.style = style
- self.mergestart = mergestart
- self.mergeend = mergeend
-
-
-class CssExcelCell(ExcelCell):
- def __init__(
- self,
- row: int,
- col: int,
- val,
- style: dict | None,
- css_styles: dict[tuple[int, int], list[tuple[str, Any]]] | None,
- css_row: int,
- css_col: int,
- css_converter: Callable | None,
- **kwargs,
- ) -> None:
- if css_styles and css_converter:
- # Use dict to get only one (case-insensitive) declaration per property
- declaration_dict = {
- prop.lower(): val for prop, val in css_styles[css_row, css_col]
- }
- # Convert to frozenset for order-invariant caching
- unique_declarations = frozenset(declaration_dict.items())
- style = css_converter(unique_declarations)
-
- super().__init__(row=row, col=col, val=val, style=style, **kwargs)
-
-
-class CSSToExcelConverter:
- """
- A callable for converting CSS declarations to ExcelWriter styles
-
- Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
- focusing on font styling, backgrounds, borders and alignment.
-
- Operates by first computing CSS styles in a fairly generic
- way (see :meth:`compute_css`) then determining Excel style
- properties from CSS properties (see :meth:`build_xlstyle`).
-
- Parameters
- ----------
- inherited : str, optional
- CSS declarations understood to be the containing scope for the
- CSS processed by :meth:`__call__`.
- """
-
- NAMED_COLORS = CSS4_COLORS
-
- VERTICAL_MAP = {
- "top": "top",
- "text-top": "top",
- "middle": "center",
- "baseline": "bottom",
- "bottom": "bottom",
- "text-bottom": "bottom",
- # OpenXML also has 'justify', 'distributed'
- }
-
- BOLD_MAP = {
- "bold": True,
- "bolder": True,
- "600": True,
- "700": True,
- "800": True,
- "900": True,
- "normal": False,
- "lighter": False,
- "100": False,
- "200": False,
- "300": False,
- "400": False,
- "500": False,
- }
-
- ITALIC_MAP = {
- "normal": False,
- "italic": True,
- "oblique": True,
- }
-
- FAMILY_MAP = {
- "serif": 1, # roman
- "sans-serif": 2, # swiss
- "cursive": 4, # script
- "fantasy": 5, # decorative
- }
-
- BORDER_STYLE_MAP = {
- style.lower(): style
- for style in [
- "dashed",
- "mediumDashDot",
- "dashDotDot",
- "hair",
- "dotted",
- "mediumDashDotDot",
- "double",
- "dashDot",
- "slantDashDot",
- "mediumDashed",
- ]
- }
-
- # NB: Most of the methods here could be classmethods, as only __init__
- # and __call__ make use of instance attributes. We leave them as
- # instancemethods so that users can easily experiment with extensions
- # without monkey-patching.
- inherited: dict[str, str] | None
-
- def __init__(self, inherited: str | None = None) -> None:
- if inherited is not None:
- self.inherited = self.compute_css(inherited)
- else:
- self.inherited = None
- # We should avoid lru_cache on the __call__ method.
- # Otherwise once the method __call__ has been called
- # garbage collection no longer deletes the instance.
- self._call_cached = lru_cache(maxsize=None)(self._call_uncached)
-
- compute_css = CSSResolver()
-
- def __call__(
- self, declarations: str | frozenset[tuple[str, str]]
- ) -> dict[str, dict[str, str]]:
- """
- Convert CSS declarations to ExcelWriter style.
-
- Parameters
- ----------
- declarations : str | frozenset[tuple[str, str]]
- CSS string or set of CSS declaration tuples.
- e.g. "font-weight: bold; background: blue" or
- {("font-weight", "bold"), ("background", "blue")}
-
- Returns
- -------
- xlstyle : dict
- A style as interpreted by ExcelWriter when found in
- ExcelCell.style.
- """
- return self._call_cached(declarations)
-
- def _call_uncached(
- self, declarations: str | frozenset[tuple[str, str]]
- ) -> dict[str, dict[str, str]]:
- properties = self.compute_css(declarations, self.inherited)
- return self.build_xlstyle(properties)
-
- def build_xlstyle(self, props: Mapping[str, str]) -> dict[str, dict[str, str]]:
- out = {
- "alignment": self.build_alignment(props),
- "border": self.build_border(props),
- "fill": self.build_fill(props),
- "font": self.build_font(props),
- "number_format": self.build_number_format(props),
- }
-
- # TODO: handle cell width and height: needs support in pandas.io.excel
-
- def remove_none(d: dict[str, str | None]) -> None:
- """Remove key where value is None, through nested dicts"""
- for k, v in list(d.items()):
- if v is None:
- del d[k]
- elif isinstance(v, dict):
- remove_none(v)
- if not v:
- del d[k]
-
- remove_none(out)
- return out
-
- def build_alignment(self, props: Mapping[str, str]) -> dict[str, bool | str | None]:
- # TODO: text-indent, padding-left -> alignment.indent
- return {
- "horizontal": props.get("text-align"),
- "vertical": self._get_vertical_alignment(props),
- "wrap_text": self._get_is_wrap_text(props),
- }
-
- def _get_vertical_alignment(self, props: Mapping[str, str]) -> str | None:
- vertical_align = props.get("vertical-align")
- if vertical_align:
- return self.VERTICAL_MAP.get(vertical_align)
- return None
-
- def _get_is_wrap_text(self, props: Mapping[str, str]) -> bool | None:
- if props.get("white-space") is None:
- return None
- return bool(props["white-space"] not in ("nowrap", "pre", "pre-line"))
-
- def build_border(
- self, props: Mapping[str, str]
- ) -> dict[str, dict[str, str | None]]:
- return {
- side: {
- "style": self._border_style(
- props.get(f"border-{side}-style"),
- props.get(f"border-{side}-width"),
- self.color_to_excel(props.get(f"border-{side}-color")),
- ),
- "color": self.color_to_excel(props.get(f"border-{side}-color")),
- }
- for side in ["top", "right", "bottom", "left"]
- }
-
- def _border_style(self, style: str | None, width: str | None, color: str | None):
- # convert styles and widths to openxml, one of:
- # 'dashDot'
- # 'dashDotDot'
- # 'dashed'
- # 'dotted'
- # 'double'
- # 'hair'
- # 'medium'
- # 'mediumDashDot'
- # 'mediumDashDotDot'
- # 'mediumDashed'
- # 'slantDashDot'
- # 'thick'
- # 'thin'
- if width is None and style is None and color is None:
- # Return None will remove "border" from style dictionary
- return None
-
- if width is None and style is None:
- # Return "none" will keep "border" in style dictionary
- return "none"
-
- if style in ("none", "hidden"):
- return "none"
-
- width_name = self._get_width_name(width)
- if width_name is None:
- return "none"
-
- if style in (None, "groove", "ridge", "inset", "outset", "solid"):
- # not handled
- return width_name
-
- if style == "double":
- return "double"
- if style == "dotted":
- if width_name in ("hair", "thin"):
- return "dotted"
- return "mediumDashDotDot"
- if style == "dashed":
- if width_name in ("hair", "thin"):
- return "dashed"
- return "mediumDashed"
- elif style in self.BORDER_STYLE_MAP:
- # Excel-specific styles
- return self.BORDER_STYLE_MAP[style]
- else:
- warnings.warn(
- f"Unhandled border style format: {repr(style)}",
- CSSWarning,
- stacklevel=find_stack_level(),
- )
- return "none"
-
- def _get_width_name(self, width_input: str | None) -> str | None:
- width = self._width_to_float(width_input)
- if width < 1e-5:
- return None
- elif width < 1.3:
- return "thin"
- elif width < 2.8:
- return "medium"
- return "thick"
-
- def _width_to_float(self, width: str | None) -> float:
- if width is None:
- width = "2pt"
- return self._pt_to_float(width)
-
- def _pt_to_float(self, pt_string: str) -> float:
- assert pt_string.endswith("pt")
- return float(pt_string.rstrip("pt"))
-
- def build_fill(self, props: Mapping[str, str]):
- # TODO: perhaps allow for special properties
- # -excel-pattern-bgcolor and -excel-pattern-type
- fill_color = props.get("background-color")
- if fill_color not in (None, "transparent", "none"):
- return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"}
-
- def build_number_format(self, props: Mapping[str, str]) -> dict[str, str | None]:
- fc = props.get("number-format")
- fc = fc.replace("§", ";") if isinstance(fc, str) else fc
- return {"format_code": fc}
-
- def build_font(
- self, props: Mapping[str, str]
- ) -> dict[str, bool | float | str | None]:
- font_names = self._get_font_names(props)
- decoration = self._get_decoration(props)
- return {
- "name": font_names[0] if font_names else None,
- "family": self._select_font_family(font_names),
- "size": self._get_font_size(props),
- "bold": self._get_is_bold(props),
- "italic": self._get_is_italic(props),
- "underline": ("single" if "underline" in decoration else None),
- "strike": ("line-through" in decoration) or None,
- "color": self.color_to_excel(props.get("color")),
- # shadow if nonzero digit before shadow color
- "shadow": self._get_shadow(props),
- }
-
- def _get_is_bold(self, props: Mapping[str, str]) -> bool | None:
- weight = props.get("font-weight")
- if weight:
- return self.BOLD_MAP.get(weight)
- return None
-
- def _get_is_italic(self, props: Mapping[str, str]) -> bool | None:
- font_style = props.get("font-style")
- if font_style:
- return self.ITALIC_MAP.get(font_style)
- return None
-
- def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]:
- decoration = props.get("text-decoration")
- if decoration is not None:
- return decoration.split()
- else:
- return ()
-
- def _get_underline(self, decoration: Sequence[str]) -> str | None:
- if "underline" in decoration:
- return "single"
- return None
-
- def _get_shadow(self, props: Mapping[str, str]) -> bool | None:
- if "text-shadow" in props:
- return bool(re.search("^[^#(]*[1-9]", props["text-shadow"]))
- return None
-
- def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]:
- font_names_tmp = re.findall(
- r"""(?x)
- (
- "(?:[^"]|\\")+"
- |
- '(?:[^']|\\')+'
- |
- [^'",]+
- )(?=,|\s*$)
- """,
- props.get("font-family", ""),
- )
-
- font_names = []
- for name in font_names_tmp:
- if name[:1] == '"':
- name = name[1:-1].replace('\\"', '"')
- elif name[:1] == "'":
- name = name[1:-1].replace("\\'", "'")
- else:
- name = name.strip()
- if name:
- font_names.append(name)
- return font_names
-
- def _get_font_size(self, props: Mapping[str, str]) -> float | None:
- size = props.get("font-size")
- if size is None:
- return size
- return self._pt_to_float(size)
-
- def _select_font_family(self, font_names) -> int | None:
- family = None
- for name in font_names:
- family = self.FAMILY_MAP.get(name)
- if family:
- break
-
- return family
-
- def color_to_excel(self, val: str | None) -> str | None:
- if val is None:
- return None
-
- if self._is_hex_color(val):
- return self._convert_hex_to_excel(val)
-
- try:
- return self.NAMED_COLORS[val]
- except KeyError:
- warnings.warn(
- f"Unhandled color format: {repr(val)}",
- CSSWarning,
- stacklevel=find_stack_level(),
- )
- return None
-
- def _is_hex_color(self, color_string: str) -> bool:
- return bool(color_string.startswith("#"))
-
- def _convert_hex_to_excel(self, color_string: str) -> str:
- code = color_string.lstrip("#")
- if self._is_shorthand_color(color_string):
- return (code[0] * 2 + code[1] * 2 + code[2] * 2).upper()
- else:
- return code.upper()
-
- def _is_shorthand_color(self, color_string: str) -> bool:
- """Check if color code is shorthand.
-
- #FFF is a shorthand as opposed to full #FFFFFF.
- """
- code = color_string.lstrip("#")
- if len(code) == 3:
- return True
- elif len(code) == 6:
- return False
- else:
- raise ValueError(f"Unexpected color {color_string}")
-
-
-class ExcelFormatter:
- """
-    Class for formatting a DataFrame to a list of ExcelCells.
-
- Parameters
- ----------
- df : DataFrame or Styler
-    na_rep : str, default ''
-        Missing value (NA) representation
- float_format : str, default None
- Format string for floating point numbers
- cols : sequence, optional
- Columns to write
- header : bool or sequence of str, default True
-        Write out column names. If a list of strings is given, it is
-        assumed to be aliases for the column names
-    index : bool, default True
-        Output row names (index)
- index_label : str or sequence, default None
- Column label for index column(s) if desired. If None is given, and
- `header` and `index` are True, then the index names are used. A
- sequence should be given if the DataFrame uses MultiIndex.
- merge_cells : bool, default False
- Format MultiIndex and Hierarchical Rows as merged cells.
- inf_rep : str, default `'inf'`
-        Representation for np.inf values (which aren't representable in Excel).
-        A `'-'` sign will be added in front of -inf.
- style_converter : callable, optional
- This translates Styler styles (CSS) into ExcelWriter styles.
- Defaults to ``CSSToExcelConverter()``.
-        It should take a string of CSS declarations and return an Excel style dict.
- This is only called for body cells.
- """
-
- max_rows = 2**20
- max_cols = 2**14
-
- def __init__(
- self,
- df,
- na_rep: str = "",
- float_format: str | None = None,
- cols: Sequence[Hashable] | None = None,
- header: Sequence[Hashable] | bool = True,
- index: bool = True,
- index_label: IndexLabel | None = None,
- merge_cells: bool = False,
- inf_rep: str = "inf",
- style_converter: Callable | None = None,
- ) -> None:
- self.rowcounter = 0
- self.na_rep = na_rep
- if not isinstance(df, DataFrame):
- self.styler = df
- self.styler._compute() # calculate applied styles
- df = df.data
- if style_converter is None:
- style_converter = CSSToExcelConverter()
- self.style_converter: Callable | None = style_converter
- else:
- self.styler = None
- self.style_converter = None
- self.df = df
- if cols is not None:
- # all missing, raise
- if not len(Index(cols).intersection(df.columns)):
- raise KeyError("passes columns are not ALL present dataframe")
-
- if len(Index(cols).intersection(df.columns)) != len(set(cols)):
- # Deprecated in GH#17295, enforced in 1.0.0
- raise KeyError("Not all names specified in 'columns' are found")
-
- self.df = df.reindex(columns=cols)
-
- self.columns = self.df.columns
- self.float_format = float_format
- self.index = index
- self.index_label = index_label
- self.header = header
- self.merge_cells = merge_cells
- self.inf_rep = inf_rep
-
- @property
- def header_style(self) -> dict[str, dict[str, str | bool]]:
- return {
- "font": {"bold": True},
- "borders": {
- "top": "thin",
- "right": "thin",
- "bottom": "thin",
- "left": "thin",
- },
- "alignment": {"horizontal": "center", "vertical": "top"},
- }
-
- def _format_value(self, val):
- if is_scalar(val) and missing.isna(val):
- val = self.na_rep
- elif is_float(val):
- if missing.isposinf_scalar(val):
- val = self.inf_rep
- elif missing.isneginf_scalar(val):
- val = f"-{self.inf_rep}"
- elif self.float_format is not None:
- val = float(self.float_format % val)
- if getattr(val, "tzinfo", None) is not None:
- raise ValueError(
- "Excel does not support datetimes with "
- "timezones. Please ensure that datetimes "
- "are timezone unaware before writing to Excel."
- )
- return val
-
- def _format_header_mi(self) -> Iterable[ExcelCell]:
- if self.columns.nlevels > 1:
- if not self.index:
- raise NotImplementedError(
- "Writing to Excel with MultiIndex columns and no "
- "index ('index'=False) is not yet implemented."
- )
-
- if not (self._has_aliases or self.header):
- return
-
- columns = self.columns
- level_strs = columns.format(
- sparsify=self.merge_cells, adjoin=False, names=False
- )
- level_lengths = get_level_lengths(level_strs)
- coloffset = 0
- lnum = 0
-
- if self.index and isinstance(self.df.index, MultiIndex):
- coloffset = len(self.df.index[0]) - 1
-
- if self.merge_cells:
-            # Format multi-index as merged cells.
- for lnum, name in enumerate(columns.names):
- yield ExcelCell(
- row=lnum,
- col=coloffset,
- val=name,
- style=self.header_style,
- )
-
- for lnum, (spans, levels, level_codes) in enumerate(
- zip(level_lengths, columns.levels, columns.codes)
- ):
- values = levels.take(level_codes)
- for i, span_val in spans.items():
- mergestart, mergeend = None, None
- if span_val > 1:
- mergestart, mergeend = lnum, coloffset + i + span_val
- yield CssExcelCell(
- row=lnum,
- col=coloffset + i + 1,
- val=values[i],
- style=self.header_style,
- css_styles=getattr(self.styler, "ctx_columns", None),
- css_row=lnum,
- css_col=i,
- css_converter=self.style_converter,
- mergestart=mergestart,
- mergeend=mergeend,
- )
- else:
- # Format in legacy format with dots to indicate levels.
- for i, values in enumerate(zip(*level_strs)):
- v = ".".join(map(pprint_thing, values))
- yield CssExcelCell(
- row=lnum,
- col=coloffset + i + 1,
- val=v,
- style=self.header_style,
- css_styles=getattr(self.styler, "ctx_columns", None),
- css_row=lnum,
- css_col=i,
- css_converter=self.style_converter,
- )
-
- self.rowcounter = lnum
-
- def _format_header_regular(self) -> Iterable[ExcelCell]:
- if self._has_aliases or self.header:
- coloffset = 0
-
- if self.index:
- coloffset = 1
- if isinstance(self.df.index, MultiIndex):
- coloffset = len(self.df.index.names)
-
- colnames = self.columns
- if self._has_aliases:
- self.header = cast(Sequence, self.header)
- if len(self.header) != len(self.columns):
- raise ValueError(
- f"Writing {len(self.columns)} cols "
- f"but got {len(self.header)} aliases"
- )
- colnames = self.header
-
- for colindex, colname in enumerate(colnames):
- yield CssExcelCell(
- row=self.rowcounter,
- col=colindex + coloffset,
- val=colname,
- style=self.header_style,
- css_styles=getattr(self.styler, "ctx_columns", None),
- css_row=0,
- css_col=colindex,
- css_converter=self.style_converter,
- )
-
- def _format_header(self) -> Iterable[ExcelCell]:
- gen: Iterable[ExcelCell]
-
- if isinstance(self.columns, MultiIndex):
- gen = self._format_header_mi()
- else:
- gen = self._format_header_regular()
-
- gen2: Iterable[ExcelCell] = ()
-
- if self.df.index.names:
- row = [x if x is not None else "" for x in self.df.index.names] + [
- ""
- ] * len(self.columns)
- if reduce(lambda x, y: x and y, map(lambda x: x != "", row)):
- gen2 = (
- ExcelCell(self.rowcounter, colindex, val, self.header_style)
- for colindex, val in enumerate(row)
- )
- self.rowcounter += 1
- return itertools.chain(gen, gen2)
-
- def _format_body(self) -> Iterable[ExcelCell]:
- if isinstance(self.df.index, MultiIndex):
- return self._format_hierarchical_rows()
- else:
- return self._format_regular_rows()
-
- def _format_regular_rows(self) -> Iterable[ExcelCell]:
- if self._has_aliases or self.header:
- self.rowcounter += 1
-
- # output index and index_label?
- if self.index:
- # check aliases
- # if list only take first as this is not a MultiIndex
- if self.index_label and isinstance(
- self.index_label, (list, tuple, np.ndarray, Index)
- ):
- index_label = self.index_label[0]
- # if string good to go
- elif self.index_label and isinstance(self.index_label, str):
- index_label = self.index_label
- else:
- index_label = self.df.index.names[0]
-
- if isinstance(self.columns, MultiIndex):
- self.rowcounter += 1
-
- if index_label and self.header is not False:
- yield ExcelCell(self.rowcounter - 1, 0, index_label, self.header_style)
-
- # write index_values
- index_values = self.df.index
- if isinstance(self.df.index, PeriodIndex):
- index_values = self.df.index.to_timestamp()
-
- for idx, idxval in enumerate(index_values):
- yield CssExcelCell(
- row=self.rowcounter + idx,
- col=0,
- val=idxval,
- style=self.header_style,
- css_styles=getattr(self.styler, "ctx_index", None),
- css_row=idx,
- css_col=0,
- css_converter=self.style_converter,
- )
- coloffset = 1
- else:
- coloffset = 0
-
- yield from self._generate_body(coloffset)
-
- def _format_hierarchical_rows(self) -> Iterable[ExcelCell]:
- if self._has_aliases or self.header:
- self.rowcounter += 1
-
- gcolidx = 0
-
- if self.index:
- index_labels = self.df.index.names
- # check for aliases
- if self.index_label and isinstance(
- self.index_label, (list, tuple, np.ndarray, Index)
- ):
- index_labels = self.index_label
-
- # MultiIndex columns require an extra row
- # with index names (blank if None) for
- # unambiguous round-trip, unless not merging,
-            # in which case the names all go on one row (Issue #11328)
- if isinstance(self.columns, MultiIndex) and self.merge_cells:
- self.rowcounter += 1
-
- # if index labels are not empty go ahead and dump
- if com.any_not_none(*index_labels) and self.header is not False:
- for cidx, name in enumerate(index_labels):
- yield ExcelCell(self.rowcounter - 1, cidx, name, self.header_style)
-
- if self.merge_cells:
- # Format hierarchical rows as merged cells.
- level_strs = self.df.index.format(
- sparsify=True, adjoin=False, names=False
- )
- level_lengths = get_level_lengths(level_strs)
-
- for spans, levels, level_codes in zip(
- level_lengths, self.df.index.levels, self.df.index.codes
- ):
- values = levels.take(
- level_codes,
- allow_fill=levels._can_hold_na,
- fill_value=levels._na_value,
- )
-
- for i, span_val in spans.items():
- mergestart, mergeend = None, None
- if span_val > 1:
- mergestart = self.rowcounter + i + span_val - 1
- mergeend = gcolidx
- yield CssExcelCell(
- row=self.rowcounter + i,
- col=gcolidx,
- val=values[i],
- style=self.header_style,
- css_styles=getattr(self.styler, "ctx_index", None),
- css_row=i,
- css_col=gcolidx,
- css_converter=self.style_converter,
- mergestart=mergestart,
- mergeend=mergeend,
- )
- gcolidx += 1
-
- else:
- # Format hierarchical rows with non-merged values.
- for indexcolvals in zip(*self.df.index):
- for idx, indexcolval in enumerate(indexcolvals):
- yield CssExcelCell(
- row=self.rowcounter + idx,
- col=gcolidx,
- val=indexcolval,
- style=self.header_style,
- css_styles=getattr(self.styler, "ctx_index", None),
- css_row=idx,
- css_col=gcolidx,
- css_converter=self.style_converter,
- )
- gcolidx += 1
-
- yield from self._generate_body(gcolidx)
-
- @property
- def _has_aliases(self) -> bool:
- """Whether the aliases for column names are present."""
- return is_list_like(self.header)
-
- def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]:
- # Write the body of the frame data series by series.
- for colidx in range(len(self.columns)):
- series = self.df.iloc[:, colidx]
- for i, val in enumerate(series):
- yield CssExcelCell(
- row=self.rowcounter + i,
- col=colidx + coloffset,
- val=val,
- style=None,
- css_styles=getattr(self.styler, "ctx", None),
- css_row=i,
- css_col=colidx,
- css_converter=self.style_converter,
- )
-
- def get_formatted_cells(self) -> Iterable[ExcelCell]:
- for cell in itertools.chain(self._format_header(), self._format_body()):
- cell.val = self._format_value(cell.val)
- yield cell
-
- @doc(storage_options=_shared_docs["storage_options"])
- def write(
- self,
- writer,
- sheet_name: str = "Sheet1",
- startrow: int = 0,
- startcol: int = 0,
- freeze_panes: tuple[int, int] | None = None,
- engine: str | None = None,
- storage_options: StorageOptions = None,
- ) -> None:
- """
-        Parameters
-        ----------
-        writer : path-like, file-like, or ExcelWriter object
-            File path or existing ExcelWriter.
-        sheet_name : str, default 'Sheet1'
-            Name of sheet which will contain DataFrame.
-        startrow : int, default 0
-            Upper left cell row to dump data frame.
-        startcol : int, default 0
-            Upper left cell column to dump data frame.
-        freeze_panes : tuple of int (length 2), default None
-            Specifies the one-based bottommost row and rightmost column that
-            is to be frozen.
-        engine : str, default None
-            Write engine to use if ``writer`` is a path - you can also set
-            this via the options ``io.excel.xlsx.writer`` or
-            ``io.excel.xlsm.writer``.
-
- {storage_options}
-
- .. versionadded:: 1.2.0
- """
- from pandas.io.excel import ExcelWriter
-
- num_rows, num_cols = self.df.shape
- if num_rows > self.max_rows or num_cols > self.max_cols:
- raise ValueError(
- f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} "
- f"Max sheet size is: {self.max_rows}, {self.max_cols}"
- )
-
- formatted_cells = self.get_formatted_cells()
- if isinstance(writer, ExcelWriter):
- need_save = False
- else:
- # error: Cannot instantiate abstract class 'ExcelWriter' with abstract
- # attributes 'engine', 'save', 'supported_extensions' and 'write_cells'
- writer = ExcelWriter( # type: ignore[abstract]
- writer, engine=engine, storage_options=storage_options
- )
- need_save = True
-
- try:
- writer._write_cells(
- formatted_cells,
- sheet_name,
- startrow=startrow,
- startcol=startcol,
- freeze_panes=freeze_panes,
- )
- finally:
- # make sure to close opened file handles
- if need_save:
- writer.close()
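For reference, a minimal usage sketch (editorial note, not part of the diff) of the CSSToExcelConverter defined above, which the Styler-to-Excel path uses to turn CSS declarations into ExcelWriter style dicts; the import path matches the deleted module.

from pandas.io.formats.excel import CSSToExcelConverter

convert = CSSToExcelConverter()
style = convert("font-weight: bold; border: thin solid blue")
# Roughly: {"font": {"bold": True},
#           "border": {"top": {"style": "thin", "color": "0000FF"}, ...}}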
diff --git a/contrib/python/pandas/py3/pandas/io/formats/format.py b/contrib/python/pandas/py3/pandas/io/formats/format.py
deleted file mode 100644
index ee2eaa6c583..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/format.py
+++ /dev/null
@@ -1,2240 +0,0 @@
-"""
-Internal module for formatting output data in csv, html, xml,
-and latex files. This module also applies to display formatting.
-"""
-from __future__ import annotations
-
-from contextlib import contextmanager
-from csv import (
- QUOTE_NONE,
- QUOTE_NONNUMERIC,
-)
-from decimal import Decimal
-from functools import partial
-from io import StringIO
-import math
-import re
-from shutil import get_terminal_size
-from typing import (
- IO,
- TYPE_CHECKING,
- Any,
- Callable,
- Final,
- Generator,
- Hashable,
- Iterable,
- List,
- Mapping,
- Sequence,
- cast,
-)
-from unicodedata import east_asian_width
-
-import numpy as np
-
-from pandas._config.config import (
- get_option,
- set_option,
-)
-
-from pandas._libs import lib
-from pandas._libs.missing import NA
-from pandas._libs.tslibs import (
- NaT,
- Timedelta,
- Timestamp,
- get_unit_from_dtype,
- iNaT,
- periods_per_day,
-)
-from pandas._libs.tslibs.nattype import NaTType
-from pandas._typing import (
- ArrayLike,
- Axes,
- ColspaceArgType,
- ColspaceType,
- CompressionOptions,
- FilePath,
- FloatFormatType,
- FormattersType,
- IndexLabel,
- StorageOptions,
- WriteBuffer,
-)
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_complex_dtype,
- is_datetime64_dtype,
- is_extension_array_dtype,
- is_float,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_numeric_dtype,
- is_scalar,
- is_timedelta64_dtype,
-)
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
-from pandas.core.dtypes.missing import (
- isna,
- notna,
-)
-
-from pandas.core.arrays import (
- Categorical,
- DatetimeArray,
- TimedeltaArray,
-)
-from pandas.core.arrays.string_ import StringDtype
-from pandas.core.base import PandasObject
-import pandas.core.common as com
-from pandas.core.construction import extract_array
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
- PeriodIndex,
- ensure_index,
-)
-from pandas.core.indexes.datetimes import DatetimeIndex
-from pandas.core.indexes.timedeltas import TimedeltaIndex
-from pandas.core.reshape.concat import concat
-
-from pandas.io.common import (
- check_parent_directory,
- stringify_path,
-)
-from pandas.io.formats import printing
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
-
-
-common_docstring: Final = """
- Parameters
- ----------
- buf : str, Path or StringIO-like, optional, default None
- Buffer to write to. If None, the output is returned as a string.
- columns : sequence, optional, default None
- The subset of columns to write. Writes all columns by default.
- col_space : %(col_space_type)s, optional
- %(col_space)s.
- header : %(header_type)s, optional
- %(header)s.
- index : bool, optional, default True
- Whether to print index (row) labels.
- na_rep : str, optional, default 'NaN'
- String representation of ``NaN`` to use.
- formatters : list, tuple or dict of one-param. functions, optional
- Formatter functions to apply to columns' elements by position or
- name.
- The result of each function must be a unicode string.
- List/tuple must be of length equal to the number of columns.
- float_format : one-parameter function, optional, default None
- Formatter function to apply to columns' elements if they are
- floats. This function must return a unicode string and will be
- applied only to the non-``NaN`` elements, with ``NaN`` being
- handled by ``na_rep``.
-
- .. versionchanged:: 1.2.0
-
- sparsify : bool, optional, default True
- Set to False for a DataFrame with a hierarchical index to print
- every multiindex key at each row.
- index_names : bool, optional, default True
- Prints the names of the indexes.
- justify : str, default None
- How to justify the column labels. If None uses the option from
- the print configuration (controlled by set_option), 'right' out
- of the box. Valid values are
-
- * left
- * right
- * center
- * justify
- * justify-all
- * start
- * end
- * inherit
- * match-parent
- * initial
- * unset.
- max_rows : int, optional
- Maximum number of rows to display in the console.
- max_cols : int, optional
- Maximum number of columns to display in the console.
- show_dimensions : bool, default False
- Display DataFrame dimensions (number of rows by number of columns).
- decimal : str, default '.'
- Character recognized as decimal separator, e.g. ',' in Europe.
- """
-
-_VALID_JUSTIFY_PARAMETERS = (
- "left",
- "right",
- "center",
- "justify",
- "justify-all",
- "start",
- "end",
- "inherit",
- "match-parent",
- "initial",
- "unset",
-)
-
-return_docstring: Final = """
- Returns
- -------
- str or None
- If buf is None, returns the result as a string. Otherwise returns
- None.
- """
-
-
-class CategoricalFormatter:
- def __init__(
- self,
- categorical: Categorical,
- buf: IO[str] | None = None,
- length: bool = True,
- na_rep: str = "NaN",
- footer: bool = True,
- ) -> None:
- self.categorical = categorical
- self.buf = buf if buf is not None else StringIO("")
- self.na_rep = na_rep
- self.length = length
- self.footer = footer
- self.quoting = QUOTE_NONNUMERIC
-
- def _get_footer(self) -> str:
- footer = ""
-
- if self.length:
- if footer:
- footer += ", "
- footer += f"Length: {len(self.categorical)}"
-
- level_info = self.categorical._repr_categories_info()
-
-        # Levels are added on a new line
- if footer:
- footer += "\n"
- footer += level_info
-
- return str(footer)
-
- def _get_formatted_values(self) -> list[str]:
- return format_array(
- self.categorical._internal_get_values(),
- None,
- float_format=None,
- na_rep=self.na_rep,
- quoting=self.quoting,
- )
-
- def to_string(self) -> str:
- categorical = self.categorical
-
- if len(categorical) == 0:
- if self.footer:
- return self._get_footer()
- else:
- return ""
-
- fmt_values = self._get_formatted_values()
-
- fmt_values = [i.strip() for i in fmt_values]
- values = ", ".join(fmt_values)
- result = ["[" + values + "]"]
- if self.footer:
- footer = self._get_footer()
- if footer:
- result.append(footer)
-
- return str("\n".join(result))
-
-
-class SeriesFormatter:
- def __init__(
- self,
- series: Series,
- buf: IO[str] | None = None,
- length: bool | str = True,
- header: bool = True,
- index: bool = True,
- na_rep: str = "NaN",
- name: bool = False,
- float_format: str | None = None,
- dtype: bool = True,
- max_rows: int | None = None,
- min_rows: int | None = None,
- ) -> None:
- self.series = series
- self.buf = buf if buf is not None else StringIO()
- self.name = name
- self.na_rep = na_rep
- self.header = header
- self.length = length
- self.index = index
- self.max_rows = max_rows
- self.min_rows = min_rows
-
- if float_format is None:
- float_format = get_option("display.float_format")
- self.float_format = float_format
- self.dtype = dtype
- self.adj = get_adjustment()
-
- self._chk_truncate()
-
- def _chk_truncate(self) -> None:
- self.tr_row_num: int | None
-
- min_rows = self.min_rows
- max_rows = self.max_rows
- # truncation determined by max_rows, actual truncated number of rows
- # used below by min_rows
- is_truncated_vertically = max_rows and (len(self.series) > max_rows)
- series = self.series
- if is_truncated_vertically:
- max_rows = cast(int, max_rows)
- if min_rows:
- # if min_rows is set (not None or 0), set max_rows to minimum
- # of both
- max_rows = min(min_rows, max_rows)
- if max_rows == 1:
- row_num = max_rows
- series = series.iloc[:max_rows]
- else:
- row_num = max_rows // 2
- series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
- self.tr_row_num = row_num
- else:
- self.tr_row_num = None
- self.tr_series = series
- self.is_truncated_vertically = is_truncated_vertically
-
- def _get_footer(self) -> str:
- name = self.series.name
- footer = ""
-
- if getattr(self.series.index, "freq", None) is not None:
- assert isinstance(
- self.series.index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)
- )
- footer += f"Freq: {self.series.index.freqstr}"
-
- if self.name is not False and name is not None:
- if footer:
- footer += ", "
-
- series_name = printing.pprint_thing(name, escape_chars=("\t", "\r", "\n"))
- footer += f"Name: {series_name}"
-
- if self.length is True or (
- self.length == "truncate" and self.is_truncated_vertically
- ):
- if footer:
- footer += ", "
- footer += f"Length: {len(self.series)}"
-
- if self.dtype is not False and self.dtype is not None:
- dtype_name = getattr(self.tr_series.dtype, "name", None)
- if dtype_name:
- if footer:
- footer += ", "
- footer += f"dtype: {printing.pprint_thing(dtype_name)}"
-
-        # level info is added at the end on a new line, as is done
-        # for Categoricals
- if is_categorical_dtype(self.tr_series.dtype):
- level_info = self.tr_series._values._repr_categories_info()
- if footer:
- footer += "\n"
- footer += level_info
-
- return str(footer)
-
- def _get_formatted_index(self) -> tuple[list[str], bool]:
- index = self.tr_series.index
-
- if isinstance(index, MultiIndex):
- have_header = any(name for name in index.names)
- fmt_index = index.format(names=True)
- else:
- have_header = index.name is not None
- fmt_index = index.format(name=True)
- return fmt_index, have_header
-
- def _get_formatted_values(self) -> list[str]:
- return format_array(
- self.tr_series._values,
- None,
- float_format=self.float_format,
- na_rep=self.na_rep,
- leading_space=self.index,
- )
-
- def to_string(self) -> str:
- series = self.tr_series
- footer = self._get_footer()
-
- if len(series) == 0:
- return f"{type(self.series).__name__}([], {footer})"
-
- fmt_index, have_header = self._get_formatted_index()
- fmt_values = self._get_formatted_values()
-
- if self.is_truncated_vertically:
- n_header_rows = 0
- row_num = self.tr_row_num
- row_num = cast(int, row_num)
- width = self.adj.len(fmt_values[row_num - 1])
- if width > 3:
- dot_str = "..."
- else:
- dot_str = ".."
- # Series uses mode=center because it has single value columns
- # DataFrame uses mode=left
- dot_str = self.adj.justify([dot_str], width, mode="center")[0]
- fmt_values.insert(row_num + n_header_rows, dot_str)
- fmt_index.insert(row_num + 1, "")
-
- if self.index:
- result = self.adj.adjoin(3, *[fmt_index[1:], fmt_values])
- else:
- result = self.adj.adjoin(3, fmt_values)
-
- if self.header and have_header:
- result = fmt_index[0] + "\n" + result
-
- if footer:
- result += "\n" + footer
-
- return str("".join(result))
-
-
-class TextAdjustment:
- def __init__(self) -> None:
- self.encoding = get_option("display.encoding")
-
- def len(self, text: str) -> int:
- return len(text)
-
- def justify(self, texts: Any, max_len: int, mode: str = "right") -> list[str]:
- return printing.justify(texts, max_len, mode=mode)
-
- def adjoin(self, space: int, *lists, **kwargs) -> str:
- return printing.adjoin(
- space, *lists, strlen=self.len, justfunc=self.justify, **kwargs
- )
-
-
-class EastAsianTextAdjustment(TextAdjustment):
- def __init__(self) -> None:
- super().__init__()
- if get_option("display.unicode.ambiguous_as_wide"):
- self.ambiguous_width = 2
- else:
- self.ambiguous_width = 1
-
- # Definition of East Asian Width
- # https://unicode.org/reports/tr11/
- # Ambiguous width can be changed by option
- self._EAW_MAP = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
-
- def len(self, text: str) -> int:
- """
- Calculate display width considering unicode East Asian Width
- """
- if not isinstance(text, str):
- return len(text)
-
- return sum(
- self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width) for c in text
- )
-
- def justify(
- self, texts: Iterable[str], max_len: int, mode: str = "right"
- ) -> list[str]:
- # re-calculate padding space per str considering East Asian Width
- def _get_pad(t):
- return max_len - self.len(t) + len(t)
-
- if mode == "left":
- return [x.ljust(_get_pad(x)) for x in texts]
- elif mode == "center":
- return [x.center(_get_pad(x)) for x in texts]
- else:
- return [x.rjust(_get_pad(x)) for x in texts]
-
-
-def get_adjustment() -> TextAdjustment:
- use_east_asian_width = get_option("display.unicode.east_asian_width")
- if use_east_asian_width:
- return EastAsianTextAdjustment()
- else:
- return TextAdjustment()
-
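-
-# Minimal standalone sketch of the width rule used above (the helper name is
-# made up): wide ("W") and fullwidth ("F") characters count as two terminal
-# columns, while ambiguous-width characters count as one or two depending on
-# the ``display.unicode.ambiguous_as_wide`` option emulated by the flag below.
-def _display_width_sketch(text: str, ambiguous_as_wide: bool = False) -> int:
-    eaw_map = {"Na": 1, "N": 1, "W": 2, "F": 2, "H": 1}
-    ambiguous = 2 if ambiguous_as_wide else 1
-    # east_asian_width is already imported at the top of this module
-    return sum(eaw_map.get(east_asian_width(c), ambiguous) for c in text)
-
-
-# _display_width_sketch("pandas") == 6 and _display_width_sketch("パンダ") == 6,
-# because each of the three katakana characters occupies two columns.
-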
-
-def get_dataframe_repr_params() -> dict[str, Any]:
- """Get the parameters used to repr(dataFrame) calls using DataFrame.to_string.
-
- Supplying these parameters to DataFrame.to_string is equivalent to calling
- ``repr(DataFrame)``. This is useful if you want to adjust the repr output.
-
- .. versionadded:: 1.4.0
-
-    Examples
-    --------
- >>> import pandas as pd
- >>>
- >>> df = pd.DataFrame([[1, 2], [3, 4]])
- >>> repr_params = pd.io.formats.format.get_dataframe_repr_params()
- >>> repr(df) == df.to_string(**repr_params)
- True
- """
- from pandas.io.formats import console
-
- if get_option("display.expand_frame_repr"):
- line_width, _ = console.get_console_size()
- else:
- line_width = None
- return {
- "max_rows": get_option("display.max_rows"),
- "min_rows": get_option("display.min_rows"),
- "max_cols": get_option("display.max_columns"),
- "max_colwidth": get_option("display.max_colwidth"),
- "show_dimensions": get_option("display.show_dimensions"),
- "line_width": line_width,
- }
-
-
-def get_series_repr_params() -> dict[str, Any]:
- """Get the parameters used to repr(Series) calls using Series.to_string.
-
- Supplying these parameters to Series.to_string is equivalent to calling
- ``repr(series)``. This is useful if you want to adjust the series repr output.
-
- .. versionadded:: 1.4.0
-
-    Examples
-    --------
- >>> import pandas as pd
- >>>
- >>> ser = pd.Series([1, 2, 3, 4])
- >>> repr_params = pd.io.formats.format.get_series_repr_params()
- >>> repr(ser) == ser.to_string(**repr_params)
- True
- """
- width, height = get_terminal_size()
- max_rows = (
- height
- if get_option("display.max_rows") == 0
- else get_option("display.max_rows")
- )
- min_rows = (
- height
- if get_option("display.max_rows") == 0
- else get_option("display.min_rows")
- )
-
- return {
- "name": True,
- "dtype": True,
- "min_rows": min_rows,
- "max_rows": max_rows,
- "length": get_option("display.show_dimensions"),
- }
-
-
-class DataFrameFormatter:
- """Class for processing dataframe formatting options and data."""
-
- __doc__ = __doc__ if __doc__ else ""
- __doc__ += common_docstring + return_docstring
-
- def __init__(
- self,
- frame: DataFrame,
- columns: Sequence[Hashable] | None = None,
- col_space: ColspaceArgType | None = None,
- header: bool | Sequence[str] = True,
- index: bool = True,
- na_rep: str = "NaN",
- formatters: FormattersType | None = None,
- justify: str | None = None,
- float_format: FloatFormatType | None = None,
- sparsify: bool | None = None,
- index_names: bool = True,
- max_rows: int | None = None,
- min_rows: int | None = None,
- max_cols: int | None = None,
- show_dimensions: bool | str = False,
- decimal: str = ".",
- bold_rows: bool = False,
- escape: bool = True,
- ) -> None:
- self.frame = frame
- self.columns = self._initialize_columns(columns)
- self.col_space = self._initialize_colspace(col_space)
- self.header = header
- self.index = index
- self.na_rep = na_rep
- self.formatters = self._initialize_formatters(formatters)
- self.justify = self._initialize_justify(justify)
- self.float_format = float_format
- self.sparsify = self._initialize_sparsify(sparsify)
- self.show_index_names = index_names
- self.decimal = decimal
- self.bold_rows = bold_rows
- self.escape = escape
- self.max_rows = max_rows
- self.min_rows = min_rows
- self.max_cols = max_cols
- self.show_dimensions = show_dimensions
-
- self.max_cols_fitted = self._calc_max_cols_fitted()
- self.max_rows_fitted = self._calc_max_rows_fitted()
-
- self.tr_frame = self.frame
- self.truncate()
- self.adj = get_adjustment()
-
- def get_strcols(self) -> list[list[str]]:
- """
- Render a DataFrame to a list of columns (as lists of strings).
- """
- strcols = self._get_strcols_without_index()
-
- if self.index:
- str_index = self._get_formatted_index(self.tr_frame)
- strcols.insert(0, str_index)
-
- return strcols
-
- @property
- def should_show_dimensions(self) -> bool:
- return self.show_dimensions is True or (
- self.show_dimensions == "truncate" and self.is_truncated
- )
-
- @property
- def is_truncated(self) -> bool:
- return bool(self.is_truncated_horizontally or self.is_truncated_vertically)
-
- @property
- def is_truncated_horizontally(self) -> bool:
- return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted))
-
- @property
- def is_truncated_vertically(self) -> bool:
- return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted))
-
- @property
- def dimensions_info(self) -> str:
- return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]"
-
- @property
- def has_index_names(self) -> bool:
- return _has_names(self.frame.index)
-
- @property
- def has_column_names(self) -> bool:
- return _has_names(self.frame.columns)
-
- @property
- def show_row_idx_names(self) -> bool:
- return all((self.has_index_names, self.index, self.show_index_names))
-
- @property
- def show_col_idx_names(self) -> bool:
- return all((self.has_column_names, self.show_index_names, self.header))
-
- @property
- def max_rows_displayed(self) -> int:
- return min(self.max_rows or len(self.frame), len(self.frame))
-
- def _initialize_sparsify(self, sparsify: bool | None) -> bool:
- if sparsify is None:
- return get_option("display.multi_sparse")
- return sparsify
-
- def _initialize_formatters(
- self, formatters: FormattersType | None
- ) -> FormattersType:
- if formatters is None:
- return {}
- elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict):
- return formatters
- else:
- raise ValueError(
- f"Formatters length({len(formatters)}) should match "
- f"DataFrame number of columns({len(self.frame.columns)})"
- )
-
- def _initialize_justify(self, justify: str | None) -> str:
- if justify is None:
- return get_option("display.colheader_justify")
- else:
- return justify
-
- def _initialize_columns(self, columns: Sequence[Hashable] | None) -> Index:
- if columns is not None:
- # GH 47231 - columns doesn't have to be `Sequence[str]`
- # Will fix in later PR
- cols = ensure_index(cast(Axes, columns))
- self.frame = self.frame[cols]
- return cols
- else:
- return self.frame.columns
-
- def _initialize_colspace(self, col_space: ColspaceArgType | None) -> ColspaceType:
- result: ColspaceType
-
- if col_space is None:
- result = {}
- elif isinstance(col_space, (int, str)):
- result = {"": col_space}
- result.update({column: col_space for column in self.frame.columns})
- elif isinstance(col_space, Mapping):
- for column in col_space.keys():
- if column not in self.frame.columns and column != "":
- raise ValueError(
- f"Col_space is defined for an unknown column: {column}"
- )
- result = col_space
- else:
- if len(self.frame.columns) != len(col_space):
- raise ValueError(
- f"Col_space length({len(col_space)}) should match "
- f"DataFrame number of columns({len(self.frame.columns)})"
- )
- result = dict(zip(self.frame.columns, col_space))
- return result
-
- def _calc_max_cols_fitted(self) -> int | None:
- """Number of columns fitting the screen."""
- if not self._is_in_terminal():
- return self.max_cols
-
- width, _ = get_terminal_size()
- if self._is_screen_narrow(width):
- return width
- else:
- return self.max_cols
-
- def _calc_max_rows_fitted(self) -> int | None:
- """Number of rows with data fitting the screen."""
- max_rows: int | None
-
- if self._is_in_terminal():
- _, height = get_terminal_size()
- if self.max_rows == 0:
- # rows available to fill with actual data
- return height - self._get_number_of_auxillary_rows()
-
- if self._is_screen_short(height):
- max_rows = height
- else:
- max_rows = self.max_rows
- else:
- max_rows = self.max_rows
-
- return self._adjust_max_rows(max_rows)
-
- def _adjust_max_rows(self, max_rows: int | None) -> int | None:
- """Adjust max_rows using display logic.
-
- See description here:
- https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options
-
- GH #37359
- """
- if max_rows:
- if (len(self.frame) > max_rows) and self.min_rows:
- # if truncated, set max_rows showed to min_rows
- max_rows = min(self.min_rows, max_rows)
- return max_rows
-
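-    # Hedged sketch of the rule above (the option values are arbitrary): once a
-    # frame exceeds ``display.max_rows``, the truncated repr falls back to
-    # showing only ``display.min_rows`` rows of data around the ``...`` marker.
-    #
-    #     >>> import pandas as pd
-    #     >>> with pd.option_context("display.max_rows", 10, "display.min_rows", 4):
-    #     ...     print(pd.DataFrame({"x": range(100)}))  # 2 head + 2 tail rows
-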
- def _is_in_terminal(self) -> bool:
- """Check if the output is to be shown in terminal."""
- return bool(self.max_cols == 0 or self.max_rows == 0)
-
- def _is_screen_narrow(self, max_width) -> bool:
- return bool(self.max_cols == 0 and len(self.frame.columns) > max_width)
-
- def _is_screen_short(self, max_height) -> bool:
- return bool(self.max_rows == 0 and len(self.frame) > max_height)
-
- def _get_number_of_auxillary_rows(self) -> int:
- """Get number of rows occupied by prompt, dots and dimension info."""
- dot_row = 1
- prompt_row = 1
- num_rows = dot_row + prompt_row
-
- if self.show_dimensions:
- num_rows += len(self.dimensions_info.splitlines())
-
- if self.header:
- num_rows += 1
-
- return num_rows
-
- def truncate(self) -> None:
- """
- Check whether the frame should be truncated. If so, slice the frame up.
- """
- if self.is_truncated_horizontally:
- self._truncate_horizontally()
-
- if self.is_truncated_vertically:
- self._truncate_vertically()
-
- def _truncate_horizontally(self) -> None:
- """Remove columns, which are not to be displayed and adjust formatters.
-
- Attributes affected:
- - tr_frame
- - formatters
- - tr_col_num
- """
- assert self.max_cols_fitted is not None
- col_num = self.max_cols_fitted // 2
- if col_num >= 1:
- left = self.tr_frame.iloc[:, :col_num]
- right = self.tr_frame.iloc[:, -col_num:]
- self.tr_frame = concat((left, right), axis=1)
-
- # truncate formatter
- if isinstance(self.formatters, (list, tuple)):
- self.formatters = [
- *self.formatters[:col_num],
- *self.formatters[-col_num:],
- ]
- else:
- col_num = cast(int, self.max_cols)
- self.tr_frame = self.tr_frame.iloc[:, :col_num]
- self.tr_col_num = col_num
-
- def _truncate_vertically(self) -> None:
- """Remove rows, which are not to be displayed.
-
- Attributes affected:
- - tr_frame
- - tr_row_num
- """
- assert self.max_rows_fitted is not None
- row_num = self.max_rows_fitted // 2
- if row_num >= 1:
- head = self.tr_frame.iloc[:row_num, :]
- tail = self.tr_frame.iloc[-row_num:, :]
- self.tr_frame = concat((head, tail))
- else:
- row_num = cast(int, self.max_rows)
- self.tr_frame = self.tr_frame.iloc[:row_num, :]
- self.tr_row_num = row_num
-
- def _get_strcols_without_index(self) -> list[list[str]]:
- strcols: list[list[str]] = []
-
- if not is_list_like(self.header) and not self.header:
- for i, c in enumerate(self.tr_frame):
- fmt_values = self.format_col(i)
- fmt_values = _make_fixed_width(
- strings=fmt_values,
- justify=self.justify,
- minimum=int(self.col_space.get(c, 0)),
- adj=self.adj,
- )
- strcols.append(fmt_values)
- return strcols
-
- if is_list_like(self.header):
- # cast here since can't be bool if is_list_like
- self.header = cast(List[str], self.header)
- if len(self.header) != len(self.columns):
- raise ValueError(
- f"Writing {len(self.columns)} cols "
- f"but got {len(self.header)} aliases"
- )
- str_columns = [[label] for label in self.header]
- else:
- str_columns = self._get_formatted_column_labels(self.tr_frame)
-
- if self.show_row_idx_names:
- for x in str_columns:
- x.append("")
-
- for i, c in enumerate(self.tr_frame):
- cheader = str_columns[i]
- header_colwidth = max(
- int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
- )
- fmt_values = self.format_col(i)
- fmt_values = _make_fixed_width(
- fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
- )
-
- max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)
- cheader = self.adj.justify(cheader, max_len, mode=self.justify)
- strcols.append(cheader + fmt_values)
-
- return strcols
-
- def format_col(self, i: int) -> list[str]:
- frame = self.tr_frame
- formatter = self._get_formatter(i)
- return format_array(
- frame.iloc[:, i]._values,
- formatter,
- float_format=self.float_format,
- na_rep=self.na_rep,
- space=self.col_space.get(frame.columns[i]),
- decimal=self.decimal,
- leading_space=self.index,
- )
-
- def _get_formatter(self, i: str | int) -> Callable | None:
- if isinstance(self.formatters, (list, tuple)):
- if is_integer(i):
- i = cast(int, i)
- return self.formatters[i]
- else:
- return None
- else:
- if is_integer(i) and i not in self.columns:
- i = self.columns[i]
- return self.formatters.get(i, None)
-
- def _get_formatted_column_labels(self, frame: DataFrame) -> list[list[str]]:
- from pandas.core.indexes.multi import sparsify_labels
-
- columns = frame.columns
-
- if isinstance(columns, MultiIndex):
- fmt_columns = columns.format(sparsify=False, adjoin=False)
- fmt_columns = list(zip(*fmt_columns))
- dtypes = self.frame.dtypes._values
-
-            # if we have a Float level, it doesn't use leading space at all
- restrict_formatting = any(level.is_floating for level in columns.levels)
- need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
-
- def space_format(x, y):
- if (
- y not in self.formatters
- and need_leadsp[x]
- and not restrict_formatting
- ):
- return " " + y
- return y
-
- str_columns = list(
- zip(*([space_format(x, y) for y in x] for x in fmt_columns))
- )
- if self.sparsify and len(str_columns):
- str_columns = sparsify_labels(str_columns)
-
- str_columns = [list(x) for x in zip(*str_columns)]
- else:
- fmt_columns = columns.format()
- dtypes = self.frame.dtypes
- need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
- str_columns = [
- [" " + x if not self._get_formatter(i) and need_leadsp[x] else x]
- for i, x in enumerate(fmt_columns)
- ]
- # self.str_columns = str_columns
- return str_columns
-
- def _get_formatted_index(self, frame: DataFrame) -> list[str]:
- # Note: this is only used by to_string() and to_latex(), not by
-        # to_html(), so it is safe to cast col_space here.
- col_space = {k: cast(int, v) for k, v in self.col_space.items()}
- index = frame.index
- columns = frame.columns
- fmt = self._get_formatter("__index__")
-
- if isinstance(index, MultiIndex):
- fmt_index = index.format(
- sparsify=self.sparsify,
- adjoin=False,
- names=self.show_row_idx_names,
- formatter=fmt,
- )
- else:
- fmt_index = [index.format(name=self.show_row_idx_names, formatter=fmt)]
-
- fmt_index = [
- tuple(
- _make_fixed_width(
- list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj
- )
- )
- for x in fmt_index
- ]
-
- adjoined = self.adj.adjoin(1, *fmt_index).split("\n")
-
- # empty space for columns
- if self.show_col_idx_names:
- col_header = [str(x) for x in self._get_column_name_list()]
- else:
- col_header = [""] * columns.nlevels
-
- if self.header:
- return col_header + adjoined
- else:
- return adjoined
-
- def _get_column_name_list(self) -> list[Hashable]:
- names: list[Hashable] = []
- columns = self.frame.columns
- if isinstance(columns, MultiIndex):
- names.extend("" if name is None else name for name in columns.names)
- else:
- names.append("" if columns.name is None else columns.name)
- return names
-
-
-class DataFrameRenderer:
- """Class for creating dataframe output in multiple formats.
-
- Called in pandas.core.generic.NDFrame:
- - to_csv
- - to_latex
-
- Called in pandas.core.frame.DataFrame:
- - to_html
- - to_string
-
- Parameters
- ----------
- fmt : DataFrameFormatter
- Formatter with the formatting options.
- """
-
- def __init__(self, fmt: DataFrameFormatter) -> None:
- self.fmt = fmt
-
- def to_latex(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- column_format: str | None = None,
- longtable: bool = False,
- encoding: str | None = None,
- multicolumn: bool = False,
- multicolumn_format: str | None = None,
- multirow: bool = False,
- caption: str | tuple[str, str] | None = None,
- label: str | None = None,
- position: str | None = None,
- ) -> str | None:
- """
- Render a DataFrame to a LaTeX tabular/longtable environment output.
- """
- from pandas.io.formats.latex import LatexFormatter
-
- latex_formatter = LatexFormatter(
- self.fmt,
- longtable=longtable,
- column_format=column_format,
- multicolumn=multicolumn,
- multicolumn_format=multicolumn_format,
- multirow=multirow,
- caption=caption,
- label=label,
- position=position,
- )
- string = latex_formatter.to_string()
- return save_to_buffer(string, buf=buf, encoding=encoding)
-
- def to_html(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- encoding: str | None = None,
- classes: str | list | tuple | None = None,
- notebook: bool = False,
- border: int | bool | None = None,
- table_id: str | None = None,
- render_links: bool = False,
- ) -> str | None:
- """
- Render a DataFrame to a html table.
-
- Parameters
- ----------
- buf : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a string ``write()`` function. If None, the result is
- returned as a string.
-        encoding : str, default "utf-8"
- Set character encoding.
- classes : str or list-like
- classes to include in the `class` attribute of the opening
- ``<table>`` tag, in addition to the default "dataframe".
- notebook : {True, False}, optional, default False
- Whether the generated HTML is for IPython Notebook.
- border : int
- A ``border=border`` attribute is included in the opening
- ``<table>`` tag. Default ``pd.options.display.html.border``.
- table_id : str, optional
- A css id is included in the opening `<table>` tag if specified.
- render_links : bool, default False
- Convert URLs to HTML links.
- """
- from pandas.io.formats.html import (
- HTMLFormatter,
- NotebookFormatter,
- )
-
- Klass = NotebookFormatter if notebook else HTMLFormatter
-
- html_formatter = Klass(
- self.fmt,
- classes=classes,
- border=border,
- table_id=table_id,
- render_links=render_links,
- )
- string = html_formatter.to_string()
- return save_to_buffer(string, buf=buf, encoding=encoding)
-
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- encoding: str | None = None,
- line_width: int | None = None,
- ) -> str | None:
- """
- Render a DataFrame to a console-friendly tabular output.
-
- Parameters
- ----------
- buf : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a string ``write()`` function. If None, the result is
- returned as a string.
-        encoding : str, default "utf-8"
- Set character encoding.
- line_width : int, optional
- Width to wrap a line in characters.
- """
- from pandas.io.formats.string import StringFormatter
-
- string_formatter = StringFormatter(self.fmt, line_width=line_width)
- string = string_formatter.to_string()
- return save_to_buffer(string, buf=buf, encoding=encoding)
-
- def to_csv(
- self,
- path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
- encoding: str | None = None,
- sep: str = ",",
- columns: Sequence[Hashable] | None = None,
- index_label: IndexLabel | None = None,
- mode: str = "w",
- compression: CompressionOptions = "infer",
- quoting: int | None = None,
- quotechar: str = '"',
- lineterminator: str | None = None,
- chunksize: int | None = None,
- date_format: str | None = None,
- doublequote: bool = True,
- escapechar: str | None = None,
- errors: str = "strict",
- storage_options: StorageOptions = None,
- ) -> str | None:
- """
- Render dataframe as comma-separated file.
- """
- from pandas.io.formats.csvs import CSVFormatter
-
- if path_or_buf is None:
- created_buffer = True
- path_or_buf = StringIO()
- else:
- created_buffer = False
-
- csv_formatter = CSVFormatter(
- path_or_buf=path_or_buf,
- lineterminator=lineterminator,
- sep=sep,
- encoding=encoding,
- errors=errors,
- compression=compression,
- quoting=quoting,
- cols=columns,
- index_label=index_label,
- mode=mode,
- chunksize=chunksize,
- quotechar=quotechar,
- date_format=date_format,
- doublequote=doublequote,
- escapechar=escapechar,
- storage_options=storage_options,
- formatter=self.fmt,
- )
- csv_formatter.save()
-
- if created_buffer:
- assert isinstance(path_or_buf, StringIO)
- content = path_or_buf.getvalue()
- path_or_buf.close()
- return content
-
- return None
-
-
-def save_to_buffer(
- string: str,
- buf: FilePath | WriteBuffer[str] | None = None,
- encoding: str | None = None,
-) -> str | None:
- """
- Perform serialization. Write to buf or return as string if buf is None.
- """
- with get_buffer(buf, encoding=encoding) as f:
- f.write(string)
- if buf is None:
- # error: "WriteBuffer[str]" has no attribute "getvalue"
- return f.getvalue() # type: ignore[attr-defined]
- return None
-
-
-@contextmanager
-def get_buffer(
- buf: FilePath | WriteBuffer[str] | None, encoding: str | None = None
-) -> Generator[WriteBuffer[str], None, None] | Generator[StringIO, None, None]:
- """
- Context manager to open, yield and close buffer for filenames or Path-like
- objects, otherwise yield buf unchanged.
- """
- if buf is not None:
- buf = stringify_path(buf)
- else:
- buf = StringIO()
-
- if encoding is None:
- encoding = "utf-8"
- elif not isinstance(buf, str):
- raise ValueError("buf is not a file name and encoding is specified.")
-
- if hasattr(buf, "write"):
- # Incompatible types in "yield" (actual type "Union[str, WriteBuffer[str],
- # StringIO]", expected type "Union[WriteBuffer[str], StringIO]")
- yield buf # type: ignore[misc]
- elif isinstance(buf, str):
- check_parent_directory(str(buf))
- with open(buf, "w", encoding=encoding, newline="") as f:
- # GH#30034 open instead of codecs.open prevents a file leak
- # if we have an invalid encoding argument.
- # newline="" is needed to roundtrip correctly on
- # windows test_to_latex_filename
- yield f
- else:
- raise TypeError("buf is not a file name and it has no write method")
-
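-
-# Usage sketch for the two helpers above (the string is a placeholder): with
-# ``buf=None`` the rendered text comes back as a string, otherwise it is
-# written to the buffer (or file path) and ``None`` is returned.
-#
-#     >>> save_to_buffer("formatted text")
-#     'formatted text'
-#     >>> from io import StringIO
-#     >>> sink = StringIO()
-#     >>> save_to_buffer("formatted text", buf=sink) is None
-#     True
-#     >>> sink.getvalue()
-#     'formatted text'
-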
-
-# ----------------------------------------------------------------------
-# Array formatters
-
-
-def format_array(
- values: Any,
- formatter: Callable | None,
- float_format: FloatFormatType | None = None,
- na_rep: str = "NaN",
- digits: int | None = None,
- space: str | int | None = None,
- justify: str = "right",
- decimal: str = ".",
- leading_space: bool | None = True,
- quoting: int | None = None,
- fallback_formatter: Callable | None = None,
-) -> list[str]:
- """
- Format an array for printing.
-
- Parameters
- ----------
- values
- formatter
- float_format
- na_rep
- digits
- space
- justify
- decimal
- leading_space : bool, optional, default True
- Whether the array should be formatted with a leading space.
-        When an array is used as a column of a Series or DataFrame, we do want
- the leading space to pad between columns.
-
- When formatting an Index subclass
- (e.g. IntervalIndex._format_native_types), we don't want the
- leading space since it should be left-aligned.
- fallback_formatter
-
- Returns
- -------
- List[str]
- """
- fmt_klass: type[GenericArrayFormatter]
- if is_datetime64_dtype(values.dtype):
- fmt_klass = Datetime64Formatter
- elif isinstance(values.dtype, DatetimeTZDtype):
- fmt_klass = Datetime64TZFormatter
- elif is_timedelta64_dtype(values.dtype):
- fmt_klass = Timedelta64Formatter
- elif is_extension_array_dtype(values.dtype):
- fmt_klass = ExtensionArrayFormatter
- elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype):
- fmt_klass = FloatArrayFormatter
- elif is_integer_dtype(values.dtype):
- fmt_klass = IntArrayFormatter
- else:
- fmt_klass = GenericArrayFormatter
-
- if space is None:
- space = 12
-
- if float_format is None:
- float_format = get_option("display.float_format")
-
- if digits is None:
- digits = get_option("display.precision")
-
- fmt_obj = fmt_klass(
- values,
- digits=digits,
- na_rep=na_rep,
- float_format=float_format,
- formatter=formatter,
- space=space,
- justify=justify,
- decimal=decimal,
- leading_space=leading_space,
- quoting=quoting,
- fallback_formatter=fallback_formatter,
- )
-
- return fmt_obj.get_result()
-
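-
-# Hedged usage sketch (default display options assumed; the values are
-# arbitrary): floats get a leading sign space, missing values are rendered via
-# ``na_rep`` and common trailing zeros are trimmed.
-#
-#     >>> format_array(np.array([1.0, np.nan, 2.5]), None)
-#     [' 1.0', ' NaN', ' 2.5']
-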
-
-class GenericArrayFormatter:
- def __init__(
- self,
- values: Any,
- digits: int = 7,
- formatter: Callable | None = None,
- na_rep: str = "NaN",
- space: str | int = 12,
- float_format: FloatFormatType | None = None,
- justify: str = "right",
- decimal: str = ".",
- quoting: int | None = None,
- fixed_width: bool = True,
- leading_space: bool | None = True,
- fallback_formatter: Callable | None = None,
- ) -> None:
- self.values = values
- self.digits = digits
- self.na_rep = na_rep
- self.space = space
- self.formatter = formatter
- self.float_format = float_format
- self.justify = justify
- self.decimal = decimal
- self.quoting = quoting
- self.fixed_width = fixed_width
- self.leading_space = leading_space
- self.fallback_formatter = fallback_formatter
-
- def get_result(self) -> list[str]:
- fmt_values = self._format_strings()
- return _make_fixed_width(fmt_values, self.justify)
-
- def _format_strings(self) -> list[str]:
- if self.float_format is None:
- float_format = get_option("display.float_format")
- if float_format is None:
- precision = get_option("display.precision")
- float_format = lambda x: _trim_zeros_single_float(
- f"{x: .{precision:d}f}"
- )
- else:
- float_format = self.float_format
-
- if self.formatter is not None:
- formatter = self.formatter
- elif self.fallback_formatter is not None:
- formatter = self.fallback_formatter
- else:
- quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE
- formatter = partial(
- printing.pprint_thing,
- escape_chars=("\t", "\r", "\n"),
- quote_strings=quote_strings,
- )
-
- def _format(x):
- if self.na_rep is not None and is_scalar(x) and isna(x):
- try:
- # try block for np.isnat specifically
- # determine na_rep if x is None or NaT-like
- if x is None:
- return "None"
- elif x is NA:
- return str(NA)
- elif x is NaT or np.isnat(x):
- return "NaT"
- except (TypeError, ValueError):
- # np.isnat only handles datetime or timedelta objects
- pass
- return self.na_rep
- elif isinstance(x, PandasObject):
- return str(x)
- elif isinstance(x, StringDtype):
- return repr(x)
- else:
- # object dtype
- return str(formatter(x))
-
- vals = extract_array(self.values, extract_numpy=True)
- if not isinstance(vals, np.ndarray):
- raise TypeError(
- "ExtensionArray formatting should use ExtensionArrayFormatter"
- )
- inferred = lib.map_infer(vals, is_float)
- is_float_type = (
- inferred
- # vals may have 2 or more dimensions
- & np.all(notna(vals), axis=tuple(range(1, len(vals.shape))))
- )
- leading_space = self.leading_space
- if leading_space is None:
- leading_space = is_float_type.any()
-
- fmt_values = []
- for i, v in enumerate(vals):
- if (not is_float_type[i] or self.formatter is not None) and leading_space:
- fmt_values.append(f" {_format(v)}")
- elif is_float_type[i]:
- fmt_values.append(float_format(v))
- else:
- if leading_space is False:
- # False specifically, so that the default is
- # to include a space if we get here.
- tpl = "{v}"
- else:
- tpl = " {v}"
- fmt_values.append(tpl.format(v=_format(v)))
-
- return fmt_values
-
-
-class FloatArrayFormatter(GenericArrayFormatter):
- def __init__(self, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
-
- # float_format is expected to be a string
- # formatter should be used to pass a function
- if self.float_format is not None and self.formatter is None:
- # GH21625, GH22270
- self.fixed_width = False
- if callable(self.float_format):
- self.formatter = self.float_format
- self.float_format = None
-
- def _value_formatter(
- self,
- float_format: FloatFormatType | None = None,
- threshold: float | None = None,
- ) -> Callable:
- """Returns a function to be applied on each value to format it"""
- # the float_format parameter supersedes self.float_format
- if float_format is None:
- float_format = self.float_format
-
- # we are going to compose different functions, to first convert to
- # a string, then replace the decimal symbol, and finally chop according
- # to the threshold
-
- # when there is no float_format, we use str instead of '%g'
- # because str(0.0) = '0.0' while '%g' % 0.0 = '0'
- if float_format:
-
- def base_formatter(v):
- assert float_format is not None # for mypy
- # error: "str" not callable
- # error: Unexpected keyword argument "value" for "__call__" of
- # "EngFormatter"
- return (
- float_format(value=v) # type: ignore[operator,call-arg]
- if notna(v)
- else self.na_rep
- )
-
- else:
-
- def base_formatter(v):
- return str(v) if notna(v) else self.na_rep
-
- if self.decimal != ".":
-
- def decimal_formatter(v):
- return base_formatter(v).replace(".", self.decimal, 1)
-
- else:
- decimal_formatter = base_formatter
-
- if threshold is None:
- return decimal_formatter
-
- def formatter(value):
- if notna(value):
- if abs(value) > threshold:
- return decimal_formatter(value)
- else:
- return decimal_formatter(0.0)
- else:
- return self.na_rep
-
- return formatter
-
- def get_result_as_array(self) -> np.ndarray:
- """
- Returns the float values converted into strings using
- the parameters given at initialisation, as a numpy array
- """
-
- def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str):
- mask = isna(values)
- formatted = np.array(
- [
- formatter(val) if not m else na_rep
- for val, m in zip(values.ravel(), mask.ravel())
- ]
- ).reshape(values.shape)
- return formatted
-
- if self.formatter is not None:
- return format_with_na_rep(self.values, self.formatter, self.na_rep)
-
- if self.fixed_width:
- threshold = get_option("display.chop_threshold")
- else:
- threshold = None
-
- # if we have a fixed_width, we'll need to try different float_format
- def format_values_with(float_format):
- formatter = self._value_formatter(float_format, threshold)
-
- # default formatter leaves a space to the left when formatting
- # floats, must be consistent for left-justifying NaNs (GH #25061)
- if self.justify == "left":
- na_rep = " " + self.na_rep
- else:
- na_rep = self.na_rep
-
- # separate the wheat from the chaff
- values = self.values
- is_complex = is_complex_dtype(values)
- values = format_with_na_rep(values, formatter, na_rep)
-
- if self.fixed_width:
- if is_complex:
- result = _trim_zeros_complex(values, self.decimal)
- else:
- result = _trim_zeros_float(values, self.decimal)
- return np.asarray(result, dtype="object")
-
- return values
-
- # There is a special default string when we are fixed-width
- # The default is otherwise to use str instead of a formatting string
- float_format: FloatFormatType | None
- if self.float_format is None:
- if self.fixed_width:
- if self.leading_space is True:
- fmt_str = "{value: .{digits:d}f}"
- else:
- fmt_str = "{value:.{digits:d}f}"
- float_format = partial(fmt_str.format, digits=self.digits)
- else:
- float_format = self.float_format
- else:
- float_format = lambda value: self.float_format % value
-
- formatted_values = format_values_with(float_format)
-
- if not self.fixed_width:
- return formatted_values
-
-        # we need to convert to engineering format if some values are too small
- # and would appear as 0, or if some values are too big and take too
- # much space
-
- if len(formatted_values) > 0:
- maxlen = max(len(x) for x in formatted_values)
- too_long = maxlen > self.digits + 6
- else:
- too_long = False
-
- with np.errstate(invalid="ignore"):
- abs_vals = np.abs(self.values)
- # this is pretty arbitrary for now
-            # large values: more than 8 characters including decimal symbol
- # and first digit, hence > 1e6
- has_large_values = (abs_vals > 1e6).any()
- has_small_values = (
- (abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)
- ).any()
-
- if has_small_values or (too_long and has_large_values):
- if self.leading_space is True:
- fmt_str = "{value: .{digits:d}e}"
- else:
- fmt_str = "{value:.{digits:d}e}"
- float_format = partial(fmt_str.format, digits=self.digits)
- formatted_values = format_values_with(float_format)
-
- return formatted_values
-
- def _format_strings(self) -> list[str]:
- return list(self.get_result_as_array())
-
-
-class IntArrayFormatter(GenericArrayFormatter):
- def _format_strings(self) -> list[str]:
- if self.leading_space is False:
- formatter_str = lambda x: f"{x:d}".format(x=x)
- else:
- formatter_str = lambda x: f"{x: d}".format(x=x)
- formatter = self.formatter or formatter_str
- fmt_values = [formatter(x) for x in self.values]
- return fmt_values
-
-
-class Datetime64Formatter(GenericArrayFormatter):
- def __init__(
- self,
- values: np.ndarray | Series | DatetimeIndex | DatetimeArray,
- nat_rep: str = "NaT",
-        date_format: str | None = None,
- **kwargs,
- ) -> None:
- super().__init__(values, **kwargs)
- self.nat_rep = nat_rep
- self.date_format = date_format
-
- def _format_strings(self) -> list[str]:
- """we by definition have DO NOT have a TZ"""
- values = self.values
-
- if not isinstance(values, DatetimeIndex):
- values = DatetimeIndex(values)
-
- if self.formatter is not None and callable(self.formatter):
- return [self.formatter(x) for x in values]
-
- fmt_values = values._data._format_native_types(
- na_rep=self.nat_rep, date_format=self.date_format
- )
- return fmt_values.tolist()
-
-
-class ExtensionArrayFormatter(GenericArrayFormatter):
- def _format_strings(self) -> list[str]:
- values = extract_array(self.values, extract_numpy=True)
-
- formatter = self.formatter
- fallback_formatter = None
- if formatter is None:
- fallback_formatter = values._formatter(boxed=True)
-
- if isinstance(values, Categorical):
- # Categorical is special for now, so that we can preserve tzinfo
- array = values._internal_get_values()
- else:
- array = np.asarray(values)
-
- fmt_values = format_array(
- array,
- formatter,
- float_format=self.float_format,
- na_rep=self.na_rep,
- digits=self.digits,
- space=self.space,
- justify=self.justify,
- decimal=self.decimal,
- leading_space=self.leading_space,
- quoting=self.quoting,
- fallback_formatter=fallback_formatter,
- )
- return fmt_values
-
-
-def format_percentiles(
- percentiles: (np.ndarray | Sequence[float]),
-) -> list[str]:
- """
- Outputs rounded and formatted percentiles.
-
- Parameters
- ----------
- percentiles : list-like, containing floats from interval [0,1]
-
- Returns
- -------
- formatted : list of strings
-
- Notes
- -----
- Rounding precision is chosen so that: (1) if any two elements of
- ``percentiles`` differ, they remain different after rounding
- (2) no entry is *rounded* to 0% or 100%.
- Any non-integer is always rounded to at least 1 decimal place.
-
- Examples
- --------
- Keeps all entries different after rounding:
-
- >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999])
- ['1.999%', '2.001%', '50%', '66.667%', '99.99%']
-
- No element is rounded to 0% or 100% (unless already equal to it).
- Duplicates are allowed:
-
- >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
- ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
- """
- percentiles = np.asarray(percentiles)
-
- # It checks for np.NaN as well
- with np.errstate(invalid="ignore"):
- if (
- not is_numeric_dtype(percentiles)
- or not np.all(percentiles >= 0)
- or not np.all(percentiles <= 1)
- ):
- raise ValueError("percentiles should all be in the interval [0,1]")
-
- percentiles = 100 * percentiles
- percentiles_round_type = percentiles.round().astype(int)
-
- int_idx = np.isclose(percentiles_round_type, percentiles)
-
- if np.all(int_idx):
- out = percentiles_round_type.astype(str)
- return [i + "%" for i in out]
-
- unique_pcts = np.unique(percentiles)
- to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None
- to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None
-
- # Least precision that keeps percentiles unique after rounding
- prec = -np.floor(
- np.log10(np.min(np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)))
- ).astype(int)
- prec = max(1, prec)
- out = np.empty_like(percentiles, dtype=object)
- out[int_idx] = percentiles[int_idx].round().astype(int).astype(str)
-
- out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)
- return [i + "%" for i in out]
-
-
-def is_dates_only(values: np.ndarray | DatetimeArray | Index | DatetimeIndex) -> bool:
-    # return True if the values contain only dates (and no timezone)
- if not isinstance(values, Index):
- values = values.ravel()
-
- if not isinstance(values, (DatetimeArray, DatetimeIndex)):
- values = DatetimeIndex(values)
-
- if values.tz is not None:
- return False
-
- values_int = values.asi8
- consider_values = values_int != iNaT
- # error: Argument 1 to "py_get_unit_from_dtype" has incompatible type
- # "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
- reso = get_unit_from_dtype(values.dtype) # type: ignore[arg-type]
- ppd = periods_per_day(reso)
-
- # TODO: can we reuse is_date_array_normalized? would need a skipna kwd
- even_days = np.logical_and(consider_values, values_int % ppd != 0).sum() == 0
- if even_days:
- return True
- return False
-
-
-def _format_datetime64(x: NaTType | Timestamp, nat_rep: str = "NaT") -> str:
- if x is NaT:
- return nat_rep
-
- # Timestamp.__str__ falls back to datetime.datetime.__str__ = isoformat(sep=' ')
- # so it already uses string formatting rather than strftime (faster).
- return str(x)
-
-
-def _format_datetime64_dateonly(
- x: NaTType | Timestamp,
- nat_rep: str = "NaT",
- date_format: str | None = None,
-) -> str:
- if isinstance(x, NaTType):
- return nat_rep
-
- if date_format:
- return x.strftime(date_format)
- else:
- # Timestamp._date_repr relies on string formatting (faster than strftime)
- return x._date_repr
-
-
-def get_format_datetime64(
- is_dates_only_: bool, nat_rep: str = "NaT", date_format: str | None = None
-) -> Callable:
- """Return a formatter callable taking a datetime64 as input and providing
- a string as output"""
-
- if is_dates_only_:
- return lambda x: _format_datetime64_dateonly(
- x, nat_rep=nat_rep, date_format=date_format
- )
- else:
- return lambda x: _format_datetime64(x, nat_rep=nat_rep)
-
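-
-# Hedged sketch of the dispatch above (the timestamps are arbitrary): the
-# dates-only formatter drops the time component, the general one keeps it.
-#
-#     >>> from pandas import Timestamp
-#     >>> get_format_datetime64(True)(Timestamp("2021-01-02"))
-#     '2021-01-02'
-#     >>> get_format_datetime64(False)(Timestamp("2021-01-02 03:04:05"))
-#     '2021-01-02 03:04:05'
-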
-
-def get_format_datetime64_from_values(
- values: np.ndarray | DatetimeArray | DatetimeIndex, date_format: str | None
-) -> str | None:
- """given values and a date_format, return a string format"""
- if isinstance(values, np.ndarray) and values.ndim > 1:
- # We don't actually care about the order of values, and DatetimeIndex
- # only accepts 1D values
- values = values.ravel()
-
- ido = is_dates_only(values)
- if ido:
- # Only dates and no timezone: provide a default format
- return date_format or "%Y-%m-%d"
- return date_format
-
-
-class Datetime64TZFormatter(Datetime64Formatter):
- def _format_strings(self) -> list[str]:
- """we by definition have a TZ"""
- values = self.values.astype(object)
- ido = is_dates_only(values)
- formatter = self.formatter or get_format_datetime64(
- ido, date_format=self.date_format
- )
- fmt_values = [formatter(x) for x in values]
-
- return fmt_values
-
-
-class Timedelta64Formatter(GenericArrayFormatter):
- def __init__(
- self,
- values: np.ndarray | TimedeltaIndex,
- nat_rep: str = "NaT",
- box: bool = False,
- **kwargs,
- ) -> None:
- super().__init__(values, **kwargs)
- self.nat_rep = nat_rep
- self.box = box
-
- def _format_strings(self) -> list[str]:
- formatter = self.formatter or get_format_timedelta64(
- self.values, nat_rep=self.nat_rep, box=self.box
- )
- return [formatter(x) for x in self.values]
-
-
-def get_format_timedelta64(
- values: np.ndarray | TimedeltaIndex | TimedeltaArray,
- nat_rep: str | float = "NaT",
- box: bool = False,
-) -> Callable:
- """
- Return a formatter function for a range of timedeltas.
-    These will all have the same format argument.
-
-    If box, then wrap the result in quotes.
- """
- values_int = values.view(np.int64)
-
- consider_values = values_int != iNaT
-
- one_day_nanos = 86400 * 10**9
- # error: Unsupported operand types for % ("ExtensionArray" and "int")
- not_midnight = values_int % one_day_nanos != 0 # type: ignore[operator]
- # error: Argument 1 to "__call__" of "ufunc" has incompatible type
- # "Union[Any, ExtensionArray, ndarray]"; expected
- # "Union[Union[int, float, complex, str, bytes, generic],
- # Sequence[Union[int, float, complex, str, bytes, generic]],
- # Sequence[Sequence[Any]], _SupportsArray]"
- both = np.logical_and(consider_values, not_midnight) # type: ignore[arg-type]
- even_days = both.sum() == 0
-
- if even_days:
- format = None
- else:
- format = "long"
-
- def _formatter(x):
- if x is None or (is_scalar(x) and isna(x)):
- return nat_rep
-
- if not isinstance(x, Timedelta):
- x = Timedelta(x)
-
- # Timedelta._repr_base uses string formatting (faster than strftime)
- result = x._repr_base(format=format)
- if box:
- result = f"'{result}'"
- return result
-
- return _formatter
-
-
-def _make_fixed_width(
- strings: list[str],
- justify: str = "right",
- minimum: int | None = None,
- adj: TextAdjustment | None = None,
-) -> list[str]:
- if len(strings) == 0 or justify == "all":
- return strings
-
- if adj is None:
- adjustment = get_adjustment()
- else:
- adjustment = adj
-
- max_len = max(adjustment.len(x) for x in strings)
-
- if minimum is not None:
- max_len = max(minimum, max_len)
-
- conf_max = get_option("display.max_colwidth")
- if conf_max is not None and max_len > conf_max:
- max_len = conf_max
-
- def just(x: str) -> str:
- if conf_max is not None:
- if (conf_max > 3) & (adjustment.len(x) > max_len):
- x = x[: max_len - 3] + "..."
- return x
-
- strings = [just(x) for x in strings]
- result = adjustment.justify(strings, max_len, mode=justify)
- return result
-
-
-def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> list[str]:
- """
- Separates the real and imaginary parts from the complex number, and
- executes the _trim_zeros_float method on each of those.
- """
- trimmed = [
- "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal))
- for x in str_complexes
- ]
-
- # pad strings to the length of the longest trimmed string for alignment
- lengths = [len(s) for s in trimmed]
- max_length = max(lengths)
- padded = [
- s[: -((k - 1) // 2 + 1)] # real part
- + (max_length - k) // 2 * "0"
- + s[-((k - 1) // 2 + 1) : -((k - 1) // 2)] # + / -
- + s[-((k - 1) // 2) : -1] # imaginary part
- + (max_length - k) // 2 * "0"
- + s[-1]
- for s, k in zip(trimmed, lengths)
- ]
- return padded
-
-
-def _trim_zeros_single_float(str_float: str) -> str:
- """
- Trims trailing zeros after a decimal point,
- leaving just one if necessary.
- """
- str_float = str_float.rstrip("0")
- if str_float.endswith("."):
- str_float += "0"
-
- return str_float
-
-
-def _trim_zeros_float(
- str_floats: np.ndarray | list[str], decimal: str = "."
-) -> list[str]:
- """
- Trims the maximum number of trailing zeros equally from
- all numbers containing decimals, leaving just one if
- necessary.
- """
- trimmed = str_floats
- number_regex = re.compile(rf"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$")
-
- def is_number_with_decimal(x) -> bool:
- return re.match(number_regex, x) is not None
-
- def should_trim(values: np.ndarray | list[str]) -> bool:
- """
- Determine if an array of strings should be trimmed.
-
- Returns True if all numbers containing decimals (defined by the
- above regular expression) within the array end in a zero, otherwise
- returns False.
- """
- numbers = [x for x in values if is_number_with_decimal(x)]
- return len(numbers) > 0 and all(x.endswith("0") for x in numbers)
-
- while should_trim(trimmed):
- trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed]
-
-    # leave one 0 after the decimal point if need be.
- result = [
- x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x
- for x in trimmed
- ]
- return result
-
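-# Worked example for the trimming rule above (the values are arbitrary):
-# trailing zeros are removed from all decimal numbers in lock-step, and
-# trimming stops as soon as any of them would lose a non-zero digit.
-#
-#     >>> _trim_zeros_float(["1.500", "2.250", "3.000"])
-#     ['1.50', '2.25', '3.00']
-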
-
-def _has_names(index: Index) -> bool:
- if isinstance(index, MultiIndex):
- return com.any_not_none(*index.names)
- else:
- return index.name is not None
-
-
-class EngFormatter:
- """
- Formats float values according to engineering format.
-
- Based on matplotlib.ticker.EngFormatter
- """
-
- # The SI engineering prefixes
- ENG_PREFIXES = {
- -24: "y",
- -21: "z",
- -18: "a",
- -15: "f",
- -12: "p",
- -9: "n",
- -6: "u",
- -3: "m",
- 0: "",
- 3: "k",
- 6: "M",
- 9: "G",
- 12: "T",
- 15: "P",
- 18: "E",
- 21: "Z",
- 24: "Y",
- }
-
- def __init__(
- self, accuracy: int | None = None, use_eng_prefix: bool = False
- ) -> None:
- self.accuracy = accuracy
- self.use_eng_prefix = use_eng_prefix
-
- def __call__(self, num: float) -> str:
- """
- Formats a number in engineering notation, appending a letter
- representing the power of 1000 of the original number. Some examples:
- >>> format_eng = EngFormatter(accuracy=0, use_eng_prefix=True)
- >>> format_eng(0)
- ' 0'
- >>> format_eng = EngFormatter(accuracy=1, use_eng_prefix=True)
- >>> format_eng(1_000_000)
- ' 1.0M'
- >>> format_eng = EngFormatter(accuracy=2, use_eng_prefix=False)
- >>> format_eng("-1e-6")
- '-1.00E-06'
-
- @param num: the value to represent
- @type num: either a numeric value or a string that can be converted to
- a numeric value (as per decimal.Decimal constructor)
-
- @return: engineering formatted string
- """
- dnum = Decimal(str(num))
-
- if Decimal.is_nan(dnum):
- return "NaN"
-
- if Decimal.is_infinite(dnum):
- return "inf"
-
- sign = 1
-
- if dnum < 0: # pragma: no cover
- sign = -1
- dnum = -dnum
-
- if dnum != 0:
- pow10 = Decimal(int(math.floor(dnum.log10() / 3) * 3))
- else:
- pow10 = Decimal(0)
-
- pow10 = pow10.min(max(self.ENG_PREFIXES.keys()))
- pow10 = pow10.max(min(self.ENG_PREFIXES.keys()))
- int_pow10 = int(pow10)
-
- if self.use_eng_prefix:
- prefix = self.ENG_PREFIXES[int_pow10]
- else:
- if int_pow10 < 0:
- prefix = f"E-{-int_pow10:02d}"
- else:
- prefix = f"E+{int_pow10:02d}"
-
- mant = sign * dnum / (10**pow10)
-
- if self.accuracy is None: # pragma: no cover
- format_str = "{mant: g}{prefix}"
- else:
- format_str = f"{{mant: .{self.accuracy:d}f}}{{prefix}}"
-
- formatted = format_str.format(mant=mant, prefix=prefix)
-
- return formatted
-
-
-def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> None:
- """
- Format float representation in DataFrame with SI notation.
-
- Parameters
- ----------
- accuracy : int, default 3
- Number of decimal digits after the floating point.
- use_eng_prefix : bool, default False
- Whether to represent a value with SI prefixes.
-
- Returns
- -------
- None
-
- Examples
- --------
- >>> df = pd.DataFrame([1e-9, 1e-3, 1, 1e3, 1e6])
- >>> df
- 0
- 0 1.000000e-09
- 1 1.000000e-03
- 2 1.000000e+00
- 3 1.000000e+03
- 4 1.000000e+06
-
- >>> pd.set_eng_float_format(accuracy=1)
- >>> df
- 0
- 0 1.0E-09
- 1 1.0E-03
- 2 1.0E+00
- 3 1.0E+03
- 4 1.0E+06
-
- >>> pd.set_eng_float_format(use_eng_prefix=True)
- >>> df
- 0
- 0 1.000n
- 1 1.000m
- 2 1.000
- 3 1.000k
- 4 1.000M
-
- >>> pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
- >>> df
- 0
- 0 1.0n
- 1 1.0m
- 2 1.0
- 3 1.0k
- 4 1.0M
-
- >>> pd.set_option("display.float_format", None) # unset option
- """
- set_option("display.float_format", EngFormatter(accuracy, use_eng_prefix))
-
-
-def get_level_lengths(
- levels: Any, sentinel: bool | object | str = ""
-) -> list[dict[int, int]]:
- """
-    For each level, return the span length of each index entry.
-
-    Parameters
-    ----------
-    levels : list of lists
-        List of values for each level.
-    sentinel : string, optional
-        Value indicating that no new index entry starts at that position.
-
-    Returns
-    -------
-    Returns a list of dicts, one per level, mapping the position at which an
-    index entry starts (key) to the length of that entry (value).
- """
- if len(levels) == 0:
- return []
-
- control = [True] * len(levels[0])
-
- result = []
- for level in levels:
- last_index = 0
-
- lengths = {}
- for i, key in enumerate(level):
- if control[i] and key == sentinel:
- pass
- else:
- control[i] = False
- lengths[last_index] = i - last_index
- last_index = i
-
- lengths[last_index] = len(level) - last_index
-
- result.append(lengths)
-
- return result
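A minimal usage sketch of get_level_lengths as defined above, assuming levels were produced by MultiIndex.format(sparsify=True, adjoin=False), where "" marks a repeated (sparsified) label:

    levels = [["a", "", "", "b", ""], ["x", "y", "z", "x", "y"]]
    get_level_lengths(levels, sentinel="")
    # -> [{0: 3, 3: 2}, {0: 1, 1: 1, 2: 1, 3: 1, 4: 1}]
    # Each dict maps the position where an entry starts to its span length.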
-
-
-def buffer_put_lines(buf: WriteBuffer[str], lines: list[str]) -> None:
- """
- Appends lines to a buffer.
-
- Parameters
- ----------
- buf
- The buffer to write to
- lines
- The lines to append.
- """
- if any(isinstance(x, str) for x in lines):
- lines = [str(x) for x in lines]
- buf.write("\n".join(lines))
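A minimal sketch of buffer_put_lines with an io.StringIO buffer:

    import io

    buf = io.StringIO()
    buffer_put_lines(buf, ["alpha", "beta"])
    buf.getvalue()   # 'alpha\nbeta'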
diff --git a/contrib/python/pandas/py3/pandas/io/formats/html.py b/contrib/python/pandas/py3/pandas/io/formats/html.py
deleted file mode 100644
index 6fbbbd01cf7..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/html.py
+++ /dev/null
@@ -1,633 +0,0 @@
-"""
-Module for formatting output data in HTML.
-"""
-from __future__ import annotations
-
-from textwrap import dedent
-from typing import (
- Any,
- Final,
- Hashable,
- Iterable,
- Mapping,
- cast,
-)
-
-from pandas._config import get_option
-
-from pandas._libs import lib
-
-from pandas import (
- MultiIndex,
- option_context,
-)
-
-from pandas.io.common import is_url
-from pandas.io.formats.format import (
- DataFrameFormatter,
- get_level_lengths,
-)
-from pandas.io.formats.printing import pprint_thing
-
-
-class HTMLFormatter:
- """
- Internal class for formatting output data in html.
- This class is intended for shared functionality between
- DataFrame.to_html() and DataFrame._repr_html_().
-    Any logic in common with other output formatting methods
-    should ideally be inherited from classes in format.py
-    and this class should be responsible only for producing HTML markup.
- """
-
- indent_delta: Final = 2
-
- def __init__(
- self,
- formatter: DataFrameFormatter,
- classes: str | list[str] | tuple[str, ...] | None = None,
- border: int | bool | None = None,
- table_id: str | None = None,
- render_links: bool = False,
- ) -> None:
- self.fmt = formatter
- self.classes = classes
-
- self.frame = self.fmt.frame
- self.columns = self.fmt.tr_frame.columns
- self.elements: list[str] = []
- self.bold_rows = self.fmt.bold_rows
- self.escape = self.fmt.escape
- self.show_dimensions = self.fmt.show_dimensions
- if border is None or border is True:
- border = cast(int, get_option("display.html.border"))
- elif not border:
- border = None
-
- self.border = border
- self.table_id = table_id
- self.render_links = render_links
-
- self.col_space = {
- column: f"{value}px" if isinstance(value, int) else value
- for column, value in self.fmt.col_space.items()
- }
-
- def to_string(self) -> str:
- lines = self.render()
- if any(isinstance(x, str) for x in lines):
- lines = [str(x) for x in lines]
- return "\n".join(lines)
-
- def render(self) -> list[str]:
- self._write_table()
-
- if self.should_show_dimensions:
- by = chr(215) # ×
- self.write(
- f"<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>"
- )
-
- return self.elements
-
- @property
- def should_show_dimensions(self) -> bool:
- return self.fmt.should_show_dimensions
-
- @property
- def show_row_idx_names(self) -> bool:
- return self.fmt.show_row_idx_names
-
- @property
- def show_col_idx_names(self) -> bool:
- return self.fmt.show_col_idx_names
-
- @property
- def row_levels(self) -> int:
- if self.fmt.index:
- # showing (row) index
- return self.frame.index.nlevels
- elif self.show_col_idx_names:
- # see gh-22579
- # Column misalignment also occurs for
- # a standard index when the columns index is named.
-            # If the row index is not displayed, a column of
-            # blank cells needs to be included before the DataFrame values.
- return 1
- # not showing (row) index
- return 0
-
- def _get_columns_formatted_values(self) -> Iterable:
- return self.columns
-
- @property
- def is_truncated(self) -> bool:
- return self.fmt.is_truncated
-
- @property
- def ncols(self) -> int:
- return len(self.fmt.tr_frame.columns)
-
- def write(self, s: Any, indent: int = 0) -> None:
- rs = pprint_thing(s)
- self.elements.append(" " * indent + rs)
-
- def write_th(
- self, s: Any, header: bool = False, indent: int = 0, tags: str | None = None
- ) -> None:
- """
- Method for writing a formatted <th> cell.
-
- If col_space is set on the formatter then that is used for
- the value of min-width.
-
- Parameters
- ----------
- s : object
- The data to be written inside the cell.
- header : bool, default False
- Set to True if the <th> is for use inside <thead>. This will
- cause min-width to be set if there is one.
- indent : int, default 0
- The indentation level of the cell.
- tags : str, default None
- Tags to include in the cell.
-
-        Returns
-        -------
-        None
-            The rendered <th> cell is appended to the output elements.
- """
- col_space = self.col_space.get(s, None)
-
- if header and col_space is not None:
- tags = tags or ""
- tags += f'style="min-width: {col_space};"'
-
- self._write_cell(s, kind="th", indent=indent, tags=tags)
-
- def write_td(self, s: Any, indent: int = 0, tags: str | None = None) -> None:
- self._write_cell(s, kind="td", indent=indent, tags=tags)
-
- def _write_cell(
- self, s: Any, kind: str = "td", indent: int = 0, tags: str | None = None
- ) -> None:
- if tags is not None:
- start_tag = f"<{kind} {tags}>"
- else:
- start_tag = f"<{kind}>"
-
- if self.escape:
- # escape & first to prevent double escaping of &
- esc = {"&": r"&amp;", "<": r"&lt;", ">": r"&gt;"}
- else:
- esc = {}
-
- rs = pprint_thing(s, escape_chars=esc).strip()
-
- if self.render_links and is_url(rs):
- rs_unescaped = pprint_thing(s, escape_chars={}).strip()
- start_tag += f'<a href="{rs_unescaped}" target="_blank">'
- end_a = "</a>"
- else:
- end_a = ""
-
- self.write(f"{start_tag}{rs}{end_a}</{kind}>", indent)
-
- def write_tr(
- self,
- line: Iterable,
- indent: int = 0,
- indent_delta: int = 0,
- header: bool = False,
- align: str | None = None,
- tags: dict[int, str] | None = None,
- nindex_levels: int = 0,
- ) -> None:
- if tags is None:
- tags = {}
-
- if align is None:
- self.write("<tr>", indent)
- else:
- self.write(f'<tr style="text-align: {align};">', indent)
- indent += indent_delta
-
- for i, s in enumerate(line):
- val_tag = tags.get(i, None)
- if header or (self.bold_rows and i < nindex_levels):
- self.write_th(s, indent=indent, header=header, tags=val_tag)
- else:
- self.write_td(s, indent, tags=val_tag)
-
- indent -= indent_delta
- self.write("</tr>", indent)
-
- def _write_table(self, indent: int = 0) -> None:
- _classes = ["dataframe"] # Default class.
- use_mathjax = get_option("display.html.use_mathjax")
- if not use_mathjax:
- _classes.append("tex2jax_ignore")
- if self.classes is not None:
- if isinstance(self.classes, str):
- self.classes = self.classes.split()
- if not isinstance(self.classes, (list, tuple)):
- raise TypeError(
- "classes must be a string, list, "
- f"or tuple, not {type(self.classes)}"
- )
- _classes.extend(self.classes)
-
- if self.table_id is None:
- id_section = ""
- else:
- id_section = f' id="{self.table_id}"'
-
- if self.border is None:
- border_attr = ""
- else:
- border_attr = f' border="{self.border}"'
-
- self.write(
- f'<table{border_attr} class="{" ".join(_classes)}"{id_section}>',
- indent,
- )
-
- if self.fmt.header or self.show_row_idx_names:
- self._write_header(indent + self.indent_delta)
-
- self._write_body(indent + self.indent_delta)
-
- self.write("</table>", indent)
-
- def _write_col_header(self, indent: int) -> None:
- row: list[Hashable]
- is_truncated_horizontally = self.fmt.is_truncated_horizontally
- if isinstance(self.columns, MultiIndex):
- template = 'colspan="{span:d}" halign="left"'
-
- sentinel: lib.NoDefault | bool
- if self.fmt.sparsify:
- # GH3547
- sentinel = lib.no_default
- else:
- sentinel = False
- levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False)
- level_lengths = get_level_lengths(levels, sentinel)
- inner_lvl = len(level_lengths) - 1
- for lnum, (records, values) in enumerate(zip(level_lengths, levels)):
- if is_truncated_horizontally:
- # modify the header lines
- ins_col = self.fmt.tr_col_num
- if self.fmt.sparsify:
- recs_new = {}
- # Increment tags after ... col.
- for tag, span in list(records.items()):
- if tag >= ins_col:
- recs_new[tag + 1] = span
- elif tag + span > ins_col:
- recs_new[tag] = span + 1
- if lnum == inner_lvl:
- values = (
- values[:ins_col] + ("...",) + values[ins_col:]
- )
- else:
- # sparse col headers do not receive a ...
- values = (
- values[:ins_col]
- + (values[ins_col - 1],)
- + values[ins_col:]
- )
- else:
- recs_new[tag] = span
- # if ins_col lies between tags, all col headers
- # get ...
- if tag + span == ins_col:
- recs_new[ins_col] = 1
- values = values[:ins_col] + ("...",) + values[ins_col:]
- records = recs_new
- inner_lvl = len(level_lengths) - 1
- if lnum == inner_lvl:
- records[ins_col] = 1
- else:
- recs_new = {}
- for tag, span in list(records.items()):
- if tag >= ins_col:
- recs_new[tag + 1] = span
- else:
- recs_new[tag] = span
- recs_new[ins_col] = 1
- records = recs_new
- values = values[:ins_col] + ["..."] + values[ins_col:]
-
- # see gh-22579
- # Column Offset Bug with to_html(index=False) with
- # MultiIndex Columns and Index.
- # Initially fill row with blank cells before column names.
- # TODO: Refactor to remove code duplication with code
- # block below for standard columns index.
- row = [""] * (self.row_levels - 1)
- if self.fmt.index or self.show_col_idx_names:
- # see gh-22747
- # If to_html(index_names=False) do not show columns
- # index names.
- # TODO: Refactor to use _get_column_name_list from
- # DataFrameFormatter class and create a
- # _get_formatted_column_labels function for code
- # parity with DataFrameFormatter class.
- if self.fmt.show_index_names:
- name = self.columns.names[lnum]
- row.append(pprint_thing(name or ""))
- else:
- row.append("")
-
- tags = {}
- j = len(row)
- for i, v in enumerate(values):
- if i in records:
- if records[i] > 1:
- tags[j] = template.format(span=records[i])
- else:
- continue
- j += 1
- row.append(v)
- self.write_tr(row, indent, self.indent_delta, tags=tags, header=True)
- else:
- # see gh-22579
- # Column misalignment also occurs for
- # a standard index when the columns index is named.
- # Initially fill row with blank cells before column names.
- # TODO: Refactor to remove code duplication with code block
- # above for columns MultiIndex.
- row = [""] * (self.row_levels - 1)
- if self.fmt.index or self.show_col_idx_names:
- # see gh-22747
- # If to_html(index_names=False) do not show columns
- # index names.
- # TODO: Refactor to use _get_column_name_list from
- # DataFrameFormatter class.
- if self.fmt.show_index_names:
- row.append(self.columns.name or "")
- else:
- row.append("")
- row.extend(self._get_columns_formatted_values())
- align = self.fmt.justify
-
- if is_truncated_horizontally:
- ins_col = self.row_levels + self.fmt.tr_col_num
- row.insert(ins_col, "...")
-
- self.write_tr(row, indent, self.indent_delta, header=True, align=align)
-
- def _write_row_header(self, indent: int) -> None:
- is_truncated_horizontally = self.fmt.is_truncated_horizontally
- row = [x if x is not None else "" for x in self.frame.index.names] + [""] * (
- self.ncols + (1 if is_truncated_horizontally else 0)
- )
- self.write_tr(row, indent, self.indent_delta, header=True)
-
- def _write_header(self, indent: int) -> None:
- self.write("<thead>", indent)
-
- if self.fmt.header:
- self._write_col_header(indent + self.indent_delta)
-
- if self.show_row_idx_names:
- self._write_row_header(indent + self.indent_delta)
-
- self.write("</thead>", indent)
-
- def _get_formatted_values(self) -> dict[int, list[str]]:
- with option_context("display.max_colwidth", None):
- fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)}
- return fmt_values
-
- def _write_body(self, indent: int) -> None:
- self.write("<tbody>", indent)
- fmt_values = self._get_formatted_values()
-
- # write values
- if self.fmt.index and isinstance(self.frame.index, MultiIndex):
- self._write_hierarchical_rows(fmt_values, indent + self.indent_delta)
- else:
- self._write_regular_rows(fmt_values, indent + self.indent_delta)
-
- self.write("</tbody>", indent)
-
- def _write_regular_rows(
- self, fmt_values: Mapping[int, list[str]], indent: int
- ) -> None:
- is_truncated_horizontally = self.fmt.is_truncated_horizontally
- is_truncated_vertically = self.fmt.is_truncated_vertically
-
- nrows = len(self.fmt.tr_frame)
-
- if self.fmt.index:
- fmt = self.fmt._get_formatter("__index__")
- if fmt is not None:
- index_values = self.fmt.tr_frame.index.map(fmt)
- else:
- index_values = self.fmt.tr_frame.index.format()
-
- row: list[str] = []
- for i in range(nrows):
- if is_truncated_vertically and i == (self.fmt.tr_row_num):
- str_sep_row = ["..."] * len(row)
- self.write_tr(
- str_sep_row,
- indent,
- self.indent_delta,
- tags=None,
- nindex_levels=self.row_levels,
- )
-
- row = []
- if self.fmt.index:
- row.append(index_values[i])
- # see gh-22579
- # Column misalignment also occurs for
- # a standard index when the columns index is named.
- # Add blank cell before data cells.
- elif self.show_col_idx_names:
- row.append("")
- row.extend(fmt_values[j][i] for j in range(self.ncols))
-
- if is_truncated_horizontally:
- dot_col_ix = self.fmt.tr_col_num + self.row_levels
- row.insert(dot_col_ix, "...")
- self.write_tr(
- row, indent, self.indent_delta, tags=None, nindex_levels=self.row_levels
- )
-
- def _write_hierarchical_rows(
- self, fmt_values: Mapping[int, list[str]], indent: int
- ) -> None:
- template = 'rowspan="{span}" valign="top"'
-
- is_truncated_horizontally = self.fmt.is_truncated_horizontally
- is_truncated_vertically = self.fmt.is_truncated_vertically
- frame = self.fmt.tr_frame
- nrows = len(frame)
-
- assert isinstance(frame.index, MultiIndex)
- idx_values = frame.index.format(sparsify=False, adjoin=False, names=False)
- idx_values = list(zip(*idx_values))
-
- if self.fmt.sparsify:
- # GH3547
- sentinel = lib.no_default
- levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False)
-
- level_lengths = get_level_lengths(levels, sentinel)
- inner_lvl = len(level_lengths) - 1
- if is_truncated_vertically:
- # Insert ... row and adjust idx_values and
- # level_lengths to take this into account.
- ins_row = self.fmt.tr_row_num
- inserted = False
- for lnum, records in enumerate(level_lengths):
- rec_new = {}
- for tag, span in list(records.items()):
- if tag >= ins_row:
- rec_new[tag + 1] = span
- elif tag + span > ins_row:
- rec_new[tag] = span + 1
-
- # GH 14882 - Make sure insertion done once
- if not inserted:
- dot_row = list(idx_values[ins_row - 1])
- dot_row[-1] = "..."
- idx_values.insert(ins_row, tuple(dot_row))
- inserted = True
- else:
- dot_row = list(idx_values[ins_row])
- dot_row[inner_lvl - lnum] = "..."
- idx_values[ins_row] = tuple(dot_row)
- else:
- rec_new[tag] = span
- # If ins_row lies between tags, all cols idx cols
- # receive ...
- if tag + span == ins_row:
- rec_new[ins_row] = 1
- if lnum == 0:
- idx_values.insert(
- ins_row, tuple(["..."] * len(level_lengths))
- )
-
- # GH 14882 - Place ... in correct level
- elif inserted:
- dot_row = list(idx_values[ins_row])
- dot_row[inner_lvl - lnum] = "..."
- idx_values[ins_row] = tuple(dot_row)
- level_lengths[lnum] = rec_new
-
- level_lengths[inner_lvl][ins_row] = 1
- for ix_col in fmt_values:
- fmt_values[ix_col].insert(ins_row, "...")
- nrows += 1
-
- for i in range(nrows):
- row = []
- tags = {}
-
- sparse_offset = 0
- j = 0
- for records, v in zip(level_lengths, idx_values[i]):
- if i in records:
- if records[i] > 1:
- tags[j] = template.format(span=records[i])
- else:
- sparse_offset += 1
- continue
-
- j += 1
- row.append(v)
-
- row.extend(fmt_values[j][i] for j in range(self.ncols))
- if is_truncated_horizontally:
- row.insert(
- self.row_levels - sparse_offset + self.fmt.tr_col_num, "..."
- )
- self.write_tr(
- row,
- indent,
- self.indent_delta,
- tags=tags,
- nindex_levels=len(levels) - sparse_offset,
- )
- else:
- row = []
- for i in range(len(frame)):
- if is_truncated_vertically and i == (self.fmt.tr_row_num):
- str_sep_row = ["..."] * len(row)
- self.write_tr(
- str_sep_row,
- indent,
- self.indent_delta,
- tags=None,
- nindex_levels=self.row_levels,
- )
-
- idx_values = list(
- zip(*frame.index.format(sparsify=False, adjoin=False, names=False))
- )
- row = []
- row.extend(idx_values[i])
- row.extend(fmt_values[j][i] for j in range(self.ncols))
- if is_truncated_horizontally:
- row.insert(self.row_levels + self.fmt.tr_col_num, "...")
- self.write_tr(
- row,
- indent,
- self.indent_delta,
- tags=None,
- nindex_levels=frame.index.nlevels,
- )
-
-
-class NotebookFormatter(HTMLFormatter):
- """
- Internal class for formatting output data in html for display in Jupyter
- Notebooks. This class is intended for functionality specific to
- DataFrame._repr_html_() and DataFrame.to_html(notebook=True)
- """
-
- def _get_formatted_values(self) -> dict[int, list[str]]:
- return {i: self.fmt.format_col(i) for i in range(self.ncols)}
-
- def _get_columns_formatted_values(self) -> list[str]:
- return self.columns.format()
-
- def write_style(self) -> None:
- # We use the "scoped" attribute here so that the desired
- # style properties for the data frame are not then applied
- # throughout the entire notebook.
- template_first = """\
- <style scoped>"""
- template_last = """\
- </style>"""
- template_select = """\
- .dataframe %s {
- %s: %s;
- }"""
- element_props = [
- ("tbody tr th:only-of-type", "vertical-align", "middle"),
- ("tbody tr th", "vertical-align", "top"),
- ]
- if isinstance(self.columns, MultiIndex):
- element_props.append(("thead tr th", "text-align", "left"))
- if self.show_row_idx_names:
- element_props.append(
- ("thead tr:last-of-type th", "text-align", "right")
- )
- else:
- element_props.append(("thead th", "text-align", "right"))
- template_mid = "\n\n".join(map(lambda t: template_select % t, element_props))
- template = dedent("\n".join((template_first, template_mid, template_last)))
- self.write(template)
-
- def render(self) -> list[str]:
- self.write("<div>")
- self.write_style()
- super().render()
- self.write("</div>")
- return self.elements
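For reference, a minimal sketch of the public call that exercises this formatter; classes, table_id and render_links are documented DataFrame.to_html parameters:

    import pandas as pd

    df = pd.DataFrame({"site": ["https://pandas.pydata.org"]})
    # classes and table_id end up on the <table> element; render_links wraps
    # URL cells in <a href=... target="_blank">, as done in _write_cell above.
    html = df.to_html(classes="my-table", table_id="demo", render_links=True)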
diff --git a/contrib/python/pandas/py3/pandas/io/formats/info.py b/contrib/python/pandas/py3/pandas/io/formats/info.py
deleted file mode 100644
index d826c0a148e..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/info.py
+++ /dev/null
@@ -1,1101 +0,0 @@
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-import sys
-from textwrap import dedent
-from typing import (
- TYPE_CHECKING,
- Iterable,
- Iterator,
- Mapping,
- Sequence,
-)
-
-from pandas._config import get_option
-
-from pandas._typing import (
- Dtype,
- WriteBuffer,
-)
-
-from pandas.io.formats import format as fmt
-from pandas.io.formats.printing import pprint_thing
-
-if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
-
-
-frame_max_cols_sub = dedent(
- """\
- max_cols : int, optional
- When to switch from the verbose to the truncated output. If the
- DataFrame has more than `max_cols` columns, the truncated output
- is used. By default, the setting in
- ``pandas.options.display.max_info_columns`` is used."""
-)
-
-
-show_counts_sub = dedent(
- """\
- show_counts : bool, optional
- Whether to show the non-null counts. By default, this is shown
- only if the DataFrame is smaller than
- ``pandas.options.display.max_info_rows`` and
- ``pandas.options.display.max_info_columns``. A value of True always
- shows the counts, and False never shows the counts."""
-)
-
-
-frame_examples_sub = dedent(
- """\
- >>> int_values = [1, 2, 3, 4, 5]
- >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
- >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
- >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
- ... "float_col": float_values})
- >>> df
- int_col text_col float_col
- 0 1 alpha 0.00
- 1 2 beta 0.25
- 2 3 gamma 0.50
- 3 4 delta 0.75
- 4 5 epsilon 1.00
-
- Prints information of all columns:
-
- >>> df.info(verbose=True)
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 5 entries, 0 to 4
- Data columns (total 3 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 int_col 5 non-null int64
- 1 text_col 5 non-null object
- 2 float_col 5 non-null float64
- dtypes: float64(1), int64(1), object(1)
- memory usage: 248.0+ bytes
-
- Prints a summary of columns count and its dtypes but not per column
- information:
-
- >>> df.info(verbose=False)
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 5 entries, 0 to 4
- Columns: 3 entries, int_col to float_col
- dtypes: float64(1), int64(1), object(1)
- memory usage: 248.0+ bytes
-
-    Pipe output of DataFrame.info to buffer instead of sys.stdout, get
-    buffer content and write it to a text file:
-
- >>> import io
- >>> buffer = io.StringIO()
- >>> df.info(buf=buffer)
- >>> s = buffer.getvalue()
- >>> with open("df_info.txt", "w",
- ... encoding="utf-8") as f: # doctest: +SKIP
- ... f.write(s)
- 260
-
-    The `memory_usage` parameter allows deep introspection mode, especially
-    useful for big DataFrames and for fine-tuning memory optimization:
-
- >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
- >>> df = pd.DataFrame({
- ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
- ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
- ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
- ... })
- >>> df.info()
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 1000000 entries, 0 to 999999
- Data columns (total 3 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 column_1 1000000 non-null object
- 1 column_2 1000000 non-null object
- 2 column_3 1000000 non-null object
- dtypes: object(3)
- memory usage: 22.9+ MB
-
- >>> df.info(memory_usage='deep')
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 1000000 entries, 0 to 999999
- Data columns (total 3 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 column_1 1000000 non-null object
- 1 column_2 1000000 non-null object
- 2 column_3 1000000 non-null object
- dtypes: object(3)
- memory usage: 165.9 MB"""
-)
-
-
-frame_see_also_sub = dedent(
- """\
- DataFrame.describe: Generate descriptive statistics of DataFrame
- columns.
- DataFrame.memory_usage: Memory usage of DataFrame columns."""
-)
-
-
-frame_sub_kwargs = {
- "klass": "DataFrame",
- "type_sub": " and columns",
- "max_cols_sub": frame_max_cols_sub,
- "show_counts_sub": show_counts_sub,
- "examples_sub": frame_examples_sub,
- "see_also_sub": frame_see_also_sub,
- "version_added_sub": "",
-}
-
-
-series_examples_sub = dedent(
- """\
- >>> int_values = [1, 2, 3, 4, 5]
- >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
- >>> s = pd.Series(text_values, index=int_values)
- >>> s.info()
- <class 'pandas.core.series.Series'>
- Index: 5 entries, 1 to 5
- Series name: None
- Non-Null Count Dtype
- -------------- -----
- 5 non-null object
- dtypes: object(1)
- memory usage: 80.0+ bytes
-
- Prints a summary excluding information about its values:
-
- >>> s.info(verbose=False)
- <class 'pandas.core.series.Series'>
- Index: 5 entries, 1 to 5
- dtypes: object(1)
- memory usage: 80.0+ bytes
-
-    Pipe output of Series.info to buffer instead of sys.stdout, get
-    buffer content and write it to a text file:
-
- >>> import io
- >>> buffer = io.StringIO()
- >>> s.info(buf=buffer)
- >>> s = buffer.getvalue()
- >>> with open("df_info.txt", "w",
- ... encoding="utf-8") as f: # doctest: +SKIP
- ... f.write(s)
- 260
-
-    The `memory_usage` parameter allows deep introspection mode, especially
-    useful for big Series and for fine-tuning memory optimization:
-
- >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
- >>> s = pd.Series(np.random.choice(['a', 'b', 'c'], 10 ** 6))
- >>> s.info()
- <class 'pandas.core.series.Series'>
- RangeIndex: 1000000 entries, 0 to 999999
- Series name: None
- Non-Null Count Dtype
- -------------- -----
- 1000000 non-null object
- dtypes: object(1)
- memory usage: 7.6+ MB
-
- >>> s.info(memory_usage='deep')
- <class 'pandas.core.series.Series'>
- RangeIndex: 1000000 entries, 0 to 999999
- Series name: None
- Non-Null Count Dtype
- -------------- -----
- 1000000 non-null object
- dtypes: object(1)
- memory usage: 55.3 MB"""
-)
-
-
-series_see_also_sub = dedent(
- """\
- Series.describe: Generate descriptive statistics of Series.
- Series.memory_usage: Memory usage of Series."""
-)
-
-
-series_sub_kwargs = {
- "klass": "Series",
- "type_sub": "",
- "max_cols_sub": "",
- "show_counts_sub": show_counts_sub,
- "examples_sub": series_examples_sub,
- "see_also_sub": series_see_also_sub,
- "version_added_sub": "\n.. versionadded:: 1.4.0\n",
-}
-
-
-INFO_DOCSTRING = dedent(
- """
- Print a concise summary of a {klass}.
-
- This method prints information about a {klass} including
- the index dtype{type_sub}, non-null values and memory usage.
- {version_added_sub}\
-
- Parameters
- ----------
- verbose : bool, optional
- Whether to print the full summary. By default, the setting in
- ``pandas.options.display.max_info_columns`` is followed.
- buf : writable buffer, defaults to sys.stdout
- Where to send the output. By default, the output is printed to
- sys.stdout. Pass a writable buffer if you need to further process
- the output.
- {max_cols_sub}
- memory_usage : bool, str, optional
- Specifies whether total memory usage of the {klass}
- elements (including the index) should be displayed. By default,
- this follows the ``pandas.options.display.memory_usage`` setting.
-
-        True always shows memory usage. False never shows memory usage.
-        A value of 'deep' is equivalent to "True with deep introspection".
-        Memory usage is shown in human-readable units (base-2
-        representation). Without deep introspection, a memory estimation is
-        made based on column dtype and number of rows, assuming values
-        consume the same memory amount for corresponding dtypes. With deep
- memory introspection, a real memory usage calculation is performed
- at the cost of computational resources. See the
- :ref:`Frequently Asked Questions <df-memory-usage>` for more
- details.
- {show_counts_sub}
-
- Returns
- -------
- None
- This method prints a summary of a {klass} and returns None.
-
- See Also
- --------
- {see_also_sub}
-
- Examples
- --------
- {examples_sub}
- """
-)
-
-
-def _put_str(s: str | Dtype, space: int) -> str:
- """
- Make string of specified length, padding to the right if necessary.
-
- Parameters
- ----------
- s : Union[str, Dtype]
- String to be formatted.
-    space : int
-        Length to pad or truncate the string to.
-
- Returns
- -------
- str
- String coerced to given length.
-
- Examples
- --------
- >>> pd.io.formats.info._put_str("panda", 6)
- 'panda '
- >>> pd.io.formats.info._put_str("panda", 4)
- 'pand'
- """
- return str(s)[:space].ljust(space)
-
-
-def _sizeof_fmt(num: float, size_qualifier: str) -> str:
- """
- Return size in human readable format.
-
- Parameters
- ----------
- num : int
- Size in bytes.
- size_qualifier : str
- Either empty, or '+' (if lower bound).
-
- Returns
- -------
- str
- Size in human readable format.
-
- Examples
- --------
- >>> _sizeof_fmt(23028, '')
- '22.5 KB'
-
- >>> _sizeof_fmt(23028, '+')
- '22.5+ KB'
- """
- for x in ["bytes", "KB", "MB", "GB", "TB"]:
- if num < 1024.0:
- return f"{num:3.1f}{size_qualifier} {x}"
- num /= 1024.0
- return f"{num:3.1f}{size_qualifier} PB"
-
-
-def _initialize_memory_usage(
- memory_usage: bool | str | None = None,
-) -> bool | str:
- """Get memory usage based on inputs and display options."""
- if memory_usage is None:
- memory_usage = get_option("display.memory_usage")
- return memory_usage
-
-
-class BaseInfo(ABC):
- """
- Base class for DataFrameInfo and SeriesInfo.
-
- Parameters
- ----------
- data : DataFrame or Series
- Either dataframe or series.
- memory_usage : bool or str, optional
- If "deep", introspect the data deeply by interrogating object dtypes
- for system-level memory consumption, and include it in the returned
- values.
- """
-
- data: DataFrame | Series
- memory_usage: bool | str
-
- @property
- @abstractmethod
- def dtypes(self) -> Iterable[Dtype]:
- """
- Dtypes.
-
- Returns
- -------
- dtypes : sequence
- Dtype of each of the DataFrame's columns (or one series column).
- """
-
- @property
- @abstractmethod
- def dtype_counts(self) -> Mapping[str, int]:
- """Mapping dtype - number of counts."""
-
- @property
- @abstractmethod
- def non_null_counts(self) -> Sequence[int]:
- """Sequence of non-null counts for all columns or column (if series)."""
-
- @property
- @abstractmethod
- def memory_usage_bytes(self) -> int:
- """
- Memory usage in bytes.
-
- Returns
- -------
- memory_usage_bytes : int
- Object's total memory usage in bytes.
- """
-
- @property
- def memory_usage_string(self) -> str:
- """Memory usage in a form of human readable string."""
- return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n"
-
- @property
- def size_qualifier(self) -> str:
- size_qualifier = ""
- if self.memory_usage:
- if self.memory_usage != "deep":
- # size_qualifier is just a best effort; not guaranteed to catch
- # all cases (e.g., it misses categorical data even with object
- # categories)
- if (
- "object" in self.dtype_counts
- or self.data.index._is_memory_usage_qualified()
- ):
- size_qualifier = "+"
- return size_qualifier
-
- @abstractmethod
- def render(
- self,
- *,
- buf: WriteBuffer[str] | None,
- max_cols: int | None,
- verbose: bool | None,
- show_counts: bool | None,
- ) -> None:
- pass
-
-
-class DataFrameInfo(BaseInfo):
- """
- Class storing dataframe-specific info.
- """
-
- def __init__(
- self,
- data: DataFrame,
- memory_usage: bool | str | None = None,
- ) -> None:
- self.data: DataFrame = data
- self.memory_usage = _initialize_memory_usage(memory_usage)
-
- @property
- def dtype_counts(self) -> Mapping[str, int]:
- return _get_dataframe_dtype_counts(self.data)
-
- @property
- def dtypes(self) -> Iterable[Dtype]:
- """
- Dtypes.
-
- Returns
- -------
- dtypes
- Dtype of each of the DataFrame's columns.
- """
- return self.data.dtypes
-
- @property
- def ids(self) -> Index:
- """
- Column names.
-
- Returns
- -------
- ids : Index
- DataFrame's column names.
- """
- return self.data.columns
-
- @property
- def col_count(self) -> int:
- """Number of columns to be summarized."""
- return len(self.ids)
-
- @property
- def non_null_counts(self) -> Sequence[int]:
- """Sequence of non-null counts for all columns or column (if series)."""
- return self.data.count()
-
- @property
- def memory_usage_bytes(self) -> int:
- deep = self.memory_usage == "deep"
- return self.data.memory_usage(index=True, deep=deep).sum()
-
- def render(
- self,
- *,
- buf: WriteBuffer[str] | None,
- max_cols: int | None,
- verbose: bool | None,
- show_counts: bool | None,
- ) -> None:
- printer = DataFrameInfoPrinter(
- info=self,
- max_cols=max_cols,
- verbose=verbose,
- show_counts=show_counts,
- )
- printer.to_buffer(buf)
-
-
-class SeriesInfo(BaseInfo):
- """
- Class storing series-specific info.
- """
-
- def __init__(
- self,
- data: Series,
- memory_usage: bool | str | None = None,
- ) -> None:
- self.data: Series = data
- self.memory_usage = _initialize_memory_usage(memory_usage)
-
- def render(
- self,
- *,
- buf: WriteBuffer[str] | None = None,
- max_cols: int | None = None,
- verbose: bool | None = None,
- show_counts: bool | None = None,
- ) -> None:
- if max_cols is not None:
- raise ValueError(
- "Argument `max_cols` can only be passed "
- "in DataFrame.info, not Series.info"
- )
- printer = SeriesInfoPrinter(
- info=self,
- verbose=verbose,
- show_counts=show_counts,
- )
- printer.to_buffer(buf)
-
- @property
- def non_null_counts(self) -> Sequence[int]:
- return [self.data.count()]
-
- @property
- def dtypes(self) -> Iterable[Dtype]:
- return [self.data.dtypes]
-
- @property
- def dtype_counts(self) -> Mapping[str, int]:
- from pandas.core.frame import DataFrame
-
- return _get_dataframe_dtype_counts(DataFrame(self.data))
-
- @property
- def memory_usage_bytes(self) -> int:
- """Memory usage in bytes.
-
- Returns
- -------
- memory_usage_bytes : int
- Object's total memory usage in bytes.
- """
- deep = self.memory_usage == "deep"
- return self.data.memory_usage(index=True, deep=deep)
-
-
-class InfoPrinterAbstract:
- """
- Class for printing dataframe or series info.
- """
-
- def to_buffer(self, buf: WriteBuffer[str] | None = None) -> None:
- """Save dataframe info into buffer."""
- table_builder = self._create_table_builder()
- lines = table_builder.get_lines()
- if buf is None: # pragma: no cover
- buf = sys.stdout
- fmt.buffer_put_lines(buf, lines)
-
- @abstractmethod
- def _create_table_builder(self) -> TableBuilderAbstract:
- """Create instance of table builder."""
-
-
-class DataFrameInfoPrinter(InfoPrinterAbstract):
- """
- Class for printing dataframe info.
-
- Parameters
- ----------
- info : DataFrameInfo
- Instance of DataFrameInfo.
- max_cols : int, optional
- When to switch from the verbose to the truncated output.
- verbose : bool, optional
- Whether to print the full summary.
- show_counts : bool, optional
- Whether to show the non-null counts.
- """
-
- def __init__(
- self,
- info: DataFrameInfo,
- max_cols: int | None = None,
- verbose: bool | None = None,
- show_counts: bool | None = None,
- ) -> None:
- self.info = info
- self.data = info.data
- self.verbose = verbose
- self.max_cols = self._initialize_max_cols(max_cols)
- self.show_counts = self._initialize_show_counts(show_counts)
-
- @property
- def max_rows(self) -> int:
- """Maximum info rows to be displayed."""
- return get_option("display.max_info_rows", len(self.data) + 1)
-
- @property
- def exceeds_info_cols(self) -> bool:
- """Check if number of columns to be summarized does not exceed maximum."""
- return bool(self.col_count > self.max_cols)
-
- @property
- def exceeds_info_rows(self) -> bool:
- """Check if number of rows to be summarized does not exceed maximum."""
- return bool(len(self.data) > self.max_rows)
-
- @property
- def col_count(self) -> int:
- """Number of columns to be summarized."""
- return self.info.col_count
-
- def _initialize_max_cols(self, max_cols: int | None) -> int:
- if max_cols is None:
- return get_option("display.max_info_columns", self.col_count + 1)
- return max_cols
-
- def _initialize_show_counts(self, show_counts: bool | None) -> bool:
- if show_counts is None:
- return bool(not self.exceeds_info_cols and not self.exceeds_info_rows)
- else:
- return show_counts
-
- def _create_table_builder(self) -> DataFrameTableBuilder:
- """
- Create instance of table builder based on verbosity and display settings.
- """
- if self.verbose:
- return DataFrameTableBuilderVerbose(
- info=self.info,
- with_counts=self.show_counts,
- )
- elif self.verbose is False: # specifically set to False, not necessarily None
- return DataFrameTableBuilderNonVerbose(info=self.info)
- else:
- if self.exceeds_info_cols:
- return DataFrameTableBuilderNonVerbose(info=self.info)
- else:
- return DataFrameTableBuilderVerbose(
- info=self.info,
- with_counts=self.show_counts,
- )
-
-
-class SeriesInfoPrinter(InfoPrinterAbstract):
- """Class for printing series info.
-
- Parameters
- ----------
- info : SeriesInfo
- Instance of SeriesInfo.
- verbose : bool, optional
- Whether to print the full summary.
- show_counts : bool, optional
- Whether to show the non-null counts.
- """
-
- def __init__(
- self,
- info: SeriesInfo,
- verbose: bool | None = None,
- show_counts: bool | None = None,
- ) -> None:
- self.info = info
- self.data = info.data
- self.verbose = verbose
- self.show_counts = self._initialize_show_counts(show_counts)
-
- def _create_table_builder(self) -> SeriesTableBuilder:
- """
- Create instance of table builder based on verbosity.
- """
- if self.verbose or self.verbose is None:
- return SeriesTableBuilderVerbose(
- info=self.info,
- with_counts=self.show_counts,
- )
- else:
- return SeriesTableBuilderNonVerbose(info=self.info)
-
- def _initialize_show_counts(self, show_counts: bool | None) -> bool:
- if show_counts is None:
- return True
- else:
- return show_counts
-
-
-class TableBuilderAbstract(ABC):
- """
- Abstract builder for info table.
- """
-
- _lines: list[str]
- info: BaseInfo
-
- @abstractmethod
- def get_lines(self) -> list[str]:
- """Product in a form of list of lines (strings)."""
-
- @property
- def data(self) -> DataFrame | Series:
- return self.info.data
-
- @property
- def dtypes(self) -> Iterable[Dtype]:
- """Dtypes of each of the DataFrame's columns."""
- return self.info.dtypes
-
- @property
- def dtype_counts(self) -> Mapping[str, int]:
- """Mapping dtype - number of counts."""
- return self.info.dtype_counts
-
- @property
- def display_memory_usage(self) -> bool:
- """Whether to display memory usage."""
- return bool(self.info.memory_usage)
-
- @property
- def memory_usage_string(self) -> str:
- """Memory usage string with proper size qualifier."""
- return self.info.memory_usage_string
-
- @property
- def non_null_counts(self) -> Sequence[int]:
- return self.info.non_null_counts
-
- def add_object_type_line(self) -> None:
- """Add line with string representation of dataframe to the table."""
- self._lines.append(str(type(self.data)))
-
- def add_index_range_line(self) -> None:
- """Add line with range of indices to the table."""
- self._lines.append(self.data.index._summary())
-
- def add_dtypes_line(self) -> None:
- """Add summary line with dtypes present in dataframe."""
- collected_dtypes = [
- f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items())
- ]
- self._lines.append(f"dtypes: {', '.join(collected_dtypes)}")
-
-
-class DataFrameTableBuilder(TableBuilderAbstract):
- """
- Abstract builder for dataframe info table.
-
- Parameters
- ----------
- info : DataFrameInfo.
- Instance of DataFrameInfo.
- """
-
- def __init__(self, *, info: DataFrameInfo) -> None:
- self.info: DataFrameInfo = info
-
- def get_lines(self) -> list[str]:
- self._lines = []
- if self.col_count == 0:
- self._fill_empty_info()
- else:
- self._fill_non_empty_info()
- return self._lines
-
- def _fill_empty_info(self) -> None:
- """Add lines to the info table, pertaining to empty dataframe."""
- self.add_object_type_line()
- self.add_index_range_line()
- self._lines.append(f"Empty {type(self.data).__name__}\n")
-
- @abstractmethod
- def _fill_non_empty_info(self) -> None:
- """Add lines to the info table, pertaining to non-empty dataframe."""
-
- @property
- def data(self) -> DataFrame:
- """DataFrame."""
- return self.info.data
-
- @property
- def ids(self) -> Index:
- """Dataframe columns."""
- return self.info.ids
-
- @property
- def col_count(self) -> int:
- """Number of dataframe columns to be summarized."""
- return self.info.col_count
-
- def add_memory_usage_line(self) -> None:
- """Add line containing memory usage."""
- self._lines.append(f"memory usage: {self.memory_usage_string}")
-
-
-class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder):
- """
- Dataframe info table builder for non-verbose output.
- """
-
- def _fill_non_empty_info(self) -> None:
- """Add lines to the info table, pertaining to non-empty dataframe."""
- self.add_object_type_line()
- self.add_index_range_line()
- self.add_columns_summary_line()
- self.add_dtypes_line()
- if self.display_memory_usage:
- self.add_memory_usage_line()
-
- def add_columns_summary_line(self) -> None:
- self._lines.append(self.ids._summary(name="Columns"))
-
-
-class TableBuilderVerboseMixin(TableBuilderAbstract):
- """
- Mixin for verbose info output.
- """
-
- SPACING: str = " " * 2
- strrows: Sequence[Sequence[str]]
- gross_column_widths: Sequence[int]
- with_counts: bool
-
- @property
- @abstractmethod
- def headers(self) -> Sequence[str]:
- """Headers names of the columns in verbose table."""
-
- @property
- def header_column_widths(self) -> Sequence[int]:
- """Widths of header columns (only titles)."""
- return [len(col) for col in self.headers]
-
- def _get_gross_column_widths(self) -> Sequence[int]:
- """Get widths of columns containing both headers and actual content."""
- body_column_widths = self._get_body_column_widths()
- return [
- max(*widths)
- for widths in zip(self.header_column_widths, body_column_widths)
- ]
-
- def _get_body_column_widths(self) -> Sequence[int]:
- """Get widths of table content columns."""
- strcols: Sequence[Sequence[str]] = list(zip(*self.strrows))
- return [max(len(x) for x in col) for col in strcols]
-
- def _gen_rows(self) -> Iterator[Sequence[str]]:
- """
-        Generator function yielding row contents.
-
- Each element represents a row comprising a sequence of strings.
- """
- if self.with_counts:
- return self._gen_rows_with_counts()
- else:
- return self._gen_rows_without_counts()
-
- @abstractmethod
- def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
- """Iterator with string representation of body data with counts."""
-
- @abstractmethod
- def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
- """Iterator with string representation of body data without counts."""
-
- def add_header_line(self) -> None:
- header_line = self.SPACING.join(
- [
- _put_str(header, col_width)
- for header, col_width in zip(self.headers, self.gross_column_widths)
- ]
- )
- self._lines.append(header_line)
-
- def add_separator_line(self) -> None:
- separator_line = self.SPACING.join(
- [
- _put_str("-" * header_colwidth, gross_colwidth)
- for header_colwidth, gross_colwidth in zip(
- self.header_column_widths, self.gross_column_widths
- )
- ]
- )
- self._lines.append(separator_line)
-
- def add_body_lines(self) -> None:
- for row in self.strrows:
- body_line = self.SPACING.join(
- [
- _put_str(col, gross_colwidth)
- for col, gross_colwidth in zip(row, self.gross_column_widths)
- ]
- )
- self._lines.append(body_line)
-
- def _gen_non_null_counts(self) -> Iterator[str]:
- """Iterator with string representation of non-null counts."""
- for count in self.non_null_counts:
- yield f"{count} non-null"
-
- def _gen_dtypes(self) -> Iterator[str]:
- """Iterator with string representation of column dtypes."""
- for dtype in self.dtypes:
- yield pprint_thing(dtype)
-
-
-class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin):
- """
- Dataframe info table builder for verbose output.
- """
-
- def __init__(
- self,
- *,
- info: DataFrameInfo,
- with_counts: bool,
- ) -> None:
- self.info = info
- self.with_counts = with_counts
- self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
- self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()
-
- def _fill_non_empty_info(self) -> None:
- """Add lines to the info table, pertaining to non-empty dataframe."""
- self.add_object_type_line()
- self.add_index_range_line()
- self.add_columns_summary_line()
- self.add_header_line()
- self.add_separator_line()
- self.add_body_lines()
- self.add_dtypes_line()
- if self.display_memory_usage:
- self.add_memory_usage_line()
-
- @property
- def headers(self) -> Sequence[str]:
- """Headers names of the columns in verbose table."""
- if self.with_counts:
- return [" # ", "Column", "Non-Null Count", "Dtype"]
- return [" # ", "Column", "Dtype"]
-
- def add_columns_summary_line(self) -> None:
- self._lines.append(f"Data columns (total {self.col_count} columns):")
-
- def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
- """Iterator with string representation of body data without counts."""
- yield from zip(
- self._gen_line_numbers(),
- self._gen_columns(),
- self._gen_dtypes(),
- )
-
- def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
- """Iterator with string representation of body data with counts."""
- yield from zip(
- self._gen_line_numbers(),
- self._gen_columns(),
- self._gen_non_null_counts(),
- self._gen_dtypes(),
- )
-
- def _gen_line_numbers(self) -> Iterator[str]:
- """Iterator with string representation of column numbers."""
- for i, _ in enumerate(self.ids):
- yield f" {i}"
-
- def _gen_columns(self) -> Iterator[str]:
- """Iterator with string representation of column names."""
- for col in self.ids:
- yield pprint_thing(col)
-
-
-class SeriesTableBuilder(TableBuilderAbstract):
- """
- Abstract builder for series info table.
-
- Parameters
- ----------
- info : SeriesInfo.
- Instance of SeriesInfo.
- """
-
- def __init__(self, *, info: SeriesInfo) -> None:
- self.info: SeriesInfo = info
-
- def get_lines(self) -> list[str]:
- self._lines = []
- self._fill_non_empty_info()
- return self._lines
-
- @property
- def data(self) -> Series:
- """Series."""
- return self.info.data
-
- def add_memory_usage_line(self) -> None:
- """Add line containing memory usage."""
- self._lines.append(f"memory usage: {self.memory_usage_string}")
-
- @abstractmethod
- def _fill_non_empty_info(self) -> None:
- """Add lines to the info table, pertaining to non-empty series."""
-
-
-class SeriesTableBuilderNonVerbose(SeriesTableBuilder):
- """
- Series info table builder for non-verbose output.
- """
-
- def _fill_non_empty_info(self) -> None:
- """Add lines to the info table, pertaining to non-empty series."""
- self.add_object_type_line()
- self.add_index_range_line()
- self.add_dtypes_line()
- if self.display_memory_usage:
- self.add_memory_usage_line()
-
-
-class SeriesTableBuilderVerbose(SeriesTableBuilder, TableBuilderVerboseMixin):
- """
- Series info table builder for verbose output.
- """
-
- def __init__(
- self,
- *,
- info: SeriesInfo,
- with_counts: bool,
- ) -> None:
- self.info = info
- self.with_counts = with_counts
- self.strrows: Sequence[Sequence[str]] = list(self._gen_rows())
- self.gross_column_widths: Sequence[int] = self._get_gross_column_widths()
-
- def _fill_non_empty_info(self) -> None:
- """Add lines to the info table, pertaining to non-empty series."""
- self.add_object_type_line()
- self.add_index_range_line()
- self.add_series_name_line()
- self.add_header_line()
- self.add_separator_line()
- self.add_body_lines()
- self.add_dtypes_line()
- if self.display_memory_usage:
- self.add_memory_usage_line()
-
- def add_series_name_line(self) -> None:
- self._lines.append(f"Series name: {self.data.name}")
-
- @property
- def headers(self) -> Sequence[str]:
- """Headers names of the columns in verbose table."""
- if self.with_counts:
- return ["Non-Null Count", "Dtype"]
- return ["Dtype"]
-
- def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
- """Iterator with string representation of body data without counts."""
- yield from self._gen_dtypes()
-
- def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
- """Iterator with string representation of body data with counts."""
- yield from zip(
- self._gen_non_null_counts(),
- self._gen_dtypes(),
- )
-
-
-def _get_dataframe_dtype_counts(df: DataFrame) -> Mapping[str, int]:
- """
- Create mapping between datatypes and their number of occurrences.
- """
- # groupby dtype.name to collect e.g. Categorical columns
- return df.dtypes.value_counts().groupby(lambda x: x.name).sum()
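A minimal sketch of how the classes above fit together, using only the signatures defined in this file (in practice this path is reached via DataFrame.info()):

    import io
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, None], "b": ["x", "y", "z"]})
    buf = io.StringIO()
    # DataFrameInfo gathers dtype/memory facts; render() delegates to
    # DataFrameInfoPrinter, which picks a verbose or non-verbose table builder.
    DataFrameInfo(data=df, memory_usage="deep").render(
        buf=buf, max_cols=None, verbose=True, show_counts=True
    )
    print(buf.getvalue())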
diff --git a/contrib/python/pandas/py3/pandas/io/formats/latex.py b/contrib/python/pandas/py3/pandas/io/formats/latex.py
deleted file mode 100644
index a97f3d4ef54..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/latex.py
+++ /dev/null
@@ -1,831 +0,0 @@
-"""
-Module for formatting output data in Latex.
-"""
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-from typing import (
- TYPE_CHECKING,
- Iterator,
- Sequence,
-)
-
-import numpy as np
-
-from pandas.core.dtypes.generic import ABCMultiIndex
-
-if TYPE_CHECKING:
- from pandas.io.formats.format import DataFrameFormatter
-
-
-def _split_into_full_short_caption(
- caption: str | tuple[str, str] | None
-) -> tuple[str, str]:
- """Extract full and short captions from caption string/tuple.
-
- Parameters
- ----------
- caption : str or tuple, optional
- Either table caption string or tuple (full_caption, short_caption).
-        If a string is provided, it is treated as the full table caption,
-        and short_caption is set to an empty string.
-
- Returns
- -------
- full_caption, short_caption : tuple
- Tuple of full_caption, short_caption strings.
- """
- if caption:
- if isinstance(caption, str):
- full_caption = caption
- short_caption = ""
- else:
- try:
- full_caption, short_caption = caption
- except ValueError as err:
- msg = "caption must be either a string or a tuple of two strings"
- raise ValueError(msg) from err
- else:
- full_caption = ""
- short_caption = ""
- return full_caption, short_caption
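A small sketch of the helper above:

    _split_into_full_short_caption("Sales by region")
    # -> ("Sales by region", "")
    _split_into_full_short_caption(("Sales by region", "Sales"))
    # -> ("Sales by region", "Sales")
    _split_into_full_short_caption(None)
    # -> ("", "")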
-
-
-class RowStringConverter:
- r"""Converter for dataframe rows into LaTeX strings.
-
- Parameters
- ----------
- formatter : `DataFrameFormatter`
- Instance of `DataFrameFormatter`.
- multicolumn: bool, optional
- Whether to use \multicolumn macro.
- multicolumn_format: str, optional
- Multicolumn format.
- multirow: bool, optional
- Whether to use \multirow macro.
-
- """
-
- def __init__(
- self,
- formatter: DataFrameFormatter,
- multicolumn: bool = False,
- multicolumn_format: str | None = None,
- multirow: bool = False,
- ) -> None:
- self.fmt = formatter
- self.frame = self.fmt.frame
- self.multicolumn = multicolumn
- self.multicolumn_format = multicolumn_format
- self.multirow = multirow
- self.clinebuf: list[list[int]] = []
- self.strcols = self._get_strcols()
- self.strrows = list(zip(*self.strcols))
-
- def get_strrow(self, row_num: int) -> str:
- """Get string representation of the row."""
- row = self.strrows[row_num]
-
- is_multicol = (
- row_num < self.column_levels and self.fmt.header and self.multicolumn
- )
-
- is_multirow = (
- row_num >= self.header_levels
- and self.fmt.index
- and self.multirow
- and self.index_levels > 1
- )
-
- is_cline_maybe_required = is_multirow and row_num < len(self.strrows) - 1
-
- crow = self._preprocess_row(row)
-
- if is_multicol:
- crow = self._format_multicolumn(crow)
- if is_multirow:
- crow = self._format_multirow(crow, row_num)
-
- lst = []
- lst.append(" & ".join(crow))
- lst.append(" \\\\")
- if is_cline_maybe_required:
- cline = self._compose_cline(row_num, len(self.strcols))
- lst.append(cline)
- return "".join(lst)
-
- @property
- def _header_row_num(self) -> int:
- """Number of rows in header."""
- return self.header_levels if self.fmt.header else 0
-
- @property
- def index_levels(self) -> int:
- """Integer number of levels in index."""
- return self.frame.index.nlevels
-
- @property
- def column_levels(self) -> int:
- return self.frame.columns.nlevels
-
- @property
- def header_levels(self) -> int:
- nlevels = self.column_levels
- if self.fmt.has_index_names and self.fmt.show_index_names:
- nlevels += 1
- return nlevels
-
- def _get_strcols(self) -> list[list[str]]:
- """String representation of the columns."""
- if self.fmt.frame.empty:
- strcols = [[self._empty_info_line]]
- else:
- strcols = self.fmt.get_strcols()
-
- # reestablish the MultiIndex that has been joined by get_strcols()
- if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):
- out = self.frame.index.format(
- adjoin=False,
- sparsify=self.fmt.sparsify,
- names=self.fmt.has_index_names,
- na_rep=self.fmt.na_rep,
- )
-
- # index.format will sparsify repeated entries with empty strings
- # so pad these with some empty space
- def pad_empties(x):
- for pad in reversed(x):
- if pad:
- return [x[0]] + [i if i else " " * len(pad) for i in x[1:]]
-
- gen = (pad_empties(i) for i in out)
-
- # Add empty spaces for each column level
- clevels = self.frame.columns.nlevels
- out = [[" " * len(i[-1])] * clevels + i for i in gen]
-
- # Add the column names to the last index column
- cnames = self.frame.columns.names
- if any(cnames):
- new_names = [i if i else "{}" for i in cnames]
- out[self.frame.index.nlevels - 1][:clevels] = new_names
-
- # Get rid of old multiindex column and add new ones
- strcols = out + strcols[1:]
- return strcols
-
- @property
- def _empty_info_line(self) -> str:
- return (
- f"Empty {type(self.frame).__name__}\n"
- f"Columns: {self.frame.columns}\n"
- f"Index: {self.frame.index}"
- )
-
- def _preprocess_row(self, row: Sequence[str]) -> list[str]:
- """Preprocess elements of the row."""
- if self.fmt.escape:
- crow = _escape_symbols(row)
- else:
- crow = [x if x else "{}" for x in row]
- if self.fmt.bold_rows and self.fmt.index:
- crow = _convert_to_bold(crow, self.index_levels)
- return crow
-
- def _format_multicolumn(self, row: list[str]) -> list[str]:
- r"""
-        Combine columns belonging to a group into a single multicolumn entry
-        according to self.multicolumn_format.
-
- e.g.:
- a & & & b & c &
- will become
- \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c}
- """
- row2 = row[: self.index_levels]
- ncol = 1
- coltext = ""
-
- def append_col() -> None:
- # write multicolumn if needed
- if ncol > 1:
- row2.append(
- f"\\multicolumn{{{ncol:d}}}{{{self.multicolumn_format}}}"
- f"{{{coltext.strip()}}}"
- )
- # don't modify where not needed
- else:
- row2.append(coltext)
-
- for c in row[self.index_levels :]:
- # if next col has text, write the previous
- if c.strip():
- if coltext:
- append_col()
- coltext = c
- ncol = 1
- # if not, add it to the previous multicolumn
- else:
- ncol += 1
- # write last column name
- if coltext:
- append_col()
- return row2
-
- def _format_multirow(self, row: list[str], i: int) -> list[str]:
- r"""
-        Check the following rows to decide whether this row should be a multirow.
-
- e.g.: becomes:
- a & 0 & \multirow{2}{*}{a} & 0 &
- & 1 & & 1 &
- b & 0 & \cline{1-2}
- b & 0 &
- """
- for j in range(self.index_levels):
- if row[j].strip():
- nrow = 1
- for r in self.strrows[i + 1 :]:
- if not r[j].strip():
- nrow += 1
- else:
- break
- if nrow > 1:
- # overwrite non-multirow entry
- row[j] = f"\\multirow{{{nrow:d}}}{{*}}{{{row[j].strip()}}}"
- # save when to end the current block with \cline
- self.clinebuf.append([i + nrow - 1, j + 1])
- return row
-
- def _compose_cline(self, i: int, icol: int) -> str:
- """
- Create clines after multirow-blocks are finished.
- """
- lst = []
- for cl in self.clinebuf:
- if cl[0] == i:
- lst.append(f"\n\\cline{{{cl[1]:d}-{icol:d}}}")
- # remove entries that have been written to buffer
- self.clinebuf = [x for x in self.clinebuf if x[0] != i]
- return "".join(lst)
-
-
-class RowStringIterator(RowStringConverter):
- """Iterator over rows of the header or the body of the table."""
-
- @abstractmethod
- def __iter__(self) -> Iterator[str]:
- """Iterate over LaTeX string representations of rows."""
-
-
-class RowHeaderIterator(RowStringIterator):
- """Iterator for the table header rows."""
-
- def __iter__(self) -> Iterator[str]:
- for row_num in range(len(self.strrows)):
- if row_num < self._header_row_num:
- yield self.get_strrow(row_num)
-
-
-class RowBodyIterator(RowStringIterator):
- """Iterator for the table body rows."""
-
- def __iter__(self) -> Iterator[str]:
- for row_num in range(len(self.strrows)):
- if row_num >= self._header_row_num:
- yield self.get_strrow(row_num)
-
-
-class TableBuilderAbstract(ABC):
- """
- Abstract table builder producing string representation of LaTeX table.
-
- Parameters
- ----------
- formatter : `DataFrameFormatter`
- Instance of `DataFrameFormatter`.
- column_format: str, optional
- Column format, for example, 'rcl' for three columns.
- multicolumn: bool, optional
- Use multicolumn to enhance MultiIndex columns.
- multicolumn_format: str, optional
- The alignment for multicolumns, similar to column_format.
- multirow: bool, optional
- Use multirow to enhance MultiIndex rows.
- caption: str, optional
- Table caption.
- short_caption: str, optional
- Table short caption.
- label: str, optional
- LaTeX label.
- position: str, optional
- Float placement specifier, for example, 'htb'.
- """
-
- def __init__(
- self,
- formatter: DataFrameFormatter,
- column_format: str | None = None,
- multicolumn: bool = False,
- multicolumn_format: str | None = None,
- multirow: bool = False,
- caption: str | None = None,
- short_caption: str | None = None,
- label: str | None = None,
- position: str | None = None,
- ) -> None:
- self.fmt = formatter
- self.column_format = column_format
- self.multicolumn = multicolumn
- self.multicolumn_format = multicolumn_format
- self.multirow = multirow
- self.caption = caption
- self.short_caption = short_caption
- self.label = label
- self.position = position
-
- def get_result(self) -> str:
- """String representation of LaTeX table."""
- elements = [
- self.env_begin,
- self.top_separator,
- self.header,
- self.middle_separator,
- self.env_body,
- self.bottom_separator,
- self.env_end,
- ]
- result = "\n".join([item for item in elements if item])
- trailing_newline = "\n"
- result += trailing_newline
- return result
-
- @property
- @abstractmethod
- def env_begin(self) -> str:
- """Beginning of the environment."""
-
- @property
- @abstractmethod
- def top_separator(self) -> str:
- """Top level separator."""
-
- @property
- @abstractmethod
- def header(self) -> str:
- """Header lines."""
-
- @property
- @abstractmethod
- def middle_separator(self) -> str:
- """Middle level separator."""
-
- @property
- @abstractmethod
- def env_body(self) -> str:
- """Environment body."""
-
- @property
- @abstractmethod
- def bottom_separator(self) -> str:
- """Bottom level separator."""
-
- @property
- @abstractmethod
- def env_end(self) -> str:
- """End of the environment."""
-
-
-class GenericTableBuilder(TableBuilderAbstract):
- """Table builder producing string representation of LaTeX table."""
-
- @property
- def header(self) -> str:
- iterator = self._create_row_iterator(over="header")
- return "\n".join(list(iterator))
-
- @property
- def top_separator(self) -> str:
- return "\\toprule"
-
- @property
- def middle_separator(self) -> str:
- return "\\midrule" if self._is_separator_required() else ""
-
- @property
- def env_body(self) -> str:
- iterator = self._create_row_iterator(over="body")
- return "\n".join(list(iterator))
-
- def _is_separator_required(self) -> bool:
- return bool(self.header and self.env_body)
-
- @property
- def _position_macro(self) -> str:
- r"""Position macro, extracted from self.position, like [h]."""
- return f"[{self.position}]" if self.position else ""
-
- @property
- def _caption_macro(self) -> str:
- r"""Caption macro, extracted from self.caption.
-
- With short caption:
- \caption[short_caption]{caption_string}.
-
- Without short caption:
- \caption{caption_string}.
- """
- if self.caption:
- return "".join(
- [
- r"\caption",
- f"[{self.short_caption}]" if self.short_caption else "",
- f"{{{self.caption}}}",
- ]
- )
- return ""
-
- @property
- def _label_macro(self) -> str:
- r"""Label macro, extracted from self.label, like \label{ref}."""
- return f"\\label{{{self.label}}}" if self.label else ""
-
- def _create_row_iterator(self, over: str) -> RowStringIterator:
- """Create iterator over header or body of the table.
-
- Parameters
- ----------
- over : {'body', 'header'}
- Over what to iterate.
-
- Returns
- -------
- RowStringIterator
- Iterator over body or header.
- """
- iterator_kind = self._select_iterator(over)
- return iterator_kind(
- formatter=self.fmt,
- multicolumn=self.multicolumn,
- multicolumn_format=self.multicolumn_format,
- multirow=self.multirow,
- )
-
- def _select_iterator(self, over: str) -> type[RowStringIterator]:
- """Select proper iterator over table rows."""
- if over == "header":
- return RowHeaderIterator
- elif over == "body":
- return RowBodyIterator
- else:
- msg = f"'over' must be either 'header' or 'body', but {over} was provided"
- raise ValueError(msg)
-
-
-class LongTableBuilder(GenericTableBuilder):
- """Concrete table builder for longtable.
-
- >>> from pandas.io.formats import format as fmt
- >>> df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
- >>> formatter = fmt.DataFrameFormatter(df)
- >>> builder = LongTableBuilder(formatter, caption='a long table',
- ... label='tab:long', column_format='lrl')
- >>> table = builder.get_result()
- >>> print(table)
- \\begin{longtable}{lrl}
- \\caption{a long table}
- \\label{tab:long}\\\\
- \\toprule
- {} & a & b \\\\
- \\midrule
- \\endfirsthead
- \\caption[]{a long table} \\\\
- \\toprule
- {} & a & b \\\\
- \\midrule
- \\endhead
- \\midrule
- \\multicolumn{3}{r}{{Continued on next page}} \\\\
- \\midrule
- \\endfoot
- <BLANKLINE>
- \\bottomrule
- \\endlastfoot
- 0 & 1 & b1 \\\\
- 1 & 2 & b2 \\\\
- \\end{longtable}
- <BLANKLINE>
- """
-
- @property
- def env_begin(self) -> str:
- first_row = (
- f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}"
- )
- elements = [first_row, f"{self._caption_and_label()}"]
- return "\n".join([item for item in elements if item])
-
- def _caption_and_label(self) -> str:
- if self.caption or self.label:
- double_backslash = "\\\\"
- elements = [f"{self._caption_macro}", f"{self._label_macro}"]
- caption_and_label = "\n".join([item for item in elements if item])
- caption_and_label += double_backslash
- return caption_and_label
- else:
- return ""
-
- @property
- def middle_separator(self) -> str:
- iterator = self._create_row_iterator(over="header")
-
- # the content between \endfirsthead and \endhead commands
- # mitigates repeated List of Tables entries in the final LaTeX
- # document when dealing with longtable environments; GH #34360
- elements = [
- "\\midrule",
- "\\endfirsthead",
- f"\\caption[]{{{self.caption}}} \\\\" if self.caption else "",
- self.top_separator,
- self.header,
- "\\midrule",
- "\\endhead",
- "\\midrule",
- f"\\multicolumn{{{len(iterator.strcols)}}}{{r}}"
- "{{Continued on next page}} \\\\",
- "\\midrule",
- "\\endfoot\n",
- "\\bottomrule",
- "\\endlastfoot",
- ]
- if self._is_separator_required():
- return "\n".join(elements)
- return ""
-
- @property
- def bottom_separator(self) -> str:
- return ""
-
- @property
- def env_end(self) -> str:
- return "\\end{longtable}"
-
-
-class RegularTableBuilder(GenericTableBuilder):
- """Concrete table builder for regular table.
-
- >>> from pandas.io.formats import format as fmt
- >>> df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
- >>> formatter = fmt.DataFrameFormatter(df)
- >>> builder = RegularTableBuilder(formatter, caption='caption', label='lab',
- ... column_format='lrc')
- >>> table = builder.get_result()
- >>> print(table)
- \\begin{table}
- \\centering
- \\caption{caption}
- \\label{lab}
- \\begin{tabular}{lrc}
- \\toprule
- {} & a & b \\\\
- \\midrule
- 0 & 1 & b1 \\\\
- 1 & 2 & b2 \\\\
- \\bottomrule
- \\end{tabular}
- \\end{table}
- <BLANKLINE>
- """
-
- @property
- def env_begin(self) -> str:
- elements = [
- f"\\begin{{table}}{self._position_macro}",
- "\\centering",
- f"{self._caption_macro}",
- f"{self._label_macro}",
- f"\\begin{{tabular}}{{{self.column_format}}}",
- ]
- return "\n".join([item for item in elements if item])
-
- @property
- def bottom_separator(self) -> str:
- return "\\bottomrule"
-
- @property
- def env_end(self) -> str:
- return "\n".join(["\\end{tabular}", "\\end{table}"])
-
-
-class TabularBuilder(GenericTableBuilder):
- """Concrete table builder for tabular environment.
-
- >>> from pandas.io.formats import format as fmt
- >>> df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
- >>> formatter = fmt.DataFrameFormatter(df)
- >>> builder = TabularBuilder(formatter, column_format='lrc')
- >>> table = builder.get_result()
- >>> print(table)
- \\begin{tabular}{lrc}
- \\toprule
- {} & a & b \\\\
- \\midrule
- 0 & 1 & b1 \\\\
- 1 & 2 & b2 \\\\
- \\bottomrule
- \\end{tabular}
- <BLANKLINE>
- """
-
- @property
- def env_begin(self) -> str:
- return f"\\begin{{tabular}}{{{self.column_format}}}"
-
- @property
- def bottom_separator(self) -> str:
- return "\\bottomrule"
-
- @property
- def env_end(self) -> str:
- return "\\end{tabular}"
-
-
-class LatexFormatter:
- r"""
- Used to render a DataFrame to a LaTeX tabular/longtable environment output.
-
- Parameters
- ----------
- formatter : `DataFrameFormatter`
- longtable : bool, default False
- Use longtable environment.
- column_format : str, default None
-        The columns format as specified in `LaTeX table format
-        <https://en.wikibooks.org/wiki/LaTeX/Tables>`__, e.g. 'rcl' for 3 columns.
- multicolumn : bool, default False
- Use \multicolumn to enhance MultiIndex columns.
- multicolumn_format : str, default 'l'
- The alignment for multicolumns, similar to `column_format`
- multirow : bool, default False
- Use \multirow to enhance MultiIndex rows.
- caption : str or tuple, optional
- Tuple (full_caption, short_caption),
- which results in \caption[short_caption]{full_caption};
- if a single string is passed, no short caption will be set.
- label : str, optional
- The LaTeX label to be placed inside ``\label{}`` in the output.
- position : str, optional
- The LaTeX positional argument for tables, to be placed after
- ``\begin{}`` in the output.
-
- See Also
- --------
- HTMLFormatter
- """
-
- def __init__(
- self,
- formatter: DataFrameFormatter,
- longtable: bool = False,
- column_format: str | None = None,
- multicolumn: bool = False,
- multicolumn_format: str | None = None,
- multirow: bool = False,
- caption: str | tuple[str, str] | None = None,
- label: str | None = None,
- position: str | None = None,
- ) -> None:
- self.fmt = formatter
- self.frame = self.fmt.frame
- self.longtable = longtable
- self.column_format = column_format
- self.multicolumn = multicolumn
- self.multicolumn_format = multicolumn_format
- self.multirow = multirow
- self.caption, self.short_caption = _split_into_full_short_caption(caption)
- self.label = label
- self.position = position
-
- def to_string(self) -> str:
- """
- Render a DataFrame to a LaTeX tabular, longtable, or table/tabular
- environment output.
- """
- return self.builder.get_result()
-
- @property
- def builder(self) -> TableBuilderAbstract:
- """Concrete table builder.
-
- Returns
- -------
- TableBuilder
- """
- builder = self._select_builder()
- return builder(
- formatter=self.fmt,
- column_format=self.column_format,
- multicolumn=self.multicolumn,
- multicolumn_format=self.multicolumn_format,
- multirow=self.multirow,
- caption=self.caption,
- short_caption=self.short_caption,
- label=self.label,
- position=self.position,
- )
-
- def _select_builder(self) -> type[TableBuilderAbstract]:
- """Select proper table builder."""
- if self.longtable:
- return LongTableBuilder
- if any([self.caption, self.label, self.position]):
- return RegularTableBuilder
- return TabularBuilder
-
- @property
- def column_format(self) -> str | None:
- """Column format."""
- return self._column_format
-
- @column_format.setter
- def column_format(self, input_column_format: str | None) -> None:
- """Setter for column format."""
- if input_column_format is None:
- self._column_format = (
- self._get_index_format() + self._get_column_format_based_on_dtypes()
- )
- elif not isinstance(input_column_format, str):
- raise ValueError(
- f"column_format must be str or unicode, "
- f"not {type(input_column_format)}"
- )
- else:
- self._column_format = input_column_format
-
- def _get_column_format_based_on_dtypes(self) -> str:
- """Get column format based on data type.
-
-        Right alignment for numbers and left alignment for strings.
- """
-
- def get_col_type(dtype) -> str:
- if issubclass(dtype.type, np.number):
- return "r"
- return "l"
-
- dtypes = self.frame.dtypes._values
- return "".join(map(get_col_type, dtypes))
-
- def _get_index_format(self) -> str:
- """Get index column format."""
- return "l" * self.frame.index.nlevels if self.fmt.index else ""
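-# A minimal usage sketch of the formatter above. LatexFormatter is internal
-# machinery, so the call pattern below is assumed from the builder doctests in
-# this module rather than a public API; DataFrame.to_latex historically routed
-# through it.
-#
-#     import pandas as pd
-#     from pandas.io.formats import format as fmt
-#     df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
-#     latex = LatexFormatter(fmt.DataFrameFormatter(df), longtable=True).to_string()
-#     print(latex)  # emits a \begin{longtable} ... \end{longtable} block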
-
-
-def _escape_symbols(row: Sequence[str]) -> list[str]:
- """Carry out string replacements for special symbols.
-
- Parameters
- ----------
- row : list
-        List of strings that may contain special symbols.
-
- Returns
- -------
- list
- list of strings with the special symbols replaced.
- """
- return [
- (
- x.replace("\\", "\\textbackslash ")
- .replace("_", "\\_")
- .replace("%", "\\%")
- .replace("$", "\\$")
- .replace("#", "\\#")
- .replace("{", "\\{")
- .replace("}", "\\}")
- .replace("~", "\\textasciitilde ")
- .replace("^", "\\textasciicircum ")
- .replace("&", "\\&")
- if (x and x != "{}")
- else "{}"
- )
- for x in row
- ]
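-# Illustrative values for the replacements above (computed here, not taken from
-# the original module): _escape_symbols(["50%", "a_b", ""]) returns
-# ["50\\%", "a\\_b", "{}"] -- empty cells are normalised to "{}".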
-
-
-def _convert_to_bold(crow: Sequence[str], ilevels: int) -> list[str]:
- """Convert elements in ``crow`` to bold."""
- return [
- f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x
- for j, x in enumerate(crow)
- ]
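-# Illustrative call (not from the original module): _convert_to_bold(["a", "1"], 1)
-# returns ["\\textbf{a}", "1"]; only the first ``ilevels`` entries are bolded.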
-
-
-if __name__ == "__main__":
- import doctest
-
- doctest.testmod()
diff --git a/contrib/python/pandas/py3/pandas/io/formats/printing.py b/contrib/python/pandas/py3/pandas/io/formats/printing.py
deleted file mode 100644
index 1f0341a6082..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/printing.py
+++ /dev/null
@@ -1,504 +0,0 @@
-"""
-Printing tools.
-"""
-from __future__ import annotations
-
-import sys
-from typing import (
- Any,
- Callable,
- Dict,
- Iterable,
- Mapping,
- Sequence,
- TypeVar,
- Union,
-)
-
-from pandas._config import get_option
-
-from pandas.core.dtypes.inference import is_sequence
-
-EscapeChars = Union[Mapping[str, str], Iterable[str]]
-_KT = TypeVar("_KT")
-_VT = TypeVar("_VT")
-
-
-def adjoin(space: int, *lists: list[str], **kwargs) -> str:
- """
- Glues together two sets of strings using the amount of space requested.
- The idea is to prettify.
-
-    Parameters
-    ----------
-    space : int
-        number of spaces for padding
-    lists : str
-        list of strs which are being joined
-    strlen : callable
-        function used to calculate the length of each str. Needed for unicode
-        handling.
-    justfunc : callable
-        function used to justify each str. Needed for unicode handling.
- """
- strlen = kwargs.pop("strlen", len)
- justfunc = kwargs.pop("justfunc", justify)
-
- out_lines = []
- newLists = []
- lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
-    # the last column gets no trailing padding
- lengths.append(max(map(len, lists[-1])))
- maxLen = max(map(len, lists))
- for i, lst in enumerate(lists):
- nl = justfunc(lst, lengths[i], mode="left")
- nl = ([" " * lengths[i]] * (maxLen - len(lst))) + nl
- newLists.append(nl)
- toJoin = zip(*newLists)
- for lines in toJoin:
- out_lines.append("".join(lines))
- return "\n".join(out_lines)
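-# A worked example of the padding above (illustrative, not from the original
-# module): adjoin(2, ["a", "bb"], ["ccc", "d"]) left-justifies every column but
-# the last to its max width plus ``space`` and returns "a   ccc\nbb  d  ".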
-
-
-def justify(texts: Iterable[str], max_len: int, mode: str = "right") -> list[str]:
- """
- Perform ljust, center, rjust against string or list-like
- """
- if mode == "left":
- return [x.ljust(max_len) for x in texts]
- elif mode == "center":
- return [x.center(max_len) for x in texts]
- else:
- return [x.rjust(max_len) for x in texts]
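-# e.g. justify(["a", "bb"], 3) right-justifies to ["  a", " bb"] (illustrative).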
-
-
-# Unicode consolidation
-# ---------------------
-#
-# pprinting utility functions for generating Unicode text or
-# bytes(3.x)/str(2.x) representations of objects.
-# Try to use these as much as possible rather than rolling your own.
-#
-# When to use
-# -----------
-#
-# 1) If you're writing code internal to pandas (no I/O directly involved),
-# use pprint_thing().
-#
-#    It will always return unicode text which can be handled by other
-#    parts of the package without breakage.
-#
-# 2) if you need to write something out to file, use
-# pprint_thing_encoded(encoding).
-#
-# If no encoding is specified, it defaults to utf-8. Since encoding pure
-# ascii with utf-8 is a no-op you can safely use the default utf-8 if you're
-# working with straight ascii.
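-# For example (illustrative, using the helpers defined below):
-#
-#     pprint_thing({"a": [1, 2]})          # -> "{'a': [1, 2]}"
-#     pprint_thing_encoded({"a": [1, 2]})  # -> b"{'a': [1, 2]}"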
-
-
-def _pprint_seq(
- seq: Sequence, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds
-) -> str:
- """
-    Internal pretty-printer for iterables. You should probably use pprint_thing()
-    rather than calling this directly.
-
-    Bounds the length of the printed sequence, depending on options.
- """
- if isinstance(seq, set):
- fmt = "{{{body}}}"
- else:
- fmt = "[{body}]" if hasattr(seq, "__setitem__") else "({body})"
-
- if max_seq_items is False:
- nitems = len(seq)
- else:
- nitems = max_seq_items or get_option("max_seq_items") or len(seq)
-
- s = iter(seq)
- # handle sets, no slicing
- r = [
- pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
- for i in range(min(nitems, len(seq)))
- ]
- body = ", ".join(r)
-
- if nitems < len(seq):
- body += ", ..."
- elif isinstance(seq, tuple) and len(seq) == 1:
- body += ","
-
- return fmt.format(body=body)
-
-
-def _pprint_dict(
- seq: Mapping, _nest_lvl: int = 0, max_seq_items: int | None = None, **kwds
-) -> str:
- """
-    Internal pretty-printer for mappings. You should probably use pprint_thing()
-    rather than calling this directly.
- """
- fmt = "{{{things}}}"
- pairs = []
-
- pfmt = "{key}: {val}"
-
- if max_seq_items is False:
- nitems = len(seq)
- else:
- nitems = max_seq_items or get_option("max_seq_items") or len(seq)
-
- for k, v in list(seq.items())[:nitems]:
- pairs.append(
- pfmt.format(
- key=pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
- val=pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds),
- )
- )
-
- if nitems < len(seq):
- return fmt.format(things=", ".join(pairs) + ", ...")
- else:
- return fmt.format(things=", ".join(pairs))
-
-
-def pprint_thing(
- thing: Any,
- _nest_lvl: int = 0,
- escape_chars: EscapeChars | None = None,
- default_escapes: bool = False,
- quote_strings: bool = False,
- max_seq_items: int | None = None,
-) -> str:
- """
- This function is the sanctioned way of converting objects
- to a string representation and properly handles nested sequences.
-
- Parameters
- ----------
- thing : anything to be formatted
-    _nest_lvl : internal use only. pprint_thing() is mutually-recursive
-        with _pprint_seq; this argument is used to keep track of the
-        current nesting level and to limit it.
-    escape_chars : list or dict, optional
-        Characters to escape. If a dict is passed the values are the
-        replacements.
-    default_escapes : bool, default False
-        Whether the input escape characters replace or add to the defaults.
- max_seq_items : int or None, default None
- Pass through to other pretty printers to limit sequence printing
-
- Returns
- -------
- str
- """
-
- def as_escaped_string(
- thing: Any, escape_chars: EscapeChars | None = escape_chars
- ) -> str:
- translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"}
- if isinstance(escape_chars, dict):
- if default_escapes:
- translate.update(escape_chars)
- else:
- translate = escape_chars
- escape_chars = list(escape_chars.keys())
- else:
- escape_chars = escape_chars or ()
-
- result = str(thing)
- for c in escape_chars:
- result = result.replace(c, translate[c])
- return result
-
- if hasattr(thing, "__next__"):
- return str(thing)
- elif isinstance(thing, dict) and _nest_lvl < get_option(
- "display.pprint_nest_depth"
- ):
- result = _pprint_dict(
- thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
- )
- elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
- result = _pprint_seq(
- thing,
- _nest_lvl,
- escape_chars=escape_chars,
- quote_strings=quote_strings,
- max_seq_items=max_seq_items,
- )
- elif isinstance(thing, str) and quote_strings:
- result = f"'{as_escaped_string(thing)}'"
- else:
- result = as_escaped_string(thing)
-
- return result
-
-
-def pprint_thing_encoded(
- object, encoding: str = "utf-8", errors: str = "replace"
-) -> bytes:
- value = pprint_thing(object) # get unicode representation of object
- return value.encode(encoding, errors)
-
-
-def enable_data_resource_formatter(enable: bool) -> None:
- if "IPython" not in sys.modules:
- # definitely not in IPython
- return
- from IPython import get_ipython
-
- ip = get_ipython()
- if ip is None:
- # still not in IPython
- return
-
- formatters = ip.display_formatter.formatters
- mimetype = "application/vnd.dataresource+json"
-
- if enable:
- if mimetype not in formatters:
- # define tableschema formatter
- from IPython.core.formatters import BaseFormatter
- from traitlets import ObjectName
-
- class TableSchemaFormatter(BaseFormatter):
- print_method = ObjectName("_repr_data_resource_")
- _return_type = (dict,)
-
- # register it:
- formatters[mimetype] = TableSchemaFormatter()
- # enable it if it's been disabled:
- formatters[mimetype].enabled = True
- else:
- # unregister tableschema mime-type
- if mimetype in formatters:
- formatters[mimetype].enabled = False
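-# Usage note (assumed wiring, not restated from this module): pandas historically
-# toggles this via the "display.html.table_schema" option, so calling
-# enable_data_resource_formatter(True) outside an IPython shell is a no-op thanks
-# to the early returns above.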
-
-
-def default_pprint(thing: Any, max_seq_items: int | None = None) -> str:
- return pprint_thing(
- thing,
- escape_chars=("\t", "\r", "\n"),
- quote_strings=True,
- max_seq_items=max_seq_items,
- )
-
-
-def format_object_summary(
- obj,
- formatter: Callable,
- is_justify: bool = True,
- name: str | None = None,
- indent_for_name: bool = True,
- line_break_each_value: bool = False,
-) -> str:
- """
- Return the formatted obj as a unicode string
-
- Parameters
- ----------
- obj : object
- must be iterable and support __getitem__
- formatter : callable
- string formatter for an element
- is_justify : bool
- should justify the display
-    name : str, optional
- defaults to the class name of the obj
- indent_for_name : bool, default True
- Whether subsequent lines should be indented to
- align with the name.
- line_break_each_value : bool, default False
- If True, inserts a line break for each value of ``obj``.
-        If False, only break lines when a line of values gets wider
-        than the display width.
-
- Returns
- -------
- summary string
- """
- from pandas.io.formats.console import get_console_size
- from pandas.io.formats.format import get_adjustment
-
- display_width, _ = get_console_size()
- if display_width is None:
- display_width = get_option("display.width") or 80
- if name is None:
- name = type(obj).__name__
-
- if indent_for_name:
- name_len = len(name)
- space1 = f'\n{(" " * (name_len + 1))}'
- space2 = f'\n{(" " * (name_len + 2))}'
- else:
- space1 = "\n"
- space2 = "\n " # space for the opening '['
-
- n = len(obj)
- if line_break_each_value:
- # If we want to vertically align on each value of obj, we need to
- # separate values by a line break and indent the values
- sep = ",\n " + " " * len(name)
- else:
- sep = ","
- max_seq_items = get_option("display.max_seq_items") or n
-
- # are we a truncated display
- is_truncated = n > max_seq_items
-
-    # adj can optionally handle unicode East Asian width
- adj = get_adjustment()
-
- def _extend_line(
- s: str, line: str, value: str, display_width: int, next_line_prefix: str
- ) -> tuple[str, str]:
- if adj.len(line.rstrip()) + adj.len(value.rstrip()) >= display_width:
- s += line.rstrip()
- line = next_line_prefix
- line += value
- return s, line
-
- def best_len(values: list[str]) -> int:
- if values:
- return max(adj.len(x) for x in values)
- else:
- return 0
-
- close = ", "
-
- if n == 0:
- summary = f"[]{close}"
- elif n == 1 and not line_break_each_value:
- first = formatter(obj[0])
- summary = f"[{first}]{close}"
- elif n == 2 and not line_break_each_value:
- first = formatter(obj[0])
- last = formatter(obj[-1])
- summary = f"[{first}, {last}]{close}"
- else:
- if max_seq_items == 1:
- # If max_seq_items=1 show only last element
- head = []
- tail = [formatter(x) for x in obj[-1:]]
- elif n > max_seq_items:
- n = min(max_seq_items // 2, 10)
- head = [formatter(x) for x in obj[:n]]
- tail = [formatter(x) for x in obj[-n:]]
- else:
- head = []
- tail = [formatter(x) for x in obj]
-
- # adjust all values to max length if needed
- if is_justify:
- if line_break_each_value:
- # Justify each string in the values of head and tail, so the
- # strings will right align when head and tail are stacked
- # vertically.
- head, tail = _justify(head, tail)
- elif is_truncated or not (
- len(", ".join(head)) < display_width
- and len(", ".join(tail)) < display_width
- ):
- # Each string in head and tail should align with each other
- max_length = max(best_len(head), best_len(tail))
- head = [x.rjust(max_length) for x in head]
- tail = [x.rjust(max_length) for x in tail]
- # If we are not truncated and we are only a single
- # line, then don't justify
-
- if line_break_each_value:
- # Now head and tail are of type List[Tuple[str]]. Below we
- # convert them into List[str], so there will be one string per
- # value. Also truncate items horizontally if wider than
- # max_space
- max_space = display_width - len(space2)
- value = tail[0]
- for max_items in reversed(range(1, len(value) + 1)):
- pprinted_seq = _pprint_seq(value, max_seq_items=max_items)
- if len(pprinted_seq) < max_space:
- head = [_pprint_seq(x, max_seq_items=max_items) for x in head]
- tail = [_pprint_seq(x, max_seq_items=max_items) for x in tail]
- break
-
- summary = ""
- line = space2
-
- for head_value in head:
- word = head_value + sep + " "
- summary, line = _extend_line(summary, line, word, display_width, space2)
-
- if is_truncated:
- # remove trailing space of last line
- summary += line.rstrip() + space2 + "..."
- line = space2
-
- for tail_item in tail[:-1]:
- word = tail_item + sep + " "
- summary, line = _extend_line(summary, line, word, display_width, space2)
-
- # last value: no sep added + 1 space of width used for trailing ','
- summary, line = _extend_line(summary, line, tail[-1], display_width - 2, space2)
- summary += line
-
- # right now close is either '' or ', '
-    # Now we want to include the ']', but not the trailing space, if any.
- close = "]" + close.rstrip(" ")
- summary += close
-
- if len(summary) > (display_width) or line_break_each_value:
- summary += space1
- else: # one row
- summary += " "
-
- # remove initial space
- summary = "[" + summary[len(space2) :]
-
- return summary
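-# Illustrative call (names assumed, not from the original module):
-#
-#     format_object_summary(range(3), str, name="Index")   # -> "[0, 1, 2], "
-#
-# Index reprs historically build on summaries like this before appending dtype
-# and length information.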
-
-
-def _justify(
- head: list[Sequence[str]], tail: list[Sequence[str]]
-) -> tuple[list[tuple[str, ...]], list[tuple[str, ...]]]:
- """
- Justify items in head and tail, so they are right-aligned when stacked.
-
- Parameters
- ----------
- head : list-like of list-likes of strings
- tail : list-like of list-likes of strings
-
- Returns
- -------
- tuple of list of tuples of strings
- Same as head and tail, but items are right aligned when stacked
- vertically.
-
- Examples
- --------
- >>> _justify([['a', 'b']], [['abc', 'abcd']])
- ([(' a', ' b')], [('abc', 'abcd')])
- """
- combined = head + tail
-
- # For each position for the sequences in ``combined``,
- # find the length of the largest string.
- max_length = [0] * len(combined[0])
- for inner_seq in combined:
- length = [len(item) for item in inner_seq]
- max_length = [max(x, y) for x, y in zip(max_length, length)]
-
- # justify each item in each list-like in head and tail using max_length
- head_tuples = [
- tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in head
- ]
- tail_tuples = [
- tuple(x.rjust(max_len) for x, max_len in zip(seq, max_length)) for seq in tail
- ]
- return head_tuples, tail_tuples
-
-
-class PrettyDict(Dict[_KT, _VT]):
- """Dict extension to support abbreviated __repr__"""
-
- def __repr__(self) -> str:
- return pprint_thing(self)
diff --git a/contrib/python/pandas/py3/pandas/io/formats/string.py b/contrib/python/pandas/py3/pandas/io/formats/string.py
deleted file mode 100644
index c143988bdc8..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/string.py
+++ /dev/null
@@ -1,207 +0,0 @@
-"""
-Module for formatting output data in console (to string).
-"""
-from __future__ import annotations
-
-from shutil import get_terminal_size
-from typing import (
- TYPE_CHECKING,
- Iterable,
-)
-
-import numpy as np
-
-from pandas.io.formats.printing import pprint_thing
-
-if TYPE_CHECKING:
- from pandas.io.formats.format import DataFrameFormatter
-
-
-class StringFormatter:
- """Formatter for string representation of a dataframe."""
-
- def __init__(self, fmt: DataFrameFormatter, line_width: int | None = None) -> None:
- self.fmt = fmt
- self.adj = fmt.adj
- self.frame = fmt.frame
- self.line_width = line_width
-
- def to_string(self) -> str:
- text = self._get_string_representation()
- if self.fmt.should_show_dimensions:
- text = "".join([text, self.fmt.dimensions_info])
- return text
-
- def _get_strcols(self) -> list[list[str]]:
- strcols = self.fmt.get_strcols()
- if self.fmt.is_truncated:
- strcols = self._insert_dot_separators(strcols)
- return strcols
-
- def _get_string_representation(self) -> str:
- if self.fmt.frame.empty:
- return self._empty_info_line
-
- strcols = self._get_strcols()
-
- if self.line_width is None:
-            # no need to wrap around, just print the whole frame
- return self.adj.adjoin(1, *strcols)
-
- if self._need_to_wrap_around:
- return self._join_multiline(strcols)
-
- return self._fit_strcols_to_terminal_width(strcols)
-
- @property
- def _empty_info_line(self) -> str:
- return (
- f"Empty {type(self.frame).__name__}\n"
- f"Columns: {pprint_thing(self.frame.columns)}\n"
- f"Index: {pprint_thing(self.frame.index)}"
- )
-
- @property
- def _need_to_wrap_around(self) -> bool:
- return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0)
-
- def _insert_dot_separators(self, strcols: list[list[str]]) -> list[list[str]]:
- str_index = self.fmt._get_formatted_index(self.fmt.tr_frame)
- index_length = len(str_index)
-
- if self.fmt.is_truncated_horizontally:
- strcols = self._insert_dot_separator_horizontal(strcols, index_length)
-
- if self.fmt.is_truncated_vertically:
- strcols = self._insert_dot_separator_vertical(strcols, index_length)
-
- return strcols
-
- @property
- def _adjusted_tr_col_num(self) -> int:
- return self.fmt.tr_col_num + 1 if self.fmt.index else self.fmt.tr_col_num
-
- def _insert_dot_separator_horizontal(
- self, strcols: list[list[str]], index_length: int
- ) -> list[list[str]]:
- strcols.insert(self._adjusted_tr_col_num, [" ..."] * index_length)
- return strcols
-
- def _insert_dot_separator_vertical(
- self, strcols: list[list[str]], index_length: int
- ) -> list[list[str]]:
- n_header_rows = index_length - len(self.fmt.tr_frame)
- row_num = self.fmt.tr_row_num
- for ix, col in enumerate(strcols):
- cwidth = self.adj.len(col[row_num])
-
- if self.fmt.is_truncated_horizontally:
- is_dot_col = ix == self._adjusted_tr_col_num
- else:
- is_dot_col = False
-
- if cwidth > 3 or is_dot_col:
- dots = "..."
- else:
- dots = ".."
-
- if ix == 0 and self.fmt.index:
- dot_mode = "left"
- elif is_dot_col:
- cwidth = 4
- dot_mode = "right"
- else:
- dot_mode = "right"
-
- dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0]
- col.insert(row_num + n_header_rows, dot_str)
- return strcols
-
- def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
- lwidth = self.line_width
- adjoin_width = 1
- strcols = list(strcols_input)
-
- if self.fmt.index:
- idx = strcols.pop(0)
- lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width
-
- col_widths = [
- np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0
- for col in strcols
- ]
-
- assert lwidth is not None
- col_bins = _binify(col_widths, lwidth)
- nbins = len(col_bins)
-
- str_lst = []
- start = 0
- for i, end in enumerate(col_bins):
- row = strcols[start:end]
- if self.fmt.index:
- row.insert(0, idx)
- if nbins > 1:
- nrows = len(row[-1])
- if end <= len(strcols) and i < nbins - 1:
- row.append([" \\"] + [" "] * (nrows - 1))
- else:
- row.append([" "] * nrows)
- str_lst.append(self.adj.adjoin(adjoin_width, *row))
- start = end
- return "\n\n".join(str_lst)
-
- def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str:
- from pandas import Series
-
- lines = self.adj.adjoin(1, *strcols).split("\n")
- max_len = Series(lines).str.len().max()
- # plus truncate dot col
- width, _ = get_terminal_size()
- dif = max_len - width
- # '+ 1' to avoid too wide repr (GH PR #17023)
- adj_dif = dif + 1
- col_lens = Series([Series(ele).apply(len).max() for ele in strcols])
- n_cols = len(col_lens)
- counter = 0
- while adj_dif > 0 and n_cols > 1:
- counter += 1
- mid = round(n_cols / 2)
- mid_ix = col_lens.index[mid]
- col_len = col_lens[mid_ix]
- # adjoin adds one
- adj_dif -= col_len + 1
- col_lens = col_lens.drop(mid_ix)
- n_cols = len(col_lens)
-
- # subtract index column
- max_cols_fitted = n_cols - self.fmt.index
- # GH-21180. Ensure that we print at least two.
- max_cols_fitted = max(max_cols_fitted, 2)
- self.fmt.max_cols_fitted = max_cols_fitted
-
-        # Call _truncate again to cut the frame appropriately
-        # and then generate the string representation
- self.fmt.truncate()
- strcols = self._get_strcols()
- return self.adj.adjoin(1, *strcols)
-
-
-def _binify(cols: list[int], line_width: int) -> list[int]:
- adjoin_width = 1
- bins = []
- curr_width = 0
- i_last_column = len(cols) - 1
- for i, w in enumerate(cols):
- w_adjoined = w + adjoin_width
- curr_width += w_adjoined
- if i_last_column == i:
- wrap = curr_width + 1 > line_width and i > 0
- else:
- wrap = curr_width + 2 > line_width and i > 0
- if wrap:
- bins.append(i)
- curr_width = w_adjoined
-
- bins.append(len(cols))
- return bins
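-# Worked example of the binning above (illustrative only): _binify([3, 3, 3], 10)
-# returns [2, 3], i.e. the first two columns share a line and the third column
-# wraps into a second block.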
diff --git a/contrib/python/pandas/py3/pandas/io/formats/style.py b/contrib/python/pandas/py3/pandas/io/formats/style.py
deleted file mode 100644
index ecabea1bc32..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/style.py
+++ /dev/null
@@ -1,3946 +0,0 @@
-"""
-Module for applying conditional formatting to DataFrames and Series.
-"""
-from __future__ import annotations
-
-from contextlib import contextmanager
-import copy
-from functools import partial
-import operator
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Generator,
- Hashable,
- Sequence,
- overload,
-)
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._typing import (
- Axis,
- AxisInt,
- FilePath,
- IndexLabel,
- Level,
- QuantileInterpolation,
- Scalar,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._decorators import (
- Substitution,
- doc,
-)
-
-import pandas as pd
-from pandas import (
- IndexSlice,
- RangeIndex,
-)
-import pandas.core.common as com
-from pandas.core.frame import (
- DataFrame,
- Series,
-)
-from pandas.core.generic import NDFrame
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.formats.format import save_to_buffer
-
-jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.")
-
-from pandas.io.formats.style_render import (
- CSSProperties,
- CSSStyles,
- ExtFormatter,
- StylerRenderer,
- Subset,
- Tooltips,
- format_table_styles,
- maybe_convert_css_to_tuples,
- non_reducing_slice,
- refactor_levels,
-)
-
-if TYPE_CHECKING:
- from matplotlib.colors import Colormap
-
-try:
- import matplotlib as mpl
- import matplotlib.pyplot as plt
-
- has_mpl = True
-except ImportError:
- has_mpl = False
-
-
-@contextmanager
-def _mpl(func: Callable) -> Generator[tuple[Any, Any], None, None]:
- if has_mpl:
- yield plt, mpl
- else:
- raise ImportError(f"{func.__name__} requires matplotlib.")
-
-
-####
-# Shared Doc Strings
-
-subset_args = """subset : label, array-like, IndexSlice, optional
- A valid 2d input to `DataFrame.loc[<subset>]`, or, in the case of a 1d input
- or single key, to `DataFrame.loc[:, <subset>]` where the columns are
- prioritised, to limit ``data`` to *before* applying the function."""
-
-properties_args = """props : str, default None
- CSS properties to use for highlighting. If ``props`` is given, ``color``
- is not used."""
-
-coloring_args = """color : str, default '{default}'
- Background color to use for highlighting."""
-
-buffering_args = """buf : str, path object, file-like object, optional
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a string ``write()`` function. If ``None``, the result is
- returned as a string."""
-
-encoding_args = """encoding : str, optional
- Character encoding setting for file output (and meta tags if available).
- Defaults to ``pandas.options.styler.render.encoding`` value of "utf-8"."""
-
-#
-###
-
-
-class Styler(StylerRenderer):
- r"""
- Helps style a DataFrame or Series according to the data with HTML and CSS.
-
- Parameters
- ----------
- data : Series or DataFrame
- Data to be styled - either a Series or DataFrame.
- precision : int, optional
- Precision to round floats to. If not given defaults to
- ``pandas.options.styler.format.precision``.
-
- .. versionchanged:: 1.4.0
- table_styles : list-like, default None
- List of {selector: (attr, value)} dicts; see Notes.
- uuid : str, default None
- A unique identifier to avoid CSS collisions; generated automatically.
- caption : str, tuple, default None
- String caption to attach to the table. Tuple only used for LaTeX dual captions.
- table_attributes : str, default None
- Items that show up in the opening ``<table>`` tag
- in addition to automatic (by default) id.
- cell_ids : bool, default True
- If True, each cell will have an ``id`` attribute in their HTML tag.
- The ``id`` takes the form ``T_<uuid>_row<num_row>_col<num_col>``
- where ``<uuid>`` is the unique identifier, ``<num_row>`` is the row
- number and ``<num_col>`` is the column number.
- na_rep : str, optional
- Representation for missing values.
- If ``na_rep`` is None, no special formatting is applied, and falls back to
- ``pandas.options.styler.format.na_rep``.
-
- uuid_len : int, default 5
- If ``uuid`` is not specified, the length of the ``uuid`` to randomly generate
- expressed in hex characters, in range [0, 32].
-
- .. versionadded:: 1.2.0
-
- decimal : str, optional
- Character used as decimal separator for floats, complex and integers. If not
- given uses ``pandas.options.styler.format.decimal``.
-
- .. versionadded:: 1.3.0
-
- thousands : str, optional, default None
- Character used as thousands separator for floats, complex and integers. If not
- given uses ``pandas.options.styler.format.thousands``.
-
- .. versionadded:: 1.3.0
-
- escape : str, optional
- Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"``
- in cell display string with HTML-safe sequences.
- Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``,
- ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with
- LaTeX-safe sequences. If not given uses ``pandas.options.styler.format.escape``.
-
- .. versionadded:: 1.3.0
- formatter : str, callable, dict, optional
- Object to define how values are displayed. See ``Styler.format``. If not given
- uses ``pandas.options.styler.format.formatter``.
-
- .. versionadded:: 1.4.0
-
- Attributes
- ----------
- env : Jinja2 jinja2.Environment
- template_html : Jinja2 Template
- template_html_table : Jinja2 Template
- template_html_style : Jinja2 Template
- template_latex : Jinja2 Template
- loader : Jinja2 Loader
-
- See Also
- --------
- DataFrame.style : Return a Styler object containing methods for building
- a styled HTML representation for the DataFrame.
-
- Notes
- -----
- Most styling will be done by passing style functions into
- ``Styler.apply`` or ``Styler.applymap``. Style functions should
- return values with strings containing CSS ``'attr: value'`` that will
- be applied to the indicated cells.
-
- If using in the Jupyter notebook, Styler has defined a ``_repr_html_``
- to automatically render itself. Otherwise call Styler.to_html to get
- the generated HTML.
-
- CSS classes are attached to the generated HTML
-
- * Index and Column names include ``index_name`` and ``level<k>``
- where `k` is its level in a MultiIndex
- * Index label cells include
-
- * ``row_heading``
- * ``row<n>`` where `n` is the numeric position of the row
- * ``level<k>`` where `k` is the level in a MultiIndex
-
-    * Column label cells include
-
-      * ``col_heading``
-      * ``col<n>`` where `n` is the numeric position of the column
-      * ``level<k>`` where `k` is the level in a MultiIndex
-
- * Blank cells include ``blank``
- * Data cells include ``data``
- * Trimmed cells include ``col_trim`` or ``row_trim``.
-
-    Any, or all, of these classes can be renamed by using the ``css_class_names``
- argument in ``Styler.set_table_classes``, giving a value such as
- *{"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}*.
- """
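-    # A hypothetical illustration of the Notes above (names assumed, not part of
-    # the original module): style functions return CSS 'attr: value' strings.
-    #
-    #     def shade_negative(v):
-    #         return "color: red;" if isinstance(v, (int, float)) and v < 0 else ""
-    #
-    #     df.style.applymap(shade_negative).to_html()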
-
- def __init__(
- self,
- data: DataFrame | Series,
- precision: int | None = None,
- table_styles: CSSStyles | None = None,
- uuid: str | None = None,
- caption: str | tuple | list | None = None,
- table_attributes: str | None = None,
- cell_ids: bool = True,
- na_rep: str | None = None,
- uuid_len: int = 5,
- decimal: str | None = None,
- thousands: str | None = None,
- escape: str | None = None,
- formatter: ExtFormatter | None = None,
- ) -> None:
- super().__init__(
- data=data,
- uuid=uuid,
- uuid_len=uuid_len,
- table_styles=table_styles,
- table_attributes=table_attributes,
- caption=caption,
- cell_ids=cell_ids,
- precision=precision,
- )
-
- # validate ordered args
- thousands = thousands or get_option("styler.format.thousands")
- decimal = decimal or get_option("styler.format.decimal")
- na_rep = na_rep or get_option("styler.format.na_rep")
- escape = escape or get_option("styler.format.escape")
- formatter = formatter or get_option("styler.format.formatter")
- # precision is handled by superclass as default for performance
-
- self.format(
- formatter=formatter,
- precision=precision,
- na_rep=na_rep,
- escape=escape,
- decimal=decimal,
- thousands=thousands,
- )
-
- def concat(self, other: Styler) -> Styler:
- """
- Append another Styler to combine the output into a single table.
-
- .. versionadded:: 1.5.0
-
- Parameters
- ----------
- other : Styler
- The other Styler object which has already been styled and formatted. The
- data for this Styler must have the same columns as the original, and the
- number of index levels must also be the same to render correctly.
-
- Returns
- -------
- Styler
-
- Notes
- -----
- The purpose of this method is to extend existing styled dataframes with other
- metrics that may be useful but may not conform to the original's structure.
-        For example, adding a subtotal row, or displaying metrics such as means,
- variance or counts.
-
- Styles that are applied using the ``apply``, ``applymap``, ``apply_index``
- and ``applymap_index``, and formatting applied with ``format`` and
- ``format_index`` will be preserved.
-
- .. warning::
- Only the output methods ``to_html``, ``to_string`` and ``to_latex``
- currently work with concatenated Stylers.
-
- Other output methods, including ``to_excel``, **do not** work with
- concatenated Stylers.
-
- The following should be noted:
-
- - ``table_styles``, ``table_attributes``, ``caption`` and ``uuid`` are all
- inherited from the original Styler and not ``other``.
- - hidden columns and hidden index levels will be inherited from the
- original Styler
- - ``css`` will be inherited from the original Styler, and the value of
- keys ``data``, ``row_heading`` and ``row`` will be prepended with
- ``foot0_``. If more concats are chained, their styles will be prepended
-          with ``foot1_``, ``foot2_``, etc., and if a concatenated style has
-          another concatenated style, the second style will be prepended with
- ``foot{parent}_foot{child}_``.
-
- A common use case is to concatenate user defined functions with
- ``DataFrame.agg`` or with described statistics via ``DataFrame.describe``.
- See examples.
-
- Examples
- --------
- A common use case is adding totals rows, or otherwise, via methods calculated
- in ``DataFrame.agg``.
-
- >>> df = DataFrame([[4, 6], [1, 9], [3, 4], [5, 5], [9,6]],
- ... columns=["Mike", "Jim"],
- ... index=["Mon", "Tue", "Wed", "Thurs", "Fri"])
- >>> styler = df.style.concat(df.agg(["sum"]).style) # doctest: +SKIP
-
- .. figure:: ../../_static/style/footer_simple.png
-
- Since the concatenated object is a Styler the existing functionality can be
- used to conditionally format it as well as the original.
-
- >>> descriptors = df.agg(["sum", "mean", lambda s: s.dtype])
- >>> descriptors.index = ["Total", "Average", "dtype"]
- >>> other = (descriptors.style
- ... .highlight_max(axis=1, subset=(["Total", "Average"], slice(None)))
- ... .format(subset=("Average", slice(None)), precision=2, decimal=",")
- ... .applymap(lambda v: "font-weight: bold;"))
- >>> styler = (df.style
- ... .highlight_max(color="salmon")
- ... .set_table_styles([{"selector": ".foot_row0",
- ... "props": "border-top: 1px solid black;"}]))
- >>> styler.concat(other) # doctest: +SKIP
-
- .. figure:: ../../_static/style/footer_extended.png
-
- When ``other`` has fewer index levels than the original Styler it is possible
- to extend the index in ``other``, with placeholder levels.
-
- >>> df = DataFrame([[1], [2]], index=pd.MultiIndex.from_product([[0], [1, 2]]))
- >>> descriptors = df.agg(["sum"])
- >>> descriptors.index = pd.MultiIndex.from_product([[""], descriptors.index])
- >>> df.style.concat(descriptors.style) # doctest: +SKIP
- """
- if not isinstance(other, Styler):
- raise TypeError("`other` must be of type `Styler`")
- if not self.data.columns.equals(other.data.columns):
- raise ValueError("`other.data` must have same columns as `Styler.data`")
- if not self.data.index.nlevels == other.data.index.nlevels:
- raise ValueError(
- "number of index levels must be same in `other` "
- "as in `Styler`. See documentation for suggestions."
- )
- self.concatenated.append(other)
- return self
-
- def _repr_html_(self) -> str | None:
- """
- Hooks into Jupyter notebook rich display system, which calls _repr_html_ by
- default if an object is returned at the end of a cell.
- """
- if get_option("styler.render.repr") == "html":
- return self.to_html()
- return None
-
- def _repr_latex_(self) -> str | None:
- if get_option("styler.render.repr") == "latex":
- return self.to_latex()
- return None
-
- def set_tooltips(
- self,
- ttips: DataFrame,
- props: CSSProperties | None = None,
- css_class: str | None = None,
- ) -> Styler:
- """
- Set the DataFrame of strings on ``Styler`` generating ``:hover`` tooltips.
-
- These string based tooltips are only applicable to ``<td>`` HTML elements,
- and cannot be used for column or index headers.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- ttips : DataFrame
- DataFrame containing strings that will be translated to tooltips, mapped
- by identical column and index values that must exist on the underlying
- Styler data. None, NaN values, and empty strings will be ignored and
- not affect the rendered HTML.
- props : list-like or str, optional
- List of (attr, value) tuples or a valid CSS string. If ``None`` adopts
- the internal default values described in notes.
- css_class : str, optional
- Name of the tooltip class used in CSS, should conform to HTML standards.
- Only useful if integrating tooltips with external CSS. If ``None`` uses the
- internal default value 'pd-t'.
-
- Returns
- -------
- Styler
-
- Notes
- -----
- Tooltips are created by adding `<span class="pd-t"></span>` to each data cell
- and then manipulating the table level CSS to attach pseudo hover and pseudo
-        after selectors to produce the required results.
-
- The default properties for the tooltip CSS class are:
-
- - visibility: hidden
- - position: absolute
- - z-index: 1
- - background-color: black
- - color: white
- - transform: translate(-20px, -20px)
-
- The property 'visibility: hidden;' is a key prerequisite to the hover
- functionality, and should always be included in any manual properties
- specification, using the ``props`` argument.
-
- Tooltips are not designed to be efficient, and can add large amounts of
- additional HTML for larger tables, since they also require that ``cell_ids``
- is forced to `True`.
-
- Examples
- --------
- Basic application
-
- >>> df = pd.DataFrame(data=[[0, 1], [2, 3]])
- >>> ttips = pd.DataFrame(
- ... data=[["Min", ""], [np.nan, "Max"]], columns=df.columns, index=df.index
- ... )
- >>> s = df.style.set_tooltips(ttips).to_html()
-
- Optionally controlling the tooltip visual display
-
- >>> df.style.set_tooltips(ttips, css_class='tt-add', props=[
- ... ('visibility', 'hidden'),
- ... ('position', 'absolute'),
- ... ('z-index', 1)]) # doctest: +SKIP
- >>> df.style.set_tooltips(ttips, css_class='tt-add',
- ... props='visibility:hidden; position:absolute; z-index:1;')
- ... # doctest: +SKIP
- """
- if not self.cell_ids:
- # tooltips not optimised for individual cell check. requires reasonable
- # redesign and more extensive code for a feature that might be rarely used.
- raise NotImplementedError(
-                "Tooltips can only render when 'cell_ids' is True."
- )
- if not ttips.index.is_unique or not ttips.columns.is_unique:
- raise KeyError(
- "Tooltips render only if `ttips` has unique index and columns."
- )
- if self.tooltips is None: # create a default instance if necessary
- self.tooltips = Tooltips()
- self.tooltips.tt_data = ttips
- if props:
- self.tooltips.class_properties = props
- if css_class:
- self.tooltips.class_name = css_class
-
- return self
-
- @doc(
- NDFrame.to_excel,
- klass="Styler",
- storage_options=_shared_docs["storage_options"],
- storage_options_versionadded="1.5.0",
- )
- def to_excel(
- self,
- excel_writer,
- sheet_name: str = "Sheet1",
- na_rep: str = "",
- float_format: str | None = None,
- columns: Sequence[Hashable] | None = None,
- header: Sequence[Hashable] | bool = True,
- index: bool = True,
- index_label: IndexLabel | None = None,
- startrow: int = 0,
- startcol: int = 0,
- engine: str | None = None,
- merge_cells: bool = True,
- encoding: str | None = None,
- inf_rep: str = "inf",
- verbose: bool = True,
- freeze_panes: tuple[int, int] | None = None,
- storage_options: StorageOptions = None,
- ) -> None:
- from pandas.io.formats.excel import ExcelFormatter
-
- formatter = ExcelFormatter(
- self,
- na_rep=na_rep,
- cols=columns,
- header=header,
- float_format=float_format,
- index=index,
- index_label=index_label,
- merge_cells=merge_cells,
- inf_rep=inf_rep,
- )
- formatter.write(
- excel_writer,
- sheet_name=sheet_name,
- startrow=startrow,
- startcol=startcol,
- freeze_panes=freeze_panes,
- engine=engine,
- storage_options=storage_options,
- )
-
- @overload
- def to_latex(
- self,
- buf: FilePath | WriteBuffer[str],
- *,
- column_format: str | None = ...,
- position: str | None = ...,
- position_float: str | None = ...,
- hrules: bool | None = ...,
- clines: str | None = ...,
- label: str | None = ...,
- caption: str | tuple | None = ...,
- sparse_index: bool | None = ...,
- sparse_columns: bool | None = ...,
- multirow_align: str | None = ...,
- multicol_align: str | None = ...,
- siunitx: bool = ...,
- environment: str | None = ...,
- encoding: str | None = ...,
- convert_css: bool = ...,
- ) -> None:
- ...
-
- @overload
- def to_latex(
- self,
- buf: None = ...,
- *,
- column_format: str | None = ...,
- position: str | None = ...,
- position_float: str | None = ...,
- hrules: bool | None = ...,
- clines: str | None = ...,
- label: str | None = ...,
- caption: str | tuple | None = ...,
- sparse_index: bool | None = ...,
- sparse_columns: bool | None = ...,
- multirow_align: str | None = ...,
- multicol_align: str | None = ...,
- siunitx: bool = ...,
- environment: str | None = ...,
- encoding: str | None = ...,
- convert_css: bool = ...,
- ) -> str:
- ...
-
- def to_latex(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- *,
- column_format: str | None = None,
- position: str | None = None,
- position_float: str | None = None,
- hrules: bool | None = None,
- clines: str | None = None,
- label: str | None = None,
- caption: str | tuple | None = None,
- sparse_index: bool | None = None,
- sparse_columns: bool | None = None,
- multirow_align: str | None = None,
- multicol_align: str | None = None,
- siunitx: bool = False,
- environment: str | None = None,
- encoding: str | None = None,
- convert_css: bool = False,
- ) -> str | None:
- r"""
- Write Styler to a file, buffer or string in LaTeX format.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- buf : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a string ``write()`` function. If None, the result is
- returned as a string.
- column_format : str, optional
- The LaTeX column specification placed in location:
-
- \\begin{tabular}{<column_format>}
-
-            Defaults to 'l' for index and non-numeric data columns, and,
-            for numeric data columns, to 'r', or to 'S' if ``siunitx`` is ``True``.
- position : str, optional
- The LaTeX positional argument (e.g. 'h!') for tables, placed in location:
-
- ``\\begin{table}[<position>]``.
- position_float : {"centering", "raggedleft", "raggedright"}, optional
- The LaTeX float command placed in location:
-
- \\begin{table}[<position>]
-
- \\<position_float>
-
- Cannot be used if ``environment`` is "longtable".
- hrules : bool
- Set to `True` to add \\toprule, \\midrule and \\bottomrule from the
- {booktabs} LaTeX package.
- Defaults to ``pandas.options.styler.latex.hrules``, which is `False`.
-
- .. versionchanged:: 1.4.0
- clines : str, optional
- Use to control adding \\cline commands for the index labels separation.
- Possible values are:
-
- - `None`: no cline commands are added (default).
- - `"all;data"`: a cline is added for every index value extending the
- width of the table, including data entries.
- - `"all;index"`: as above with lines extending only the width of the
- index entries.
- - `"skip-last;data"`: a cline is added for each index value except the
-              last level (which is never sparsified), extending the width of the
- table.
- - `"skip-last;index"`: as above with lines extending only the width of the
- index entries.
-
- .. versionadded:: 1.4.0
- label : str, optional
- The LaTeX label included as: \\label{<label>}.
- This is used with \\ref{<label>} in the main .tex file.
- caption : str, tuple, optional
- If string, the LaTeX table caption included as: \\caption{<caption>}.
-            If tuple, i.e. ("full caption", "short caption"), the caption included
- as: \\caption[<caption[1]>]{<caption[0]>}.
- sparse_index : bool, optional
- Whether to sparsify the display of a hierarchical index. Setting to False
- will display each explicit level element in a hierarchical key for each row.
- Defaults to ``pandas.options.styler.sparse.index``, which is `True`.
- sparse_columns : bool, optional
- Whether to sparsify the display of a hierarchical index. Setting to False
- will display each explicit level element in a hierarchical key for each
- column. Defaults to ``pandas.options.styler.sparse.columns``, which
- is `True`.
- multirow_align : {"c", "t", "b", "naive"}, optional
- If sparsifying hierarchical MultiIndexes whether to align text centrally,
- at the top or bottom using the multirow package. If not given defaults to
- ``pandas.options.styler.latex.multirow_align``, which is `"c"`.
- If "naive" is given renders without multirow.
-
- .. versionchanged:: 1.4.0
- multicol_align : {"r", "c", "l", "naive-l", "naive-r"}, optional
- If sparsifying hierarchical MultiIndex columns whether to align text at
- the left, centrally, or at the right. If not given defaults to
- ``pandas.options.styler.latex.multicol_align``, which is "r".
- If a naive option is given renders without multicol.
- Pipe decorators can also be added to non-naive values to draw vertical
- rules, e.g. "\|r" will draw a rule on the left side of right aligned merged
- cells.
-
- .. versionchanged:: 1.4.0
- siunitx : bool, default False
- Set to ``True`` to structure LaTeX compatible with the {siunitx} package.
- environment : str, optional
- If given, the environment that will replace 'table' in ``\\begin{table}``.
- If 'longtable' is specified then a more suitable template is
- rendered. If not given defaults to
- ``pandas.options.styler.latex.environment``, which is `None`.
-
- .. versionadded:: 1.4.0
- encoding : str, optional
- Character encoding setting. Defaults
- to ``pandas.options.styler.render.encoding``, which is "utf-8".
- convert_css : bool, default False
- Convert simple cell-styles from CSS to LaTeX format. Any CSS not found in
- conversion table is dropped. A style can be forced by adding option
- `--latex`. See notes.
-
- Returns
- -------
- str or None
- If `buf` is None, returns the result as a string. Otherwise returns `None`.
-
- See Also
- --------
- Styler.format: Format the text display value of cells.
-
- Notes
- -----
- **Latex Packages**
-
- For the following features we recommend the following LaTeX inclusions:
-
- ===================== ==========================================================
- Feature Inclusion
- ===================== ==========================================================
- sparse columns none: included within default {tabular} environment
- sparse rows \\usepackage{multirow}
- hrules \\usepackage{booktabs}
- colors \\usepackage[table]{xcolor}
- siunitx \\usepackage{siunitx}
- bold (with siunitx) | \\usepackage{etoolbox}
- | \\robustify\\bfseries
- | \\sisetup{detect-all = true} *(within {document})*
- italic (with siunitx) | \\usepackage{etoolbox}
- | \\robustify\\itshape
- | \\sisetup{detect-all = true} *(within {document})*
- environment \\usepackage{longtable} if arg is "longtable"
- | or any other relevant environment package
- hyperlinks \\usepackage{hyperref}
- ===================== ==========================================================
-
- **Cell Styles**
-
- LaTeX styling can only be rendered if the accompanying styling functions have
- been constructed with appropriate LaTeX commands. All styling
- functionality is built around the concept of a CSS ``(<attribute>, <value>)``
- pair (see `Table Visualization <../../user_guide/style.ipynb>`_), and this
- should be replaced by a LaTeX
- ``(<command>, <options>)`` approach. Each cell will be styled individually
- using nested LaTeX commands with their accompanied options.
-
- For example, the following code will highlight and bold a cell in HTML-CSS:
-
- >>> df = pd.DataFrame([[1,2], [3,4]])
- >>> s = df.style.highlight_max(axis=None,
- ... props='background-color:red; font-weight:bold;')
- >>> s.to_html() # doctest: +SKIP
-
- The equivalent using LaTeX only commands is the following:
-
- >>> s = df.style.highlight_max(axis=None,
- ... props='cellcolor:{red}; bfseries: ;')
- >>> s.to_latex() # doctest: +SKIP
-
- Internally these structured LaTeX ``(<command>, <options>)`` pairs
- are translated to the
- ``display_value`` with the default structure:
- ``\<command><options> <display_value>``.
- Where there are multiple commands, each is nested recursively within the
- previous one, so that the highlighted cell from the example above is
- rendered as ``\cellcolor{red} \bfseries 4``.
-
- Occasionally this format does not suit the applied command, or
- combination of LaTeX packages that is in use, so additional flags can be
- added to the ``<options>``, within the tuple, to result in different
- positions of required braces (the **default** being the same as ``--nowrap``):
-
- =================================== ============================================
- Tuple Format Output Structure
- =================================== ============================================
- (<command>,<options>) \\<command><options> <display_value>
- (<command>,<options> ``--nowrap``) \\<command><options> <display_value>
- (<command>,<options> ``--rwrap``) \\<command><options>{<display_value>}
- (<command>,<options> ``--wrap``) {\\<command><options> <display_value>}
- (<command>,<options> ``--lwrap``) {\\<command><options>} <display_value>
- (<command>,<options> ``--dwrap``) {\\<command><options>}{<display_value>}
- =================================== ============================================
-
- For example, the `textbf` command for font-weight
- should always be used with `--rwrap` so ``('textbf', '--rwrap')`` will render a
- working cell, wrapped with braces, as ``\textbf{<display_value>}``.
-
- A more comprehensive example is as follows:
-
- >>> df = pd.DataFrame([[1, 2.2, "dogs"], [3, 4.4, "cats"], [2, 6.6, "cows"]],
- ... index=["ix1", "ix2", "ix3"],
- ... columns=["Integers", "Floats", "Strings"])
- >>> s = df.style.highlight_max(
- ... props='cellcolor:[HTML]{FFFF00}; color:{red};'
- ... 'textit:--rwrap; textbf:--rwrap;'
- ... )
- >>> s.to_latex() # doctest: +SKIP
-
- .. figure:: ../../_static/style/latex_1.png
-
- **Table Styles**
-
- Internally Styler uses its ``table_styles`` object to parse the
- ``column_format``, ``position``, ``position_float``, and ``label``
- input arguments. These arguments are added to table styles in the format:
-
- .. code-block:: python
-
- set_table_styles([
- {"selector": "column_format", "props": f":{column_format};"},
- {"selector": "position", "props": f":{position};"},
- {"selector": "position_float", "props": f":{position_float};"},
- {"selector": "label", "props": f":{{{label.replace(':','§')}}};"}
- ], overwrite=False)
-
- An exception is made for the ``hrules`` argument which, in fact, controls all
- three commands: ``toprule``, ``bottomrule`` and ``midrule`` simultaneously.
- Instead of setting ``hrules`` to ``True``, it is also possible to set each
- individual rule definition by manually setting the ``table_styles``. For
- example, below we set a regular ``toprule``, set an ``hline`` for
- ``bottomrule`` and exclude the ``midrule``:
-
- .. code-block:: python
-
- set_table_styles([
- {'selector': 'toprule', 'props': ':toprule;'},
- {'selector': 'bottomrule', 'props': ':hline;'},
- ], overwrite=False)
-
- If other ``commands`` are added to table styles, they will be detected and
- positioned immediately above the '\\begin{tabular}' command. For example, to
- add odd and even row coloring, from the {colortbl} package, in format
- ``\rowcolors{1}{pink}{red}``, use:
-
- .. code-block:: python
-
- set_table_styles([
- {'selector': 'rowcolors', 'props': ':{1}{pink}{red};'}
- ], overwrite=False)
-
- A more comprehensive example using these arguments is as follows:
-
- >>> df.columns = pd.MultiIndex.from_tuples([
- ... ("Numeric", "Integers"),
- ... ("Numeric", "Floats"),
- ... ("Non-Numeric", "Strings")
- ... ])
- >>> df.index = pd.MultiIndex.from_tuples([
- ... ("L0", "ix1"), ("L0", "ix2"), ("L1", "ix3")
- ... ])
- >>> s = df.style.highlight_max(
- ... props='cellcolor:[HTML]{FFFF00}; color:{red}; itshape:; bfseries:;'
- ... )
- >>> s.to_latex(
- ... column_format="rrrrr", position="h", position_float="centering",
- ... hrules=True, label="table:5", caption="Styled LaTeX Table",
- ... multirow_align="t", multicol_align="r"
- ... ) # doctest: +SKIP
-
- .. figure:: ../../_static/style/latex_2.png
-
- **Formatting**
-
- To format values, :meth:`Styler.format` should be used prior to calling
- `Styler.to_latex`; the same applies to other methods such as
- :meth:`Styler.hide`. For example:
-
- >>> s.clear()
- >>> s.table_styles = []
- >>> s.caption = None
- >>> s.format({
- ... ("Numeric", "Integers"): '\${}',
- ... ("Numeric", "Floats"): '{:.3f}',
- ... ("Non-Numeric", "Strings"): str.upper
- ... }) # doctest: +SKIP
- Numeric Non-Numeric
- Integers Floats Strings
- L0 ix1 $1 2.200 DOGS
- ix2 $3 4.400 CATS
- L1 ix3 $2 6.600 COWS
-
- >>> s.to_latex() # doctest: +SKIP
- \begin{tabular}{llrrl}
- {} & {} & \multicolumn{2}{r}{Numeric} & {Non-Numeric} \\
- {} & {} & {Integers} & {Floats} & {Strings} \\
- \multirow[c]{2}{*}{L0} & ix1 & \\$1 & 2.200 & DOGS \\
- & ix2 & \$3 & 4.400 & CATS \\
- L1 & ix3 & \$2 & 6.600 & COWS \\
- \end{tabular}
-
- **CSS Conversion**
-
- This method can convert a Styler constructed with HTML-CSS to LaTeX using
- the following limited conversions.
-
- ================== ==================== ============= ==========================
- CSS Attribute CSS value LaTeX Command LaTeX Options
- ================== ==================== ============= ==========================
- font-weight | bold | bfseries
- | bolder | bfseries
- font-style | italic | itshape
- | oblique | slshape
- background-color | red cellcolor | {red}--lwrap
- | #fe01ea | [HTML]{FE01EA}--lwrap
- | #f0e | [HTML]{FF00EE}--lwrap
- | rgb(128,255,0) | [rgb]{0.5,1,0}--lwrap
- | rgba(128,0,0,0.5) | [rgb]{0.5,0,0}--lwrap
- | rgb(25%,255,50%) | [rgb]{0.25,1,0.5}--lwrap
- color | red color | {red}
- | #fe01ea | [HTML]{FE01EA}
- | #f0e | [HTML]{FF00EE}
- | rgb(128,255,0) | [rgb]{0.5,1,0}
- | rgba(128,0,0,0.5) | [rgb]{0.5,0,0}
- | rgb(25%,255,50%) | [rgb]{0.25,1,0.5}
- ================== ==================== ============= ==========================
-
- It is also possible to add user-defined LaTeX-only styles to an HTML-CSS Styler
- using the ``--latex`` flag, and to add LaTeX parsing options that the
- converter will detect within a CSS-comment.
-
- >>> df = pd.DataFrame([[1]])
- >>> df.style.set_properties(
- ... **{"font-weight": "bold /* --dwrap */", "Huge": "--latex--rwrap"}
- ... ).to_latex(convert_css=True) # doctest: +SKIP
- \begin{tabular}{lr}
- {} & {0} \\
- 0 & {\bfseries}{\Huge{1}} \\
- \end{tabular}
-
- Examples
- --------
- Below we give a complete step-by-step example adding some advanced features
- and noting some common gotchas.
-
- First we create the DataFrame and Styler as usual, including MultiIndex rows
- and columns, which allow for more advanced formatting options:
-
- >>> cidx = pd.MultiIndex.from_arrays([
- ... ["Equity", "Equity", "Equity", "Equity",
- ... "Stats", "Stats", "Stats", "Stats", "Rating"],
- ... ["Energy", "Energy", "Consumer", "Consumer", "", "", "", "", ""],
- ... ["BP", "Shell", "H&M", "Unilever",
- ... "Std Dev", "Variance", "52w High", "52w Low", ""]
- ... ])
- >>> iidx = pd.MultiIndex.from_arrays([
- ... ["Equity", "Equity", "Equity", "Equity"],
- ... ["Energy", "Energy", "Consumer", "Consumer"],
- ... ["BP", "Shell", "H&M", "Unilever"]
- ... ])
- >>> styler = pd.DataFrame([
- ... [1, 0.8, 0.66, 0.72, 32.1678, 32.1678**2, 335.12, 240.89, "Buy"],
- ... [0.8, 1.0, 0.69, 0.79, 1.876, 1.876**2, 14.12, 19.78, "Hold"],
- ... [0.66, 0.69, 1.0, 0.86, 7, 7**2, 210.9, 140.6, "Buy"],
- ... [0.72, 0.79, 0.86, 1.0, 213.76, 213.76**2, 2807, 3678, "Sell"],
- ... ], columns=cidx, index=iidx).style
-
- Second we will format the display and, since our table is quite wide, will
- hide the repeated level-0 of the index:
-
- >>> (styler.format(subset="Equity", precision=2)
- ... .format(subset="Stats", precision=1, thousands=",")
- ... .format(subset="Rating", formatter=str.upper)
- ... .format_index(escape="latex", axis=1)
- ... .format_index(escape="latex", axis=0)
- ... .hide(level=0, axis=0)) # doctest: +SKIP
-
- Note that one of the string entries of the index and column headers is "H&M".
- Without applying the `escape="latex"` option to the `format_index` method, the
- resultant LaTeX will fail to render, and the error returned is quite
- difficult to debug. Using the appropriate escape, the "&" is converted to "\\&".
-
- Third we will apply some (CSS-HTML) styles to our object. We will use a
- builtin method and also define our own method to highlight the stock
- recommendation:
-
- >>> def rating_color(v):
- ... if v == "Buy": color = "#33ff85"
- ... elif v == "Sell": color = "#ff5933"
- ... else: color = "#ffdd33"
- ... return f"color: {color}; font-weight: bold;"
- >>> (styler.background_gradient(cmap="inferno", subset="Equity", vmin=0, vmax=1)
- ... .applymap(rating_color, subset="Rating")) # doctest: +SKIP
-
- All the above styles will work with HTML (see below) and LaTeX upon conversion:
-
- .. figure:: ../../_static/style/latex_stocks_html.png
-
- However, we finally want to add one LaTeX-only style
- (from the {graphicx} package) that is not easy to convert from CSS and
- that pandas does not support. Notice the `--latex` flag used here,
- as well as `--rwrap`, to ensure this is formatted correctly and
- not ignored upon conversion.
-
- >>> styler.applymap_index(
- ... lambda v: "rotatebox:{45}--rwrap--latex;", level=2, axis=1
- ... ) # doctest: +SKIP
-
- Finally we render our LaTeX adding in other options as required:
-
- >>> styler.to_latex(
- ... caption="Selected stock correlation and simple statistics.",
- ... clines="skip-last;data",
- ... convert_css=True,
- ... position_float="centering",
- ... multicol_align="|c|",
- ... hrules=True,
- ... ) # doctest: +SKIP
- \begin{table}
- \centering
- \caption{Selected stock correlation and simple statistics.}
- \begin{tabular}{llrrrrrrrrl}
- \toprule
- & & \multicolumn{4}{|c|}{Equity} & \multicolumn{4}{|c|}{Stats} & Rating \\
- & & \multicolumn{2}{|c|}{Energy} & \multicolumn{2}{|c|}{Consumer} &
- \multicolumn{4}{|c|}{} & \\
- & & \rotatebox{45}{BP} & \rotatebox{45}{Shell} & \rotatebox{45}{H\&M} &
- \rotatebox{45}{Unilever} & \rotatebox{45}{Std Dev} & \rotatebox{45}{Variance} &
- \rotatebox{45}{52w High} & \rotatebox{45}{52w Low} & \rotatebox{45}{} \\
- \midrule
- \multirow[c]{2}{*}{Energy} & BP & {\cellcolor[HTML]{FCFFA4}}
- \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{FCA50A}} \color[HTML]{000000}
- 0.80 & {\cellcolor[HTML]{EB6628}} \color[HTML]{F1F1F1} 0.66 &
- {\cellcolor[HTML]{F68013}} \color[HTML]{F1F1F1} 0.72 & 32.2 & 1,034.8 & 335.1
- & 240.9 & \color[HTML]{33FF85} \bfseries BUY \\
- & Shell & {\cellcolor[HTML]{FCA50A}} \color[HTML]{000000} 0.80 &
- {\cellcolor[HTML]{FCFFA4}} \color[HTML]{000000} 1.00 &
- {\cellcolor[HTML]{F1731D}} \color[HTML]{F1F1F1} 0.69 &
- {\cellcolor[HTML]{FCA108}} \color[HTML]{000000} 0.79 & 1.9 & 3.5 & 14.1 &
- 19.8 & \color[HTML]{FFDD33} \bfseries HOLD \\
- \cline{1-11}
- \multirow[c]{2}{*}{Consumer} & H\&M & {\cellcolor[HTML]{EB6628}}
- \color[HTML]{F1F1F1} 0.66 & {\cellcolor[HTML]{F1731D}} \color[HTML]{F1F1F1}
- 0.69 & {\cellcolor[HTML]{FCFFA4}} \color[HTML]{000000} 1.00 &
- {\cellcolor[HTML]{FAC42A}} \color[HTML]{000000} 0.86 & 7.0 & 49.0 & 210.9 &
- 140.6 & \color[HTML]{33FF85} \bfseries BUY \\
- & Unilever & {\cellcolor[HTML]{F68013}} \color[HTML]{F1F1F1} 0.72 &
- {\cellcolor[HTML]{FCA108}} \color[HTML]{000000} 0.79 &
- {\cellcolor[HTML]{FAC42A}} \color[HTML]{000000} 0.86 &
- {\cellcolor[HTML]{FCFFA4}} \color[HTML]{000000} 1.00 & 213.8 & 45,693.3 &
- 2,807.0 & 3,678.0 & \color[HTML]{FF5933} \bfseries SELL \\
- \cline{1-11}
- \bottomrule
- \end{tabular}
- \end{table}
-
- .. figure:: ../../_static/style/latex_stocks.png
- """
- obj = self._copy(deepcopy=True) # manipulate table_styles on obj, not self
-
- table_selectors = (
- [style["selector"] for style in self.table_styles]
- if self.table_styles is not None
- else []
- )
-
- if column_format is not None:
- # add more recent setting to table_styles
- obj.set_table_styles(
- [{"selector": "column_format", "props": f":{column_format}"}],
- overwrite=False,
- )
- elif "column_format" in table_selectors:
- pass # adopt what has been previously set in table_styles
- else:
- # create a default: set float, complex, int cols to 'r' ('S'), index to 'l'
- _original_columns = self.data.columns
- self.data.columns = RangeIndex(stop=len(self.data.columns))
- numeric_cols = self.data._get_numeric_data().columns.to_list()
- self.data.columns = _original_columns
- column_format = ""
- for level in range(self.index.nlevels):
- column_format += "" if self.hide_index_[level] else "l"
- for ci, _ in enumerate(self.data.columns):
- if ci not in self.hidden_columns:
- column_format += (
- ("r" if not siunitx else "S") if ci in numeric_cols else "l"
- )
- obj.set_table_styles(
- [{"selector": "column_format", "props": f":{column_format}"}],
- overwrite=False,
- )
-
- if position:
- obj.set_table_styles(
- [{"selector": "position", "props": f":{position}"}],
- overwrite=False,
- )
-
- if position_float:
- if environment == "longtable":
- raise ValueError(
- "`position_float` cannot be used in 'longtable' `environment`"
- )
- if position_float not in ["raggedright", "raggedleft", "centering"]:
- raise ValueError(
- f"`position_float` should be one of "
- f"'raggedright', 'raggedleft', 'centering', "
- f"got: '{position_float}'"
- )
- obj.set_table_styles(
- [{"selector": "position_float", "props": f":{position_float}"}],
- overwrite=False,
- )
-
- hrules = get_option("styler.latex.hrules") if hrules is None else hrules
- if hrules:
- obj.set_table_styles(
- [
- {"selector": "toprule", "props": ":toprule"},
- {"selector": "midrule", "props": ":midrule"},
- {"selector": "bottomrule", "props": ":bottomrule"},
- ],
- overwrite=False,
- )
-
- if label:
- obj.set_table_styles(
- [{"selector": "label", "props": f":{{{label.replace(':', '§')}}}"}],
- overwrite=False,
- )
-
- if caption:
- obj.set_caption(caption)
-
- if sparse_index is None:
- sparse_index = get_option("styler.sparse.index")
- if sparse_columns is None:
- sparse_columns = get_option("styler.sparse.columns")
- environment = environment or get_option("styler.latex.environment")
- multicol_align = multicol_align or get_option("styler.latex.multicol_align")
- multirow_align = multirow_align or get_option("styler.latex.multirow_align")
- latex = obj._render_latex(
- sparse_index=sparse_index,
- sparse_columns=sparse_columns,
- multirow_align=multirow_align,
- multicol_align=multicol_align,
- environment=environment,
- convert_css=convert_css,
- siunitx=siunitx,
- clines=clines,
- )
-
- encoding = (
- (encoding or get_option("styler.render.encoding"))
- if isinstance(buf, str) # i.e. a filepath
- else encoding
- )
- return save_to_buffer(latex, buf=buf, encoding=encoding)
-
- @overload
- def to_html(
- self,
- buf: FilePath | WriteBuffer[str],
- *,
- table_uuid: str | None = ...,
- table_attributes: str | None = ...,
- sparse_index: bool | None = ...,
- sparse_columns: bool | None = ...,
- bold_headers: bool = ...,
- caption: str | None = ...,
- max_rows: int | None = ...,
- max_columns: int | None = ...,
- encoding: str | None = ...,
- doctype_html: bool = ...,
- exclude_styles: bool = ...,
- **kwargs,
- ) -> None:
- ...
-
- @overload
- def to_html(
- self,
- buf: None = ...,
- *,
- table_uuid: str | None = ...,
- table_attributes: str | None = ...,
- sparse_index: bool | None = ...,
- sparse_columns: bool | None = ...,
- bold_headers: bool = ...,
- caption: str | None = ...,
- max_rows: int | None = ...,
- max_columns: int | None = ...,
- encoding: str | None = ...,
- doctype_html: bool = ...,
- exclude_styles: bool = ...,
- **kwargs,
- ) -> str:
- ...
-
- @Substitution(buf=buffering_args, encoding=encoding_args)
- def to_html(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- *,
- table_uuid: str | None = None,
- table_attributes: str | None = None,
- sparse_index: bool | None = None,
- sparse_columns: bool | None = None,
- bold_headers: bool = False,
- caption: str | None = None,
- max_rows: int | None = None,
- max_columns: int | None = None,
- encoding: str | None = None,
- doctype_html: bool = False,
- exclude_styles: bool = False,
- **kwargs,
- ) -> str | None:
- """
- Write Styler to a file, buffer or string in HTML-CSS format.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- %(buf)s
- table_uuid : str, optional
- Id attribute assigned to the <table> HTML element in the format:
-
- ``<table id="T_<table_uuid>" ..>``
-
- If not given uses Styler's initially assigned value.
- table_attributes : str, optional
- Attributes to assign within the `<table>` HTML element in the format:
-
- ``<table .. <table_attributes> >``
-
- If not given defaults to Styler's preexisting value.
- sparse_index : bool, optional
- Whether to sparsify the display of a hierarchical index. Setting to False
- will display each explicit level element in a hierarchical key for each row.
- Defaults to ``pandas.options.styler.sparse.index`` value.
-
- .. versionadded:: 1.4.0
- sparse_columns : bool, optional
- Whether to sparsify the display of a hierarchical index. Setting to False
- will display each explicit level element in a hierarchical key for each
- column. Defaults to ``pandas.options.styler.sparse.columns`` value.
-
- .. versionadded:: 1.4.0
- bold_headers : bool, optional
- Adds "font-weight: bold;" as a CSS property to table style header cells.
-
- .. versionadded:: 1.4.0
- caption : str, optional
- Set, or overwrite, the caption on Styler before rendering.
-
- .. versionadded:: 1.4.0
- max_rows : int, optional
- The maximum number of rows that will be rendered. Defaults to
- ``pandas.options.styler.render.max_rows``, which is None.
-
- .. versionadded:: 1.4.0
- max_columns : int, optional
- The maximum number of columns that will be rendered. Defaults to
- ``pandas.options.styler.render.max_columns``, which is None.
-
- Rows and columns may be reduced if the number of total elements is
- large. This value is set to ``pandas.options.styler.render.max_elements``,
- which is 262144 (18 bit browser rendering).
-
- .. versionadded:: 1.4.0
- %(encoding)s
- doctype_html : bool, default False
- Whether to output a fully structured HTML file including all
- HTML elements, or just the core ``<style>`` and ``<table>`` elements.
- exclude_styles : bool, default False
- Whether to include the ``<style>`` element and all associated element
- ``class`` and ``id`` identifiers, or solely the ``<table>`` element without
- styling identifiers.
- **kwargs
- Any additional keyword arguments are passed through to the jinja2
- ``self.template.render`` process. This is useful when you need to provide
- additional variables for a custom template.
-
- Returns
- -------
- str or None
- If `buf` is None, returns the result as a string. Otherwise returns `None`.
-
- See Also
- --------
- DataFrame.to_html: Write a DataFrame to a file, buffer or string in HTML format.
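-
- Examples
- --------
- A minimal usage sketch; the small DataFrame and the output path below are
- assumed purely for illustration:
-
- >>> df = pd.DataFrame({"A": [1, 2]})
- >>> df.style.set_caption("Example table").to_html()  # doctest: +SKIP
- >>> df.style.to_html("styled.html", doctype_html=True)  # doctest: +SKIP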
- """
- obj = self._copy(deepcopy=True) # manipulate table_styles on obj, not self
-
- if table_uuid:
- obj.set_uuid(table_uuid)
-
- if table_attributes:
- obj.set_table_attributes(table_attributes)
-
- if sparse_index is None:
- sparse_index = get_option("styler.sparse.index")
- if sparse_columns is None:
- sparse_columns = get_option("styler.sparse.columns")
-
- if bold_headers:
- obj.set_table_styles(
- [{"selector": "th", "props": "font-weight: bold;"}], overwrite=False
- )
-
- if caption is not None:
- obj.set_caption(caption)
-
- # Build HTML string..
- html = obj._render_html(
- sparse_index=sparse_index,
- sparse_columns=sparse_columns,
- max_rows=max_rows,
- max_cols=max_columns,
- exclude_styles=exclude_styles,
- encoding=encoding or get_option("styler.render.encoding"),
- doctype_html=doctype_html,
- **kwargs,
- )
-
- return save_to_buffer(
- html, buf=buf, encoding=(encoding if buf is not None else None)
- )
-
- @overload
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str],
- *,
- encoding=...,
- sparse_index: bool | None = ...,
- sparse_columns: bool | None = ...,
- max_rows: int | None = ...,
- max_columns: int | None = ...,
- delimiter: str = ...,
- ) -> None:
- ...
-
- @overload
- def to_string(
- self,
- buf: None = ...,
- *,
- encoding=...,
- sparse_index: bool | None = ...,
- sparse_columns: bool | None = ...,
- max_rows: int | None = ...,
- max_columns: int | None = ...,
- delimiter: str = ...,
- ) -> str:
- ...
-
- @Substitution(buf=buffering_args, encoding=encoding_args)
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- *,
- encoding=None,
- sparse_index: bool | None = None,
- sparse_columns: bool | None = None,
- max_rows: int | None = None,
- max_columns: int | None = None,
- delimiter: str = " ",
- ) -> str | None:
- """
- Write Styler to a file, buffer or string in text format.
-
- .. versionadded:: 1.5.0
-
- Parameters
- ----------
- %(buf)s
- %(encoding)s
- sparse_index : bool, optional
- Whether to sparsify the display of a hierarchical index. Setting to False
- will display each explicit level element in a hierarchical key for each row.
- Defaults to ``pandas.options.styler.sparse.index`` value.
- sparse_columns : bool, optional
- Whether to sparsify the display of a hierarchical index. Setting to False
- will display each explicit level element in a hierarchical key for each
- column. Defaults to ``pandas.options.styler.sparse.columns`` value.
- max_rows : int, optional
- The maximum number of rows that will be rendered. Defaults to
- ``pandas.options.styler.render.max_rows``, which is None.
- max_columns : int, optional
- The maximum number of columns that will be rendered. Defaults to
- ``pandas.options.styler.render.max_columns``, which is None.
-
- Rows and columns may be reduced if the number of total elements is
- large. This value is set to ``pandas.options.styler.render.max_elements``,
- which is 262144 (18 bit browser rendering).
- delimiter : str, default single space
- The separator between data elements.
-
- Returns
- -------
- str or None
- If `buf` is None, returns the result as a string. Otherwise returns `None`.
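-
- Examples
- --------
- A minimal usage sketch; the small DataFrame below is assumed purely for
- illustration:
-
- >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
- >>> df.style.to_string()  # doctest: +SKIP
- >>> df.style.to_string(delimiter=",")  # doctest: +SKIP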
- """
- obj = self._copy(deepcopy=True)
-
- if sparse_index is None:
- sparse_index = get_option("styler.sparse.index")
- if sparse_columns is None:
- sparse_columns = get_option("styler.sparse.columns")
-
- text = obj._render_string(
- sparse_columns=sparse_columns,
- sparse_index=sparse_index,
- max_rows=max_rows,
- max_cols=max_columns,
- delimiter=delimiter,
- )
- return save_to_buffer(
- text, buf=buf, encoding=(encoding if buf is not None else None)
- )
-
- def set_td_classes(self, classes: DataFrame) -> Styler:
- """
- Set the ``class`` attribute of ``<td>`` HTML elements.
-
- Parameters
- ----------
- classes : DataFrame
- DataFrame containing strings that will be translated to CSS classes,
- mapped by identical column and index key values that must exist on the
- underlying Styler data. None, NaN values, and empty strings will
- be ignored and not affect the rendered HTML.
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.set_table_styles: Set the table styles included within the ``<style>``
- HTML element.
- Styler.set_table_attributes: Set the table attributes added to the ``<table>``
- HTML element.
-
- Notes
- -----
- Can be used in combination with ``Styler.set_table_styles`` to define an
- internal CSS solution without reference to external CSS files.
-
- Examples
- --------
- >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
- >>> classes = pd.DataFrame([
- ... ["min-val red", "", "blue"],
- ... ["red", None, "blue max-val"]
- ... ], index=df.index, columns=df.columns)
- >>> df.style.set_td_classes(classes) # doctest: +SKIP
-
- Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the
- underlying data:
-
- >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"],
- ... columns=[["level0", "level0"], ["level1a", "level1b"]])
- >>> classes = pd.DataFrame(["min-val"], index=["a"],
- ... columns=[["level0"],["level1a"]])
- >>> df.style.set_td_classes(classes) # doctest: +SKIP
-
- The form of the output with the new additional CSS classes:
-
- >>> df = pd.DataFrame([[1]])
- >>> css = pd.DataFrame([["other-class"]])
- >>> s = Styler(df, uuid="_", cell_ids=False).set_td_classes(css)
- >>> s.hide(axis=0).to_html() # doctest: +SKIP
- '<style type="text/css"></style>'
- '<table id="T__">'
- ' <thead>'
- ' <tr><th class="col_heading level0 col0" >0</th></tr>'
- ' </thead>'
- ' <tbody>'
- ' <tr><td class="data row0 col0 other-class" >1</td></tr>'
- ' </tbody>'
- '</table>'
- """
- if not classes.index.is_unique or not classes.columns.is_unique:
- raise KeyError(
- "Classes render only if `classes` has unique index and columns."
- )
- classes = classes.reindex_like(self.data)
-
- for r, row_tup in enumerate(classes.itertuples()):
- for c, value in enumerate(row_tup[1:]):
- if not (pd.isna(value) or value == ""):
- self.cell_context[(r, c)] = str(value)
-
- return self
-
- def _update_ctx(self, attrs: DataFrame) -> None:
- """
- Update the state of the ``Styler`` for data cells.
-
- Collects a mapping of {index_label: [('<property>', '<value>'), ..]}.
-
- Parameters
- ----------
- attrs : DataFrame
- Should contain strings of '<property>: <value>;<prop2>: <val2>'.
- Whitespace and a final trailing ';' are ignored.
- """
- if not self.index.is_unique or not self.columns.is_unique:
- raise KeyError(
- "`Styler.apply` and `.applymap` are not compatible "
- "with non-unique index or columns."
- )
-
- for cn in attrs.columns:
- j = self.columns.get_loc(cn)
- ser = attrs[cn]
- for rn, c in ser.items():
- if not c or pd.isna(c):
- continue
- css_list = maybe_convert_css_to_tuples(c)
- i = self.index.get_loc(rn)
- self.ctx[(i, j)].extend(css_list)
-
- def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None:
- """
- Update the state of the ``Styler`` for header cells.
-
- Collects a mapping of {index_label: [('<property>', '<value>'), ..]}.
-
- Parameters
- ----------
- attrs : DataFrame
- Should contain strings of '<property>: <value>;<prop2>: <val2>', and an
- integer index.
- Whitespace and a final trailing ';' are ignored.
- axis : int
- Identifies whether the ctx object being updated is the index or columns
- """
- for j in attrs.columns:
- ser = attrs[j]
- for i, c in ser.items():
- if not c:
- continue
- css_list = maybe_convert_css_to_tuples(c)
- if axis == 0:
- self.ctx_index[(i, j)].extend(css_list)
- else:
- self.ctx_columns[(j, i)].extend(css_list)
-
- def _copy(self, deepcopy: bool = False) -> Styler:
- """
- Copies a Styler, allowing for deepcopy or shallow copy.
-
- Copying a Styler aims to recreate a new Styler object which contains the same
- data and styles as the original.
-
- Data dependent attributes [copied and NOT exported]:
- - formatting (._display_funcs)
- - hidden index values or column values (.hidden_rows, .hidden_columns)
- - tooltips
- - cell_context (cell css classes)
- - ctx (cell css styles)
- - caption
- - concatenated stylers
-
- Non-data dependent attributes [copied and exported]:
- - css
- - hidden index state and hidden columns state (.hide_index_, .hide_columns_)
- - table_attributes
- - table_styles
- - applied styles (_todo)
-
- """
- # GH 40675
- styler = Styler(
- self.data, # populates attributes 'data', 'columns', 'index' as shallow
- )
- shallow = [ # simple string or boolean immutables
- "hide_index_",
- "hide_columns_",
- "hide_column_names",
- "hide_index_names",
- "table_attributes",
- "cell_ids",
- "caption",
- "uuid",
- "uuid_len",
- "template_latex", # also copy templates if these have been customised
- "template_html_style",
- "template_html_table",
- "template_html",
- ]
- deep = [ # nested lists or dicts
- "css",
- "concatenated",
- "_display_funcs",
- "_display_funcs_index",
- "_display_funcs_columns",
- "hidden_rows",
- "hidden_columns",
- "ctx",
- "ctx_index",
- "ctx_columns",
- "cell_context",
- "_todo",
- "table_styles",
- "tooltips",
- ]
-
- for attr in shallow:
- setattr(styler, attr, getattr(self, attr))
-
- for attr in deep:
- val = getattr(self, attr)
- setattr(styler, attr, copy.deepcopy(val) if deepcopy else val)
-
- return styler
-
- def __copy__(self) -> Styler:
- return self._copy(deepcopy=False)
-
- def __deepcopy__(self, memo) -> Styler:
- return self._copy(deepcopy=True)
-
- def clear(self) -> None:
- """
- Reset the ``Styler``, removing any previously applied styles.
-
- Returns None.
- """
- # create default GH 40675
- clean_copy = Styler(self.data, uuid=self.uuid)
- clean_attrs = [a for a in clean_copy.__dict__ if not callable(a)]
- self_attrs = [a for a in self.__dict__ if not callable(a)] # maybe more attrs
- for attr in clean_attrs:
- setattr(self, attr, getattr(clean_copy, attr))
- for attr in set(self_attrs).difference(clean_attrs):
- delattr(self, attr)
-
- def _apply(
- self,
- func: Callable,
- axis: Axis | None = 0,
- subset: Subset | None = None,
- **kwargs,
- ) -> Styler:
- subset = slice(None) if subset is None else subset
- subset = non_reducing_slice(subset)
- data = self.data.loc[subset]
- if data.empty:
- result = DataFrame()
- elif axis is None:
- result = func(data, **kwargs)
- if not isinstance(result, DataFrame):
- if not isinstance(result, np.ndarray):
- raise TypeError(
- f"Function {repr(func)} must return a DataFrame or ndarray "
- f"when passed to `Styler.apply` with axis=None"
- )
- if data.shape != result.shape:
- raise ValueError(
- f"Function {repr(func)} returned ndarray with wrong shape.\n"
- f"Result has shape: {result.shape}\n"
- f"Expected shape: {data.shape}"
- )
- result = DataFrame(result, index=data.index, columns=data.columns)
- else:
- axis = self.data._get_axis_number(axis)
- if axis == 0:
- result = data.apply(func, axis=0, **kwargs)
- else:
- result = data.T.apply(func, axis=0, **kwargs).T # see GH 42005
-
- if isinstance(result, Series):
- raise ValueError(
- f"Function {repr(func)} resulted in the apply method collapsing to a "
- f"Series.\nUsually, this is the result of the function returning a "
- f"single value, instead of list-like."
- )
- msg = (
- f"Function {repr(func)} created invalid {{0}} labels.\nUsually, this is "
- f"the result of the function returning a "
- f"{'Series' if axis is not None else 'DataFrame'} which contains invalid "
- f"labels, or returning an incorrectly shaped, list-like object which "
- f"cannot be mapped to labels, possibly due to applying the function along "
- f"the wrong axis.\n"
- f"Result {{0}} has shape: {{1}}\n"
- f"Expected {{0}} shape: {{2}}"
- )
- if not all(result.index.isin(data.index)):
- raise ValueError(msg.format("index", result.index.shape, data.index.shape))
- if not all(result.columns.isin(data.columns)):
- raise ValueError(
- msg.format("columns", result.columns.shape, data.columns.shape)
- )
- self._update_ctx(result)
- return self
-
- @Substitution(subset=subset_args)
- def apply(
- self,
- func: Callable,
- axis: Axis | None = 0,
- subset: Subset | None = None,
- **kwargs,
- ) -> Styler:
- """
- Apply a CSS-styling function column-wise, row-wise, or table-wise.
-
- Updates the HTML representation with the result.
-
- Parameters
- ----------
- func : function
- ``func`` should take a Series if ``axis`` is in [0, 1] and return a list-like
- object of the same length, or a Series, not necessarily of the same length,
- with valid index labels considering ``subset``.
- ``func`` should take a DataFrame if ``axis`` is ``None`` and return either
- an ndarray with the same shape or a DataFrame, not necessarily of the same
- shape, with valid index and column labels considering ``subset``.
-
- .. versionchanged:: 1.3.0
-
- .. versionchanged:: 1.4.0
-
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Apply to each column (``axis=0`` or ``'index'``), to each row
- (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
- with ``axis=None``.
- %(subset)s
- **kwargs : dict
- Pass along to ``func``.
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.applymap_index: Apply a CSS-styling function to headers elementwise.
- Styler.apply_index: Apply a CSS-styling function to headers level-wise.
- Styler.applymap: Apply a CSS-styling function elementwise.
-
- Notes
- -----
- The elements of the output of ``func`` should be CSS styles as strings, in the
- format 'attribute: value; attribute2: value2; ...' or,
- if nothing is to be applied to that element, an empty string or ``None``.
-
- This is similar to ``DataFrame.apply``, except that ``axis=None``
- applies the function to the entire DataFrame at once,
- rather than column-wise or row-wise.
-
- Examples
- --------
- >>> def highlight_max(x, color):
- ... return np.where(x == np.nanmax(x.to_numpy()), f"color: {color};", None)
- >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"])
- >>> df.style.apply(highlight_max, color='red') # doctest: +SKIP
- >>> df.style.apply(highlight_max, color='blue', axis=1) # doctest: +SKIP
- >>> df.style.apply(highlight_max, color='green', axis=None) # doctest: +SKIP
-
- Using ``subset`` to restrict application to a single column or multiple columns
-
- >>> df.style.apply(highlight_max, color='red', subset="A")
- ... # doctest: +SKIP
- >>> df.style.apply(highlight_max, color='red', subset=["A", "B"])
- ... # doctest: +SKIP
-
- Using a 2d input to ``subset`` to select rows in addition to columns
-
- >>> df.style.apply(highlight_max, color='red', subset=([0,1,2], slice(None)))
- ... # doctest: +SKIP
- >>> df.style.apply(highlight_max, color='red', subset=(slice(0,5,2), "A"))
- ... # doctest: +SKIP
-
- Using a function which returns a Series / DataFrame of unequal length but
- containing valid index labels
-
- >>> df = pd.DataFrame([[1, 2], [3, 4], [4, 6]], index=["A1", "A2", "Total"])
- >>> total_style = pd.Series("font-weight: bold;", index=["Total"])
- >>> df.style.apply(lambda s: total_style) # doctest: +SKIP
-
- See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for
- more details.
- """
- self._todo.append(
- (lambda instance: getattr(instance, "_apply"), (func, axis, subset), kwargs)
- )
- return self
-
- def _apply_index(
- self,
- func: Callable,
- axis: Axis = 0,
- level: Level | list[Level] | None = None,
- method: str = "apply",
- **kwargs,
- ) -> Styler:
- axis = self.data._get_axis_number(axis)
- obj = self.index if axis == 0 else self.columns
-
- levels_ = refactor_levels(level, obj)
- data = DataFrame(obj.to_list()).loc[:, levels_]
-
- if method == "apply":
- result = data.apply(func, axis=0, **kwargs)
- elif method == "applymap":
- result = data.applymap(func, **kwargs)
-
- self._update_ctx_header(result, axis)
- return self
-
- @doc(
- this="apply",
- wise="level-wise",
- alt="applymap",
- altwise="elementwise",
- func="take a Series and return a string array of the same length",
- input_note="the index as a Series, if an Index, or a level of a MultiIndex",
- output_note="an identically sized array of CSS styles as strings",
- var="s",
- ret='np.where(s == "B", "background-color: yellow;", "")',
- ret2='["background-color: yellow;" if "x" in v else "" for v in s]',
- )
- def apply_index(
- self,
- func: Callable,
- axis: AxisInt | str = 0,
- level: Level | list[Level] | None = None,
- **kwargs,
- ) -> Styler:
- """
- Apply a CSS-styling function to the index or column headers, {wise}.
-
- Updates the HTML representation with the result.
-
- .. versionadded:: 1.4.0
-
- Parameters
- ----------
- func : function
- ``func`` should {func}.
- axis : {{0, 1, "index", "columns"}}
- The headers over which to apply the function.
- level : int, str, list, optional
- If index is MultiIndex the level(s) over which to apply the function.
- **kwargs : dict
- Pass along to ``func``.
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.{alt}_index: Apply a CSS-styling function to headers {altwise}.
- Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise.
- Styler.applymap: Apply a CSS-styling function elementwise.
-
- Notes
- -----
- Each input to ``func`` will be {input_note}. The output of ``func`` should be
- {output_note}, in the format 'attribute: value; attribute2: value2; ...'
- or, if nothing is to be applied to that element, an empty string or ``None``.
-
- Examples
- --------
- Basic usage to conditionally highlight values in the index.
-
- >>> df = pd.DataFrame([[1,2], [3,4]], index=["A", "B"])
- >>> def color_b(s):
- ... return {ret}
- >>> df.style.{this}_index(color_b) # doctest: +SKIP
-
- .. figure:: ../../_static/style/appmaphead1.png
-
- Selectively applying to specific levels of MultiIndex columns.
-
- >>> midx = pd.MultiIndex.from_product([['ix', 'jy'], [0, 1], ['x3', 'z4']])
- >>> df = pd.DataFrame([np.arange(8)], columns=midx)
- >>> def highlight_x({var}):
- ... return {ret2}
- >>> df.style.{this}_index(highlight_x, axis="columns", level=[0, 2])
- ... # doctest: +SKIP
-
- .. figure:: ../../_static/style/appmaphead2.png
- """
- self._todo.append(
- (
- lambda instance: getattr(instance, "_apply_index"),
- (func, axis, level, "apply"),
- kwargs,
- )
- )
- return self
-
- @doc(
- apply_index,
- this="applymap",
- wise="elementwise",
- alt="apply",
- altwise="level-wise",
- func="take a scalar and return a string",
- input_note="an index value, if an Index, or a level value of a MultiIndex",
- output_note="CSS styles as a string",
- var="v",
- ret='"background-color: yellow;" if v == "B" else None',
- ret2='"background-color: yellow;" if "x" in v else None',
- )
- def applymap_index(
- self,
- func: Callable,
- axis: AxisInt | str = 0,
- level: Level | list[Level] | None = None,
- **kwargs,
- ) -> Styler:
- self._todo.append(
- (
- lambda instance: getattr(instance, "_apply_index"),
- (func, axis, level, "applymap"),
- kwargs,
- )
- )
- return self
-
- def _applymap(
- self, func: Callable, subset: Subset | None = None, **kwargs
- ) -> Styler:
- func = partial(func, **kwargs)  # bind kwargs here; they are not forwarded to applymap below
- if subset is None:
- subset = IndexSlice[:]
- subset = non_reducing_slice(subset)
- result = self.data.loc[subset].applymap(func)
- self._update_ctx(result)
- return self
-
- @Substitution(subset=subset_args)
- def applymap(
- self, func: Callable, subset: Subset | None = None, **kwargs
- ) -> Styler:
- """
- Apply a CSS-styling function elementwise.
-
- Updates the HTML representation with the result.
-
- Parameters
- ----------
- func : function
- ``func`` should take a scalar and return a string.
- %(subset)s
- **kwargs : dict
- Pass along to ``func``.
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.applymap_index: Apply a CSS-styling function to headers elementwise.
- Styler.apply_index: Apply a CSS-styling function to headers level-wise.
- Styler.apply: Apply a CSS-styling function column-wise, row-wise, or table-wise.
-
- Notes
- -----
- The elements of the output of ``func`` should be CSS styles as strings, in the
- format 'attribute: value; attribute2: value2; ...' or,
- if nothing is to be applied to that element, an empty string or ``None``.
-
- Examples
- --------
- >>> def color_negative(v, color):
- ... return f"color: {color};" if v < 0 else None
- >>> df = pd.DataFrame(np.random.randn(5, 2), columns=["A", "B"])
- >>> df.style.applymap(color_negative, color='red') # doctest: +SKIP
-
- Using ``subset`` to restrict application to a single column or multiple columns
-
- >>> df.style.applymap(color_negative, color='red', subset="A")
- ... # doctest: +SKIP
- >>> df.style.applymap(color_negative, color='red', subset=["A", "B"])
- ... # doctest: +SKIP
-
- Using a 2d input to ``subset`` to select rows in addition to columns
-
- >>> df.style.applymap(color_negative, color='red',
- ... subset=([0,1,2], slice(None))) # doctest: +SKIP
- >>> df.style.applymap(color_negative, color='red', subset=(slice(0,5,2), "A"))
- ... # doctest: +SKIP
-
- See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for
- more details.
- """
- self._todo.append(
- (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs)
- )
- return self
-
- def set_table_attributes(self, attributes: str) -> Styler:
- """
- Set the table attributes added to the ``<table>`` HTML element.
-
- These are items in addition to the automatic (by default) ``id`` attribute.
-
- Parameters
- ----------
- attributes : str
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.set_table_styles: Set the table styles included within the ``<style>``
- HTML element.
- Styler.set_td_classes: Set the DataFrame of strings added to the ``class``
- attribute of ``<td>`` HTML elements.
-
- Examples
- --------
- >>> df = pd.DataFrame(np.random.randn(10, 4))
- >>> df.style.set_table_attributes('class="pure-table"') # doctest: +SKIP
- # ... <table class="pure-table"> ...
- """
- self.table_attributes = attributes
- return self
-
- def export(self) -> dict[str, Any]:
- """
- Export the styles applied to the current Styler.
-
- Can be applied to a second Styler with ``Styler.use``.
-
- Returns
- -------
- dict
-
- See Also
- --------
- Styler.use: Set the styles on the current Styler.
- Styler.copy: Create a copy of the current Styler.
-
- Notes
- -----
- This method is designed to copy non-data dependent attributes of
- one Styler to another. It differs from ``Styler.copy`` where data and
- data dependent attributes are also copied.
-
- The following items are exported since they are not generally data dependent:
-
- - Styling functions added by the ``apply`` and ``applymap``
- - Whether axes and names are hidden from the display, if unambiguous.
- - Table attributes
- - Table styles
-
- The following attributes are considered data dependent and therefore not
- exported:
-
- - Caption
- - UUID
- - Tooltips
- - Any hidden rows or columns identified by Index labels
- - Any formatting applied using ``Styler.format``
- - Any CSS classes added using ``Styler.set_td_classes``
-
- Examples
- --------
-
- >>> styler = DataFrame([[1, 2], [3, 4]]).style
- >>> styler2 = DataFrame([[9, 9, 9]]).style
- >>> styler.hide(axis=0).highlight_max(axis=1) # doctest: +SKIP
- >>> export = styler.export()
- >>> styler2.use(export) # doctest: +SKIP
- """
- return {
- "apply": copy.copy(self._todo),
- "table_attributes": self.table_attributes,
- "table_styles": copy.copy(self.table_styles),
- "hide_index": all(self.hide_index_),
- "hide_columns": all(self.hide_columns_),
- "hide_index_names": self.hide_index_names,
- "hide_column_names": self.hide_column_names,
- "css": copy.copy(self.css),
- }
-
- def use(self, styles: dict[str, Any]) -> Styler:
- """
- Set the styles on the current Styler.
-
- Possibly uses styles from ``Styler.export``.
-
- Parameters
- ----------
- styles : dict(str, Any)
- Dict of attributes to add to the Styler. Dict keys should contain only:
- - "apply": list of styler functions, typically added with ``apply`` or
- ``applymap``.
- - "table_attributes": HTML attributes, typically added with
- ``set_table_attributes``.
- - "table_styles": CSS selectors and properties, typically added with
- ``set_table_styles``.
- - "hide_index": whether the index is hidden, typically added with
- ``hide_index``, or a boolean list for hidden levels.
- - "hide_columns": whether column headers are hidden, typically added with
- ``hide_columns``, or a boolean list for hidden levels.
- - "hide_index_names": whether index names are hidden.
- - "hide_column_names": whether column header names are hidden.
- - "css": the css class names used.
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.export : Export the non data dependent attributes to the current Styler.
-
- Examples
- --------
-
- >>> styler = DataFrame([[1, 2], [3, 4]]).style
- >>> styler2 = DataFrame([[9, 9, 9]]).style
- >>> styler.hide(axis=0).highlight_max(axis=1) # doctest: +SKIP
- >>> export = styler.export()
- >>> styler2.use(export) # doctest: +SKIP
- """
- self._todo.extend(styles.get("apply", []))
- table_attributes: str = self.table_attributes or ""
- obj_table_atts: str = (
- ""
- if styles.get("table_attributes") is None
- else str(styles.get("table_attributes"))
- )
- self.set_table_attributes((table_attributes + " " + obj_table_atts).strip())
- if styles.get("table_styles"):
- self.set_table_styles(styles.get("table_styles"), overwrite=False)
-
- for obj in ["index", "columns"]:
- hide_obj = styles.get("hide_" + obj)
- if hide_obj is not None:
- if isinstance(hide_obj, bool):
- n = getattr(self, obj).nlevels
- setattr(self, "hide_" + obj + "_", [hide_obj] * n)
- else:
- setattr(self, "hide_" + obj + "_", hide_obj)
-
- self.hide_index_names = styles.get("hide_index_names", False)
- self.hide_column_names = styles.get("hide_column_names", False)
- if styles.get("css"):
- self.css = styles.get("css") # type: ignore[assignment]
- return self
-
- def set_uuid(self, uuid: str) -> Styler:
- """
- Set the uuid applied to ``id`` attributes of HTML elements.
-
- Parameters
- ----------
- uuid : str
-
- Returns
- -------
- Styler
-
- Notes
- -----
- Almost all HTML elements within the table, including the ``<table>`` element
- itself, are assigned ``id`` attributes. The format is ``T_uuid_<extra>`` where
- ``<extra>`` is typically a more specific identifier, such as ``row1_col2``.
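-
- Examples
- --------
- A minimal usage sketch; the DataFrame and uuid value below are assumed
- purely for illustration:
-
- >>> df = pd.DataFrame([[1, 2]])
- >>> df.style.set_uuid("my_table").to_html()  # doctest: +SKIP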
- """
- self.uuid = uuid
- return self
-
- def set_caption(self, caption: str | tuple | list) -> Styler:
- """
- Set the text added to a ``<caption>`` HTML element.
-
- Parameters
- ----------
- caption : str, tuple, list
- For HTML output, either the string input is used or the first element of the
- tuple. For LaTeX, the string input provides a caption and the additional
- tuple input allows for full captions and short captions, in that order.
-
- Returns
- -------
- Styler
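-
- Examples
- --------
- A minimal usage sketch; the DataFrame and caption text below are assumed
- purely for illustration:
-
- >>> df = pd.DataFrame([[1, 2]])
- >>> df.style.set_caption("Summary table")  # doctest: +SKIP
- >>> df.style.set_caption(("Full caption", "Short caption")).to_latex()
- ... # doctest: +SKIP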
- """
- msg = "`caption` must be either a string or 2-tuple of strings."
- if isinstance(caption, (list, tuple)):
- if (
- len(caption) != 2
- or not isinstance(caption[0], str)
- or not isinstance(caption[1], str)
- ):
- raise ValueError(msg)
- elif not isinstance(caption, str):
- raise ValueError(msg)
- self.caption = caption
- return self
-
- def set_sticky(
- self,
- axis: Axis = 0,
- pixel_size: int | None = None,
- levels: Level | list[Level] | None = None,
- ) -> Styler:
- """
- Add CSS to permanently display the index or column headers in a scrolling frame.
-
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Whether to make the index or column headers sticky.
- pixel_size : int, optional
- Required to configure the width of index cells or the height of column
- header cells when sticking a MultiIndex (or with a named Index).
- Defaults to 75 and 25 respectively.
- levels : int, str, list, optional
- If ``axis`` is a MultiIndex the specific levels to stick. If ``None`` will
- stick all levels.
-
- Returns
- -------
- Styler
-
- Notes
- -----
- This method uses the CSS 'position: sticky;' property to display. It is
- designed to work with visible axes, therefore both:
-
- - `styler.set_sticky(axis="index").hide(axis="index")`
- - `styler.set_sticky(axis="columns").hide(axis="columns")`
-
- may produce strange behaviour due to CSS controls with missing elements.
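-
- Examples
- --------
- A minimal usage sketch; the random DataFrame below is assumed purely for
- illustration:
-
- >>> df = pd.DataFrame(np.random.randn(100, 4))
- >>> df.style.set_sticky(axis="index")  # doctest: +SKIP
- >>> df.style.set_sticky(axis="columns", pixel_size=30)  # doctest: +SKIP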
- """
- axis = self.data._get_axis_number(axis)
- obj = self.data.index if axis == 0 else self.data.columns
- pixel_size = (75 if axis == 0 else 25) if not pixel_size else pixel_size
-
- props = "position:sticky; background-color:inherit;"
- if not isinstance(obj, pd.MultiIndex):
- # handling MultiIndexes requires different CSS
-
- if axis == 1:
- # stick the first <tr> of <head> and, if index names, the second <tr>
- # if self._hide_columns then no <thead><tr> here will exist: no conflict
- styles: CSSStyles = [
- {
- "selector": "thead tr:nth-child(1) th",
- "props": props + "top:0px; z-index:2;",
- }
- ]
- if self.index.names[0] is not None:
- styles[0]["props"] = (
- props + f"top:0px; z-index:2; height:{pixel_size}px;"
- )
- styles.append(
- {
- "selector": "thead tr:nth-child(2) th",
- "props": props
- + f"top:{pixel_size}px; z-index:2; height:{pixel_size}px; ",
- }
- )
- else:
- # stick the first <th> of each <tr> in both <thead> and <tbody>
- # if self._hide_index then no <th> will exist in <tbody>: no conflict
- # but <th> will exist in <thead>: conflict with initial element
- styles = [
- {
- "selector": "thead tr th:nth-child(1)",
- "props": props + "left:0px; z-index:3 !important;",
- },
- {
- "selector": "tbody tr th:nth-child(1)",
- "props": props + "left:0px; z-index:1;",
- },
- ]
-
- else:
- # handle the MultiIndex case
- range_idx = list(range(obj.nlevels))
- levels_: list[int] = refactor_levels(levels, obj) if levels else range_idx
- levels_ = sorted(levels_)
-
- if axis == 1:
- styles = []
- for i, level in enumerate(levels_):
- styles.append(
- {
- "selector": f"thead tr:nth-child({level+1}) th",
- "props": props
- + (
- f"top:{i * pixel_size}px; height:{pixel_size}px; "
- "z-index:2;"
- ),
- }
- )
- if not all(name is None for name in self.index.names):
- styles.append(
- {
- "selector": f"thead tr:nth-child({obj.nlevels+1}) th",
- "props": props
- + (
- f"top:{(len(levels_)) * pixel_size}px; "
- f"height:{pixel_size}px; z-index:2;"
- ),
- }
- )
-
- else:
- styles = []
- for i, level in enumerate(levels_):
- props_ = props + (
- f"left:{i * pixel_size}px; "
- f"min-width:{pixel_size}px; "
- f"max-width:{pixel_size}px; "
- )
- styles.extend(
- [
- {
- "selector": f"thead tr th:nth-child({level+1})",
- "props": props_ + "z-index:3 !important;",
- },
- {
- "selector": f"tbody tr th.level{level}",
- "props": props_ + "z-index:1;",
- },
- ]
- )
-
- return self.set_table_styles(styles, overwrite=False)
-
- def set_table_styles(
- self,
- table_styles: dict[Any, CSSStyles] | CSSStyles | None = None,
- axis: AxisInt = 0,
- overwrite: bool = True,
- css_class_names: dict[str, str] | None = None,
- ) -> Styler:
- """
- Set the table styles included within the ``<style>`` HTML element.
-
- This function can be used to style the entire table, columns, rows or
- specific HTML selectors.
-
- Parameters
- ----------
- table_styles : list or dict
- If supplying a list, each individual table_style should be a
- dictionary with ``selector`` and ``props`` keys. ``selector``
- should be a CSS selector that the style will be applied to
- (automatically prefixed by the table's UUID) and ``props``
- should be a list of tuples with ``(attribute, value)``.
- If supplying a dict, the dict keys should correspond to
- column names or index values, depending upon the specified
- `axis` argument. These will be mapped to row or col CSS
- selectors. MultiIndex values as dict keys should be
- in their respective tuple form. The dict values should be
- a list as specified in the form with CSS selectors and
- props that will be applied to the specified row or column.
-
- .. versionchanged:: 1.2.0
-
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Apply to each column (``axis=0`` or ``'index'``), to each row
- (``axis=1`` or ``'columns'``). Only used if `table_styles` is
- dict.
-
- .. versionadded:: 1.2.0
-
- overwrite : bool, default True
- Styles are replaced if `True`, or extended if `False`. CSS
- rules are preserved so most recent styles set will dominate
- if selectors intersect.
-
- .. versionadded:: 1.2.0
-
- css_class_names : dict, optional
- A dict of strings used to replace the default CSS classes described below.
-
- .. versionadded:: 1.4.0
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.set_td_classes: Set the DataFrame of strings added to the ``class``
- attribute of ``<td>`` HTML elements.
- Styler.set_table_attributes: Set the table attributes added to the ``<table>``
- HTML element.
-
- Notes
- -----
- The default CSS classes dict, whose values can be replaced, is as follows:
-
- .. code-block:: python
-
- css_class_names = {"row_heading": "row_heading",
- "col_heading": "col_heading",
- "index_name": "index_name",
- "col": "col",
- "row": "row",
- "col_trim": "col_trim",
- "row_trim": "row_trim",
- "level": "level",
- "data": "data",
- "blank": "blank",
- "foot": "foot"}
-
- Examples
- --------
- >>> df = pd.DataFrame(np.random.randn(10, 4),
- ... columns=['A', 'B', 'C', 'D'])
- >>> df.style.set_table_styles(
- ... [{'selector': 'tr:hover',
- ... 'props': [('background-color', 'yellow')]}]
- ... ) # doctest: +SKIP
-
- Or with CSS strings
-
- >>> df.style.set_table_styles(
- ... [{'selector': 'tr:hover',
- ... 'props': 'background-color: yellow; font-size: 1em;'}]
- ... ) # doctest: +SKIP
-
- Adding column styling by name
-
- >>> df.style.set_table_styles({
- ... 'A': [{'selector': '',
- ... 'props': [('color', 'red')]}],
- ... 'B': [{'selector': 'td',
- ... 'props': 'color: blue;'}]
- ... }, overwrite=False) # doctest: +SKIP
-
- Adding row styling
-
- >>> df.style.set_table_styles({
- ... 0: [{'selector': 'td:hover',
- ... 'props': [('font-size', '25px')]}]
- ... }, axis=1, overwrite=False) # doctest: +SKIP
-
- See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for
- more details.
- """
- if css_class_names is not None:
- self.css = {**self.css, **css_class_names}
-
- if table_styles is None:
- return self
- elif isinstance(table_styles, dict):
- axis = self.data._get_axis_number(axis)
- obj = self.data.index if axis == 1 else self.data.columns
- idf = f".{self.css['row']}" if axis == 1 else f".{self.css['col']}"
-
- table_styles = [
- {
- "selector": str(s["selector"]) + idf + str(idx),
- "props": maybe_convert_css_to_tuples(s["props"]),
- }
- for key, styles in table_styles.items()
- for idx in obj.get_indexer_for([key])
- for s in format_table_styles(styles)
- ]
- else:
- table_styles = [
- {
- "selector": s["selector"],
- "props": maybe_convert_css_to_tuples(s["props"]),
- }
- for s in table_styles
- ]
-
- if not overwrite and self.table_styles is not None:
- self.table_styles.extend(table_styles)
- else:
- self.table_styles = table_styles
- return self
-
- def hide(
- self,
- subset: Subset | None = None,
- axis: Axis = 0,
- level: Level | list[Level] | None = None,
- names: bool = False,
- ) -> Styler:
- """
- Hide the entire index / column headers, or specific rows / columns from display.
-
- .. versionadded:: 1.4.0
-
- Parameters
- ----------
- subset : label, array-like, IndexSlice, optional
- A valid 1d input or single key along the axis within
- `DataFrame.loc[<subset>, :]` or `DataFrame.loc[:, <subset>]` depending
- upon ``axis``, to limit ``data`` to select hidden rows / columns.
- axis : {"index", 0, "columns", 1}
- Apply to the index or columns.
- level : int, str, list
- The level(s) to hide in a MultiIndex if hiding the entire index / column
- headers. Cannot be used simultaneously with ``subset``.
- names : bool
- Whether to hide the level name(s) of the index / column headers in the case
- that it (or at least one of the levels) remains visible.
-
- Returns
- -------
- Styler
-
- Notes
- -----
- .. warning::
- This method only works with the output methods ``to_html``, ``to_string``
- and ``to_latex``.
-
- Other output methods, including ``to_excel``, ignore this hiding method
- and will display all data.
-
- This method's behaviour varies depending upon the combination
- of the ``subset``, ``level`` and ``names`` arguments (see examples). The
- ``axis`` argument is used only to control whether the method is applied to row
- or column headers:
-
- .. list-table:: Argument combinations
- :widths: 10 20 10 60
- :header-rows: 1
-
- * - ``subset``
- - ``level``
- - ``names``
- - Effect
- * - None
- - None
- - False
- - The axis-Index is hidden entirely.
- * - None
- - None
- - True
- - Only the axis-Index names are hidden.
- * - None
- - Int, Str, List
- - False
- - Specified axis-MultiIndex levels are hidden entirely.
- * - None
- - Int, Str, List
- - True
- Specified axis-MultiIndex levels are hidden entirely, as are the names of
- the remaining axis-MultiIndex levels.
- * - Subset
- - None
- - False
- - The specified data rows/columns are hidden, but the axis-Index itself,
- and names, remain unchanged.
- * - Subset
- - None
- - True
- - The specified data rows/columns and axis-Index names are hidden, but
- the axis-Index itself remains unchanged.
- * - Subset
- - Int, Str, List
- - Boolean
- - ValueError: cannot supply ``subset`` and ``level`` simultaneously.
-
- Note that this method only hides the identified elements, so it can be chained
- to hide multiple elements in sequence.
-
- Examples
- --------
- Simple application hiding specific rows:
-
- >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"])
- >>> df.style.hide(["a", "b"]) # doctest: +SKIP
- 0 1
- c 5 6
-
- Hide the index and retain the data values:
-
- >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]])
- >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx)
- >>> df.style.format("{:.1f}").hide() # doctest: +SKIP
- x y
- a b c a b c
- 0.1 0.0 0.4 1.3 0.6 -1.4
- 0.7 1.0 1.3 1.5 -0.0 -0.2
- 1.4 -0.8 1.6 -0.2 -0.4 -0.3
- 0.4 1.0 -0.2 -0.8 -1.2 1.1
- -0.6 1.2 1.8 1.9 0.3 0.3
- 0.8 0.5 -0.3 1.2 2.2 -0.8
-
- Hide specific rows in a MultiIndex but retain the index:
-
- >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"]))
- ... # doctest: +SKIP
- x y
- a b c a b c
- x b 0.7 1.0 1.3 1.5 -0.0 -0.2
- y b -0.6 1.2 1.8 1.9 0.3 0.3
-
- Hide specific rows and the index through chaining:
-
- >>> df.style.format("{:.1f}").hide(subset=(slice(None), ["a", "c"])).hide()
- ... # doctest: +SKIP
- x y
- a b c a b c
- 0.7 1.0 1.3 1.5 -0.0 -0.2
- -0.6 1.2 1.8 1.9 0.3 0.3
-
- Hide a specific level:
-
- >>> df.style.format("{:,.1f}").hide(level=1) # doctest: +SKIP
- x y
- a b c a b c
- x 0.1 0.0 0.4 1.3 0.6 -1.4
- 0.7 1.0 1.3 1.5 -0.0 -0.2
- 1.4 -0.8 1.6 -0.2 -0.4 -0.3
- y 0.4 1.0 -0.2 -0.8 -1.2 1.1
- -0.6 1.2 1.8 1.9 0.3 0.3
- 0.8 0.5 -0.3 1.2 2.2 -0.8
-
- Hiding just the index level names:
-
- >>> df.index.names = ["lev0", "lev1"]
- >>> df.style.format("{:,.1f}").hide(names=True) # doctest: +SKIP
- x y
- a b c a b c
- x a 0.1 0.0 0.4 1.3 0.6 -1.4
- b 0.7 1.0 1.3 1.5 -0.0 -0.2
- c 1.4 -0.8 1.6 -0.2 -0.4 -0.3
- y a 0.4 1.0 -0.2 -0.8 -1.2 1.1
- b -0.6 1.2 1.8 1.9 0.3 0.3
- c 0.8 0.5 -0.3 1.2 2.2 -0.8
-
- Examples all produce equivalently transposed effects with ``axis="columns"``.
- """
- axis = self.data._get_axis_number(axis)
- if axis == 0:
- obj, objs, alt = "index", "index", "rows"
- else:
- obj, objs, alt = "column", "columns", "columns"
-
- if level is not None and subset is not None:
- raise ValueError("`subset` and `level` cannot be passed simultaneously")
-
- if subset is None:
- if level is None and names:
- # this combination implies the user keeps the index visible and hides just the names
- setattr(self, f"hide_{obj}_names", True)
- return self
-
- levels_ = refactor_levels(level, getattr(self, objs))
- setattr(
- self,
- f"hide_{objs}_",
- [lev in levels_ for lev in range(getattr(self, objs).nlevels)],
- )
- else:
- if axis == 0:
- subset_ = IndexSlice[subset, :] # new var so mypy reads not Optional
- else:
- subset_ = IndexSlice[:, subset] # new var so mypy reads not Optional
- subset = non_reducing_slice(subset_)
- hide = self.data.loc[subset]
- h_els = getattr(self, objs).get_indexer_for(getattr(hide, objs))
- setattr(self, f"hidden_{alt}", h_els)
-
- if names:
- setattr(self, f"hide_{obj}_names", True)
- return self
-
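# Minimal usage sketch, added for illustration: the main modes of Styler.hide()
# described above, assuming pandas >= 1.4; only to_html, to_string and to_latex
# honour the hiding.
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["x", "y", "z"])
df.index.name = "key"
rows_hidden = df.style.hide(subset=["x"], axis="index").to_html()   # hide row "x" only
index_hidden = df.style.hide(axis="index").to_html()                # hide the whole index
names_hidden = df.style.hide(axis="index", names=True).to_html()    # hide only the index name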
- # -----------------------------------------------------------------------
- # A collection of "builtin" styles
- # -----------------------------------------------------------------------
-
- def _get_numeric_subset_default(self):
- # Returns a boolean mask indicating where `self.data` has numerical columns.
- # Choosing a mask as opposed to the column names also works for
- # boolean column labels (GH47838).
- return self.data.columns.isin(self.data.select_dtypes(include=np.number))
-
- @doc(
- name="background",
- alt="text",
- image_prefix="bg",
- text_threshold="""text_color_threshold : float or int\n
- Luminance threshold for determining text color in [0, 1]. Facilitates text\n
- visibility across varying background colors. All text is dark if 0, and\n
- light if 1, defaults to 0.408.""",
- )
- @Substitution(subset=subset_args)
- def background_gradient(
- self,
- cmap: str | Colormap = "PuBu",
- low: float = 0,
- high: float = 0,
- axis: Axis | None = 0,
- subset: Subset | None = None,
- text_color_threshold: float = 0.408,
- vmin: float | None = None,
- vmax: float | None = None,
- gmap: Sequence | None = None,
- ) -> Styler:
- """
- Color the {name} in a gradient style.
-
- The {name} color is determined according
- to the data in each column, row or frame, or by a given
- gradient map. Requires matplotlib.
-
- Parameters
- ----------
- cmap : str or colormap
- Matplotlib colormap.
- low : float
- Compress the color range at the low end. This is a multiple of the data
- range to extend below the minimum; good values usually in [0, 1],
- defaults to 0.
- high : float
- Compress the color range at the high end. This is a multiple of the data
- range to extend above the maximum; good values usually in [0, 1],
- defaults to 0.
- axis : {{0, 1, "index", "columns", None}}, default 0
- Apply to each column (``axis=0`` or ``'index'``), to each row
- (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
- with ``axis=None``.
- %(subset)s
- {text_threshold}
- vmin : float, optional
- Minimum data value that corresponds to colormap minimum value.
- If not specified the minimum value of the data (or gmap) will be used.
- vmax : float, optional
- Maximum data value that corresponds to colormap maximum value.
- If not specified the maximum value of the data (or gmap) will be used.
- gmap : array-like, optional
- Gradient map for determining the {name} colors. If not supplied
- will use the underlying data from rows, columns or frame. If given as an
- ndarray or list-like must be an identical shape to the underlying data
- considering ``axis`` and ``subset``. If given as DataFrame or Series must
- have same index and column labels considering ``axis`` and ``subset``.
- If supplied, ``vmin`` and ``vmax`` should be given relative to this
- gradient map.
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.{alt}_gradient: Color the {alt} in a gradient style.
-
- Notes
- -----
- When using ``low`` and ``high``, the range of the gradient, given by the data
- (or by ``gmap`` if supplied), is effectively extended at the low end to
- `map.min - low * map.range` and at the high end to
- `map.max + high * map.range` before the colors are normalized and determined.
-
- If combining with ``vmin`` and ``vmax`` the `map.min`, `map.max` and
- `map.range` are replaced by values according to the values derived from
- ``vmin`` and ``vmax``.
-
- This method will preselect numeric columns and ignore non-numeric columns
- unless a ``gmap`` is supplied in which case no preselection occurs.
-
- Examples
- --------
- >>> df = pd.DataFrame(columns=["City", "Temp (c)", "Rain (mm)", "Wind (m/s)"],
- ... data=[["Stockholm", 21.6, 5.0, 3.2],
- ... ["Oslo", 22.4, 13.3, 3.1],
- ... ["Copenhagen", 24.5, 0.0, 6.7]])
-
- Shading the values column-wise, with ``axis=0``, preselecting numeric columns
-
- >>> df.style.{name}_gradient(axis=0) # doctest: +SKIP
-
- .. figure:: ../../_static/style/{image_prefix}_ax0.png
-
- Shading all values collectively using ``axis=None``
-
- >>> df.style.{name}_gradient(axis=None) # doctest: +SKIP
-
- .. figure:: ../../_static/style/{image_prefix}_axNone.png
-
- Compress the color map from both the ``low`` and ``high`` ends
-
- >>> df.style.{name}_gradient(axis=None, low=0.75, high=1.0) # doctest: +SKIP
-
- .. figure:: ../../_static/style/{image_prefix}_axNone_lowhigh.png
-
- Manually setting ``vmin`` and ``vmax`` gradient thresholds
-
- >>> df.style.{name}_gradient(axis=None, vmin=6.7, vmax=21.6) # doctest: +SKIP
-
- .. figure:: ../../_static/style/{image_prefix}_axNone_vminvmax.png
-
- Setting a ``gmap`` and applying to all columns with another ``cmap``
-
- >>> df.style.{name}_gradient(axis=0, gmap=df['Temp (c)'], cmap='YlOrRd')
- ... # doctest: +SKIP
-
- .. figure:: ../../_static/style/{image_prefix}_gmap.png
-
- When setting the gradient map for a dataframe (i.e. ``axis=None``), we need to
- state ``subset`` explicitly to match the ``gmap`` shape
-
- >>> gmap = np.array([[1,2,3], [2,3,4], [3,4,5]])
- >>> df.style.{name}_gradient(axis=None, gmap=gmap,
- ... cmap='YlOrRd', subset=['Temp (c)', 'Rain (mm)', 'Wind (m/s)']
- ... ) # doctest: +SKIP
-
- .. figure:: ../../_static/style/{image_prefix}_axNone_gmap.png
- """
- if subset is None and gmap is None:
- subset = self._get_numeric_subset_default()
-
- self.apply(
- _background_gradient,
- cmap=cmap,
- subset=subset,
- axis=axis,
- low=low,
- high=high,
- text_color_threshold=text_color_threshold,
- vmin=vmin,
- vmax=vmax,
- gmap=gmap,
- )
- return self
-
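# Minimal usage sketch, added for illustration: column-wise shading with
# background_gradient() as documented above. Requires matplotlib when the
# styles are rendered; data and column names are illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.default_rng(0).normal(size=(4, 3)), columns=list("abc"))
styled = df.style.background_gradient(cmap="PuBu", axis=0, vmin=-1, vmax=1)
html = styled.to_html()  # matplotlib is imported lazily at render time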
- @doc(
- background_gradient,
- name="text",
- alt="background",
- image_prefix="tg",
- text_threshold="",
- )
- def text_gradient(
- self,
- cmap: str | Colormap = "PuBu",
- low: float = 0,
- high: float = 0,
- axis: Axis | None = 0,
- subset: Subset | None = None,
- vmin: float | None = None,
- vmax: float | None = None,
- gmap: Sequence | None = None,
- ) -> Styler:
- if subset is None and gmap is None:
- subset = self._get_numeric_subset_default()
-
- return self.apply(
- _background_gradient,
- cmap=cmap,
- subset=subset,
- axis=axis,
- low=low,
- high=high,
- vmin=vmin,
- vmax=vmax,
- gmap=gmap,
- text_only=True,
- )
-
- @Substitution(subset=subset_args)
- def set_properties(self, subset: Subset | None = None, **kwargs) -> Styler:
- """
- Set defined CSS-properties to each ``<td>`` HTML element for the given subset.
-
- Parameters
- ----------
- %(subset)s
- **kwargs : dict
- A dictionary of property, value pairs to be set for each cell.
-
- Returns
- -------
- Styler
-
- Notes
- -----
- This is a convenience method which wraps :meth:`Styler.applymap`, calling a
- function that returns the CSS properties independently of the data.
-
- Examples
- --------
- >>> df = pd.DataFrame(np.random.randn(10, 4))
- >>> df.style.set_properties(color="white", align="right") # doctest: +SKIP
- >>> df.style.set_properties(**{'background-color': 'yellow'}) # doctest: +SKIP
-
- See `Table Visualization <../../user_guide/style.ipynb>`_ user guide for
- more details.
- """
- values = "".join([f"{p}: {v};" for p, v in kwargs.items()])
- return self.applymap(lambda x: values, subset=subset)
-
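# Minimal usage sketch, added for illustration: set_properties() applies the
# same fixed CSS to every cell of the optional subset; the property names below
# are plain CSS, not pandas API.
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
styled = df.style.set_properties(
    subset=["A"], **{"background-color": "yellow", "text-align": "right"}
)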
- @Substitution(subset=subset_args)
- def bar( # pylint: disable=disallowed-name
- self,
- subset: Subset | None = None,
- axis: Axis | None = 0,
- *,
- color: str | list | tuple | None = None,
- cmap: Any | None = None,
- width: float = 100,
- height: float = 100,
- align: str | float | Callable = "mid",
- vmin: float | None = None,
- vmax: float | None = None,
- props: str = "width: 10em;",
- ) -> Styler:
- """
- Draw bar chart in the cell backgrounds.
-
- .. versionchanged:: 1.4.0
-
- Parameters
- ----------
- %(subset)s
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Apply to each column (``axis=0`` or ``'index'``), to each row
- (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
- with ``axis=None``.
- color : str or 2-tuple/list
- If a str is passed, the color is the same for both
- negative and positive numbers. If a 2-tuple/list is used, the
- first element is the color for negative numbers and the second is the
- color for positive numbers (e.g. ``['#d65f5f', '#5fba7d']``).
- cmap : str, matplotlib.cm.ColorMap
- A string name of a matplotlib Colormap, or a Colormap object. Cannot be
- used together with ``color``.
-
- .. versionadded:: 1.4.0
- width : float, default 100
- The percentage of the cell, measured from the left, in which to draw the
- bars, in [0, 100].
- height : float, default 100
- The percentage height of the bar in the cell, centrally aligned, in [0,100].
-
- .. versionadded:: 1.4.0
- align : str, int, float, callable, default 'mid'
- How to align the bars within the cells relative to a width adjusted center.
- If string must be one of:
-
- - 'left' : bars are drawn rightwards from the minimum data value.
- - 'right' : bars are drawn leftwards from the maximum data value.
- - 'zero' : a value of zero is located at the center of the cell.
- - 'mid' : a value of (max-min)/2 is located at the center of the cell,
- or if all values are negative (positive) the zero is
- aligned at the right (left) of the cell.
- - 'mean' : the mean value of the data is located at the center of the cell.
-
- If a float or integer is given, this will indicate the center of the cell.
-
- If a callable is given, it should take a 1d or 2d array and return a scalar.
-
- .. versionchanged:: 1.4.0
-
- vmin : float, optional
- Minimum bar value, defining the left hand limit
- of the bar drawing range, lower values are clipped to `vmin`.
- When None (default): the minimum value of the data will be used.
- vmax : float, optional
- Maximum bar value, defining the right hand limit
- of the bar drawing range, higher values are clipped to `vmax`.
- When None (default): the maximum value of the data will be used.
- props : str, optional
- The base CSS of the cell that is extended to add the bar chart. Defaults to
- `"width: 10em;"`.
-
- .. versionadded:: 1.4.0
-
- Returns
- -------
- Styler
-
- Notes
- -----
- This section of the user guide:
- `Table Visualization <../../user_guide/style.ipynb>`_ gives
- a number of examples for different settings and color coordination.
- """
- if color is None and cmap is None:
- color = "#d65f5f"
- elif color is not None and cmap is not None:
- raise ValueError("`color` and `cmap` cannot both be given")
- elif color is not None:
- if (isinstance(color, (list, tuple)) and len(color) > 2) or not isinstance(
- color, (str, list, tuple)
- ):
- raise ValueError(
- "`color` must be string or list or tuple of 2 strings,"
- "(eg: color=['#d65f5f', '#5fba7d'])"
- )
-
- if not 0 <= width <= 100:
- raise ValueError(f"`width` must be a value in [0, 100], got {width}")
- if not 0 <= height <= 100:
- raise ValueError(f"`height` must be a value in [0, 100], got {height}")
-
- if subset is None:
- subset = self._get_numeric_subset_default()
-
- self.apply(
- _bar,
- subset=subset,
- axis=axis,
- align=align,
- colors=color,
- cmap=cmap,
- width=width / 100,
- height=height / 100,
- vmin=vmin,
- vmax=vmax,
- base_css=props,
- )
-
- return self
-
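# Minimal usage sketch, added for illustration: data bars with a
# negative/positive two-color scheme, per the bar() docstring above; values
# and colors are illustrative.
import pandas as pd

df = pd.DataFrame({"delta": [-3.0, -1.0, 2.0, 5.0]})
styled = df.style.bar(align="mid", color=["#d65f5f", "#5fba7d"], width=90, height=60)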
- @Substitution(
- subset=subset_args,
- props=properties_args,
- color=coloring_args.format(default="red"),
- )
- def highlight_null(
- self,
- color: str = "red",
- subset: Subset | None = None,
- props: str | None = None,
- ) -> Styler:
- """
- Highlight missing values with a style.
-
- Parameters
- ----------
- %(color)s
-
- .. versionadded:: 1.5.0
-
- %(subset)s
-
- .. versionadded:: 1.1.0
-
- %(props)s
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.highlight_max: Highlight the maximum with a style.
- Styler.highlight_min: Highlight the minimum with a style.
- Styler.highlight_between: Highlight a defined range with a style.
- Styler.highlight_quantile: Highlight values defined by a quantile with a style.
- """
-
- def f(data: DataFrame, props: str) -> np.ndarray:
- return np.where(pd.isna(data).to_numpy(), props, "")
-
- if props is None:
- props = f"background-color: {color};"
- return self.apply(f, axis=None, subset=subset, props=props)
-
- @Substitution(
- subset=subset_args,
- color=coloring_args.format(default="yellow"),
- props=properties_args,
- )
- def highlight_max(
- self,
- subset: Subset | None = None,
- color: str = "yellow",
- axis: Axis | None = 0,
- props: str | None = None,
- ) -> Styler:
- """
- Highlight the maximum with a style.
-
- Parameters
- ----------
- %(subset)s
- %(color)s
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Apply to each column (``axis=0`` or ``'index'``), to each row
- (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
- with ``axis=None``.
- %(props)s
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.highlight_null: Highlight missing values with a style.
- Styler.highlight_min: Highlight the minimum with a style.
- Styler.highlight_between: Highlight a defined range with a style.
- Styler.highlight_quantile: Highlight values defined by a quantile with a style.
- """
-
- if props is None:
- props = f"background-color: {color};"
- return self.apply(
- partial(_highlight_value, op="max"),
- axis=axis,
- subset=subset,
- props=props,
- )
-
- @Substitution(
- subset=subset_args,
- color=coloring_args.format(default="yellow"),
- props=properties_args,
- )
- def highlight_min(
- self,
- subset: Subset | None = None,
- color: str = "yellow",
- axis: Axis | None = 0,
- props: str | None = None,
- ) -> Styler:
- """
- Highlight the minimum with a style.
-
- Parameters
- ----------
- %(subset)s
- %(color)s
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Apply to each column (``axis=0`` or ``'index'``), to each row
- (``axis=1`` or ``'columns'``), or to the entire DataFrame at once
- with ``axis=None``.
- %(props)s
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.highlight_null: Highlight missing values with a style.
- Styler.highlight_max: Highlight the maximum with a style.
- Styler.highlight_between: Highlight a defined range with a style.
- Styler.highlight_quantile: Highlight values defined by a quantile with a style.
- """
-
- if props is None:
- props = f"background-color: {color};"
- return self.apply(
- partial(_highlight_value, op="min"),
- axis=axis,
- subset=subset,
- props=props,
- )
-
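# Minimal usage sketch, added for illustration: chaining the simple
# highlighters defined above; colors are illustrative.
import pandas as pd

df = pd.DataFrame({"A": [1.0, None, 3.0], "B": [4.0, 5.0, 6.0]})
styled = (
    df.style.highlight_null(color="lightgrey")
    .highlight_max(color="yellow", axis=0)
    .highlight_min(color="lightblue", axis=0)
)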
- @Substitution(
- subset=subset_args,
- color=coloring_args.format(default="yellow"),
- props=properties_args,
- )
- def highlight_between(
- self,
- subset: Subset | None = None,
- color: str = "yellow",
- axis: Axis | None = 0,
- left: Scalar | Sequence | None = None,
- right: Scalar | Sequence | None = None,
- inclusive: str = "both",
- props: str | None = None,
- ) -> Styler:
- """
- Highlight a defined range with a style.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- %(subset)s
- %(color)s
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- If ``left`` or ``right`` is given as a sequence, the axis along which to
- apply those boundaries. See examples.
- left : scalar or datetime-like, or sequence or array-like, default None
- Left bound for defining the range.
- right : scalar or datetime-like, or sequence or array-like, default None
- Right bound for defining the range.
- inclusive : {'both', 'neither', 'left', 'right'}
- Identify whether bounds are closed or open.
- %(props)s
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.highlight_null: Highlight missing values with a style.
- Styler.highlight_max: Highlight the maximum with a style.
- Styler.highlight_min: Highlight the minimum with a style.
- Styler.highlight_quantile: Highlight values defined by a quantile with a style.
-
- Notes
- -----
- If ``left`` is ``None`` only the right bound is applied.
- If ``right`` is ``None`` only the left bound is applied. If both are ``None``
- all values are highlighted.
-
- ``axis`` is only needed if ``left`` or ``right`` are provided as a sequence or
- an array-like object for aligning the shapes. If ``left`` and ``right`` are
- both scalars then all ``axis`` inputs will give the same result.
-
- This function only works with compatible ``dtypes``. For example a datetime-like
- region can only use equivalent datetime-like ``left`` and ``right`` arguments.
- Use ``subset`` to control regions which have multiple ``dtypes``.
-
- Examples
- --------
- Basic usage
-
- >>> df = pd.DataFrame({
- ... 'One': [1.2, 1.6, 1.5],
- ... 'Two': [2.9, 2.1, 2.5],
- ... 'Three': [3.1, 3.2, 3.8],
- ... })
- >>> df.style.highlight_between(left=2.1, right=2.9) # doctest: +SKIP
-
- .. figure:: ../../_static/style/hbetw_basic.png
-
- Using a range input sequence along an ``axis``, in this case setting a ``left``
- and ``right`` for each column individually
-
- >>> df.style.highlight_between(left=[1.4, 2.4, 3.4], right=[1.6, 2.6, 3.6],
- ... axis=1, color="#fffd75") # doctest: +SKIP
-
- .. figure:: ../../_static/style/hbetw_seq.png
-
- Using ``axis=None`` and providing the ``left`` argument as an array that
- matches the input DataFrame, with a constant ``right``
-
- >>> df.style.highlight_between(left=[[2,2,3],[2,2,3],[3,3,3]], right=3.5,
- ... axis=None, color="#fffd75") # doctest: +SKIP
-
- .. figure:: ../../_static/style/hbetw_axNone.png
-
- Using ``props`` instead of default background coloring
-
- >>> df.style.highlight_between(left=1.5, right=3.5,
- ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP
-
- .. figure:: ../../_static/style/hbetw_props.png
- """
- if props is None:
- props = f"background-color: {color};"
- return self.apply(
- _highlight_between,
- axis=axis,
- subset=subset,
- props=props,
- left=left,
- right=right,
- inclusive=inclusive,
- )
-
- @Substitution(
- subset=subset_args,
- color=coloring_args.format(default="yellow"),
- props=properties_args,
- )
- def highlight_quantile(
- self,
- subset: Subset | None = None,
- color: str = "yellow",
- axis: Axis | None = 0,
- q_left: float = 0.0,
- q_right: float = 1.0,
- interpolation: QuantileInterpolation = "linear",
- inclusive: str = "both",
- props: str | None = None,
- ) -> Styler:
- """
- Highlight values defined by a quantile with a style.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- %(subset)s
- %(color)s
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Axis along which to determine and highlight quantiles. If ``None`` quantiles
- are measured over the entire DataFrame. See examples.
- q_left : float, default 0
- Left bound, in [0, q_right), for the target quantile range.
- q_right : float, default 1
- Right bound, in (q_left, 1], for the target quantile range.
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- Argument passed to ``Series.quantile`` or ``DataFrame.quantile`` for
- quantile estimation.
- inclusive : {'both', 'neither', 'left', 'right'}
- Identify whether quantile bounds are closed or open.
- %(props)s
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.highlight_null: Highlight missing values with a style.
- Styler.highlight_max: Highlight the maximum with a style.
- Styler.highlight_min: Highlight the minimum with a style.
- Styler.highlight_between: Highlight a defined range with a style.
-
- Notes
- -----
- This function does not work with ``str`` dtypes.
-
- Examples
- --------
- Using ``axis=None`` and applying a quantile to all the data collectively
-
- >>> df = pd.DataFrame(np.arange(10).reshape(2,5) + 1)
- >>> df.style.highlight_quantile(axis=None, q_left=0.8, color="#fffd75")
- ... # doctest: +SKIP
-
- .. figure:: ../../_static/style/hq_axNone.png
-
- Or highlight quantiles row-wise or column-wise, in this case row-wise
-
- >>> df.style.highlight_quantile(axis=1, q_left=0.8, color="#fffd75")
- ... # doctest: +SKIP
-
- .. figure:: ../../_static/style/hq_ax1.png
-
- Use ``props`` instead of default background coloring
-
- >>> df.style.highlight_quantile(axis=None, q_left=0.2, q_right=0.8,
- ... props='font-weight:bold;color:#e83e8c') # doctest: +SKIP
-
- .. figure:: ../../_static/style/hq_props.png
- """
- subset_ = slice(None) if subset is None else subset
- subset_ = non_reducing_slice(subset_)
- data = self.data.loc[subset_]
-
- # after quantile is found along axis, e.g. along rows,
- # applying the calculated quantile to alternate axis, e.g. to each column
- quantiles = [q_left, q_right]
- if axis is None:
- q = Series(data.to_numpy().ravel()).quantile(
- q=quantiles, interpolation=interpolation
- )
- axis_apply: int | None = None
- else:
- axis = self.data._get_axis_number(axis)
- q = data.quantile(
- axis=axis, numeric_only=False, q=quantiles, interpolation=interpolation
- )
- axis_apply = 1 - axis
-
- if props is None:
- props = f"background-color: {color};"
- return self.apply(
- _highlight_between,
- axis=axis_apply,
- subset=subset,
- props=props,
- left=q.iloc[0],
- right=q.iloc[1],
- inclusive=inclusive,
- )
-
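# Minimal usage sketch, added for illustration: range- and quantile-based
# highlighting as documented above; thresholds and colors are illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(12).reshape(3, 4))
in_range = df.style.highlight_between(left=3, right=8, color="#fffd75")
top_fifth = df.style.highlight_quantile(axis=None, q_left=0.8, color="#fffd75")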
- @classmethod
- def from_custom_template(
- cls, searchpath, html_table: str | None = None, html_style: str | None = None
- ):
- """
- Factory function for creating a subclass of ``Styler``.
-
- Uses custom templates and Jinja environment.
-
- .. versionchanged:: 1.3.0
-
- Parameters
- ----------
- searchpath : str or list
- Path or paths of directories containing the templates.
- html_table : str
- Name of your custom template to replace the html_table template.
-
- .. versionadded:: 1.3.0
-
- html_style : str
- Name of your custom template to replace the html_style template.
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- MyStyler : subclass of Styler
- Has the correct ``env``, ``template_html``, ``template_html_table`` and
- ``template_html_style`` class attributes set.
- """
- loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader])
-
- # mypy doesn't like dynamically-defined classes
- # error: Variable "cls" is not valid as a type
- # error: Invalid base class "cls"
- class MyStyler(cls): # type: ignore[valid-type,misc]
- env = jinja2.Environment(loader=loader)
- if html_table:
- template_html_table = env.get_template(html_table)
- if html_style:
- template_html_style = env.get_template(html_style)
-
- return MyStyler
-
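# Sketch of from_custom_template(), added for illustration. It assumes a
# hypothetical directory "templates/" containing "my_table.tpl" that extends
# pandas' html_table template; both names are assumptions, not pandas fixtures.
import pandas as pd
from pandas.io.formats.style import Styler

CustomStyler = Styler.from_custom_template("templates", html_table="my_table.tpl")
html = CustomStyler(pd.DataFrame({"A": [1, 2]})).to_html()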
- def pipe(self, func: Callable, *args, **kwargs):
- """
- Apply ``func(self, *args, **kwargs)``, and return the result.
-
- Parameters
- ----------
- func : function
- Function to apply to the Styler. Alternatively, a
- ``(callable, keyword)`` tuple where ``keyword`` is a string
- indicating the keyword of ``callable`` that expects the Styler.
- *args : optional
- Arguments passed to `func`.
- **kwargs : optional
- A dictionary of keyword arguments passed into ``func``.
-
- Returns
- -------
- object :
- The value returned by ``func``.
-
- See Also
- --------
- DataFrame.pipe : Analogous method for DataFrame.
- Styler.apply : Apply a CSS-styling function column-wise, row-wise, or
- table-wise.
-
- Notes
- -----
- Like :meth:`DataFrame.pipe`, this method can simplify the
- application of several user-defined functions to a styler. Instead
- of writing:
-
- .. code-block:: python
-
- f(g(df.style.format(precision=3), arg1=a), arg2=b, arg3=c)
-
- users can write:
-
- .. code-block:: python
-
- (df.style.format(precision=3)
- .pipe(g, arg1=a)
- .pipe(f, arg2=b, arg3=c))
-
- In particular, this allows users to define functions that take a
- styler object, along with other parameters, and return the styler after
- making styling changes (such as calling :meth:`Styler.apply` or
- :meth:`Styler.set_properties`).
-
- Examples
- --------
-
- **Common Use**
-
- A common usage pattern is to pre-define styling operations which
- can be easily applied to a generic styler in a single ``pipe`` call.
-
- >>> def some_highlights(styler, min_color="red", max_color="blue"):
- ... styler.highlight_min(color=min_color, axis=None)
- ... styler.highlight_max(color=max_color, axis=None)
- ... styler.highlight_null()
- ... return styler
- >>> df = pd.DataFrame([[1, 2, 3, pd.NA], [pd.NA, 4, 5, 6]], dtype="Int64")
- >>> df.style.pipe(some_highlights, min_color="green") # doctest: +SKIP
-
- .. figure:: ../../_static/style/df_pipe_hl.png
-
- Since the method returns a ``Styler`` object it can be chained with other
- methods as if applying the underlying highlighters directly.
-
- >>> (df.style.format("{:.1f}")
- ... .pipe(some_highlights, min_color="green")
- ... .highlight_between(left=2, right=5)) # doctest: +SKIP
-
- .. figure:: ../../_static/style/df_pipe_hl2.png
-
- **Advanced Use**
-
- Sometimes it may be necessary to pre-define styling functions in cases
- where those functions rely on the styler, its data or its context. Since
- ``Styler.use`` and ``Styler.export`` are designed to be non-data dependent,
- they cannot be used for this purpose. Additionally, the ``Styler.apply``
- and ``Styler.format`` type methods are not context aware, so a solution
- is to use ``pipe`` to dynamically wrap this functionality.
-
- Suppose we want to code a generic styling function that highlights the final
- level of a MultiIndex. The number of levels in the Index is dynamic so we
- need the ``Styler`` context to define the level.
-
- >>> def highlight_last_level(styler):
- ... return styler.apply_index(
- ... lambda v: "background-color: pink; color: yellow", axis="columns",
- ... level=styler.columns.nlevels-1
- ... ) # doctest: +SKIP
- >>> df.columns = pd.MultiIndex.from_product([["A", "B"], ["X", "Y"]])
- >>> df.style.pipe(highlight_last_level) # doctest: +SKIP
-
- .. figure:: ../../_static/style/df_pipe_applymap.png
-
- Additionally suppose we want to highlight a column header if there is any
- missing data in that column.
- In this case we need the data object itself to determine the effect on the
- column headers.
-
- >>> def highlight_header_missing(styler, level):
- ... def dynamic_highlight(s):
- ... return np.where(
- ... styler.data.isna().any(), "background-color: red;", ""
- ... )
- ... return styler.apply_index(dynamic_highlight, axis=1, level=level)
- >>> df.style.pipe(highlight_header_missing, level=1) # doctest: +SKIP
-
- .. figure:: ../../_static/style/df_pipe_applydata.png
- """
- return com.pipe(self, func, *args, **kwargs)
-
-
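# Minimal usage sketch, added for illustration: a reusable styling function
# applied through pipe(), per the docstring above. "zebra" and its stripe
# color are hypothetical helpers, not pandas API.
import pandas as pd

def zebra(styler, stripe="#f2f2f2"):
    # shade every other row via table styles
    return styler.set_table_styles(
        [{"selector": "tr:nth-child(even)", "props": f"background-color: {stripe};"}],
        overwrite=False,
    )

df = pd.DataFrame({"A": [1, 2, 3]})
html = df.style.pipe(zebra, stripe="#e8e8e8").to_html()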
-def _validate_apply_axis_arg(
- arg: NDFrame | Sequence | np.ndarray,
- arg_name: str,
- dtype: Any | None,
- data: NDFrame,
-) -> np.ndarray:
- """
- For the apply-type methods, ``axis=None`` creates ``data`` as a DataFrame, and
- for ``axis in [0, 1]`` it creates a Series. Where ``arg`` is expected to be an
- operand of some operation with ``data``, we must make sure that the two have
- compatible shapes, or raise.
-
- Parameters
- ----------
- arg : sequence, Series or DataFrame
- The user input argument.
- arg_name : string
- Name of the argument, for use in error messages.
- dtype : numpy dtype, optional
- Forced numpy dtype, if given.
- data : Series or DataFrame
- Underlying subset of Styler data on which operations are performed.
-
- Returns
- -------
- ndarray
- """
- dtype = {"dtype": dtype} if dtype else {}
- # raise if input is wrong for axis:
- if isinstance(arg, Series) and isinstance(data, DataFrame):
- raise ValueError(
- f"'{arg_name}' is a Series but underlying data for operations "
- f"is a DataFrame since 'axis=None'"
- )
- if isinstance(arg, DataFrame) and isinstance(data, Series):
- raise ValueError(
- f"'{arg_name}' is a DataFrame but underlying data for "
- f"operations is a Series with 'axis in [0,1]'"
- )
- if isinstance(arg, (Series, DataFrame)): # align index / cols to data
- arg = arg.reindex_like(data, method=None).to_numpy(**dtype)
- else:
- arg = np.asarray(arg, **dtype)
- assert isinstance(arg, np.ndarray) # mypy requirement
- if arg.shape != data.shape: # check valid input
- raise ValueError(
- f"supplied '{arg_name}' is not correct shape for data over "
- f"selected 'axis': got {arg.shape}, "
- f"expected {data.shape}"
- )
- return arg
-
-
-def _background_gradient(
- data,
- cmap: str | Colormap = "PuBu",
- low: float = 0,
- high: float = 0,
- text_color_threshold: float = 0.408,
- vmin: float | None = None,
- vmax: float | None = None,
- gmap: Sequence | np.ndarray | DataFrame | Series | None = None,
- text_only: bool = False,
-):
- """
- Color background in a range according to the data or a gradient map
- """
- if gmap is None: # the data is used as the gmap
- gmap = data.to_numpy(dtype=float, na_value=np.nan)
- else: # otherwise validate gmap against the underlying data
- gmap = _validate_apply_axis_arg(gmap, "gmap", float, data)
-
- with _mpl(Styler.background_gradient) as (_, _matplotlib):
- smin = np.nanmin(gmap) if vmin is None else vmin
- smax = np.nanmax(gmap) if vmax is None else vmax
- rng = smax - smin
- # extend lower / upper bounds, compresses color range
- norm = _matplotlib.colors.Normalize(smin - (rng * low), smax + (rng * high))
-
- if cmap is None:
- rgbas = _matplotlib.colormaps[_matplotlib.rcParams["image.cmap"]](
- norm(gmap)
- )
- else:
- rgbas = _matplotlib.colormaps.get_cmap(cmap)(norm(gmap))
-
- def relative_luminance(rgba) -> float:
- """
- Calculate relative luminance of a color.
-
- The calculation adheres to the W3C standards
- (https://www.w3.org/WAI/GL/wiki/Relative_luminance)
-
- Parameters
- ----------
- rgba : rgb or rgba tuple
-
- Returns
- -------
- float
- The relative luminance as a value from 0 to 1
- """
- r, g, b = (
- x / 12.92 if x <= 0.04045 else ((x + 0.055) / 1.055) ** 2.4
- for x in rgba[:3]
- )
- return 0.2126 * r + 0.7152 * g + 0.0722 * b
-
- def css(rgba, text_only) -> str:
- if not text_only:
- dark = relative_luminance(rgba) < text_color_threshold
- text_color = "#f1f1f1" if dark else "#000000"
- return (
- f"background-color: {_matplotlib.colors.rgb2hex(rgba)};"
- + f"color: {text_color};"
- )
- else:
- return f"color: {_matplotlib.colors.rgb2hex(rgba)};"
-
- if data.ndim == 1:
- return [css(rgba, text_only) for rgba in rgbas]
- else:
- return DataFrame(
- [[css(rgba, text_only) for rgba in row] for row in rgbas],
- index=data.index,
- columns=data.columns,
- )
-
-
-def _highlight_between(
- data: NDFrame,
- props: str,
- left: Scalar | Sequence | np.ndarray | NDFrame | None = None,
- right: Scalar | Sequence | np.ndarray | NDFrame | None = None,
- inclusive: bool | str = True,
-) -> np.ndarray:
- """
- Return an array of css props based on condition of data values within given range.
- """
- if np.iterable(left) and not isinstance(left, str):
- left = _validate_apply_axis_arg(left, "left", None, data)
-
- if np.iterable(right) and not isinstance(right, str):
- right = _validate_apply_axis_arg(right, "right", None, data)
-
- # get ops with correct boundary attribution
- if inclusive == "both":
- ops = (operator.ge, operator.le)
- elif inclusive == "neither":
- ops = (operator.gt, operator.lt)
- elif inclusive == "left":
- ops = (operator.ge, operator.lt)
- elif inclusive == "right":
- ops = (operator.gt, operator.le)
- else:
- raise ValueError(
- f"'inclusive' values can be 'both', 'left', 'right', or 'neither' "
- f"got {inclusive}"
- )
-
- g_left = (
- # error: Argument 2 to "ge" has incompatible type "Union[str, float,
- # Period, Timedelta, Interval[Any], datetime64, timedelta64, datetime,
- # Sequence[Any], ndarray[Any, Any], NDFrame]"; expected "Union
- # [SupportsDunderLE, SupportsDunderGE, SupportsDunderGT, SupportsDunderLT]"
- ops[0](data, left) # type: ignore[arg-type]
- if left is not None
- else np.full(data.shape, True, dtype=bool)
- )
- if isinstance(g_left, (DataFrame, Series)):
- g_left = g_left.where(pd.notna(g_left), False)
- l_right = (
- # error: Argument 2 to "le" has incompatible type "Union[str, float,
- # Period, Timedelta, Interval[Any], datetime64, timedelta64, datetime,
- # Sequence[Any], ndarray[Any, Any], NDFrame]"; expected "Union
- # [SupportsDunderLE, SupportsDunderGE, SupportsDunderGT, SupportsDunderLT]"
- ops[1](data, right) # type: ignore[arg-type]
- if right is not None
- else np.full(data.shape, True, dtype=bool)
- )
- if isinstance(l_right, (DataFrame, Series)):
- l_right = l_right.where(pd.notna(l_right), False)
- return np.where(g_left & l_right, props, "")
-
-
-def _highlight_value(data: DataFrame | Series, op: str, props: str) -> np.ndarray:
- """
- Return an array of css strings based on the condition of values matching an op.
- """
- value = getattr(data, op)(skipna=True)
- if isinstance(data, DataFrame): # min/max must be done twice to return scalar
- value = getattr(value, op)(skipna=True)
- cond = data == value
- cond = cond.where(pd.notna(cond), False)
- return np.where(cond, props, "")
-
-
-def _bar(
- data: NDFrame,
- align: str | float | Callable,
- colors: str | list | tuple,
- cmap: Any,
- width: float,
- height: float,
- vmin: float | None,
- vmax: float | None,
- base_css: str,
-):
- """
- Draw bar chart in data cells using HTML CSS linear gradient.
-
- Parameters
- ----------
- data : Series or DataFrame
- Underlying subset of Styler data on which operations are performed.
- align : str in {"left", "right", "mid", "zero", "mean"}, int, float, callable
- Method for how bars are structured or scalar value of centre point.
- colors : list-like of str
- Two listed colors as string in valid CSS.
- width : float in [0,1]
- The percentage of the cell, measured from left, where drawn bars will reside.
- height : float in [0,1]
- The percentage of the cell's height where drawn bars will reside, centrally
- aligned.
- vmin : float, optional
- Overwrite the minimum value of the window.
- vmax : float, optional
- Overwrite the maximum value of the window.
- base_css : str
- Additional CSS that is included in the cell before bars are drawn.
- """
-
- def css_bar(start: float, end: float, color: str) -> str:
- """
- Generate CSS code to draw a bar from start to end in a table cell.
-
- Uses linear-gradient.
-
- Parameters
- ----------
- start : float
- Relative positional start of bar coloring in [0,1]
- end : float
- Relative positional end of the bar coloring in [0,1]
- color : str
- CSS valid color to apply.
-
- Returns
- -------
- str : The CSS applicable to the cell.
-
- Notes
- -----
- Uses ``base_css`` from outer scope.
- """
- cell_css = base_css
- if end > start:
- cell_css += "background: linear-gradient(90deg,"
- if start > 0:
- cell_css += f" transparent {start*100:.1f}%, {color} {start*100:.1f}%,"
- cell_css += f" {color} {end*100:.1f}%, transparent {end*100:.1f}%)"
- return cell_css
-
- def css_calc(x, left: float, right: float, align: str, color: str | list | tuple):
- """
- Return the correct CSS for bar placement based on calculated values.
-
- Parameters
- ----------
- x : float
- Value which determines the bar placement.
- left : float
- Value marking the left side of calculation, usually minimum of data.
- right : float
- Value marking the right side of the calculation, usually maximum of data
- (left < right).
- align : {"left", "right", "zero", "mid"}
- How the bars will be positioned.
- "left", "right", "zero" can be used with any values for ``left``, ``right``.
- "mid" can only be used where ``left <= 0`` and ``right >= 0``.
- "zero" is used to specify a center when all values ``x``, ``left``,
- ``right`` are translated, e.g. by say a mean or median.
-
- Returns
- -------
- str : Resultant CSS with linear gradient.
-
- Notes
- -----
- Uses ``colors``, ``width`` and ``height`` from outer scope.
- """
- if pd.isna(x):
- return base_css
-
- if isinstance(color, (list, tuple)):
- color = color[0] if x < 0 else color[1]
- assert isinstance(color, str) # mypy redefinition
-
- x = left if x < left else x
- x = right if x > right else x # trim data if outside of the window
-
- start: float = 0
- end: float = 1
-
- if align == "left":
- # all proportions are measured from the left side between left and right
- end = (x - left) / (right - left)
-
- elif align == "right":
- # all proportions are measured from the right side between left and right
- start = (x - left) / (right - left)
-
- else:
- z_frac: float = 0.5 # location of zero based on the left-right range
- if align == "zero":
- # all proportions are measured from the center at zero
- limit: float = max(abs(left), abs(right))
- left, right = -limit, limit
- elif align == "mid":
- # bars drawn from zero either leftwards or rightwards with center at mid
- mid: float = (left + right) / 2
- z_frac = (
- -mid / (right - left) + 0.5 if mid < 0 else -left / (right - left)
- )
-
- if x < 0:
- start, end = (x - left) / (right - left), z_frac
- else:
- start, end = z_frac, (x - left) / (right - left)
-
- ret = css_bar(start * width, end * width, color)
- if height < 1 and "background: linear-gradient(" in ret:
- return (
- ret + f" no-repeat center; background-size: 100% {height * 100:.1f}%;"
- )
- else:
- return ret
-
- values = data.to_numpy()
- left = np.nanmin(values) if vmin is None else vmin
- right = np.nanmax(values) if vmax is None else vmax
- z: float = 0 # adjustment to translate data
-
- if align == "mid":
- if left >= 0: # "mid" is documented to act as "left" if all values positive
- align, left = "left", 0 if vmin is None else vmin
- elif right <= 0: # "mid" is documented to act as "right" if all values negative
- align, right = "right", 0 if vmax is None else vmax
- elif align == "mean":
- z, align = np.nanmean(values), "zero"
- elif callable(align):
- z, align = align(values), "zero"
- elif isinstance(align, (float, int)):
- z, align = float(align), "zero"
- elif align not in ("left", "right", "zero"):
- raise ValueError(
- "`align` should be in {'left', 'right', 'mid', 'mean', 'zero'} or be a "
- "value defining the center line or a callable that returns a float"
- )
-
- rgbas = None
- if cmap is not None:
- # use the matplotlib colormap input
- with _mpl(Styler.bar) as (_, _matplotlib):
- cmap = (
- _matplotlib.colormaps[cmap]
- if isinstance(cmap, str)
- else cmap # assumed to be a Colormap instance as documented
- )
- norm = _matplotlib.colors.Normalize(left, right)
- rgbas = cmap(norm(values))
- if data.ndim == 1:
- rgbas = [_matplotlib.colors.rgb2hex(rgba) for rgba in rgbas]
- else:
- rgbas = [
- [_matplotlib.colors.rgb2hex(rgba) for rgba in row] for row in rgbas
- ]
-
- assert isinstance(align, str) # mypy: should now be in [left, right, mid, zero]
- if data.ndim == 1:
- return [
- css_calc(
- x - z, left - z, right - z, align, colors if rgbas is None else rgbas[i]
- )
- for i, x in enumerate(values)
- ]
- else:
- return np.array(
- [
- [
- css_calc(
- x - z,
- left - z,
- right - z,
- align,
- colors if rgbas is None else rgbas[i][j],
- )
- for j, x in enumerate(row)
- ]
- for i, row in enumerate(values)
- ]
- )
diff --git a/contrib/python/pandas/py3/pandas/io/formats/style_render.py b/contrib/python/pandas/py3/pandas/io/formats/style_render.py
deleted file mode 100644
index 3482e29fb1a..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/style_render.py
+++ /dev/null
@@ -1,2342 +0,0 @@
-from __future__ import annotations
-
-from collections import defaultdict
-from functools import partial
-import re
-from typing import (
- Any,
- Callable,
- DefaultDict,
- Dict,
- List,
- Optional,
- Sequence,
- Tuple,
- TypedDict,
- Union,
-)
-from uuid import uuid4
-
-import numpy as np
-
-from pandas._config import get_option
-
-from pandas._libs import lib
-from pandas._typing import (
- Axis,
- Level,
-)
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.dtypes.common import (
- is_complex,
- is_float,
- is_integer,
-)
-from pandas.core.dtypes.generic import ABCSeries
-
-from pandas import (
- DataFrame,
- Index,
- IndexSlice,
- MultiIndex,
- Series,
- isna,
-)
-from pandas.api.types import is_list_like
-import pandas.core.common as com
-
-jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.")
-from markupsafe import escape as escape_html # markupsafe is jinja2 dependency
-
-BaseFormatter = Union[str, Callable]
-ExtFormatter = Union[BaseFormatter, Dict[Any, Optional[BaseFormatter]]]
-CSSPair = Tuple[str, Union[str, float]]
-CSSList = List[CSSPair]
-CSSProperties = Union[str, CSSList]
-
-
-class CSSDict(TypedDict):
- selector: str
- props: CSSProperties
-
-
-CSSStyles = List[CSSDict]
-Subset = Union[slice, Sequence, Index]
-
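# Editorial illustration of the CSSDict / CSSStyles shapes defined above,
# i.e. the structure that Styler.set_table_styles consumes. Plain data,
# no pandas call required; values are illustrative.
table_styles = [
    {"selector": "th", "props": [("font-weight", "bold"), ("color", "#333333")]},
    {"selector": "caption", "props": "caption-side: bottom; font-size: 0.9em;"},
]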
-
-class StylerRenderer:
- """
- Base class to process rendering a Styler with a specified jinja2 template.
- """
-
- loader = jinja2.PackageLoader("pandas", "io/formats/templates")
- env = jinja2.Environment(loader=loader, trim_blocks=True)
- template_html = env.get_template("html.tpl")
- template_html_table = env.get_template("html_table.tpl")
- template_html_style = env.get_template("html_style.tpl")
- template_latex = env.get_template("latex.tpl")
- template_string = env.get_template("string.tpl")
-
- def __init__(
- self,
- data: DataFrame | Series,
- uuid: str | None = None,
- uuid_len: int = 5,
- table_styles: CSSStyles | None = None,
- table_attributes: str | None = None,
- caption: str | tuple | list | None = None,
- cell_ids: bool = True,
- precision: int | None = None,
- ) -> None:
- # validate ordered args
- if isinstance(data, Series):
- data = data.to_frame()
- if not isinstance(data, DataFrame):
- raise TypeError("``data`` must be a Series or DataFrame")
- self.data: DataFrame = data
- self.index: Index = data.index
- self.columns: Index = data.columns
- if not isinstance(uuid_len, int) or uuid_len < 0:
- raise TypeError("``uuid_len`` must be an integer in range [0, 32].")
- self.uuid = uuid or uuid4().hex[: min(32, uuid_len)]
- self.uuid_len = len(self.uuid)
- self.table_styles = table_styles
- self.table_attributes = table_attributes
- self.caption = caption
- self.cell_ids = cell_ids
- self.css = {
- "row_heading": "row_heading",
- "col_heading": "col_heading",
- "index_name": "index_name",
- "col": "col",
- "row": "row",
- "col_trim": "col_trim",
- "row_trim": "row_trim",
- "level": "level",
- "data": "data",
- "blank": "blank",
- "foot": "foot",
- }
- self.concatenated: list[StylerRenderer] = []
- # add rendering variables
- self.hide_index_names: bool = False
- self.hide_column_names: bool = False
- self.hide_index_: list = [False] * self.index.nlevels
- self.hide_columns_: list = [False] * self.columns.nlevels
- self.hidden_rows: Sequence[int] = [] # sequence for specific hidden rows/cols
- self.hidden_columns: Sequence[int] = []
- self.ctx: DefaultDict[tuple[int, int], CSSList] = defaultdict(list)
- self.ctx_index: DefaultDict[tuple[int, int], CSSList] = defaultdict(list)
- self.ctx_columns: DefaultDict[tuple[int, int], CSSList] = defaultdict(list)
- self.cell_context: DefaultDict[tuple[int, int], str] = defaultdict(str)
- self._todo: list[tuple[Callable, tuple, dict]] = []
- self.tooltips: Tooltips | None = None
- precision = (
- get_option("styler.format.precision") if precision is None else precision
- )
- self._display_funcs: DefaultDict[ # maps (row, col) -> format func
- tuple[int, int], Callable[[Any], str]
- ] = defaultdict(lambda: partial(_default_formatter, precision=precision))
- self._display_funcs_index: DefaultDict[ # maps (row, level) -> format func
- tuple[int, int], Callable[[Any], str]
- ] = defaultdict(lambda: partial(_default_formatter, precision=precision))
- self._display_funcs_columns: DefaultDict[ # maps (level, col) -> format func
- tuple[int, int], Callable[[Any], str]
- ] = defaultdict(lambda: partial(_default_formatter, precision=precision))
-
- def _render(
- self,
- sparse_index: bool,
- sparse_columns: bool,
- max_rows: int | None = None,
- max_cols: int | None = None,
- blank: str = "",
- ):
- """
- Computes and applies styles and then generates the general render dicts.
-
- Also extends the `ctx` and `ctx_index` attributes with those of concatenated
- stylers for use within `_translate_latex`
- """
- self._compute()
- dxs = []
- ctx_len = len(self.index)
- for i, concatenated in enumerate(self.concatenated):
- concatenated.hide_index_ = self.hide_index_
- concatenated.hidden_columns = self.hidden_columns
- foot = f"{self.css['foot']}{i}"
- concatenated.css = {
- **self.css,
- "data": f"{foot}_data",
- "row_heading": f"{foot}_row_heading",
- "row": f"{foot}_row",
- "foot": f"{foot}_foot",
- }
- dx = concatenated._render(
- sparse_index, sparse_columns, max_rows, max_cols, blank
- )
- dxs.append(dx)
-
- for (r, c), v in concatenated.ctx.items():
- self.ctx[(r + ctx_len, c)] = v
- for (r, c), v in concatenated.ctx_index.items():
- self.ctx_index[(r + ctx_len, c)] = v
-
- ctx_len += len(concatenated.index)
-
- d = self._translate(
- sparse_index, sparse_columns, max_rows, max_cols, blank, dxs
- )
- return d
-
- def _render_html(
- self,
- sparse_index: bool,
- sparse_columns: bool,
- max_rows: int | None = None,
- max_cols: int | None = None,
- **kwargs,
- ) -> str:
- """
- Renders the ``Styler`` including all applied styles to HTML.
- Generates a dict with necessary kwargs passed to jinja2 template.
- """
- d = self._render(sparse_index, sparse_columns, max_rows, max_cols, "&nbsp;")
- d.update(kwargs)
- return self.template_html.render(
- **d,
- html_table_tpl=self.template_html_table,
- html_style_tpl=self.template_html_style,
- )
-
- def _render_latex(
- self, sparse_index: bool, sparse_columns: bool, clines: str | None, **kwargs
- ) -> str:
- """
- Render a Styler in latex format
- """
- d = self._render(sparse_index, sparse_columns, None, None)
- self._translate_latex(d, clines=clines)
- self.template_latex.globals["parse_wrap"] = _parse_latex_table_wrapping
- self.template_latex.globals["parse_table"] = _parse_latex_table_styles
- self.template_latex.globals["parse_cell"] = _parse_latex_cell_styles
- self.template_latex.globals["parse_header"] = _parse_latex_header_span
- d.update(kwargs)
- return self.template_latex.render(**d)
-
- def _render_string(
- self,
- sparse_index: bool,
- sparse_columns: bool,
- max_rows: int | None = None,
- max_cols: int | None = None,
- **kwargs,
- ) -> str:
- """
- Render a Styler in string format
- """
- d = self._render(sparse_index, sparse_columns, max_rows, max_cols)
- d.update(kwargs)
- return self.template_string.render(**d)
-
- def _compute(self):
- """
- Execute the style functions built up in `self._todo`.
-
- Relies on the convention that all style functions go through
- .apply or .applymap. These append the styles to apply as tuples of
-
- (application method, *args, **kwargs)
- """
- self.ctx.clear()
- self.ctx_index.clear()
- self.ctx_columns.clear()
- r = self
- for func, args, kwargs in self._todo:
- r = func(self)(*args, **kwargs)
- return r
-
- def _translate(
- self,
- sparse_index: bool,
- sparse_cols: bool,
- max_rows: int | None = None,
- max_cols: int | None = None,
- blank: str = "&nbsp;",
- dxs: list[dict] | None = None,
- ):
- """
- Process Styler data and settings into a dict for template rendering.
-
- Convert data and settings from ``Styler`` attributes such as ``self.data``,
- ``self.tooltips`` including applying any methods in ``self._todo``.
-
- Parameters
- ----------
- sparse_index : bool
- Whether to sparsify the index or print all hierarchical index elements.
- The upstream default is typically `pandas.options.styler.sparse.index`.
- sparse_cols : bool
- Whether to sparsify the columns or print all hierarchical column elements.
- The upstream default is typically `pandas.options.styler.sparse.columns`.
- max_rows, max_cols : int, optional
- Specific max rows and cols. max_elements always takes precedence in render.
- blank : str
- Entry to top-left blank cells.
- dxs : list[dict]
- The render dicts of the concatenated Stylers.
-
- Returns
- -------
- d : dict
- The following structure: {uuid, table_styles, caption, head, body,
- cellstyle, table_attributes}
- """
- if dxs is None:
- dxs = []
- self.css["blank_value"] = blank
-
- # construct render dict
- d = {
- "uuid": self.uuid,
- "table_styles": format_table_styles(self.table_styles or []),
- "caption": self.caption,
- }
-
- max_elements = get_option("styler.render.max_elements")
- max_rows = max_rows if max_rows else get_option("styler.render.max_rows")
- max_cols = max_cols if max_cols else get_option("styler.render.max_columns")
- max_rows, max_cols = _get_trimming_maximums(
- len(self.data.index),
- len(self.data.columns),
- max_elements,
- max_rows,
- max_cols,
- )
-
- self.cellstyle_map_columns: DefaultDict[
- tuple[CSSPair, ...], list[str]
- ] = defaultdict(list)
- head = self._translate_header(sparse_cols, max_cols)
- d.update({"head": head})
-
- # for sparsifying a MultiIndex and for use with latex clines
- idx_lengths = _get_level_lengths(
- self.index, sparse_index, max_rows, self.hidden_rows
- )
- d.update({"index_lengths": idx_lengths})
-
- self.cellstyle_map: DefaultDict[tuple[CSSPair, ...], list[str]] = defaultdict(
- list
- )
- self.cellstyle_map_index: DefaultDict[
- tuple[CSSPair, ...], list[str]
- ] = defaultdict(list)
- body: list = self._translate_body(idx_lengths, max_rows, max_cols)
- d.update({"body": body})
-
- ctx_maps = {
- "cellstyle": "cellstyle_map",
- "cellstyle_index": "cellstyle_map_index",
- "cellstyle_columns": "cellstyle_map_columns",
- } # add the cell_ids styles map to the render dictionary in the right format
- for k, attr in ctx_maps.items():
- map = [
- {"props": list(props), "selectors": selectors}
- for props, selectors in getattr(self, attr).items()
- ]
- d.update({k: map})
-
- for dx in dxs: # self.concatenated is not empty
- d["body"].extend(dx["body"]) # type: ignore[union-attr]
- d["cellstyle"].extend(dx["cellstyle"]) # type: ignore[union-attr]
- d["cellstyle_index"].extend( # type: ignore[union-attr]
- dx["cellstyle_index"]
- )
-
- table_attr = self.table_attributes
- if not get_option("styler.html.mathjax"):
- table_attr = table_attr or ""
- if 'class="' in table_attr:
- table_attr = table_attr.replace('class="', 'class="tex2jax_ignore ')
- else:
- table_attr += ' class="tex2jax_ignore"'
- d.update({"table_attributes": table_attr})
-
- if self.tooltips:
- d = self.tooltips._translate(self, d)
-
- return d
-
- def _translate_header(self, sparsify_cols: bool, max_cols: int):
- """
- Build each <tr> within table <head> as a list
-
- Using the structure:
- +----------------------------+---------------+---------------------------+
- | index_blanks ... | column_name_0 | column_headers (level_0) |
- 1) | .. | .. | .. |
- | index_blanks ... | column_name_n | column_headers (level_n) |
- +----------------------------+---------------+---------------------------+
- 2) | index_names (level_0 to level_n) ... | column_blanks ... |
- +----------------------------+---------------+---------------------------+
-
- Parameters
- ----------
- sparsify_cols : bool
- Whether column_headers section will add colspan attributes (>1) to elements.
- max_cols : int
- Maximum number of columns to render. If exceeded will contain `...` filler.
-
- Returns
- -------
- head : list
- The associated HTML elements needed for template rendering.
- """
- # for sparsifying a MultiIndex
- col_lengths = _get_level_lengths(
- self.columns, sparsify_cols, max_cols, self.hidden_columns
- )
-
- clabels = self.data.columns.tolist()
- if self.data.columns.nlevels == 1:
- clabels = [[x] for x in clabels]
- clabels = list(zip(*clabels))
-
- head = []
- # 1) column headers
- for r, hide in enumerate(self.hide_columns_):
- if hide or not clabels:
- continue
-
- header_row = self._generate_col_header_row(
- (r, clabels), max_cols, col_lengths
- )
- head.append(header_row)
-
- # 2) index names
- if (
- self.data.index.names
- and com.any_not_none(*self.data.index.names)
- and not all(self.hide_index_)
- and not self.hide_index_names
- ):
- index_names_row = self._generate_index_names_row(
- clabels, max_cols, col_lengths
- )
- head.append(index_names_row)
-
- return head
-
- def _generate_col_header_row(self, iter: tuple, max_cols: int, col_lengths: dict):
- """
- Generate the row containing column headers:
-
- +----------------------------+---------------+---------------------------+
- | index_blanks ... | column_name_i | column_headers (level_i) |
- +----------------------------+---------------+---------------------------+
-
- Parameters
- ----------
- iter : tuple
- Looping variables from outer scope
- max_cols : int
- Permissible number of columns
- col_lengths : dict
- Mapping of (level, column) positions to the colspan of each visible column
- header element.
-
- Returns
- -------
- list of elements
- """
-
- r, clabels = iter
-
- # number of index blanks is governed by number of hidden index levels
- index_blanks = [
- _element("th", self.css["blank"], self.css["blank_value"], True)
- ] * (self.index.nlevels - sum(self.hide_index_) - 1)
-
- name = self.data.columns.names[r]
- column_name = [
- _element(
- "th",
- (
- f"{self.css['blank']} {self.css['level']}{r}"
- if name is None
- else f"{self.css['index_name']} {self.css['level']}{r}"
- ),
- name
- if (name is not None and not self.hide_column_names)
- else self.css["blank_value"],
- not all(self.hide_index_),
- )
- ]
-
- column_headers: list = []
- visible_col_count: int = 0
- for c, value in enumerate(clabels[r]):
- header_element_visible = _is_visible(c, r, col_lengths)
- if header_element_visible:
- visible_col_count += col_lengths.get((r, c), 0)
- if self._check_trim(
- visible_col_count,
- max_cols,
- column_headers,
- "th",
- f"{self.css['col_heading']} {self.css['level']}{r} "
- f"{self.css['col_trim']}",
- ):
- break
-
- header_element = _element(
- "th",
- (
- f"{self.css['col_heading']} {self.css['level']}{r} "
- f"{self.css['col']}{c}"
- ),
- value,
- header_element_visible,
- display_value=self._display_funcs_columns[(r, c)](value),
- attributes=(
- f'colspan="{col_lengths.get((r, c), 0)}"'
- if col_lengths.get((r, c), 0) > 1
- else ""
- ),
- )
-
- if self.cell_ids:
- header_element["id"] = f"{self.css['level']}{r}_{self.css['col']}{c}"
- if (
- header_element_visible
- and (r, c) in self.ctx_columns
- and self.ctx_columns[r, c]
- ):
- header_element["id"] = f"{self.css['level']}{r}_{self.css['col']}{c}"
- self.cellstyle_map_columns[tuple(self.ctx_columns[r, c])].append(
- f"{self.css['level']}{r}_{self.css['col']}{c}"
- )
-
- column_headers.append(header_element)
-
- return index_blanks + column_name + column_headers
-
- def _generate_index_names_row(self, iter: tuple, max_cols: int, col_lengths: dict):
- """
- Generate the row containing index names
-
- +----------------------------+---------------+---------------------------+
- | index_names (level_0 to level_n) ... | column_blanks ... |
- +----------------------------+---------------+---------------------------+
-
- Parameters
- ----------
- iter : tuple
- Looping variables from outer scope
-        max_cols : int
-            Permissible number of columns
-        col_lengths : dict
-            A map of the sparsification structure of the columns.
-
- Returns
- -------
- list of elements
- """
-
- clabels = iter
-
- index_names = [
- _element(
- "th",
- f"{self.css['index_name']} {self.css['level']}{c}",
- self.css["blank_value"] if name is None else name,
- not self.hide_index_[c],
- )
- for c, name in enumerate(self.data.index.names)
- ]
-
- column_blanks: list = []
- visible_col_count: int = 0
- if clabels:
-            last_level = self.columns.nlevels - 1 # use last level since never sparsified
- for c, value in enumerate(clabels[last_level]):
- header_element_visible = _is_visible(c, last_level, col_lengths)
- if header_element_visible:
- visible_col_count += 1
- if self._check_trim(
- visible_col_count,
- max_cols,
- column_blanks,
- "th",
- f"{self.css['blank']} {self.css['col']}{c} {self.css['col_trim']}",
- self.css["blank_value"],
- ):
- break
-
- column_blanks.append(
- _element(
- "th",
- f"{self.css['blank']} {self.css['col']}{c}",
- self.css["blank_value"],
- c not in self.hidden_columns,
- )
- )
-
- return index_names + column_blanks
-
- def _translate_body(self, idx_lengths: dict, max_rows: int, max_cols: int):
- """
- Build each <tr> within table <body> as a list
-
- Use the following structure:
- +--------------------------------------------+---------------------------+
- | index_header_0 ... index_header_n | data_by_column ... |
- +--------------------------------------------+---------------------------+
-
- Also add elements to the cellstyle_map for more efficient grouped elements in
- <style></style> block
-
- Parameters
- ----------
-        idx_lengths : dict
-            A map of the sparsification structure of the index.
-        max_rows : int
-            Maximum number of rows to render. If exceeded will contain `...` filler.
-        max_cols : int
-            Maximum number of columns to render. If exceeded will contain `...` filler.
-
- Returns
- -------
- body : list
- The associated HTML elements needed for template rendering.
- """
- rlabels = self.data.index.tolist()
- if not isinstance(self.data.index, MultiIndex):
- rlabels = [[x] for x in rlabels]
-
- body: list = []
- visible_row_count: int = 0
- for r, row_tup in [
- z for z in enumerate(self.data.itertuples()) if z[0] not in self.hidden_rows
- ]:
- visible_row_count += 1
- if self._check_trim(
- visible_row_count,
- max_rows,
- body,
- "row",
- ):
- break
-
- body_row = self._generate_body_row(
- (r, row_tup, rlabels), max_cols, idx_lengths
- )
- body.append(body_row)
- return body
-
- def _check_trim(
- self,
- count: int,
- max: int,
- obj: list,
- element: str,
- css: str | None = None,
- value: str = "...",
- ) -> bool:
- """
- Indicates whether to break render loops and append a trimming indicator
-
- Parameters
- ----------
- count : int
- The loop count of previous visible items.
- max : int
- The allowable rendered items in the loop.
- obj : list
- The current render collection of the rendered items.
- element : str
- The type of element to append in the case a trimming indicator is needed.
- css : str, optional
- The css to add to the trimming indicator element.
- value : str, optional
- The value of the elements display if necessary.
-
- Returns
- -------
- result : bool
- Whether a trimming element was required and appended.
- """
- if count > max:
- if element == "row":
- obj.append(self._generate_trimmed_row(max))
- else:
- obj.append(_element(element, css, value, True, attributes=""))
- return True
- return False
-
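-    # Illustrative sketch (hypothetical values, not a real render loop):
-    # ``_check_trim`` returns False while ``count <= max`` and, once the budget
-    # is exceeded, appends a single "..." trim element to ``obj`` and returns
-    # True so the calling loop can break:
-    #
-    #     cells: list = []
-    #     self._check_trim(2, 2, cells, "td", "trim-css")  # False, cells == []
-    #     self._check_trim(3, 2, cells, "td", "trim-css")  # True, one "..." <td>
-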
- def _generate_trimmed_row(self, max_cols: int) -> list:
- """
- When a render has too many rows we generate a trimming row containing "..."
-
- Parameters
- ----------
- max_cols : int
- Number of permissible columns
-
- Returns
- -------
- list of elements
- """
- index_headers = [
- _element(
- "th",
- (
- f"{self.css['row_heading']} {self.css['level']}{c} "
- f"{self.css['row_trim']}"
- ),
- "...",
- not self.hide_index_[c],
- attributes="",
- )
- for c in range(self.data.index.nlevels)
- ]
-
- data: list = []
- visible_col_count: int = 0
- for c, _ in enumerate(self.columns):
- data_element_visible = c not in self.hidden_columns
- if data_element_visible:
- visible_col_count += 1
- if self._check_trim(
- visible_col_count,
- max_cols,
- data,
- "td",
- f"{self.css['data']} {self.css['row_trim']} {self.css['col_trim']}",
- ):
- break
-
- data.append(
- _element(
- "td",
- f"{self.css['data']} {self.css['col']}{c} {self.css['row_trim']}",
- "...",
- data_element_visible,
- attributes="",
- )
- )
-
- return index_headers + data
-
- def _generate_body_row(
- self,
- iter: tuple,
- max_cols: int,
- idx_lengths: dict,
- ):
- """
- Generate a regular row for the body section of appropriate format.
-
- +--------------------------------------------+---------------------------+
- | index_header_0 ... index_header_n | data_by_column ... |
- +--------------------------------------------+---------------------------+
-
- Parameters
- ----------
- iter : tuple
- Iterable from outer scope: row number, row data tuple, row index labels.
- max_cols : int
- Number of permissible columns.
- idx_lengths : dict
- A map of the sparsification structure of the index
-
- Returns
- -------
- list of elements
- """
- r, row_tup, rlabels = iter
-
- index_headers = []
- for c, value in enumerate(rlabels[r]):
- header_element_visible = (
- _is_visible(r, c, idx_lengths) and not self.hide_index_[c]
- )
- header_element = _element(
- "th",
- (
- f"{self.css['row_heading']} {self.css['level']}{c} "
- f"{self.css['row']}{r}"
- ),
- value,
- header_element_visible,
- display_value=self._display_funcs_index[(r, c)](value),
- attributes=(
- f'rowspan="{idx_lengths.get((c, r), 0)}"'
- if idx_lengths.get((c, r), 0) > 1
- else ""
- ),
- )
-
- if self.cell_ids:
- header_element[
- "id"
- ] = f"{self.css['level']}{c}_{self.css['row']}{r}" # id is given
- if (
- header_element_visible
- and (r, c) in self.ctx_index
- and self.ctx_index[r, c]
- ):
- # always add id if a style is specified
- header_element["id"] = f"{self.css['level']}{c}_{self.css['row']}{r}"
- self.cellstyle_map_index[tuple(self.ctx_index[r, c])].append(
- f"{self.css['level']}{c}_{self.css['row']}{r}"
- )
-
- index_headers.append(header_element)
-
- data: list = []
- visible_col_count: int = 0
- for c, value in enumerate(row_tup[1:]):
- data_element_visible = (
- c not in self.hidden_columns and r not in self.hidden_rows
- )
- if data_element_visible:
- visible_col_count += 1
- if self._check_trim(
- visible_col_count,
- max_cols,
- data,
- "td",
- f"{self.css['data']} {self.css['row']}{r} {self.css['col_trim']}",
- ):
- break
-
- # add custom classes from cell context
- cls = ""
- if (r, c) in self.cell_context:
- cls = " " + self.cell_context[r, c]
-
- data_element = _element(
- "td",
- (
- f"{self.css['data']} {self.css['row']}{r} "
- f"{self.css['col']}{c}{cls}"
- ),
- value,
- data_element_visible,
- attributes="",
- display_value=self._display_funcs[(r, c)](value),
- )
-
- if self.cell_ids:
- data_element["id"] = f"{self.css['row']}{r}_{self.css['col']}{c}"
- if data_element_visible and (r, c) in self.ctx and self.ctx[r, c]:
- # always add id if needed due to specified style
- data_element["id"] = f"{self.css['row']}{r}_{self.css['col']}{c}"
- self.cellstyle_map[tuple(self.ctx[r, c])].append(
- f"{self.css['row']}{r}_{self.css['col']}{c}"
- )
-
- data.append(data_element)
-
- return index_headers + data
-
- def _translate_latex(self, d: dict, clines: str | None) -> None:
- r"""
- Post-process the default render dict for the LaTeX template format.
-
- Processing items included are:
- - Remove hidden columns from the non-headers part of the body.
- - Place cellstyles directly in td cells rather than use cellstyle_map.
- - Remove hidden indexes or reinsert missing th elements if part of multiindex
- or multirow sparsification (so that \multirow and \multicol work correctly).
- """
- index_levels = self.index.nlevels
- visible_index_level_n = index_levels - sum(self.hide_index_)
- d["head"] = [
- [
- {**col, "cellstyle": self.ctx_columns[r, c - visible_index_level_n]}
- for c, col in enumerate(row)
- if col["is_visible"]
- ]
- for r, row in enumerate(d["head"])
- ]
-
- def _concatenated_visible_rows(obj, n, row_indices):
- """
- Extract all visible row indices recursively from concatenated stylers.
- """
- row_indices.extend(
- [r + n for r in range(len(obj.index)) if r not in obj.hidden_rows]
- )
- n += len(obj.index)
- for concatenated in obj.concatenated:
- n = _concatenated_visible_rows(concatenated, n, row_indices)
- return n
-
- def concatenated_visible_rows(obj):
- row_indices: list[int] = []
- _concatenated_visible_rows(obj, 0, row_indices)
- # TODO try to consolidate the concat visible rows
- # methods to a single function / recursion for simplicity
- return row_indices
-
- body = []
- for r, row in zip(concatenated_visible_rows(self), d["body"]):
- # note: cannot enumerate d["body"] because rows were dropped if hidden
- # during _translate_body so must zip to acquire the true r-index associated
- # with the ctx obj which contains the cell styles.
- if all(self.hide_index_):
- row_body_headers = []
- else:
- row_body_headers = [
- {
- **col,
- "display_value": col["display_value"]
- if col["is_visible"]
- else "",
- "cellstyle": self.ctx_index[r, c],
- }
- for c, col in enumerate(row[:index_levels])
- if (col["type"] == "th" and not self.hide_index_[c])
- ]
-
- row_body_cells = [
- {**col, "cellstyle": self.ctx[r, c]}
- for c, col in enumerate(row[index_levels:])
- if (col["is_visible"] and col["type"] == "td")
- ]
-
- body.append(row_body_headers + row_body_cells)
- d["body"] = body
-
- # clines are determined from info on index_lengths and hidden_rows and input
- # to a dict defining which row clines should be added in the template.
- if clines not in [
- None,
- "all;data",
- "all;index",
- "skip-last;data",
- "skip-last;index",
- ]:
- raise ValueError(
- f"`clines` value of {clines} is invalid. Should either be None or one "
- f"of 'all;data', 'all;index', 'skip-last;data', 'skip-last;index'."
- )
- if clines is not None:
- data_len = len(row_body_cells) if "data" in clines and d["body"] else 0
-
- d["clines"] = defaultdict(list)
- visible_row_indexes: list[int] = [
- r for r in range(len(self.data.index)) if r not in self.hidden_rows
- ]
- visible_index_levels: list[int] = [
- i for i in range(index_levels) if not self.hide_index_[i]
- ]
- for rn, r in enumerate(visible_row_indexes):
- for lvln, lvl in enumerate(visible_index_levels):
- if lvl == index_levels - 1 and "skip-last" in clines:
- continue
- idx_len = d["index_lengths"].get((lvl, r), None)
- if idx_len is not None: # i.e. not a sparsified entry
- d["clines"][rn + idx_len].append(
- f"\\cline{{{lvln+1}-{len(visible_index_levels)+data_len}}}"
- )
-
- def format(
- self,
- formatter: ExtFormatter | None = None,
- subset: Subset | None = None,
- na_rep: str | None = None,
- precision: int | None = None,
- decimal: str = ".",
- thousands: str | None = None,
- escape: str | None = None,
- hyperlinks: str | None = None,
- ) -> StylerRenderer:
- r"""
- Format the text display value of cells.
-
- Parameters
- ----------
- formatter : str, callable, dict or None
- Object to define how values are displayed. See notes.
- subset : label, array-like, IndexSlice, optional
- A valid 2d input to `DataFrame.loc[<subset>]`, or, in the case of a 1d input
- or single key, to `DataFrame.loc[:, <subset>]` where the columns are
- prioritised, to limit ``data`` to *before* applying the function.
- na_rep : str, optional
- Representation for missing values.
- If ``na_rep`` is None, no special formatting is applied.
- precision : int, optional
- Floating point precision to use for display purposes, if not determined by
- the specified ``formatter``.
-
- .. versionadded:: 1.3.0
-
- decimal : str, default "."
- Character used as decimal separator for floats, complex and integers.
-
- .. versionadded:: 1.3.0
-
- thousands : str, optional, default None
- Character used as thousands separator for floats, complex and integers.
-
- .. versionadded:: 1.3.0
-
- escape : str, optional
- Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"``
- in cell display string with HTML-safe sequences.
- Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``,
- ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with
- LaTeX-safe sequences.
- Escaping is done before ``formatter``.
-
- .. versionadded:: 1.3.0
-
- hyperlinks : {"html", "latex"}, optional
- Convert string patterns containing https://, http://, ftp:// or www. to
- HTML <a> tags as clickable URL hyperlinks if "html", or LaTeX \href
- commands if "latex".
-
- .. versionadded:: 1.4.0
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.format_index: Format the text display value of index labels.
-
- Notes
- -----
- This method assigns a formatting function, ``formatter``, to each cell in the
- DataFrame. If ``formatter`` is ``None``, then the default formatter is used.
- If a callable then that function should take a data value as input and return
- a displayable representation, such as a string. If ``formatter`` is
- given as a string this is assumed to be a valid Python format specification
- and is wrapped to a callable as ``string.format(x)``. If a ``dict`` is given,
- keys should correspond to column names, and values should be string or
- callable, as above.
-
- The default formatter currently expresses floats and complex numbers with the
- pandas display precision unless using the ``precision`` argument here. The
- default formatter does not adjust the representation of missing values unless
- the ``na_rep`` argument is used.
-
- The ``subset`` argument defines which region to apply the formatting function
- to. If the ``formatter`` argument is given in dict form but does not include
- all columns within the subset then these columns will have the default formatter
- applied. Any columns in the formatter dict excluded from the subset will
- be ignored.
-
- When using a ``formatter`` string the dtypes must be compatible, otherwise a
- `ValueError` will be raised.
-
-        When instantiating a Styler, default formatting can be applied by setting the
- ``pandas.options``:
-
- - ``styler.format.formatter``: default None.
- - ``styler.format.na_rep``: default None.
- - ``styler.format.precision``: default 6.
- - ``styler.format.decimal``: default ".".
- - ``styler.format.thousands``: default None.
- - ``styler.format.escape``: default None.
-
- .. warning::
- `Styler.format` is ignored when using the output format `Styler.to_excel`,
-            since Excel and Python have inherently different formatting structures.
- However, it is possible to use the `number-format` pseudo CSS attribute
- to force Excel permissible formatting. See examples.
-
- Examples
- --------
- Using ``na_rep`` and ``precision`` with the default ``formatter``
-
- >>> df = pd.DataFrame([[np.nan, 1.0, 'A'], [2.0, np.nan, 3.0]])
- >>> df.style.format(na_rep='MISS', precision=3) # doctest: +SKIP
- 0 1 2
- 0 MISS 1.000 A
- 1 2.000 MISS 3.000
-
- Using a ``formatter`` specification on consistent column dtypes
-
- >>> df.style.format('{:.2f}', na_rep='MISS', subset=[0,1]) # doctest: +SKIP
- 0 1 2
- 0 MISS 1.00 A
- 1 2.00 MISS 3.000000
-
- Using the default ``formatter`` for unspecified columns
-
- >>> df.style.format({0: '{:.2f}', 1: '£ {:.1f}'}, na_rep='MISS', precision=1)
- ... # doctest: +SKIP
- 0 1 2
- 0 MISS £ 1.0 A
- 1 2.00 MISS 3.0
-
- Multiple ``na_rep`` or ``precision`` specifications under the default
- ``formatter``.
-
- >>> (df.style.format(na_rep='MISS', precision=1, subset=[0])
- ... .format(na_rep='PASS', precision=2, subset=[1, 2])) # doctest: +SKIP
- 0 1 2
- 0 MISS 1.00 A
- 1 2.0 PASS 3.00
-
- Using a callable ``formatter`` function.
-
- >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT'
- >>> df.style.format({0: '{:.1f}', 2: func}, precision=4, na_rep='MISS')
- ... # doctest: +SKIP
- 0 1 2
- 0 MISS 1.0000 STRING
- 1 2.0 MISS FLOAT
-
- Using a ``formatter`` with HTML ``escape`` and ``na_rep``.
-
- >>> df = pd.DataFrame([['<div></div>', '"A&B"', None]])
- >>> s = df.style.format(
- ... '<a href="a.com/{0}">{0}</a>', escape="html", na_rep="NA"
- ... )
- >>> s.to_html() # doctest: +SKIP
- ...
- <td .. ><a href="a.com/&lt;div&gt;&lt;/div&gt;">&lt;div&gt;&lt;/div&gt;</a></td>
- <td .. ><a href="a.com/&#34;A&amp;B&#34;">&#34;A&amp;B&#34;</a></td>
- <td .. >NA</td>
- ...
-
- Using a ``formatter`` with LaTeX ``escape``.
-
- >>> df = pd.DataFrame([["123"], ["~ ^"], ["$%#"]])
- >>> df.style.format("\\textbf{{{}}}", escape="latex").to_latex()
- ... # doctest: +SKIP
- \begin{tabular}{ll}
- {} & {0} \\
- 0 & \textbf{123} \\
- 1 & \textbf{\textasciitilde \space \textasciicircum } \\
- 2 & \textbf{\$\%\#} \\
- \end{tabular}
-
- Pandas defines a `number-format` pseudo CSS attribute instead of the `.format`
- method to create `to_excel` permissible formatting. Note that semi-colons are
- CSS protected characters but used as separators in Excel's format string.
- Replace semi-colons with the section separator character (ASCII-245) when
- defining the formatting here.
-
- >>> df = pd.DataFrame({"A": [1, 0, -1]})
- >>> pseudo_css = "number-format: 0§[Red](0)§-§@;"
- >>> filename = "formatted_file.xlsx"
- >>> df.style.applymap(lambda v: pseudo_css).to_excel(filename) # doctest: +SKIP
-
- .. figure:: ../../_static/style/format_excel_css.png
- """
- if all(
- (
- formatter is None,
- subset is None,
- precision is None,
- decimal == ".",
- thousands is None,
- na_rep is None,
- escape is None,
- hyperlinks is None,
- )
- ):
- self._display_funcs.clear()
- return self # clear the formatter / revert to default and avoid looping
-
- subset = slice(None) if subset is None else subset
- subset = non_reducing_slice(subset)
- data = self.data.loc[subset]
-
- if not isinstance(formatter, dict):
- formatter = {col: formatter for col in data.columns}
-
- cis = self.columns.get_indexer_for(data.columns)
- ris = self.index.get_indexer_for(data.index)
- for ci in cis:
- format_func = _maybe_wrap_formatter(
- formatter.get(self.columns[ci]),
- na_rep=na_rep,
- precision=precision,
- decimal=decimal,
- thousands=thousands,
- escape=escape,
- hyperlinks=hyperlinks,
- )
- for ri in ris:
- self._display_funcs[(ri, ci)] = format_func
-
- return self
-
- def format_index(
- self,
- formatter: ExtFormatter | None = None,
- axis: Axis = 0,
- level: Level | list[Level] | None = None,
- na_rep: str | None = None,
- precision: int | None = None,
- decimal: str = ".",
- thousands: str | None = None,
- escape: str | None = None,
- hyperlinks: str | None = None,
- ) -> StylerRenderer:
- r"""
- Format the text display value of index labels or column headers.
-
- .. versionadded:: 1.4.0
-
- Parameters
- ----------
- formatter : str, callable, dict or None
- Object to define how values are displayed. See notes.
- axis : {0, "index", 1, "columns"}
- Whether to apply the formatter to the index or column headers.
- level : int, str, list
- The level(s) over which to apply the generic formatter.
- na_rep : str, optional
- Representation for missing values.
- If ``na_rep`` is None, no special formatting is applied.
- precision : int, optional
- Floating point precision to use for display purposes, if not determined by
- the specified ``formatter``.
- decimal : str, default "."
- Character used as decimal separator for floats, complex and integers.
- thousands : str, optional, default None
- Character used as thousands separator for floats, complex and integers.
- escape : str, optional
- Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"``
- in cell display string with HTML-safe sequences.
- Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``,
- ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with
- LaTeX-safe sequences.
- Escaping is done before ``formatter``.
- hyperlinks : {"html", "latex"}, optional
- Convert string patterns containing https://, http://, ftp:// or www. to
- HTML <a> tags as clickable URL hyperlinks if "html", or LaTeX \href
- commands if "latex".
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.format: Format the text display value of data cells.
-
- Notes
- -----
- This method assigns a formatting function, ``formatter``, to each level label
- in the DataFrame's index or column headers. If ``formatter`` is ``None``,
- then the default formatter is used.
- If a callable then that function should take a label value as input and return
- a displayable representation, such as a string. If ``formatter`` is
- given as a string this is assumed to be a valid Python format specification
- and is wrapped to a callable as ``string.format(x)``. If a ``dict`` is given,
- keys should correspond to MultiIndex level numbers or names, and values should
- be string or callable, as above.
-
- The default formatter currently expresses floats and complex numbers with the
- pandas display precision unless using the ``precision`` argument here. The
- default formatter does not adjust the representation of missing values unless
- the ``na_rep`` argument is used.
-
- The ``level`` argument defines which levels of a MultiIndex to apply the
- method to. If the ``formatter`` argument is given in dict form but does
- not include all levels within the level argument then these unspecified levels
- will have the default formatter applied. Any levels in the formatter dict
- specifically excluded from the level argument will be ignored.
-
- When using a ``formatter`` string the dtypes must be compatible, otherwise a
- `ValueError` will be raised.
-
- .. warning::
- `Styler.format_index` is ignored when using the output format
-            `Styler.to_excel`, since Excel and Python have inherently different
- formatting structures.
- However, it is possible to use the `number-format` pseudo CSS attribute
- to force Excel permissible formatting. See documentation for `Styler.format`.
-
- Examples
- --------
- Using ``na_rep`` and ``precision`` with the default ``formatter``
-
- >>> df = pd.DataFrame([[1, 2, 3]], columns=[2.0, np.nan, 4.0])
- >>> df.style.format_index(axis=1, na_rep='MISS', precision=3) # doctest: +SKIP
- 2.000 MISS 4.000
- 0 1 2 3
-
- Using a ``formatter`` specification on consistent dtypes in a level
-
- >>> df.style.format_index('{:.2f}', axis=1, na_rep='MISS') # doctest: +SKIP
- 2.00 MISS 4.00
- 0 1 2 3
-
- Using the default ``formatter`` for unspecified levels
-
- >>> df = pd.DataFrame([[1, 2, 3]],
- ... columns=pd.MultiIndex.from_arrays([["a", "a", "b"],[2, np.nan, 4]]))
-        >>> df.style.format_index({0: lambda v: v.upper()}, axis=1, precision=1)
- ... # doctest: +SKIP
- A B
- 2.0 nan 4.0
- 0 1 2 3
-
- Using a callable ``formatter`` function.
-
- >>> func = lambda s: 'STRING' if isinstance(s, str) else 'FLOAT'
- >>> df.style.format_index(func, axis=1, na_rep='MISS')
- ... # doctest: +SKIP
- STRING STRING
- FLOAT MISS FLOAT
- 0 1 2 3
-
- Using a ``formatter`` with HTML ``escape`` and ``na_rep``.
-
- >>> df = pd.DataFrame([[1, 2, 3]], columns=['"A"', 'A&B', None])
- >>> s = df.style.format_index('$ {0}', axis=1, escape="html", na_rep="NA")
- ... # doctest: +SKIP
- <th .. >$ &#34;A&#34;</th>
- <th .. >$ A&amp;B</th>
- <th .. >NA</td>
- ...
-
- Using a ``formatter`` with LaTeX ``escape``.
-
- >>> df = pd.DataFrame([[1, 2, 3]], columns=["123", "~", "$%#"])
- >>> df.style.format_index("\\textbf{{{}}}", escape="latex", axis=1).to_latex()
- ... # doctest: +SKIP
- \begin{tabular}{lrrr}
- {} & {\textbf{123}} & {\textbf{\textasciitilde }} & {\textbf{\$\%\#}} \\
- 0 & 1 & 2 & 3 \\
- \end{tabular}
- """
- axis = self.data._get_axis_number(axis)
- if axis == 0:
- display_funcs_, obj = self._display_funcs_index, self.index
- else:
- display_funcs_, obj = self._display_funcs_columns, self.columns
- levels_ = refactor_levels(level, obj)
-
- if all(
- (
- formatter is None,
- level is None,
- precision is None,
- decimal == ".",
- thousands is None,
- na_rep is None,
- escape is None,
- hyperlinks is None,
- )
- ):
- display_funcs_.clear()
- return self # clear the formatter / revert to default and avoid looping
-
- if not isinstance(formatter, dict):
- formatter = {level: formatter for level in levels_}
- else:
- formatter = {
- obj._get_level_number(level): formatter_
- for level, formatter_ in formatter.items()
- }
-
- for lvl in levels_:
- format_func = _maybe_wrap_formatter(
- formatter.get(lvl),
- na_rep=na_rep,
- precision=precision,
- decimal=decimal,
- thousands=thousands,
- escape=escape,
- hyperlinks=hyperlinks,
- )
-
- for idx in [(i, lvl) if axis == 0 else (lvl, i) for i in range(len(obj))]:
- display_funcs_[idx] = format_func
-
- return self
-
- def relabel_index(
- self,
- labels: Sequence | Index,
- axis: Axis = 0,
- level: Level | list[Level] | None = None,
- ) -> StylerRenderer:
- r"""
- Relabel the index, or column header, keys to display a set of specified values.
-
- .. versionadded:: 1.5.0
-
- Parameters
- ----------
- labels : list-like or Index
- New labels to display. Must have same length as the underlying values not
- hidden.
- axis : {"index", 0, "columns", 1}
- Apply to the index or columns.
- level : int, str, list, optional
- The level(s) over which to apply the new labels. If `None` will apply
- to all levels of an Index or MultiIndex which are not hidden.
-
- Returns
- -------
- Styler
-
- See Also
- --------
- Styler.format_index: Format the text display value of index or column headers.
- Styler.hide: Hide the index, column headers, or specified data from display.
-
- Notes
- -----
- As part of Styler, this method allows the display of an index to be
- completely user-specified without affecting the underlying DataFrame data,
- index, or column headers. This means that the flexibility of indexing is
- maintained whilst the final display is customisable.
-
- Since Styler is designed to be progressively constructed with method chaining,
- this method is adapted to react to the **currently specified hidden elements**.
- This is useful because it means one does not have to specify all the new
- labels if the majority of an index, or column headers, have already been hidden.
- The following produce equivalent display (note the length of ``labels`` in
- each case).
-
- .. code-block:: python
-
- # relabel first, then hide
- df = pd.DataFrame({"col": ["a", "b", "c"]})
- df.style.relabel_index(["A", "B", "C"]).hide([0,1])
- # hide first, then relabel
- df = pd.DataFrame({"col": ["a", "b", "c"]})
- df.style.hide([0,1]).relabel_index(["C"])
-
- This method should be used, rather than :meth:`Styler.format_index`, in one of
- the following cases (see examples):
-
- - A specified set of labels are required which are not a function of the
- underlying index keys.
- - The function of the underlying index keys requires a counter variable,
- such as those available upon enumeration.
-
- Examples
- --------
- Basic use
-
- >>> df = pd.DataFrame({"col": ["a", "b", "c"]})
- >>> df.style.relabel_index(["A", "B", "C"]) # doctest: +SKIP
- col
- A a
- B b
- C c
-
- Chaining with pre-hidden elements
-
- >>> df.style.hide([0,1]).relabel_index(["C"]) # doctest: +SKIP
- col
- C c
-
- Using a MultiIndex
-
- >>> midx = pd.MultiIndex.from_product([[0, 1], [0, 1], [0, 1]])
- >>> df = pd.DataFrame({"col": list(range(8))}, index=midx)
- >>> styler = df.style # doctest: +SKIP
- col
- 0 0 0 0
- 1 1
- 1 0 2
- 1 3
- 1 0 0 4
- 1 5
- 1 0 6
- 1 7
- >>> styler.hide((midx.get_level_values(0)==0)|(midx.get_level_values(1)==0))
- ... # doctest: +SKIP
- >>> styler.hide(level=[0,1]) # doctest: +SKIP
- >>> styler.relabel_index(["binary6", "binary7"]) # doctest: +SKIP
- col
- binary6 6
- binary7 7
-
- We can also achieve the above by indexing first and then re-labeling
-
- >>> styler = df.loc[[(1,1,0), (1,1,1)]].style
- >>> styler.hide(level=[0,1]).relabel_index(["binary6", "binary7"])
- ... # doctest: +SKIP
- col
- binary6 6
- binary7 7
-
- Defining a formatting function which uses an enumeration counter. Also note
- that the value of the index key is passed in the case of string labels so it
-        can also be inserted into the label, using curly brackets (or double curly
-        brackets if the string is pre-formatted),
-
- >>> df = pd.DataFrame({"samples": np.random.rand(10)})
- >>> styler = df.loc[np.random.randint(0,10,3)].style
- >>> styler.relabel_index([f"sample{i+1} ({{}})" for i in range(3)])
- ... # doctest: +SKIP
- samples
- sample1 (5) 0.315811
- sample2 (0) 0.495941
- sample3 (2) 0.067946
- """
- axis = self.data._get_axis_number(axis)
- if axis == 0:
- display_funcs_, obj = self._display_funcs_index, self.index
- hidden_labels, hidden_lvls = self.hidden_rows, self.hide_index_
- else:
- display_funcs_, obj = self._display_funcs_columns, self.columns
- hidden_labels, hidden_lvls = self.hidden_columns, self.hide_columns_
- visible_len = len(obj) - len(set(hidden_labels))
- if len(labels) != visible_len:
- raise ValueError(
- "``labels`` must be of length equal to the number of "
- f"visible labels along ``axis`` ({visible_len})."
- )
-
- if level is None:
- level = [i for i in range(obj.nlevels) if not hidden_lvls[i]]
- levels_ = refactor_levels(level, obj)
-
- def alias_(x, value):
- if isinstance(value, str):
- return value.format(x)
- return value
-
- for ai, i in enumerate([i for i in range(len(obj)) if i not in hidden_labels]):
- if len(levels_) == 1:
- idx = (i, levels_[0]) if axis == 0 else (levels_[0], i)
- display_funcs_[idx] = partial(alias_, value=labels[ai])
- else:
- for aj, lvl in enumerate(levels_):
- idx = (i, lvl) if axis == 0 else (lvl, i)
- display_funcs_[idx] = partial(alias_, value=labels[ai][aj])
-
- return self
-
-
-def _element(
- html_element: str,
- html_class: str | None,
- value: Any,
- is_visible: bool,
- **kwargs,
-) -> dict:
- """
- Template to return container with information for a <td></td> or <th></th> element.
- """
- if "display_value" not in kwargs:
- kwargs["display_value"] = value
- return {
- "type": html_element,
- "value": value,
- "class": html_class,
- "is_visible": is_visible,
- **kwargs,
- }
-
-
-def _get_trimming_maximums(
- rn,
- cn,
- max_elements,
- max_rows=None,
- max_cols=None,
- scaling_factor: float = 0.8,
-) -> tuple[int, int]:
- """
- Recursively reduce the number of rows and columns to satisfy max elements.
-
- Parameters
- ----------
- rn, cn : int
- The number of input rows / columns
- max_elements : int
- The number of allowable elements
- max_rows, max_cols : int, optional
- Directly specify an initial maximum rows or columns before compression.
- scaling_factor : float
- Factor at which to reduce the number of rows / columns to fit.
-
- Returns
- -------
- rn, cn : tuple
- New rn and cn values that satisfy the max_elements constraint
- """
-
- def scale_down(rn, cn):
- if cn >= rn:
- return rn, int(cn * scaling_factor)
- else:
- return int(rn * scaling_factor), cn
-
- if max_rows:
- rn = max_rows if rn > max_rows else rn
- if max_cols:
- cn = max_cols if cn > max_cols else cn
-
- while rn * cn > max_elements:
- rn, cn = scale_down(rn, cn)
-
- return rn, cn
-
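-# Illustrative sketch (hypothetical numbers): the larger of the two dimensions
-# is repeatedly scaled by ``scaling_factor`` (default 0.8) until the element
-# budget is met, e.g.
-#
-#     _get_trimming_maximums(rn=4, cn=10, max_elements=20)
-#     # 4 * 10 > 20, so cn shrinks 10 -> 8 -> 6 -> 4 and (4, 4) is returned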
-
-def _get_level_lengths(
- index: Index,
- sparsify: bool,
- max_index: int,
- hidden_elements: Sequence[int] | None = None,
-):
- """
- Given an index, find the level length for each element.
-
- Parameters
- ----------
- index : Index
- Index or columns to determine lengths of each element
- sparsify : bool
- Whether to hide or show each distinct element in a MultiIndex
- max_index : int
- The maximum number of elements to analyse along the index due to trimming
- hidden_elements : sequence of int
- Index positions of elements hidden from display in the index affecting
- length
-
- Returns
- -------
- Dict :
- Result is a dictionary of (level, initial_position): span
- """
- if isinstance(index, MultiIndex):
- levels = index.format(sparsify=lib.no_default, adjoin=False)
- else:
- levels = index.format()
-
- if hidden_elements is None:
- hidden_elements = []
-
- lengths = {}
- if not isinstance(index, MultiIndex):
- for i, value in enumerate(levels):
- if i not in hidden_elements:
- lengths[(0, i)] = 1
- return lengths
-
- for i, lvl in enumerate(levels):
- visible_row_count = 0 # used to break loop due to display trimming
- for j, row in enumerate(lvl):
- if visible_row_count > max_index:
- break
- if not sparsify:
- # then lengths will always equal 1 since no aggregation.
- if j not in hidden_elements:
- lengths[(i, j)] = 1
- visible_row_count += 1
- elif (row is not lib.no_default) and (j not in hidden_elements):
- # this element has not been sparsified so must be the start of section
- last_label = j
- lengths[(i, last_label)] = 1
- visible_row_count += 1
- elif row is not lib.no_default:
- # even if the above is hidden, keep track of it in case length > 1 and
- # later elements are visible
- last_label = j
- lengths[(i, last_label)] = 0
- elif j not in hidden_elements:
- # then element must be part of sparsified section and is visible
- visible_row_count += 1
- if visible_row_count > max_index:
- break # do not add a length since the render trim limit reached
- if lengths[(i, last_label)] == 0:
- # if previous iteration was first-of-section but hidden then offset
- last_label = j
- lengths[(i, last_label)] = 1
- else:
- # else add to previous iteration
- lengths[(i, last_label)] += 1
-
- non_zero_lengths = {
- element: length for element, length in lengths.items() if length >= 1
- }
-
- return non_zero_lengths
-
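-# Illustrative sketch (hypothetical index): for a sparsified two-level
-# MultiIndex the mapping holds one span per visible starting position, e.g.
-#
-#     midx = MultiIndex.from_product([["a", "b"], [1, 2]])
-#     _get_level_lengths(midx, sparsify=True, max_index=100)
-#     # {(0, 0): 2, (0, 2): 2, (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1}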
-
-def _is_visible(idx_row, idx_col, lengths) -> bool:
- """
-    Return whether the element at (idx_row, idx_col) is visible, i.e. whether
-    (idx_col, idx_row) is a key of the ``lengths`` mapping.
- """
- return (idx_col, idx_row) in lengths
-
-
-def format_table_styles(styles: CSSStyles) -> CSSStyles:
- """
- looks for multiple CSS selectors and separates them:
- [{'selector': 'td, th', 'props': 'a:v;'}]
- ---> [{'selector': 'td', 'props': 'a:v;'},
- {'selector': 'th', 'props': 'a:v;'}]
- """
- return [
- {"selector": selector, "props": css_dict["props"]}
- for css_dict in styles
- for selector in css_dict["selector"].split(",")
- ]
-
-
-def _default_formatter(x: Any, precision: int, thousands: bool = False) -> Any:
- """
- Format the display of a value
-
- Parameters
- ----------
- x : Any
- Input variable to be formatted
-    precision : int
- Floating point precision used if ``x`` is float or complex.
- thousands : bool, default False
- Whether to group digits with thousands separated with ",".
-
- Returns
- -------
- value : Any
- Matches input type, or string if input is float or complex or int with sep.
- """
- if is_float(x) or is_complex(x):
- return f"{x:,.{precision}f}" if thousands else f"{x:.{precision}f}"
- elif is_integer(x):
- return f"{x:,.0f}" if thousands else f"{x:.0f}"
- return x
-
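-# Illustrative sketch: floats and complex numbers honour ``precision``,
-# integers are shown without decimals, and any other type passes through
-# unchanged, e.g.
-#
-#     _default_formatter(1234.5678, precision=2, thousands=True)  # '1,234.57'
-#     _default_formatter(1234, precision=2)                       # '1234'
-#     _default_formatter("text", precision=2)                     # 'text'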
-
-def _wrap_decimal_thousands(
- formatter: Callable, decimal: str, thousands: str | None
-) -> Callable:
- """
- Takes a string formatting function and wraps logic to deal with thousands and
- decimal parameters, in the case that they are non-standard and that the input
- is a (float, complex, int).
- """
-
- def wrapper(x):
- if is_float(x) or is_integer(x) or is_complex(x):
- if decimal != "." and thousands is not None and thousands != ",":
- return (
- formatter(x)
- .replace(",", "§_§-") # rare string to avoid "," <-> "." clash.
- .replace(".", decimal)
- .replace("§_§-", thousands)
- )
- elif decimal != "." and (thousands is None or thousands == ","):
- return formatter(x).replace(".", decimal)
- elif decimal == "." and thousands is not None and thousands != ",":
- return formatter(x).replace(",", thousands)
- return formatter(x)
-
- return wrapper
-
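-# Illustrative sketch (assumed base formatter '{:,.2f}'.format): swapping the
-# separators produces European-style output, e.g.
-#
-#     wrapped = _wrap_decimal_thousands("{:,.2f}".format, decimal=",", thousands=".")
-#     wrapped(1234567.891)  # '1.234.567,89'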
-
-def _str_escape(x, escape):
- """if escaping: only use on str, else return input"""
- if isinstance(x, str):
- if escape == "html":
- return escape_html(x)
- elif escape == "latex":
- return _escape_latex(x)
- else:
- raise ValueError(
- f"`escape` only permitted in {{'html', 'latex'}}, got {escape}"
- )
- return x
-
-
-def _render_href(x, format):
- """uses regex to detect a common URL pattern and converts to href tag in format."""
- if isinstance(x, str):
- if format == "html":
- href = '<a href="{0}" target="_blank">{0}</a>'
- elif format == "latex":
- href = r"\href{{{0}}}{{{0}}}"
- else:
- raise ValueError("``hyperlinks`` format can only be 'html' or 'latex'")
- pat = r"((http|ftp)s?:\/\/|www.)[\w/\-?=%.:@]+\.[\w/\-&?=%.,':;~!@#$*()\[\]]+"
- return re.sub(pat, lambda m: href.format(m.group(0)), x)
- return x
-
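-# Illustrative sketch: only the URL-like substring is wrapped; surrounding text
-# is left untouched, e.g.
-#
-#     _render_href("see www.pandas.pydata.org", "html")
-#     # 'see <a href="www.pandas.pydata.org" target="_blank">www.pandas.pydata.org</a>'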
-
-def _maybe_wrap_formatter(
- formatter: BaseFormatter | None = None,
- na_rep: str | None = None,
- precision: int | None = None,
- decimal: str = ".",
- thousands: str | None = None,
- escape: str | None = None,
- hyperlinks: str | None = None,
-) -> Callable:
- """
-    Allows formatters to be expressed as str, callable or None, where None returns
-    a default formatting function. Wraps with ``na_rep`` and ``precision`` where
-    they are available.
- """
- # Get initial func from input string, input callable, or from default factory
- if isinstance(formatter, str):
- func_0 = lambda x: formatter.format(x)
- elif callable(formatter):
- func_0 = formatter
- elif formatter is None:
- precision = (
- get_option("styler.format.precision") if precision is None else precision
- )
- func_0 = partial(
- _default_formatter, precision=precision, thousands=(thousands is not None)
- )
- else:
- raise TypeError(f"'formatter' expected str or callable, got {type(formatter)}")
-
- # Replace chars if escaping
- if escape is not None:
- func_1 = lambda x: func_0(_str_escape(x, escape=escape))
- else:
- func_1 = func_0
-
- # Replace decimals and thousands if non-standard inputs detected
- if decimal != "." or (thousands is not None and thousands != ","):
- func_2 = _wrap_decimal_thousands(func_1, decimal=decimal, thousands=thousands)
- else:
- func_2 = func_1
-
- # Render links
- if hyperlinks is not None:
- func_3 = lambda x: func_2(_render_href(x, format=hyperlinks))
- else:
- func_3 = func_2
-
- # Replace missing values if na_rep
- if na_rep is None:
- return func_3
- else:
- return lambda x: na_rep if (isna(x) is True) else func_3(x)
-
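-# Illustrative sketch: the returned callable layers escaping, separator
-# replacement, hyperlink rendering and ``na_rep`` handling around the base
-# formatter, e.g. for a simple format string:
-#
-#     fmt = _maybe_wrap_formatter("{:.1f}", na_rep="MISS")
-#     fmt(1.234)   # '1.2'
-#     fmt(np.nan)  # 'MISS'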
-
-def non_reducing_slice(slice_: Subset):
- """
- Ensure that a slice doesn't reduce to a Series or Scalar.
-
- Any user-passed `subset` should have this called on it
- to make sure we're always working with DataFrames.
- """
- # default to column slice, like DataFrame
- # ['A', 'B'] -> IndexSlices[:, ['A', 'B']]
- kinds = (ABCSeries, np.ndarray, Index, list, str)
- if isinstance(slice_, kinds):
- slice_ = IndexSlice[:, slice_]
-
- def pred(part) -> bool:
- """
- Returns
- -------
- bool
- True if slice does *not* reduce,
- False if `part` is a tuple.
- """
- # true when slice does *not* reduce, False when part is a tuple,
- # i.e. MultiIndex slice
- if isinstance(part, tuple):
- # GH#39421 check for sub-slice:
- return any((isinstance(s, slice) or is_list_like(s)) for s in part)
- else:
- return isinstance(part, slice) or is_list_like(part)
-
- if not is_list_like(slice_):
- if not isinstance(slice_, slice):
- # a 1-d slice, like df.loc[1]
- slice_ = [[slice_]]
- else:
- # slice(a, b, c)
- slice_ = [slice_] # to tuplize later
- else:
- # error: Item "slice" of "Union[slice, Sequence[Any]]" has no attribute
- # "__iter__" (not iterable) -> is specifically list_like in conditional
- slice_ = [p if pred(p) else [p] for p in slice_] # type: ignore[union-attr]
- return tuple(slice_)
-
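-# Illustrative sketch: a single column label is widened so that ``.loc`` keeps
-# returning a DataFrame rather than reducing to a Series, e.g.
-#
-#     non_reducing_slice("A")  # (slice(None, None, None), ['A'])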
-
-def maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList:
- """
- Convert css-string to sequence of tuples format if needed.
-    'color:red; border:1px solid black;' -> [('color', 'red'),
-        ('border','1px solid black')]
- """
- if isinstance(style, str):
- s = style.split(";")
- try:
- return [
- (x.split(":")[0].strip(), x.split(":")[1].strip())
- for x in s
- if x.strip() != ""
- ]
- except IndexError:
- raise ValueError(
- "Styles supplied as string must follow CSS rule formats, "
- f"for example 'attr: val;'. '{style}' was given."
- )
- return style
-
-
-def refactor_levels(
- level: Level | list[Level] | None,
- obj: Index,
-) -> list[int]:
- """
- Returns a consistent levels arg for use in ``hide_index`` or ``hide_columns``.
-
- Parameters
- ----------
- level : int, str, list
- Original ``level`` arg supplied to above methods.
- obj:
- Either ``self.index`` or ``self.columns``
-
- Returns
- -------
- list : refactored arg with a list of levels to hide
- """
- if level is None:
- levels_: list[int] = list(range(obj.nlevels))
- elif isinstance(level, int):
- levels_ = [level]
- elif isinstance(level, str):
- levels_ = [obj._get_level_number(level)]
- elif isinstance(level, list):
- levels_ = [
- obj._get_level_number(lev) if not isinstance(lev, int) else lev
- for lev in level
- ]
- else:
- raise ValueError("`level` must be of type `int`, `str` or list of such")
- return levels_
-
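-# Illustrative sketch (hypothetical index): mixed ``level`` inputs are
-# normalised to a list of integer level numbers, e.g.
-#
-#     obj = MultiIndex.from_arrays([[1], [2]], names=["outer", "inner"])
-#     refactor_levels(None, obj)          # [0, 1]
-#     refactor_levels("inner", obj)       # [1]
-#     refactor_levels([0, "inner"], obj)  # [0, 1]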
-
-class Tooltips:
- """
- An extension to ``Styler`` that allows for and manipulates tooltips on hover
- of ``<td>`` cells in the HTML result.
-
- Parameters
- ----------
- css_name: str, default "pd-t"
- Name of the CSS class that controls visualisation of tooltips.
- css_props: list-like, default; see Notes
- List of (attr, value) tuples defining properties of the CSS class.
- tooltips: DataFrame, default empty
- DataFrame of strings aligned with underlying Styler data for tooltip
- display.
-
- Notes
- -----
- The default properties for the tooltip CSS class are:
-
- - visibility: hidden
- - position: absolute
- - z-index: 1
- - background-color: black
- - color: white
- - transform: translate(-20px, -20px)
-
- Hidden visibility is a key prerequisite to the hover functionality, and should
- always be included in any manual properties specification.
- """
-
- def __init__(
- self,
- css_props: CSSProperties = [
- ("visibility", "hidden"),
- ("position", "absolute"),
- ("z-index", 1),
- ("background-color", "black"),
- ("color", "white"),
- ("transform", "translate(-20px, -20px)"),
- ],
- css_name: str = "pd-t",
- tooltips: DataFrame = DataFrame(),
- ) -> None:
- self.class_name = css_name
- self.class_properties = css_props
- self.tt_data = tooltips
- self.table_styles: CSSStyles = []
-
- @property
- def _class_styles(self):
- """
- Combine the ``_Tooltips`` CSS class name and CSS properties to the format
- required to extend the underlying ``Styler`` `table_styles` to allow
- tooltips to render in HTML.
-
- Returns
- -------
- styles : List
- """
- return [
- {
- "selector": f".{self.class_name}",
- "props": maybe_convert_css_to_tuples(self.class_properties),
- }
- ]
-
- def _pseudo_css(self, uuid: str, name: str, row: int, col: int, text: str):
- """
-        For every table data-cell that has a valid tooltip (not None, NaN or
-        empty string), create two pseudo CSS entries for the specific
-        <td> element id, which are added to the overall table styles:
-        an on-hover visibility change and a content change
-        dependent upon the user's chosen display string.
-
- For example:
- [{"selector": "T__row1_col1:hover .pd-t",
- "props": [("visibility", "visible")]},
- {"selector": "T__row1_col1 .pd-t::after",
- "props": [("content", "Some Valid Text String")]}]
-
- Parameters
- ----------
- uuid: str
- The uuid of the Styler instance
- name: str
- The css-name of the class used for styling tooltips
- row : int
- The row index of the specified tooltip string data
- col : int
- The col index of the specified tooltip string data
- text : str
- The textual content of the tooltip to be displayed in HTML.
-
- Returns
- -------
- pseudo_css : List
- """
- selector_id = "#T_" + uuid + "_row" + str(row) + "_col" + str(col)
- return [
- {
- "selector": selector_id + f":hover .{name}",
- "props": [("visibility", "visible")],
- },
- {
- "selector": selector_id + f" .{name}::after",
- "props": [("content", f'"{text}"')],
- },
- ]
-
- def _translate(self, styler: StylerRenderer, d: dict):
- """
- Mutate the render dictionary to allow for tooltips:
-
-        - Add ``<span>`` HTML element to each data cell's ``display_value``. Ignores
- headers.
- - Add table level CSS styles to control pseudo classes.
-
- Parameters
- ----------
-        styler : StylerRenderer
-            The underlying ``Styler``, whose data is used for reindexing and whose
-            uuid provides the CSS id.
- d : dict
- The dictionary prior to final render
-
- Returns
- -------
- render_dict : Dict
- """
- self.tt_data = self.tt_data.reindex_like(styler.data)
- if self.tt_data.empty:
- return d
-
- name = self.class_name
- mask = (self.tt_data.isna()) | (self.tt_data.eq("")) # empty string = no ttip
- self.table_styles = [
- style
- for sublist in [
- self._pseudo_css(styler.uuid, name, i, j, str(self.tt_data.iloc[i, j]))
- for i in range(len(self.tt_data.index))
- for j in range(len(self.tt_data.columns))
- if not (
- mask.iloc[i, j]
- or i in styler.hidden_rows
- or j in styler.hidden_columns
- )
- ]
- for style in sublist
- ]
-
- if self.table_styles:
- # add span class to every cell only if at least 1 non-empty tooltip
- for row in d["body"]:
- for item in row:
- if item["type"] == "td":
- item["display_value"] = (
- str(item["display_value"])
- + f'<span class="{self.class_name}"></span>'
- )
- d["table_styles"].extend(self._class_styles)
- d["table_styles"].extend(self.table_styles)
-
- return d
-
-
-def _parse_latex_table_wrapping(table_styles: CSSStyles, caption: str | None) -> bool:
- """
- Indicate whether LaTeX {tabular} should be wrapped with a {table} environment.
-
- Parses the `table_styles` and detects any selectors which must be included outside
- of {tabular}, i.e. indicating that wrapping must occur, and therefore return True,
- or if a caption exists and requires similar.
- """
- IGNORED_WRAPPERS = ["toprule", "midrule", "bottomrule", "column_format"]
- # ignored selectors are included with {tabular} so do not need wrapping
- return (
- table_styles is not None
- and any(d["selector"] not in IGNORED_WRAPPERS for d in table_styles)
- ) or caption is not None
-
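-# Illustrative sketch: selectors handled inside {tabular} (e.g. "toprule") do
-# not force a {table} wrapper on their own, but a caption or any other
-# selector does:
-#
-#     styles = [{"selector": "toprule", "props": ":hline;"}]
-#     _parse_latex_table_wrapping(styles, caption=None)       # False
-#     _parse_latex_table_wrapping(styles, caption="A title")  # True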
-
-def _parse_latex_table_styles(table_styles: CSSStyles, selector: str) -> str | None:
- """
- Return the first 'props' 'value' from ``tables_styles`` identified by ``selector``.
-
- Examples
- --------
- >>> table_styles = [{'selector': 'foo', 'props': [('attr','value')]},
- ... {'selector': 'bar', 'props': [('attr', 'overwritten')]},
- ... {'selector': 'bar', 'props': [('a1', 'baz'), ('a2', 'ignore')]}]
- >>> _parse_latex_table_styles(table_styles, selector='bar')
- 'baz'
-
- Notes
- -----
-    The replacement of "§" with ":" is to work around the CSS problem that ":"
-    has structural significance there and so cannot be passed through directly,
-    while it is often required in LaTeX labels; such values are supplied with
-    "§" and converted back here.
- """
- for style in table_styles[::-1]: # in reverse for most recently applied style
- if style["selector"] == selector:
- return str(style["props"][0][1]).replace("§", ":")
- return None
-
-
-def _parse_latex_cell_styles(
- latex_styles: CSSList, display_value: str, convert_css: bool = False
-) -> str:
- r"""
- Mutate the ``display_value`` string including LaTeX commands from ``latex_styles``.
-
- This method builds a recursive latex chain of commands based on the
- CSSList input, nested around ``display_value``.
-
- If a CSS style is given as ('<command>', '<options>') this is translated to
- '\<command><options>{display_value}', and this value is treated as the
- display value for the next iteration.
-
- The most recent style forms the inner component, for example for styles:
- `[('c1', 'o1'), ('c2', 'o2')]` this returns: `\c1o1{\c2o2{display_value}}`
-
- Sometimes latex commands have to be wrapped with curly braces in different ways:
- We create some parsing flags to identify the different behaviours:
-
- - `--rwrap` : `\<command><options>{<display_value>}`
- - `--wrap` : `{\<command><options> <display_value>}`
- - `--nowrap` : `\<command><options> <display_value>`
- - `--lwrap` : `{\<command><options>} <display_value>`
- - `--dwrap` : `{\<command><options>}{<display_value>}`
-
- For example for styles:
-    `[('c1', 'o1--wrap'), ('c2', 'o2')]` this returns: `{\c1o1 \c2o2{display_value}}`
- """
- if convert_css:
- latex_styles = _parse_latex_css_conversion(latex_styles)
- for command, options in latex_styles[::-1]: # in reverse for most recent style
- formatter = {
- "--wrap": f"{{\\{command}--to_parse {display_value}}}",
- "--nowrap": f"\\{command}--to_parse {display_value}",
- "--lwrap": f"{{\\{command}--to_parse}} {display_value}",
- "--rwrap": f"\\{command}--to_parse{{{display_value}}}",
- "--dwrap": f"{{\\{command}--to_parse}}{{{display_value}}}",
- }
- display_value = f"\\{command}{options} {display_value}"
- for arg in ["--nowrap", "--wrap", "--lwrap", "--rwrap", "--dwrap"]:
- if arg in str(options):
- display_value = formatter[arg].replace(
- "--to_parse", _parse_latex_options_strip(value=options, arg=arg)
- )
- break # only ever one purposeful entry
- return display_value
-
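-# Illustrative sketch: a concrete instance of the '--rwrap' flag described in
-# the docstring above:
-#
-#     _parse_latex_cell_styles([("textbf", "--rwrap")], "text")  # gives '\textbf{text}'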
-
-def _parse_latex_header_span(
- cell: dict[str, Any],
- multirow_align: str,
- multicol_align: str,
- wrap: bool = False,
- convert_css: bool = False,
-) -> str:
- r"""
- Refactor the cell `display_value` if a 'colspan' or 'rowspan' attribute is present.
-
-    'rowspan' and 'colspan' do not occur simultaneously. If they are detected then
- the `display_value` is altered to a LaTeX `multirow` or `multicol` command
- respectively, with the appropriate cell-span.
-
-    ``wrap`` is used to enclose the `display_value` in braces, which is needed for
-    column headers when using the siunitx package.
-
- Requires the package {multirow}, whereas multicol support is usually built in
- to the {tabular} environment.
-
- Examples
- --------
- >>> cell = {'cellstyle': '', 'display_value':'text', 'attributes': 'colspan="3"'}
- >>> _parse_latex_header_span(cell, 't', 'c')
- '\\multicolumn{3}{c}{text}'
- """
- display_val = _parse_latex_cell_styles(
- cell["cellstyle"], cell["display_value"], convert_css
- )
- if "attributes" in cell:
- attrs = cell["attributes"]
- if 'colspan="' in attrs:
- colspan = attrs[attrs.find('colspan="') + 9 :] # len('colspan="') = 9
- colspan = int(colspan[: colspan.find('"')])
- if "naive-l" == multicol_align:
- out = f"{{{display_val}}}" if wrap else f"{display_val}"
- blanks = " & {}" if wrap else " &"
- return out + blanks * (colspan - 1)
- elif "naive-r" == multicol_align:
- out = f"{{{display_val}}}" if wrap else f"{display_val}"
- blanks = "{} & " if wrap else "& "
- return blanks * (colspan - 1) + out
- return f"\\multicolumn{{{colspan}}}{{{multicol_align}}}{{{display_val}}}"
- elif 'rowspan="' in attrs:
- if multirow_align == "naive":
- return display_val
- rowspan = attrs[attrs.find('rowspan="') + 9 :]
- rowspan = int(rowspan[: rowspan.find('"')])
- return f"\\multirow[{multirow_align}]{{{rowspan}}}{{*}}{{{display_val}}}"
- if wrap:
- return f"{{{display_val}}}"
- else:
- return display_val
-
-
-def _parse_latex_options_strip(value: str | float, arg: str) -> str:
- """
- Strip a css_value which may have latex wrapping arguments, css comment identifiers,
- and whitespaces, to a valid string for latex options parsing.
-
- For example: 'red /* --wrap */ ' --> 'red'
- """
- return str(value).replace(arg, "").replace("/*", "").replace("*/", "").strip()
-
-
-def _parse_latex_css_conversion(styles: CSSList) -> CSSList:
- """
- Convert CSS (attribute,value) pairs to equivalent LaTeX (command,options) pairs.
-
- Ignore conversion if tagged with `--latex` option, skipped if no conversion found.
- """
-
- def font_weight(value, arg):
- if value in ("bold", "bolder"):
- return "bfseries", f"{arg}"
- return None
-
- def font_style(value, arg):
- if value == "italic":
- return "itshape", f"{arg}"
- if value == "oblique":
- return "slshape", f"{arg}"
- return None
-
- def color(value, user_arg, command, comm_arg):
- """
- CSS colors have 5 formats to process:
-
- - 6 digit hex code: "#ff23ee" --> [HTML]{FF23EE}
- - 3 digit hex code: "#f0e" --> [HTML]{FF00EE}
- - rgba: rgba(128, 255, 0, 0.5) --> [rgb]{0.502, 1.000, 0.000}
-        - rgb: rgb(128, 255, 0) --> [rgb]{0.502, 1.000, 0.000}
- - string: red --> {red}
-
- Additionally rgb or rgba can be expressed in % which is also parsed.
- """
- arg = user_arg if user_arg != "" else comm_arg
-
- if value[0] == "#" and len(value) == 7: # color is hex code
- return command, f"[HTML]{{{value[1:].upper()}}}{arg}"
- if value[0] == "#" and len(value) == 4: # color is short hex code
- val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}"
- return command, f"[HTML]{{{val}}}{arg}"
- elif value[:3] == "rgb": # color is rgb or rgba
- r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip()
- r = float(r[:-1]) / 100 if "%" in r else int(r) / 255
- g = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[0].strip()
- g = float(g[:-1]) / 100 if "%" in g else int(g) / 255
- if value[3] == "a": # color is rgba
- b = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[1].strip()
- else: # color is rgb
- b = re.findall("(?<=,)[0-9\\s%]+(?=\\))", value)[0].strip()
- b = float(b[:-1]) / 100 if "%" in b else int(b) / 255
- return command, f"[rgb]{{{r:.3f}, {g:.3f}, {b:.3f}}}{arg}"
- else:
- return command, f"{{{value}}}{arg}" # color is likely string-named
-
- CONVERTED_ATTRIBUTES: dict[str, Callable] = {
- "font-weight": font_weight,
- "background-color": partial(color, command="cellcolor", comm_arg="--lwrap"),
- "color": partial(color, command="color", comm_arg=""),
- "font-style": font_style,
- }
-
- latex_styles: CSSList = []
- for attribute, value in styles:
- if isinstance(value, str) and "--latex" in value:
- # return the style without conversion but drop '--latex'
- latex_styles.append((attribute, value.replace("--latex", "")))
- if attribute in CONVERTED_ATTRIBUTES:
- arg = ""
- for x in ["--wrap", "--nowrap", "--lwrap", "--dwrap", "--rwrap"]:
- if x in str(value):
- arg, value = x, _parse_latex_options_strip(value, x)
- break
- latex_style = CONVERTED_ATTRIBUTES[attribute](value, arg)
- if latex_style is not None:
- latex_styles.extend([latex_style])
- return latex_styles
-
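-# Illustrative sketch: common CSS pairs map onto LaTeX (command, options)
-# pairs, with hex colours upper-cased into an [HTML] option, e.g.
-#
-#     _parse_latex_css_conversion([("font-weight", "bold"), ("color", "#ff0000")])
-#     # [('bfseries', ''), ('color', '[HTML]{FF0000}')]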
-
-def _escape_latex(s):
- r"""
- Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``,
- ``~``, ``^``, and ``\`` in the string with LaTeX-safe sequences.
-
- Use this if you need to display text that might contain such characters in LaTeX.
-
- Parameters
- ----------
- s : str
- Input to be escaped
-
- Return
- ------
- str :
- Escaped string
- """
- return (
- s.replace("\\", "ab2§=§8yz") # rare string for final conversion: avoid \\ clash
- .replace("ab2§=§8yz ", "ab2§=§8yz\\space ") # since \backslash gobbles spaces
- .replace("&", "\\&")
- .replace("%", "\\%")
- .replace("$", "\\$")
- .replace("#", "\\#")
- .replace("_", "\\_")
- .replace("{", "\\{")
- .replace("}", "\\}")
- .replace("~ ", "~\\space ") # since \textasciitilde gobbles spaces
- .replace("~", "\\textasciitilde ")
- .replace("^ ", "^\\space ") # since \textasciicircum gobbles spaces
- .replace("^", "\\textasciicircum ")
- .replace("ab2§=§8yz", "\\textbackslash ")
- )
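-
-
-# Illustrative sketch: special characters gain backslash escapes while tildes
-# and carets become text commands, e.g.
-#
-#     _escape_latex("50% & $10_x")  # '50\% \& \$10\_x'
-#     _escape_latex("a~b")          # 'a\textasciitilde b'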
diff --git a/contrib/python/pandas/py3/pandas/io/formats/templates/html.tpl b/contrib/python/pandas/py3/pandas/io/formats/templates/html.tpl
deleted file mode 100644
index 8c63be3ad78..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/templates/html.tpl
+++ /dev/null
@@ -1,16 +0,0 @@
-{# Update the html_style/table_structure.html documentation too #}
-{% if doctype_html %}
-<!DOCTYPE html>
-<html>
-<head>
-<meta charset="{{encoding}}">
-{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
-</head>
-<body>
-{% include html_table_tpl %}
-</body>
-</html>
-{% elif not doctype_html %}
-{% if not exclude_styles %}{% include html_style_tpl %}{% endif %}
-{% include html_table_tpl %}
-{% endif %}
diff --git a/contrib/python/pandas/py3/pandas/io/formats/templates/html_style.tpl b/contrib/python/pandas/py3/pandas/io/formats/templates/html_style.tpl
deleted file mode 100644
index 5c3fcd97f51..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/templates/html_style.tpl
+++ /dev/null
@@ -1,26 +0,0 @@
-{%- block before_style -%}{%- endblock before_style -%}
-{% block style %}
-<style type="text/css">
-{% block table_styles %}
-{% for s in table_styles %}
-#T_{{uuid}} {{s.selector}} {
-{% for p,val in s.props %}
- {{p}}: {{val}};
-{% endfor %}
-}
-{% endfor %}
-{% endblock table_styles %}
-{% block before_cellstyle %}{% endblock before_cellstyle %}
-{% block cellstyle %}
-{% for cs in [cellstyle, cellstyle_index, cellstyle_columns] %}
-{% for s in cs %}
-{% for selector in s.selectors %}{% if not loop.first %}, {% endif %}#T_{{uuid}}_{{selector}}{% endfor %} {
-{% for p,val in s.props %}
- {{p}}: {{val}};
-{% endfor %}
-}
-{% endfor %}
-{% endfor %}
-{% endblock cellstyle %}
-</style>
-{% endblock style %}
diff --git a/contrib/python/pandas/py3/pandas/io/formats/templates/html_table.tpl b/contrib/python/pandas/py3/pandas/io/formats/templates/html_table.tpl
deleted file mode 100644
index 17118d2bb21..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/templates/html_table.tpl
+++ /dev/null
@@ -1,63 +0,0 @@
-{% block before_table %}{% endblock before_table %}
-{% block table %}
-{% if exclude_styles %}
-<table>
-{% else %}
-<table id="T_{{uuid}}"{% if table_attributes %} {{table_attributes}}{% endif %}>
-{% endif %}
-{% block caption %}
-{% if caption and caption is string %}
- <caption>{{caption}}</caption>
-{% elif caption and caption is sequence %}
- <caption>{{caption[0]}}</caption>
-{% endif %}
-{% endblock caption %}
-{% block thead %}
- <thead>
-{% block before_head_rows %}{% endblock %}
-{% for r in head %}
-{% block head_tr scoped %}
- <tr>
-{% if exclude_styles %}
-{% for c in r %}
-{% if c.is_visible != False %}
- <{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
-{% endif %}
-{% endfor %}
-{% else %}
-{% for c in r %}
-{% if c.is_visible != False %}
- <{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
-{% endif %}
-{% endfor %}
-{% endif %}
- </tr>
-{% endblock head_tr %}
-{% endfor %}
-{% block after_head_rows %}{% endblock %}
- </thead>
-{% endblock thead %}
-{% block tbody %}
- <tbody>
-{% block before_rows %}{% endblock before_rows %}
-{% for r in body %}
-{% block tr scoped %}
- <tr>
-{% if exclude_styles %}
-{% for c in r %}{% if c.is_visible != False %}
- <{{c.type}} {{c.attributes}}>{{c.display_value}}</{{c.type}}>
-{% endif %}{% endfor %}
-{% else %}
-{% for c in r %}{% if c.is_visible != False %}
- <{{c.type}} {%- if c.id is defined %} id="T_{{uuid}}_{{c.id}}" {%- endif %} class="{{c.class}}" {{c.attributes}}>{{c.display_value}}</{{c.type}}>
-{% endif %}{% endfor %}
-{% endif %}
- </tr>
-{% endblock tr %}
-{% endfor %}
-{% block after_rows %}{% endblock after_rows %}
- </tbody>
-{% endblock tbody %}
-</table>
-{% endblock table %}
-{% block after_table %}{% endblock after_table %}
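These three Jinja2 templates are what ``Styler.to_html`` renders: ``html.tpl`` selects the document wrapper, ``html_style.tpl`` emits the ``<style>`` block keyed on the table ``uuid``, and ``html_table.tpl`` emits the table itself. A brief sketch of how the template knobs are reached from the public API (``doctype_html`` and ``exclude_styles`` correspond to the branches above):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

html = (
    df.style
    .set_table_styles([{"selector": "th", "props": [("color", "red")]}])  # fills table_styles
    .set_table_attributes('class="my-table"')                             # fills table_attributes
    .to_html(doctype_html=True)  # full document: html.tpl -> html_style.tpl + html_table.tpl
)

bare = df.style.to_html(exclude_styles=True)  # table only, no <style> block and no id/class hooks
```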
diff --git a/contrib/python/pandas/py3/pandas/io/formats/templates/latex.tpl b/contrib/python/pandas/py3/pandas/io/formats/templates/latex.tpl
deleted file mode 100644
index ae341bbc298..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/templates/latex.tpl
+++ /dev/null
@@ -1,5 +0,0 @@
-{% if environment == "longtable" %}
-{% include "latex_longtable.tpl" %}
-{% else %}
-{% include "latex_table.tpl" %}
-{% endif %}
diff --git a/contrib/python/pandas/py3/pandas/io/formats/templates/latex_longtable.tpl b/contrib/python/pandas/py3/pandas/io/formats/templates/latex_longtable.tpl
deleted file mode 100644
index b97843eeb91..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/templates/latex_longtable.tpl
+++ /dev/null
@@ -1,82 +0,0 @@
-\begin{longtable}
-{%- set position = parse_table(table_styles, 'position') %}
-{%- if position is not none %}
-[{{position}}]
-{%- endif %}
-{%- set column_format = parse_table(table_styles, 'column_format') %}
-{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
-
-{% for style in table_styles %}
-{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format', 'label'] %}
-\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
-{% endif %}
-{% endfor %}
-{% if caption and caption is string %}
-\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
-{%- set label = parse_table(table_styles, 'label') %}
-{%- if label is not none %}
- \label{{label}}
-{%- endif %} \\
-{% elif caption and caption is sequence %}
-\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
-{%- set label = parse_table(table_styles, 'label') %}
-{%- if label is not none %}
- \label{{label}}
-{%- endif %} \\
-{% else %}
-{%- set label = parse_table(table_styles, 'label') %}
-{%- if label is not none %}
-\label{{label}} \\
-{% endif %}
-{% endif %}
-{% set toprule = parse_table(table_styles, 'toprule') %}
-{% if toprule is not none %}
-\{{toprule}}
-{% endif %}
-{% for row in head %}
-{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
-{% endfor %}
-{% set midrule = parse_table(table_styles, 'midrule') %}
-{% if midrule is not none %}
-\{{midrule}}
-{% endif %}
-\endfirsthead
-{% if caption and caption is string %}
-\caption[]{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %} \\
-{% elif caption and caption is sequence %}
-\caption[]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %} \\
-{% endif %}
-{% if toprule is not none %}
-\{{toprule}}
-{% endif %}
-{% for row in head %}
-{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx)}}{% endfor %} \\
-{% endfor %}
-{% if midrule is not none %}
-\{{midrule}}
-{% endif %}
-\endhead
-{% if midrule is not none %}
-\{{midrule}}
-{% endif %}
-\multicolumn{% raw %}{{% endraw %}{{body[0]|length}}{% raw %}}{% endraw %}{r}{Continued on next page} \\
-{% if midrule is not none %}
-\{{midrule}}
-{% endif %}
-\endfoot
-{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
-{% if bottomrule is not none %}
-\{{bottomrule}}
-{% endif %}
-\endlastfoot
-{% for row in body %}
-{% for c in row %}{% if not loop.first %} & {% endif %}
- {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
-{%- endfor %} \\
-{% if clines and clines[loop.index] | length > 0 %}
- {%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
-
-{% endif %}
-{% endfor %}
-\end{longtable}
-{% raw %}{% endraw %}
diff --git a/contrib/python/pandas/py3/pandas/io/formats/templates/latex_table.tpl b/contrib/python/pandas/py3/pandas/io/formats/templates/latex_table.tpl
deleted file mode 100644
index 7858cb4c945..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/templates/latex_table.tpl
+++ /dev/null
@@ -1,57 +0,0 @@
-{% if environment or parse_wrap(table_styles, caption) %}
-\begin{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
-{%- set position = parse_table(table_styles, 'position') %}
-{%- if position is not none %}
-[{{position}}]
-{%- endif %}
-
-{% set position_float = parse_table(table_styles, 'position_float') %}
-{% if position_float is not none%}
-\{{position_float}}
-{% endif %}
-{% if caption and caption is string %}
-\caption{% raw %}{{% endraw %}{{caption}}{% raw %}}{% endraw %}
-
-{% elif caption and caption is sequence %}
-\caption[{{caption[1]}}]{% raw %}{{% endraw %}{{caption[0]}}{% raw %}}{% endraw %}
-
-{% endif %}
-{% for style in table_styles %}
-{% if style['selector'] not in ['position', 'position_float', 'caption', 'toprule', 'midrule', 'bottomrule', 'column_format'] %}
-\{{style['selector']}}{{parse_table(table_styles, style['selector'])}}
-{% endif %}
-{% endfor %}
-{% endif %}
-\begin{tabular}
-{%- set column_format = parse_table(table_styles, 'column_format') %}
-{% raw %}{{% endraw %}{{column_format}}{% raw %}}{% endraw %}
-
-{% set toprule = parse_table(table_styles, 'toprule') %}
-{% if toprule is not none %}
-\{{toprule}}
-{% endif %}
-{% for row in head %}
-{% for c in row %}{%- if not loop.first %} & {% endif %}{{parse_header(c, multirow_align, multicol_align, siunitx, convert_css)}}{% endfor %} \\
-{% endfor %}
-{% set midrule = parse_table(table_styles, 'midrule') %}
-{% if midrule is not none %}
-\{{midrule}}
-{% endif %}
-{% for row in body %}
-{% for c in row %}{% if not loop.first %} & {% endif %}
- {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align, False, convert_css)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %}
-{%- endfor %} \\
-{% if clines and clines[loop.index] | length > 0 %}
- {%- for cline in clines[loop.index] %}{% if not loop.first %} {% endif %}{{ cline }}{% endfor %}
-
-{% endif %}
-{% endfor %}
-{% set bottomrule = parse_table(table_styles, 'bottomrule') %}
-{% if bottomrule is not none %}
-\{{bottomrule}}
-{% endif %}
-\end{tabular}
-{% if environment or parse_wrap(table_styles, caption) %}
-\end{% raw %}{{% endraw %}{{environment if environment else "table"}}{% raw %}}{% endraw %}
-
-{% endif %}
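``Styler.to_latex`` chooses between these two LaTeX templates via its ``environment`` argument, and the ``parse_table`` lookups above are fed by styler options such as ``caption``, ``label``, ``position`` and ``hrules``. A minimal sketch:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.5]})

# Rendered through latex_table.tpl
table = df.style.to_latex(
    caption="A small table", label="tab:small", position="htbp", hrules=True
)

# Rendered through latex_longtable.tpl instead
longtable = df.style.to_latex(environment="longtable", hrules=True)
```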
diff --git a/contrib/python/pandas/py3/pandas/io/formats/templates/string.tpl b/contrib/python/pandas/py3/pandas/io/formats/templates/string.tpl
deleted file mode 100644
index 06aeb2b4e41..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/templates/string.tpl
+++ /dev/null
@@ -1,12 +0,0 @@
-{% for r in head %}
-{% for c in r %}{% if c["is_visible"] %}
-{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
-{% endif %}{% endfor %}
-
-{% endfor %}
-{% for r in body %}
-{% for c in r %}{% if c["is_visible"] %}
-{{ c["display_value"] }}{% if not loop.last %}{{ delimiter }}{% endif %}
-{% endif %}{% endfor %}
-
-{% endfor %}
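``string.tpl`` backs ``Styler.to_string``, which simply joins the visible cells of each row with a delimiter. A one-line sketch (assuming pandas >= 1.5, where ``Styler.to_string`` was added):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
print(df.style.to_string(delimiter="|"))  # visible cells joined by the delimiter, one row per line
```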
diff --git a/contrib/python/pandas/py3/pandas/io/formats/xml.py b/contrib/python/pandas/py3/pandas/io/formats/xml.py
deleted file mode 100644
index cc258e02710..00000000000
--- a/contrib/python/pandas/py3/pandas/io/formats/xml.py
+++ /dev/null
@@ -1,555 +0,0 @@
-"""
-:mod:`pandas.io.formats.xml` is a module for formatting data in XML.
-"""
-from __future__ import annotations
-
-import codecs
-import io
-from typing import (
- TYPE_CHECKING,
- Any,
-)
-
-from pandas._typing import (
- CompressionOptions,
- FilePath,
- ReadBuffer,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import doc
-
-from pandas.core.dtypes.common import is_list_like
-from pandas.core.dtypes.missing import isna
-
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import get_handle
-from pandas.io.xml import (
- get_data_from_filepath,
- preprocess_data,
-)
-
-if TYPE_CHECKING:
- from pandas import DataFrame
-
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path_or_buffer",
-)
-class BaseXMLFormatter:
- """
-    Base class for formatting data in XML.
-
- Parameters
- ----------
- path_or_buffer : str or file-like
-        String, path object, or file-like object to which the
-        rendered XML is written.
-
- index : bool
- Whether to include index in xml document.
-
-    row_name : str
-        Name for row elements of xml document. Default is 'row'.
-
-    root_name : str
-        Name for root of xml document. Default is 'data'.
-
- na_rep : str
- Missing data representation.
-
- attrs_cols : list
- List of columns to write as attributes in row element.
-
- elem_cols : list
- List of columns to write as children in row element.
-
- namespaces : dict
-        The namespaces to define in XML document as a dict with key
-        being the namespace prefix and value the URI.
-
- prefix : str
- The prefix for each element in XML document including root.
-
- encoding : str
- Encoding of xml object or document.
-
- xml_declaration : bool
-        Whether to include the xml declaration at the top of the document.
-
- pretty_print : bool
- Whether to write xml document with line breaks and indentation.
-
- stylesheet : str or file-like
- A URL, file, file-like object, or a raw string containing XSLT.
-
- {compression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- {storage_options}
-
- See also
- --------
- pandas.io.formats.xml.EtreeXMLFormatter
- pandas.io.formats.xml.LxmlXMLFormatter
-
- """
-
- def __init__(
- self,
- frame: DataFrame,
- path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
- index: bool = True,
- root_name: str | None = "data",
- row_name: str | None = "row",
- na_rep: str | None = None,
- attr_cols: list[str] | None = None,
- elem_cols: list[str] | None = None,
- namespaces: dict[str | None, str] | None = None,
- prefix: str | None = None,
- encoding: str = "utf-8",
- xml_declaration: bool | None = True,
- pretty_print: bool | None = True,
- stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- ) -> None:
- self.frame = frame
- self.path_or_buffer = path_or_buffer
- self.index = index
- self.root_name = root_name
- self.row_name = row_name
- self.na_rep = na_rep
- self.attr_cols = attr_cols
- self.elem_cols = elem_cols
- self.namespaces = namespaces
- self.prefix = prefix
- self.encoding = encoding
- self.xml_declaration = xml_declaration
- self.pretty_print = pretty_print
- self.stylesheet = stylesheet
- self.compression = compression
- self.storage_options = storage_options
-
- self.orig_cols = self.frame.columns.tolist()
- self.frame_dicts = self.process_dataframe()
-
- self.validate_columns()
- self.validate_encoding()
- self.prefix_uri = self.get_prefix_uri()
- self.handle_indexes()
-
- def build_tree(self) -> bytes:
- """
- Build tree from data.
-
- This method initializes the root and builds attributes and elements
- with optional namespaces.
- """
- raise AbstractMethodError(self)
-
- def validate_columns(self) -> None:
- """
-        Validate elem_cols and attr_cols.
-
-        This method will check if the columns are list-like.
-
-        Raises
-        ------
-        TypeError
-            * If `attr_cols` or `elem_cols` is not list-like.
- """
- if self.attr_cols and not is_list_like(self.attr_cols):
- raise TypeError(
- f"{type(self.attr_cols).__name__} is not a valid type for attr_cols"
- )
-
- if self.elem_cols and not is_list_like(self.elem_cols):
- raise TypeError(
- f"{type(self.elem_cols).__name__} is not a valid type for elem_cols"
- )
-
- def validate_encoding(self) -> None:
- """
- Validate encoding.
-
-        This method will check if the encoding is among those listed under codecs.
-
- Raises
- ------
- LookupError
- * If encoding is not available in codecs.
- """
-
- codecs.lookup(self.encoding)
-
- def process_dataframe(self) -> dict[int | str, dict[str, Any]]:
- """
-        Adjust DataFrame to fit xml output.
-
-        This method will adjust the underlying DataFrame for xml output,
-        including optionally replacing missing values and including indexes.
- """
-
- df = self.frame
-
- if self.index:
- df = df.reset_index()
-
- if self.na_rep is not None:
- df = df.fillna(self.na_rep)
-
- return df.to_dict(orient="index")
-
- def handle_indexes(self) -> None:
- """
- Handle indexes.
-
- This method will add indexes into attr_cols or elem_cols.
- """
-
- if not self.index:
- return
-
- first_key = next(iter(self.frame_dicts))
- indexes: list[str] = [
- x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
- ]
-
- if self.attr_cols:
- self.attr_cols = indexes + self.attr_cols
-
- if self.elem_cols:
- self.elem_cols = indexes + self.elem_cols
-
- def get_prefix_uri(self) -> str:
- """
- Get uri of namespace prefix.
-
-        This method retrieves the URI corresponding to the prefix in namespaces.
-
-        Raises
-        ------
-        KeyError
-            * If prefix is not included in the namespaces dict.
- """
-
- raise AbstractMethodError(self)
-
- def other_namespaces(self) -> dict:
- """
- Define other namespaces.
-
-        This method will build a dictionary of namespace attributes
-        for the root element, conditionally using the optional namespaces
-        and prefix.
- """
-
- nmsp_dict: dict[str, str] = {}
- if self.namespaces and self.prefix is None:
- nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p != ""}
-
- if self.namespaces and self.prefix:
- nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p == ""}
-
- return nmsp_dict
-
- def build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any:
- """
- Create attributes of row.
-
-        This method adds attributes using attr_cols to the row element and
-        works with tuples for multiindex or hierarchical columns.
- """
-
- if not self.attr_cols:
- return elem_row
-
- for col in self.attr_cols:
- attr_name = self._get_flat_col_name(col)
- try:
- if not isna(d[col]):
- elem_row.attrib[attr_name] = str(d[col])
- except KeyError:
- raise KeyError(f"no valid column, {col}")
- return elem_row
-
- def _get_flat_col_name(self, col: str | tuple) -> str:
- flat_col = col
- if isinstance(col, tuple):
- flat_col = (
- "".join([str(c) for c in col]).strip()
- if "" in col
- else "_".join([str(c) for c in col]).strip()
- )
- return f"{self.prefix_uri}{flat_col}"
-
- def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
- """
- Create child elements of row.
-
-        This method adds child elements using elem_cols to the row element and
-        works with tuples for multiindex or hierarchical columns.
- """
-
- raise AbstractMethodError(self)
-
- def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> None:
- if not self.elem_cols:
- return
-
- for col in self.elem_cols:
- elem_name = self._get_flat_col_name(col)
- try:
- val = None if isna(d[col]) or d[col] == "" else str(d[col])
- sub_element_cls(elem_row, elem_name).text = val
- except KeyError:
- raise KeyError(f"no valid column, {col}")
-
- def write_output(self) -> str | None:
- xml_doc = self.build_tree()
-
- if self.path_or_buffer is not None:
- with get_handle(
- self.path_or_buffer,
- "wb",
- compression=self.compression,
- storage_options=self.storage_options,
- is_text=False,
- ) as handles:
- handles.handle.write(xml_doc)
- return None
-
- else:
- return xml_doc.decode(self.encoding).rstrip()
-
-
-class EtreeXMLFormatter(BaseXMLFormatter):
- """
- Class for formatting data in xml using Python standard library
- modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
- """
-
- def build_tree(self) -> bytes:
- from xml.etree.ElementTree import (
- Element,
- SubElement,
- tostring,
- )
-
- self.root = Element(
- f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces()
- )
-
- for d in self.frame_dicts.values():
- elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
-
- if not self.attr_cols and not self.elem_cols:
- self.elem_cols = list(d.keys())
- self.build_elems(d, elem_row)
-
- else:
- elem_row = self.build_attribs(d, elem_row)
- self.build_elems(d, elem_row)
-
- self.out_xml = tostring(self.root, method="xml", encoding=self.encoding)
-
- if self.pretty_print:
- self.out_xml = self.prettify_tree()
-
- if self.xml_declaration:
- self.out_xml = self.add_declaration()
- else:
- self.out_xml = self.remove_declaration()
-
- if self.stylesheet is not None:
- raise ValueError(
- "To use stylesheet, you need lxml installed and selected as parser."
- )
-
- return self.out_xml
-
- def get_prefix_uri(self) -> str:
- from xml.etree.ElementTree import register_namespace
-
- uri = ""
- if self.namespaces:
- for p, n in self.namespaces.items():
- if isinstance(p, str) and isinstance(n, str):
- register_namespace(p, n)
- if self.prefix:
- try:
- uri = f"{{{self.namespaces[self.prefix]}}}"
- except KeyError:
- raise KeyError(f"{self.prefix} is not included in namespaces")
- else:
- uri = f'{{{self.namespaces[""]}}}'
-
- return uri
-
- def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
- from xml.etree.ElementTree import SubElement
-
- self._build_elems(SubElement, d, elem_row)
-
- def prettify_tree(self) -> bytes:
- """
- Output tree for pretty print format.
-
- This method will pretty print xml with line breaks and indentation.
- """
-
- from xml.dom.minidom import parseString
-
- dom = parseString(self.out_xml)
-
- return dom.toprettyxml(indent=" ", encoding=self.encoding)
-
- def add_declaration(self) -> bytes:
- """
- Add xml declaration.
-
-        This method will add the xml declaration to the working tree. Note that
-        xml_declaration is supported in etree starting in Python 3.8.
- """
- decl = f'<?xml version="1.0" encoding="{self.encoding}"?>\n'
-
- return (
- self.out_xml
- if self.out_xml.startswith(b"<?xml")
- else decl.encode(self.encoding) + self.out_xml
- )
-
- def remove_declaration(self) -> bytes:
- """
- Remove xml declaration.
-
-        This method will remove the xml declaration from the working tree.
-        Currently, pretty_print is not supported in etree.
- """
-
- return self.out_xml.split(b"?>")[-1].strip()
-
-
-class LxmlXMLFormatter(BaseXMLFormatter):
- """
-    Class for formatting data in xml using the third-party
-    `lxml.etree` module.
- """
-
- def __init__(self, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
-
- self.convert_empty_str_key()
-
- def build_tree(self) -> bytes:
- """
- Build tree from data.
-
- This method initializes the root and builds attributes and elements
- with optional namespaces.
- """
- from lxml.etree import (
- Element,
- SubElement,
- tostring,
- )
-
- self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
-
- for d in self.frame_dicts.values():
- elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
-
- if not self.attr_cols and not self.elem_cols:
- self.elem_cols = list(d.keys())
- self.build_elems(d, elem_row)
-
- else:
- elem_row = self.build_attribs(d, elem_row)
- self.build_elems(d, elem_row)
-
- self.out_xml = tostring(
- self.root,
- pretty_print=self.pretty_print,
- method="xml",
- encoding=self.encoding,
- xml_declaration=self.xml_declaration,
- )
-
- if self.stylesheet is not None:
- self.out_xml = self.transform_doc()
-
- return self.out_xml
-
- def convert_empty_str_key(self) -> None:
- """
- Replace zero-length string in `namespaces`.
-
- This method will replace '' with None to align to `lxml`
- requirement that empty string prefixes are not allowed.
- """
-
- if self.namespaces and "" in self.namespaces.keys():
- self.namespaces[None] = self.namespaces.pop("", "default")
-
- def get_prefix_uri(self) -> str:
- uri = ""
- if self.namespaces:
- if self.prefix:
- try:
- uri = f"{{{self.namespaces[self.prefix]}}}"
- except KeyError:
- raise KeyError(f"{self.prefix} is not included in namespaces")
- else:
- uri = f'{{{self.namespaces[""]}}}'
-
- return uri
-
- def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
- from lxml.etree import SubElement
-
- self._build_elems(SubElement, d, elem_row)
-
- def transform_doc(self) -> bytes:
- """
- Parse stylesheet from file or buffer and run it.
-
-        This method will parse the stylesheet object into a tree, handling it
-        conditionally by its specific object type, and then transform the
-        original tree with the XSLT script.
- """
- from lxml.etree import (
- XSLT,
- XMLParser,
- fromstring,
- parse,
- )
-
- style_doc = self.stylesheet
- assert style_doc is not None # is ensured by caller
-
- handle_data = get_data_from_filepath(
- filepath_or_buffer=style_doc,
- encoding=self.encoding,
- compression=self.compression,
- storage_options=self.storage_options,
- )
-
- with preprocess_data(handle_data) as xml_data:
- curr_parser = XMLParser(encoding=self.encoding)
-
- if isinstance(xml_data, io.StringIO):
- xsl_doc = fromstring(
- xml_data.getvalue().encode(self.encoding), parser=curr_parser
- )
- else:
- xsl_doc = parse(xml_data, parser=curr_parser)
-
- transformer = XSLT(xsl_doc)
- new_doc = transformer(self.root)
-
- return bytes(new_doc)
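The formatters above are reached through ``DataFrame.to_xml``, with ``parser="etree"`` selecting ``EtreeXMLFormatter`` and the default ``parser="lxml"`` selecting ``LxmlXMLFormatter``. A minimal sketch:

```python
import pandas as pd

df = pd.DataFrame({"shape": ["square", "circle"], "sides": [4.0, None]})

# EtreeXMLFormatter: standard-library backend, no lxml required
print(df.to_xml(parser="etree", index=False, na_rep="n/a"))

# LxmlXMLFormatter: columns written as attributes instead of child elements (requires lxml)
print(df.to_xml(attr_cols=["shape", "sides"], row_name="item", root_name="items"))
```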
diff --git a/contrib/python/pandas/py3/pandas/io/gbq.py b/contrib/python/pandas/py3/pandas/io/gbq.py
deleted file mode 100644
index d6c73664ab6..00000000000
--- a/contrib/python/pandas/py3/pandas/io/gbq.py
+++ /dev/null
@@ -1,227 +0,0 @@
-""" Google BigQuery support """
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Any,
-)
-
-from pandas.compat._optional import import_optional_dependency
-
-if TYPE_CHECKING:
- from pandas import DataFrame
-
-
-def _try_import():
- # since pandas is a dependency of pandas-gbq
- # we need to import on first use
- msg = (
- "pandas-gbq is required to load data from Google BigQuery. "
- "See the docs: https://pandas-gbq.readthedocs.io."
- )
- pandas_gbq = import_optional_dependency("pandas_gbq", extra=msg)
- return pandas_gbq
-
-
-def read_gbq(
- query: str,
- project_id: str | None = None,
- index_col: str | None = None,
- col_order: list[str] | None = None,
- reauth: bool = False,
- auth_local_webserver: bool = True,
- dialect: str | None = None,
- location: str | None = None,
- configuration: dict[str, Any] | None = None,
- credentials=None,
- use_bqstorage_api: bool | None = None,
- max_results: int | None = None,
- progress_bar_type: str | None = None,
-) -> DataFrame:
- """
- Load data from Google BigQuery.
-
- This function requires the `pandas-gbq package
- <https://pandas-gbq.readthedocs.io>`__.
-
- See the `How to authenticate with Google BigQuery
- <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
- guide for authentication instructions.
-
- Parameters
- ----------
- query : str
- SQL-Like Query to return data values.
- project_id : str, optional
- Google BigQuery Account project ID. Optional when available from
- the environment.
- index_col : str, optional
- Name of result column to use for index in results DataFrame.
- col_order : list(str), optional
- List of BigQuery column names in the desired order for results
- DataFrame.
- reauth : bool, default False
- Force Google BigQuery to re-authenticate the user. This is useful
- if multiple accounts are used.
- auth_local_webserver : bool, default True
- Use the `local webserver flow`_ instead of the `console flow`_
- when getting user credentials.
-
- .. _local webserver flow:
- https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
- .. _console flow:
- https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
-
- *New in version 0.2.0 of pandas-gbq*.
-
- .. versionchanged:: 1.5.0
- Default value is changed to ``True``. Google has deprecated the
- ``auth_local_webserver = False`` `"out of band" (copy-paste)
- flow
- <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
- dialect : str, default 'legacy'
- Note: The default value is changing to 'standard' in a future version.
-
- SQL syntax dialect to use. Value can be one of:
-
- ``'legacy'``
- Use BigQuery's legacy SQL dialect. For more information see
- `BigQuery Legacy SQL Reference
- <https://cloud.google.com/bigquery/docs/reference/legacy-sql>`__.
- ``'standard'``
- Use BigQuery's standard SQL, which is
- compliant with the SQL 2011 standard. For more information
- see `BigQuery Standard SQL Reference
- <https://cloud.google.com/bigquery/docs/reference/standard-sql/>`__.
- location : str, optional
- Location where the query job should run. See the `BigQuery locations
- documentation
- <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
- list of available locations. The location must match that of any
- datasets used in the query.
-
- *New in version 0.5.0 of pandas-gbq*.
- configuration : dict, optional
- Query config parameters for job processing.
- For example:
-
- configuration = {'query': {'useQueryCache': False}}
-
- For more information see `BigQuery REST API Reference
- <https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query>`__.
- credentials : google.auth.credentials.Credentials, optional
- Credentials for accessing Google APIs. Use this parameter to override
- default credentials, such as to use Compute Engine
- :class:`google.auth.compute_engine.Credentials` or Service Account
- :class:`google.oauth2.service_account.Credentials` directly.
-
- *New in version 0.8.0 of pandas-gbq*.
- use_bqstorage_api : bool, default False
- Use the `BigQuery Storage API
- <https://cloud.google.com/bigquery/docs/reference/storage/>`__ to
- download query results quickly, but at an increased cost. To use this
- API, first `enable it in the Cloud Console
- <https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__.
- You must also have the `bigquery.readsessions.create
- <https://cloud.google.com/bigquery/docs/access-control#roles>`__
- permission on the project you are billing queries to.
-
- This feature requires version 0.10.0 or later of the ``pandas-gbq``
- package. It also requires the ``google-cloud-bigquery-storage`` and
- ``fastavro`` packages.
-
- max_results : int, optional
- If set, limit the maximum number of rows to fetch from the query
- results.
-
- *New in version 0.12.0 of pandas-gbq*.
-
- .. versionadded:: 1.1.0
-    progress_bar_type : str, optional
- If set, use the `tqdm <https://tqdm.github.io/>`__ library to
- display a progress bar while the data downloads. Install the
- ``tqdm`` package to use this feature.
-
- Possible values of ``progress_bar_type`` include:
-
- ``None``
- No progress bar.
- ``'tqdm'``
- Use the :func:`tqdm.tqdm` function to print a progress bar
- to :data:`sys.stderr`.
- ``'tqdm_notebook'``
- Use the :func:`tqdm.tqdm_notebook` function to display a
- progress bar as a Jupyter notebook widget.
- ``'tqdm_gui'``
- Use the :func:`tqdm.tqdm_gui` function to display a
- progress bar as a graphical dialog box.
-
-        Note that this feature requires version 0.12.0 or later of the
-        ``pandas-gbq`` package, as well as the ``tqdm`` package. Slightly
-        different from ``pandas-gbq``, here the default is ``None``.
-
- Returns
- -------
-    df : DataFrame
- DataFrame representing results of query.
-
- See Also
- --------
- pandas_gbq.read_gbq : This function in the pandas-gbq library.
- DataFrame.to_gbq : Write a DataFrame to Google BigQuery.
- """
- pandas_gbq = _try_import()
-
- kwargs: dict[str, str | bool | int | None] = {}
-
- # START: new kwargs. Don't populate unless explicitly set.
- if use_bqstorage_api is not None:
- kwargs["use_bqstorage_api"] = use_bqstorage_api
- if max_results is not None:
- kwargs["max_results"] = max_results
-
- kwargs["progress_bar_type"] = progress_bar_type
- # END: new kwargs
-
- return pandas_gbq.read_gbq(
- query,
- project_id=project_id,
- index_col=index_col,
- col_order=col_order,
- reauth=reauth,
- auth_local_webserver=auth_local_webserver,
- dialect=dialect,
- location=location,
- configuration=configuration,
- credentials=credentials,
- **kwargs,
- )
-
-
-def to_gbq(
- dataframe: DataFrame,
- destination_table: str,
- project_id: str | None = None,
- chunksize: int | None = None,
- reauth: bool = False,
- if_exists: str = "fail",
- auth_local_webserver: bool = True,
- table_schema: list[dict[str, str]] | None = None,
- location: str | None = None,
- progress_bar: bool = True,
- credentials=None,
-) -> None:
- pandas_gbq = _try_import()
- pandas_gbq.to_gbq(
- dataframe,
- destination_table,
- project_id=project_id,
- chunksize=chunksize,
- reauth=reauth,
- if_exists=if_exists,
- auth_local_webserver=auth_local_webserver,
- table_schema=table_schema,
- location=location,
- progress_bar=progress_bar,
- credentials=credentials,
- )
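Both functions in this module are thin wrappers that delegate to the optional ``pandas-gbq`` package. An illustrative sketch (the project id is a placeholder, and the query assumes access to a public BigQuery dataset):

```python
import pandas as pd

# Illustrative only: requires the optional pandas-gbq package and Google Cloud
# credentials; "my-project" is a placeholder project id.
sql = """
    SELECT name, SUM(number) AS total
    FROM `bigquery-public-data.usa_names.usa_1910_2013`
    GROUP BY name
    ORDER BY total DESC
    LIMIT 10
"""
df = pd.read_gbq(sql, project_id="my-project", dialect="standard")
```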
diff --git a/contrib/python/pandas/py3/pandas/io/html.py b/contrib/python/pandas/py3/pandas/io/html.py
deleted file mode 100644
index 42e2ab6ceca..00000000000
--- a/contrib/python/pandas/py3/pandas/io/html.py
+++ /dev/null
@@ -1,1230 +0,0 @@
-"""
-:mod:`pandas.io.html` is a module containing functionality for dealing with
-HTML IO.
-
-"""
-
-from __future__ import annotations
-
-from collections import abc
-import numbers
-import re
-from typing import (
- TYPE_CHECKING,
- Iterable,
- Literal,
- Pattern,
- Sequence,
- cast,
-)
-
-from pandas._libs import lib
-from pandas._typing import (
- BaseBuffer,
- DtypeBackend,
- FilePath,
- ReadBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import (
- AbstractMethodError,
- EmptyDataError,
-)
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.common import is_list_like
-
-from pandas import isna
-from pandas.core.indexes.base import Index
-from pandas.core.indexes.multi import MultiIndex
-from pandas.core.series import Series
-
-from pandas.io.common import (
- file_exists,
- get_handle,
- is_url,
- stringify_path,
- urlopen,
- validate_header_arg,
-)
-from pandas.io.formats.printing import pprint_thing
-from pandas.io.parsers import TextParser
-
-if TYPE_CHECKING:
- from pandas import DataFrame
-
-_IMPORTS = False
-_HAS_BS4 = False
-_HAS_LXML = False
-_HAS_HTML5LIB = False
-
-
-def _importers() -> None:
- # import things we need
- # but make this done on a first use basis
-
- global _IMPORTS
- if _IMPORTS:
- return
-
- global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
- bs4 = import_optional_dependency("bs4", errors="ignore")
- _HAS_BS4 = bs4 is not None
-
- lxml = import_optional_dependency("lxml.etree", errors="ignore")
- _HAS_LXML = lxml is not None
-
- html5lib = import_optional_dependency("html5lib", errors="ignore")
- _HAS_HTML5LIB = html5lib is not None
-
- _IMPORTS = True
-
-
-#############
-# READ HTML #
-#############
-_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
-
-
-def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
- """
- Replace extra whitespace inside of a string with a single space.
-
- Parameters
- ----------
- s : str or unicode
- The string from which to remove extra whitespace.
- regex : re.Pattern
- The regular expression to use to remove extra whitespace.
-
- Returns
- -------
- subd : str or unicode
- `s` with all extra whitespace replaced with a single space.
- """
- return regex.sub(" ", s.strip())
-
-
-def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
- """
- Get an iterator given an integer, slice or container.
-
- Parameters
- ----------
- skiprows : int, slice, container
- The iterator to use to skip rows; can also be a slice.
-
- Raises
- ------
- TypeError
- * If `skiprows` is not a slice, integer, or Container
-
- Returns
- -------
- it : iterable
- A proper iterator to use to skip rows of a DataFrame.
- """
- if isinstance(skiprows, slice):
- start, step = skiprows.start or 0, skiprows.step or 1
- return list(range(start, skiprows.stop, step))
- elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
- return cast("int | Sequence[int]", skiprows)
- elif skiprows is None:
- return 0
- raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
-
-
-def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
- """
- Try to read from a url, file or string.
-
- Parameters
- ----------
- obj : str, unicode, path object, or file-like object
-
- Returns
- -------
- raw_text : str
- """
- text: str | bytes
- if (
- is_url(obj)
- or hasattr(obj, "read")
- or (isinstance(obj, str) and file_exists(obj))
- ):
- with get_handle(obj, "r", encoding=encoding) as handles:
- text = handles.handle.read()
- elif isinstance(obj, (str, bytes)):
- text = obj
- else:
- raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
- return text
-
-
-class _HtmlFrameParser:
- """
- Base class for parsers that parse HTML into DataFrames.
-
- Parameters
- ----------
- io : str or file-like
- This can be either a string of raw HTML, a valid URL using the HTTP,
- FTP, or FILE protocols or a file-like object.
-
- match : str or regex
- The text to match in the document.
-
- attrs : dict
-        Dictionary of HTML <table> element attributes to match.
-
- encoding : str
- Encoding to be used by parser
-
- displayed_only : bool
- Whether or not items with "display:none" should be ignored
-
- extract_links : {None, "all", "header", "body", "footer"}
- Table elements in the specified section(s) with <a> tags will have their
- href extracted.
-
- .. versionadded:: 1.5.0
-
- Attributes
- ----------
- io : str or file-like
- raw HTML, URL, or file-like object
-
- match : regex
- The text to match in the raw HTML
-
- attrs : dict-like
- A dictionary of valid table attributes to use to search for table
- elements.
-
- encoding : str
- Encoding to be used by parser
-
- displayed_only : bool
- Whether or not items with "display:none" should be ignored
-
- extract_links : {None, "all", "header", "body", "footer"}
- Table elements in the specified section(s) with <a> tags will have their
- href extracted.
-
- .. versionadded:: 1.5.0
-
- Notes
- -----
- To subclass this class effectively you must override the following methods:
- * :func:`_build_doc`
- * :func:`_attr_getter`
- * :func:`_href_getter`
- * :func:`_text_getter`
- * :func:`_parse_td`
- * :func:`_parse_thead_tr`
- * :func:`_parse_tbody_tr`
- * :func:`_parse_tfoot_tr`
- * :func:`_parse_tables`
- * :func:`_equals_tag`
- See each method's respective documentation for details on their
- functionality.
- """
-
- def __init__(
- self,
- io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
- match: str | Pattern,
- attrs: dict[str, str] | None,
- encoding: str,
- displayed_only: bool,
- extract_links: Literal[None, "header", "footer", "body", "all"],
- ) -> None:
- self.io = io
- self.match = match
- self.attrs = attrs
- self.encoding = encoding
- self.displayed_only = displayed_only
- self.extract_links = extract_links
-
- def parse_tables(self):
- """
- Parse and return all tables from the DOM.
-
- Returns
- -------
- list of parsed (header, body, footer) tuples from tables.
- """
- tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
- return (self._parse_thead_tbody_tfoot(table) for table in tables)
-
- def _attr_getter(self, obj, attr):
- """
- Return the attribute value of an individual DOM node.
-
- Parameters
- ----------
- obj : node-like
- A DOM node.
-
- attr : str or unicode
- The attribute, such as "colspan"
-
- Returns
- -------
- str or unicode
- The attribute value.
- """
- # Both lxml and BeautifulSoup have the same implementation:
- return obj.get(attr)
-
- def _href_getter(self, obj):
- """
- Return a href if the DOM node contains a child <a> or None.
-
- Parameters
- ----------
- obj : node-like
- A DOM node.
-
- Returns
- -------
- href : str or unicode
- The href from the <a> child of the DOM node.
- """
- raise AbstractMethodError(self)
-
- def _text_getter(self, obj):
- """
- Return the text of an individual DOM node.
-
- Parameters
- ----------
- obj : node-like
- A DOM node.
-
- Returns
- -------
- text : str or unicode
- The text from an individual DOM node.
- """
- raise AbstractMethodError(self)
-
- def _parse_td(self, obj):
- """
- Return the td elements from a row element.
-
- Parameters
- ----------
- obj : node-like
- A DOM <tr> node.
-
- Returns
- -------
- list of node-like
- These are the elements of each row, i.e., the columns.
- """
- raise AbstractMethodError(self)
-
- def _parse_thead_tr(self, table):
- """
- Return the list of thead row elements from the parsed table element.
-
- Parameters
- ----------
- table : a table element that contains zero or more thead elements.
-
- Returns
- -------
- list of node-like
- These are the <tr> row elements of a table.
- """
- raise AbstractMethodError(self)
-
- def _parse_tbody_tr(self, table):
- """
- Return the list of tbody row elements from the parsed table element.
-
- HTML5 table bodies consist of either 0 or more <tbody> elements (which
- only contain <tr> elements) or 0 or more <tr> elements. This method
- checks for both structures.
-
- Parameters
- ----------
- table : a table element that contains row elements.
-
- Returns
- -------
- list of node-like
- These are the <tr> row elements of a table.
- """
- raise AbstractMethodError(self)
-
- def _parse_tfoot_tr(self, table):
- """
- Return the list of tfoot row elements from the parsed table element.
-
- Parameters
- ----------
- table : a table element that contains row elements.
-
- Returns
- -------
- list of node-like
- These are the <tr> row elements of a table.
- """
- raise AbstractMethodError(self)
-
- def _parse_tables(self, doc, match, attrs):
- """
- Return all tables from the parsed DOM.
-
- Parameters
- ----------
- doc : the DOM from which to parse the table element.
-
- match : str or regular expression
- The text to search for in the DOM tree.
-
- attrs : dict
- A dictionary of table attributes that can be used to disambiguate
- multiple tables on a page.
-
- Raises
- ------
- ValueError : `match` does not match any text in the document.
-
- Returns
- -------
- list of node-like
- HTML <table> elements to be parsed into raw data.
- """
- raise AbstractMethodError(self)
-
- def _equals_tag(self, obj, tag):
- """
- Return whether an individual DOM node matches a tag
-
- Parameters
- ----------
- obj : node-like
- A DOM node.
-
- tag : str
- Tag name to be checked for equality.
-
- Returns
- -------
- boolean
- Whether `obj`'s tag name is `tag`
- """
- raise AbstractMethodError(self)
-
- def _build_doc(self):
- """
- Return a tree-like object that can be used to iterate over the DOM.
-
- Returns
- -------
- node-like
- The DOM from which to parse the table element.
- """
- raise AbstractMethodError(self)
-
- def _parse_thead_tbody_tfoot(self, table_html):
- """
- Given a table, return parsed header, body, and foot.
-
- Parameters
- ----------
- table_html : node-like
-
- Returns
- -------
- tuple of (header, body, footer), each a list of list-of-text rows.
-
- Notes
- -----
- Header and body are lists-of-lists. Top level list is a list of
- rows. Each row is a list of str text.
-
- Logic: Use <thead>, <tbody>, <tfoot> elements to identify
- header, body, and footer, otherwise:
- - Put all rows into body
- - Move rows from top of body to header only if
- all elements inside row are <th>
- - Move rows from bottom of body to footer only if
- all elements inside row are <th>
- """
- header_rows = self._parse_thead_tr(table_html)
- body_rows = self._parse_tbody_tr(table_html)
- footer_rows = self._parse_tfoot_tr(table_html)
-
- def row_is_all_th(row):
- return all(self._equals_tag(t, "th") for t in self._parse_td(row))
-
- if not header_rows:
- # The table has no <thead>. Move the top all-<th> rows from
- # body_rows to header_rows. (This is a common case because many
-            # tables in the wild have no <thead> or <tfoot>.)
- while body_rows and row_is_all_th(body_rows[0]):
- header_rows.append(body_rows.pop(0))
-
- header = self._expand_colspan_rowspan(header_rows, section="header")
- body = self._expand_colspan_rowspan(body_rows, section="body")
- footer = self._expand_colspan_rowspan(footer_rows, section="footer")
-
- return header, body, footer
-
- def _expand_colspan_rowspan(
- self, rows, section: Literal["header", "footer", "body"]
- ):
- """
- Given a list of <tr>s, return a list of text rows.
-
- Parameters
- ----------
- rows : list of node-like
- List of <tr>s
- section : the section that the rows belong to (header, body or footer).
-
- Returns
- -------
- list of list
- Each returned row is a list of str text, or tuple (text, link)
- if extract_links is not None.
-
- Notes
- -----
- Any cell with ``rowspan`` or ``colspan`` will have its contents copied
- to subsequent cells.
- """
- all_texts = [] # list of rows, each a list of str
- text: str | tuple
- remainder: list[
- tuple[int, str | tuple, int]
- ] = [] # list of (index, text, nrows)
-
- for tr in rows:
- texts = [] # the output for this row
- next_remainder = []
-
- index = 0
- tds = self._parse_td(tr)
- for td in tds:
- # Append texts from previous rows with rowspan>1 that come
- # before this <td>
- while remainder and remainder[0][0] <= index:
- prev_i, prev_text, prev_rowspan = remainder.pop(0)
- texts.append(prev_text)
- if prev_rowspan > 1:
- next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
- index += 1
-
- # Append the text from this <td>, colspan times
- text = _remove_whitespace(self._text_getter(td))
- if self.extract_links in ("all", section):
- href = self._href_getter(td)
- text = (text, href)
- rowspan = int(self._attr_getter(td, "rowspan") or 1)
- colspan = int(self._attr_getter(td, "colspan") or 1)
-
- for _ in range(colspan):
- texts.append(text)
- if rowspan > 1:
- next_remainder.append((index, text, rowspan - 1))
- index += 1
-
- # Append texts from previous rows at the final position
- for prev_i, prev_text, prev_rowspan in remainder:
- texts.append(prev_text)
- if prev_rowspan > 1:
- next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
-
- all_texts.append(texts)
- remainder = next_remainder
-
- # Append rows that only appear because the previous row had non-1
- # rowspan
- while remainder:
- next_remainder = []
- texts = []
- for prev_i, prev_text, prev_rowspan in remainder:
- texts.append(prev_text)
- if prev_rowspan > 1:
- next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
- all_texts.append(texts)
- remainder = next_remainder
-
- return all_texts
-
- def _handle_hidden_tables(self, tbl_list, attr_name):
- """
- Return list of tables, potentially removing hidden elements
-
- Parameters
- ----------
- tbl_list : list of node-like
- Type of list elements will vary depending upon parser used
- attr_name : str
- Name of the accessor for retrieving HTML attributes
-
- Returns
- -------
- list of node-like
- Return type matches `tbl_list`
- """
- if not self.displayed_only:
- return tbl_list
-
- return [
- x
- for x in tbl_list
- if "display:none"
- not in getattr(x, attr_name).get("style", "").replace(" ", "")
- ]
-
-
-class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
- """
- HTML to DataFrame parser that uses BeautifulSoup under the hood.
-
- See Also
- --------
- pandas.io.html._HtmlFrameParser
- pandas.io.html._LxmlFrameParser
-
- Notes
- -----
- Documentation strings for this class are in the base class
- :class:`pandas.io.html._HtmlFrameParser`.
- """
-
- def __init__(self, *args, **kwargs) -> None:
- super().__init__(*args, **kwargs)
- from bs4 import SoupStrainer
-
- self._strainer = SoupStrainer("table")
-
- def _parse_tables(self, doc, match, attrs):
- element_name = self._strainer.name
- tables = doc.find_all(element_name, attrs=attrs)
-
- if not tables:
- raise ValueError("No tables found")
-
- result = []
- unique_tables = set()
- tables = self._handle_hidden_tables(tables, "attrs")
-
- for table in tables:
- if self.displayed_only:
- for elem in table.find_all(style=re.compile(r"display:\s*none")):
- elem.decompose()
-
- if table not in unique_tables and table.find(string=match) is not None:
- result.append(table)
- unique_tables.add(table)
-
- if not result:
- raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
- return result
-
- def _href_getter(self, obj) -> str | None:
- a = obj.find("a", href=True)
- return None if not a else a["href"]
-
- def _text_getter(self, obj):
- return obj.text
-
- def _equals_tag(self, obj, tag):
- return obj.name == tag
-
- def _parse_td(self, row):
- return row.find_all(("td", "th"), recursive=False)
-
- def _parse_thead_tr(self, table):
- return table.select("thead tr")
-
- def _parse_tbody_tr(self, table):
- from_tbody = table.select("tbody tr")
- from_root = table.find_all("tr", recursive=False)
- # HTML spec: at most one of these lists has content
- return from_tbody + from_root
-
- def _parse_tfoot_tr(self, table):
- return table.select("tfoot tr")
-
- def _setup_build_doc(self):
- raw_text = _read(self.io, self.encoding)
- if not raw_text:
- raise ValueError(f"No text parsed from document: {self.io}")
- return raw_text
-
- def _build_doc(self):
- from bs4 import BeautifulSoup
-
- bdoc = self._setup_build_doc()
- if isinstance(bdoc, bytes) and self.encoding is not None:
- udoc = bdoc.decode(self.encoding)
- from_encoding = None
- else:
- udoc = bdoc
- from_encoding = self.encoding
-
- soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
-
- for br in soup.find_all("br"):
- br.replace_with("\n" + br.text)
-
- return soup
-
-
-def _build_xpath_expr(attrs) -> str:
- """
- Build an xpath expression to simulate bs4's ability to pass in kwargs to
- search for attributes when using the lxml parser.
-
- Parameters
- ----------
- attrs : dict
- A dict of HTML attributes. These are NOT checked for validity.
-
- Returns
- -------
- expr : unicode
- An XPath expression that checks for the given HTML attributes.
- """
- # give class attribute as class_ because class is a python keyword
- if "class_" in attrs:
- attrs["class"] = attrs.pop("class_")
-
- s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
- return f"[{s}]"
-
-
-_re_namespace = {"re": "http://exslt.org/regular-expressions"}
-
-
-class _LxmlFrameParser(_HtmlFrameParser):
- """
- HTML to DataFrame parser that uses lxml under the hood.
-
- Warning
- -------
- This parser can only handle HTTP, FTP, and FILE urls.
-
- See Also
- --------
- _HtmlFrameParser
-    _BeautifulSoupHtml5LibFrameParser
-
- Notes
- -----
- Documentation strings for this class are in the base class
- :class:`_HtmlFrameParser`.
- """
-
- def _href_getter(self, obj) -> str | None:
- href = obj.xpath(".//a/@href")
- return None if not href else href[0]
-
- def _text_getter(self, obj):
- return obj.text_content()
-
- def _parse_td(self, row):
- # Look for direct children only: the "row" element here may be a
- # <thead> or <tfoot> (see _parse_thead_tr).
- return row.xpath("./td|./th")
-
- def _parse_tables(self, doc, match, kwargs):
- pattern = match.pattern
-
- # 1. check all descendants for the given pattern and only search tables
- # GH 49929
- xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"
-
- # if any table attributes were given build an xpath expression to
- # search for them
- if kwargs:
- xpath_expr += _build_xpath_expr(kwargs)
-
- tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
-
- tables = self._handle_hidden_tables(tables, "attrib")
- if self.displayed_only:
- for table in tables:
- # lxml utilizes XPATH 1.0 which does not have regex
- # support. As a result, we find all elements with a style
- # attribute and iterate them to check for display:none
- for elem in table.xpath(".//*[@style]"):
- if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
- elem.getparent().remove(elem)
-
- if not tables:
- raise ValueError(f"No tables found matching regex {repr(pattern)}")
- return tables
-
- def _equals_tag(self, obj, tag):
- return obj.tag == tag
-
- def _build_doc(self):
- """
- Raises
- ------
- ValueError
- * If a URL that lxml cannot parse is passed.
-
- Exception
- * Any other ``Exception`` thrown. For example, trying to parse a
- URL that is syntactically correct on a machine with no internet
- connection will fail.
-
- See Also
- --------
- pandas.io.html._HtmlFrameParser._build_doc
- """
- from lxml.etree import XMLSyntaxError
- from lxml.html import (
- HTMLParser,
- fromstring,
- parse,
- )
-
- parser = HTMLParser(recover=True, encoding=self.encoding)
-
- try:
- if is_url(self.io):
- with urlopen(self.io) as f:
- r = parse(f, parser=parser)
- else:
- # try to parse the input in the simplest way
- r = parse(self.io, parser=parser)
- try:
- r = r.getroot()
- except AttributeError:
- pass
- except (UnicodeDecodeError, OSError) as e:
- # if the input is a blob of html goop
- if not is_url(self.io):
- r = fromstring(self.io, parser=parser)
-
- try:
- r = r.getroot()
- except AttributeError:
- pass
- else:
- raise e
- else:
- if not hasattr(r, "text_content"):
- raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
-
- for br in r.xpath("*//br"):
- br.tail = "\n" + (br.tail or "")
-
- return r
-
- def _parse_thead_tr(self, table):
- rows = []
-
- for thead in table.xpath(".//thead"):
- rows.extend(thead.xpath("./tr"))
-
- # HACK: lxml does not clean up the clearly-erroneous
- # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
- # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
- # children as though it's a <tr>.
- #
- # Better solution would be to use html5lib.
- elements_at_root = thead.xpath("./td|./th")
- if elements_at_root:
- rows.append(thead)
-
- return rows
-
- def _parse_tbody_tr(self, table):
- from_tbody = table.xpath(".//tbody//tr")
- from_root = table.xpath("./tr")
- # HTML spec: at most one of these lists has content
- return from_tbody + from_root
-
- def _parse_tfoot_tr(self, table):
- return table.xpath(".//tfoot//tr")
-
-
-def _expand_elements(body) -> None:
- data = [len(elem) for elem in body]
- lens = Series(data)
- lens_max = lens.max()
- not_max = lens[lens != lens_max]
-
- empty = [""]
- for ind, length in not_max.items():
- body[ind] += empty * (lens_max - length)
-
-
-def _data_to_frame(**kwargs):
- head, body, foot = kwargs.pop("data")
- header = kwargs.pop("header")
- kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
- if head:
- body = head + body
-
- # Infer header when there is a <thead> or top <th>-only rows
- if header is None:
- if len(head) == 1:
- header = 0
- else:
- # ignore all-empty-text rows
- header = [i for i, row in enumerate(head) if any(text for text in row)]
-
- if foot:
- body += foot
-
- # fill out elements of body that are "ragged"
- _expand_elements(body)
- with TextParser(body, header=header, **kwargs) as tp:
- return tp.read()
-
-
-_valid_parsers = {
- "lxml": _LxmlFrameParser,
- None: _LxmlFrameParser,
- "html5lib": _BeautifulSoupHtml5LibFrameParser,
- "bs4": _BeautifulSoupHtml5LibFrameParser,
-}
-
-
-def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
- """
- Choose the parser based on the input flavor.
-
- Parameters
- ----------
- flavor : str
- The type of parser to use. This must be a valid backend.
-
- Returns
- -------
- cls : _HtmlFrameParser subclass
- The parser class based on the requested input flavor.
-
- Raises
- ------
- ValueError
- * If `flavor` is not a valid backend.
- ImportError
- * If you do not have the requested `flavor`
- """
- valid_parsers = list(_valid_parsers.keys())
- if flavor not in valid_parsers:
- raise ValueError(
- f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
- )
-
- if flavor in ("bs4", "html5lib"):
- if not _HAS_HTML5LIB:
- raise ImportError("html5lib not found, please install it")
- if not _HAS_BS4:
- raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
- # Although we call this above, we want to raise here right before use.
- bs4 = import_optional_dependency("bs4") # noqa:F841
-
- else:
- if not _HAS_LXML:
- raise ImportError("lxml not found, please install it")
- return _valid_parsers[flavor]
-
-
-def _print_as_set(s) -> str:
- arg = ", ".join([pprint_thing(el) for el in s])
- return f"{{{arg}}}"
-
-
-def _validate_flavor(flavor):
- if flavor is None:
- flavor = "lxml", "bs4"
- elif isinstance(flavor, str):
- flavor = (flavor,)
- elif isinstance(flavor, abc.Iterable):
- if not all(isinstance(flav, str) for flav in flavor):
- raise TypeError(
- f"Object of type {repr(type(flavor).__name__)} "
- f"is not an iterable of strings"
- )
- else:
- msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
- msg += " is not a valid flavor"
- raise ValueError(msg)
-
- flavor = tuple(flavor)
- valid_flavors = set(_valid_parsers)
- flavor_set = set(flavor)
-
- if not flavor_set & valid_flavors:
- raise ValueError(
- f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
- f"flavors are {_print_as_set(valid_flavors)}"
- )
- return flavor
-
-
-def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
- flavor = _validate_flavor(flavor)
- compiled_match = re.compile(match) # you can pass a compiled regex here
-
- retained = None
- for flav in flavor:
- parser = _parser_dispatch(flav)
- p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
-
- try:
- tables = p.parse_tables()
- except ValueError as caught:
- # if `io` is an io-like object, check if it's seekable
- # and try to rewind it before trying the next parser
- if hasattr(io, "seekable") and io.seekable():
- io.seek(0)
- elif hasattr(io, "seekable") and not io.seekable():
- # if we couldn't rewind it, let the user know
- raise ValueError(
- f"The flavor {flav} failed to parse your input. "
- "Since you passed a non-rewindable file "
- "object, we can't rewind it to try "
- "another parser. Try read_html() with a different flavor."
- ) from caught
-
- retained = caught
- else:
- break
- else:
- assert retained is not None # for mypy
- raise retained
-
- ret = []
- for table in tables:
- try:
- df = _data_to_frame(data=table, **kwargs)
- # Cast MultiIndex header to an Index of tuples when extracting header
- # links and replace nan with None (therefore can't use mi.to_flat_index()).
- # This maintains consistency of selection (e.g. df.columns.str[1])
- if extract_links in ("all", "header") and isinstance(
- df.columns, MultiIndex
- ):
- df.columns = Index(
- ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
- tupleize_cols=False,
- )
-
- ret.append(df)
- except EmptyDataError: # empty table
- continue
- return ret
-
-
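The ``_parse`` driver above is what the public ``read_html`` entry point, defined next, delegates to after validating the flavor. A minimal usage sketch (the URL is purely illustrative; either lxml or bs4 + html5lib must be installed):

```python
import pandas as pd

# Each matching <table> becomes one DataFrame.
tables = pd.read_html(
    "https://en.wikipedia.org/wiki/Python_(programming_language)",  # illustrative URL
    match="Release",            # keep only tables whose text matches this regex
    flavor="bs4",               # force the BeautifulSoup/html5lib parser path
    extract_links="header",     # header cells become (text, href) tuples
)
first = tables[0]
```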
-def read_html(
- io: FilePath | ReadBuffer[str],
- *,
- match: str | Pattern = ".+",
- flavor: str | None = None,
- header: int | Sequence[int] | None = None,
- index_col: int | Sequence[int] | None = None,
- skiprows: int | Sequence[int] | slice | None = None,
- attrs: dict[str, str] | None = None,
- parse_dates: bool = False,
- thousands: str | None = ",",
- encoding: str | None = None,
- decimal: str = ".",
- converters: dict | None = None,
- na_values: Iterable[object] | None = None,
- keep_default_na: bool = True,
- displayed_only: bool = True,
- extract_links: Literal[None, "header", "footer", "body", "all"] = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> list[DataFrame]:
- r"""
- Read HTML tables into a ``list`` of ``DataFrame`` objects.
-
- Parameters
- ----------
- io : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a string ``read()`` function.
- The string can represent a URL or the HTML itself. Note that
- lxml only accepts the http, ftp and file url protocols. If you have a
- URL that starts with ``'https'`` you might try removing the ``'s'``.
-
- match : str or compiled regular expression, optional
- The set of tables containing text matching this regex or string will be
- returned. Unless the HTML is extremely simple you will probably need to
- pass a non-empty string here. Defaults to '.+' (match any non-empty
- string). The default value will return all tables contained on a page.
- This value is converted to a regular expression so that there is
- consistent behavior between Beautiful Soup and lxml.
-
- flavor : str, optional
- The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
- each other; they are both there for backwards compatibility. The
- default of ``None`` tries to use ``lxml`` to parse and if that fails it
- falls back on ``bs4`` + ``html5lib``.
-
- header : int or list-like, optional
- The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
- make the columns headers.
-
- index_col : int or list-like, optional
- The column (or list of columns) to use to create the index.
-
- skiprows : int, list-like or slice, optional
- Number of rows to skip after parsing the column header. 0-based. If a
- sequence of integers or a slice is given, will skip the rows indexed by
- that sequence. Note that a single element sequence means 'skip the nth
- row' whereas an integer means 'skip n rows'.
-
- attrs : dict, optional
- This is a dictionary of attributes that you can pass to use to identify
- the table in the HTML. These are not checked for validity before being
- passed to lxml or Beautiful Soup. However, these attributes must be
- valid HTML table attributes to work correctly. For example, ::
-
- attrs = {'id': 'table'}
-
- is a valid attribute dictionary because the 'id' HTML tag attribute is
- a valid HTML attribute for *any* HTML tag as per `this document
- <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::
-
- attrs = {'asdf': 'table'}
-
- is *not* a valid attribute dictionary because 'asdf' is not a valid
- HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
- table attributes can be found `here
- <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
- working draft of the HTML 5 spec can be found `here
- <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
- latest information on table attributes for the modern web.
-
- parse_dates : bool, optional
- See :func:`~read_csv` for more details.
-
- thousands : str, optional
- Separator to use to parse thousands. Defaults to ``','``.
-
- encoding : str, optional
- The encoding used to decode the web page. Defaults to ``None``. ``None``
- preserves the previous encoding behavior, which depends on the
- underlying parser library (e.g., the parser library will try to use
- the encoding provided by the document).
-
- decimal : str, default '.'
- Character to recognize as decimal point (e.g. use ',' for European
- data).
-
- converters : dict, default None
- Dict of functions for converting values in certain columns. Keys can
- either be integers or column labels, values are functions that take one
- input argument, the cell (not column) content, and return the
- transformed content.
-
- na_values : iterable, default None
- Custom NA values.
-
- keep_default_na : bool, default True
- If na_values are specified and keep_default_na is False the default NaN
- values are overridden, otherwise they're appended to.
-
- displayed_only : bool, default True
- Whether elements with "display: none" should be parsed.
-
- extract_links : {None, "all", "header", "body", "footer"}
- Table elements in the specified section(s) with <a> tags will have their
- href extracted.
-
- .. versionadded:: 1.5.0
-
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
- Which dtype_backend to use: nullable dtypes are used for all dtypes that
- have a nullable implementation when "numpy_nullable" is set, and pyarrow
- dtypes are used for all dtypes when "pyarrow" is set.
-
- The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- dfs
- A list of DataFrames.
-
- See Also
- --------
- read_csv : Read a comma-separated values (csv) file into DataFrame.
-
- Notes
- -----
- Before using this function you should read the :ref:`gotchas about the
- HTML parsing libraries <io.html.gotchas>`.
-
- Expect to do some cleanup after you call this function. For example, you
- might need to manually assign column names if the column names are
- converted to NaN when you pass the `header=0` argument. We try to assume as
- little as possible about the structure of the table and push the
- idiosyncrasies of the HTML contained in the table to the user.
-
- This function searches for ``<table>`` elements and only for ``<tr>``
- and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
- element in the table. ``<td>`` stands for "table data". This function
- attempts to properly handle ``colspan`` and ``rowspan`` attributes.
- If the table has a ``<thead>``, it is used to construct
- the header, otherwise the function attempts to find the header within
- the body (by putting rows with only ``<th>`` elements into the header).
-
- Similar to :func:`~read_csv` the `header` argument is applied
- **after** `skiprows` is applied.
-
- This function will *always* return a list of :class:`DataFrame` *or*
- it will fail, e.g., it will *not* return an empty list.
-
- Examples
- --------
- See the :ref:`read_html documentation in the IO section of the docs
- <io.read_html>` for some examples of reading in HTML tables.
- """
- _importers()
-
- # Type check here. We don't want to parse only to fail because of an
- # invalid value of an integer skiprows.
- if isinstance(skiprows, numbers.Integral) and skiprows < 0:
- raise ValueError(
- "cannot skip rows starting from the end of the "
- "data (you passed a negative value)"
- )
- if extract_links not in [None, "header", "footer", "body", "all"]:
- raise ValueError(
- "`extract_links` must be one of "
- '{None, "header", "footer", "body", "all"}, got '
- f'"{extract_links}"'
- )
- validate_header_arg(header)
- check_dtype_backend(dtype_backend)
-
- io = stringify_path(io)
-
- return _parse(
- flavor=flavor,
- io=io,
- match=match,
- header=header,
- index_col=index_col,
- skiprows=skiprows,
- parse_dates=parse_dates,
- thousands=thousands,
- attrs=attrs,
- encoding=encoding,
- decimal=decimal,
- converters=converters,
- na_values=na_values,
- keep_default_na=keep_default_na,
- displayed_only=displayed_only,
- extract_links=extract_links,
- dtype_backend=dtype_backend,
- )
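# Illustrative usage sketch (not part of the deleted file): reading a table
# from an HTML string with the public read_html above. The HTML literal and
# the "t1" id are hypothetical; lxml (or bs4 + html5lib) must be installed.
from io import StringIO

import pandas as pd

html = """
<table id="t1">
  <tr><th>name</th><th>qty</th></tr>
  <tr><td>apples</td><td>3</td></tr>
  <tr><td>pears</td><td>5</td></tr>
</table>
"""
tables = pd.read_html(StringIO(html), attrs={"id": "t1"})  # -> list of DataFrame
df = tables[0]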
diff --git a/contrib/python/pandas/py3/pandas/io/json/__init__.py b/contrib/python/pandas/py3/pandas/io/json/__init__.py
deleted file mode 100644
index 52c65dd6f0c..00000000000
--- a/contrib/python/pandas/py3/pandas/io/json/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pandas.io.json._json import (
- dumps,
- loads,
- read_json,
- to_json,
-)
-from pandas.io.json._table_schema import build_table_schema
-
-__all__ = [
- "dumps",
- "loads",
- "read_json",
- "to_json",
- "build_table_schema",
-]
diff --git a/contrib/python/pandas/py3/pandas/io/json/_json.py b/contrib/python/pandas/py3/pandas/io/json/_json.py
deleted file mode 100644
index 27255d70796..00000000000
--- a/contrib/python/pandas/py3/pandas/io/json/_json.py
+++ /dev/null
@@ -1,1420 +0,0 @@
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-from collections import abc
-from io import StringIO
-from itertools import islice
-from types import TracebackType
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Generic,
- Literal,
- Mapping,
- TypeVar,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.json import (
- dumps,
- loads,
-)
-from pandas._libs.tslibs import iNaT
-from pandas._typing import (
- CompressionOptions,
- DtypeArg,
- DtypeBackend,
- FilePath,
- IndexLabel,
- JSONEngine,
- JSONSerializable,
- ReadBuffer,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import doc
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.common import (
- ensure_str,
- is_period_dtype,
-)
-from pandas.core.dtypes.generic import ABCIndex
-
-from pandas import (
- ArrowDtype,
- DataFrame,
- MultiIndex,
- Series,
- isna,
- notna,
- to_datetime,
-)
-from pandas.core.reshape.concat import concat
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import (
- IOHandles,
- dedup_names,
- extension_to_compression,
- file_exists,
- get_handle,
- is_fsspec_url,
- is_potential_multi_index,
- is_url,
- stringify_path,
-)
-from pandas.io.json._normalize import convert_to_line_delimits
-from pandas.io.json._table_schema import (
- build_table_schema,
- parse_table_schema,
-)
-from pandas.io.parsers.readers import validate_integer
-
-if TYPE_CHECKING:
- from pandas.core.generic import NDFrame
-
-FrameSeriesStrT = TypeVar("FrameSeriesStrT", bound=Literal["frame", "series"])
-
-
-# interface to/from
-@overload
-def to_json(
- path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes],
- obj: NDFrame,
- orient: str | None = ...,
- date_format: str = ...,
- double_precision: int = ...,
- force_ascii: bool = ...,
- date_unit: str = ...,
- default_handler: Callable[[Any], JSONSerializable] | None = ...,
- lines: bool = ...,
- compression: CompressionOptions = ...,
- index: bool = ...,
- indent: int = ...,
- storage_options: StorageOptions = ...,
- mode: Literal["a", "w"] = ...,
-) -> None:
- ...
-
-
-@overload
-def to_json(
- path_or_buf: None,
- obj: NDFrame,
- orient: str | None = ...,
- date_format: str = ...,
- double_precision: int = ...,
- force_ascii: bool = ...,
- date_unit: str = ...,
- default_handler: Callable[[Any], JSONSerializable] | None = ...,
- lines: bool = ...,
- compression: CompressionOptions = ...,
- index: bool = ...,
- indent: int = ...,
- storage_options: StorageOptions = ...,
- mode: Literal["a", "w"] = ...,
-) -> str:
- ...
-
-
-def to_json(
- path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] | None,
- obj: NDFrame,
- orient: str | None = None,
- date_format: str = "epoch",
- double_precision: int = 10,
- force_ascii: bool = True,
- date_unit: str = "ms",
- default_handler: Callable[[Any], JSONSerializable] | None = None,
- lines: bool = False,
- compression: CompressionOptions = "infer",
- index: bool = True,
- indent: int = 0,
- storage_options: StorageOptions = None,
- mode: Literal["a", "w"] = "w",
-) -> str | None:
- if not index and orient not in ["split", "table"]:
- raise ValueError(
- "'index=False' is only valid when 'orient' is 'split' or 'table'"
- )
-
- if lines and orient != "records":
- raise ValueError("'lines' keyword only valid when 'orient' is records")
-
- if mode not in ["a", "w"]:
- msg = (
- f"mode={mode} is not a valid option. "
- "Only 'w' and 'a' are currently supported."
- )
- raise ValueError(msg)
-
- if mode == "a" and (not lines or orient != "records"):
- msg = (
- "mode='a' (append) is only supported when "
- "lines is True and orient is 'records'"
- )
- raise ValueError(msg)
-
- if orient == "table" and isinstance(obj, Series):
- obj = obj.to_frame(name=obj.name or "values")
-
- writer: type[Writer]
- if orient == "table" and isinstance(obj, DataFrame):
- writer = JSONTableWriter
- elif isinstance(obj, Series):
- writer = SeriesWriter
- elif isinstance(obj, DataFrame):
- writer = FrameWriter
- else:
- raise NotImplementedError("'obj' should be a Series or a DataFrame")
-
- s = writer(
- obj,
- orient=orient,
- date_format=date_format,
- double_precision=double_precision,
- ensure_ascii=force_ascii,
- date_unit=date_unit,
- default_handler=default_handler,
- index=index,
- indent=indent,
- ).write()
-
- if lines:
- s = convert_to_line_delimits(s)
-
- if path_or_buf is not None:
- # apply compression and byte/text conversion
- with get_handle(
- path_or_buf, mode, compression=compression, storage_options=storage_options
- ) as handles:
- handles.handle.write(s)
- else:
- return s
- return None
-
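# Illustrative usage sketch (not part of the deleted file): DataFrame.to_json
# forwards to the module-level to_json above. mode="a" appends, which the
# validation above only allows together with lines=True and orient="records".
# The file name is hypothetical; mode="a" assumes pandas >= 2.0.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.to_json("events.jsonl", orient="records", lines=True, mode="w")
df.to_json("events.jsonl", orient="records", lines=True, mode="a")  # append rows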
-
-class Writer(ABC):
- _default_orient: str
-
- def __init__(
- self,
- obj: NDFrame,
- orient: str | None,
- date_format: str,
- double_precision: int,
- ensure_ascii: bool,
- date_unit: str,
- index: bool,
- default_handler: Callable[[Any], JSONSerializable] | None = None,
- indent: int = 0,
- ) -> None:
- self.obj = obj
-
- if orient is None:
- orient = self._default_orient
-
- self.orient = orient
- self.date_format = date_format
- self.double_precision = double_precision
- self.ensure_ascii = ensure_ascii
- self.date_unit = date_unit
- self.default_handler = default_handler
- self.index = index
- self.indent = indent
-
- self.is_copy = None
- self._format_axes()
-
- def _format_axes(self):
- raise AbstractMethodError(self)
-
- def write(self) -> str:
- iso_dates = self.date_format == "iso"
- return dumps(
- self.obj_to_write,
- orient=self.orient,
- double_precision=self.double_precision,
- ensure_ascii=self.ensure_ascii,
- date_unit=self.date_unit,
- iso_dates=iso_dates,
- default_handler=self.default_handler,
- indent=self.indent,
- )
-
- @property
- @abstractmethod
- def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
- """Object to write in JSON format."""
-
-
-class SeriesWriter(Writer):
- _default_orient = "index"
-
- @property
- def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
- if not self.index and self.orient == "split":
- return {"name": self.obj.name, "data": self.obj.values}
- else:
- return self.obj
-
- def _format_axes(self):
- if not self.obj.index.is_unique and self.orient == "index":
- raise ValueError(f"Series index must be unique for orient='{self.orient}'")
-
-
-class FrameWriter(Writer):
- _default_orient = "columns"
-
- @property
- def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
- if not self.index and self.orient == "split":
- obj_to_write = self.obj.to_dict(orient="split")
- del obj_to_write["index"]
- else:
- obj_to_write = self.obj
- return obj_to_write
-
- def _format_axes(self):
- """
- Try to format axes if they are datelike.
- """
- if not self.obj.index.is_unique and self.orient in ("index", "columns"):
- raise ValueError(
- f"DataFrame index must be unique for orient='{self.orient}'."
- )
- if not self.obj.columns.is_unique and self.orient in (
- "index",
- "columns",
- "records",
- ):
- raise ValueError(
- f"DataFrame columns must be unique for orient='{self.orient}'."
- )
-
-
-class JSONTableWriter(FrameWriter):
- _default_orient = "records"
-
- def __init__(
- self,
- obj,
- orient: str | None,
- date_format: str,
- double_precision: int,
- ensure_ascii: bool,
- date_unit: str,
- index: bool,
- default_handler: Callable[[Any], JSONSerializable] | None = None,
- indent: int = 0,
- ) -> None:
- """
- Adds a `schema` attribute with the Table Schema, resets
- the index (can't do in caller, because the schema inference needs
- to know what the index is), forces orient to 'records', and forces
- date_format to 'iso'.
- """
- super().__init__(
- obj,
- orient,
- date_format,
- double_precision,
- ensure_ascii,
- date_unit,
- index,
- default_handler=default_handler,
- indent=indent,
- )
-
- if date_format != "iso":
- msg = (
- "Trying to write with `orient='table'` and "
- f"`date_format='{date_format}'`. Table Schema requires dates "
- "to be formatted with `date_format='iso'`"
- )
- raise ValueError(msg)
-
- self.schema = build_table_schema(obj, index=self.index)
-
- # NotImplemented on a column MultiIndex
- if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
- raise NotImplementedError(
- "orient='table' is not supported for MultiIndex columns"
- )
-
- # TODO: Do this timedelta properly in objToJSON.c See GH #15137
- if (
- (obj.ndim == 1)
- and (obj.name in set(obj.index.names))
- or len(obj.columns.intersection(obj.index.names))
- ):
- msg = "Overlapping names between the index and columns"
- raise ValueError(msg)
-
- obj = obj.copy()
- timedeltas = obj.select_dtypes(include=["timedelta"]).columns
- if len(timedeltas):
- obj[timedeltas] = obj[timedeltas].applymap(lambda x: x.isoformat())
- # Convert PeriodIndex to datetimes before serializing
- if is_period_dtype(obj.index.dtype):
- obj.index = obj.index.to_timestamp()
-
- # exclude index from obj if index=False
- if not self.index:
- self.obj = obj.reset_index(drop=True)
- else:
- self.obj = obj.reset_index(drop=False)
- self.date_format = "iso"
- self.orient = "records"
- self.index = index
-
- @property
- def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]:
- return {"schema": self.schema, "data": self.obj}
-
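# Illustrative sketch (not part of the deleted file): orient="table" is handled
# by JSONTableWriter above and embeds a Table Schema next to the data, so the
# dtypes and the index name survive a round trip through read_json.
from io import StringIO

import pandas as pd

df = pd.DataFrame({"x": [1, 2]}, index=pd.Index(["a", "b"], name="k"))
payload = df.to_json(orient="table")
restored = pd.read_json(StringIO(payload), orient="table")  # index "k", int64 "x"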
-
-@overload
-def read_json(
- path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
- *,
- orient: str | None = ...,
- typ: Literal["frame"] = ...,
- dtype: DtypeArg | None = ...,
- convert_axes=...,
- convert_dates: bool | list[str] = ...,
- keep_default_dates: bool = ...,
- precise_float: bool = ...,
- date_unit: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- lines: bool = ...,
- chunksize: int,
- compression: CompressionOptions = ...,
- nrows: int | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
- engine: JSONEngine = ...,
-) -> JsonReader[Literal["frame"]]:
- ...
-
-
-@overload
-def read_json(
- path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
- *,
- orient: str | None = ...,
- typ: Literal["series"],
- dtype: DtypeArg | None = ...,
- convert_axes=...,
- convert_dates: bool | list[str] = ...,
- keep_default_dates: bool = ...,
- precise_float: bool = ...,
- date_unit: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- lines: bool = ...,
- chunksize: int,
- compression: CompressionOptions = ...,
- nrows: int | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
- engine: JSONEngine = ...,
-) -> JsonReader[Literal["series"]]:
- ...
-
-
-@overload
-def read_json(
- path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
- *,
- orient: str | None = ...,
- typ: Literal["series"],
- dtype: DtypeArg | None = ...,
- convert_axes=...,
- convert_dates: bool | list[str] = ...,
- keep_default_dates: bool = ...,
- precise_float: bool = ...,
- date_unit: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- lines: bool = ...,
- chunksize: None = ...,
- compression: CompressionOptions = ...,
- nrows: int | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
- engine: JSONEngine = ...,
-) -> Series:
- ...
-
-
-@overload
-def read_json(
- path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
- *,
- orient: str | None = ...,
- typ: Literal["frame"] = ...,
- dtype: DtypeArg | None = ...,
- convert_axes=...,
- convert_dates: bool | list[str] = ...,
- keep_default_dates: bool = ...,
- precise_float: bool = ...,
- date_unit: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- lines: bool = ...,
- chunksize: None = ...,
- compression: CompressionOptions = ...,
- nrows: int | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
- engine: JSONEngine = ...,
-) -> DataFrame:
- ...
-
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- decompression_options=_shared_docs["decompression_options"] % "path_or_buf",
-)
-def read_json(
- path_or_buf: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
- *,
- orient: str | None = None,
- typ: Literal["frame", "series"] = "frame",
- dtype: DtypeArg | None = None,
- convert_axes=None,
- convert_dates: bool | list[str] = True,
- keep_default_dates: bool = True,
- precise_float: bool = False,
- date_unit: str | None = None,
- encoding: str | None = None,
- encoding_errors: str | None = "strict",
- lines: bool = False,
- chunksize: int | None = None,
- compression: CompressionOptions = "infer",
- nrows: int | None = None,
- storage_options: StorageOptions = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- engine: JSONEngine = "ujson",
-) -> DataFrame | Series | JsonReader:
- """
- Convert a JSON string to pandas object.
-
- Parameters
- ----------
- path_or_buf : a valid JSON str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be:
- ``file://localhost/path/to/table.json``.
-
- If you want to pass in a path object, pandas accepts any
- ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
- orient : str, optional
- Indication of expected JSON string format.
- Compatible JSON strings can be produced by ``to_json()`` with a
- corresponding orient value.
- The set of possible orients is:
-
- - ``'split'`` : dict like
- ``{{index -> [index], columns -> [columns], data -> [values]}}``
- - ``'records'`` : list like
- ``[{{column -> value}}, ... , {{column -> value}}]``
- - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
- - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
- - ``'values'`` : just the values array
-
- The allowed and default values depend on the value
- of the `typ` parameter.
-
- * when ``typ == 'series'``,
-
- - allowed orients are ``{{'split','records','index'}}``
- - default is ``'index'``
- - The Series index must be unique for orient ``'index'``.
-
- * when ``typ == 'frame'``,
-
- - allowed orients are ``{{'split','records','index',
- 'columns','values', 'table'}}``
- - default is ``'columns'``
- - The DataFrame index must be unique for orients ``'index'`` and
- ``'columns'``.
- - The DataFrame columns must be unique for orients ``'index'``,
- ``'columns'``, and ``'records'``.
-
- typ : {{'frame', 'series'}}, default 'frame'
- The type of object to recover.
-
- dtype : bool or dict, default None
- If True, infer dtypes; if a dict of column to dtype, then use those;
- if False, then don't infer dtypes at all, applies only to the data.
-
- For all ``orient`` values except ``'table'``, default is True.
-
- convert_axes : bool, default None
- Try to convert the axes to the proper dtypes.
-
- For all ``orient`` values except ``'table'``, default is True.
-
- convert_dates : bool or list of str, default True
- If True then default datelike columns may be converted (depending on
- keep_default_dates).
- If False, no dates will be converted.
- If a list of column names, then those columns will be converted and
- default datelike columns may also be converted (depending on
- keep_default_dates).
-
- keep_default_dates : bool, default True
- If parsing dates (convert_dates is not False), then try to parse the
- default datelike columns.
- A column label is datelike if
-
- * it ends with ``'_at'``,
-
- * it ends with ``'_time'``,
-
- * it begins with ``'timestamp'``,
-
- * it is ``'modified'``, or
-
- * it is ``'date'``.
-
- precise_float : bool, default False
- Set to enable usage of higher precision (strtod) function when
- decoding string to double values. Default (False) is to use fast but
- less precise builtin functionality.
-
- date_unit : str, default None
- The timestamp unit to detect if converting dates. The default behaviour
- is to try and detect the correct precision, but if this is not desired
- then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
- milliseconds, microseconds or nanoseconds respectively.
-
- encoding : str, default is 'utf-8'
- The encoding to use to decode py3 bytes.
-
- encoding_errors : str, optional, default "strict"
- How encoding errors are treated. `List of possible values
- <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
-
- .. versionadded:: 1.3.0
-
- lines : bool, default False
- Read the file as a json object per line.
-
- chunksize : int, optional
- Return JsonReader object for iteration.
- See the `line-delimited json docs
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json>`_
- for more information on ``chunksize``.
- This can only be passed if `lines=True`.
- If this is None, the file will be read into memory all at once.
-
- .. versionchanged:: 1.2
-
- ``JsonReader`` is a context manager.
-
- {decompression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- nrows : int, optional
- The number of lines from the line-delimited JSON file that have to be read.
- This can only be passed if `lines=True`.
- If this is None, all the rows will be returned.
-
- .. versionadded:: 1.1
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
- Which dtype_backend to use: nullable dtypes are used for all dtypes that
- have a nullable implementation when "numpy_nullable" is set, and pyarrow
- dtypes are used for all dtypes when "pyarrow" is set.
-
- The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- engine : {{"ujson", "pyarrow"}}, default "ujson"
- Parser engine to use. The ``"pyarrow"`` engine is only available when
- ``lines=True``.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- Series or DataFrame
- The type returned depends on the value of `typ`.
-
- See Also
- --------
- DataFrame.to_json : Convert a DataFrame to a JSON string.
- Series.to_json : Convert a Series to a JSON string.
- json_normalize : Normalize semi-structured JSON data into a flat table.
-
- Notes
- -----
- Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
- :class:`Index` name of `index` gets written with :func:`to_json`, the
- subsequent read operation will incorrectly set the :class:`Index` name to
- ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
- to denote a missing :class:`Index` name, and the subsequent
- :func:`read_json` operation cannot distinguish between the two. The same
- limitation is encountered with a :class:`MultiIndex` and any names
- beginning with ``'level_'``.
-
- Examples
- --------
- >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
- ... index=['row 1', 'row 2'],
- ... columns=['col 1', 'col 2'])
-
- Encoding/decoding a Dataframe using ``'split'`` formatted JSON:
-
- >>> df.to_json(orient='split')
- '\
-{{\
-"columns":["col 1","col 2"],\
-"index":["row 1","row 2"],\
-"data":[["a","b"],["c","d"]]\
-}}\
-'
- >>> pd.read_json(_, orient='split')
- col 1 col 2
- row 1 a b
- row 2 c d
-
- Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
-
- >>> df.to_json(orient='index')
- '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'
-
- >>> pd.read_json(_, orient='index')
- col 1 col 2
- row 1 a b
- row 2 c d
-
- Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
- Note that index labels are not preserved with this encoding.
-
- >>> df.to_json(orient='records')
- '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
- >>> pd.read_json(_, orient='records')
- col 1 col 2
- 0 a b
- 1 c d
-
- Encoding with Table Schema
-
- >>> df.to_json(orient='table')
- '\
-{{"schema":{{"fields":[\
-{{"name":"index","type":"string"}},\
-{{"name":"col 1","type":"string"}},\
-{{"name":"col 2","type":"string"}}],\
-"primaryKey":["index"],\
-"pandas_version":"1.4.0"}},\
-"data":[\
-{{"index":"row 1","col 1":"a","col 2":"b"}},\
-{{"index":"row 2","col 1":"c","col 2":"d"}}]\
-}}\
-'
- """
- if orient == "table" and dtype:
- raise ValueError("cannot pass both dtype and orient='table'")
- if orient == "table" and convert_axes:
- raise ValueError("cannot pass both convert_axes and orient='table'")
-
- check_dtype_backend(dtype_backend)
-
- if dtype is None and orient != "table":
- # error: Incompatible types in assignment (expression has type "bool", variable
- # has type "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float],
- # Type[int], Type[complex], Type[bool], Type[object], Dict[Hashable,
- # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
- # Type[int], Type[complex], Type[bool], Type[object]]], None]")
- dtype = True # type: ignore[assignment]
- if convert_axes is None and orient != "table":
- convert_axes = True
-
- json_reader = JsonReader(
- path_or_buf,
- orient=orient,
- typ=typ,
- dtype=dtype,
- convert_axes=convert_axes,
- convert_dates=convert_dates,
- keep_default_dates=keep_default_dates,
- precise_float=precise_float,
- date_unit=date_unit,
- encoding=encoding,
- lines=lines,
- chunksize=chunksize,
- compression=compression,
- nrows=nrows,
- storage_options=storage_options,
- encoding_errors=encoding_errors,
- dtype_backend=dtype_backend,
- engine=engine,
- )
-
- if chunksize:
- return json_reader
- else:
- return json_reader.read()
-
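# Illustrative usage sketch (not part of the deleted file): with lines=True and
# a chunksize, read_json above returns a JsonReader (defined below) that can be
# iterated lazily as a context manager. The path is hypothetical.
import pandas as pd

with pd.read_json("big.jsonl", lines=True, chunksize=10_000) as reader:
    for chunk in reader:  # each chunk is a DataFrame with up to 10_000 rows
        print(len(chunk))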
-
-class JsonReader(abc.Iterator, Generic[FrameSeriesStrT]):
- """
- JsonReader provides an interface for reading in a JSON file.
-
- If initialized with ``lines=True`` and ``chunksize``, can be iterated over
- ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
- whole document.
- """
-
- def __init__(
- self,
- filepath_or_buffer,
- orient,
- typ: FrameSeriesStrT,
- dtype,
- convert_axes,
- convert_dates,
- keep_default_dates: bool,
- precise_float: bool,
- date_unit,
- encoding,
- lines: bool,
- chunksize: int | None,
- compression: CompressionOptions,
- nrows: int | None,
- storage_options: StorageOptions = None,
- encoding_errors: str | None = "strict",
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- engine: JSONEngine = "ujson",
- ) -> None:
- self.orient = orient
- self.typ = typ
- self.dtype = dtype
- self.convert_axes = convert_axes
- self.convert_dates = convert_dates
- self.keep_default_dates = keep_default_dates
- self.precise_float = precise_float
- self.date_unit = date_unit
- self.encoding = encoding
- self.engine = engine
- self.compression = compression
- self.storage_options = storage_options
- self.lines = lines
- self.chunksize = chunksize
- self.nrows_seen = 0
- self.nrows = nrows
- self.encoding_errors = encoding_errors
- self.handles: IOHandles[str] | None = None
- self.dtype_backend = dtype_backend
-
- if self.engine not in {"pyarrow", "ujson"}:
- raise ValueError(
- f"The engine type {self.engine} is currently not supported."
- )
- if self.chunksize is not None:
- self.chunksize = validate_integer("chunksize", self.chunksize, 1)
- if not self.lines:
- raise ValueError("chunksize can only be passed if lines=True")
- if self.engine == "pyarrow":
- raise ValueError(
- "currently pyarrow engine doesn't support chunksize parameter"
- )
- if self.nrows is not None:
- self.nrows = validate_integer("nrows", self.nrows, 0)
- if not self.lines:
- raise ValueError("nrows can only be passed if lines=True")
- if self.engine == "pyarrow":
- if not self.lines:
- raise ValueError(
- "currently pyarrow engine only supports "
- "the line-delimited JSON format"
- )
- self.data = filepath_or_buffer
- elif self.engine == "ujson":
- data = self._get_data_from_filepath(filepath_or_buffer)
- self.data = self._preprocess_data(data)
-
- def _preprocess_data(self, data):
- """
- At this point, the data either has a `read` attribute (e.g. a file
- object or a StringIO) or is a string that is a JSON document.
-
- If self.chunksize, we prepare the data for the `__next__` method.
- Otherwise, we read it into memory for the `read` method.
- """
- if hasattr(data, "read") and not (self.chunksize or self.nrows):
- with self:
- data = data.read()
- if not hasattr(data, "read") and (self.chunksize or self.nrows):
- data = StringIO(data)
-
- return data
-
- def _get_data_from_filepath(self, filepath_or_buffer):
- """
- The function read_json accepts three input types:
- 1. filepath (string-like)
- 2. file-like object (e.g. open file object, StringIO)
- 3. JSON string
-
- This method turns (1) into (2) to simplify the rest of the processing.
- It returns input types (2) and (3) unchanged.
-
- It raises FileNotFoundError if the input is a string ending in
- one of .json, .json.gz, .json.bz2, etc. but no such file exists.
- """
- # if it is a string but the file does not exist, it might be a JSON string
- filepath_or_buffer = stringify_path(filepath_or_buffer)
- if (
- not isinstance(filepath_or_buffer, str)
- or is_url(filepath_or_buffer)
- or is_fsspec_url(filepath_or_buffer)
- or file_exists(filepath_or_buffer)
- ):
- self.handles = get_handle(
- filepath_or_buffer,
- "r",
- encoding=self.encoding,
- compression=self.compression,
- storage_options=self.storage_options,
- errors=self.encoding_errors,
- )
- filepath_or_buffer = self.handles.handle
- elif (
- isinstance(filepath_or_buffer, str)
- and filepath_or_buffer.lower().endswith(
- (".json",) + tuple(f".json{c}" for c in extension_to_compression)
- )
- and not file_exists(filepath_or_buffer)
- ):
- raise FileNotFoundError(f"File {filepath_or_buffer} does not exist")
-
- return filepath_or_buffer
-
- def _combine_lines(self, lines) -> str:
- """
- Combines a list of JSON objects into one JSON object.
- """
- return (
- f'[{",".join([line for line in (line.strip() for line in lines) if line])}]'
- )
-
- @overload
- def read(self: JsonReader[Literal["frame"]]) -> DataFrame:
- ...
-
- @overload
- def read(self: JsonReader[Literal["series"]]) -> Series:
- ...
-
- @overload
- def read(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
- ...
-
- def read(self) -> DataFrame | Series:
- """
- Read the whole JSON input into a pandas object.
- """
- obj: DataFrame | Series
- with self:
- if self.engine == "pyarrow":
- pyarrow_json = import_optional_dependency("pyarrow.json")
- pa_table = pyarrow_json.read_json(self.data)
-
- mapping: type[ArrowDtype] | None | Callable
- if self.dtype_backend == "pyarrow":
- mapping = ArrowDtype
- elif self.dtype_backend == "numpy_nullable":
- from pandas.io._util import _arrow_dtype_mapping
-
- mapping = _arrow_dtype_mapping().get
- else:
- mapping = None
-
- return pa_table.to_pandas(types_mapper=mapping)
- elif self.engine == "ujson":
- if self.lines:
- if self.chunksize:
- obj = concat(self)
- elif self.nrows:
- lines = list(islice(self.data, self.nrows))
- lines_json = self._combine_lines(lines)
- obj = self._get_object_parser(lines_json)
- else:
- data = ensure_str(self.data)
- data_lines = data.split("\n")
- obj = self._get_object_parser(self._combine_lines(data_lines))
- else:
- obj = self._get_object_parser(self.data)
- if self.dtype_backend is not lib.no_default:
- return obj.convert_dtypes(
- infer_objects=False, dtype_backend=self.dtype_backend
- )
- else:
- return obj
-
- def _get_object_parser(self, json) -> DataFrame | Series:
- """
- Parses a json document into a pandas object.
- """
- typ = self.typ
- dtype = self.dtype
- kwargs = {
- "orient": self.orient,
- "dtype": self.dtype,
- "convert_axes": self.convert_axes,
- "convert_dates": self.convert_dates,
- "keep_default_dates": self.keep_default_dates,
- "precise_float": self.precise_float,
- "date_unit": self.date_unit,
- "dtype_backend": self.dtype_backend,
- }
- obj = None
- if typ == "frame":
- obj = FrameParser(json, **kwargs).parse()
-
- if typ == "series" or obj is None:
- if not isinstance(dtype, bool):
- kwargs["dtype"] = dtype
- obj = SeriesParser(json, **kwargs).parse()
-
- return obj
-
- def close(self) -> None:
- """
- If we opened a stream earlier, in _get_data_from_filepath, we should
- close it.
-
- If an open stream or file was passed, we leave it open.
- """
- if self.handles is not None:
- self.handles.close()
-
- def __iter__(self: JsonReader[FrameSeriesStrT]) -> JsonReader[FrameSeriesStrT]:
- return self
-
- @overload
- def __next__(self: JsonReader[Literal["frame"]]) -> DataFrame:
- ...
-
- @overload
- def __next__(self: JsonReader[Literal["series"]]) -> Series:
- ...
-
- @overload
- def __next__(self: JsonReader[Literal["frame", "series"]]) -> DataFrame | Series:
- ...
-
- def __next__(self) -> DataFrame | Series:
- if self.nrows and self.nrows_seen >= self.nrows:
- self.close()
- raise StopIteration
-
- lines = list(islice(self.data, self.chunksize))
- if not lines:
- self.close()
- raise StopIteration
-
- try:
- lines_json = self._combine_lines(lines)
- obj = self._get_object_parser(lines_json)
-
- # Make sure that the returned objects have the right index.
- obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
- self.nrows_seen += len(obj)
- except Exception as ex:
- self.close()
- raise ex
-
- if self.dtype_backend is not lib.no_default:
- return obj.convert_dtypes(
- infer_objects=False, dtype_backend=self.dtype_backend
- )
- else:
- return obj
-
- def __enter__(self) -> JsonReader[FrameSeriesStrT]:
- return self
-
- def __exit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- traceback: TracebackType | None,
- ) -> None:
- self.close()
-
-
-class Parser:
- _split_keys: tuple[str, ...]
- _default_orient: str
-
- _STAMP_UNITS = ("s", "ms", "us", "ns")
- _MIN_STAMPS = {
- "s": 31536000,
- "ms": 31536000000,
- "us": 31536000000000,
- "ns": 31536000000000000,
- }
-
- def __init__(
- self,
- json,
- orient,
- dtype: DtypeArg | None = None,
- convert_axes: bool = True,
- convert_dates: bool | list[str] = True,
- keep_default_dates: bool = False,
- precise_float: bool = False,
- date_unit=None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- ) -> None:
- self.json = json
-
- if orient is None:
- orient = self._default_orient
-
- self.orient = orient
-
- self.dtype = dtype
-
- if date_unit is not None:
- date_unit = date_unit.lower()
- if date_unit not in self._STAMP_UNITS:
- raise ValueError(f"date_unit must be one of {self._STAMP_UNITS}")
- self.min_stamp = self._MIN_STAMPS[date_unit]
- else:
- self.min_stamp = self._MIN_STAMPS["s"]
-
- self.precise_float = precise_float
- self.convert_axes = convert_axes
- self.convert_dates = convert_dates
- self.date_unit = date_unit
- self.keep_default_dates = keep_default_dates
- self.obj: DataFrame | Series | None = None
- self.dtype_backend = dtype_backend
-
- def check_keys_split(self, decoded) -> None:
- """
- Checks that dict has only the appropriate keys for orient='split'.
- """
- bad_keys = set(decoded.keys()).difference(set(self._split_keys))
- if bad_keys:
- bad_keys_joined = ", ".join(bad_keys)
- raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")
-
- def parse(self):
- self._parse()
-
- if self.obj is None:
- return None
- if self.convert_axes:
- self._convert_axes()
- self._try_convert_types()
- return self.obj
-
- def _parse(self):
- raise AbstractMethodError(self)
-
- def _convert_axes(self) -> None:
- """
- Try to convert axes.
- """
- obj = self.obj
- assert obj is not None # for mypy
- for axis_name in obj._AXIS_ORDERS:
- new_axis, result = self._try_convert_data(
- name=axis_name,
- data=obj._get_axis(axis_name),
- use_dtypes=False,
- convert_dates=True,
- )
- if result:
- setattr(self.obj, axis_name, new_axis)
-
- def _try_convert_types(self):
- raise AbstractMethodError(self)
-
- def _try_convert_data(
- self,
- name,
- data,
- use_dtypes: bool = True,
- convert_dates: bool | list[str] = True,
- ):
- """
- Try to parse a ndarray like into a column by inferring dtype.
- """
- # don't try to coerce, unless a force conversion
- if use_dtypes:
- if not self.dtype:
- if all(notna(data)):
- return data, False
- return data.fillna(np.nan), True
-
- elif self.dtype is True:
- pass
- else:
- # dtype to force
- dtype = (
- self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
- )
- if dtype is not None:
- try:
- return data.astype(dtype), True
- except (TypeError, ValueError):
- return data, False
-
- if convert_dates:
- new_data, result = self._try_convert_to_date(data)
- if result:
- return new_data, True
-
- if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex):
- # Fall through for conversion later on
- return data, True
- elif data.dtype == "object":
- # try float
- try:
- data = data.astype("float64")
- except (TypeError, ValueError):
- pass
-
- if data.dtype.kind == "f":
- if data.dtype != "float64":
- # coerce floats to 64
- try:
- data = data.astype("float64")
- except (TypeError, ValueError):
- pass
-
- # don't coerce 0-len data
- if len(data) and data.dtype in ("float", "object"):
- # coerce ints if we can
- try:
- new_data = data.astype("int64")
- if (new_data == data).all():
- data = new_data
- except (TypeError, ValueError, OverflowError):
- pass
-
- # coerce ints to 64
- if data.dtype == "int":
- # coerce floats to 64
- try:
- data = data.astype("int64")
- except (TypeError, ValueError):
- pass
-
- # if we have an index, we want to preserve dtypes
- if name == "index" and len(data):
- if self.orient == "split":
- return data, False
-
- return data, True
-
- def _try_convert_to_date(self, data):
- """
- Try to parse a ndarray like into a date column.
-
- Try to coerce object in epoch/iso formats and integer/float in epoch
- formats. Return a boolean if parsing was successful.
- """
- # no conversion on empty
- if not len(data):
- return data, False
-
- new_data = data
- if new_data.dtype == "object":
- try:
- new_data = data.astype("int64")
- except OverflowError:
- return data, False
- except (TypeError, ValueError):
- pass
-
- # ignore numbers that are out of range
- if issubclass(new_data.dtype.type, np.number):
- in_range = (
- isna(new_data._values)
- | (new_data > self.min_stamp)
- | (new_data._values == iNaT)
- )
- if not in_range.all():
- return data, False
-
- date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
- for date_unit in date_units:
- try:
- new_data = to_datetime(new_data, errors="raise", unit=date_unit)
- except (ValueError, OverflowError, TypeError):
- continue
- return new_data, True
- return data, False
-
- def _try_convert_dates(self):
- raise AbstractMethodError(self)
-
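# Illustrative sketch (not part of the deleted file): the date handling in
# Parser above converts epoch values in default datelike columns (here the
# "_at" suffix) back into datetimes on read.
from io import StringIO

import pandas as pd

s = pd.DataFrame({"created_at": pd.to_datetime(["2021-01-01"])}).to_json()
df = pd.read_json(StringIO(s))
# df["created_at"] is datetime64[ns] again, because keep_default_dates is True
# and the column name ends with "_at".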
-
-class SeriesParser(Parser):
- _default_orient = "index"
- _split_keys = ("name", "index", "data")
-
- def _parse(self) -> None:
- data = loads(self.json, precise_float=self.precise_float)
-
- if self.orient == "split":
- decoded = {str(k): v for k, v in data.items()}
- self.check_keys_split(decoded)
- self.obj = Series(**decoded)
- else:
- self.obj = Series(data)
-
- def _try_convert_types(self) -> None:
- if self.obj is None:
- return
- obj, result = self._try_convert_data(
- "data", self.obj, convert_dates=self.convert_dates
- )
- if result:
- self.obj = obj
-
-
-class FrameParser(Parser):
- _default_orient = "columns"
- _split_keys = ("columns", "index", "data")
-
- def _parse(self) -> None:
- json = self.json
- orient = self.orient
-
- if orient == "columns":
- self.obj = DataFrame(
- loads(json, precise_float=self.precise_float), dtype=None
- )
- elif orient == "split":
- decoded = {
- str(k): v
- for k, v in loads(json, precise_float=self.precise_float).items()
- }
- self.check_keys_split(decoded)
- orig_names = [
- (tuple(col) if isinstance(col, list) else col)
- for col in decoded["columns"]
- ]
- decoded["columns"] = dedup_names(
- orig_names,
- is_potential_multi_index(orig_names, None),
- )
- self.obj = DataFrame(dtype=None, **decoded)
- elif orient == "index":
- self.obj = DataFrame.from_dict(
- loads(json, precise_float=self.precise_float),
- dtype=None,
- orient="index",
- )
- elif orient == "table":
- self.obj = parse_table_schema(json, precise_float=self.precise_float)
- else:
- self.obj = DataFrame(
- loads(json, precise_float=self.precise_float), dtype=None
- )
-
- def _process_converter(self, f, filt=None) -> None:
- """
- Take a conversion function and possibly recreate the frame.
- """
- if filt is None:
- filt = lambda col, c: True
-
- obj = self.obj
- assert obj is not None # for mypy
-
- needs_new_obj = False
- new_obj = {}
- for i, (col, c) in enumerate(obj.items()):
- if filt(col, c):
- new_data, result = f(col, c)
- if result:
- c = new_data
- needs_new_obj = True
- new_obj[i] = c
-
- if needs_new_obj:
- # possibly handle dup columns
- new_frame = DataFrame(new_obj, index=obj.index)
- new_frame.columns = obj.columns
- self.obj = new_frame
-
- def _try_convert_types(self) -> None:
- if self.obj is None:
- return
- if self.convert_dates:
- self._try_convert_dates()
-
- self._process_converter(
- lambda col, c: self._try_convert_data(col, c, convert_dates=False)
- )
-
- def _try_convert_dates(self) -> None:
- if self.obj is None:
- return
-
- # our columns to parse
- convert_dates_list_bool = self.convert_dates
- if isinstance(convert_dates_list_bool, bool):
- convert_dates_list_bool = []
- convert_dates = set(convert_dates_list_bool)
-
- def is_ok(col) -> bool:
- """
- Return if this col is ok to try for a date parse.
- """
- if not isinstance(col, str):
- return False
-
- col_lower = col.lower()
- if (
- col_lower.endswith("_at")
- or col_lower.endswith("_time")
- or col_lower == "modified"
- or col_lower == "date"
- or col_lower == "datetime"
- or col_lower.startswith("timestamp")
- ):
- return True
- return False
-
- self._process_converter(
- lambda col, c: self._try_convert_to_date(c),
- lambda col, c: (
- (self.keep_default_dates and is_ok(col)) or col in convert_dates
- ),
- )
diff --git a/contrib/python/pandas/py3/pandas/io/json/_normalize.py b/contrib/python/pandas/py3/pandas/io/json/_normalize.py
deleted file mode 100644
index 577d677e7b3..00000000000
--- a/contrib/python/pandas/py3/pandas/io/json/_normalize.py
+++ /dev/null
@@ -1,536 +0,0 @@
-# ---------------------------------------------------------------------
-# JSON normalization routines
-from __future__ import annotations
-
-from collections import (
- abc,
- defaultdict,
-)
-import copy
-import sys
-from typing import (
- Any,
- DefaultDict,
- Iterable,
-)
-
-import numpy as np
-
-from pandas._libs.writers import convert_json_to_lines
-from pandas._typing import (
- IgnoreRaise,
- Scalar,
-)
-
-import pandas as pd
-from pandas import DataFrame
-
-
-def convert_to_line_delimits(s: str) -> str:
- """
- Helper function that converts JSON lists to line delimited JSON.
- """
- # Determine we have a JSON list to turn to lines otherwise just return the
- # json object, only lists can
- if not (s[0] == "[" and s[-1] == "]"):
- return s
- s = s[1:-1]
-
- return convert_json_to_lines(s)
-
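# Illustrative sketch (not part of the deleted file): convert_to_line_delimits
# above delegates to the C routine convert_json_to_lines; conceptually the
# transformation is equivalent to re-serialising each record on its own line.
import json

records = json.loads('[{"a": 1}, {"a": 2}]')
line_delimited = "\n".join(json.dumps(r) for r in records)  # '{"a": 1}\n{"a": 2}'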
-
-def nested_to_record(
- ds,
- prefix: str = "",
- sep: str = ".",
- level: int = 0,
- max_level: int | None = None,
-):
- """
- A simplified json_normalize
-
- Converts a nested dict into a flat dict ("record"), unlike json_normalize,
- it does not attempt to extract a subset of the data.
-
- Parameters
- ----------
- ds : dict or list of dicts
- prefix: the prefix, optional, default: ""
- sep : str, default '.'
- Nested records will generate names separated by sep,
- e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
- level: int, optional, default: 0
- The number of levels in the json string.
-
- max_level: int, optional, default: None
- The max depth to normalize.
-
- Returns
- -------
- d - dict or list of dicts, matching `ds`
-
- Examples
- --------
- >>> nested_to_record(
- ... dict(flat1=1, dict1=dict(c=1, d=2), nested=dict(e=dict(c=1, d=2), d=2))
- ... )
- {\
-'flat1': 1, \
-'dict1.c': 1, \
-'dict1.d': 2, \
-'nested.e.c': 1, \
-'nested.e.d': 2, \
-'nested.d': 2\
-}
- """
- singleton = False
- if isinstance(ds, dict):
- ds = [ds]
- singleton = True
- new_ds = []
- for d in ds:
- new_d = copy.deepcopy(d)
- for k, v in d.items():
- # each key gets renamed with prefix
- if not isinstance(k, str):
- k = str(k)
- if level == 0:
- newkey = k
- else:
- newkey = prefix + sep + k
-
- # flatten if type is dict and
- # current dict level < maximum level provided and
- # only dicts gets recurse-flattened
- # only at level>1 do we rename the rest of the keys
- if not isinstance(v, dict) or (
- max_level is not None and level >= max_level
- ):
- if level != 0: # so we skip copying for top level, common case
- v = new_d.pop(k)
- new_d[newkey] = v
- continue
-
- v = new_d.pop(k)
- new_d.update(nested_to_record(v, newkey, sep, level + 1, max_level))
- new_ds.append(new_d)
-
- if singleton:
- return new_ds[0]
- return new_ds
-
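# Illustrative sketch (not part of the deleted file): nested_to_record above
# with a custom separator and a depth limit; nesting below max_level is kept.
nested_to_record({"a": {"b": {"c": 1}}, "x": 2}, sep="_", max_level=1)
# -> {'x': 2, 'a_b': {'c': 1}}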
-
-def _normalise_json(
- data: Any,
- key_string: str,
- normalized_dict: dict[str, Any],
- separator: str,
-) -> dict[str, Any]:
- """
- Main recursive function.
- Designed for the most basic use case of pd.json_normalize(data),
- intended as a performance improvement; see #15621.
-
- Parameters
- ----------
- data : Any
- Type dependent on types contained within nested Json
- key_string : str
- New key (with separator(s) in) for data
- normalized_dict : dict
- The new normalized/flattened Json dict
- separator : str, default '.'
- Nested records will generate names separated by sep,
- e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
- """
- if isinstance(data, dict):
- for key, value in data.items():
- new_key = f"{key_string}{separator}{key}"
-
- if not key_string:
- if sys.version_info < (3, 9):
- from pandas.util._str_methods import removeprefix
-
- new_key = removeprefix(new_key, separator)
- else:
- new_key = new_key.removeprefix(separator)
-
- _normalise_json(
- data=value,
- key_string=new_key,
- normalized_dict=normalized_dict,
- separator=separator,
- )
- else:
- normalized_dict[key_string] = data
- return normalized_dict
-
-
-def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
- """
- Order the top level keys and then recursively go to depth
-
- Parameters
- ----------
- data : dict or list of dicts
- separator : str, default '.'
- Nested records will generate names separated by sep,
- e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
-
- Returns
- -------
- dict or list of dicts, matching `normalised_json_object`
- """
- top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
- nested_dict_ = _normalise_json(
- data={k: v for k, v in data.items() if isinstance(v, dict)},
- key_string="",
- normalized_dict={},
- separator=separator,
- )
- return {**top_dict_, **nested_dict_}
-
-
-def _simple_json_normalize(
- ds: dict | list[dict],
- sep: str = ".",
-) -> dict | list[dict] | Any:
- """
- An optimized basic json_normalize
-
- Converts a nested dict into a flat dict ("record"); unlike
- json_normalize and nested_to_record it doesn't do anything clever,
- but for the most basic use cases it enhances performance.
- E.g. pd.json_normalize(data)
-
- Parameters
- ----------
- ds : dict or list of dicts
- sep : str, default '.'
- Nested records will generate names separated by sep,
- e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar
-
- Returns
- -------
- dict or list of dicts, matching `normalised_json_object`
-
- Examples
- --------
- >>> _simple_json_normalize(
- ... {
- ... "flat1": 1,
- ... "dict1": {"c": 1, "d": 2},
- ... "nested": {"e": {"c": 1, "d": 2}, "d": 2},
- ... }
- ... )
- {\
-'flat1': 1, \
-'dict1.c': 1, \
-'dict1.d': 2, \
-'nested.e.c': 1, \
-'nested.e.d': 2, \
-'nested.d': 2\
-}
-
- """
- normalised_json_object = {}
- # expect a dictionary, as most jsons are. However, lists are perfectly valid
- if isinstance(ds, dict):
- normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
- elif isinstance(ds, list):
- normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
- return normalised_json_list
- return normalised_json_object
-
-
-def json_normalize(
- data: dict | list[dict],
- record_path: str | list | None = None,
- meta: str | list[str | list[str]] | None = None,
- meta_prefix: str | None = None,
- record_prefix: str | None = None,
- errors: IgnoreRaise = "raise",
- sep: str = ".",
- max_level: int | None = None,
-) -> DataFrame:
- """
- Normalize semi-structured JSON data into a flat table.
-
- Parameters
- ----------
- data : dict or list of dicts
- Unserialized JSON objects.
- record_path : str or list of str, default None
- Path in each object to list of records. If not passed, data will be
- assumed to be an array of records.
- meta : list of paths (str or list of str), default None
- Fields to use as metadata for each record in resulting table.
- meta_prefix : str, default None
- If not None, prefix records with dotted path, e.g. foo.bar.field if
- meta is ['foo', 'bar'].
- record_prefix : str, default None
- If not None, prefix records with dotted path, e.g. foo.bar.field if
- path to records is ['foo', 'bar'].
- errors : {'raise', 'ignore'}, default 'raise'
- Configures error handling.
-
- * 'ignore' : will ignore KeyError if keys listed in meta are not
- always present.
- * 'raise' : will raise KeyError if keys listed in meta are not
- always present.
- sep : str, default '.'
- Nested records will generate names separated by sep.
- e.g., for sep='.', {'foo': {'bar': 0}} -> foo.bar.
- max_level : int, default None
- Max number of levels(depth of dict) to normalize.
- if None, normalizes all levels.
-
- Returns
- -------
- frame : DataFrame
- Normalize semi-structured JSON data into a flat table.
-
- Examples
- --------
- >>> data = [
- ... {"id": 1, "name": {"first": "Coleen", "last": "Volk"}},
- ... {"name": {"given": "Mark", "family": "Regner"}},
- ... {"id": 2, "name": "Faye Raker"},
- ... ]
- >>> pd.json_normalize(data)
- id name.first name.last name.given name.family name
- 0 1.0 Coleen Volk NaN NaN NaN
- 1 NaN NaN NaN Mark Regner NaN
- 2 2.0 NaN NaN NaN NaN Faye Raker
-
- >>> data = [
- ... {
- ... "id": 1,
- ... "name": "Cole Volk",
- ... "fitness": {"height": 130, "weight": 60},
- ... },
- ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
- ... {
- ... "id": 2,
- ... "name": "Faye Raker",
- ... "fitness": {"height": 130, "weight": 60},
- ... },
- ... ]
- >>> pd.json_normalize(data, max_level=0)
- id name fitness
- 0 1.0 Cole Volk {'height': 130, 'weight': 60}
- 1 NaN Mark Reg {'height': 130, 'weight': 60}
- 2 2.0 Faye Raker {'height': 130, 'weight': 60}
-
- Normalizes nested data up to level 1.
-
- >>> data = [
- ... {
- ... "id": 1,
- ... "name": "Cole Volk",
- ... "fitness": {"height": 130, "weight": 60},
- ... },
- ... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
- ... {
- ... "id": 2,
- ... "name": "Faye Raker",
- ... "fitness": {"height": 130, "weight": 60},
- ... },
- ... ]
- >>> pd.json_normalize(data, max_level=1)
- id name fitness.height fitness.weight
- 0 1.0 Cole Volk 130 60
- 1 NaN Mark Reg 130 60
- 2 2.0 Faye Raker 130 60
-
- >>> data = [
- ... {
- ... "state": "Florida",
- ... "shortname": "FL",
- ... "info": {"governor": "Rick Scott"},
- ... "counties": [
- ... {"name": "Dade", "population": 12345},
- ... {"name": "Broward", "population": 40000},
- ... {"name": "Palm Beach", "population": 60000},
- ... ],
- ... },
- ... {
- ... "state": "Ohio",
- ... "shortname": "OH",
- ... "info": {"governor": "John Kasich"},
- ... "counties": [
- ... {"name": "Summit", "population": 1234},
- ... {"name": "Cuyahoga", "population": 1337},
- ... ],
- ... },
- ... ]
- >>> result = pd.json_normalize(
- ... data, "counties", ["state", "shortname", ["info", "governor"]]
- ... )
- >>> result
- name population state shortname info.governor
- 0 Dade 12345 Florida FL Rick Scott
- 1 Broward 40000 Florida FL Rick Scott
- 2 Palm Beach 60000 Florida FL Rick Scott
- 3 Summit 1234 Ohio OH John Kasich
- 4 Cuyahoga 1337 Ohio OH John Kasich
-
- >>> data = {"A": [1, 2]}
- >>> pd.json_normalize(data, "A", record_prefix="Prefix.")
- Prefix.0
- 0 1
- 1 2
-
- Returns normalized data with columns prefixed with the given string.
- """
-
- def _pull_field(
- js: dict[str, Any], spec: list | str, extract_record: bool = False
- ) -> Scalar | Iterable:
- """Internal function to pull field"""
- result = js
- try:
- if isinstance(spec, list):
- for field in spec:
- if result is None:
- raise KeyError(field)
- result = result[field]
- else:
- result = result[spec]
- except KeyError as e:
- if extract_record:
- raise KeyError(
- f"Key {e} not found. If specifying a record_path, all elements of "
- f"data should have the path."
- ) from e
- if errors == "ignore":
- return np.nan
- else:
- raise KeyError(
- f"Key {e} not found. To replace missing values of {e} with "
- f"np.nan, pass in errors='ignore'"
- ) from e
-
- return result
-
- def _pull_records(js: dict[str, Any], spec: list | str) -> list:
- """
- Internal function to pull the field for records. Similar to
- _pull_field, but it must return a list, and it raises an error
- if the value is not a list (or null).
- """
- result = _pull_field(js, spec, extract_record=True)
-
- # GH 31507, GH 30145, GH 26284: if result is not a list, raise TypeError
- # unless it is null, in which case return an empty list
- if not isinstance(result, list):
- if pd.isnull(result):
- result = []
- else:
- raise TypeError(
- f"{js} has non list value {result} for path {spec}. "
- "Must be list or null."
- )
- return result
-
- if isinstance(data, list) and not data:
- return DataFrame()
- elif isinstance(data, dict):
- # A bit of a hackjob
- data = [data]
- elif isinstance(data, abc.Iterable) and not isinstance(data, str):
- # GH35923 Fix pd.json_normalize to not skip the first element of a
- # generator input
- data = list(data)
- else:
- raise NotImplementedError
-
- # check to see if a simple recursive function is possible to
- # improve performance (see GH#15621), but only for simple cases such
- # as pd.json_normalize(data) or pd.json_normalize(data, sep=sep)
- if (
- record_path is None
- and meta is None
- and meta_prefix is None
- and record_prefix is None
- and max_level is None
- ):
- return DataFrame(_simple_json_normalize(data, sep=sep))
-
- if record_path is None:
- if any([isinstance(x, dict) for x in y.values()] for y in data):
- # naive normalization, this is idempotent for flat records
- # and potentially will inflate the data considerably for
- # deeply nested structures:
- # {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
- #
- # TODO: handle record value which are lists, at least error
- # reasonably
- data = nested_to_record(data, sep=sep, max_level=max_level)
- return DataFrame(data)
- elif not isinstance(record_path, list):
- record_path = [record_path]
-
- if meta is None:
- meta = []
- elif not isinstance(meta, list):
- meta = [meta]
-
- _meta = [m if isinstance(m, list) else [m] for m in meta]
-
- # Disastrously inefficient for now
- records: list = []
- lengths = []
-
- meta_vals: DefaultDict = defaultdict(list)
- meta_keys = [sep.join(val) for val in _meta]
-
- def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
- if isinstance(data, dict):
- data = [data]
- if len(path) > 1:
- for obj in data:
- for val, key in zip(_meta, meta_keys):
- if level + 1 == len(val):
- seen_meta[key] = _pull_field(obj, val[-1])
-
- _recursive_extract(obj[path[0]], path[1:], seen_meta, level=level + 1)
- else:
- for obj in data:
- recs = _pull_records(obj, path[0])
- recs = [
- nested_to_record(r, sep=sep, max_level=max_level)
- if isinstance(r, dict)
- else r
- for r in recs
- ]
-
- # For repeating the metadata later
- lengths.append(len(recs))
- for val, key in zip(_meta, meta_keys):
- if level + 1 > len(val):
- meta_val = seen_meta[key]
- else:
- meta_val = _pull_field(obj, val[level:])
- meta_vals[key].append(meta_val)
- records.extend(recs)
-
- _recursive_extract(data, record_path, {}, level=0)
-
- result = DataFrame(records)
-
- if record_prefix is not None:
- result = result.rename(columns=lambda x: f"{record_prefix}{x}")
-
- # Data types, a problem
- for k, v in meta_vals.items():
- if meta_prefix is not None:
- k = meta_prefix + k
-
- if k in result:
- raise ValueError(
- f"Conflicting metadata name {k}, need distinguishing prefix "
- )
- result[k] = np.array(v, dtype=object).repeat(lengths)
- return result
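The errors parameter of the removed json_normalize above only matters when meta keys are missing from some records: the default errors='raise' propagates the KeyError raised by _pull_field, while errors='ignore' fills np.nan instead. A minimal illustrative sketch (field names below are invented for illustration; console rendering is approximate):

>>> data = [
...     {"id": 1, "info": {"name": "a"}, "tags": [{"t": "x"}]},
...     {"id": 2, "tags": [{"t": "y"}]},
... ]
>>> pd.json_normalize(
...     data, record_path="tags", meta=["id", ["info", "name"]], errors="ignore"
... )
   t  id info.name
0  x   1         a
1  y   2       NaN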
diff --git a/contrib/python/pandas/py3/pandas/io/json/_table_schema.py b/contrib/python/pandas/py3/pandas/io/json/_table_schema.py
deleted file mode 100644
index 372aaf98c3b..00000000000
--- a/contrib/python/pandas/py3/pandas/io/json/_table_schema.py
+++ /dev/null
@@ -1,382 +0,0 @@
-"""
-Table Schema builders
-
-https://specs.frictionlessdata.io/table-schema/
-"""
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Any,
- cast,
-)
-import warnings
-
-from pandas._libs.json import loads
-from pandas._libs.tslibs import timezones
-from pandas._typing import (
- DtypeObj,
- JSONSerializable,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.base import _registry as registry
-from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_categorical_dtype,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_extension_array_dtype,
- is_integer_dtype,
- is_numeric_dtype,
- is_period_dtype,
- is_string_dtype,
- is_timedelta64_dtype,
-)
-from pandas.core.dtypes.dtypes import CategoricalDtype
-
-from pandas import DataFrame
-import pandas.core.common as com
-
-if TYPE_CHECKING:
- from pandas import Series
- from pandas.core.indexes.multi import MultiIndex
-
-
-TABLE_SCHEMA_VERSION = "1.4.0"
-
-
-def as_json_table_type(x: DtypeObj) -> str:
- """
- Convert a NumPy / pandas type to its corresponding json_table.
-
- Parameters
- ----------
- x : np.dtype or ExtensionDtype
-
- Returns
- -------
- str
-     The Table Schema data type.
-
- Notes
- -----
- This table shows the relationship between NumPy / pandas dtypes,
- and Table Schema dtypes.
-
- ===============  =================
- Pandas type      Table Schema type
- ===============  =================
- int64            integer
- float64          number
- bool             boolean
- datetime64[ns]   datetime
- timedelta64[ns]  duration
- object           string
- categorical      any
- ===============  =================
- """
- if is_integer_dtype(x):
- return "integer"
- elif is_bool_dtype(x):
- return "boolean"
- elif is_numeric_dtype(x):
- return "number"
- elif is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x):
- return "datetime"
- elif is_timedelta64_dtype(x):
- return "duration"
- elif is_categorical_dtype(x):
- return "any"
- elif is_extension_array_dtype(x):
- return "any"
- elif is_string_dtype(x):
- return "string"
- else:
- return "any"
-
-
-def set_default_names(data):
- """Sets index names to 'index' for regular, or 'level_x' for Multi"""
- if com.all_not_none(*data.index.names):
- nms = data.index.names
- if len(nms) == 1 and data.index.name == "index":
- warnings.warn(
- "Index name of 'index' is not round-trippable.",
- stacklevel=find_stack_level(),
- )
- elif len(nms) > 1 and any(x.startswith("level_") for x in nms):
- warnings.warn(
- "Index names beginning with 'level_' are not round-trippable.",
- stacklevel=find_stack_level(),
- )
- return data
-
- data = data.copy()
- if data.index.nlevels > 1:
- data.index.names = com.fill_missing_names(data.index.names)
- else:
- data.index.name = data.index.name or "index"
- return data
-
-
-def convert_pandas_type_to_json_field(arr) -> dict[str, JSONSerializable]:
- dtype = arr.dtype
- name: JSONSerializable
- if arr.name is None:
- name = "values"
- else:
- name = arr.name
- field: dict[str, JSONSerializable] = {
- "name": name,
- "type": as_json_table_type(dtype),
- }
-
- if is_categorical_dtype(dtype):
- cats = dtype.categories
- ordered = dtype.ordered
-
- field["constraints"] = {"enum": list(cats)}
- field["ordered"] = ordered
- elif is_period_dtype(dtype):
- field["freq"] = dtype.freq.freqstr
- elif is_datetime64tz_dtype(dtype):
- if timezones.is_utc(dtype.tz):
- # timezone.utc has no "zone" attr
- field["tz"] = "UTC"
- else:
- field["tz"] = dtype.tz.zone
- elif is_extension_array_dtype(dtype):
- field["extDtype"] = dtype.name
- return field
-
-
-def convert_json_field_to_pandas_type(field) -> str | CategoricalDtype:
- """
- Converts a JSON field descriptor into its corresponding NumPy / pandas type
-
- Parameters
- ----------
- field
- A JSON field descriptor
-
- Returns
- -------
- dtype
-
- Raises
- ------
- ValueError
- If the type of the provided field is unknown or currently unsupported
-
- Examples
- --------
- >>> convert_json_field_to_pandas_type({"name": "an_int", "type": "integer"})
- 'int64'
-
- >>> convert_json_field_to_pandas_type(
- ... {
- ... "name": "a_categorical",
- ... "type": "any",
- ... "constraints": {"enum": ["a", "b", "c"]},
- ... "ordered": True,
- ... }
- ... )
- CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)
-
- >>> convert_json_field_to_pandas_type({"name": "a_datetime", "type": "datetime"})
- 'datetime64[ns]'
-
- >>> convert_json_field_to_pandas_type(
- ... {"name": "a_datetime_with_tz", "type": "datetime", "tz": "US/Central"}
- ... )
- 'datetime64[ns, US/Central]'
- """
- typ = field["type"]
- if typ == "string":
- return "object"
- elif typ == "integer":
- return field.get("extDtype", "int64")
- elif typ == "number":
- return field.get("extDtype", "float64")
- elif typ == "boolean":
- return field.get("extDtype", "bool")
- elif typ == "duration":
- return "timedelta64"
- elif typ == "datetime":
- if field.get("tz"):
- return f"datetime64[ns, {field['tz']}]"
- elif field.get("freq"):
- # GH#47747 using datetime over period to minimize the change surface
- return f"period[{field['freq']}]"
- else:
- return "datetime64[ns]"
- elif typ == "any":
- if "constraints" in field and "ordered" in field:
- return CategoricalDtype(
- categories=field["constraints"]["enum"], ordered=field["ordered"]
- )
- elif "extDtype" in field:
- return registry.find(field["extDtype"])
- else:
- return "object"
-
- raise ValueError(f"Unsupported or invalid field type: {typ}")
-
-
-def build_table_schema(
- data: DataFrame | Series,
- index: bool = True,
- primary_key: bool | None = None,
- version: bool = True,
-) -> dict[str, JSONSerializable]:
- """
- Create a Table schema from ``data``.
-
- Parameters
- ----------
- data : Series, DataFrame
- index : bool, default True
- Whether to include ``data.index`` in the schema.
- primary_key : bool or None, default None
- Column names to designate as the primary key.
- The default `None` will set `'primaryKey'` to the index
- level or levels if the index is unique.
- version : bool, default True
- Whether to include a field `pandas_version` with the version
- of pandas that last revised the table schema. This version
- can be different from the installed pandas version.
-
- Returns
- -------
- dict
-
- Notes
- -----
- See `Table Schema
- <https://pandas.pydata.org/docs/user_guide/io.html#table-schema>`__ for
- conversion types.
- Timedeltas are converted to ISO 8601 duration format with
- 9 decimal places after the seconds field for nanosecond precision.
-
- Categoricals are converted to the `any` dtype, and use the `enum` field
- constraint to list the allowed values. The `ordered` attribute is included
- in an `ordered` field.
-
- Examples
- --------
- >>> from pandas.io.json._table_schema import build_table_schema
- >>> df = pd.DataFrame(
- ... {'A': [1, 2, 3],
- ... 'B': ['a', 'b', 'c'],
- ... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
- ... }, index=pd.Index(range(3), name='idx'))
- >>> build_table_schema(df)
- {'fields': \
-[{'name': 'idx', 'type': 'integer'}, \
-{'name': 'A', 'type': 'integer'}, \
-{'name': 'B', 'type': 'string'}, \
-{'name': 'C', 'type': 'datetime'}], \
-'primaryKey': ['idx'], \
-'pandas_version': '1.4.0'}
- """
- if index is True:
- data = set_default_names(data)
-
- schema: dict[str, Any] = {}
- fields = []
-
- if index:
- if data.index.nlevels > 1:
- data.index = cast("MultiIndex", data.index)
- for level, name in zip(data.index.levels, data.index.names):
- new_field = convert_pandas_type_to_json_field(level)
- new_field["name"] = name
- fields.append(new_field)
- else:
- fields.append(convert_pandas_type_to_json_field(data.index))
-
- if data.ndim > 1:
- for column, s in data.items():
- fields.append(convert_pandas_type_to_json_field(s))
- else:
- fields.append(convert_pandas_type_to_json_field(data))
-
- schema["fields"] = fields
- if index and data.index.is_unique and primary_key is None:
- if data.index.nlevels == 1:
- schema["primaryKey"] = [data.index.name]
- else:
- schema["primaryKey"] = data.index.names
- elif primary_key is not None:
- schema["primaryKey"] = primary_key
-
- if version:
- schema["pandas_version"] = TABLE_SCHEMA_VERSION
- return schema
-
-
-def parse_table_schema(json, precise_float):
- """
- Builds a DataFrame from a given schema
-
- Parameters
- ----------
- json :
- A JSON table schema
- precise_float : bool
- Flag controlling precision when decoding string to double values, as
- dictated by ``read_json``
-
- Returns
- -------
- df : DataFrame
-
- Raises
- ------
- NotImplementedError
- If the JSON table schema contains either timezone or timedelta data
-
- Notes
- -----
- Because :func:`DataFrame.to_json` uses the string 'index' to denote a
- name-less :class:`Index`, this function sets the name of the returned
- :class:`DataFrame` to ``None`` when said string is encountered with a
- normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
- applies to any strings beginning with 'level_'. Therefore, an
- :class:`Index` name of 'index' and :class:`MultiIndex` names starting
- with 'level_' are not supported.
-
- See Also
- --------
- build_table_schema : Inverse function.
- pandas.read_json
- """
- table = loads(json, precise_float=precise_float)
- col_order = [field["name"] for field in table["schema"]["fields"]]
- df = DataFrame(table["data"], columns=col_order)[col_order]
-
- dtypes = {
- field["name"]: convert_json_field_to_pandas_type(field)
- for field in table["schema"]["fields"]
- }
-
- # No ISO constructor for Timedelta as of yet, so need to raise
- if "timedelta64" in dtypes.values():
- raise NotImplementedError(
- 'table="orient" can not yet read ISO-formatted Timedelta data'
- )
-
- df = df.astype(dtypes)
-
- if "primaryKey" in table["schema"]:
- df = df.set_index(table["schema"]["primaryKey"])
- if len(df.index.names) == 1:
- if df.index.name == "index":
- df.index.name = None
- else:
- df.index.names = [
- None if x.startswith("level_") else x for x in df.index.names
- ]
-
- return df
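The removed table-schema helpers above back the public orient="table" JSON round trip (build_table_schema on write, parse_table_schema on read). A minimal sketch of that round trip for this vendored pandas, with approximate console rendering:

>>> import io
>>> df = pd.DataFrame({"A": [1, 2]}, index=pd.Index([0, 1], name="idx"))
>>> payload = df.to_json(orient="table")
>>> pd.read_json(io.StringIO(payload), orient="table")
     A
idx
0    1
1    2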
diff --git a/contrib/python/pandas/py3/pandas/io/orc.py b/contrib/python/pandas/py3/pandas/io/orc.py
deleted file mode 100644
index abecfdd464a..00000000000
--- a/contrib/python/pandas/py3/pandas/io/orc.py
+++ /dev/null
@@ -1,205 +0,0 @@
-""" orc compat """
-from __future__ import annotations
-
-import io
-from types import ModuleType
-from typing import (
- Any,
- Literal,
-)
-
-from pandas._libs import lib
-from pandas._typing import (
- DtypeBackend,
- FilePath,
- ReadBuffer,
- WriteBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_interval_dtype,
- is_period_dtype,
- is_unsigned_integer_dtype,
-)
-
-import pandas as pd
-from pandas.core.frame import DataFrame
-
-from pandas.io.common import get_handle
-
-
-def read_orc(
- path: FilePath | ReadBuffer[bytes],
- columns: list[str] | None = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwargs,
-) -> DataFrame:
- """
- Load an ORC object from the file path, returning a DataFrame.
-
- Parameters
- ----------
- path : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``read()`` function. The string could be a URL.
- Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be:
- ``file://localhost/path/to/table.orc``.
- columns : list, default None
- If not None, only these columns will be read from the file.
- Output always follows the ordering of the file and not the columns list.
- This mirrors the original behaviour of
- :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
-     Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
-     arrays. Nullable dtypes are used for all dtypes that have a nullable
-     implementation when "numpy_nullable" is set; pyarrow is used for all
-     dtypes if "pyarrow" is set.
-
-     The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- **kwargs
- Any additional kwargs are passed to pyarrow.
-
- Returns
- -------
- DataFrame
-
- Notes
- -----
- Before using this function you should read the :ref:`user guide about ORC <io.orc>`
- and :ref:`install optional dependencies <install.warn_orc>`.
- """
- # we require a newer version of pyarrow than we support for parquet
-
- orc = import_optional_dependency("pyarrow.orc")
-
- check_dtype_backend(dtype_backend)
-
- with get_handle(path, "rb", is_text=False) as handles:
- orc_file = orc.ORCFile(handles.handle)
- pa_table = orc_file.read(columns=columns, **kwargs)
- if dtype_backend is not lib.no_default:
- if dtype_backend == "pyarrow":
- df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
- else:
- from pandas.io._util import _arrow_dtype_mapping
-
- mapping = _arrow_dtype_mapping()
- df = pa_table.to_pandas(types_mapper=mapping.get)
- return df
- else:
- return pa_table.to_pandas()
-
-
-def to_orc(
- df: DataFrame,
- path: FilePath | WriteBuffer[bytes] | None = None,
- *,
- engine: Literal["pyarrow"] = "pyarrow",
- index: bool | None = None,
- engine_kwargs: dict[str, Any] | None = None,
-) -> bytes | None:
- """
- Write a DataFrame to the ORC format.
-
- .. versionadded:: 1.5.0
-
- Parameters
- ----------
- df : DataFrame
-     The dataframe to be written to ORC. Raises NotImplementedError
-     if the dtype of one or more columns is category, unsigned integer,
-     interval, period or sparse.
- path : str, file-like object or None, default None
- If a string, it will be used as Root Directory path
- when writing a partitioned dataset. By file-like object,
- we refer to objects with a write() method, such as a file handle
- (e.g. via builtin open function). If path is None,
- a bytes object is returned.
- engine : str, default 'pyarrow'
- ORC library to use. Pyarrow must be >= 7.0.0.
- index : bool, optional
- If ``True``, include the dataframe's index(es) in the file output. If
- ``False``, they will not be written to the file.
- If ``None``, similar to ``infer`` the dataframe's index(es)
- will be saved. However, instead of being saved as values,
- the RangeIndex will be stored as a range in the metadata so it
- doesn't require much space and is faster. Other indexes will
- be included as columns in the file output.
- engine_kwargs : dict[str, Any] or None, default None
- Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
-
- Returns
- -------
- bytes if no path argument is provided else None
-
- Raises
- ------
- NotImplementedError
- Dtype of one or more columns is category, unsigned integers, interval,
- period or sparse.
- ValueError
- engine is not pyarrow.
-
- Notes
- -----
- * Before using this function you should read the
- :ref:`user guide about ORC <io.orc>` and
- :ref:`install optional dependencies <install.warn_orc>`.
- * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
- library.
- * For supported dtypes please refer to `supported ORC features in Arrow
- <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
- * Currently timezones in datetime columns are not preserved when a
- dataframe is converted into ORC files.
- """
- if index is None:
- index = df.index.names[0] is not None
- if engine_kwargs is None:
- engine_kwargs = {}
-
- # If unsupported dtypes are found raise NotImplementedError
- # In Pyarrow 9.0.0 this check will no longer be needed
- for dtype in df.dtypes:
- if (
- is_categorical_dtype(dtype)
- or is_interval_dtype(dtype)
- or is_period_dtype(dtype)
- or is_unsigned_integer_dtype(dtype)
- ):
- raise NotImplementedError(
- "The dtype of one or more columns is not supported yet."
- )
-
- if engine != "pyarrow":
- raise ValueError("engine must be 'pyarrow'")
- engine = import_optional_dependency(engine, min_version="7.0.0")
- orc = import_optional_dependency("pyarrow.orc")
-
- was_none = path is None
- if was_none:
- path = io.BytesIO()
- assert path is not None # For mypy
- with get_handle(path, "wb", is_text=False) as handles:
- assert isinstance(engine, ModuleType) # For mypy
- try:
- orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index),
- handles.handle,
- **engine_kwargs,
- )
- except TypeError as e:
- raise NotImplementedError(
- "The dtype of one or more columns is not supported yet."
- ) from e
-
- if was_none:
- assert isinstance(path, io.BytesIO) # For mypy
- return path.getvalue()
- return None
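The removed read_orc/to_orc pair above corresponds to the public pd.read_orc and DataFrame.to_orc. A minimal sketch of the in-memory bytes round trip, assuming pyarrow >= 7.0.0 is installed (console rendering approximate):

>>> import io
>>> df = pd.DataFrame({"a": [1, 2, 3]})
>>> raw = df.to_orc()              # bytes, because no path was given
>>> pd.read_orc(io.BytesIO(raw))
   a
0  1
1  2
2  3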
diff --git a/contrib/python/pandas/py3/pandas/io/parquet.py b/contrib/python/pandas/py3/pandas/io/parquet.py
deleted file mode 100644
index 18d9649a310..00000000000
--- a/contrib/python/pandas/py3/pandas/io/parquet.py
+++ /dev/null
@@ -1,516 +0,0 @@
-""" parquet compat """
-from __future__ import annotations
-
-import io
-import os
-from typing import (
- Any,
- Literal,
-)
-import warnings
-from warnings import catch_warnings
-
-from pandas._libs import lib
-from pandas._typing import (
- DtypeBackend,
- FilePath,
- ReadBuffer,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import doc
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import check_dtype_backend
-
-import pandas as pd
-from pandas import (
- DataFrame,
- get_option,
-)
-from pandas.core.shared_docs import _shared_docs
-from pandas.util.version import Version
-
-from pandas.io.common import (
- IOHandles,
- get_handle,
- is_fsspec_url,
- is_url,
- stringify_path,
-)
-
-
-def get_engine(engine: str) -> BaseImpl:
- """return our implementation"""
- if engine == "auto":
- engine = get_option("io.parquet.engine")
-
- if engine == "auto":
- # try engines in this order
- engine_classes = [PyArrowImpl, FastParquetImpl]
-
- error_msgs = ""
- for engine_class in engine_classes:
- try:
- return engine_class()
- except ImportError as err:
- error_msgs += "\n - " + str(err)
-
- raise ImportError(
- "Unable to find a usable engine; "
- "tried using: 'pyarrow', 'fastparquet'.\n"
- "A suitable version of "
- "pyarrow or fastparquet is required for parquet "
- "support.\n"
- "Trying to import the above resulted in these errors:"
- f"{error_msgs}"
- )
-
- if engine == "pyarrow":
- return PyArrowImpl()
- elif engine == "fastparquet":
- return FastParquetImpl()
-
- raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
-
-
-def _get_path_or_handle(
- path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
- fs: Any,
- storage_options: StorageOptions = None,
- mode: str = "rb",
- is_dir: bool = False,
-) -> tuple[
- FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
-]:
- """File handling for PyArrow."""
- path_or_handle = stringify_path(path)
- if is_fsspec_url(path_or_handle) and fs is None:
- fsspec = import_optional_dependency("fsspec")
-
- fs, path_or_handle = fsspec.core.url_to_fs(
- path_or_handle, **(storage_options or {})
- )
- elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
- # can't write to a remote url
- # without making use of fsspec at the moment
- raise ValueError("storage_options passed with buffer, or non-supported URL")
-
- handles = None
- if (
- not fs
- and not is_dir
- and isinstance(path_or_handle, str)
- and not os.path.isdir(path_or_handle)
- ):
- # use get_handle only when we are very certain that it is not a directory
- # fsspec resources can also point to directories
- # this branch is used for example when reading from non-fsspec URLs
- handles = get_handle(
- path_or_handle, mode, is_text=False, storage_options=storage_options
- )
- fs = None
- path_or_handle = handles.handle
- return path_or_handle, handles, fs
-
-
-class BaseImpl:
- @staticmethod
- def validate_dataframe(df: DataFrame) -> None:
- if not isinstance(df, DataFrame):
- raise ValueError("to_parquet only supports IO with DataFrames")
-
- def write(self, df: DataFrame, path, compression, **kwargs):
- raise AbstractMethodError(self)
-
- def read(self, path, columns=None, **kwargs) -> DataFrame:
- raise AbstractMethodError(self)
-
-
-class PyArrowImpl(BaseImpl):
- def __init__(self) -> None:
- import_optional_dependency(
- "pyarrow", extra="pyarrow is required for parquet support."
- )
- import pyarrow.parquet
-
- # import utils to register the pyarrow extension types
- import pandas.core.arrays.arrow.extension_types # pyright: ignore # noqa:F401
-
- self.api = pyarrow
-
- def write(
- self,
- df: DataFrame,
- path: FilePath | WriteBuffer[bytes],
- compression: str | None = "snappy",
- index: bool | None = None,
- storage_options: StorageOptions = None,
- partition_cols: list[str] | None = None,
- **kwargs,
- ) -> None:
- self.validate_dataframe(df)
-
- from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
- if index is not None:
- from_pandas_kwargs["preserve_index"] = index
-
- table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-
- path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
- path,
- kwargs.pop("filesystem", None),
- storage_options=storage_options,
- mode="wb",
- is_dir=partition_cols is not None,
- )
- if (
- isinstance(path_or_handle, io.BufferedWriter)
- and hasattr(path_or_handle, "name")
- and isinstance(path_or_handle.name, (str, bytes))
- ):
- path_or_handle = path_or_handle.name
- if isinstance(path_or_handle, bytes):
- path_or_handle = path_or_handle.decode()
-
- try:
- if partition_cols is not None:
- # writes to multiple files under the given path
- self.api.parquet.write_to_dataset(
- table,
- path_or_handle,
- compression=compression,
- partition_cols=partition_cols,
- **kwargs,
- )
- else:
- # write to single output file
- self.api.parquet.write_table(
- table, path_or_handle, compression=compression, **kwargs
- )
- finally:
- if handles is not None:
- handles.close()
-
- def read(
- self,
- path,
- columns=None,
- use_nullable_dtypes: bool = False,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- storage_options: StorageOptions = None,
- **kwargs,
- ) -> DataFrame:
- kwargs["use_pandas_metadata"] = True
-
- to_pandas_kwargs = {}
- if dtype_backend == "numpy_nullable":
- from pandas.io._util import _arrow_dtype_mapping
-
- mapping = _arrow_dtype_mapping()
- to_pandas_kwargs["types_mapper"] = mapping.get
- elif dtype_backend == "pyarrow":
- to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] # noqa
-
- manager = get_option("mode.data_manager")
- if manager == "array":
- to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment]
-
- path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
- path,
- kwargs.pop("filesystem", None),
- storage_options=storage_options,
- mode="rb",
- )
- try:
- pa_table = self.api.parquet.read_table(
- path_or_handle, columns=columns, **kwargs
- )
- result = pa_table.to_pandas(**to_pandas_kwargs)
-
- if manager == "array":
- result = result._as_manager("array", copy=False)
- return result
- finally:
- if handles is not None:
- handles.close()
-
-
-class FastParquetImpl(BaseImpl):
- def __init__(self) -> None:
- # since pandas is a dependency of fastparquet
- # we need to import on first use
- fastparquet = import_optional_dependency(
- "fastparquet", extra="fastparquet is required for parquet support."
- )
- self.api = fastparquet
-
- def write(
- self,
- df: DataFrame,
- path,
- compression: Literal["snappy", "gzip", "brotli"] | None = "snappy",
- index=None,
- partition_cols=None,
- storage_options: StorageOptions = None,
- **kwargs,
- ) -> None:
- self.validate_dataframe(df)
-
- if "partition_on" in kwargs and partition_cols is not None:
- raise ValueError(
- "Cannot use both partition_on and "
- "partition_cols. Use partition_cols for partitioning data"
- )
- if "partition_on" in kwargs:
- partition_cols = kwargs.pop("partition_on")
-
- if partition_cols is not None:
- kwargs["file_scheme"] = "hive"
-
- # cannot use get_handle as write() does not accept file buffers
- path = stringify_path(path)
- if is_fsspec_url(path):
- fsspec = import_optional_dependency("fsspec")
-
- # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
- kwargs["open_with"] = lambda path, _: fsspec.open(
- path, "wb", **(storage_options or {})
- ).open()
- elif storage_options:
- raise ValueError(
- "storage_options passed with file object or non-fsspec file path"
- )
-
- with catch_warnings(record=True):
- self.api.write(
- path,
- df,
- compression=compression,
- write_index=index,
- partition_on=partition_cols,
- **kwargs,
- )
-
- def read(
- self, path, columns=None, storage_options: StorageOptions = None, **kwargs
- ) -> DataFrame:
- parquet_kwargs: dict[str, Any] = {}
- use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
- dtype_backend = kwargs.pop("dtype_backend", lib.no_default)
- if Version(self.api.__version__) >= Version("0.7.1"):
- # We are disabling nullable dtypes for fastparquet pending discussion
- parquet_kwargs["pandas_nulls"] = False
- if use_nullable_dtypes:
- raise ValueError(
- "The 'use_nullable_dtypes' argument is not supported for the "
- "fastparquet engine"
- )
- if dtype_backend is not lib.no_default:
- raise ValueError(
- "The 'dtype_backend' argument is not supported for the "
- "fastparquet engine"
- )
- path = stringify_path(path)
- handles = None
- if is_fsspec_url(path):
- fsspec = import_optional_dependency("fsspec")
-
- if Version(self.api.__version__) > Version("0.6.1"):
- parquet_kwargs["fs"] = fsspec.open(
- path, "rb", **(storage_options or {})
- ).fs
- else:
- parquet_kwargs["open_with"] = lambda path, _: fsspec.open(
- path, "rb", **(storage_options or {})
- ).open()
- elif isinstance(path, str) and not os.path.isdir(path):
- # use get_handle only when we are very certain that it is not a directory
- # fsspec resources can also point to directories
- # this branch is used for example when reading from non-fsspec URLs
- handles = get_handle(
- path, "rb", is_text=False, storage_options=storage_options
- )
- path = handles.handle
-
- try:
- parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
- return parquet_file.to_pandas(columns=columns, **kwargs)
- finally:
- if handles is not None:
- handles.close()
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-def to_parquet(
- df: DataFrame,
- path: FilePath | WriteBuffer[bytes] | None = None,
- engine: str = "auto",
- compression: str | None = "snappy",
- index: bool | None = None,
- storage_options: StorageOptions = None,
- partition_cols: list[str] | None = None,
- **kwargs,
-) -> bytes | None:
- """
- Write a DataFrame to the parquet format.
-
- Parameters
- ----------
- df : DataFrame
- path : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function. If None, the result is
- returned as bytes. If a string, it will be used as Root Directory path
- when writing a partitioned dataset. The engine fastparquet does not
- accept file-like objects.
-
- .. versionchanged:: 1.2.0
-
- engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
- Parquet library to use. If 'auto', then the option
- ``io.parquet.engine`` is used. The default ``io.parquet.engine``
- behavior is to try 'pyarrow', falling back to 'fastparquet' if
- 'pyarrow' is unavailable.
- compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}}, default 'snappy'
-     Name of the compression to use. Use ``None`` for no compression.
-     The supported compression methods actually depend on which engine
-     is used. For 'pyarrow', 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
-     are all supported. For 'fastparquet', only 'gzip' and 'snappy' are
-     supported.
- index : bool, default None
- If ``True``, include the dataframe's index(es) in the file output. If
- ``False``, they will not be written to the file.
- If ``None``, similar to ``True`` the dataframe's index(es)
- will be saved. However, instead of being saved as values,
- the RangeIndex will be stored as a range in the metadata so it
- doesn't require much space and is faster. Other indexes will
- be included as columns in the file output.
- partition_cols : str or list, optional, default None
- Column names by which to partition the dataset.
- Columns are partitioned in the order they are given.
- Must be None if path is not a string.
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- kwargs
- Additional keyword arguments passed to the engine
-
- Returns
- -------
- bytes if no path argument is provided else None
- """
- if isinstance(partition_cols, str):
- partition_cols = [partition_cols]
- impl = get_engine(engine)
-
- path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
-
- impl.write(
- df,
- path_or_buf,
- compression=compression,
- index=index,
- partition_cols=partition_cols,
- storage_options=storage_options,
- **kwargs,
- )
-
- if path is None:
- assert isinstance(path_or_buf, io.BytesIO)
- return path_or_buf.getvalue()
- else:
- return None
-
-
-@doc(storage_options=_shared_docs["storage_options"])
-def read_parquet(
- path: FilePath | ReadBuffer[bytes],
- engine: str = "auto",
- columns: list[str] | None = None,
- storage_options: StorageOptions = None,
- use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwargs,
-) -> DataFrame:
- """
- Load a parquet object from the file path, returning a DataFrame.
-
- Parameters
- ----------
- path : str, path object or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``read()`` function.
- The string could be a URL. Valid URL schemes include http, ftp, s3,
- gs, and file. For file URLs, a host is expected. A local file could be:
- ``file://localhost/path/to/table.parquet``.
- A file URL can also be a path to a directory that contains multiple
- partitioned parquet files. Both pyarrow and fastparquet support
- paths to directories as well as file URLs. A directory path could be:
- ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
- engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
- Parquet library to use. If 'auto', then the option
- ``io.parquet.engine`` is used. The default ``io.parquet.engine``
- behavior is to try 'pyarrow', falling back to 'fastparquet' if
- 'pyarrow' is unavailable.
- columns : list, default None
- If not None, only these columns will be read from the file.
-
- {storage_options}
-
- .. versionadded:: 1.3.0
-
- use_nullable_dtypes : bool, default False
- If True, use dtypes that use ``pd.NA`` as missing value indicator
- for the resulting DataFrame. (only applicable for the ``pyarrow``
- engine)
- As new dtypes are added that support ``pd.NA`` in the future, the
- output with this option will change to use those dtypes.
-     Note: this is an experimental option, and behaviour (e.g. additional
-     supported dtypes) may change without notice.
-
- .. deprecated:: 2.0
-
- dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
-     Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
-     arrays. Nullable dtypes are used for all dtypes that have a nullable
-     implementation when "numpy_nullable" is set; pyarrow is used for all
-     dtypes if "pyarrow" is set.
-
-     The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- **kwargs
- Any additional kwargs are passed to the engine.
-
- Returns
- -------
- DataFrame
- """
- impl = get_engine(engine)
-
- if use_nullable_dtypes is not lib.no_default:
- msg = (
- "The argument 'use_nullable_dtypes' is deprecated and will be removed "
- "in a future version."
- )
- if use_nullable_dtypes is True:
- msg += (
- "Use dtype_backend='numpy_nullable' instead of use_nullable_dtype=True."
- )
- warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
- else:
- use_nullable_dtypes = False
- check_dtype_backend(dtype_backend)
-
- return impl.read(
- path,
- columns=columns,
- storage_options=storage_options,
- use_nullable_dtypes=use_nullable_dtypes,
- dtype_backend=dtype_backend,
- **kwargs,
- )
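In the removed parquet module above, to_parquet with path=None returns the file contents as bytes, and read_parquet can rebuild the frame with a nullable dtype backend. A minimal sketch, assuming pyarrow is installed (exact dtype repr may vary):

>>> import io
>>> df = pd.DataFrame({"a": [1.0, None, 3.0]})
>>> buf = df.to_parquet()          # bytes, because no path was given
>>> pd.read_parquet(io.BytesIO(buf), dtype_backend="numpy_nullable").dtypes
a    Float64
dtype: object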
diff --git a/contrib/python/pandas/py3/pandas/io/parsers/__init__.py b/contrib/python/pandas/py3/pandas/io/parsers/__init__.py
deleted file mode 100644
index ff11968db15..00000000000
--- a/contrib/python/pandas/py3/pandas/io/parsers/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from pandas.io.parsers.readers import (
- TextFileReader,
- TextParser,
- read_csv,
- read_fwf,
- read_table,
-)
-
-__all__ = ["TextFileReader", "TextParser", "read_csv", "read_fwf", "read_table"]
diff --git a/contrib/python/pandas/py3/pandas/io/parsers/arrow_parser_wrapper.py b/contrib/python/pandas/py3/pandas/io/parsers/arrow_parser_wrapper.py
deleted file mode 100644
index 8f7649226b8..00000000000
--- a/contrib/python/pandas/py3/pandas/io/parsers/arrow_parser_wrapper.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from __future__ import annotations
-
-from pandas._typing import ReadBuffer
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.dtypes.inference import is_integer
-
-import pandas as pd
-from pandas import DataFrame
-
-from pandas.io._util import _arrow_dtype_mapping
-from pandas.io.parsers.base_parser import ParserBase
-
-
-class ArrowParserWrapper(ParserBase):
- """
- Wrapper for the pyarrow engine for read_csv()
- """
-
- def __init__(self, src: ReadBuffer[bytes], **kwds) -> None:
- super().__init__(kwds)
- self.kwds = kwds
- self.src = src
-
- self._parse_kwds()
-
- def _parse_kwds(self):
- """
- Validates keywords before passing to pyarrow.
- """
- encoding: str | None = self.kwds.get("encoding")
- self.encoding = "utf-8" if encoding is None else encoding
-
- self.usecols, self.usecols_dtype = self._validate_usecols_arg(
- self.kwds["usecols"]
- )
- na_values = self.kwds["na_values"]
- if isinstance(na_values, dict):
- raise ValueError(
- "The pyarrow engine doesn't support passing a dict for na_values"
- )
- self.na_values = list(self.kwds["na_values"])
-
- def _get_pyarrow_options(self) -> None:
- """
- Rename some arguments to pass to pyarrow
- """
- mapping = {
- "usecols": "include_columns",
- "na_values": "null_values",
- "escapechar": "escape_char",
- "skip_blank_lines": "ignore_empty_lines",
- "decimal": "decimal_point",
- }
- for pandas_name, pyarrow_name in mapping.items():
- if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
- self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)
-
- self.parse_options = {
- option_name: option_value
- for option_name, option_value in self.kwds.items()
- if option_value is not None
- and option_name
- in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
- }
- self.convert_options = {
- option_name: option_value
- for option_name, option_value in self.kwds.items()
- if option_value is not None
- and option_name
- in (
- "include_columns",
- "null_values",
- "true_values",
- "false_values",
- "decimal_point",
- )
- }
- self.read_options = {
- "autogenerate_column_names": self.header is None,
- "skip_rows": self.header
- if self.header is not None
- else self.kwds["skiprows"],
- "encoding": self.encoding,
- }
-
- def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
- """
- Processes data read in based on kwargs.
-
- Parameters
- ----------
- frame: DataFrame
- The DataFrame to process.
-
- Returns
- -------
- DataFrame
- The processed DataFrame.
- """
- num_cols = len(frame.columns)
- multi_index_named = True
- if self.header is None:
- if self.names is None:
- if self.header is None:
- self.names = range(num_cols)
- if len(self.names) != num_cols:
- # usecols is passed through to pyarrow, we only handle index col here
- # The only way self.names is not the same length as number of cols is
- # if we have an int index_col. We should just pad the names (they will
- # get removed anyway) to the expected length then.
- self.names = list(range(num_cols - len(self.names))) + self.names
- multi_index_named = False
- frame.columns = self.names
- # we only need the frame not the names
- frame.columns, frame = self._do_date_conversions(frame.columns, frame)
- if self.index_col is not None:
- for i, item in enumerate(self.index_col):
- if is_integer(item):
- self.index_col[i] = frame.columns[item]
- else:
- # String case
- if item not in frame.columns:
- raise ValueError(f"Index {item} invalid")
- frame.set_index(self.index_col, drop=True, inplace=True)
- # Clear names if headerless and no name given
- if self.header is None and not multi_index_named:
- frame.index.names = [None] * len(frame.index.names)
-
- if self.kwds.get("dtype") is not None:
- try:
- frame = frame.astype(self.kwds.get("dtype"))
- except TypeError as e:
- # GH#44901 reraise to keep api consistent
- raise ValueError(e)
- return frame
-
- def read(self) -> DataFrame:
- """
- Reads the contents of a CSV file into a DataFrame and
- processes it according to the kwargs passed in the
- constructor.
-
- Returns
- -------
- DataFrame
- The DataFrame created from the CSV file.
- """
- pyarrow_csv = import_optional_dependency("pyarrow.csv")
- self._get_pyarrow_options()
-
- table = pyarrow_csv.read_csv(
- self.src,
- read_options=pyarrow_csv.ReadOptions(**self.read_options),
- parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
- convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
- )
- if self.kwds["dtype_backend"] == "pyarrow":
- frame = table.to_pandas(types_mapper=pd.ArrowDtype)
- elif self.kwds["dtype_backend"] == "numpy_nullable":
- frame = table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
- else:
- frame = table.to_pandas()
- return self._finalize_pandas_output(frame)
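The removed ArrowParserWrapper above is reached through read_csv(engine="pyarrow"), optionally combined with an Arrow-backed dtype_backend. A minimal sketch, assuming pyarrow is installed (dtype reprs approximate):

>>> import io
>>> buf = io.BytesIO(b"a,b\n1,x\n2,y\n")
>>> pd.read_csv(buf, engine="pyarrow", dtype_backend="pyarrow").dtypes
a     int64[pyarrow]
b    string[pyarrow]
dtype: object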
diff --git a/contrib/python/pandas/py3/pandas/io/parsers/base_parser.py b/contrib/python/pandas/py3/pandas/io/parsers/base_parser.py
deleted file mode 100644
index 22ab8607e06..00000000000
--- a/contrib/python/pandas/py3/pandas/io/parsers/base_parser.py
+++ /dev/null
@@ -1,1388 +0,0 @@
-from __future__ import annotations
-
-from collections import defaultdict
-from copy import copy
-import csv
-import datetime
-from enum import Enum
-import itertools
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Iterable,
- List,
- Mapping,
- Sequence,
- Tuple,
- cast,
- final,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- parsers,
-)
-import pandas._libs.ops as libops
-from pandas._libs.parsers import STR_NA_VALUES
-from pandas._libs.tslibs import parsing
-from pandas._typing import (
- ArrayLike,
- DtypeArg,
- DtypeObj,
- Scalar,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import (
- ParserError,
- ParserWarning,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.astype import astype_array
-from pandas.core.dtypes.common import (
- ensure_object,
- is_bool_dtype,
- is_dict_like,
- is_dtype_equal,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.dtypes import (
- CategoricalDtype,
- ExtensionDtype,
-)
-from pandas.core.dtypes.missing import isna
-
-from pandas import (
- ArrowDtype,
- DatetimeIndex,
- StringDtype,
-)
-from pandas.core import algorithms
-from pandas.core.arrays import (
- ArrowExtensionArray,
- BooleanArray,
- Categorical,
- ExtensionArray,
- FloatingArray,
- IntegerArray,
-)
-from pandas.core.arrays.boolean import BooleanDtype
-from pandas.core.indexes.api import (
- Index,
- MultiIndex,
- default_index,
- ensure_index_from_sequences,
-)
-from pandas.core.series import Series
-from pandas.core.tools import datetimes as tools
-
-from pandas.io.common import is_potential_multi_index
-
-if TYPE_CHECKING:
- from pandas import DataFrame
-
-
-class ParserBase:
- class BadLineHandleMethod(Enum):
- ERROR = 0
- WARN = 1
- SKIP = 2
-
- _implicit_index: bool = False
- _first_chunk: bool
-
- def __init__(self, kwds) -> None:
- self.names = kwds.get("names")
- self.orig_names: Sequence[Hashable] | None = None
-
- self.index_col = kwds.get("index_col", None)
- self.unnamed_cols: set = set()
- self.index_names: Sequence[Hashable] | None = None
- self.col_names: Sequence[Hashable] | None = None
-
- self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
- self._parse_date_cols: Iterable = []
- self.date_parser = kwds.pop("date_parser", lib.no_default)
- self.date_format = kwds.pop("date_format", None)
- self.dayfirst = kwds.pop("dayfirst", False)
- self.keep_date_col = kwds.pop("keep_date_col", False)
-
- self.na_values = kwds.get("na_values")
- self.na_fvalues = kwds.get("na_fvalues")
- self.na_filter = kwds.get("na_filter", False)
- self.keep_default_na = kwds.get("keep_default_na", True)
-
- self.dtype = copy(kwds.get("dtype", None))
- self.converters = kwds.get("converters")
- self.dtype_backend = kwds.get("dtype_backend")
-
- self.true_values = kwds.get("true_values")
- self.false_values = kwds.get("false_values")
- self.cache_dates = kwds.pop("cache_dates", True)
-
- self._date_conv = _make_date_converter(
- date_parser=self.date_parser,
- date_format=self.date_format,
- dayfirst=self.dayfirst,
- cache_dates=self.cache_dates,
- )
-
- # validate header options for mi
- self.header = kwds.get("header")
- if is_list_like(self.header, allow_sets=False):
- if kwds.get("usecols"):
- raise ValueError(
- "cannot specify usecols when specifying a multi-index header"
- )
- if kwds.get("names"):
- raise ValueError(
- "cannot specify names when specifying a multi-index header"
- )
-
- # validate index_col that only contains integers
- if self.index_col is not None:
- if not (
- is_list_like(self.index_col, allow_sets=False)
- and all(map(is_integer, self.index_col))
- or is_integer(self.index_col)
- ):
- raise ValueError(
- "index_col must only contain row numbers "
- "when specifying a multi-index header"
- )
-
- self._name_processed = False
-
- self._first_chunk = True
-
- self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
-
- # Fallback to error to pass a sketchy test (test_override_set_noconvert_columns)
- # Normally, this arg would get pre-processed earlier on
- self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
-
- def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
- """
- Check if parse_dates are in columns.
-
- If user has provided names for parse_dates, check if those columns
- are available.
-
- Parameters
- ----------
- columns : list
- List of names of the dataframe.
-
- Returns
- -------
- The names of the columns which will get parsed later if a dict or list
- is given as specification.
-
- Raises
- ------
- ValueError
-     If a column listed in parse_dates is not in the dataframe.
-
- """
- cols_needed: Iterable
- if is_dict_like(self.parse_dates):
- cols_needed = itertools.chain(*self.parse_dates.values())
- elif is_list_like(self.parse_dates):
- # a column in parse_dates could be represented
- # ColReference = Union[int, str]
- # DateGroups = List[ColReference]
- # ParseDates = Union[DateGroups, List[DateGroups],
- # Dict[ColReference, DateGroups]]
- cols_needed = itertools.chain.from_iterable(
- col if is_list_like(col) and not isinstance(col, tuple) else [col]
- for col in self.parse_dates
- )
- else:
- cols_needed = []
-
- cols_needed = list(cols_needed)
-
- # get only columns that are references using names (str), not by index
- missing_cols = ", ".join(
- sorted(
- {
- col
- for col in cols_needed
- if isinstance(col, str) and col not in columns
- }
- )
- )
- if missing_cols:
- raise ValueError(
- f"Missing column provided to 'parse_dates': '{missing_cols}'"
- )
- # Convert positions to actual column names
- return [
- col if (isinstance(col, str) or col in columns) else columns[col]
- for col in cols_needed
- ]
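The parse_dates specifications validated above cover plain column lists as well as dict/list groupings that combine several columns into a single date column. A minimal sketch of the dict form with read_csv (console rendering approximate):

>>> import io
>>> csv_buf = io.StringIO("y,m,d,v\n2020,2,1,5\n")
>>> pd.read_csv(csv_buf, parse_dates={"date": ["y", "m", "d"]}).dtypes
date    datetime64[ns]
v                int64
dtype: object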
-
- def close(self) -> None:
- pass
-
- @final
- @property
- def _has_complex_date_col(self) -> bool:
- return isinstance(self.parse_dates, dict) or (
- isinstance(self.parse_dates, list)
- and len(self.parse_dates) > 0
- and isinstance(self.parse_dates[0], list)
- )
-
- @final
- def _should_parse_dates(self, i: int) -> bool:
- if isinstance(self.parse_dates, bool):
- return self.parse_dates
- else:
- if self.index_names is not None:
- name = self.index_names[i]
- else:
- name = None
- j = i if self.index_col is None else self.index_col[i]
-
- if is_scalar(self.parse_dates):
- return (j == self.parse_dates) or (
- name is not None and name == self.parse_dates
- )
- else:
- return (j in self.parse_dates) or (
- name is not None and name in self.parse_dates
- )
-
- @final
- def _extract_multi_indexer_columns(
- self,
- header,
- index_names: Sequence[Hashable] | None,
- passed_names: bool = False,
- ) -> tuple[
- Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
- ]:
- """
- Extract and return the names, index_names, col_names if the column
- names are a MultiIndex.
-
- Parameters
- ----------
- header: list of lists
- The header rows
- index_names: list, optional
- The names of the future index
- passed_names: bool, default False
-     A flag specifying if names were passed
-
- """
- if len(header) < 2:
- return header[0], index_names, None, passed_names
-
- # the names are the tuples of the header that are not the index cols
- # 0 is the name of the index, assuming index_col is a list of column
- # numbers
- ic = self.index_col
- if ic is None:
- ic = []
-
- if not isinstance(ic, (list, tuple, np.ndarray)):
- ic = [ic]
- sic = set(ic)
-
- # clean the index_names
- index_names = header.pop(-1)
- index_names, _, _ = self._clean_index_names(index_names, self.index_col)
-
- # extract the columns
- field_count = len(header[0])
-
- # check if header lengths are equal
- if not all(len(header_iter) == field_count for header_iter in header[1:]):
- raise ParserError("Header rows must have an equal number of columns.")
-
- def extract(r):
- return tuple(r[i] for i in range(field_count) if i not in sic)
-
- columns = list(zip(*(extract(r) for r in header)))
- names = columns.copy()
- for single_ic in sorted(ic):
- names.insert(single_ic, single_ic)
-
- # Clean the column names (if we have an index_col).
- if len(ic):
- col_names = [
- r[ic[0]]
- if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
- else None
- for r in header
- ]
- else:
- col_names = [None] * len(header)
-
- passed_names = True
-
- return names, index_names, col_names, passed_names
-
- @final
- def _maybe_make_multi_index_columns(
- self,
- columns: Sequence[Hashable],
- col_names: Sequence[Hashable] | None = None,
- ) -> Sequence[Hashable] | MultiIndex:
- # possibly create a column mi here
- if is_potential_multi_index(columns):
- list_columns = cast(List[Tuple], columns)
- return MultiIndex.from_tuples(list_columns, names=col_names)
- return columns
-
- @final
- def _make_index(
- self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
- ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
- index: Index | None
- if not is_index_col(self.index_col) or not self.index_col:
- index = None
-
- elif not self._has_complex_date_col:
- simple_index = self._get_simple_index(alldata, columns)
- index = self._agg_index(simple_index)
- elif self._has_complex_date_col:
- if not self._name_processed:
- (self.index_names, _, self.index_col) = self._clean_index_names(
- list(columns), self.index_col
- )
- self._name_processed = True
- date_index = self._get_complex_date_index(data, columns)
- index = self._agg_index(date_index, try_parse_dates=False)
-
- # add names for the index
- if indexnamerow:
- coffset = len(indexnamerow) - len(columns)
- assert index is not None
- index = index.set_names(indexnamerow[:coffset])
-
- # maybe create a mi on the columns
- columns = self._maybe_make_multi_index_columns(columns, self.col_names)
-
- return index, columns
-
- @final
- def _get_simple_index(self, data, columns):
- def ix(col):
- if not isinstance(col, str):
- return col
- raise ValueError(f"Index {col} invalid")
-
- to_remove = []
- index = []
- for idx in self.index_col:
- i = ix(idx)
- to_remove.append(i)
- index.append(data[i])
-
- # remove index items from content and columns, don't pop in
- # loop
- for i in sorted(to_remove, reverse=True):
- data.pop(i)
- if not self._implicit_index:
- columns.pop(i)
-
- return index
-
- @final
- def _get_complex_date_index(self, data, col_names):
- def _get_name(icol):
- if isinstance(icol, str):
- return icol
-
- if col_names is None:
- raise ValueError(f"Must supply column order to use {icol!s} as index")
-
- for i, c in enumerate(col_names):
- if i == icol:
- return c
-
- to_remove = []
- index = []
- for idx in self.index_col:
- name = _get_name(idx)
- to_remove.append(name)
- index.append(data[name])
-
- # remove index items from content and columns, don't pop in
- # loop
- for c in sorted(to_remove, reverse=True):
- data.pop(c)
- col_names.remove(c)
-
- return index
-
- def _clean_mapping(self, mapping):
- """converts col numbers to names"""
- if not isinstance(mapping, dict):
- return mapping
- clean = {}
- # for mypy
- assert self.orig_names is not None
-
- for col, v in mapping.items():
- if isinstance(col, int) and col not in self.orig_names:
- col = self.orig_names[col]
- clean[col] = v
- if isinstance(mapping, defaultdict):
- remaining_cols = set(self.orig_names) - set(clean.keys())
- clean.update({col: mapping[col] for col in remaining_cols})
- return clean
-
- @final
- def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
- arrays = []
- converters = self._clean_mapping(self.converters)
-
- for i, arr in enumerate(index):
- if try_parse_dates and self._should_parse_dates(i):
- arr = self._date_conv(
- arr,
- col=self.index_names[i] if self.index_names is not None else None,
- )
-
- if self.na_filter:
- col_na_values = self.na_values
- col_na_fvalues = self.na_fvalues
- else:
- col_na_values = set()
- col_na_fvalues = set()
-
- if isinstance(self.na_values, dict):
- assert self.index_names is not None
- col_name = self.index_names[i]
- if col_name is not None:
- col_na_values, col_na_fvalues = _get_na_values(
- col_name, self.na_values, self.na_fvalues, self.keep_default_na
- )
-
- clean_dtypes = self._clean_mapping(self.dtype)
-
- cast_type = None
- index_converter = False
- if self.index_names is not None:
- if isinstance(clean_dtypes, dict):
- cast_type = clean_dtypes.get(self.index_names[i], None)
-
- if isinstance(converters, dict):
- index_converter = converters.get(self.index_names[i]) is not None
-
- try_num_bool = not (
- cast_type and is_string_dtype(cast_type) or index_converter
- )
-
- arr, _ = self._infer_types(
- arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
- )
- arrays.append(arr)
-
- names = self.index_names
- index = ensure_index_from_sequences(arrays, names)
-
- return index
-
- @final
- def _convert_to_ndarrays(
- self,
- dct: Mapping,
- na_values,
- na_fvalues,
- verbose: bool = False,
- converters=None,
- dtypes=None,
- ):
- result = {}
- for c, values in dct.items():
- conv_f = None if converters is None else converters.get(c, None)
- if isinstance(dtypes, dict):
- cast_type = dtypes.get(c, None)
- else:
- # single dtype or None
- cast_type = dtypes
-
- if self.na_filter:
- col_na_values, col_na_fvalues = _get_na_values(
- c, na_values, na_fvalues, self.keep_default_na
- )
- else:
- col_na_values, col_na_fvalues = set(), set()
-
- if c in self._parse_date_cols:
- # GH#26203 Do not convert columns which get converted to dates
- # but replace nans to ensure to_datetime works
- mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
- np.putmask(values, mask, np.nan)
- result[c] = values
- continue
-
- if conv_f is not None:
- # conv_f applied to data before inference
- if cast_type is not None:
- warnings.warn(
- (
- "Both a converter and dtype were specified "
- f"for column {c} - only the converter will be used."
- ),
- ParserWarning,
- stacklevel=find_stack_level(),
- )
-
- try:
- values = lib.map_infer(values, conv_f)
- except ValueError:
- # error: Argument 2 to "isin" has incompatible type "List[Any]";
- # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
- mask = algorithms.isin(
- values, list(na_values) # type: ignore[arg-type]
- ).view(np.uint8)
- values = lib.map_infer_mask(values, conv_f, mask)
-
- cvals, na_count = self._infer_types(
- values,
- set(col_na_values) | col_na_fvalues,
- cast_type is None,
- try_num_bool=False,
- )
- else:
- is_ea = is_extension_array_dtype(cast_type)
- is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
- # skip inference if specified dtype is object
- # or casting to an EA
- try_num_bool = not (cast_type and is_str_or_ea_dtype)
-
- # general type inference and conversion
- cvals, na_count = self._infer_types(
- values,
- set(col_na_values) | col_na_fvalues,
- cast_type is None,
- try_num_bool,
- )
-
- # type specified in dtype param or cast_type is an EA
- if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):
- if not is_ea and na_count > 0:
- if is_bool_dtype(cast_type):
- raise ValueError(f"Bool column has NA values in column {c}")
- cast_type = pandas_dtype(cast_type)
- cvals = self._cast_types(cvals, cast_type, c)
-
- result[c] = cvals
- if verbose and na_count:
- print(f"Filled {na_count} NA values in column {c!s}")
- return result
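Illustration of the converter-vs-dtype precedence above through the public read_csv API (a minimal sketch, assuming the pandas 2.0-line behaviour vendored here):
    import io
    import pandas as pd

    buf = io.StringIO("a\n1\n2\n")
    # Supplying both a converter and a dtype for the same column triggers the
    # ParserWarning above; only the converter is applied, so 'a' stays as strings.
    df = pd.read_csv(buf, converters={"a": str}, dtype={"a": "int64"}, engine="python")
    print(df["a"].tolist())  # expected: ['1', '2']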
-
- @final
- def _set_noconvert_dtype_columns(
- self, col_indices: list[int], names: Sequence[Hashable]
- ) -> set[int]:
- """
- Set the columns that should not undergo dtype conversions.
-
- Currently, any column that is involved with date parsing will not
- undergo such conversions. If usecols is specified, the positions of the columns
- not to cast are relative to usecols, not to all columns.
-
- Parameters
- ----------
- col_indices: The indices specifying order and positions of the columns
- names: The column names, in an order corresponding to the order
- of col_indices
-
- Returns
- -------
- A set of integers containing the positions of the columns not to convert.
- """
- usecols: list[int] | list[str] | None
- noconvert_columns = set()
- if self.usecols_dtype == "integer":
- # A set of integers will be converted to a list in
- # the correct order every single time.
- usecols = sorted(self.usecols)
- elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
- # The names attribute should have the correct columns
- # in the proper order for indexing with parse_dates.
- usecols = col_indices
- else:
- # Usecols is empty.
- usecols = None
-
- def _set(x) -> int:
- if usecols is not None and is_integer(x):
- x = usecols[x]
-
- if not is_integer(x):
- x = col_indices[names.index(x)]
-
- return x
-
- if isinstance(self.parse_dates, list):
- for val in self.parse_dates:
- if isinstance(val, list):
- for k in val:
- noconvert_columns.add(_set(k))
- else:
- noconvert_columns.add(_set(val))
-
- elif isinstance(self.parse_dates, dict):
- for val in self.parse_dates.values():
- if isinstance(val, list):
- for k in val:
- noconvert_columns.add(_set(k))
- else:
- noconvert_columns.add(_set(val))
-
- elif self.parse_dates:
- if isinstance(self.index_col, list):
- for k in self.index_col:
- noconvert_columns.add(_set(k))
- elif self.index_col is not None:
- noconvert_columns.add(_set(self.index_col))
-
- return noconvert_columns
-
- def _infer_types(
- self, values, na_values, no_dtype_specified, try_num_bool: bool = True
- ) -> tuple[ArrayLike, int]:
- """
- Infer types of values, possibly casting
-
- Parameters
- ----------
- values : ndarray
- na_values : set
- no_dtype_specified: True if no dtype was explicitly specified for these values
- try_num_bool : bool, default True
- try to cast values to numeric (first preference) or boolean
-
- Returns
- -------
- converted : ndarray or ExtensionArray
- na_count : int
- """
- na_count = 0
- if issubclass(values.dtype.type, (np.number, np.bool_)):
- # If our array has numeric dtype, we don't have to check for strings in isin
- na_values = np.array([val for val in na_values if not isinstance(val, str)])
- mask = algorithms.isin(values, na_values)
- na_count = mask.astype("uint8", copy=False).sum()
- if na_count > 0:
- if is_integer_dtype(values):
- values = values.astype(np.float64)
- np.putmask(values, mask, np.nan)
- return values, na_count
-
- dtype_backend = self.dtype_backend
- non_default_dtype_backend = (
- no_dtype_specified and dtype_backend is not lib.no_default
- )
- result: ArrayLike
-
- if try_num_bool and is_object_dtype(values.dtype):
- # exclude e.g DatetimeIndex here
- try:
- result, result_mask = lib.maybe_convert_numeric(
- values,
- na_values,
- False,
- convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa
- )
- except (ValueError, TypeError):
- # e.g. encountering datetime string gets ValueError
- # TypeError can be raised in floatify
- na_count = parsers.sanitize_objects(values, na_values)
- result = values
- else:
- if non_default_dtype_backend:
- if result_mask is None:
- result_mask = np.zeros(result.shape, dtype=np.bool_)
-
- if result_mask.all():
- result = IntegerArray(
- np.ones(result_mask.shape, dtype=np.int64), result_mask
- )
- elif is_integer_dtype(result):
- result = IntegerArray(result, result_mask)
- elif is_bool_dtype(result):
- result = BooleanArray(result, result_mask)
- elif is_float_dtype(result):
- result = FloatingArray(result, result_mask)
-
- na_count = result_mask.sum()
- else:
- na_count = isna(result).sum()
- else:
- result = values
- if values.dtype == np.object_:
- na_count = parsers.sanitize_objects(values, na_values)
-
- if result.dtype == np.object_ and try_num_bool:
- result, bool_mask = libops.maybe_convert_bool(
- np.asarray(values),
- true_values=self.true_values,
- false_values=self.false_values,
- convert_to_masked_nullable=non_default_dtype_backend, # type: ignore[arg-type] # noqa
- )
- if result.dtype == np.bool_ and non_default_dtype_backend:
- if bool_mask is None:
- bool_mask = np.zeros(result.shape, dtype=np.bool_)
- result = BooleanArray(result, bool_mask)
- elif result.dtype == np.object_ and non_default_dtype_backend:
- # read_excel sends array of datetime objects
- inferred_type = lib.infer_dtype(result)
- if inferred_type != "datetime":
- result = StringDtype().construct_array_type()._from_sequence(values)
-
- if dtype_backend == "pyarrow":
- pa = import_optional_dependency("pyarrow")
- if isinstance(result, np.ndarray):
- result = ArrowExtensionArray(pa.array(result, from_pandas=True))
- else:
- # ExtensionArray
- result = ArrowExtensionArray(
- pa.array(result.to_numpy(), from_pandas=True)
- )
-
- return result, na_count
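The masked-array branch above is what backs dtype_backend="numpy_nullable"; a minimal sketch:
    import io
    import pandas as pd

    buf = io.StringIO("a,b\n1,2.5\n,3.5\n")
    df = pd.read_csv(buf, dtype_backend="numpy_nullable")
    # 'a' keeps an integer dtype (masked Int64) despite the missing value;
    # 'b' comes back as Float64 instead of plain float64.
    print(df.dtypes)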
-
- def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
- """
- Cast values to specified type
-
- Parameters
- ----------
- values : ndarray or ExtensionArray
- cast_type : np.dtype or ExtensionDtype
- dtype to cast values to
- column : string
- column name - used only for error reporting
-
- Returns
- -------
- converted : ndarray or ExtensionArray
- """
- if isinstance(cast_type, CategoricalDtype):
- known_cats = cast_type.categories is not None
-
- if not is_object_dtype(values.dtype) and not known_cats:
- # TODO: this is for consistency with
- # c-parser which parses all categories
- # as strings
- values = lib.ensure_string_array(
- values, skipna=False, convert_na_value=False
- )
-
- cats = Index(values).unique().dropna()
- values = Categorical._from_inferred_categories(
- cats, cats.get_indexer(values), cast_type, true_values=self.true_values
- )
-
- # use the EA's implementation of casting
- elif isinstance(cast_type, ExtensionDtype):
- array_type = cast_type.construct_array_type()
- try:
- if isinstance(cast_type, BooleanDtype):
- # error: Unexpected keyword argument "true_values" for
- # "_from_sequence_of_strings" of "ExtensionArray"
- return array_type._from_sequence_of_strings( # type: ignore[call-arg] # noqa:E501
- values,
- dtype=cast_type,
- true_values=self.true_values,
- false_values=self.false_values,
- )
- else:
- return array_type._from_sequence_of_strings(values, dtype=cast_type)
- except NotImplementedError as err:
- raise NotImplementedError(
- f"Extension Array: {array_type} must implement "
- "_from_sequence_of_strings in order to be used in parser methods"
- ) from err
-
- elif isinstance(values, ExtensionArray):
- values = values.astype(cast_type, copy=False)
- elif issubclass(cast_type.type, str):
- # TODO: why skipna=True here and False above? some tests depend
- # on it here, but nothing fails if we change it above
- # (as no tests get there as of 2022-12-06)
- values = lib.ensure_string_array(
- values, skipna=True, convert_na_value=False
- )
- else:
- try:
- values = astype_array(values, cast_type, copy=True)
- except ValueError as err:
- raise ValueError(
- f"Unable to convert column {column} to type {cast_type}"
- ) from err
- return values
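The CategoricalDtype branch above can be exercised through the public dtype argument; a minimal sketch:
    import io
    import pandas as pd
    from pandas.api.types import CategoricalDtype

    buf = io.StringIO("grade\nB\nA\nB\n")
    dtype = {"grade": CategoricalDtype(["A", "B", "C"], ordered=True)}
    df = pd.read_csv(buf, dtype=dtype)
    # Values are parsed as strings and then cast onto the known categories.
    print(df["grade"].cat.categories.tolist(), df["grade"].cat.ordered)  # ['A', 'B', 'C'] True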
-
- @overload
- def _do_date_conversions(
- self,
- names: Index,
- data: DataFrame,
- ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
- ...
-
- @overload
- def _do_date_conversions(
- self,
- names: Sequence[Hashable],
- data: Mapping[Hashable, ArrayLike],
- ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
- ...
-
- def _do_date_conversions(
- self,
- names: Sequence[Hashable] | Index,
- data: Mapping[Hashable, ArrayLike] | DataFrame,
- ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
- # returns data, columns
-
- if self.parse_dates is not None:
- data, names = _process_date_conversion(
- data,
- self._date_conv,
- self.parse_dates,
- self.index_col,
- self.index_names,
- names,
- keep_date_col=self.keep_date_col,
- dtype_backend=self.dtype_backend,
- )
-
- return names, data
-
- def _check_data_length(
- self,
- columns: Sequence[Hashable],
- data: Sequence[ArrayLike],
- ) -> None:
- """Checks if length of data is equal to length of column names.
-
- One set of trailing commas is allowed. If self.index_col is not False,
- a ParserError is raised earlier when the lengths do not match.
-
- Parameters
- ----------
- columns: list of column names
- data: list of array-likes containing the data column-wise.
- """
- if not self.index_col and len(columns) != len(data) and columns:
- empty_str = is_object_dtype(data[-1]) and data[-1] == ""
- # error: No overload variant of "__ror__" of "ndarray" matches
- # argument type "ExtensionArray"
- empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator]
- if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
- return
- warnings.warn(
- "Length of header or names does not match length of data. This leads "
- "to a loss of data with index_col=False.",
- ParserWarning,
- stacklevel=find_stack_level(),
- )
-
- @overload
- def _evaluate_usecols(
- self,
- usecols: set[int] | Callable[[Hashable], object],
- names: Sequence[Hashable],
- ) -> set[int]:
- ...
-
- @overload
- def _evaluate_usecols(
- self, usecols: set[str], names: Sequence[Hashable]
- ) -> set[str]:
- ...
-
- def _evaluate_usecols(
- self,
- usecols: Callable[[Hashable], object] | set[str] | set[int],
- names: Sequence[Hashable],
- ) -> set[str] | set[int]:
- """
- Check whether or not the 'usecols' parameter
- is a callable. If so, enumerates the 'names'
- parameter and returns a set of indices for
- each entry in 'names' that evaluates to True.
- If not a callable, returns 'usecols'.
- """
- if callable(usecols):
- return {i for i, name in enumerate(names) if usecols(name)}
- return usecols
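A callable usecols is evaluated against each column name, as the docstring above describes; a minimal sketch:
    import io
    import pandas as pd

    buf = io.StringIO("id,name,price\n1,x,2.0\n")
    df = pd.read_csv(buf, usecols=lambda col: col != "name")
    print(list(df.columns))  # expected: ['id', 'price']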
-
- def _validate_usecols_names(self, usecols, names):
- """
- Validates that all usecols are present in a given
- list of names. If not, raise a ValueError that
- shows what usecols are missing.
-
- Parameters
- ----------
- usecols : iterable of usecols
- The columns to validate are present in names.
- names : iterable of names
- The column names to check against.
-
- Returns
- -------
- usecols : iterable of usecols
- The `usecols` parameter if the validation succeeds.
-
- Raises
- ------
- ValueError : Columns were missing. Error message will list them.
- """
- missing = [c for c in usecols if c not in names]
- if len(missing) > 0:
- raise ValueError(
- f"Usecols do not match columns, columns expected but not found: "
- f"{missing}"
- )
-
- return usecols
-
- def _validate_usecols_arg(self, usecols):
- """
- Validate the 'usecols' parameter.
-
- Checks whether or not the 'usecols' parameter contains all integers
- (column selection by index), strings (column by name) or is a callable.
- Raises a ValueError if that is not the case.
-
- Parameters
- ----------
- usecols : list-like, callable, or None
- List of columns to use when parsing or a callable that can be used
- to filter a list of table columns.
-
- Returns
- -------
- usecols_tuple : tuple
- A tuple of (verified_usecols, usecols_dtype).
-
- 'verified_usecols' is either a set if an array-like is passed in or
- 'usecols' if a callable or None is passed in.
-
- 'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
- is passed in or None if a callable or None is passed in.
- """
- msg = (
- "'usecols' must either be list-like of all strings, all unicode, "
- "all integers or a callable."
- )
- if usecols is not None:
- if callable(usecols):
- return usecols, None
-
- if not is_list_like(usecols):
- # see gh-20529
- #
- # Ensure it is iterable container but not string.
- raise ValueError(msg)
-
- usecols_dtype = lib.infer_dtype(usecols, skipna=False)
-
- if usecols_dtype not in ("empty", "integer", "string"):
- raise ValueError(msg)
-
- usecols = set(usecols)
-
- return usecols, usecols_dtype
- return usecols, None
-
- def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
- if not is_index_col(index_col):
- return None, columns, index_col
-
- columns = list(columns)
-
- # In case of no rows and multiindex columns we have to set index_names to
- # list of Nones GH#38292
- if not columns:
- return [None] * len(index_col), columns, index_col
-
- cp_cols = list(columns)
- index_names: list[str | int | None] = []
-
- # don't mutate
- index_col = list(index_col)
-
- for i, c in enumerate(index_col):
- if isinstance(c, str):
- index_names.append(c)
- for j, name in enumerate(cp_cols):
- if name == c:
- index_col[i] = j
- columns.remove(name)
- break
- else:
- name = cp_cols[c]
- columns.remove(name)
- index_names.append(name)
-
- # Only clean index names that were placeholders.
- for i, name in enumerate(index_names):
- if isinstance(name, str) and name in self.unnamed_cols:
- index_names[i] = None
-
- return index_names, columns, index_col
-
- def _get_empty_meta(
- self, columns, index_col, index_names, dtype: DtypeArg | None = None
- ):
- columns = list(columns)
-
- # Convert `dtype` to a defaultdict of some kind.
- # This will enable us to write `dtype[col_name]`
- # without worrying about KeyError issues later on.
- dtype_dict: defaultdict[Hashable, Any]
- if not is_dict_like(dtype):
- # if dtype == None, default will be object.
- default_dtype = dtype or object
- dtype_dict = defaultdict(lambda: default_dtype)
- else:
- dtype = cast(dict, dtype)
- dtype_dict = defaultdict(
- lambda: object,
- {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
- )
-
- # Even though we have no data, the "index" of the empty DataFrame
- # could for example still be an empty MultiIndex. Thus, we need to
- # check whether we have any index columns specified, via either:
- #
- # 1) index_col (column indices)
- # 2) index_names (column names)
- #
- # Both must be non-null to ensure a successful construction. Otherwise,
- # we have to create a generic empty Index.
- index: Index
- if (index_col is None or index_col is False) or index_names is None:
- index = default_index(0)
- else:
- data = [Series([], dtype=dtype_dict[name]) for name in index_names]
- index = ensure_index_from_sequences(data, names=index_names)
- index_col.sort()
-
- for i, n in enumerate(index_col):
- columns.pop(n - i)
-
- col_dict = {
- col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
- }
-
- return index, columns, col_dict
-
-
-def _make_date_converter(
- date_parser=lib.no_default,
- dayfirst: bool = False,
- cache_dates: bool = True,
- date_format: dict[Hashable, str] | str | None = None,
-):
- if date_parser is not lib.no_default:
- warnings.warn(
- "The argument 'date_parser' is deprecated and will "
- "be removed in a future version. "
- "Please use 'date_format' instead, or read your data in as 'object' dtype "
- "and then call 'to_datetime'.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- if date_parser is not lib.no_default and date_format is not None:
- raise TypeError("Cannot use both 'date_parser' and 'date_format'")
-
- def unpack_if_single_element(arg):
- # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
- if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
- return arg[0]
- return arg
-
- def converter(*date_cols, col: Hashable):
- if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
- return date_cols[0]
-
- if date_parser is lib.no_default:
- strs = parsing.concat_date_cols(date_cols)
- date_fmt = (
- date_format.get(col) if isinstance(date_format, dict) else date_format
- )
-
- result = tools.to_datetime(
- ensure_object(strs),
- format=date_fmt,
- utc=False,
- dayfirst=dayfirst,
- errors="ignore",
- cache=cache_dates,
- )
- if isinstance(result, DatetimeIndex):
- arr = result.to_numpy()
- arr.flags.writeable = True
- return arr
- return result._values
- else:
- try:
- result = tools.to_datetime(
- date_parser(*(unpack_if_single_element(arg) for arg in date_cols)),
- errors="ignore",
- cache=cache_dates,
- )
- if isinstance(result, datetime.datetime):
- raise Exception("scalar parser")
- return result
- except Exception:
- return tools.to_datetime(
- parsing.try_parse_dates(
- parsing.concat_date_cols(date_cols),
- parser=date_parser,
- ),
- errors="ignore",
- )
-
- return converter
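The converter above prefers date_format over the deprecated date_parser; a minimal sketch via the public API:
    import io
    import pandas as pd

    buf = io.StringIO("day,value\n31-01-2023,1\n28-02-2023,2\n")
    df = pd.read_csv(buf, parse_dates=["day"], date_format="%d-%m-%Y")
    print(df["day"].dtype)  # expected: datetime64[ns]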
-
-
-parser_defaults = {
- "delimiter": None,
- "escapechar": None,
- "quotechar": '"',
- "quoting": csv.QUOTE_MINIMAL,
- "doublequote": True,
- "skipinitialspace": False,
- "lineterminator": None,
- "header": "infer",
- "index_col": None,
- "names": None,
- "skiprows": None,
- "skipfooter": 0,
- "nrows": None,
- "na_values": None,
- "keep_default_na": True,
- "true_values": None,
- "false_values": None,
- "converters": None,
- "dtype": None,
- "cache_dates": True,
- "thousands": None,
- "comment": None,
- "decimal": ".",
- # 'engine': 'c',
- "parse_dates": False,
- "keep_date_col": False,
- "dayfirst": False,
- "date_parser": lib.no_default,
- "date_format": None,
- "usecols": None,
- # 'iterator': False,
- "chunksize": None,
- "verbose": False,
- "encoding": None,
- "compression": None,
- "skip_blank_lines": True,
- "encoding_errors": "strict",
- "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
- "dtype_backend": lib.no_default,
-}
-
-
-def _process_date_conversion(
- data_dict,
- converter: Callable,
- parse_spec,
- index_col,
- index_names,
- columns,
- keep_date_col: bool = False,
- dtype_backend=lib.no_default,
-):
- def _isindex(colspec):
- return (isinstance(index_col, list) and colspec in index_col) or (
- isinstance(index_names, list) and colspec in index_names
- )
-
- new_cols = []
- new_data = {}
-
- orig_names = columns
- columns = list(columns)
-
- date_cols = set()
-
- if parse_spec is None or isinstance(parse_spec, bool):
- return data_dict, columns
-
- if isinstance(parse_spec, list):
- # list of column lists
- for colspec in parse_spec:
- if is_scalar(colspec) or isinstance(colspec, tuple):
- if isinstance(colspec, int) and colspec not in data_dict:
- colspec = orig_names[colspec]
- if _isindex(colspec):
- continue
- elif dtype_backend == "pyarrow":
- import pyarrow as pa
-
- dtype = data_dict[colspec].dtype
- if isinstance(dtype, ArrowDtype) and (
- pa.types.is_timestamp(dtype.pyarrow_dtype)
- or pa.types.is_date(dtype.pyarrow_dtype)
- ):
- continue
-
- # Pyarrow engine returns Series which we need to convert to
- # numpy array before the converter; it's a no-op for other parsers
- data_dict[colspec] = converter(
- np.asarray(data_dict[colspec]), col=colspec
- )
- else:
- new_name, col, old_names = _try_convert_dates(
- converter, colspec, data_dict, orig_names
- )
- if new_name in data_dict:
- raise ValueError(f"New date column already in dict {new_name}")
- new_data[new_name] = col
- new_cols.append(new_name)
- date_cols.update(old_names)
-
- elif isinstance(parse_spec, dict):
- # dict of new name to column list
- for new_name, colspec in parse_spec.items():
- if new_name in data_dict:
- raise ValueError(f"Date column {new_name} already in dict")
-
- _, col, old_names = _try_convert_dates(
- converter,
- colspec,
- data_dict,
- orig_names,
- target_name=new_name,
- )
-
- new_data[new_name] = col
-
- # If the original column can be converted to a date we keep the converted values.
- # This can only happen if the values come from a single column.
- if len(colspec) == 1:
- new_data[colspec[0]] = col
-
- new_cols.append(new_name)
- date_cols.update(old_names)
-
- data_dict.update(new_data)
- new_cols.extend(columns)
-
- if not keep_date_col:
- for c in list(date_cols):
- data_dict.pop(c)
- new_cols.remove(c)
-
- return data_dict, new_cols
-
-
-def _try_convert_dates(
- parser: Callable, colspec, data_dict, columns, target_name: str | None = None
-):
- colset = set(columns)
- colnames = []
-
- for c in colspec:
- if c in colset:
- colnames.append(c)
- elif isinstance(c, int) and c not in columns:
- colnames.append(columns[c])
- else:
- colnames.append(c)
-
- new_name: tuple | str
- if all(isinstance(x, tuple) for x in colnames):
- new_name = tuple(map("_".join, zip(*colnames)))
- else:
- new_name = "_".join([str(x) for x in colnames])
- to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]
-
- new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
- return new_name, new_col, colnames
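Combining several source columns into one datetime column is the case _try_convert_dates serves (supported in this 2.0 line, deprecated in later pandas releases); a minimal sketch:
    import io
    import pandas as pd

    buf = io.StringIO("date,time,value\n2023-01-01,12:30,1\n")
    df = pd.read_csv(buf, parse_dates={"ts": ["date", "time"]})
    # 'ts' holds the combined datetime; 'date' and 'time' are dropped because
    # keep_date_col defaults to False.
    print(df.columns.tolist())  # expected: ['ts', 'value']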
-
-
-def _get_na_values(col, na_values, na_fvalues, keep_default_na):
- """
- Get the NaN values for a given column.
-
- Parameters
- ----------
- col : str
- The name of the column.
- na_values : array-like, dict
- The object listing the NaN values as strings.
- na_fvalues : array-like, dict
- The object listing the NaN values as floats.
- keep_default_na : bool
- If `na_values` is a dict, and the column is not mapped in the
- dictionary, whether to return the default NaN values or the empty set.
-
- Returns
- -------
- nan_tuple : A length-two tuple composed of
-
- 1) na_values : the string NaN values for that column.
- 2) na_fvalues : the float NaN values for that column.
- """
- if isinstance(na_values, dict):
- if col in na_values:
- return na_values[col], na_fvalues[col]
- else:
- if keep_default_na:
- return STR_NA_VALUES, set()
-
- return set(), set()
- else:
- return na_values, na_fvalues
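Per-column NA handling as implemented above, seen through the public API; a minimal sketch:
    import io
    import pandas as pd

    buf = io.StringIO("a,b\nn/a,n/a\nNA,NA\n")
    df = pd.read_csv(buf, na_values={"a": ["n/a"]}, keep_default_na=False)
    # Only column 'a' maps 'n/a' to NaN; with keep_default_na=False the default
    # sentinels are disabled, so 'NA' survives as a literal string in both columns.
    print(df)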
-
-
-def _validate_parse_dates_arg(parse_dates):
- """
- Check whether or not the 'parse_dates' parameter
- is a non-boolean scalar. Raises a ValueError if
- that is the case.
- """
- msg = (
- "Only booleans, lists, and dictionaries are accepted "
- "for the 'parse_dates' parameter"
- )
-
- if parse_dates is not None:
- if is_scalar(parse_dates):
- if not lib.is_bool(parse_dates):
- raise TypeError(msg)
-
- elif not isinstance(parse_dates, (list, dict)):
- raise TypeError(msg)
-
- return parse_dates
-
-
-def is_index_col(col) -> bool:
- return col is not None and col is not False
diff --git a/contrib/python/pandas/py3/pandas/io/parsers/c_parser_wrapper.py b/contrib/python/pandas/py3/pandas/io/parsers/c_parser_wrapper.py
deleted file mode 100644
index 0e8a711e615..00000000000
--- a/contrib/python/pandas/py3/pandas/io/parsers/c_parser_wrapper.py
+++ /dev/null
@@ -1,423 +0,0 @@
-from __future__ import annotations
-
-from collections import defaultdict
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Mapping,
- Sequence,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import (
- lib,
- parsers,
-)
-from pandas._typing import (
- ArrayLike,
- DtypeArg,
- DtypeObj,
- ReadCsvBuffer,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import DtypeWarning
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- is_categorical_dtype,
- pandas_dtype,
-)
-from pandas.core.dtypes.concat import (
- concat_compat,
- union_categoricals,
-)
-
-from pandas.core.indexes.api import ensure_index_from_sequences
-
-from pandas.io.common import (
- dedup_names,
- is_potential_multi_index,
-)
-from pandas.io.parsers.base_parser import (
- ParserBase,
- ParserError,
- is_index_col,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- Index,
- MultiIndex,
- )
-
-
-class CParserWrapper(ParserBase):
- low_memory: bool
- _reader: parsers.TextReader
-
- def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
- super().__init__(kwds)
- self.kwds = kwds
- kwds = kwds.copy()
-
- self.low_memory = kwds.pop("low_memory", False)
-
- # #2442
- # error: Cannot determine type of 'index_col'
- kwds["allow_leading_cols"] = (
- self.index_col is not False # type: ignore[has-type]
- )
-
- # GH20529, validate usecol arg before TextReader
- kwds["usecols"] = self.usecols
-
- # Have to pass int, would break tests using TextReader directly otherwise :(
- kwds["on_bad_lines"] = self.on_bad_lines.value
-
- for key in (
- "storage_options",
- "encoding",
- "memory_map",
- "compression",
- ):
- kwds.pop(key, None)
-
- kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
- if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default:
- kwds["dtype_backend"] = "numpy"
- if kwds["dtype_backend"] == "pyarrow":
- # Fail here loudly instead of in cython after reading
- import_optional_dependency("pyarrow")
- self._reader = parsers.TextReader(src, **kwds)
-
- self.unnamed_cols = self._reader.unnamed_cols
-
- # error: Cannot determine type of 'names'
- passed_names = self.names is None # type: ignore[has-type]
-
- if self._reader.header is None:
- self.names = None
- else:
- # error: Cannot determine type of 'names'
- # error: Cannot determine type of 'index_names'
- (
- self.names, # type: ignore[has-type]
- self.index_names,
- self.col_names,
- passed_names,
- ) = self._extract_multi_indexer_columns(
- self._reader.header,
- self.index_names, # type: ignore[has-type]
- passed_names,
- )
-
- # error: Cannot determine type of 'names'
- if self.names is None: # type: ignore[has-type]
- self.names = list(range(self._reader.table_width))
-
- # gh-9755
- #
- # need to set orig_names here first
- # so that proper indexing can be done
- # with _set_noconvert_columns
- #
- # once names has been filtered, we will
- # then set orig_names again to names
- # error: Cannot determine type of 'names'
- self.orig_names = self.names[:] # type: ignore[has-type]
-
- if self.usecols:
- usecols = self._evaluate_usecols(self.usecols, self.orig_names)
-
- # GH 14671
- # assert for mypy, orig_names is List or None, None would error in issubset
- assert self.orig_names is not None
- if self.usecols_dtype == "string" and not set(usecols).issubset(
- self.orig_names
- ):
- self._validate_usecols_names(usecols, self.orig_names)
-
- # error: Cannot determine type of 'names'
- if len(self.names) > len(usecols): # type: ignore[has-type]
- # error: Cannot determine type of 'names'
- self.names = [ # type: ignore[has-type]
- n
- # error: Cannot determine type of 'names'
- for i, n in enumerate(self.names) # type: ignore[has-type]
- if (i in usecols or n in usecols)
- ]
-
- # error: Cannot determine type of 'names'
- if len(self.names) < len(usecols): # type: ignore[has-type]
- # error: Cannot determine type of 'names'
- self._validate_usecols_names(
- usecols,
- self.names, # type: ignore[has-type]
- )
-
- # error: Cannot determine type of 'names'
- self._validate_parse_dates_presence(self.names) # type: ignore[has-type]
- self._set_noconvert_columns()
-
- # error: Cannot determine type of 'names'
- self.orig_names = self.names # type: ignore[has-type]
-
- if not self._has_complex_date_col:
- # error: Cannot determine type of 'index_col'
- if self._reader.leading_cols == 0 and is_index_col(
- self.index_col # type: ignore[has-type]
- ):
- self._name_processed = True
- (
- index_names,
- # error: Cannot determine type of 'names'
- self.names, # type: ignore[has-type]
- self.index_col,
- ) = self._clean_index_names(
- # error: Cannot determine type of 'names'
- self.names, # type: ignore[has-type]
- # error: Cannot determine type of 'index_col'
- self.index_col, # type: ignore[has-type]
- )
-
- if self.index_names is None:
- self.index_names = index_names
-
- if self._reader.header is None and not passed_names:
- assert self.index_names is not None
- self.index_names = [None] * len(self.index_names)
-
- self._implicit_index = self._reader.leading_cols > 0
-
- def close(self) -> None:
- # close handles opened by C parser
- try:
- self._reader.close()
- except ValueError:
- pass
-
- def _set_noconvert_columns(self) -> None:
- """
- Set the columns that should not undergo dtype conversions.
-
- Currently, any column that is involved with date parsing will not
- undergo such conversions.
- """
- assert self.orig_names is not None
- # error: Cannot determine type of 'names'
-
- # much faster than using orig_names.index(x) xref GH#44106
- names_dict = {x: i for i, x in enumerate(self.orig_names)}
- col_indices = [names_dict[x] for x in self.names] # type: ignore[has-type]
- # error: Cannot determine type of 'names'
- noconvert_columns = self._set_noconvert_dtype_columns(
- col_indices,
- self.names, # type: ignore[has-type]
- )
- for col in noconvert_columns:
- self._reader.set_noconvert(col)
-
- def read(
- self,
- nrows: int | None = None,
- ) -> tuple[
- Index | MultiIndex | None,
- Sequence[Hashable] | MultiIndex,
- Mapping[Hashable, ArrayLike],
- ]:
- index: Index | MultiIndex | None
- column_names: Sequence[Hashable] | MultiIndex
- try:
- if self.low_memory:
- chunks = self._reader.read_low_memory(nrows)
- # destructive to chunks
- data = _concatenate_chunks(chunks)
-
- else:
- data = self._reader.read(nrows)
- except StopIteration:
- if self._first_chunk:
- self._first_chunk = False
- names = dedup_names(
- self.orig_names,
- is_potential_multi_index(self.orig_names, self.index_col),
- )
- index, columns, col_dict = self._get_empty_meta(
- names,
- self.index_col,
- self.index_names,
- dtype=self.kwds.get("dtype"),
- )
- columns = self._maybe_make_multi_index_columns(columns, self.col_names)
-
- if self.usecols is not None:
- columns = self._filter_usecols(columns)
-
- col_dict = {k: v for k, v in col_dict.items() if k in columns}
-
- return index, columns, col_dict
-
- else:
- self.close()
- raise
-
- # Done with first read, next time raise StopIteration
- self._first_chunk = False
-
- # error: Cannot determine type of 'names'
- names = self.names # type: ignore[has-type]
-
- if self._reader.leading_cols:
- if self._has_complex_date_col:
- raise NotImplementedError("file structure not yet supported")
-
- # implicit index, no index names
- arrays = []
-
- if self.index_col and self._reader.leading_cols != len(self.index_col):
- raise ParserError(
- "Could not construct index. Requested to use "
- f"{len(self.index_col)} number of columns, but "
- f"{self._reader.leading_cols} left to parse."
- )
-
- for i in range(self._reader.leading_cols):
- if self.index_col is None:
- values = data.pop(i)
- else:
- values = data.pop(self.index_col[i])
-
- values = self._maybe_parse_dates(values, i, try_parse_dates=True)
- arrays.append(values)
-
- index = ensure_index_from_sequences(arrays)
-
- if self.usecols is not None:
- names = self._filter_usecols(names)
-
- names = dedup_names(names, is_potential_multi_index(names, self.index_col))
-
- # rename dict keys
- data_tups = sorted(data.items())
- data = {k: v for k, (i, v) in zip(names, data_tups)}
-
- column_names, date_data = self._do_date_conversions(names, data)
-
- # maybe create a mi on the columns
- column_names = self._maybe_make_multi_index_columns(
- column_names, self.col_names
- )
-
- else:
- # rename dict keys
- data_tups = sorted(data.items())
-
- # ugh, mutation
-
- # assert for mypy, orig_names is List or None, None would error in list(...)
- assert self.orig_names is not None
- names = list(self.orig_names)
- names = dedup_names(names, is_potential_multi_index(names, self.index_col))
-
- if self.usecols is not None:
- names = self._filter_usecols(names)
-
- # columns as list
- alldata = [x[1] for x in data_tups]
- if self.usecols is None:
- self._check_data_length(names, alldata)
-
- data = {k: v for k, (i, v) in zip(names, data_tups)}
-
- names, date_data = self._do_date_conversions(names, data)
- index, column_names = self._make_index(date_data, alldata, names)
-
- return index, column_names, date_data
-
- def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
- # hackish
- usecols = self._evaluate_usecols(self.usecols, names)
- if usecols is not None and len(names) != len(usecols):
- names = [
- name for i, name in enumerate(names) if i in usecols or name in usecols
- ]
- return names
-
- def _get_index_names(self):
- names = list(self._reader.header[0])
- idx_names = None
-
- if self._reader.leading_cols == 0 and self.index_col is not None:
- (idx_names, names, self.index_col) = self._clean_index_names(
- names, self.index_col
- )
-
- return names, idx_names
-
- def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
- if try_parse_dates and self._should_parse_dates(index):
- values = self._date_conv(
- values,
- col=self.index_names[index] if self.index_names is not None else None,
- )
- return values
-
-
-def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
- """
- Concatenate chunks of data read with low_memory=True.
-
- The tricky part is handling Categoricals, where different chunks
- may have different inferred categories.
- """
- names = list(chunks[0].keys())
- warning_columns = []
-
- result: dict = {}
- for name in names:
- arrs = [chunk.pop(name) for chunk in chunks]
- # Check each arr for consistent types.
- dtypes = {a.dtype for a in arrs}
- non_cat_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
-
- dtype = dtypes.pop()
- if is_categorical_dtype(dtype):
- result[name] = union_categoricals(arrs, sort_categories=False)
- else:
- result[name] = concat_compat(arrs)
- if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
- warning_columns.append(str(name))
-
- if warning_columns:
- warning_names = ",".join(warning_columns)
- warning_message = " ".join(
- [
- f"Columns ({warning_names}) have mixed types. "
- f"Specify dtype option on import or set low_memory=False."
- ]
- )
- warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level())
- return result
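The categorical branch above relies on union_categoricals, which is also public in pandas.api.types; a minimal sketch:
    import pandas as pd
    from pandas.api.types import union_categoricals

    # Different chunks can infer different categories; union_categoricals reconciles them.
    c1 = pd.Categorical(["a", "b"])
    c2 = pd.Categorical(["b", "c"])
    combined = union_categoricals([c1, c2], sort_categories=False)
    print(combined.categories.tolist())  # expected: ['a', 'b', 'c']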
-
-
-def ensure_dtype_objs(
- dtype: DtypeArg | dict[Hashable, DtypeArg] | None
-) -> DtypeObj | dict[Hashable, DtypeObj] | None:
- """
- Ensure we have either None, a dtype object, or a dictionary mapping to
- dtype objects.
- """
- if isinstance(dtype, defaultdict):
- # "None" not callable [misc]
- default_dtype = pandas_dtype(dtype.default_factory()) # type: ignore[misc]
- dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
- for key in dtype.keys():
- dtype_converted[key] = pandas_dtype(dtype[key])
- return dtype_converted
- elif isinstance(dtype, dict):
- return {k: pandas_dtype(dtype[k]) for k in dtype}
- elif dtype is not None:
- return pandas_dtype(dtype)
- return dtype
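ensure_dtype_objs normalizes dtype aliases with pandas_dtype, which is exported publicly in pandas.api.types; a minimal sketch:
    from pandas.api.types import pandas_dtype

    print(pandas_dtype("int64"))    # a numpy dtype object
    print(pandas_dtype("boolean"))  # a masked BooleanDtype extension dtype
    print({k: pandas_dtype(v) for k, v in {"a": "float32", "b": "category"}.items()})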
diff --git a/contrib/python/pandas/py3/pandas/io/parsers/python_parser.py b/contrib/python/pandas/py3/pandas/io/parsers/python_parser.py
deleted file mode 100644
index 263966269c0..00000000000
--- a/contrib/python/pandas/py3/pandas/io/parsers/python_parser.py
+++ /dev/null
@@ -1,1351 +0,0 @@
-from __future__ import annotations
-
-from collections import (
- abc,
- defaultdict,
-)
-import csv
-from io import StringIO
-import re
-import sys
-from typing import (
- IO,
- TYPE_CHECKING,
- DefaultDict,
- Hashable,
- Iterator,
- List,
- Literal,
- Mapping,
- Sequence,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- ArrayLike,
- ReadCsvBuffer,
- Scalar,
-)
-from pandas.errors import (
- EmptyDataError,
- ParserError,
-)
-
-from pandas.core.dtypes.common import is_integer
-from pandas.core.dtypes.inference import is_dict_like
-
-from pandas.io.common import (
- dedup_names,
- is_potential_multi_index,
-)
-from pandas.io.parsers.base_parser import (
- ParserBase,
- parser_defaults,
-)
-
-if TYPE_CHECKING:
- from pandas import (
- Index,
- MultiIndex,
- )
-
-# BOM character (byte order mark)
-# This exists at the beginning of a file to indicate endianness
-# of a file (stream). Unfortunately, this marker screws up parsing,
-# so we need to remove it if we see it.
-_BOM = "\ufeff"
-
-
-class PythonParser(ParserBase):
- def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
- """
- Workhorse function for processing a nested list into a DataFrame
- """
- super().__init__(kwds)
-
- self.data: Iterator[str] | None = None
- self.buf: list = []
- self.pos = 0
- self.line_pos = 0
-
- self.skiprows = kwds["skiprows"]
-
- if callable(self.skiprows):
- self.skipfunc = self.skiprows
- else:
- self.skipfunc = lambda x: x in self.skiprows
-
- self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
- self.delimiter = kwds["delimiter"]
-
- self.quotechar = kwds["quotechar"]
- if isinstance(self.quotechar, str):
- self.quotechar = str(self.quotechar)
-
- self.escapechar = kwds["escapechar"]
- self.doublequote = kwds["doublequote"]
- self.skipinitialspace = kwds["skipinitialspace"]
- self.lineterminator = kwds["lineterminator"]
- self.quoting = kwds["quoting"]
- self.skip_blank_lines = kwds["skip_blank_lines"]
-
- self.names_passed = kwds["names"] or None
-
- self.has_index_names = False
- if "has_index_names" in kwds:
- self.has_index_names = kwds["has_index_names"]
-
- self.verbose = kwds["verbose"]
-
- self.thousands = kwds["thousands"]
- self.decimal = kwds["decimal"]
-
- self.comment = kwds["comment"]
-
- # Set self.data to something that can read lines.
- if isinstance(f, list):
- # read_excel: f is a list
- self.data = cast(Iterator[str], f)
- else:
- assert hasattr(f, "readline")
- self._make_reader(f)
-
- # Get columns in two steps: infer from data, then
- # infer column indices from self.usecols if it is specified.
- self._col_indices: list[int] | None = None
- columns: list[list[Scalar | None]]
- (
- columns,
- self.num_original_columns,
- self.unnamed_cols,
- ) = self._infer_columns()
-
- # Now self.columns has the set of columns that we will process.
- # The original set is stored in self.original_columns.
- # error: Cannot determine type of 'index_names'
- (
- self.columns,
- self.index_names,
- self.col_names,
- _,
- ) = self._extract_multi_indexer_columns(
- columns,
- self.index_names, # type: ignore[has-type]
- )
-
- # get popped off for index
- self.orig_names: list[Hashable] = list(self.columns)
-
- # needs to be cleaned/refactored
- # multiple date column thing turning into a real spaghetti factory
-
- if not self._has_complex_date_col:
- (index_names, self.orig_names, self.columns) = self._get_index_name(
- self.columns
- )
- self._name_processed = True
- if self.index_names is None:
- self.index_names = index_names
-
- if self._col_indices is None:
- self._col_indices = list(range(len(self.columns)))
-
- self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
- no_thousands_columns: set[int] | None = None
- if self.parse_dates:
- no_thousands_columns = self._set_noconvert_dtype_columns(
- self._col_indices, self.columns
- )
- self._no_thousands_columns = no_thousands_columns
-
- if len(self.decimal) != 1:
- raise ValueError("Only length-1 decimal markers supported")
-
- decimal = re.escape(self.decimal)
- if self.thousands is None:
- regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
- else:
- thousands = re.escape(self.thousands)
- regex = (
- rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
- rf"([0-9]?(E|e)\-?[0-9]+)?$"
- )
- self.num = re.compile(regex)
-
- def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
- sep = self.delimiter
-
- if sep is None or len(sep) == 1:
- if self.lineterminator:
- raise ValueError(
- "Custom line terminators not supported in python parser (yet)"
- )
-
- class MyDialect(csv.Dialect):
- delimiter = self.delimiter
- quotechar = self.quotechar
- escapechar = self.escapechar
- doublequote = self.doublequote
- skipinitialspace = self.skipinitialspace
- quoting = self.quoting
- lineterminator = "\n"
-
- dia = MyDialect
-
- if sep is not None:
- dia.delimiter = sep
- else:
- # attempt to sniff the delimiter from the first valid line,
- # i.e. no comment line and not in skiprows
- line = f.readline()
- lines = self._check_comments([[line]])[0]
- while self.skipfunc(self.pos) or not lines:
- self.pos += 1
- line = f.readline()
- lines = self._check_comments([[line]])[0]
- lines_str = cast(List[str], lines)
-
- # since `line` was a string, lines will be a list containing
- # only a single string
- line = lines_str[0]
-
- self.pos += 1
- self.line_pos += 1
- sniffed = csv.Sniffer().sniff(line)
- dia.delimiter = sniffed.delimiter
-
- # Note: encoding is irrelevant here
- line_rdr = csv.reader(StringIO(line), dialect=dia)
- self.buf.extend(list(line_rdr))
-
- # Note: encoding is irrelevant here
- reader = csv.reader(f, dialect=dia, strict=True)
-
- else:
-
- def _read():
- line = f.readline()
- pat = re.compile(sep)
-
- yield pat.split(line.strip())
-
- for line in f:
- yield pat.split(line.strip())
-
- reader = _read()
-
- # error: Incompatible types in assignment (expression has type "_reader",
- # variable has type "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
- # TextIOWrapper, mmap, None]")
- self.data = reader # type: ignore[assignment]
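With sep=None the python engine sniffs the delimiter via csv.Sniffer, as above; a minimal sketch:
    import io
    import pandas as pd

    buf = io.StringIO("a;b;c\n1;2;3\n")
    df = pd.read_csv(buf, sep=None, engine="python")
    print(list(df.columns))  # expected: ['a', 'b', 'c']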
-
- def read(
- self, rows: int | None = None
- ) -> tuple[
- Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
- ]:
- try:
- content = self._get_lines(rows)
- except StopIteration:
- if self._first_chunk:
- content = []
- else:
- self.close()
- raise
-
- # done with first read, next time raise StopIteration
- self._first_chunk = False
-
- columns: Sequence[Hashable] = list(self.orig_names)
- if not len(content): # pragma: no cover
- # DataFrame with the right metadata, even though it's length 0
- # error: Cannot determine type of 'index_col'
- names = dedup_names(
- self.orig_names,
- is_potential_multi_index(
- self.orig_names,
- self.index_col, # type: ignore[has-type]
- ),
- )
- # error: Cannot determine type of 'index_col'
- index, columns, col_dict = self._get_empty_meta(
- names,
- self.index_col, # type: ignore[has-type]
- self.index_names,
- self.dtype,
- )
- conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
- return index, conv_columns, col_dict
-
- # handle new style for names in index
- count_empty_content_vals = count_empty_vals(content[0])
- indexnamerow = None
- if self.has_index_names and count_empty_content_vals == len(columns):
- indexnamerow = content[0]
- content = content[1:]
-
- alldata = self._rows_to_cols(content)
- data, columns = self._exclude_implicit_index(alldata)
-
- conv_data = self._convert_data(data)
- columns, conv_data = self._do_date_conversions(columns, conv_data)
-
- index, result_columns = self._make_index(
- conv_data, alldata, columns, indexnamerow
- )
-
- return index, result_columns, conv_data
-
- def _exclude_implicit_index(
- self,
- alldata: list[np.ndarray],
- ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
- # error: Cannot determine type of 'index_col'
- names = dedup_names(
- self.orig_names,
- is_potential_multi_index(
- self.orig_names,
- self.index_col, # type: ignore[has-type]
- ),
- )
-
- offset = 0
- if self._implicit_index:
- # error: Cannot determine type of 'index_col'
- offset = len(self.index_col) # type: ignore[has-type]
-
- len_alldata = len(alldata)
- self._check_data_length(names, alldata)
-
- return {
- name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
- }, names
-
- # legacy
- def get_chunk(
- self, size: int | None = None
- ) -> tuple[
- Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
- ]:
- if size is None:
- # error: "PythonParser" has no attribute "chunksize"
- size = self.chunksize # type: ignore[attr-defined]
- return self.read(rows=size)
-
- def _convert_data(
- self,
- data: Mapping[Hashable, np.ndarray],
- ) -> Mapping[Hashable, ArrayLike]:
- # apply converters
- clean_conv = self._clean_mapping(self.converters)
- clean_dtypes = self._clean_mapping(self.dtype)
-
- # Apply NA values.
- clean_na_values = {}
- clean_na_fvalues = {}
-
- if isinstance(self.na_values, dict):
- for col in self.na_values:
- na_value = self.na_values[col]
- na_fvalue = self.na_fvalues[col]
-
- if isinstance(col, int) and col not in self.orig_names:
- col = self.orig_names[col]
-
- clean_na_values[col] = na_value
- clean_na_fvalues[col] = na_fvalue
- else:
- clean_na_values = self.na_values
- clean_na_fvalues = self.na_fvalues
-
- return self._convert_to_ndarrays(
- data,
- clean_na_values,
- clean_na_fvalues,
- self.verbose,
- clean_conv,
- clean_dtypes,
- )
-
- def _infer_columns(
- self,
- ) -> tuple[list[list[Scalar | None]], int, set[Scalar | None]]:
- names = self.names
- num_original_columns = 0
- clear_buffer = True
- unnamed_cols: set[Scalar | None] = set()
- self._header_line = None
-
- if self.header is not None:
- header = self.header
-
- if isinstance(header, (list, tuple, np.ndarray)):
- have_mi_columns = len(header) > 1
- # we have MultiIndex columns, so read an extra line
- if have_mi_columns:
- header = list(header) + [header[-1] + 1]
- else:
- have_mi_columns = False
- header = [header]
-
- columns: list[list[Scalar | None]] = []
- for level, hr in enumerate(header):
- try:
- line = self._buffered_line()
-
- while self.line_pos <= hr:
- line = self._next_line()
-
- except StopIteration as err:
- if 0 < self.line_pos <= hr and (
- not have_mi_columns or hr != header[-1]
- ):
- # If there are no rows we want to raise a different message; and if
- # we have MultiIndex columns, the last line is not part of the header
- joi = list(map(str, header[:-1] if have_mi_columns else header))
- msg = f"[{','.join(joi)}], len of {len(joi)}, "
- raise ValueError(
- f"Passed header={msg}"
- f"but only {self.line_pos} lines in file"
- ) from err
-
- # We have an empty file, so check
- # if columns are provided. That will
- # serve as the 'line' for parsing
- if have_mi_columns and hr > 0:
- if clear_buffer:
- self._clear_buffer()
- columns.append([None] * len(columns[-1]))
- return columns, num_original_columns, unnamed_cols
-
- if not self.names:
- raise EmptyDataError("No columns to parse from file") from err
-
- line = self.names[:]
-
- this_columns: list[Scalar | None] = []
- this_unnamed_cols = []
-
- for i, c in enumerate(line):
- if c == "":
- if have_mi_columns:
- col_name = f"Unnamed: {i}_level_{level}"
- else:
- col_name = f"Unnamed: {i}"
-
- this_unnamed_cols.append(i)
- this_columns.append(col_name)
- else:
- this_columns.append(c)
-
- if not have_mi_columns:
- counts: DefaultDict = defaultdict(int)
- # Ensure that regular columns are used before unnamed ones
- # to keep given names and mangle unnamed columns
- col_loop_order = [
- i
- for i in range(len(this_columns))
- if i not in this_unnamed_cols
- ] + this_unnamed_cols
-
- # TODO: Use pandas.io.common.dedup_names instead (see #50371)
- for i in col_loop_order:
- col = this_columns[i]
- old_col = col
- cur_count = counts[col]
-
- if cur_count > 0:
- while cur_count > 0:
- counts[old_col] = cur_count + 1
- col = f"{old_col}.{cur_count}"
- if col in this_columns:
- cur_count += 1
- else:
- cur_count = counts[col]
-
- if (
- self.dtype is not None
- and is_dict_like(self.dtype)
- and self.dtype.get(old_col) is not None
- and self.dtype.get(col) is None
- ):
- self.dtype.update({col: self.dtype.get(old_col)})
- this_columns[i] = col
- counts[col] = cur_count + 1
- elif have_mi_columns:
- # if we have grabbed an extra line, but it's not in our
- # format, save it in the buffer and create a blank extra
- # line for the rest of the parsing code
- if hr == header[-1]:
- lc = len(this_columns)
- # error: Cannot determine type of 'index_col'
- sic = self.index_col # type: ignore[has-type]
- ic = len(sic) if sic is not None else 0
- unnamed_count = len(this_unnamed_cols)
-
- # if wrong number of blanks or no index, not our format
- if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
- clear_buffer = False
- this_columns = [None] * lc
- self.buf = [self.buf[-1]]
-
- columns.append(this_columns)
- unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})
-
- if len(columns) == 1:
- num_original_columns = len(this_columns)
-
- if clear_buffer:
- self._clear_buffer()
-
- first_line: list[Scalar] | None
- if names is not None:
- # Read first row after header to check if data are longer
- try:
- first_line = self._next_line()
- except StopIteration:
- first_line = None
-
- len_first_data_row = 0 if first_line is None else len(first_line)
-
- if len(names) > len(columns[0]) and len(names) > len_first_data_row:
- raise ValueError(
- "Number of passed names did not match "
- "number of header fields in the file"
- )
- if len(columns) > 1:
- raise TypeError("Cannot pass names with multi-index columns")
-
- if self.usecols is not None:
- # Set _use_cols. We don't store columns because they are
- # overwritten.
- self._handle_usecols(columns, names, num_original_columns)
- else:
- num_original_columns = len(names)
- if self._col_indices is not None and len(names) != len(
- self._col_indices
- ):
- columns = [[names[i] for i in sorted(self._col_indices)]]
- else:
- columns = [names]
- else:
- columns = self._handle_usecols(
- columns, columns[0], num_original_columns
- )
- else:
- try:
- line = self._buffered_line()
-
- except StopIteration as err:
- if not names:
- raise EmptyDataError("No columns to parse from file") from err
-
- line = names[:]
-
- # Store line, otherwise it is lost for guessing the index
- self._header_line = line
- ncols = len(line)
- num_original_columns = ncols
-
- if not names:
- columns = [list(range(ncols))]
- columns = self._handle_usecols(
- columns, columns[0], num_original_columns
- )
- else:
- if self.usecols is None or len(names) >= num_original_columns:
- columns = self._handle_usecols([names], names, num_original_columns)
- num_original_columns = len(names)
- else:
- if not callable(self.usecols) and len(names) != len(self.usecols):
- raise ValueError(
- "Number of passed names did not match number of "
- "header fields in the file"
- )
- # Ignore output but set used columns.
- self._handle_usecols([names], names, ncols)
- columns = [names]
- num_original_columns = ncols
-
- return columns, num_original_columns, unnamed_cols
-
- def _handle_usecols(
- self,
- columns: list[list[Scalar | None]],
- usecols_key: list[Scalar | None],
- num_original_columns: int,
- ) -> list[list[Scalar | None]]:
- """
- Sets self._col_indices
-
- usecols_key is used if there are string usecols.
- """
- col_indices: set[int] | list[int]
- if self.usecols is not None:
- if callable(self.usecols):
- col_indices = self._evaluate_usecols(self.usecols, usecols_key)
- elif any(isinstance(u, str) for u in self.usecols):
- if len(columns) > 1:
- raise ValueError(
- "If using multiple headers, usecols must be integers."
- )
- col_indices = []
-
- for col in self.usecols:
- if isinstance(col, str):
- try:
- col_indices.append(usecols_key.index(col))
- except ValueError:
- self._validate_usecols_names(self.usecols, usecols_key)
- else:
- col_indices.append(col)
- else:
- missing_usecols = [
- col for col in self.usecols if col >= num_original_columns
- ]
- if missing_usecols:
- raise ParserError(
- "Defining usecols without of bounds indices is not allowed. "
- f"{missing_usecols} are out of bounds.",
- )
- col_indices = self.usecols
-
- columns = [
- [n for i, n in enumerate(column) if i in col_indices]
- for column in columns
- ]
- self._col_indices = sorted(col_indices)
- return columns
-
- def _buffered_line(self) -> list[Scalar]:
- """
- Return a line from buffer, filling buffer if required.
- """
- if len(self.buf) > 0:
- return self.buf[0]
- else:
- return self._next_line()
-
- def _check_for_bom(self, first_row: list[Scalar]) -> list[Scalar]:
- """
- Checks whether the file begins with the BOM character.
- If it does, remove it. In addition, if there is quoting
- in the field subsequent to the BOM, remove it as well
- because it technically takes place at the beginning of
- the name, not the middle of it.
- """
- # first_row will be a list, so we need to check
- # that that list is not empty before proceeding.
- if not first_row:
- return first_row
-
- # The first element of this row is the one that could have the
- # BOM that we want to remove. Check that the first element is a
- # string before proceeding.
- if not isinstance(first_row[0], str):
- return first_row
-
- # Check that the string is not empty, as that would
- # obviously not have a BOM at the start of it.
- if not first_row[0]:
- return first_row
-
- # Since the string is non-empty, check that it does
- # in fact begin with a BOM.
- first_elt = first_row[0][0]
- if first_elt != _BOM:
- return first_row
-
- first_row_bom = first_row[0]
- new_row: str
-
- if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
- start = 2
- quote = first_row_bom[1]
- end = first_row_bom[2:].index(quote) + 2
-
- # Extract the data between the quotation marks
- new_row = first_row_bom[start:end]
-
- # Extract any remaining data after the second
- # quotation mark.
- if len(first_row_bom) > end + 1:
- new_row += first_row_bom[end + 1 :]
-
- else:
- # No quotation so just remove BOM from first element
- new_row = first_row_bom[1:]
-
- new_row_list: list[Scalar] = [new_row]
- return new_row_list + first_row[1:]
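The BOM stripping above keeps a leading U+FEFF out of the first column name; a minimal sketch:
    import io
    import pandas as pd

    buf = io.StringIO("\ufeffa,b\n1,2\n")
    df = pd.read_csv(buf, engine="python")
    print(list(df.columns))  # expected: ['a', 'b'], not ['\ufeffa', 'b']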
-
- def _is_line_empty(self, line: list[Scalar]) -> bool:
- """
- Check if a line is empty or not.
-
- Parameters
- ----------
- line : str, array-like
- The line of data to check.
-
- Returns
- -------
- boolean : Whether or not the line is empty.
- """
- return not line or all(not x for x in line)
-
- def _next_line(self) -> list[Scalar]:
- if isinstance(self.data, list):
- while self.skipfunc(self.pos):
- if self.pos >= len(self.data):
- break
- self.pos += 1
-
- while True:
- try:
- line = self._check_comments([self.data[self.pos]])[0]
- self.pos += 1
- # either uncommented or blank to begin with
- if not self.skip_blank_lines and (
- self._is_line_empty(self.data[self.pos - 1]) or line
- ):
- break
- if self.skip_blank_lines:
- ret = self._remove_empty_lines([line])
- if ret:
- line = ret[0]
- break
- except IndexError:
- raise StopIteration
- else:
- while self.skipfunc(self.pos):
- self.pos += 1
- # assert for mypy, data is Iterator[str] or None, would error in next
- assert self.data is not None
- next(self.data)
-
- while True:
- orig_line = self._next_iter_line(row_num=self.pos + 1)
- self.pos += 1
-
- if orig_line is not None:
- line = self._check_comments([orig_line])[0]
-
- if self.skip_blank_lines:
- ret = self._remove_empty_lines([line])
-
- if ret:
- line = ret[0]
- break
- elif self._is_line_empty(orig_line) or line:
- break
-
- # This was the first line of the file,
- # which could contain the BOM at the
- # beginning of it.
- if self.pos == 1:
- line = self._check_for_bom(line)
-
- self.line_pos += 1
- self.buf.append(line)
- return line
-
- def _alert_malformed(self, msg: str, row_num: int) -> None:
- """
- Alert a user about a malformed row, depending on value of
- `self.on_bad_lines` enum.
-
- If `self.on_bad_lines` is ERROR, the alert will be `ParserError`.
- If `self.on_bad_lines` is WARN, the alert will be printed out.
-
- Parameters
- ----------
- msg: str
- The error message to display.
- row_num: int
- The row number where the parsing error occurred.
- Because this row number is displayed, we 1-index,
- even though we 0-index internally.
- """
- if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
- raise ParserError(msg)
- if self.on_bad_lines == self.BadLineHandleMethod.WARN:
- base = f"Skipping line {row_num}: "
- sys.stderr.write(base + msg + "\n")
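With on_bad_lines="warn" the python engine skips malformed rows and reports them to stderr through _alert_malformed; a minimal sketch:
    import io
    import pandas as pd

    buf = io.StringIO("a,b\n1,2\n3,4,5\n")
    df = pd.read_csv(buf, on_bad_lines="warn", engine="python")
    # stderr receives something like "Skipping line 3: ..."; well-formed rows are kept.
    print(df.shape)  # expected: (1, 2)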
-
- def _next_iter_line(self, row_num: int) -> list[Scalar] | None:
- """
- Wrapper around iterating through `self.data` (CSV source).
-
- When a CSV error is raised, we check for specific
- error messages that allow us to customize the
- error message displayed to the user.
-
- Parameters
- ----------
- row_num: int
- The row number of the line being parsed.
- """
- try:
- # assert for mypy, data is Iterator[str] or None, would error in next
- assert self.data is not None
- line = next(self.data)
- # for mypy
- assert isinstance(line, list)
- return line
- except csv.Error as e:
- if self.on_bad_lines in (
- self.BadLineHandleMethod.ERROR,
- self.BadLineHandleMethod.WARN,
- ):
- msg = str(e)
-
- if "NULL byte" in msg or "line contains NUL" in msg:
- msg = (
- "NULL byte detected. This byte "
- "cannot be processed in Python's "
- "native csv library at the moment, "
- "so please pass in engine='c' instead"
- )
-
- if self.skipfooter > 0:
- reason = (
- "Error could possibly be due to "
- "parsing errors in the skipped footer rows "
- "(the skipfooter keyword is only applied "
- "after Python's csv library has parsed "
- "all rows)."
- )
- msg += ". " + reason
-
- self._alert_malformed(msg, row_num)
- return None
-
- def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
- if self.comment is None:
- return lines
- ret = []
- for line in lines:
- rl = []
- for x in line:
- if (
- not isinstance(x, str)
- or self.comment not in x
- or x in self.na_values
- ):
- rl.append(x)
- else:
- x = x[: x.find(self.comment)]
- if len(x) > 0:
- rl.append(x)
- break
- ret.append(rl)
- return ret
-
- def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
- """
- Iterate through the lines and remove any that are
- either empty or contain only one whitespace value
-
- Parameters
- ----------
- lines : list of list of Scalars
- The array of lines that we are to filter.
-
- Returns
- -------
- filtered_lines : list of list of Scalars
- The same array of lines with the "empty" ones removed.
- """
- ret = []
- for line in lines:
- # Remove empty lines and lines with only one whitespace value
- if (
- len(line) > 1
- or len(line) == 1
- and (not isinstance(line[0], str) or line[0].strip())
- ):
- ret.append(line)
- return ret
-
- def _check_thousands(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
- if self.thousands is None:
- return lines
-
- return self._search_replace_num_columns(
- lines=lines, search=self.thousands, replace=""
- )
-
- def _search_replace_num_columns(
- self, lines: list[list[Scalar]], search: str, replace: str
- ) -> list[list[Scalar]]:
- ret = []
- for line in lines:
- rl = []
- for i, x in enumerate(line):
- if (
- not isinstance(x, str)
- or search not in x
- or (self._no_thousands_columns and i in self._no_thousands_columns)
- or not self.num.search(x.strip())
- ):
- rl.append(x)
- else:
- rl.append(x.replace(search, replace))
- ret.append(rl)
- return ret
-
- def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
- if self.decimal == parser_defaults["decimal"]:
- return lines
-
- return self._search_replace_num_columns(
- lines=lines, search=self.decimal, replace="."
- )
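As a rough illustration of what the thousands/decimal replacement pass above enables, a minimal sketch with hypothetical inline data; the python engine is chosen explicitly so these code paths are the ones exercised:

    from io import StringIO
    import pandas as pd

    data = "price;qty\n1.234,56;7\n2.000,00;3\n"
    df = pd.read_csv(
        StringIO(data),
        sep=";",
        thousands=".",   # stripped from numeric-looking fields
        decimal=",",     # replaced with "." before numeric conversion
        engine="python",
    )
    # df["price"] is parsed as float64: 1234.56 and 2000.0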
-
- def _clear_buffer(self) -> None:
- self.buf = []
-
- _implicit_index = False
-
- def _get_index_name(
- self, columns: Sequence[Hashable]
- ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
- """
- Try several cases to get lines:
-
- 0) There are headers on row 0 and row 1 and their
- total summed lengths equal the length of the next line.
- Treat row 0 as columns and row 1 as indices
- 1) Look for implicit index: there are more columns
- on row 1 than row 0. If this is true, assume that row
- 1 lists index columns and row 0 lists normal columns.
- 2) Get index from the columns if it was listed.
- """
- orig_names = list(columns)
- columns = list(columns)
-
- line: list[Scalar] | None
- if self._header_line is not None:
- line = self._header_line
- else:
- try:
- line = self._next_line()
- except StopIteration:
- line = None
-
- next_line: list[Scalar] | None
- try:
- next_line = self._next_line()
- except StopIteration:
- next_line = None
-
- # implicitly index_col=0 b/c 1 fewer column names
- implicit_first_cols = 0
- if line is not None:
- # leave it 0, #2442
- # Case 1
- # error: Cannot determine type of 'index_col'
- index_col = self.index_col # type: ignore[has-type]
- if index_col is not False:
- implicit_first_cols = len(line) - self.num_original_columns
-
- # Case 0
- if (
- next_line is not None
- and self.header is not None
- and index_col is not False
- ):
- if len(next_line) == len(line) + self.num_original_columns:
- # column and index names on diff rows
- self.index_col = list(range(len(line)))
- self.buf = self.buf[1:]
-
- for c in reversed(line):
- columns.insert(0, c)
-
- # Update list of original names to include all indices.
- orig_names = list(columns)
- self.num_original_columns = len(columns)
- return line, orig_names, columns
-
- if implicit_first_cols > 0:
- # Case 1
- self._implicit_index = True
- if self.index_col is None:
- self.index_col = list(range(implicit_first_cols))
-
- index_name = None
-
- else:
- # Case 2
- (index_name, _, self.index_col) = self._clean_index_names(
- columns, self.index_col
- )
-
- return index_name, orig_names, columns
-
- def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
- col_len = self.num_original_columns
-
- if self._implicit_index:
- col_len += len(self.index_col)
-
- max_len = max(len(row) for row in content)
-
- # Check that there are no rows with too many
- # elements in their row (rows with too few
- # elements are padded with NaN).
- # error: Non-overlapping identity check (left operand type: "List[int]",
- # right operand type: "Literal[False]")
- if (
- max_len > col_len
- and self.index_col is not False # type: ignore[comparison-overlap]
- and self.usecols is None
- ):
- footers = self.skipfooter if self.skipfooter else 0
- bad_lines = []
-
- iter_content = enumerate(content)
- content_len = len(content)
- content = []
-
- for i, _content in iter_content:
- actual_len = len(_content)
-
- if actual_len > col_len:
- if callable(self.on_bad_lines):
- new_l = self.on_bad_lines(_content)
- if new_l is not None:
- content.append(new_l)
- elif self.on_bad_lines in (
- self.BadLineHandleMethod.ERROR,
- self.BadLineHandleMethod.WARN,
- ):
- row_num = self.pos - (content_len - i + footers)
- bad_lines.append((row_num, actual_len))
-
- if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
- break
- else:
- content.append(_content)
-
- for row_num, actual_len in bad_lines:
- msg = (
- f"Expected {col_len} fields in line {row_num + 1}, saw "
- f"{actual_len}"
- )
- if (
- self.delimiter
- and len(self.delimiter) > 1
- and self.quoting != csv.QUOTE_NONE
- ):
- # see gh-13374
- reason = (
- "Error could possibly be due to quotes being "
- "ignored when a multi-char delimiter is used."
- )
- msg += ". " + reason
-
- self._alert_malformed(msg, row_num + 1)
-
- # see gh-13320
- zipped_content = list(lib.to_object_array(content, min_width=col_len).T)
-
- if self.usecols:
- assert self._col_indices is not None
- col_indices = self._col_indices
-
- if self._implicit_index:
- zipped_content = [
- a
- for i, a in enumerate(zipped_content)
- if (
- i < len(self.index_col)
- or i - len(self.index_col) in col_indices
- )
- ]
- else:
- zipped_content = [
- a for i, a in enumerate(zipped_content) if i in col_indices
- ]
- return zipped_content
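For the callable branch of `on_bad_lines` handled above, a minimal usage sketch with hypothetical data; only the python engine accepts a callable here:

    from io import StringIO
    import pandas as pd

    data = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10\n"

    def trim_row(bad_line):
        # keep only the expected number of fields from an overlong row
        return bad_line[:3]

    df = pd.read_csv(StringIO(data), engine="python", on_bad_lines=trim_row)
    # the row "4,5,6,7" is truncated to ["4", "5", "6"] instead of raising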
-
- def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
- lines = self.buf
- new_rows = None
-
- # already fetched some number
- if rows is not None:
- # we already have the lines in the buffer
- if len(self.buf) >= rows:
- new_rows, self.buf = self.buf[:rows], self.buf[rows:]
-
- # need some lines
- else:
- rows -= len(self.buf)
-
- if new_rows is None:
- if isinstance(self.data, list):
- if self.pos > len(self.data):
- raise StopIteration
- if rows is None:
- new_rows = self.data[self.pos :]
- new_pos = len(self.data)
- else:
- new_rows = self.data[self.pos : self.pos + rows]
- new_pos = self.pos + rows
-
- new_rows = self._remove_skipped_rows(new_rows)
- lines.extend(new_rows)
- self.pos = new_pos
-
- else:
- new_rows = []
- try:
- if rows is not None:
- rows_to_skip = 0
- if self.skiprows is not None and self.pos is not None:
- # Only read additional rows if pos is in skiprows
- rows_to_skip = len(
- set(self.skiprows) - set(range(self.pos))
- )
-
- for _ in range(rows + rows_to_skip):
- # assert for mypy, data is Iterator[str] or None, would
- # error in next
- assert self.data is not None
- new_rows.append(next(self.data))
-
- len_new_rows = len(new_rows)
- new_rows = self._remove_skipped_rows(new_rows)
- lines.extend(new_rows)
- else:
- rows = 0
-
- while True:
- new_row = self._next_iter_line(row_num=self.pos + rows + 1)
- rows += 1
-
- if new_row is not None:
- new_rows.append(new_row)
- len_new_rows = len(new_rows)
-
- except StopIteration:
- len_new_rows = len(new_rows)
- new_rows = self._remove_skipped_rows(new_rows)
- lines.extend(new_rows)
- if len(lines) == 0:
- raise
- self.pos += len_new_rows
-
- self.buf = []
- else:
- lines = new_rows
-
- if self.skipfooter:
- lines = lines[: -self.skipfooter]
-
- lines = self._check_comments(lines)
- if self.skip_blank_lines:
- lines = self._remove_empty_lines(lines)
- lines = self._check_thousands(lines)
- return self._check_decimal(lines)
-
- def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]:
- if self.skiprows:
- return [
- row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)
- ]
- return new_rows
-
-
-class FixedWidthReader(abc.Iterator):
- """
- A reader of fixed-width lines.
- """
-
- def __init__(
- self,
- f: IO[str] | ReadCsvBuffer[str],
- colspecs: list[tuple[int, int]] | Literal["infer"],
- delimiter: str | None,
- comment: str | None,
- skiprows: set[int] | None = None,
- infer_nrows: int = 100,
- ) -> None:
- self.f = f
- self.buffer: Iterator | None = None
- self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
- self.comment = comment
- if colspecs == "infer":
- self.colspecs = self.detect_colspecs(
- infer_nrows=infer_nrows, skiprows=skiprows
- )
- else:
- self.colspecs = colspecs
-
- if not isinstance(self.colspecs, (tuple, list)):
- raise TypeError(
- "column specifications must be a list or tuple, "
- f"input was a {type(colspecs).__name__}"
- )
-
- for colspec in self.colspecs:
- if not (
- isinstance(colspec, (tuple, list))
- and len(colspec) == 2
- and isinstance(colspec[0], (int, np.integer, type(None)))
- and isinstance(colspec[1], (int, np.integer, type(None)))
- ):
- raise TypeError(
- "Each column specification must be "
- "a 2-element tuple or list of integers"
- )
-
- def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
- """
- Read rows from self.f, skipping as specified.
-
- We distinguish buffer_rows (the first <= infer_nrows
- lines) from the rows returned to detect_colspecs
- because it's simpler to leave the other locations
- with skiprows logic alone than to modify them to
- deal with the fact we skipped some rows here as
- well.
-
- Parameters
- ----------
- infer_nrows : int
- Number of rows to read from self.f, not counting
- rows that are skipped.
- skiprows: set, optional
- Indices of rows to skip.
-
- Returns
- -------
- detect_rows : list of str
- A list containing the rows to read.
-
- """
- if skiprows is None:
- skiprows = set()
- buffer_rows = []
- detect_rows = []
- for i, row in enumerate(self.f):
- if i not in skiprows:
- detect_rows.append(row)
- buffer_rows.append(row)
- if len(detect_rows) >= infer_nrows:
- break
- self.buffer = iter(buffer_rows)
- return detect_rows
-
- def detect_colspecs(
- self, infer_nrows: int = 100, skiprows: set[int] | None = None
- ) -> list[tuple[int, int]]:
- # Regex escape the delimiters
- delimiters = "".join([rf"\{x}" for x in self.delimiter])
- pattern = re.compile(f"([^{delimiters}]+)")
- rows = self.get_rows(infer_nrows, skiprows)
- if not rows:
- raise EmptyDataError("No rows from which to infer column width")
- max_len = max(map(len, rows))
- mask = np.zeros(max_len + 1, dtype=int)
- if self.comment is not None:
- rows = [row.partition(self.comment)[0] for row in rows]
- for row in rows:
- for m in pattern.finditer(row):
- mask[m.start() : m.end()] = 1
- shifted = np.roll(mask, 1)
- shifted[0] = 0
- edges = np.where((mask ^ shifted) == 1)[0]
- edge_pairs = list(zip(edges[::2], edges[1::2]))
- return edge_pairs
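A standalone sketch of the edge-detection idea used by `detect_colspecs` above, with two hypothetical fixed-width rows: character positions covered by data are marked in a mask, and the 0/1 transitions of that mask become the column boundaries.

    import re
    import numpy as np

    rows = ["1000  LEFT   foo", "2000  RIGHT  bar"]
    delimiters = "".join(rf"\{c}" for c in "\n\r\t ")
    pattern = re.compile(f"([^{delimiters}]+)")

    mask = np.zeros(max(map(len, rows)) + 1, dtype=int)
    for row in rows:
        for m in pattern.finditer(row):
            mask[m.start() : m.end()] = 1       # mark non-delimiter spans

    shifted = np.roll(mask, 1)
    shifted[0] = 0
    edges = np.where((mask ^ shifted) == 1)[0]  # rising/falling edges of the mask
    colspecs = [(int(a), int(b)) for a, b in zip(edges[::2], edges[1::2])]
    # colspecs == [(0, 4), (6, 11), (13, 16)]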
-
- def __next__(self) -> list[str]:
- # Argument 1 to "next" has incompatible type "Union[IO[str],
- # ReadCsvBuffer[str]]"; expected "SupportsNext[str]"
- if self.buffer is not None:
- try:
- line = next(self.buffer)
- except StopIteration:
- self.buffer = None
- line = next(self.f) # type: ignore[arg-type]
- else:
- line = next(self.f) # type: ignore[arg-type]
- # Note: 'colspecs' is a sequence of half-open intervals.
- return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
-
-
-class FixedWidthFieldParser(PythonParser):
- """
- Specialization that converts fixed-width fields into DataFrames.
- See PythonParser for details.
- """
-
- def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
- # Support iterators, convert to a list.
- self.colspecs = kwds.pop("colspecs")
- self.infer_nrows = kwds.pop("infer_nrows")
- PythonParser.__init__(self, f, **kwds)
-
- def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
- self.data = FixedWidthReader(
- f,
- self.colspecs,
- self.delimiter,
- self.comment,
- self.skiprows,
- self.infer_nrows,
- )
-
- def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
- """
- Returns the list of lines without the empty ones. With fixed-width
- fields, empty lines become arrays of empty strings.
-
- See PythonParser._remove_empty_lines.
- """
- return [
- line
- for line in lines
- if any(not isinstance(e, str) or e.strip() for e in line)
- ]
-
-
-def count_empty_vals(vals) -> int:
- return sum(1 for v in vals if v == "" or v is None)
-
-
-def _validate_skipfooter_arg(skipfooter: int) -> int:
- """
- Validate the 'skipfooter' parameter.
-
- Checks whether 'skipfooter' is a non-negative integer.
- Raises a ValueError if that is not the case.
-
- Parameters
- ----------
- skipfooter : non-negative integer
- The number of rows to skip at the end of the file.
-
- Returns
- -------
- validated_skipfooter : non-negative integer
- The original input if the validation succeeds.
-
- Raises
- ------
- ValueError : 'skipfooter' was not a non-negative integer.
- """
- if not is_integer(skipfooter):
- raise ValueError("skipfooter must be an integer")
-
- if skipfooter < 0:
- raise ValueError("skipfooter cannot be negative")
-
- return skipfooter
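A small usage sketch for the validated `skipfooter` option, assuming hypothetical inline data; `skipfooter` is only honoured by the python engine:

    from io import StringIO
    import pandas as pd

    data = "a,b\n1,2\n3,4\nTOTAL,6\n"
    df = pd.read_csv(StringIO(data), engine="python", skipfooter=1)
    # the trailing "TOTAL,6" line is dropped; a negative skipfooter raises ValueError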
diff --git a/contrib/python/pandas/py3/pandas/io/parsers/readers.py b/contrib/python/pandas/py3/pandas/io/parsers/readers.py
deleted file mode 100644
index 558355fd5e6..00000000000
--- a/contrib/python/pandas/py3/pandas/io/parsers/readers.py
+++ /dev/null
@@ -1,2127 +0,0 @@
-"""
-Module contains tools for processing files into DataFrames or other objects
-
-GH#48849 provides a convenient way of deprecating keyword arguments
-"""
-from __future__ import annotations
-
-from collections import abc
-import csv
-import sys
-from textwrap import fill
-from types import TracebackType
-from typing import (
- IO,
- Any,
- Callable,
- Hashable,
- Literal,
- NamedTuple,
- Sequence,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.parsers import STR_NA_VALUES
-from pandas._typing import (
- CompressionOptions,
- CSVEngine,
- DtypeArg,
- DtypeBackend,
- FilePath,
- IndexLabel,
- ReadCsvBuffer,
- StorageOptions,
-)
-from pandas.errors import (
- AbstractMethodError,
- ParserWarning,
-)
-from pandas.util._decorators import Appender
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.common import (
- is_file_like,
- is_float,
- is_integer,
- is_list_like,
-)
-
-from pandas.core.frame import DataFrame
-from pandas.core.indexes.api import RangeIndex
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import (
- IOHandles,
- get_handle,
- stringify_path,
- validate_header_arg,
-)
-from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
-from pandas.io.parsers.base_parser import (
- ParserBase,
- is_index_col,
- parser_defaults,
-)
-from pandas.io.parsers.c_parser_wrapper import CParserWrapper
-from pandas.io.parsers.python_parser import (
- FixedWidthFieldParser,
- PythonParser,
-)
-
-_doc_read_csv_and_table = (
- r"""
-{summary}
-
-Also supports optionally iterating or breaking of the file
-into chunks.
-
-Additional help can be found in the online docs for
-`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
-
-Parameters
-----------
-filepath_or_buffer : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
- expected. A local file could be: file://localhost/path/to/table.csv.
-
- If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method, such as
- a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
-sep : str, default {_default_sep}
- Delimiter to use. If sep is None, the C engine cannot automatically detect
- the separator, but the Python parsing engine can, meaning the latter will
- be used and automatically detect the separator by Python's builtin sniffer
- tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
- different from ``'\s+'`` will be interpreted as regular expressions and
- will also force the use of the Python parsing engine. Note that regex
- delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
-delimiter : str, default ``None``
- Alias for sep.
-header : int, list of int, None, default 'infer'
- Row number(s) to use as the column names, and the start of the
- data. Default behavior is to infer the column names: if no names
- are passed the behavior is identical to ``header=0`` and column
- names are inferred from the first line of the file, if column
- names are passed explicitly then the behavior is identical to
- ``header=None``. Explicitly pass ``header=0`` to be able to
- replace existing names. The header can be a list of integers that
- specify row locations for a multi-index on the columns
- e.g. [0,1,3]. Intervening rows that are not specified will be
- skipped (e.g. 2 in this example is skipped). Note that this
- parameter ignores commented lines and empty lines if
- ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
- data rather than the first line of the file.
-names : array-like, optional
- List of column names to use. If the file contains a header row,
- then you should explicitly pass ``header=0`` to override the column names.
- Duplicates in this list are not allowed.
-index_col : int, str, sequence of int / str, or False, optional, default ``None``
- Column(s) to use as the row labels of the ``DataFrame``, either given as
- string name or column index. If a sequence of int / str is given, a
- MultiIndex is used.
-
- Note: ``index_col=False`` can be used to force pandas to *not* use the first
- column as the index, e.g. when you have a malformed file with delimiters at
- the end of each line.
-usecols : list-like or callable, optional
- Return a subset of the columns. If list-like, all elements must either
- be positional (i.e. integer indices into the document columns) or strings
- that correspond to column names provided either by the user in `names` or
- inferred from the document header row(s). If ``names`` are given, the document
- header row(s) are not taken into account. For example, a valid list-like
- `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
- Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
- To instantiate a DataFrame from ``data`` with element order preserved use
- ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
- in ``['foo', 'bar']`` order or
- ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
- for ``['bar', 'foo']`` order.
-
- If callable, the callable function will be evaluated against the column
- names, returning names where the callable function evaluates to True. An
- example of a valid callable argument would be ``lambda x: x.upper() in
- ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
- parsing time and lower memory usage.
-dtype : Type name or dict of column -> type, optional
- Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
- 'c': 'Int64'}}
- Use `str` or `object` together with suitable `na_values` settings
- to preserve and not interpret dtype.
- If converters are specified, they will be applied INSTEAD
- of dtype conversion.
-
- .. versionadded:: 1.5.0
-
- Support for defaultdict was added. Specify a defaultdict as input where
- the default determines the dtype of the columns which are not explicitly
- listed.
-engine : {{'c', 'python', 'pyarrow'}}, optional
- Parser engine to use. The C and pyarrow engines are faster, while the python engine
- is currently more feature-complete. Multithreading is currently only supported by
- the pyarrow engine.
-
- .. versionadded:: 1.4.0
-
- The "pyarrow" engine was added as an *experimental* engine, and some features
- are unsupported, or may not work correctly, with this engine.
-converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can either
- be integers or column labels.
-true_values : list, optional
- Values to consider as True in addition to case-insensitive variants of "True".
-false_values : list, optional
- Values to consider as False in addition to case-insensitive variants of "False".
-skipinitialspace : bool, default False
- Skip spaces after delimiter.
-skiprows : list-like, int or callable, optional
- Line numbers to skip (0-indexed) or number of lines to skip (int)
- at the start of the file.
-
- If callable, the callable function will be evaluated against the row
- indices, returning True if the row should be skipped and False otherwise.
- An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
-skipfooter : int, default 0
- Number of lines at bottom of file to skip (Unsupported with engine='c').
-nrows : int, optional
- Number of rows of file to read. Useful for reading pieces of large files.
-na_values : scalar, str, list-like, or dict, optional
- Additional strings to recognize as NA/NaN. If dict passed, specific
- per-column NA values. By default the following values are interpreted as
- NaN: '"""
- + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
- + """'.
-keep_default_na : bool, default True
- Whether or not to include the default NaN values when parsing the data.
- Depending on whether `na_values` is passed in, the behavior is as follows:
-
- * If `keep_default_na` is True, and `na_values` are specified, `na_values`
- is appended to the default NaN values used for parsing.
- * If `keep_default_na` is True, and `na_values` are not specified, only
- the default NaN values are used for parsing.
- * If `keep_default_na` is False, and `na_values` are specified, only
- the NaN values specified in `na_values` are used for parsing.
- * If `keep_default_na` is False, and `na_values` are not specified, no
- strings will be parsed as NaN.
-
- Note that if `na_filter` is passed in as False, the `keep_default_na` and
- `na_values` parameters will be ignored.
-na_filter : bool, default True
- Detect missing value markers (empty strings and the value of na_values). In
- data without any NAs, passing na_filter=False can improve the performance
- of reading a large file.
-verbose : bool, default False
- Indicate number of NA values placed in non-numeric columns.
-skip_blank_lines : bool, default True
- If True, skip over blank lines rather than interpreting as NaN values.
-parse_dates : bool or list of int or names or list of lists or dict, \
-default False
- The behavior is as follows:
-
- * boolean. If True -> try parsing the index.
- * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
- each as a separate date column.
- * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
- a single date column.
- * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
- result 'foo'
-
- If a column or index cannot be represented as an array of datetimes,
- say because of an unparsable value or a mixture of timezones, the column
- or index will be returned unaltered as an object data type. For
- non-standard datetime parsing, use ``pd.to_datetime`` after
- ``pd.read_csv``.
-
- Note: A fast-path exists for iso8601-formatted dates.
-infer_datetime_format : bool, default False
- If True and `parse_dates` is enabled, pandas will attempt to infer the
- format of the datetime strings in the columns, and if it can be inferred,
- switch to a faster method of parsing them. In some cases this can increase
- the parsing speed by 5-10x.
-
- .. deprecated:: 2.0.0
- A strict version of this argument is now the default, passing it has no effect.
-
-keep_date_col : bool, default False
- If True and `parse_dates` specifies combining multiple columns then
- keep the original columns.
-date_parser : function, optional
- Function to use for converting a sequence of string columns to an array of
- datetime instances. The default uses ``dateutil.parser.parser`` to do the
- conversion. Pandas will try to call `date_parser` in three different ways,
- advancing to the next if an exception occurs: 1) Pass one or more arrays
- (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
- string values from the columns defined by `parse_dates` into a single array
- and pass that; and 3) call `date_parser` once for each row using one or
- more strings (corresponding to the columns defined by `parse_dates`) as
- arguments.
-
- .. deprecated:: 2.0.0
- Use ``date_format`` instead, or read in as ``object`` and then apply
- :func:`to_datetime` as-needed.
-date_format : str or dict of column -> format, default ``None``
- If used in conjunction with ``parse_dates``, will parse dates according to this
- format. For anything more complex,
- please read in as ``object`` and then apply :func:`to_datetime` as-needed.
-
- .. versionadded:: 2.0.0
-dayfirst : bool, default False
- DD/MM format dates, international and European format.
-cache_dates : bool, default True
- If True, use a cache of unique, converted dates to apply the datetime
- conversion. May produce significant speed-up when parsing duplicate
- date strings, especially ones with timezone offsets.
-
-iterator : bool, default False
- Return TextFileReader object for iteration or getting chunks with
- ``get_chunk()``.
-
- .. versionchanged:: 1.2
-
- ``TextFileReader`` is a context manager.
-chunksize : int, optional
- Return TextFileReader object for iteration.
- See the `IO Tools docs
- <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
- for more information on ``iterator`` and ``chunksize``.
-
- .. versionchanged:: 1.2
-
- ``TextFileReader`` is a context manager.
-{decompression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
-thousands : str, optional
- Thousands separator.
-decimal : str, default '.'
- Character to recognize as decimal point (e.g. use ',' for European data).
-lineterminator : str (length 1), optional
- Character to break file into lines. Only valid with C parser.
-quotechar : str (length 1), optional
- The character used to denote the start and end of a quoted item. Quoted
- items can include the delimiter and it will be ignored.
-quoting : int or csv.QUOTE_* instance, default 0
- Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
- QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
-doublequote : bool, default ``True``
- When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
- whether or not to interpret two consecutive quotechar elements INSIDE a
- field as a single ``quotechar`` element.
-escapechar : str (length 1), optional
- One-character string used to escape other characters.
-comment : str, optional
- Indicates remainder of line should not be parsed. If found at the beginning
- of a line, the line will be ignored altogether. This parameter must be a
- single character. Like empty lines (as long as ``skip_blank_lines=True``),
- fully commented lines are ignored by the parameter `header` but not by
- `skiprows`. For example, if ``comment='#'``, parsing
- ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
- treated as the header.
-encoding : str, optional, default "utf-8"
- Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
- standard encodings
- <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
-
- .. versionchanged:: 1.2
-
- When ``encoding`` is ``None``, ``errors="replace"`` is passed to
- ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
- This behavior was previously only the case for ``engine="python"``.
-
- .. versionchanged:: 1.3.0
-
- ``encoding_errors`` is a new argument. ``encoding`` no longer has an
- influence on how encoding errors are handled.
-
-encoding_errors : str, optional, default "strict"
- How encoding errors are treated. `List of possible values
- <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
-
- .. versionadded:: 1.3.0
-
-dialect : str or csv.Dialect, optional
- If provided, this parameter will override values (default or not) for the
- following parameters: `delimiter`, `doublequote`, `escapechar`,
- `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
- override values, a ParserWarning will be issued. See csv.Dialect
- documentation for more details.
-on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error'
- Specifies what to do upon encountering a bad line (a line with too many fields).
- Allowed values are:
-
- - 'error', raise an Exception when a bad line is encountered.
- - 'warn', raise a warning when a bad line is encountered and skip that line.
- - 'skip', skip bad lines without raising or warning when they are encountered.
-
- .. versionadded:: 1.3.0
-
- .. versionadded:: 1.4.0
-
- - callable, function with signature
- ``(bad_line: list[str]) -> list[str] | None`` that will process a single
- bad line. ``bad_line`` is a list of strings split by the ``sep``.
- If the function returns ``None``, the bad line will be ignored.
- If the function returns a new list of strings with more elements than
- expected, a ``ParserWarning`` will be emitted while dropping extra elements.
- Only supported when ``engine="python"``
-
-delim_whitespace : bool, default False
- Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
- used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
- is set to True, nothing should be passed in for the ``delimiter``
- parameter.
-low_memory : bool, default True
- Internally process the file in chunks, resulting in lower memory use
- while parsing, but possibly mixed type inference. To ensure no mixed
- types either set False, or specify the type with the `dtype` parameter.
- Note that the entire file is read into a single DataFrame regardless,
- use the `chunksize` or `iterator` parameter to return the data in chunks.
- (Only valid with C parser).
-memory_map : bool, default False
- If a filepath is provided for `filepath_or_buffer`, map the file object
- directly onto memory and access the data directly from there. Using this
- option can improve performance because there is no longer any I/O overhead.
-float_precision : str, optional
- Specifies which converter the C engine should use for floating-point
- values. The options are ``None`` or 'high' for the ordinary converter,
- 'legacy' for the original lower precision pandas converter, and
- 'round_trip' for the round-trip converter.
-
- .. versionchanged:: 1.2
-
-{storage_options}
-
- .. versionadded:: 1.2
-
-dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
- Which dtype_backend to use. When "numpy_nullable" is set, nullable dtypes
- are used for all dtypes that have a nullable implementation; when "pyarrow"
- is set, pyarrow is used for all dtypes.
-
- The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
-Returns
--------
-DataFrame or TextFileReader
- A comma-separated values (csv) file is returned as a two-dimensional
- data structure with labeled axes.
-
-See Also
---------
-DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
-read_csv : Read a comma-separated values (csv) file into DataFrame.
-read_fwf : Read a table of fixed-width formatted lines into DataFrame.
-
-Examples
---------
->>> pd.{func_name}('data.csv') # doctest: +SKIP
-"""
-)
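To illustrate a few of the parameters documented above together, a minimal sketch with hypothetical inline data and column names:

    from io import StringIO
    import pandas as pd

    data = "id,name,joined,score\n1,ann,2021-01-05,3.5\n2,bob,2021-02-17,NA\n"
    df = pd.read_csv(
        StringIO(data),
        usecols=["id", "joined", "score"],  # subset of columns; result keeps file order
        dtype={"id": "Int64"},              # nullable integer dtype for "id"
        parse_dates=["joined"],             # parse this column as datetimes
        na_values=["NA"],                   # recognized as missing in addition to defaults
    )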
-
-
-_c_parser_defaults = {
- "delim_whitespace": False,
- "na_filter": True,
- "low_memory": True,
- "memory_map": False,
- "float_precision": None,
-}
-
-_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
-
-_c_unsupported = {"skipfooter"}
-_python_unsupported = {"low_memory", "float_precision"}
-_pyarrow_unsupported = {
- "skipfooter",
- "float_precision",
- "chunksize",
- "comment",
- "nrows",
- "thousands",
- "memory_map",
- "dialect",
- "on_bad_lines",
- "delim_whitespace",
- "quoting",
- "lineterminator",
- "converters",
- "iterator",
- "dayfirst",
- "verbose",
- "skipinitialspace",
- "low_memory",
-}
-
-
-class _DeprecationConfig(NamedTuple):
- default_value: Any
- msg: str | None
-
-
-@overload
-def validate_integer(name, val: None, min_val: int = ...) -> None:
- ...
-
-
-@overload
-def validate_integer(name, val: float, min_val: int = ...) -> int:
- ...
-
-
-@overload
-def validate_integer(name, val: int | None, min_val: int = ...) -> int | None:
- ...
-
-
-def validate_integer(name, val: int | float | None, min_val: int = 0) -> int | None:
- """
- Checks whether the 'name' parameter for parsing is either
- an integer OR float that can SAFELY be cast to an integer
- without losing accuracy. Raises a ValueError if that is
- not the case.
-
- Parameters
- ----------
- name : str
- Parameter name (used for error reporting)
- val : int or float
- The value to check
- min_val : int
- Minimum allowed value (val < min_val will result in a ValueError)
- """
- if val is None:
- return val
-
- msg = f"'{name:s}' must be an integer >={min_val:d}"
- if is_float(val):
- if int(val) != val:
- raise ValueError(msg)
- val = int(val)
- elif not (is_integer(val) and val >= min_val):
- raise ValueError(msg)
-
- return int(val)
-
-
-def _validate_names(names: Sequence[Hashable] | None) -> None:
- """
- Raise ValueError if the `names` parameter contains duplicates or has an
- invalid data type.
-
- Parameters
- ----------
- names : array-like or None
- An array containing a list of the names used for the output DataFrame.
-
- Raises
- ------
- ValueError
- If names are not unique or are not ordered (e.g. set).
- """
- if names is not None:
- if len(names) != len(set(names)):
- raise ValueError("Duplicate names are not allowed.")
- if not (
- is_list_like(names, allow_sets=False) or isinstance(names, abc.KeysView)
- ):
- raise ValueError("Names should be an ordered collection.")
-
-
-def _read(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
-) -> DataFrame | TextFileReader:
- """Generic reader of line files."""
- # if we pass a date_parser and parse_dates=False, we should not parse the
- # dates GH#44366
- if kwds.get("parse_dates", None) is None:
- if (
- kwds.get("date_parser", lib.no_default) is lib.no_default
- and kwds.get("date_format", None) is None
- ):
- kwds["parse_dates"] = False
- else:
- kwds["parse_dates"] = True
-
- # Extract some of the arguments (pass chunksize on).
- iterator = kwds.get("iterator", False)
- chunksize = kwds.get("chunksize", None)
- if kwds.get("engine") == "pyarrow":
- if iterator:
- raise ValueError(
- "The 'iterator' option is not supported with the 'pyarrow' engine"
- )
-
- if chunksize is not None:
- raise ValueError(
- "The 'chunksize' option is not supported with the 'pyarrow' engine"
- )
- else:
- chunksize = validate_integer("chunksize", chunksize, 1)
-
- nrows = kwds.get("nrows", None)
-
- # Check for duplicates in names.
- _validate_names(kwds.get("names", None))
-
- # Create the parser.
- parser = TextFileReader(filepath_or_buffer, **kwds)
-
- if chunksize or iterator:
- return parser
-
- with parser:
- return parser.read(nrows)
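When `chunksize` or `iterator` is requested, `_read` returns the parser itself rather than reading eagerly; a minimal sketch of consuming it, assuming small inline data:

    from io import StringIO
    import pandas as pd

    data = "x\n" + "\n".join(str(i) for i in range(10)) + "\n"
    with pd.read_csv(StringIO(data), chunksize=4) as reader:  # TextFileReader
        sizes = [len(chunk) for chunk in reader]
    # sizes == [4, 4, 2]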
-
-
-# iterator=True -> TextFileReader
-@overload
-def read_csv(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] | None = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: Literal[True],
- chunksize: int | None = ...,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: Literal["high", "legacy"] | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> TextFileReader:
- ...
-
-
-# chunksize=int -> TextFileReader
-@overload
-def read_csv(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] | None = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: bool = ...,
- chunksize: int,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: Literal["high", "legacy"] | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> TextFileReader:
- ...
-
-
-# default case -> DataFrame
-@overload
-def read_csv(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] | None = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: Literal[False] = ...,
- chunksize: None = ...,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: Literal["high", "legacy"] | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> DataFrame:
- ...
-
-
-# Unions -> DataFrame | TextFileReader
-@overload
-def read_csv(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] | None = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: bool = ...,
- chunksize: int | None = ...,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: Literal["high", "legacy"] | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> DataFrame | TextFileReader:
- ...
-
-
-@Appender(
- _doc_read_csv_and_table.format(
- func_name="read_csv",
- summary="Read a comma-separated values (csv) file into DataFrame.",
- _default_sep="','",
- storage_options=_shared_docs["storage_options"],
- decompression_options=_shared_docs["decompression_options"]
- % "filepath_or_buffer",
- )
-)
-def read_csv(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = lib.no_default,
- delimiter: str | None | lib.NoDefault = None,
- # Column and Index Locations and Names
- header: int | Sequence[int] | None | Literal["infer"] = "infer",
- names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
- index_col: IndexLabel | Literal[False] | None = None,
- usecols=None,
- # General Parsing Configuration
- dtype: DtypeArg | None = None,
- engine: CSVEngine | None = None,
- converters=None,
- true_values=None,
- false_values=None,
- skipinitialspace: bool = False,
- skiprows=None,
- skipfooter: int = 0,
- nrows: int | None = None,
- # NA and Missing Data Handling
- na_values=None,
- keep_default_na: bool = True,
- na_filter: bool = True,
- verbose: bool = False,
- skip_blank_lines: bool = True,
- # Datetime Handling
- parse_dates: bool | Sequence[Hashable] | None = None,
- infer_datetime_format: bool | lib.NoDefault = lib.no_default,
- keep_date_col: bool = False,
- date_parser=lib.no_default,
- date_format: str | None = None,
- dayfirst: bool = False,
- cache_dates: bool = True,
- # Iteration
- iterator: bool = False,
- chunksize: int | None = None,
- # Quoting, Compression, and File Format
- compression: CompressionOptions = "infer",
- thousands: str | None = None,
- decimal: str = ".",
- lineterminator: str | None = None,
- quotechar: str = '"',
- quoting: int = csv.QUOTE_MINIMAL,
- doublequote: bool = True,
- escapechar: str | None = None,
- comment: str | None = None,
- encoding: str | None = None,
- encoding_errors: str | None = "strict",
- dialect: str | csv.Dialect | None = None,
- # Error Handling
- on_bad_lines: str = "error",
- # Internal
- delim_whitespace: bool = False,
- low_memory=_c_parser_defaults["low_memory"],
- memory_map: bool = False,
- float_precision: Literal["high", "legacy"] | None = None,
- storage_options: StorageOptions = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame | TextFileReader:
- if infer_datetime_format is not lib.no_default:
- warnings.warn(
- "The argument 'infer_datetime_format' is deprecated and will "
- "be removed in a future version. "
- "A strict version of it is now the default, see "
- "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
- "You can safely remove this argument.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- # locals() should never be modified
- kwds = locals().copy()
- del kwds["filepath_or_buffer"]
- del kwds["sep"]
-
- kwds_defaults = _refine_defaults_read(
- dialect,
- delimiter,
- delim_whitespace,
- engine,
- sep,
- on_bad_lines,
- names,
- defaults={"delimiter": ","},
- dtype_backend=dtype_backend,
- )
- kwds.update(kwds_defaults)
-
- return _read(filepath_or_buffer, kwds)
-
-
-# iterator=True -> TextFileReader
-@overload
-def read_table(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: Literal[True],
- chunksize: int | None = ...,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: str | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> TextFileReader:
- ...
-
-
-# chunksize=int -> TextFileReader
-@overload
-def read_table(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: bool = ...,
- chunksize: int,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: str | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> TextFileReader:
- ...
-
-
-# default -> DataFrame
-@overload
-def read_table(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: Literal[False] = ...,
- chunksize: None = ...,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: str | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> DataFrame:
- ...
-
-
-# Unions -> DataFrame | TextFileReader
-@overload
-def read_table(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = ...,
- delimiter: str | None | lib.NoDefault = ...,
- header: int | Sequence[int] | None | Literal["infer"] = ...,
- names: Sequence[Hashable] | None | lib.NoDefault = ...,
- index_col: IndexLabel | Literal[False] | None = ...,
- usecols=...,
- dtype: DtypeArg | None = ...,
- engine: CSVEngine | None = ...,
- converters=...,
- true_values=...,
- false_values=...,
- skipinitialspace: bool = ...,
- skiprows=...,
- skipfooter: int = ...,
- nrows: int | None = ...,
- na_values=...,
- keep_default_na: bool = ...,
- na_filter: bool = ...,
- verbose: bool = ...,
- skip_blank_lines: bool = ...,
- parse_dates: bool | Sequence[Hashable] = ...,
- infer_datetime_format: bool | lib.NoDefault = ...,
- keep_date_col: bool = ...,
- date_parser=...,
- date_format: str | None = ...,
- dayfirst: bool = ...,
- cache_dates: bool = ...,
- iterator: bool = ...,
- chunksize: int | None = ...,
- compression: CompressionOptions = ...,
- thousands: str | None = ...,
- decimal: str = ...,
- lineterminator: str | None = ...,
- quotechar: str = ...,
- quoting: int = ...,
- doublequote: bool = ...,
- escapechar: str | None = ...,
- comment: str | None = ...,
- encoding: str | None = ...,
- encoding_errors: str | None = ...,
- dialect: str | csv.Dialect | None = ...,
- on_bad_lines=...,
- delim_whitespace: bool = ...,
- low_memory=...,
- memory_map: bool = ...,
- float_precision: str | None = ...,
- storage_options: StorageOptions = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> DataFrame | TextFileReader:
- ...
-
-
-@Appender(
- _doc_read_csv_and_table.format(
- func_name="read_table",
- summary="Read general delimited file into DataFrame.",
- _default_sep=r"'\\t' (tab-stop)",
- storage_options=_shared_docs["storage_options"],
- decompression_options=_shared_docs["decompression_options"]
- % "filepath_or_buffer",
- )
-)
-def read_table(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- sep: str | None | lib.NoDefault = lib.no_default,
- delimiter: str | None | lib.NoDefault = None,
- # Column and Index Locations and Names
- header: int | Sequence[int] | None | Literal["infer"] = "infer",
- names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
- index_col: IndexLabel | Literal[False] | None = None,
- usecols=None,
- # General Parsing Configuration
- dtype: DtypeArg | None = None,
- engine: CSVEngine | None = None,
- converters=None,
- true_values=None,
- false_values=None,
- skipinitialspace: bool = False,
- skiprows=None,
- skipfooter: int = 0,
- nrows: int | None = None,
- # NA and Missing Data Handling
- na_values=None,
- keep_default_na: bool = True,
- na_filter: bool = True,
- verbose: bool = False,
- skip_blank_lines: bool = True,
- # Datetime Handling
- parse_dates: bool | Sequence[Hashable] = False,
- infer_datetime_format: bool | lib.NoDefault = lib.no_default,
- keep_date_col: bool = False,
- date_parser=lib.no_default,
- date_format: str | None = None,
- dayfirst: bool = False,
- cache_dates: bool = True,
- # Iteration
- iterator: bool = False,
- chunksize: int | None = None,
- # Quoting, Compression, and File Format
- compression: CompressionOptions = "infer",
- thousands: str | None = None,
- decimal: str = ".",
- lineterminator: str | None = None,
- quotechar: str = '"',
- quoting: int = csv.QUOTE_MINIMAL,
- doublequote: bool = True,
- escapechar: str | None = None,
- comment: str | None = None,
- encoding: str | None = None,
- encoding_errors: str | None = "strict",
- dialect: str | csv.Dialect | None = None,
- # Error Handling
- on_bad_lines: str = "error",
- # Internal
- delim_whitespace: bool = False,
- low_memory=_c_parser_defaults["low_memory"],
- memory_map: bool = False,
- float_precision: str | None = None,
- storage_options: StorageOptions = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame | TextFileReader:
- if infer_datetime_format is not lib.no_default:
- warnings.warn(
- "The argument 'infer_datetime_format' is deprecated and will "
- "be removed in a future version. "
- "A strict version of it is now the default, see "
- "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
- "You can safely remove this argument.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
-
- # locals() should never be modified
- kwds = locals().copy()
- del kwds["filepath_or_buffer"]
- del kwds["sep"]
-
- kwds_defaults = _refine_defaults_read(
- dialect,
- delimiter,
- delim_whitespace,
- engine,
- sep,
- on_bad_lines,
- names,
- defaults={"delimiter": "\t"},
- dtype_backend=dtype_backend,
- )
- kwds.update(kwds_defaults)
-
- return _read(filepath_or_buffer, kwds)
-
-
-def read_fwf(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- colspecs: Sequence[tuple[int, int]] | str | None = "infer",
- widths: Sequence[int] | None = None,
- infer_nrows: int = 100,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwds,
-) -> DataFrame | TextFileReader:
- r"""
- Read a table of fixed-width formatted lines into DataFrame.
-
- Also supports optionally iterating or breaking of the file
- into chunks.
-
- Additional help can be found in the `online docs for IO Tools
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
-
- Parameters
- ----------
- filepath_or_buffer : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a text ``read()`` function. The string could be a URL.
- Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be:
- ``file://localhost/path/to/table.csv``.
- colspecs : list of tuple (int, int) or 'infer', optional
- A list of tuples giving the extents of the fixed-width
- fields of each line as half-open intervals (i.e., [from, to[ ).
- String value 'infer' can be used to instruct the parser to try
- detecting the column specifications from the first 100 rows of
- the data which are not being skipped via skiprows (default='infer').
- widths : list of int, optional
- A list of field widths which can be used instead of 'colspecs' if
- the intervals are contiguous.
- infer_nrows : int, default 100
- The number of rows to consider when letting the parser determine the
- `colspecs`.
-    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
-        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
-        arrays. Nullable dtypes are used for all dtypes that have a nullable
-        implementation when "numpy_nullable" is set, and pyarrow is used for all
-        dtypes when "pyarrow" is set.
-
-        The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- **kwds : optional
- Optional keyword arguments can be passed to ``TextFileReader``.
-
- Returns
- -------
- DataFrame or TextFileReader
-        A fixed-width formatted file is returned as a two-dimensional
-        data structure with labeled axes.
-
- See Also
- --------
- DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
-
- Examples
- --------
- >>> pd.read_fwf('data.csv') # doctest: +SKIP
- """
- # Check input arguments.
- if colspecs is None and widths is None:
- raise ValueError("Must specify either colspecs or widths")
- if colspecs not in (None, "infer") and widths is not None:
- raise ValueError("You must specify only one of 'widths' and 'colspecs'")
-
- # Compute 'colspecs' from 'widths', if specified.
- if widths is not None:
- colspecs, col = [], 0
- for w in widths:
- colspecs.append((col, col + w))
- col += w
-
- # for mypy
- assert colspecs is not None
-
- # GH#40830
- # Ensure length of `colspecs` matches length of `names`
- names = kwds.get("names")
- if names is not None:
- if len(names) != len(colspecs) and colspecs != "infer":
-            # need to check len(index_col) as it might contain
-            # unnamed indices, in which case its name is not required
- len_index = 0
- if kwds.get("index_col") is not None:
- index_col: Any = kwds.get("index_col")
- if index_col is not False:
- if not is_list_like(index_col):
- len_index = 1
- else:
- len_index = len(index_col)
- if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
- # If usecols is used colspec may be longer than names
- raise ValueError("Length of colspecs must match length of names")
-
- kwds["colspecs"] = colspecs
- kwds["infer_nrows"] = infer_nrows
- kwds["engine"] = "python-fwf"
-
- check_dtype_backend(dtype_backend)
- kwds["dtype_backend"] = dtype_backend
- return _read(filepath_or_buffer, kwds)
-
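A minimal sketch of the widths-to-colspecs conversion performed above, assuming a hypothetical fixed-width file data.txt with three contiguous columns of widths 6, 4 and 10:

    import pandas as pd

    widths = [6, 4, 10]
    # accumulate running offsets into half-open (start, stop) intervals,
    # mirroring what read_fwf does internally when only widths are given
    colspecs, col = [], 0
    for w in widths:
        colspecs.append((col, col + w))
        col += w
    # colspecs == [(0, 6), (6, 10), (10, 20)]

    # either spelling reads the same columns (data.txt is assumed to exist)
    df1 = pd.read_fwf("data.txt", widths=widths)
    df2 = pd.read_fwf("data.txt", colspecs=colspecs)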
-
-class TextFileReader(abc.Iterator):
- """
-
- Passed dialect overrides any of the related parser options
-
- """
-
- def __init__(
- self,
- f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
- engine: CSVEngine | None = None,
- **kwds,
- ) -> None:
- if engine is not None:
- engine_specified = True
- else:
- engine = "python"
- engine_specified = False
- self.engine = engine
- self._engine_specified = kwds.get("engine_specified", engine_specified)
-
- _validate_skipfooter(kwds)
-
- dialect = _extract_dialect(kwds)
- if dialect is not None:
- if engine == "pyarrow":
- raise ValueError(
- "The 'dialect' option is not supported with the 'pyarrow' engine"
- )
- kwds = _merge_with_dialect_properties(dialect, kwds)
-
- if kwds.get("header", "infer") == "infer":
- kwds["header"] = 0 if kwds.get("names") is None else None
-
- self.orig_options = kwds
-
- # miscellanea
- self._currow = 0
-
- options = self._get_options_with_defaults(engine)
- options["storage_options"] = kwds.get("storage_options", None)
-
- self.chunksize = options.pop("chunksize", None)
- self.nrows = options.pop("nrows", None)
-
- self._check_file_or_buffer(f, engine)
- self.options, self.engine = self._clean_options(options, engine)
-
- if "has_index_names" in kwds:
- self.options["has_index_names"] = kwds["has_index_names"]
-
- self.handles: IOHandles | None = None
- self._engine = self._make_engine(f, self.engine)
-
- def close(self) -> None:
- if self.handles is not None:
- self.handles.close()
- self._engine.close()
-
- def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
- kwds = self.orig_options
-
- options = {}
- default: object | None
-
- for argname, default in parser_defaults.items():
- value = kwds.get(argname, default)
-
- # see gh-12935
- if (
- engine == "pyarrow"
- and argname in _pyarrow_unsupported
- and value != default
- and value != getattr(value, "value", default)
- ):
- raise ValueError(
- f"The {repr(argname)} option is not supported with the "
- f"'pyarrow' engine"
- )
- options[argname] = value
-
- for argname, default in _c_parser_defaults.items():
- if argname in kwds:
- value = kwds[argname]
-
- if engine != "c" and value != default:
- if "python" in engine and argname not in _python_unsupported:
- pass
- else:
- raise ValueError(
- f"The {repr(argname)} option is not supported with the "
- f"{repr(engine)} engine"
- )
- else:
- value = default
- options[argname] = value
-
- if engine == "python-fwf":
- for argname, default in _fwf_defaults.items():
- options[argname] = kwds.get(argname, default)
-
- return options
-
- def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
- # see gh-16530
- if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
- # The C engine doesn't need the file-like to have the "__iter__"
- # attribute. However, the Python engine needs "__iter__(...)"
- # when iterating through such an object, meaning it
- # needs to have that attribute
- raise ValueError(
- "The 'python' engine cannot iterate through this file buffer."
- )
-
- def _clean_options(
- self, options: dict[str, Any], engine: CSVEngine
- ) -> tuple[dict[str, Any], CSVEngine]:
- result = options.copy()
-
- fallback_reason = None
-
- # C engine not supported yet
- if engine == "c":
- if options["skipfooter"] > 0:
- fallback_reason = "the 'c' engine does not support skipfooter"
- engine = "python"
-
- sep = options["delimiter"]
- delim_whitespace = options["delim_whitespace"]
-
- if sep is None and not delim_whitespace:
- if engine in ("c", "pyarrow"):
- fallback_reason = (
- f"the '{engine}' engine does not support "
- "sep=None with delim_whitespace=False"
- )
- engine = "python"
- elif sep is not None and len(sep) > 1:
- if engine == "c" and sep == r"\s+":
- result["delim_whitespace"] = True
- del result["delimiter"]
- elif engine not in ("python", "python-fwf"):
- # wait until regex engine integrated
- fallback_reason = (
- f"the '{engine}' engine does not support "
- "regex separators (separators > 1 char and "
- r"different from '\s+' are interpreted as regex)"
- )
- engine = "python"
- elif delim_whitespace:
- if "python" in engine:
- result["delimiter"] = r"\s+"
- elif sep is not None:
- encodeable = True
- encoding = sys.getfilesystemencoding() or "utf-8"
- try:
- if len(sep.encode(encoding)) > 1:
- encodeable = False
- except UnicodeDecodeError:
- encodeable = False
- if not encodeable and engine not in ("python", "python-fwf"):
- fallback_reason = (
- f"the separator encoded in {encoding} "
- f"is > 1 char long, and the '{engine}' engine "
- "does not support such separators"
- )
- engine = "python"
-
- quotechar = options["quotechar"]
- if quotechar is not None and isinstance(quotechar, (str, bytes)):
- if (
- len(quotechar) == 1
- and ord(quotechar) > 127
- and engine not in ("python", "python-fwf")
- ):
- fallback_reason = (
- "ord(quotechar) > 127, meaning the "
- "quotechar is larger than one byte, "
- f"and the '{engine}' engine does not support such quotechars"
- )
- engine = "python"
-
- if fallback_reason and self._engine_specified:
- raise ValueError(fallback_reason)
-
- if engine == "c":
- for arg in _c_unsupported:
- del result[arg]
-
- if "python" in engine:
- for arg in _python_unsupported:
- if fallback_reason and result[arg] != _c_parser_defaults[arg]:
- raise ValueError(
- "Falling back to the 'python' engine because "
- f"{fallback_reason}, but this causes {repr(arg)} to be "
- "ignored as it is not supported by the 'python' engine."
- )
- del result[arg]
-
- if fallback_reason:
- warnings.warn(
- (
- "Falling back to the 'python' engine because "
- f"{fallback_reason}; you can avoid this warning by specifying "
- "engine='python'."
- ),
- ParserWarning,
- stacklevel=find_stack_level(),
- )
-
- index_col = options["index_col"]
- names = options["names"]
- converters = options["converters"]
- na_values = options["na_values"]
- skiprows = options["skiprows"]
-
- validate_header_arg(options["header"])
-
- if index_col is True:
- raise ValueError("The value of index_col couldn't be 'True'")
- if is_index_col(index_col):
- if not isinstance(index_col, (list, tuple, np.ndarray)):
- index_col = [index_col]
- result["index_col"] = index_col
-
- names = list(names) if names is not None else names
-
- # type conversion-related
- if converters is not None:
- if not isinstance(converters, dict):
- raise TypeError(
- "Type converters must be a dict or subclass, "
- f"input was a {type(converters).__name__}"
- )
- else:
- converters = {}
-
- # Converting values to NA
- keep_default_na = options["keep_default_na"]
- na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)
-
- # handle skiprows; this is internally handled by the
- # c-engine, so only need for python and pyarrow parsers
- if engine == "pyarrow":
- if not is_integer(skiprows) and skiprows is not None:
- # pyarrow expects skiprows to be passed as an integer
- raise ValueError(
- "skiprows argument must be an integer when using "
- "engine='pyarrow'"
- )
- else:
- if is_integer(skiprows):
- skiprows = list(range(skiprows))
- if skiprows is None:
- skiprows = set()
- elif not callable(skiprows):
- skiprows = set(skiprows)
-
- # put stuff back
- result["names"] = names
- result["converters"] = converters
- result["na_values"] = na_values
- result["na_fvalues"] = na_fvalues
- result["skiprows"] = skiprows
-
- return result, engine
-
- def __next__(self) -> DataFrame:
- try:
- return self.get_chunk()
- except StopIteration:
- self.close()
- raise
-
- def _make_engine(
- self,
- f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
- engine: CSVEngine = "c",
- ) -> ParserBase:
- mapping: dict[str, type[ParserBase]] = {
- "c": CParserWrapper,
- "python": PythonParser,
- "pyarrow": ArrowParserWrapper,
- "python-fwf": FixedWidthFieldParser,
- }
- if engine not in mapping:
- raise ValueError(
- f"Unknown engine: {engine} (valid options are {mapping.keys()})"
- )
- if not isinstance(f, list):
- # open file here
- is_text = True
- mode = "r"
- if engine == "pyarrow":
- is_text = False
- mode = "rb"
- elif (
- engine == "c"
- and self.options.get("encoding", "utf-8") == "utf-8"
- and isinstance(stringify_path(f), str)
- ):
-                # The c engine can decode utf-8 bytes directly; adding a
-                # TextIOWrapper makes the c engine far slower, especially
-                # with memory_map=True
- is_text = False
- if "b" not in mode:
- mode += "b"
- self.handles = get_handle(
- f,
- mode,
- encoding=self.options.get("encoding", None),
- compression=self.options.get("compression", None),
- memory_map=self.options.get("memory_map", False),
- is_text=is_text,
- errors=self.options.get("encoding_errors", "strict"),
- storage_options=self.options.get("storage_options", None),
- )
- assert self.handles is not None
- f = self.handles.handle
-
- elif engine != "python":
- msg = f"Invalid file path or buffer object type: {type(f)}"
- raise ValueError(msg)
-
- try:
- return mapping[engine](f, **self.options)
- except Exception:
- if self.handles is not None:
- self.handles.close()
- raise
-
- def _failover_to_python(self) -> None:
- raise AbstractMethodError(self)
-
- def read(self, nrows: int | None = None) -> DataFrame:
- if self.engine == "pyarrow":
- try:
- # error: "ParserBase" has no attribute "read"
- df = self._engine.read() # type: ignore[attr-defined]
- except Exception:
- self.close()
- raise
- else:
- nrows = validate_integer("nrows", nrows)
- try:
- # error: "ParserBase" has no attribute "read"
- (
- index,
- columns,
- col_dict,
- ) = self._engine.read( # type: ignore[attr-defined]
- nrows
- )
- except Exception:
- self.close()
- raise
-
- if index is None:
- if col_dict:
- # Any column is actually fine:
- new_rows = len(next(iter(col_dict.values())))
- index = RangeIndex(self._currow, self._currow + new_rows)
- else:
- new_rows = 0
- else:
- new_rows = len(index)
-
- df = DataFrame(col_dict, columns=columns, index=index)
-
- self._currow += new_rows
- return df
-
- def get_chunk(self, size: int | None = None) -> DataFrame:
- if size is None:
- size = self.chunksize
- if self.nrows is not None:
- if self._currow >= self.nrows:
- raise StopIteration
- size = min(size, self.nrows - self._currow)
- return self.read(nrows=size)
-
- def __enter__(self) -> TextFileReader:
- return self
-
- def __exit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- traceback: TracebackType | None,
- ) -> None:
- self.close()
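TextFileReader is what read_csv returns when iterator=True or chunksize is set; get_chunk caps each request at nrows minus the rows already read and raises StopIteration once nrows is exhausted. A minimal usage sketch, assuming a local file big.csv exists:

    import pandas as pd

    # chunked iteration: each item is a DataFrame of up to 100_000 rows
    with pd.read_csv("big.csv", chunksize=100_000) as reader:
        total = sum(len(chunk) for chunk in reader)

    # explicit chunk pulls via the iterator interface
    with pd.read_csv("big.csv", iterator=True) as reader:
        first = reader.get_chunk(10)   # first 10 rows
        nxt = reader.get_chunk(5)      # next 5 rows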
-
-
-def TextParser(*args, **kwds) -> TextFileReader:
- """
- Converts lists of lists/tuples into DataFrames with proper type inference
- and optional (e.g. string to datetime) conversion. Also enables iterating
-    lazily over chunks of large files.
-
- Parameters
- ----------
- data : file-like object or list
- delimiter : separator character to use
- dialect : str or csv.Dialect instance, optional
- Ignored if delimiter is longer than 1 character
- names : sequence, default
- header : int, default 0
- Row to use to parse column labels. Defaults to the first row. Prior
- rows will be discarded
- index_col : int or list, optional
- Column or columns to use as the (possibly hierarchical) index
- has_index_names: bool, default False
- True if the cols defined in index_col have an index name and are
- not in the header.
- na_values : scalar, str, list-like, or dict, optional
- Additional strings to recognize as NA/NaN.
- keep_default_na : bool, default True
- thousands : str, optional
- Thousands separator
- comment : str, optional
- Comment out remainder of line
- parse_dates : bool, default False
- keep_date_col : bool, default False
- date_parser : function, optional
-
- .. deprecated:: 2.0.0
- date_format : str or dict of column -> format, default ``None``
-
- .. versionadded:: 2.0.0
- skiprows : list of integers
- Row numbers to skip
- skipfooter : int
-        Number of lines at bottom of file to skip
- converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can
- either be integers or column labels, values are functions that take one
- input argument, the cell (not column) content, and return the
- transformed content.
- encoding : str, optional
- Encoding to use for UTF when reading/writing (ex. 'utf-8')
- float_precision : str, optional
- Specifies which converter the C engine should use for floating-point
- values. The options are `None` or `high` for the ordinary converter,
- `legacy` for the original lower precision pandas converter, and
- `round_trip` for the round-trip converter.
-
- .. versionchanged:: 1.2
- """
- kwds["engine"] = "python"
- return TextFileReader(*args, **kwds)
-
-
-def _clean_na_values(na_values, keep_default_na: bool = True):
- na_fvalues: set | dict
- if na_values is None:
- if keep_default_na:
- na_values = STR_NA_VALUES
- else:
- na_values = set()
- na_fvalues = set()
- elif isinstance(na_values, dict):
- old_na_values = na_values.copy()
- na_values = {} # Prevent aliasing.
-
- # Convert the values in the na_values dictionary
- # into array-likes for further use. This is also
- # where we append the default NaN values, provided
- # that `keep_default_na=True`.
- for k, v in old_na_values.items():
- if not is_list_like(v):
- v = [v]
-
- if keep_default_na:
- v = set(v) | STR_NA_VALUES
-
- na_values[k] = v
- na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
- else:
- if not is_list_like(na_values):
- na_values = [na_values]
- na_values = _stringify_na_values(na_values)
- if keep_default_na:
- na_values = na_values | STR_NA_VALUES
-
- na_fvalues = _floatify_na_values(na_values)
-
- return na_values, na_fvalues
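At the user level this NA cleaning shows up as per-column sentinels being unioned with the default NA strings. A rough sketch of the effect, assuming a hypothetical file scores.csv:

    import pandas as pd

    # the default "n/a"-style sentinels stay active everywhere, and
    # "missing" is additionally treated as NA only in column "a";
    # with keep_default_na=False only the explicit sentinels would apply
    df = pd.read_csv(
        "scores.csv",
        na_values={"a": ["missing"]},
        keep_default_na=True,
    )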
-
-
-def _floatify_na_values(na_values):
- # create float versions of the na_values
- result = set()
- for v in na_values:
- try:
- v = float(v)
- if not np.isnan(v):
- result.add(v)
- except (TypeError, ValueError, OverflowError):
- pass
- return result
-
-
-def _stringify_na_values(na_values):
- """return a stringified and numeric for these values"""
- result: list[str | float] = []
- for x in na_values:
- result.append(str(x))
- result.append(x)
- try:
- v = float(x)
-
-            # e.g. for a value like 999
- if v == int(v):
- v = int(v)
- result.append(f"{v}.0")
- result.append(str(v))
-
- result.append(v)
- except (TypeError, ValueError, OverflowError):
- pass
- try:
- result.append(int(x))
- except (TypeError, ValueError, OverflowError):
- pass
- return set(result)
-
-
-def _refine_defaults_read(
- dialect: str | csv.Dialect | None,
- delimiter: str | None | lib.NoDefault,
- delim_whitespace: bool,
- engine: CSVEngine | None,
- sep: str | None | lib.NoDefault,
- on_bad_lines: str | Callable,
- names: Sequence[Hashable] | None | lib.NoDefault,
- defaults: dict[str, Any],
- dtype_backend: DtypeBackend | lib.NoDefault,
-):
- """Validate/refine default values of input parameters of read_csv, read_table.
-
- Parameters
- ----------
- dialect : str or csv.Dialect
- If provided, this parameter will override values (default or not) for the
- following parameters: `delimiter`, `doublequote`, `escapechar`,
- `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
- override values, a ParserWarning will be issued. See csv.Dialect
- documentation for more details.
- delimiter : str or object
- Alias for sep.
- delim_whitespace : bool
- Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
- used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
- is set to True, nothing should be passed in for the ``delimiter``
- parameter.
- engine : {{'c', 'python'}}
- Parser engine to use. The C engine is faster while the python engine is
- currently more feature-complete.
- sep : str or object
- A delimiter provided by the user (str) or a sentinel value, i.e.
- pandas._libs.lib.no_default.
- on_bad_lines : str, callable
-        An option for handling bad lines or a sentinel value (None).
- names : array-like, optional
- List of column names to use. If the file contains a header row,
- then you should explicitly pass ``header=0`` to override the column names.
- Duplicates in this list are not allowed.
-    defaults : dict
- Default values of input parameters.
-
- Returns
- -------
- kwds : dict
- Input parameters with correct values.
-
- Raises
- ------
- ValueError :
- If a delimiter was specified with ``sep`` (or ``delimiter``) and
- ``delim_whitespace=True``.
- """
- # fix types for sep, delimiter to Union(str, Any)
- delim_default = defaults["delimiter"]
- kwds: dict[str, Any] = {}
- # gh-23761
- #
- # When a dialect is passed, it overrides any of the overlapping
- # parameters passed in directly. We don't want to warn if the
- # default parameters were passed in (since it probably means
- # that the user didn't pass them in explicitly in the first place).
- #
- # "delimiter" is the annoying corner case because we alias it to
- # "sep" before doing comparison to the dialect values later on.
- # Thus, we need a flag to indicate that we need to "override"
- # the comparison to dialect values by checking if default values
- # for BOTH "delimiter" and "sep" were provided.
- if dialect is not None:
- kwds["sep_override"] = delimiter is None and (
- sep is lib.no_default or sep == delim_default
- )
-
- if delimiter and (sep is not lib.no_default):
- raise ValueError("Specified a sep and a delimiter; you can only specify one.")
-
- kwds["names"] = None if names is lib.no_default else names
-
- # Alias sep -> delimiter.
- if delimiter is None:
- delimiter = sep
-
- if delim_whitespace and (delimiter is not lib.no_default):
- raise ValueError(
- "Specified a delimiter with both sep and "
- "delim_whitespace=True; you can only specify one."
- )
-
- if delimiter == "\n":
- raise ValueError(
- r"Specified \n as separator or delimiter. This forces the python engine "
- "which does not accept a line terminator. Hence it is not allowed to use "
- "the line terminator as separator.",
- )
-
- if delimiter is lib.no_default:
- # assign default separator value
- kwds["delimiter"] = delim_default
- else:
- kwds["delimiter"] = delimiter
-
- if engine is not None:
- kwds["engine_specified"] = True
- else:
- kwds["engine"] = "c"
- kwds["engine_specified"] = False
-
- if on_bad_lines == "error":
- kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.ERROR
- elif on_bad_lines == "warn":
- kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.WARN
- elif on_bad_lines == "skip":
- kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
- elif callable(on_bad_lines):
- if engine != "python":
- raise ValueError(
- "on_bad_line can only be a callable function if engine='python'"
- )
- kwds["on_bad_lines"] = on_bad_lines
- else:
- raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
-
- check_dtype_backend(dtype_backend)
-
- kwds["dtype_backend"] = dtype_backend
-
- return kwds
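Two of the consistency checks above surface directly in the public API: ``sep`` and ``delimiter`` are aliases and may not both be given, and an explicit delimiter may not be combined with ``delim_whitespace=True``. A quick sketch of the resulting behaviour, assuming a local file data.csv:

    import pandas as pd

    # ValueError: Specified a sep and a delimiter; you can only specify one.
    # pd.read_csv("data.csv", sep=";", delimiter=",")

    # ValueError: Specified a delimiter with both sep and delim_whitespace=True
    # pd.read_csv("data.csv", sep=",", delim_whitespace=True)

    # fine: delimiter alone is simply aliased onto sep
    df = pd.read_csv("data.csv", delimiter=";")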
-
-
-def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
- """
- Extract concrete csv dialect instance.
-
- Returns
- -------
- csv.Dialect or None
- """
- if kwds.get("dialect") is None:
- return None
-
- dialect = kwds["dialect"]
- if dialect in csv.list_dialects():
- dialect = csv.get_dialect(dialect)
-
- _validate_dialect(dialect)
-
- return dialect
-
-
-MANDATORY_DIALECT_ATTRS = (
- "delimiter",
- "doublequote",
- "escapechar",
- "skipinitialspace",
- "quotechar",
- "quoting",
-)
-
-
-def _validate_dialect(dialect: csv.Dialect) -> None:
- """
- Validate csv dialect instance.
-
- Raises
- ------
- ValueError
- If incorrect dialect is provided.
- """
- for param in MANDATORY_DIALECT_ATTRS:
- if not hasattr(dialect, param):
- raise ValueError(f"Invalid dialect {dialect} provided")
-
-
-def _merge_with_dialect_properties(
- dialect: csv.Dialect,
- defaults: dict[str, Any],
-) -> dict[str, Any]:
- """
- Merge default kwargs in TextFileReader with dialect parameters.
-
- Parameters
- ----------
- dialect : csv.Dialect
- Concrete csv dialect. See csv.Dialect documentation for more details.
- defaults : dict
- Keyword arguments passed to TextFileReader.
-
- Returns
- -------
- kwds : dict
- Updated keyword arguments, merged with dialect parameters.
- """
- kwds = defaults.copy()
-
- for param in MANDATORY_DIALECT_ATTRS:
- dialect_val = getattr(dialect, param)
-
- parser_default = parser_defaults[param]
- provided = kwds.get(param, parser_default)
-
- # Messages for conflicting values between the dialect
- # instance and the actual parameters provided.
- conflict_msgs = []
-
- # Don't warn if the default parameter was passed in,
- # even if it conflicts with the dialect (gh-23761).
- if provided not in (parser_default, dialect_val):
- msg = (
- f"Conflicting values for '{param}': '{provided}' was "
- f"provided, but the dialect specifies '{dialect_val}'. "
- "Using the dialect-specified value."
- )
-
- # Annoying corner case for not warning about
- # conflicts between dialect and delimiter parameter.
- # Refer to the outer "_read_" function for more info.
- if not (param == "delimiter" and kwds.pop("sep_override", False)):
- conflict_msgs.append(msg)
-
- if conflict_msgs:
- warnings.warn(
- "\n\n".join(conflict_msgs), ParserWarning, stacklevel=find_stack_level()
- )
- kwds[param] = dialect_val
- return kwds
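In user terms, a registered or ad-hoc ``csv.Dialect`` wins over any conflicting keyword that was passed explicitly, and a ParserWarning is emitted for the conflict. A small sketch, assuming a local file data.csv delimited by pipes:

    import csv
    import pandas as pd

    class PipeDialect(csv.Dialect):
        delimiter = "|"
        quotechar = '"'
        doublequote = True
        skipinitialspace = False
        lineterminator = "\r\n"
        quoting = csv.QUOTE_MINIMAL

    csv.register_dialect("pipes", PipeDialect)

    # the dialect's delimiter "|" is used; passing a conflicting non-default
    # sep as well would trigger a ParserWarning about the conflict
    df = pd.read_csv("data.csv", dialect="pipes")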
-
-
-def _validate_skipfooter(kwds: dict[str, Any]) -> None:
- """
- Check whether skipfooter is compatible with other kwargs in TextFileReader.
-
- Parameters
- ----------
- kwds : dict
- Keyword arguments passed to TextFileReader.
-
- Raises
- ------
- ValueError
- If skipfooter is not compatible with other parameters.
- """
- if kwds.get("skipfooter"):
- if kwds.get("iterator") or kwds.get("chunksize"):
- raise ValueError("'skipfooter' not supported for iteration")
- if kwds.get("nrows"):
- raise ValueError("'skipfooter' not supported with 'nrows'")
diff --git a/contrib/python/pandas/py3/pandas/io/pickle.py b/contrib/python/pandas/py3/pandas/io/pickle.py
deleted file mode 100644
index a9ab925536d..00000000000
--- a/contrib/python/pandas/py3/pandas/io/pickle.py
+++ /dev/null
@@ -1,204 +0,0 @@
-""" pickle compat """
-from __future__ import annotations
-
-import pickle
-from typing import Any
-import warnings
-
-from pandas._typing import (
- CompressionOptions,
- FilePath,
- ReadPickleBuffer,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.compat import pickle_compat as pc
-from pandas.util._decorators import doc
-
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import get_handle
-
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "filepath_or_buffer",
-)
-def to_pickle(
- obj: Any,
- filepath_or_buffer: FilePath | WriteBuffer[bytes],
- compression: CompressionOptions = "infer",
- protocol: int = pickle.HIGHEST_PROTOCOL,
- storage_options: StorageOptions = None,
-) -> None:
- """
- Pickle (serialize) object to file.
-
- Parameters
- ----------
- obj : any object
- Any python object.
- filepath_or_buffer : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function.
-        Also accepts a URL. The URL has to be of S3 or GCS.
- {compression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- protocol : int
- Int which indicates which protocol should be used by the pickler,
- default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
- values for this parameter depend on the version of Python. For Python
- 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
- For Python >= 3.4, 4 is a valid value. A negative value for the
- protocol parameter is equivalent to setting its value to
- HIGHEST_PROTOCOL.
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- .. [1] https://docs.python.org/3/library/pickle.html
-
- See Also
- --------
- read_pickle : Load pickled pandas object (or any object) from file.
- DataFrame.to_hdf : Write DataFrame to an HDF5 file.
- DataFrame.to_sql : Write DataFrame to a SQL database.
- DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
-
- Examples
- --------
- >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
- >>> original_df # doctest: +SKIP
- foo bar
- 0 0 5
- 1 1 6
- 2 2 7
- 3 3 8
- 4 4 9
- >>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP
-
- >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
- >>> unpickled_df # doctest: +SKIP
- foo bar
- 0 0 5
- 1 1 6
- 2 2 7
- 3 3 8
- 4 4 9
- """ # noqa: E501
- if protocol < 0:
- protocol = pickle.HIGHEST_PROTOCOL
-
- with get_handle(
- filepath_or_buffer,
- "wb",
- compression=compression,
- is_text=False,
- storage_options=storage_options,
- ) as handles:
- # letting pickle write directly to the buffer is more memory-efficient
- pickle.dump(obj, handles.handle, protocol=protocol)
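The body above is mostly plumbing: negative protocols are normalized to ``HIGHEST_PROTOCOL`` and ``get_handle`` infers the compression from the file extension. A minimal round-trip sketch, assuming write access to the working directory and a hypothetical file name dummy.pkl.gz:

    import pandas as pd

    df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})

    # compression="infer" (the default) picks gzip from the ".gz" suffix;
    # protocol=-1 is treated as pickle.HIGHEST_PROTOCOL
    df.to_pickle("dummy.pkl.gz", protocol=-1)

    roundtripped = pd.read_pickle("dummy.pkl.gz")
    assert roundtripped.equals(df)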
-
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer",
-)
-def read_pickle(
- filepath_or_buffer: FilePath | ReadPickleBuffer,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
-):
- """
- Load pickled pandas object (or any object) from file.
-
- .. warning::
-
- Loading pickled data received from untrusted sources can be
- unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
-
- Parameters
- ----------
- filepath_or_buffer : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``readlines()`` function.
-        Also accepts a URL. The URL is not limited to S3 and GCS.
-
- {decompression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- Returns
- -------
- same type as object stored in file
-
- See Also
- --------
- DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
- Series.to_pickle : Pickle (serialize) Series object to file.
- read_hdf : Read HDF5 file into a DataFrame.
- read_sql : Read SQL query or database table into a DataFrame.
- read_parquet : Load a parquet object, returning a DataFrame.
-
- Notes
- -----
- read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3
- provided the object was serialized with to_pickle.
-
- Examples
- --------
- >>> original_df = pd.DataFrame(
- ... {{"foo": range(5), "bar": range(5, 10)}}
- ... ) # doctest: +SKIP
- >>> original_df # doctest: +SKIP
- foo bar
- 0 0 5
- 1 1 6
- 2 2 7
- 3 3 8
- 4 4 9
- >>> pd.to_pickle(original_df, "./dummy.pkl") # doctest: +SKIP
-
- >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
- >>> unpickled_df # doctest: +SKIP
- foo bar
- 0 0 5
- 1 1 6
- 2 2 7
- 3 3 8
- 4 4 9
- """
- excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
- with get_handle(
- filepath_or_buffer,
- "rb",
- compression=compression,
- is_text=False,
- storage_options=storage_options,
- ) as handles:
- # 1) try standard library Pickle
- # 2) try pickle_compat (older pandas version) to handle subclass changes
- # 3) try pickle_compat with latin-1 encoding upon a UnicodeDecodeError
-
- try:
- # TypeError for Cython complaints about object.__new__ vs Tick.__new__
- try:
- with warnings.catch_warnings(record=True):
- # We want to silence any warnings about, e.g. moved modules.
- warnings.simplefilter("ignore", Warning)
- return pickle.load(handles.handle)
- except excs_to_catch:
- # e.g.
- # "No module named 'pandas.core.sparse.series'"
- # "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
- return pc.load(handles.handle, encoding=None)
- except UnicodeDecodeError:
- # e.g. can occur for files written in py27; see GH#28645 and GH#31988
- return pc.load(handles.handle, encoding="latin-1")
diff --git a/contrib/python/pandas/py3/pandas/io/pytables.py b/contrib/python/pandas/py3/pandas/io/pytables.py
deleted file mode 100644
index 746c9459616..00000000000
--- a/contrib/python/pandas/py3/pandas/io/pytables.py
+++ /dev/null
@@ -1,5289 +0,0 @@
-"""
-High level interface to PyTables for reading and writing pandas data structures
-to disk
-"""
-from __future__ import annotations
-
-from contextlib import suppress
-import copy
-from datetime import (
- date,
- tzinfo,
-)
-import itertools
-import os
-import re
-from textwrap import dedent
-from types import TracebackType
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Final,
- Hashable,
- Iterator,
- Literal,
- Sequence,
- cast,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._config import (
- config,
- get_option,
-)
-
-from pandas._libs import (
- lib,
- writers as libwriters,
-)
-from pandas._libs.tslibs import timezones
-from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- AxisInt,
- DtypeArg,
- FilePath,
- Shape,
- npt,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.compat.pickle_compat import patch_pickle
-from pandas.errors import (
- AttributeConflictWarning,
- ClosedFileError,
- IncompatibilityWarning,
- PerformanceWarning,
- PossibleDataLossError,
-)
-from pandas.util._decorators import cache_readonly
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- ensure_object,
- is_bool_dtype,
- is_categorical_dtype,
- is_complex_dtype,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_extension_array_dtype,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_string_dtype,
- is_timedelta64_dtype,
- needs_i8_conversion,
-)
-from pandas.core.dtypes.missing import array_equivalent
-
-from pandas import (
- DataFrame,
- DatetimeIndex,
- Index,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- Series,
- TimedeltaIndex,
- concat,
- isna,
-)
-from pandas.core.arrays import (
- Categorical,
- DatetimeArray,
- PeriodArray,
-)
-import pandas.core.common as com
-from pandas.core.computation.pytables import (
- PyTablesExpr,
- maybe_expression,
-)
-from pandas.core.construction import extract_array
-from pandas.core.indexes.api import ensure_index
-from pandas.core.internals import (
- ArrayManager,
- BlockManager,
-)
-
-from pandas.io.common import stringify_path
-from pandas.io.formats.printing import (
- adjoin,
- pprint_thing,
-)
-
-if TYPE_CHECKING:
- from tables import (
- Col,
- File,
- Node,
- )
-
- from pandas.core.internals import Block
-
-
-# versioning attribute
-_version = "0.15.2"
-
-# encoding
-_default_encoding = "UTF-8"
-
-
-def _ensure_decoded(s):
- """if we have bytes, decode them to unicode"""
- if isinstance(s, np.bytes_):
- s = s.decode("UTF-8")
- return s
-
-
-def _ensure_encoding(encoding: str | None) -> str:
- # set the encoding if we need
- if encoding is None:
- encoding = _default_encoding
-
- return encoding
-
-
-def _ensure_str(name):
- """
-    Ensure that an index / column name is a str (python 3); otherwise it
-    may be np.string dtype. Non-string dtypes are passed through unchanged.
-
- https://github.com/pandas-dev/pandas/issues/13492
- """
- if isinstance(name, str):
- name = str(name)
- return name
-
-
-Term = PyTablesExpr
-
-
-def _ensure_term(where, scope_level: int):
- """
- Ensure that the where is a Term or a list of Term.
-
-    This makes sure that we are capturing the scope of variables that are
-    passed; the terms are created here with a frame_level=2 (we are 2 levels down)
- """
- # only consider list/tuple here as an ndarray is automatically a coordinate
- # list
- level = scope_level + 1
- if isinstance(where, (list, tuple)):
- where = [
- Term(term, scope_level=level + 1) if maybe_expression(term) else term
- for term in where
- if term is not None
- ]
- elif maybe_expression(where):
- where = Term(where, scope_level=level)
- return where if where is None or len(where) else None
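``_ensure_term`` is what lets plain strings be passed as ``where`` criteria: each convertible string (or list element) is wrapped in a ``Term`` (a ``PyTablesExpr``) before selection. A hedged usage sketch, assuming a hypothetical store.h5 that holds a table-format frame under the key "df":

    import pandas as pd

    with pd.HDFStore("store.h5", mode="r") as store:
        # a plain string is converted to a Term internally; lists of
        # strings/Terms are handled the same way
        subset = store.select("df", where="index > 100")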
-
-
-incompatibility_doc: Final = """
-where criteria is being ignored as this version [%s] is too old (or
-not-defined), read the file in and write it out to a new file to upgrade (with
-the copy_to method)
-"""
-
-attribute_conflict_doc: Final = """
-the [%s] attribute of the existing index is [%s] which conflicts with the new
-[%s], resetting the attribute to None
-"""
-
-performance_doc: Final = """
-your performance may suffer as PyTables will pickle object types that it cannot
-map directly to c-types [inferred_type->%s,key->%s] [items->%s]
-"""
-
-# formats
-_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
-
-# axes map
-_AXES_MAP = {DataFrame: [0]}
-
-# register our configuration options
-dropna_doc: Final = """
-: boolean
- drop ALL nan rows when appending to a table
-"""
-format_doc: Final = """
-: format
-    default format for writing; if None, then
- put will default to 'fixed' and append will default to 'table'
-"""
-
-with config.config_prefix("io.hdf"):
- config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
- config.register_option(
- "default_format",
- None,
- format_doc,
- validator=config.is_one_of_factory(["fixed", "table", None]),
- )
-
-# oh the troubles to reduce import time
-_table_mod = None
-_table_file_open_policy_is_strict = False
-
-
-def _tables():
- global _table_mod
- global _table_file_open_policy_is_strict
- if _table_mod is None:
- import tables
-
- _table_mod = tables
-
- # set the file open policy
- # return the file open policy; this changes as of pytables 3.1
- # depending on the HDF5 version
- with suppress(AttributeError):
- _table_file_open_policy_is_strict = (
- tables.file._FILE_OPEN_POLICY == "strict"
- )
-
- return _table_mod
-
-
-# interface to/from ###
-
-
-def to_hdf(
- path_or_buf: FilePath | HDFStore,
- key: str,
- value: DataFrame | Series,
- mode: str = "a",
- complevel: int | None = None,
- complib: str | None = None,
- append: bool = False,
- format: str | None = None,
- index: bool = True,
- min_itemsize: int | dict[str, int] | None = None,
- nan_rep=None,
- dropna: bool | None = None,
- data_columns: Literal[True] | list[str] | None = None,
- errors: str = "strict",
- encoding: str = "UTF-8",
-) -> None:
- """store this object, close it if we opened it"""
- if append:
- f = lambda store: store.append(
- key,
- value,
- format=format,
- index=index,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- dropna=dropna,
- data_columns=data_columns,
- errors=errors,
- encoding=encoding,
- )
- else:
- # NB: dropna is not passed to `put`
- f = lambda store: store.put(
- key,
- value,
- format=format,
- index=index,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- data_columns=data_columns,
- errors=errors,
- encoding=encoding,
- dropna=dropna,
- )
-
- path_or_buf = stringify_path(path_or_buf)
- if isinstance(path_or_buf, str):
- with HDFStore(
- path_or_buf, mode=mode, complevel=complevel, complib=complib
- ) as store:
- f(store)
- else:
- f(path_or_buf)
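``to_hdf`` only decides between ``HDFStore.append`` and ``HDFStore.put`` and opens a store itself when given a path. A short sketch of the two paths, assuming PyTables is installed and a hypothetical writable file store.h5:

    import pandas as pd

    df = pd.DataFrame({"a": range(3)})

    # put: fixed format by default, overwrites the key
    df.to_hdf("store.h5", key="df_fixed", mode="w")

    # append: forces the table format and adds rows to an existing key
    df.to_hdf("store.h5", key="df_table", format="table", append=True)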
-
-
-def read_hdf(
- path_or_buf: FilePath | HDFStore,
- key=None,
- mode: str = "r",
- errors: str = "strict",
- where: str | list | None = None,
- start: int | None = None,
- stop: int | None = None,
- columns: list[str] | None = None,
- iterator: bool = False,
- chunksize: int | None = None,
- **kwargs,
-):
- """
- Read from the store, close it if we opened it.
-
- Retrieve pandas object stored in file, optionally based on where
- criteria.
-
- .. warning::
-
- Pandas uses PyTables for reading and writing HDF5 files, which allows
- serializing object-dtype data with pickle when using the "fixed" format.
- Loading pickled data received from untrusted sources can be unsafe.
-
- See: https://docs.python.org/3/library/pickle.html for more.
-
- Parameters
- ----------
- path_or_buf : str, path object, pandas.HDFStore
-        Any valid string path is acceptable. Only the local file system is
-        supported; remote URLs and file-like objects are not supported.
-
- If you want to pass in a path object, pandas accepts any
- ``os.PathLike``.
-
- Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
-
- key : object, optional
- The group identifier in the store. Can be omitted if the HDF file
- contains a single pandas object.
- mode : {'r', 'r+', 'a'}, default 'r'
- Mode to use when opening the file. Ignored if path_or_buf is a
- :class:`pandas.HDFStore`. Default is 'r'.
- errors : str, default 'strict'
- Specifies how encoding and decoding errors are to be handled.
- See the errors argument for :func:`open` for a full list
- of options.
- where : list, optional
- A list of Term (or convertible) objects.
- start : int, optional
- Row number to start selection.
- stop : int, optional
- Row number to stop selection.
- columns : list, optional
- A list of columns names to return.
- iterator : bool, optional
- Return an iterator object.
- chunksize : int, optional
- Number of rows to include in an iteration when using an iterator.
- **kwargs
- Additional keyword arguments passed to HDFStore.
-
- Returns
- -------
- object
- The selected object. Return type depends on the object stored.
-
- See Also
- --------
- DataFrame.to_hdf : Write a HDF file from a DataFrame.
- HDFStore : Low-level access to HDF files.
-
- Examples
- --------
- >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
- >>> df.to_hdf('./store.h5', 'data') # doctest: +SKIP
- >>> reread = pd.read_hdf('./store.h5') # doctest: +SKIP
- """
- if mode not in ["r", "r+", "a"]:
- raise ValueError(
- f"mode {mode} is not allowed while performing a read. "
- f"Allowed modes are r, r+ and a."
- )
- # grab the scope
- if where is not None:
- where = _ensure_term(where, scope_level=1)
-
- if isinstance(path_or_buf, HDFStore):
- if not path_or_buf.is_open:
- raise OSError("The HDFStore must be open for reading.")
-
- store = path_or_buf
- auto_close = False
- else:
- path_or_buf = stringify_path(path_or_buf)
- if not isinstance(path_or_buf, str):
- raise NotImplementedError(
- "Support for generic buffers has not been implemented."
- )
- try:
- exists = os.path.exists(path_or_buf)
-
- # if filepath is too long
- except (TypeError, ValueError):
- exists = False
-
- if not exists:
- raise FileNotFoundError(f"File {path_or_buf} does not exist")
-
- store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
- # can't auto open/close if we are using an iterator
- # so delegate to the iterator
- auto_close = True
-
- try:
- if key is None:
- groups = store.groups()
- if len(groups) == 0:
- raise ValueError(
- "Dataset(s) incompatible with Pandas data types, "
- "not table, or no datasets found in HDF5 file."
- )
- candidate_only_group = groups[0]
-
- # For the HDF file to have only one dataset, all other groups
- # should then be metadata groups for that candidate group. (This
- # assumes that the groups() method enumerates parent groups
- # before their children.)
- for group_to_check in groups[1:]:
- if not _is_metadata_of(group_to_check, candidate_only_group):
- raise ValueError(
- "key must be provided when HDF5 "
- "file contains multiple datasets."
- )
- key = candidate_only_group._v_pathname
- return store.select(
- key,
- where=where,
- start=start,
- stop=stop,
- columns=columns,
- iterator=iterator,
- chunksize=chunksize,
- auto_close=auto_close,
- )
- except (ValueError, TypeError, KeyError):
- if not isinstance(path_or_buf, HDFStore):
- # if there is an error, close the store if we opened it.
- with suppress(AttributeError):
- store.close()
-
- raise
-
-
-def _is_metadata_of(group: Node, parent_group: Node) -> bool:
- """Check if a given group is a metadata group for a given parent_group."""
- if group._v_depth <= parent_group._v_depth:
- return False
-
- current = group
- while current._v_depth > 1:
- parent = current._v_parent
- if parent == parent_group and current._v_name == "meta":
- return True
- current = current._v_parent
- return False
-
-
-class HDFStore:
- """
- Dict-like IO interface for storing pandas objects in PyTables.
-
- Either Fixed or Table format.
-
- .. warning::
-
- Pandas uses PyTables for reading and writing HDF5 files, which allows
- serializing object-dtype data with pickle when using the "fixed" format.
- Loading pickled data received from untrusted sources can be unsafe.
-
- See: https://docs.python.org/3/library/pickle.html for more.
-
- Parameters
- ----------
- path : str
- File path to HDF5 file.
- mode : {'a', 'w', 'r', 'r+'}, default 'a'
-
- ``'r'``
- Read-only; no data can be modified.
- ``'w'``
- Write; a new file is created (an existing file with the same
- name would be deleted).
- ``'a'``
- Append; an existing file is opened for reading and writing,
- and if the file does not exist it is created.
- ``'r+'``
- It is similar to ``'a'``, but the file must already exist.
- complevel : int, 0-9, default None
- Specifies a compression level for data.
- A value of 0 or None disables compression.
- complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
- Specifies the compression library to be used.
- As of v0.20.2 these additional compressors for Blosc are supported
- (default if no compressor specified: 'blosc:blosclz'):
- {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
- 'blosc:zlib', 'blosc:zstd'}.
- Specifying a compression library which is not available issues
- a ValueError.
- fletcher32 : bool, default False
- If applying compression use the fletcher32 checksum.
- **kwargs
- These parameters will be passed to the PyTables open_file method.
-
- Examples
- --------
- >>> bar = pd.DataFrame(np.random.randn(10, 4))
- >>> store = pd.HDFStore('test.h5')
- >>> store['foo'] = bar # write to HDF5
- >>> bar = store['foo'] # retrieve
- >>> store.close()
-
- **Create or load HDF5 file in-memory**
-
- When passing the `driver` option to the PyTables open_file method through
- **kwargs, the HDF5 file is loaded or created in-memory and will only be
- written when closed:
-
- >>> bar = pd.DataFrame(np.random.randn(10, 4))
- >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
- >>> store['foo'] = bar
- >>> store.close() # only now, data is written to disk
- """
-
- _handle: File | None
- _mode: str
-
- def __init__(
- self,
- path,
- mode: str = "a",
- complevel: int | None = None,
- complib=None,
- fletcher32: bool = False,
- **kwargs,
- ) -> None:
- if "format" in kwargs:
- raise ValueError("format is not a defined argument for HDFStore")
-
- tables = import_optional_dependency("tables")
-
- if complib is not None and complib not in tables.filters.all_complibs:
- raise ValueError(
- f"complib only supports {tables.filters.all_complibs} compression."
- )
-
- if complib is None and complevel is not None:
- complib = tables.filters.default_complib
-
- self._path = stringify_path(path)
- if mode is None:
- mode = "a"
- self._mode = mode
- self._handle = None
- self._complevel = complevel if complevel else 0
- self._complib = complib
- self._fletcher32 = fletcher32
- self._filters = None
- self.open(mode=mode, **kwargs)
-
- def __fspath__(self) -> str:
- return self._path
-
- @property
- def root(self):
- """return the root node"""
- self._check_if_open()
- assert self._handle is not None # for mypy
- return self._handle.root
-
- @property
- def filename(self) -> str:
- return self._path
-
- def __getitem__(self, key: str):
- return self.get(key)
-
- def __setitem__(self, key: str, value) -> None:
- self.put(key, value)
-
- def __delitem__(self, key: str) -> None:
- return self.remove(key)
-
- def __getattr__(self, name: str):
- """allow attribute access to get stores"""
- try:
- return self.get(name)
- except (KeyError, ClosedFileError):
- pass
- raise AttributeError(
- f"'{type(self).__name__}' object has no attribute '{name}'"
- )
-
- def __contains__(self, key: str) -> bool:
- """
- check for existence of this key
-        can match the exact pathname or the pathname w/o the leading '/'
- """
- node = self.get_node(key)
- if node is not None:
- name = node._v_pathname
- if key in (name, name[1:]):
- return True
- return False
-
- def __len__(self) -> int:
- return len(self.groups())
-
- def __repr__(self) -> str:
- pstr = pprint_thing(self._path)
- return f"{type(self)}\nFile path: {pstr}\n"
-
- def __enter__(self) -> HDFStore:
- return self
-
- def __exit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- traceback: TracebackType | None,
- ) -> None:
- self.close()
-
- def keys(self, include: str = "pandas") -> list[str]:
- """
- Return a list of keys corresponding to objects stored in HDFStore.
-
- Parameters
- ----------
-
- include : str, default 'pandas'
-            When include equals 'pandas' return pandas objects.
-            When include equals 'native' return native HDF5 Table objects.
-
- .. versionadded:: 1.1.0
-
- Returns
- -------
- list
- List of ABSOLUTE path-names (e.g. have the leading '/').
-
- Raises
- ------
-        raises ValueError if include has an illegal value
- """
- if include == "pandas":
- return [n._v_pathname for n in self.groups()]
-
- elif include == "native":
- assert self._handle is not None # mypy
- return [
- n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
- ]
- raise ValueError(
- f"`include` should be either 'pandas' or 'native' but is '{include}'"
- )
-
- def __iter__(self) -> Iterator[str]:
- return iter(self.keys())
-
- def items(self) -> Iterator[tuple[str, list]]:
- """
- iterate on key->group
- """
- for g in self.groups():
- yield g._v_pathname, g
-
- def open(self, mode: str = "a", **kwargs) -> None:
- """
- Open the file in the specified mode
-
- Parameters
- ----------
- mode : {'a', 'w', 'r', 'r+'}, default 'a'
- See HDFStore docstring or tables.open_file for info about modes
- **kwargs
- These parameters will be passed to the PyTables open_file method.
- """
- tables = _tables()
-
- if self._mode != mode:
- # if we are changing a write mode to read, ok
- if self._mode in ["a", "w"] and mode in ["r", "r+"]:
- pass
- elif mode in ["w"]:
- # this would truncate, raise here
- if self.is_open:
- raise PossibleDataLossError(
- f"Re-opening the file [{self._path}] with mode [{self._mode}] "
- "will delete the current file!"
- )
-
- self._mode = mode
-
- # close and reopen the handle
- if self.is_open:
- self.close()
-
- if self._complevel and self._complevel > 0:
- self._filters = _tables().Filters(
- self._complevel, self._complib, fletcher32=self._fletcher32
- )
-
- if _table_file_open_policy_is_strict and self.is_open:
- msg = (
- "Cannot open HDF5 file, which is already opened, "
- "even in read-only mode."
- )
- raise ValueError(msg)
-
- self._handle = tables.open_file(self._path, self._mode, **kwargs)
-
- def close(self) -> None:
- """
- Close the PyTables file handle
- """
- if self._handle is not None:
- self._handle.close()
- self._handle = None
-
- @property
- def is_open(self) -> bool:
- """
- return a boolean indicating whether the file is open
- """
- if self._handle is None:
- return False
- return bool(self._handle.isopen)
-
- def flush(self, fsync: bool = False) -> None:
- """
- Force all buffered modifications to be written to disk.
-
- Parameters
- ----------
- fsync : bool (default False)
- call ``os.fsync()`` on the file handle to force writing to disk.
-
- Notes
- -----
- Without ``fsync=True``, flushing may not guarantee that the OS writes
- to disk. With fsync, the operation will block until the OS claims the
- file has been written; however, other caching layers may still
- interfere.
- """
- if self._handle is not None:
- self._handle.flush()
- if fsync:
- with suppress(OSError):
- os.fsync(self._handle.fileno())
-
- def get(self, key: str):
- """
- Retrieve pandas object stored in file.
-
- Parameters
- ----------
- key : str
-
- Returns
- -------
- object
- Same type as object stored in file.
- """
- with patch_pickle():
- # GH#31167 Without this patch, pickle doesn't know how to unpickle
- # old DateOffset objects now that they are cdef classes.
- group = self.get_node(key)
- if group is None:
- raise KeyError(f"No object named {key} in the file")
- return self._read_group(group)
-
- def select(
- self,
- key: str,
- where=None,
- start=None,
- stop=None,
- columns=None,
- iterator: bool = False,
- chunksize=None,
- auto_close: bool = False,
- ):
- """
- Retrieve pandas object stored in file, optionally based on where criteria.
-
- .. warning::
-
- Pandas uses PyTables for reading and writing HDF5 files, which allows
- serializing object-dtype data with pickle when using the "fixed" format.
- Loading pickled data received from untrusted sources can be unsafe.
-
- See: https://docs.python.org/3/library/pickle.html for more.
-
- Parameters
- ----------
- key : str
- Object being retrieved from file.
- where : list or None
- List of Term (or convertible) objects, optional.
- start : int or None
- Row number to start selection.
- stop : int, default None
- Row number to stop selection.
- columns : list or None
- A list of columns that if not None, will limit the return columns.
-        iterator : bool, default False
-            Return an iterator.
-        chunksize : int or None
-            Number of rows to include in an iteration; return an iterator.
-        auto_close : bool, default False
-            Should automatically close the store when finished.
-
- Returns
- -------
- object
- Retrieved object from file.
- """
- group = self.get_node(key)
- if group is None:
- raise KeyError(f"No object named {key} in the file")
-
- # create the storer and axes
- where = _ensure_term(where, scope_level=1)
- s = self._create_storer(group)
- s.infer_axes()
-
- # function to call on iteration
- def func(_start, _stop, _where):
- return s.read(start=_start, stop=_stop, where=_where, columns=columns)
-
- # create the iterator
- it = TableIterator(
- self,
- s,
- func,
- where=where,
- nrows=s.nrows,
- start=start,
- stop=stop,
- iterator=iterator,
- chunksize=chunksize,
- auto_close=auto_close,
- )
-
- return it.get_result()
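As with ``read_csv``, selection can be streamed: passing ``iterator=True`` or ``chunksize`` makes ``TableIterator.get_result`` return an iterator of DataFrames instead of a single object. A minimal sketch, assuming a hypothetical store.h5 with a table-format frame under "df":

    import pandas as pd

    with pd.HDFStore("store.h5", mode="r") as store:
        # stream the selection in chunks of 50_000 rows
        n = 0
        for chunk in store.select("df", chunksize=50_000):
            n += len(chunk)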
-
- def select_as_coordinates(
- self,
- key: str,
- where=None,
- start: int | None = None,
- stop: int | None = None,
- ):
- """
- return the selection as an Index
-
- .. warning::
-
- Pandas uses PyTables for reading and writing HDF5 files, which allows
- serializing object-dtype data with pickle when using the "fixed" format.
- Loading pickled data received from untrusted sources can be unsafe.
-
- See: https://docs.python.org/3/library/pickle.html for more.
-
-
- Parameters
- ----------
- key : str
- where : list of Term (or convertible) objects, optional
- start : integer (defaults to None), row number to start selection
- stop : integer (defaults to None), row number to stop selection
- """
- where = _ensure_term(where, scope_level=1)
- tbl = self.get_storer(key)
- if not isinstance(tbl, Table):
- raise TypeError("can only read_coordinates with a table")
- return tbl.read_coordinates(where=where, start=start, stop=stop)
-
- def select_column(
- self,
- key: str,
- column: str,
- start: int | None = None,
- stop: int | None = None,
- ):
- """
- return a single column from the table. This is generally only useful to
- select an indexable
-
- .. warning::
-
- Pandas uses PyTables for reading and writing HDF5 files, which allows
- serializing object-dtype data with pickle when using the "fixed" format.
- Loading pickled data received from untrusted sources can be unsafe.
-
- See: https://docs.python.org/3/library/pickle.html for more.
-
- Parameters
- ----------
- key : str
- column : str
- The column of interest.
- start : int or None, default None
- stop : int or None, default None
-
- Raises
- ------
- raises KeyError if the column is not found (or key is not a valid
- store)
- raises ValueError if the column can not be extracted individually (it
- is part of a data block)
-
- """
- tbl = self.get_storer(key)
- if not isinstance(tbl, Table):
- raise TypeError("can only read_column with a table")
- return tbl.read_column(column=column, start=start, stop=stop)
-
- def select_as_multiple(
- self,
- keys,
- where=None,
- selector=None,
- columns=None,
- start=None,
- stop=None,
- iterator: bool = False,
- chunksize=None,
- auto_close: bool = False,
- ):
- """
- Retrieve pandas objects from multiple tables.
-
- .. warning::
-
- Pandas uses PyTables for reading and writing HDF5 files, which allows
- serializing object-dtype data with pickle when using the "fixed" format.
- Loading pickled data received from untrusted sources can be unsafe.
-
- See: https://docs.python.org/3/library/pickle.html for more.
-
- Parameters
- ----------
- keys : a list of the tables
- selector : the table to apply the where criteria (defaults to keys[0]
- if not supplied)
-        columns : the columns to return
- start : integer (defaults to None), row number to start selection
- stop : integer (defaults to None), row number to stop selection
- iterator : bool, return an iterator, default False
- chunksize : nrows to include in iteration, return an iterator
- auto_close : bool, default False
- Should automatically close the store when finished.
-
- Raises
- ------
- raises KeyError if keys or selector is not found or keys is empty
- raises TypeError if keys is not a list or tuple
- raises ValueError if the tables are not ALL THE SAME DIMENSIONS
- """
- # default to single select
- where = _ensure_term(where, scope_level=1)
- if isinstance(keys, (list, tuple)) and len(keys) == 1:
- keys = keys[0]
- if isinstance(keys, str):
- return self.select(
- key=keys,
- where=where,
- columns=columns,
- start=start,
- stop=stop,
- iterator=iterator,
- chunksize=chunksize,
- auto_close=auto_close,
- )
-
- if not isinstance(keys, (list, tuple)):
- raise TypeError("keys must be a list/tuple")
-
- if not len(keys):
- raise ValueError("keys must have a non-zero length")
-
- if selector is None:
- selector = keys[0]
-
- # collect the tables
- tbls = [self.get_storer(k) for k in keys]
- s = self.get_storer(selector)
-
- # validate rows
- nrows = None
- for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
- if t is None:
- raise KeyError(f"Invalid table [{k}]")
- if not t.is_table:
- raise TypeError(
- f"object [{t.pathname}] is not a table, and cannot be used in all "
- "select as multiple"
- )
-
- if nrows is None:
- nrows = t.nrows
- elif t.nrows != nrows:
- raise ValueError("all tables must have exactly the same nrows!")
-
- # The isinstance checks here are redundant with the check above,
- # but necessary for mypy; see GH#29757
- _tbls = [x for x in tbls if isinstance(x, Table)]
-
-        # axis is the concatenation axis
- axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
-
- def func(_start, _stop, _where):
- # retrieve the objs, _where is always passed as a set of
- # coordinates here
- objs = [
- t.read(where=_where, columns=columns, start=_start, stop=_stop)
- for t in tbls
- ]
-
- # concat and return
- return concat(objs, axis=axis, verify_integrity=False)._consolidate()
-
- # create the iterator
- it = TableIterator(
- self,
- s,
- func,
- where=where,
- nrows=nrows,
- start=start,
- stop=stop,
- iterator=iterator,
- chunksize=chunksize,
- auto_close=auto_close,
- )
-
- return it.get_result(coordinates=True)
-
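- # Example (illustrative; assumes "df1_mt" and "df2_mt" are table-format
- # nodes sharing the same rows, e.g. written via append_to_multiple):
- #
- #   result = store.select_as_multiple(
- #       ["df1_mt", "df2_mt"], where=["A>0"], selector="df1_mt"
- #   )
-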
- def put(
- self,
- key: str,
- value: DataFrame | Series,
- format=None,
- index: bool = True,
- append: bool = False,
- complib=None,
- complevel: int | None = None,
- min_itemsize: int | dict[str, int] | None = None,
- nan_rep=None,
- data_columns: Literal[True] | list[str] | None = None,
- encoding=None,
- errors: str = "strict",
- track_times: bool = True,
- dropna: bool = False,
- ) -> None:
- """
- Store object in HDFStore.
-
- Parameters
- ----------
- key : str
- value : {Series, DataFrame}
- format : 'fixed(f)|table(t)', default is 'fixed'
- Format to use when storing object in HDFStore. Value can be one of:
-
- ``'fixed'``
- Fixed format. Fast writing/reading. Not-appendable, nor searchable.
- ``'table'``
- Table format. Write as a PyTables Table structure which may perform
- worse but allow more flexible operations like searching / selecting
- subsets of the data.
- index : bool, default True
- Write DataFrame index as a column.
- append : bool, default False
- This will force Table format and append the input data to the existing table.
- data_columns : list of columns or True, default None
- List of columns to create as data columns, or True to use all columns.
- See `here
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
- encoding : str, default None
- Provide an encoding for strings.
- track_times : bool, default True
- Parameter is propagated to 'create_table' method of 'PyTables'.
- If set to False, it allows having identical h5 files (same hashes)
- regardless of creation time.
- dropna : bool, default False, optional
- Remove missing values.
-
- .. versionadded:: 1.1.0
- """
- if format is None:
- format = get_option("io.hdf.default_format") or "fixed"
- format = self._validate_format(format)
- self._write_to_group(
- key,
- value,
- format=format,
- index=index,
- append=append,
- complib=complib,
- complevel=complevel,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- data_columns=data_columns,
- encoding=encoding,
- errors=errors,
- track_times=track_times,
- dropna=dropna,
- )
-
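- # Example (illustrative; `df` is a hypothetical DataFrame): "fixed" is
- # fast but not queryable, "table" supports where-based selection.
- #
- #   store.put("fixed_df", df)                  # default fixed format
- #   store.put("table_df", df, format="table")  # queryable table format
-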
- def remove(self, key: str, where=None, start=None, stop=None) -> None:
- """
- Remove a pandas object partially by specifying the where condition.
-
- Parameters
- ----------
- key : str
- Node to remove or delete rows from.
- where : list of Term (or convertible) objects, optional
- start : int, optional
- Row number to start selection.
- stop : int, optional
- Row number to stop selection.
-
- Returns
- -------
- int or None
- Number of rows removed (or None if not a Table).
-
- Raises
- ------
- KeyError
- If key is not a valid store.
-
- """
- where = _ensure_term(where, scope_level=1)
- try:
- s = self.get_storer(key)
- except KeyError:
- # the key is not a valid store, re-raising KeyError
- raise
- except AssertionError:
- # surface any assertion errors for e.g. debugging
- raise
- except Exception as err:
- # In tests we get here with ClosedFileError, TypeError, and
- # _table_mod.NoSuchNodeError. TODO: Catch only these?
-
- if where is not None:
- raise ValueError(
- "trying to remove a node with a non-None where clause!"
- ) from err
-
- # we are actually trying to remove a node (with children)
- node = self.get_node(key)
- if node is not None:
- node._f_remove(recursive=True)
- return None
-
- # remove the node
- if com.all_none(where, start, stop):
- s.group._f_remove(recursive=True)
-
- # delete from the table
- else:
- if not s.is_table:
- raise ValueError(
- "can only remove with where on objects written as tables"
- )
- return s.delete(where=where, start=start, stop=stop)
-
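- # Example (illustrative node names): whole nodes can always be removed;
- # row-wise deletes with `where` only work on table-format nodes.
- #
- #   store.remove("fixed_df")                      # drop the entire node
- #   store.remove("table_df", where="index > 10")  # partial delete, table only
-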
- def append(
- self,
- key: str,
- value: DataFrame | Series,
- format=None,
- axes=None,
- index: bool | list[str] = True,
- append: bool = True,
- complib=None,
- complevel: int | None = None,
- columns=None,
- min_itemsize: int | dict[str, int] | None = None,
- nan_rep=None,
- chunksize=None,
- expectedrows=None,
- dropna: bool | None = None,
- data_columns: Literal[True] | list[str] | None = None,
- encoding=None,
- errors: str = "strict",
- ) -> None:
- """
- Append to Table in file.
-
- Node must already exist and be Table format.
-
- Parameters
- ----------
- key : str
- value : {Series, DataFrame}
- format : 'table' is the default
- Format to use when storing object in HDFStore. Value can be one of:
-
- ``'table'``
- Table format. Write as a PyTables Table structure which may perform
- worse but allow more flexible operations like searching / selecting
- subsets of the data.
- index : bool, default True
- Write DataFrame index as a column.
- append : bool, default True
- Append the input data to the existing.
- data_columns : list of columns, or True, default None
- List of columns to create as indexed data columns for on-disk
- queries, or True to use all columns. By default only the axes
- of the object are indexed. See `here
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
- min_itemsize : dict of {str: int}, optional
- Minimum string sizes for the given columns.
- nan_rep : str, optional
- String to use as the NaN representation for strings.
- chunksize : int, optional
- Number of rows to write per chunk.
- expectedrows : int, optional
- Expected TOTAL number of rows of this table.
- encoding : str, default None
- Provide an encoding for strings.
- dropna : bool, default False, optional
- Do not write an all-NaN row to the store; settable
- by the option 'io.hdf.dropna_table'.
-
- Notes
- -----
- Does *not* check if data being appended overlaps with existing
- data in the table, so be careful.
- """
- if columns is not None:
- raise TypeError(
- "columns is not a supported keyword in append, try data_columns"
- )
-
- if dropna is None:
- dropna = get_option("io.hdf.dropna_table")
- if format is None:
- format = get_option("io.hdf.default_format") or "table"
- format = self._validate_format(format)
- self._write_to_group(
- key,
- value,
- format=format,
- axes=axes,
- index=index,
- append=append,
- complib=complib,
- complevel=complevel,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- chunksize=chunksize,
- expectedrows=expectedrows,
- dropna=dropna,
- data_columns=data_columns,
- encoding=encoding,
- errors=errors,
- )
-
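- # Example (illustrative; "table_df" already exists in table format and
- # `more_rows` has the same columns):
- #
- #   store.append("table_df", more_rows, data_columns=["A", "B"])
-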
- def append_to_multiple(
- self,
- d: dict,
- value,
- selector,
- data_columns=None,
- axes=None,
- dropna: bool = False,
- **kwargs,
- ) -> None:
- """
- Append to multiple tables
-
- Parameters
- ----------
- d : dict of {table_name: table_columns}
- None is acceptable as the value of one node (that table will get
- all the remaining columns).
- value : a pandas object
- selector : str
- Designates the indexable table; all of its columns will be
- designated as data_columns, unless data_columns is passed, in
- which case these are used.
- data_columns : list of columns to create as data columns, or True to
- use all columns
- dropna : bool, default False
- If True, drop rows from all tables if any single row in each
- table has all NaN.
-
- Notes
- -----
- axes parameter is currently not accepted
-
- """
- if axes is not None:
- raise TypeError(
- "axes is currently not accepted as a parameter to append_to_multiple; "
- "you can create the tables independently instead"
- )
-
- if not isinstance(d, dict):
- raise ValueError(
- "append_to_multiple must have a dictionary specified as the "
- "way to split the value"
- )
-
- if selector not in d:
- raise ValueError(
- "append_to_multiple requires a selector that is in passed dict"
- )
-
- # figure out the splitting axis (the non_index_axis)
- axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
-
- # figure out how to split the value
- remain_key = None
- remain_values: list = []
- for k, v in d.items():
- if v is None:
- if remain_key is not None:
- raise ValueError(
- "append_to_multiple can only have one value in d that is None"
- )
- remain_key = k
- else:
- remain_values.extend(v)
- if remain_key is not None:
- ordered = value.axes[axis]
- ordd = ordered.difference(Index(remain_values))
- ordd = sorted(ordered.get_indexer(ordd))
- d[remain_key] = ordered.take(ordd)
-
- # data_columns
- if data_columns is None:
- data_columns = d[selector]
-
- # ensure rows are synchronized across the tables
- if dropna:
- idxs = (value[cols].dropna(how="all").index for cols in d.values())
- valid_index = next(idxs)
- for index in idxs:
- valid_index = valid_index.intersection(index)
- value = value.loc[valid_index]
-
- min_itemsize = kwargs.pop("min_itemsize", None)
-
- # append
- for k, v in d.items():
- dc = data_columns if k == selector else None
-
- # compute the val
- val = value.reindex(v, axis=axis)
-
- filtered = (
- {key: value for (key, value) in min_itemsize.items() if key in v}
- if min_itemsize is not None
- else None
- )
- self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
-
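- # Example (illustrative): split `df` across two tables, with the selector
- # table holding the queryable columns.
- #
- #   store.append_to_multiple(
- #       {"df1_mt": ["A", "B"], "df2_mt": None}, df, selector="df1_mt"
- #   )
-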
- def create_table_index(
- self,
- key: str,
- columns=None,
- optlevel: int | None = None,
- kind: str | None = None,
- ) -> None:
- """
- Create a pytables index on the table.
-
- Parameters
- ----------
- key : str
- columns : None, bool, or listlike[str]
- Indicate which columns to create an index on.
-
- * False : Do not create any indexes.
- * True : Create indexes on all columns.
- * None : Create indexes on all columns.
- * listlike : Create indexes on the given columns.
-
- optlevel : int or None, default None
- Optimization level, if None, pytables defaults to 6.
- kind : str or None, default None
- Kind of index, if None, pytables defaults to "medium".
-
- Raises
- ------
- TypeError: raises if the node is not a table
- """
- # version requirements
- _tables()
- s = self.get_storer(key)
- if s is None:
- return
-
- if not isinstance(s, Table):
- raise TypeError("cannot create table index on a Fixed format store")
- s.create_index(columns=columns, optlevel=optlevel, kind=kind)
-
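- # Example (illustrative; "table_df" must be a table-format node):
- #
- #   store.create_table_index("table_df", columns=["A"], optlevel=9, kind="full")
-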
- def groups(self) -> list:
- """
- Return a list of all the top-level nodes.
-
- Each node returned is not a pandas storage object.
-
- Returns
- -------
- list
- List of objects.
- """
- _tables()
- self._check_if_open()
- assert self._handle is not None # for mypy
- assert _table_mod is not None # for mypy
- return [
- g
- for g in self._handle.walk_groups()
- if (
- not isinstance(g, _table_mod.link.Link)
- and (
- getattr(g._v_attrs, "pandas_type", None)
- or getattr(g, "table", None)
- or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
- )
- )
- ]
-
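- # Example (illustrative):
- #
- #   store.groups()   # -> list of top-level PyTables Group nodes
- #   store.keys()     # related: keys of all stored pandas objects
-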
- def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
- """
- Walk the pytables group hierarchy for pandas objects.
-
- This generator will yield the group path, subgroups and pandas object
- names for each group.
-
- Any non-pandas PyTables objects that are not a group will be ignored.
-
- The `where` group itself is listed first (preorder), then each of its
- child groups (following an alphanumerical order) is also traversed,
- following the same procedure.
-
- Parameters
- ----------
- where : str, default "/"
- Group where to start walking.
-
- Yields
- ------
- path : str
- Full path to a group (without trailing '/').
- groups : list
- Names (strings) of the groups contained in `path`.
- leaves : list
- Names (strings) of the pandas objects contained in `path`.
- """
- _tables()
- self._check_if_open()
- assert self._handle is not None # for mypy
- assert _table_mod is not None # for mypy
-
- for g in self._handle.walk_groups(where):
- if getattr(g._v_attrs, "pandas_type", None) is not None:
- continue
-
- groups = []
- leaves = []
- for child in g._v_children.values():
- pandas_type = getattr(child._v_attrs, "pandas_type", None)
- if pandas_type is None:
- if isinstance(child, _table_mod.group.Group):
- groups.append(child._v_name)
- else:
- leaves.append(child._v_name)
-
- yield (g._v_pathname.rstrip("/"), groups, leaves)
-
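- # Example (illustrative) of walking the hierarchy and loading each leaf:
- #
- #   for path, groups, leaves in store.walk():
- #       for leaf in leaves:
- #           obj = store.get(f"{path}/{leaf}")
-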
- def get_node(self, key: str) -> Node | None:
- """return the node with the key or None if it does not exist"""
- self._check_if_open()
- if not key.startswith("/"):
- key = "/" + key
-
- assert self._handle is not None
- assert _table_mod is not None # for mypy
- try:
- node = self._handle.get_node(self.root, key)
- except _table_mod.exceptions.NoSuchNodeError:
- return None
-
- assert isinstance(node, _table_mod.Node), type(node)
- return node
-
- def get_storer(self, key: str) -> GenericFixed | Table:
- """return the storer object for a key, raise if not in the file"""
- group = self.get_node(key)
- if group is None:
- raise KeyError(f"No object named {key} in the file")
-
- s = self._create_storer(group)
- s.infer_axes()
- return s
-
- def copy(
- self,
- file,
- mode: str = "w",
- propindexes: bool = True,
- keys=None,
- complib=None,
- complevel: int | None = None,
- fletcher32: bool = False,
- overwrite: bool = True,
- ) -> HDFStore:
- """
- Copy the existing store to a new file.
-
- Parameters
- ----------
- propindexes : bool, default True
- Restore indexes in copied file.
- keys : list, optional
- List of keys to include in the copy (defaults to all).
- overwrite : bool, default True
- Whether to overwrite (remove and replace) existing nodes in the new store.
- mode, complib, complevel, fletcher32 : same as in HDFStore.__init__.
-
- Returns
- -------
- HDFStore
- The newly opened destination store.
- """
- new_store = HDFStore(
- file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
- )
- if keys is None:
- keys = list(self.keys())
- if not isinstance(keys, (tuple, list)):
- keys = [keys]
- for k in keys:
- s = self.get_storer(k)
- if s is not None:
- if k in new_store:
- if overwrite:
- new_store.remove(k)
-
- data = self.select(k)
- if isinstance(s, Table):
- index: bool | list[str] = False
- if propindexes:
- index = [a.name for a in s.axes if a.is_indexed]
- new_store.append(
- k,
- data,
- index=index,
- data_columns=getattr(s, "data_columns", None),
- encoding=s.encoding,
- )
- else:
- new_store.put(k, data, encoding=s.encoding)
-
- return new_store
-
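- # Example (illustrative destination path):
- #
- #   backup = store.copy("backup.h5", propindexes=True)
- #   backup.close()
-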
- def info(self) -> str:
- """
- Print detailed information on the store.
-
- Returns
- -------
- str
- """
- path = pprint_thing(self._path)
- output = f"{type(self)}\nFile path: {path}\n"
-
- if self.is_open:
- lkeys = sorted(self.keys())
- if len(lkeys):
- keys = []
- values = []
-
- for k in lkeys:
- try:
- s = self.get_storer(k)
- if s is not None:
- keys.append(pprint_thing(s.pathname or k))
- values.append(pprint_thing(s or "invalid_HDFStore node"))
- except AssertionError:
- # surface any assertion errors for e.g. debugging
- raise
- except Exception as detail:
- keys.append(k)
- dstr = pprint_thing(detail)
- values.append(f"[invalid_HDFStore node: {dstr}]")
-
- output += adjoin(12, keys, values)
- else:
- output += "Empty"
- else:
- output += "File is CLOSED"
-
- return output
-
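- # Example (illustrative): a quick human-readable summary of the store.
- #
- #   print(store.info())   # file path, open/closed state, per-key storers
-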
- # ------------------------------------------------------------------------
- # private methods
-
- def _check_if_open(self):
- if not self.is_open:
- raise ClosedFileError(f"{self._path} file is not open!")
-
- def _validate_format(self, format: str) -> str:
- """validate / deprecate formats"""
- # validate
- try:
- format = _FORMAT_MAP[format.lower()]
- except KeyError as err:
- raise TypeError(f"invalid HDFStore format specified [{format}]") from err
-
- return format
-
- def _create_storer(
- self,
- group,
- format=None,
- value: DataFrame | Series | None = None,
- encoding: str = "UTF-8",
- errors: str = "strict",
- ) -> GenericFixed | Table:
- """return a suitable class to operate"""
- cls: type[GenericFixed] | type[Table]
-
- if value is not None and not isinstance(value, (Series, DataFrame)):
- raise TypeError("value must be None, Series, or DataFrame")
-
- pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
- tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
-
- # infer the pt from the passed value
- if pt is None:
- if value is None:
- _tables()
- assert _table_mod is not None # for mypy
- if getattr(group, "table", None) or isinstance(
- group, _table_mod.table.Table
- ):
- pt = "frame_table"
- tt = "generic_table"
- else:
- raise TypeError(
- "cannot create a storer if the object is not existing "
- "nor a value are passed"
- )
- else:
- if isinstance(value, Series):
- pt = "series"
- else:
- pt = "frame"
-
- # we are actually a table
- if format == "table":
- pt += "_table"
-
- # a storer node
- if "table" not in pt:
- _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
- try:
- cls = _STORER_MAP[pt]
- except KeyError as err:
- raise TypeError(
- f"cannot properly create the storer for: [_STORER_MAP] [group->"
- f"{group},value->{type(value)},format->{format}"
- ) from err
- return cls(self, group, encoding=encoding, errors=errors)
-
- # existing node (and must be a table)
- if tt is None:
- # if we are a writer, determine the tt
- if value is not None:
- if pt == "series_table":
- index = getattr(value, "index", None)
- if index is not None:
- if index.nlevels == 1:
- tt = "appendable_series"
- elif index.nlevels > 1:
- tt = "appendable_multiseries"
- elif pt == "frame_table":
- index = getattr(value, "index", None)
- if index is not None:
- if index.nlevels == 1:
- tt = "appendable_frame"
- elif index.nlevels > 1:
- tt = "appendable_multiframe"
-
- _TABLE_MAP = {
- "generic_table": GenericTable,
- "appendable_series": AppendableSeriesTable,
- "appendable_multiseries": AppendableMultiSeriesTable,
- "appendable_frame": AppendableFrameTable,
- "appendable_multiframe": AppendableMultiFrameTable,
- "worm": WORMTable,
- }
- try:
- cls = _TABLE_MAP[tt]
- except KeyError as err:
- raise TypeError(
- f"cannot properly create the storer for: [_TABLE_MAP] [group->"
- f"{group},value->{type(value)},format->{format}"
- ) from err
-
- return cls(self, group, encoding=encoding, errors=errors)
-
- def _write_to_group(
- self,
- key: str,
- value: DataFrame | Series,
- format,
- axes=None,
- index: bool | list[str] = True,
- append: bool = False,
- complib=None,
- complevel: int | None = None,
- fletcher32=None,
- min_itemsize: int | dict[str, int] | None = None,
- chunksize=None,
- expectedrows=None,
- dropna: bool = False,
- nan_rep=None,
- data_columns=None,
- encoding=None,
- errors: str = "strict",
- track_times: bool = True,
- ) -> None:
- # we don't want to store a table node at all if our object is 0-len
- # as there are no dtypes
- if getattr(value, "empty", None) and (format == "table" or append):
- return
-
- group = self._identify_group(key, append)
-
- s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
- if append:
- # raise if we are trying to append to a Fixed format,
- # or a table that exists (and we are putting)
- if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
- raise ValueError("Can only append to Tables")
- if not s.is_exists:
- s.set_object_info()
- else:
- s.set_object_info()
-
- if not s.is_table and complib:
- raise ValueError("Compression not supported on Fixed format stores")
-
- # write the object
- s.write(
- obj=value,
- axes=axes,
- append=append,
- complib=complib,
- complevel=complevel,
- fletcher32=fletcher32,
- min_itemsize=min_itemsize,
- chunksize=chunksize,
- expectedrows=expectedrows,
- dropna=dropna,
- nan_rep=nan_rep,
- data_columns=data_columns,
- track_times=track_times,
- )
-
- if isinstance(s, Table) and index:
- s.create_index(columns=index)
-
- def _read_group(self, group: Node):
- s = self._create_storer(group)
- s.infer_axes()
- return s.read()
-
- def _identify_group(self, key: str, append: bool) -> Node:
- """Identify HDF5 group based on key, delete/create group if needed."""
- group = self.get_node(key)
-
- # we make this assertion for mypy; the get_node call will already
- # have raised if this is incorrect
- assert self._handle is not None
-
- # remove the node if we are not appending
- if group is not None and not append:
- self._handle.remove_node(group, recursive=True)
- group = None
-
- if group is None:
- group = self._create_nodes_and_group(key)
-
- return group
-
- def _create_nodes_and_group(self, key: str) -> Node:
- """Create nodes from key and return group name."""
- # assertion for mypy
- assert self._handle is not None
-
- paths = key.split("/")
- # recursively create the groups
- path = "/"
- for p in paths:
- if not len(p):
- continue
- new_path = path
- if not path.endswith("/"):
- new_path += "/"
- new_path += p
- group = self.get_node(new_path)
- if group is None:
- group = self._handle.create_group(path, p)
- path = new_path
- return group
-
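- # Note (illustrative): nested keys create intermediate groups as needed,
- # e.g. store.put("a/b/c", df) creates the groups "/a" and "/a/b" and
- # stores the object at "/a/b/c"; store.get_node("a/b") returns the group.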
-
-class TableIterator:
- """
- Define the iteration interface on a table
-
- Parameters
- ----------
- store : HDFStore
- s : the referred storer
- func : the function to execute the query
- where : the where clause of the query
- nrows : the number of rows to iterate over
- start : the passed start value (default is None)
- stop : the passed stop value (default is None)
- iterator : bool, default False
- Whether to use the default iterator.
- chunksize : the passed chunking value (default is 100000)
- auto_close : bool, default False
- Whether to automatically close the store at the end of iteration.
- """
-
- chunksize: int | None
- store: HDFStore
- s: GenericFixed | Table
-
- def __init__(
- self,
- store: HDFStore,
- s: GenericFixed | Table,
- func,
- where,
- nrows,
- start=None,
- stop=None,
- iterator: bool = False,
- chunksize: int | None = None,
- auto_close: bool = False,
- ) -> None:
- self.store = store
- self.s = s
- self.func = func
- self.where = where
-
- # set start/stop if they are not set if we are a table
- if self.s.is_table:
- if nrows is None:
- nrows = 0
- if start is None:
- start = 0
- if stop is None:
- stop = nrows
- stop = min(nrows, stop)
-
- self.nrows = nrows
- self.start = start
- self.stop = stop
-
- self.coordinates = None
- if iterator or chunksize is not None:
- if chunksize is None:
- chunksize = 100000
- self.chunksize = int(chunksize)
- else:
- self.chunksize = None
-
- self.auto_close = auto_close
-
- def __iter__(self) -> Iterator:
- # iterate
- current = self.start
- if self.coordinates is None:
- raise ValueError("Cannot iterate until get_result is called.")
- while current < self.stop:
- stop = min(current + self.chunksize, self.stop)
- value = self.func(None, None, self.coordinates[current:stop])
- current = stop
- if value is None or not len(value):
- continue
-
- yield value
-
- self.close()
-
- def close(self) -> None:
- if self.auto_close:
- self.store.close()
-
- def get_result(self, coordinates: bool = False):
- # return the actual iterator
- if self.chunksize is not None:
- if not isinstance(self.s, Table):
- raise TypeError("can only use an iterator or chunksize on a table")
-
- self.coordinates = self.s.read_coordinates(where=self.where)
-
- return self
-
- # if specified, read via coordinates (necessary for multiple selections)
- if coordinates:
- if not isinstance(self.s, Table):
- raise TypeError("can only read_coordinates on a table")
- where = self.s.read_coordinates(
- where=self.where, start=self.start, stop=self.stop
- )
- else:
- where = self.where
-
- # directly return the result
- results = self.func(self.start, self.stop, where)
- self.close()
- return results
-
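- # Example (illustrative) of the chunked path through this iterator, as
- # driven by HDFStore.select with a chunksize:
- #
- #   for chunk in store.select("table_df", chunksize=50_000):
- #       ...   # each chunk is a DataFrame of at most 50_000 rows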
-
-class IndexCol:
- """
- an index column description class
-
- Parameters
- ----------
- axis : the axis which I reference
- values : the ndarray-like converted values
- kind : a string description of this type
- typ : the pytables type
- pos : the position in the pytables table
-
- """
-
- is_an_indexable: bool = True
- is_data_indexable: bool = True
- _info_fields = ["freq", "tz", "index_name"]
-
- def __init__(
- self,
- name: str,
- values=None,
- kind=None,
- typ=None,
- cname: str | None = None,
- axis=None,
- pos=None,
- freq=None,
- tz=None,
- index_name=None,
- ordered=None,
- table=None,
- meta=None,
- metadata=None,
- ) -> None:
- if not isinstance(name, str):
- raise ValueError("`name` must be a str.")
-
- self.values = values
- self.kind = kind
- self.typ = typ
- self.name = name
- self.cname = cname or name
- self.axis = axis
- self.pos = pos
- self.freq = freq
- self.tz = tz
- self.index_name = index_name
- self.ordered = ordered
- self.table = table
- self.meta = meta
- self.metadata = metadata
-
- if pos is not None:
- self.set_pos(pos)
-
- # These are ensured as long as the passed arguments match the
- # constructor annotations.
- assert isinstance(self.name, str)
- assert isinstance(self.cname, str)
-
- @property
- def itemsize(self) -> int:
- # Assumes self.typ has already been initialized
- return self.typ.itemsize
-
- @property
- def kind_attr(self) -> str:
- return f"{self.name}_kind"
-
- def set_pos(self, pos: int) -> None:
- """set the position of this column in the Table"""
- self.pos = pos
- if pos is not None and self.typ is not None:
- self.typ._v_pos = pos
-
- def __repr__(self) -> str:
- temp = tuple(
- map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
- )
- return ",".join(
- [
- f"{key}->{value}"
- for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
- ]
- )
-
- def __eq__(self, other: Any) -> bool:
- """compare 2 col items"""
- return all(
- getattr(self, a, None) == getattr(other, a, None)
- for a in ["name", "cname", "axis", "pos"]
- )
-
- def __ne__(self, other) -> bool:
- return not self.__eq__(other)
-
- @property
- def is_indexed(self) -> bool:
- """return whether I am an indexed column"""
- if not hasattr(self.table, "cols"):
- # e.g. if infer hasn't been called yet, self.table will be None.
- return False
- return getattr(self.table.cols, self.cname).is_indexed
-
- def convert(
- self, values: np.ndarray, nan_rep, encoding: str, errors: str
- ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
- """
- Convert the data from this selection to the appropriate pandas type.
- """
- assert isinstance(values, np.ndarray), type(values)
-
- # values is a recarray
- if values.dtype.fields is not None:
- # Copy, otherwise values will be a view
- # preventing the original recarray from being freed
- values = values[self.cname].copy()
-
- val_kind = _ensure_decoded(self.kind)
- values = _maybe_convert(values, val_kind, encoding, errors)
-
- kwargs = {}
- kwargs["name"] = _ensure_decoded(self.index_name)
-
- if self.freq is not None:
- kwargs["freq"] = _ensure_decoded(self.freq)
-
- factory: type[Index] | type[DatetimeIndex] = Index
- if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
- factory = DatetimeIndex
- elif values.dtype == "i8" and "freq" in kwargs:
- # PeriodIndex data is stored as i8
- # error: Incompatible types in assignment (expression has type
- # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
- # "Union[Type[Index], Type[DatetimeIndex]]")
- factory = lambda x, **kwds: PeriodIndex( # type: ignore[assignment]
- ordinal=x, **kwds
- )
-
- # making an Index instance could throw a number of different errors
- try:
- new_pd_index = factory(values, **kwargs)
- except ValueError:
- # if the output freq is different from what we recorded,
- # it should be None (see also 'doc example part 2')
- if "freq" in kwargs:
- kwargs["freq"] = None
- new_pd_index = factory(values, **kwargs)
- final_pd_index = _set_tz(new_pd_index, self.tz)
- return final_pd_index, final_pd_index
-
- def take_data(self):
- """return the values"""
- return self.values
-
- @property
- def attrs(self):
- return self.table._v_attrs
-
- @property
- def description(self):
- return self.table.description
-
- @property
- def col(self):
- """return my current col description"""
- return getattr(self.description, self.cname, None)
-
- @property
- def cvalues(self):
- """return my cython values"""
- return self.values
-
- def __iter__(self) -> Iterator:
- return iter(self.values)
-
- def maybe_set_size(self, min_itemsize=None) -> None:
- """
- maybe set a string col itemsize:
- min_itemsize can be an integer or a dict with this column's name
- mapped to an integer size
- """
- if _ensure_decoded(self.kind) == "string":
- if isinstance(min_itemsize, dict):
- min_itemsize = min_itemsize.get(self.name)
-
- if min_itemsize is not None and self.typ.itemsize < min_itemsize:
- self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
-
- def validate_names(self) -> None:
- pass
-
- def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
- self.table = handler.table
- self.validate_col()
- self.validate_attr(append)
- self.validate_metadata(handler)
- self.write_metadata(handler)
- self.set_attr()
-
- def validate_col(self, itemsize=None):
- """validate this column: return the compared against itemsize"""
- # validate this column for string truncation (or reset to the max size)
- if _ensure_decoded(self.kind) == "string":
- c = self.col
- if c is not None:
- if itemsize is None:
- itemsize = self.itemsize
- if c.itemsize < itemsize:
- raise ValueError(
- f"Trying to store a string with len [{itemsize}] in "
- f"[{self.cname}] column but\nthis column has a limit of "
- f"[{c.itemsize}]!\nConsider using min_itemsize to "
- "preset the sizes on these columns"
- )
- return c.itemsize
-
- return None
-
- def validate_attr(self, append: bool) -> None:
- # check for backwards incompatibility
- if append:
- existing_kind = getattr(self.attrs, self.kind_attr, None)
- if existing_kind is not None and existing_kind != self.kind:
- raise TypeError(
- f"incompatible kind in col [{existing_kind} - {self.kind}]"
- )
-
- def update_info(self, info) -> None:
- """
- set/update the info for this indexable with the key/value;
- if there is a conflict, raise/warn as needed
- """
- for key in self._info_fields:
- value = getattr(self, key, None)
- idx = info.setdefault(self.name, {})
-
- existing_value = idx.get(key)
- if key in idx and value is not None and existing_value != value:
- # frequency/name just warn
- if key in ["freq", "index_name"]:
- ws = attribute_conflict_doc % (key, existing_value, value)
- warnings.warn(
- ws, AttributeConflictWarning, stacklevel=find_stack_level()
- )
-
- # reset
- idx[key] = None
- setattr(self, key, None)
-
- else:
- raise ValueError(
- f"invalid info for [{self.name}] for [{key}], "
- f"existing_value [{existing_value}] conflicts with "
- f"new value [{value}]"
- )
- else:
- if value is not None or existing_value is not None:
- idx[key] = value
-
- def set_info(self, info) -> None:
- """set my state from the passed info"""
- idx = info.get(self.name)
- if idx is not None:
- self.__dict__.update(idx)
-
- def set_attr(self) -> None:
- """set the kind for this column"""
- setattr(self.attrs, self.kind_attr, self.kind)
-
- def validate_metadata(self, handler: AppendableTable) -> None:
- """validate that kind=category does not change the categories"""
- if self.meta == "category":
- new_metadata = self.metadata
- cur_metadata = handler.read_metadata(self.cname)
- if (
- new_metadata is not None
- and cur_metadata is not None
- and not array_equivalent(new_metadata, cur_metadata)
- ):
- raise ValueError(
- "cannot append a categorical with "
- "different categories to the existing"
- )
-
- def write_metadata(self, handler: AppendableTable) -> None:
- """set the meta data"""
- if self.metadata is not None:
- handler.write_metadata(self.cname, self.metadata)
-
-
-class GenericIndexCol(IndexCol):
- """an index which is not represented in the data of the table"""
-
- @property
- def is_indexed(self) -> bool:
- return False
-
- def convert(
- self, values: np.ndarray, nan_rep, encoding: str, errors: str
- ) -> tuple[Index, Index]:
- """
- Convert the data from this selection to the appropriate pandas type.
-
- Parameters
- ----------
- values : np.ndarray
- nan_rep : str
- encoding : str
- errors : str
- """
- assert isinstance(values, np.ndarray), type(values)
-
- index = RangeIndex(len(values))
- return index, index
-
- def set_attr(self) -> None:
- pass
-
-
-class DataCol(IndexCol):
- """
- a data-holding column; by definition this is not indexable
-
- Parameters
- ----------
- data : the actual data
- cname : the column name in the table to hold the data (typically
- values)
- meta : a string description of the metadata
- metadata : the actual metadata
- """
-
- is_an_indexable = False
- is_data_indexable = False
- _info_fields = ["tz", "ordered"]
-
- def __init__(
- self,
- name: str,
- values=None,
- kind=None,
- typ=None,
- cname: str | None = None,
- pos=None,
- tz=None,
- ordered=None,
- table=None,
- meta=None,
- metadata=None,
- dtype: DtypeArg | None = None,
- data=None,
- ) -> None:
- super().__init__(
- name=name,
- values=values,
- kind=kind,
- typ=typ,
- pos=pos,
- cname=cname,
- tz=tz,
- ordered=ordered,
- table=table,
- meta=meta,
- metadata=metadata,
- )
- self.dtype = dtype
- self.data = data
-
- @property
- def dtype_attr(self) -> str:
- return f"{self.name}_dtype"
-
- @property
- def meta_attr(self) -> str:
- return f"{self.name}_meta"
-
- def __repr__(self) -> str:
- temp = tuple(
- map(
- pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
- )
- )
- return ",".join(
- [
- f"{key}->{value}"
- for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
- ]
- )
-
- def __eq__(self, other: Any) -> bool:
- """compare 2 col items"""
- return all(
- getattr(self, a, None) == getattr(other, a, None)
- for a in ["name", "cname", "dtype", "pos"]
- )
-
- def set_data(self, data: ArrayLike) -> None:
- assert data is not None
- assert self.dtype is None
-
- data, dtype_name = _get_data_and_dtype_name(data)
-
- self.data = data
- self.dtype = dtype_name
- self.kind = _dtype_to_kind(dtype_name)
-
- def take_data(self):
- """return the data"""
- return self.data
-
- @classmethod
- def _get_atom(cls, values: ArrayLike) -> Col:
- """
- Get an appropriately typed and shaped pytables.Col object for values.
- """
- dtype = values.dtype
- # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
- # attribute "itemsize"
- itemsize = dtype.itemsize # type: ignore[union-attr]
-
- shape = values.shape
- if values.ndim == 1:
- # EA, use block shape pretending it is 2D
- # TODO(EA2D): not necessary with 2D EAs
- shape = (1, values.size)
-
- if isinstance(values, Categorical):
- codes = values.codes
- atom = cls.get_atom_data(shape, kind=codes.dtype.name)
- elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
- atom = cls.get_atom_datetime64(shape)
- elif is_timedelta64_dtype(dtype):
- atom = cls.get_atom_timedelta64(shape)
- elif is_complex_dtype(dtype):
- atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
- elif is_string_dtype(dtype):
- atom = cls.get_atom_string(shape, itemsize)
- else:
- atom = cls.get_atom_data(shape, kind=dtype.name)
-
- return atom
-
- @classmethod
- def get_atom_string(cls, shape, itemsize):
- return _tables().StringCol(itemsize=itemsize, shape=shape[0])
-
- @classmethod
- def get_atom_coltype(cls, kind: str) -> type[Col]:
- """return the PyTables column class for this column"""
- if kind.startswith("uint"):
- k4 = kind[4:]
- col_name = f"UInt{k4}Col"
- elif kind.startswith("period"):
- # we store as integer
- col_name = "Int64Col"
- else:
- kcap = kind.capitalize()
- col_name = f"{kcap}Col"
-
- return getattr(_tables(), col_name)
-
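- # Example (illustrative) of the kind -> PyTables column-class mapping:
- #
- #   DataCol.get_atom_coltype("uint32")    # -> tables.UInt32Col
- #   DataCol.get_atom_coltype("float64")   # -> tables.Float64Col
- #   DataCol.get_atom_coltype("period[D]") # -> tables.Int64Col (stored as int)
-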
- @classmethod
- def get_atom_data(cls, shape, kind: str) -> Col:
- return cls.get_atom_coltype(kind=kind)(shape=shape[0])
-
- @classmethod
- def get_atom_datetime64(cls, shape):
- return _tables().Int64Col(shape=shape[0])
-
- @classmethod
- def get_atom_timedelta64(cls, shape):
- return _tables().Int64Col(shape=shape[0])
-
- @property
- def shape(self):
- return getattr(self.data, "shape", None)
-
- @property
- def cvalues(self):
- """return my cython values"""
- return self.data
-
- def validate_attr(self, append) -> None:
- """validate that we have the same order as the existing & same dtype"""
- if append:
- existing_fields = getattr(self.attrs, self.kind_attr, None)
- if existing_fields is not None and existing_fields != list(self.values):
- raise ValueError("appended items do not match existing items in table!")
-
- existing_dtype = getattr(self.attrs, self.dtype_attr, None)
- if existing_dtype is not None and existing_dtype != self.dtype:
- raise ValueError(
- "appended items dtype do not match existing items dtype in table!"
- )
-
- def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
- """
- Convert the data from this selection to the appropriate pandas type.
-
- Parameters
- ----------
- values : np.ndarray
- nan_rep :
- encoding : str
- errors : str
-
- Returns
- -------
- index : listlike to become an Index
- data : ndarraylike to become a column
- """
- assert isinstance(values, np.ndarray), type(values)
-
- # values is a recarray
- if values.dtype.fields is not None:
- values = values[self.cname]
-
- assert self.typ is not None
- if self.dtype is None:
- # Note: in tests we never have timedelta64 or datetime64,
- # so the _get_data_and_dtype_name may be unnecessary
- converted, dtype_name = _get_data_and_dtype_name(values)
- kind = _dtype_to_kind(dtype_name)
- else:
- converted = values
- dtype_name = self.dtype
- kind = self.kind
-
- assert isinstance(converted, np.ndarray) # for mypy
-
- # use the meta if needed
- meta = _ensure_decoded(self.meta)
- metadata = self.metadata
- ordered = self.ordered
- tz = self.tz
-
- assert dtype_name is not None
- # convert to the correct dtype
- dtype = _ensure_decoded(dtype_name)
-
- # reverse converts
- if dtype == "datetime64":
- # recreate with tz if indicated
- converted = _set_tz(converted, tz, coerce=True)
-
- elif dtype == "timedelta64":
- converted = np.asarray(converted, dtype="m8[ns]")
- elif dtype == "date":
- try:
- converted = np.asarray(
- [date.fromordinal(v) for v in converted], dtype=object
- )
- except ValueError:
- converted = np.asarray(
- [date.fromtimestamp(v) for v in converted], dtype=object
- )
-
- elif meta == "category":
- # we have a categorical
- categories = metadata
- codes = converted.ravel()
-
- # if we have stored a NaN in the categories
- # then strip it; in theory we could have BOTH
- # -1s in the codes and nulls :<
- if categories is None:
- # Handle case of NaN-only categorical columns in which case
- # the categories are an empty array; when this is stored,
- # pytables cannot write a zero-len array, so on readback
- # the categories would be None and `read_hdf()` would fail.
- categories = Index([], dtype=np.float64)
- else:
- mask = isna(categories)
- if mask.any():
- categories = categories[~mask]
- codes[codes != -1] -= mask.astype(int).cumsum()._values
-
- converted = Categorical.from_codes(
- codes, categories=categories, ordered=ordered
- )
-
- else:
- try:
- converted = converted.astype(dtype, copy=False)
- except TypeError:
- converted = converted.astype("O", copy=False)
-
- # convert nans / decode
- if _ensure_decoded(kind) == "string":
- converted = _unconvert_string_array(
- converted, nan_rep=nan_rep, encoding=encoding, errors=errors
- )
-
- return self.values, converted
-
- def set_attr(self) -> None:
- """set the data for this column"""
- setattr(self.attrs, self.kind_attr, self.values)
- setattr(self.attrs, self.meta_attr, self.meta)
- assert self.dtype is not None
- setattr(self.attrs, self.dtype_attr, self.dtype)
-
-
-class DataIndexableCol(DataCol):
- """represent a data column that can be indexed"""
-
- is_data_indexable = True
-
- def validate_names(self) -> None:
- if not is_object_dtype(Index(self.values)):
- # TODO: should the message here be more specifically non-str?
- raise ValueError("cannot have non-object label DataIndexableCol")
-
- @classmethod
- def get_atom_string(cls, shape, itemsize):
- return _tables().StringCol(itemsize=itemsize)
-
- @classmethod
- def get_atom_data(cls, shape, kind: str) -> Col:
- return cls.get_atom_coltype(kind=kind)()
-
- @classmethod
- def get_atom_datetime64(cls, shape):
- return _tables().Int64Col()
-
- @classmethod
- def get_atom_timedelta64(cls, shape):
- return _tables().Int64Col()
-
-
-class GenericDataIndexableCol(DataIndexableCol):
- """represent a generic pytables data column"""
-
-
-class Fixed:
- """
- represent an object in my store
- facilitate read/write of various types of objects
- this is an abstract base class
-
- Parameters
- ----------
- parent : HDFStore
- group : Node
- The group node where the table resides.
- """
-
- pandas_kind: str
- format_type: str = "fixed" # GH#30962 needed by dask
- obj_type: type[DataFrame | Series]
- ndim: int
- parent: HDFStore
- is_table: bool = False
-
- def __init__(
- self,
- parent: HDFStore,
- group: Node,
- encoding: str | None = "UTF-8",
- errors: str = "strict",
- ) -> None:
- assert isinstance(parent, HDFStore), type(parent)
- assert _table_mod is not None # needed for mypy
- assert isinstance(group, _table_mod.Node), type(group)
- self.parent = parent
- self.group = group
- self.encoding = _ensure_encoding(encoding)
- self.errors = errors
-
- @property
- def is_old_version(self) -> bool:
- return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
-
- @property
- def version(self) -> tuple[int, int, int]:
- """compute and set our version"""
- version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
- try:
- version = tuple(int(x) for x in version.split("."))
- if len(version) == 2:
- version = version + (0,)
- except AttributeError:
- version = (0, 0, 0)
- return version
-
- @property
- def pandas_type(self):
- return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))
-
- def __repr__(self) -> str:
- """return a pretty representation of myself"""
- self.infer_axes()
- s = self.shape
- if s is not None:
- if isinstance(s, (list, tuple)):
- jshape = ",".join([pprint_thing(x) for x in s])
- s = f"[{jshape}]"
- return f"{self.pandas_type:12.12} (shape->{s})"
- return self.pandas_type
-
- def set_object_info(self) -> None:
- """set my pandas type & version"""
- self.attrs.pandas_type = str(self.pandas_kind)
- self.attrs.pandas_version = str(_version)
-
- def copy(self) -> Fixed:
- new_self = copy.copy(self)
- return new_self
-
- @property
- def shape(self):
- return self.nrows
-
- @property
- def pathname(self):
- return self.group._v_pathname
-
- @property
- def _handle(self):
- return self.parent._handle
-
- @property
- def _filters(self):
- return self.parent._filters
-
- @property
- def _complevel(self) -> int:
- return self.parent._complevel
-
- @property
- def _fletcher32(self) -> bool:
- return self.parent._fletcher32
-
- @property
- def attrs(self):
- return self.group._v_attrs
-
- def set_attrs(self) -> None:
- """set our object attributes"""
-
- def get_attrs(self) -> None:
- """get our object attributes"""
-
- @property
- def storable(self):
- """return my storable"""
- return self.group
-
- @property
- def is_exists(self) -> bool:
- return False
-
- @property
- def nrows(self):
- return getattr(self.storable, "nrows", None)
-
- def validate(self, other) -> Literal[True] | None:
- """validate against an existing storable"""
- if other is None:
- return None
- return True
-
- def validate_version(self, where=None) -> None:
- """are we trying to operate on an old version?"""
-
- def infer_axes(self) -> bool:
- """
- infer the axes of my storer
- return a boolean indicating if we have a valid storer or not
- """
- s = self.storable
- if s is None:
- return False
- self.get_attrs()
- return True
-
- def read(
- self,
- where=None,
- columns=None,
- start: int | None = None,
- stop: int | None = None,
- ):
- raise NotImplementedError(
- "cannot read on an abstract storer: subclasses should implement"
- )
-
- def write(self, **kwargs):
- raise NotImplementedError(
- "cannot write on an abstract storer: subclasses should implement"
- )
-
- def delete(
- self, where=None, start: int | None = None, stop: int | None = None
- ) -> None:
- """
- support fully deleting the node in its entirety (only) - where
- specification must be None
- """
- if com.all_none(where, start, stop):
- self._handle.remove_node(self.group, recursive=True)
- return None
-
- raise TypeError("cannot delete on an abstract storer")
-
-
-class GenericFixed(Fixed):
- """a generified fixed version"""
-
- _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
- _reverse_index_map = {v: k for k, v in _index_type_map.items()}
- attributes: list[str] = []
-
- # indexer helpers
- def _class_to_alias(self, cls) -> str:
- return self._index_type_map.get(cls, "")
-
- def _alias_to_class(self, alias):
- if isinstance(alias, type): # pragma: no cover
- # compat: for a short period of time master stored types
- return alias
- return self._reverse_index_map.get(alias, Index)
-
- def _get_index_factory(self, attrs):
- index_class = self._alias_to_class(
- _ensure_decoded(getattr(attrs, "index_class", ""))
- )
-
- factory: Callable
-
- if index_class == DatetimeIndex:
-
- def f(values, freq=None, tz=None):
- # data are already in UTC, localize and convert if tz present
- dta = DatetimeArray._simple_new(values.values, freq=freq)
- result = DatetimeIndex._simple_new(dta, name=None)
- if tz is not None:
- result = result.tz_localize("UTC").tz_convert(tz)
- return result
-
- factory = f
- elif index_class == PeriodIndex:
-
- def f(values, freq=None, tz=None):
- parr = PeriodArray._simple_new(values, freq=freq)
- return PeriodIndex._simple_new(parr, name=None)
-
- factory = f
- else:
- factory = index_class
-
- kwargs = {}
- if "freq" in attrs:
- kwargs["freq"] = attrs["freq"]
- if index_class is Index:
- # DTI/PI would be gotten by _alias_to_class
- factory = TimedeltaIndex
-
- if "tz" in attrs:
- if isinstance(attrs["tz"], bytes):
- # created by python2
- kwargs["tz"] = attrs["tz"].decode("utf-8")
- else:
- # created by python3
- kwargs["tz"] = attrs["tz"]
- assert index_class is DatetimeIndex # just checking
-
- return factory, kwargs
-
- def validate_read(self, columns, where) -> None:
- """
- raise if any keywords are passed which are not-None
- """
- if columns is not None:
- raise TypeError(
- "cannot pass a column specification when reading "
- "a Fixed format store. this store must be selected in its entirety"
- )
- if where is not None:
- raise TypeError(
- "cannot pass a where specification when reading "
- "from a Fixed format store. this store must be selected in its entirety"
- )
-
- @property
- def is_exists(self) -> bool:
- return True
-
- def set_attrs(self) -> None:
- """set our object attributes"""
- self.attrs.encoding = self.encoding
- self.attrs.errors = self.errors
-
- def get_attrs(self) -> None:
- """retrieve our attributes"""
- self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
- self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
- for n in self.attributes:
- setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
-
- # error: Signature of "write" incompatible with supertype "Fixed"
- def write(self, obj, **kwargs) -> None: # type: ignore[override]
- self.set_attrs()
-
- def read_array(self, key: str, start: int | None = None, stop: int | None = None):
- """read an array for the specified node (off of group"""
- import tables
-
- node = getattr(self.group, key)
- attrs = node._v_attrs
-
- transposed = getattr(attrs, "transposed", False)
-
- if isinstance(node, tables.VLArray):
- ret = node[0][start:stop]
- else:
- dtype = _ensure_decoded(getattr(attrs, "value_type", None))
- shape = getattr(attrs, "shape", None)
-
- if shape is not None:
- # length 0 axis
- ret = np.empty(shape, dtype=dtype)
- else:
- ret = node[start:stop]
-
- if dtype == "datetime64":
- # reconstruct a timezone if indicated
- tz = getattr(attrs, "tz", None)
- ret = _set_tz(ret, tz, coerce=True)
-
- elif dtype == "timedelta64":
- ret = np.asarray(ret, dtype="m8[ns]")
-
- if transposed:
- return ret.T
- else:
- return ret
-
- def read_index(
- self, key: str, start: int | None = None, stop: int | None = None
- ) -> Index:
- variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))
-
- if variety == "multi":
- return self.read_multi_index(key, start=start, stop=stop)
- elif variety == "regular":
- node = getattr(self.group, key)
- index = self.read_index_node(node, start=start, stop=stop)
- return index
- else: # pragma: no cover
- raise TypeError(f"unrecognized index variety: {variety}")
-
- def write_index(self, key: str, index: Index) -> None:
- if isinstance(index, MultiIndex):
- setattr(self.attrs, f"{key}_variety", "multi")
- self.write_multi_index(key, index)
- else:
- setattr(self.attrs, f"{key}_variety", "regular")
- converted = _convert_index("index", index, self.encoding, self.errors)
-
- self.write_array(key, converted.values)
-
- node = getattr(self.group, key)
- node._v_attrs.kind = converted.kind
- node._v_attrs.name = index.name
-
- if isinstance(index, (DatetimeIndex, PeriodIndex)):
- node._v_attrs.index_class = self._class_to_alias(type(index))
-
- if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
- node._v_attrs.freq = index.freq
-
- if isinstance(index, DatetimeIndex) and index.tz is not None:
- node._v_attrs.tz = _get_tz(index.tz)
-
- def write_multi_index(self, key: str, index: MultiIndex) -> None:
- setattr(self.attrs, f"{key}_nlevels", index.nlevels)
-
- for i, (lev, level_codes, name) in enumerate(
- zip(index.levels, index.codes, index.names)
- ):
- # write the level
- if is_extension_array_dtype(lev):
- raise NotImplementedError(
- "Saving a MultiIndex with an extension dtype is not supported."
- )
- level_key = f"{key}_level{i}"
- conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
- self.write_array(level_key, conv_level.values)
- node = getattr(self.group, level_key)
- node._v_attrs.kind = conv_level.kind
- node._v_attrs.name = name
-
- # write the name
- setattr(node._v_attrs, f"{key}_name{name}", name)
-
- # write the labels
- label_key = f"{key}_label{i}"
- self.write_array(label_key, level_codes)
-
- def read_multi_index(
- self, key: str, start: int | None = None, stop: int | None = None
- ) -> MultiIndex:
- nlevels = getattr(self.attrs, f"{key}_nlevels")
-
- levels = []
- codes = []
- names: list[Hashable] = []
- for i in range(nlevels):
- level_key = f"{key}_level{i}"
- node = getattr(self.group, level_key)
- lev = self.read_index_node(node, start=start, stop=stop)
- levels.append(lev)
- names.append(lev.name)
-
- label_key = f"{key}_label{i}"
- level_codes = self.read_array(label_key, start=start, stop=stop)
- codes.append(level_codes)
-
- return MultiIndex(
- levels=levels, codes=codes, names=names, verify_integrity=True
- )
-
- def read_index_node(
- self, node: Node, start: int | None = None, stop: int | None = None
- ) -> Index:
- data = node[start:stop]
- # If the index was an empty array write_array_empty() will
- # have written a sentinel. Here we replace it with the original.
- if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
- data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
- kind = _ensure_decoded(node._v_attrs.kind)
- name = None
-
- if "name" in node._v_attrs:
- name = _ensure_str(node._v_attrs.name)
- name = _ensure_decoded(name)
-
- attrs = node._v_attrs
- factory, kwargs = self._get_index_factory(attrs)
-
- if kind in ("date", "object"):
- index = factory(
- _unconvert_index(
- data, kind, encoding=self.encoding, errors=self.errors
- ),
- dtype=object,
- **kwargs,
- )
- else:
- index = factory(
- _unconvert_index(
- data, kind, encoding=self.encoding, errors=self.errors
- ),
- **kwargs,
- )
-
- index.name = name
-
- return index
-
- def write_array_empty(self, key: str, value: ArrayLike) -> None:
- """write a 0-len array"""
- # ugly hack for length 0 axes
- arr = np.empty((1,) * value.ndim)
- self._handle.create_array(self.group, key, arr)
- node = getattr(self.group, key)
- node._v_attrs.value_type = str(value.dtype)
- node._v_attrs.shape = value.shape
-
- def write_array(
- self, key: str, obj: AnyArrayLike, items: Index | None = None
- ) -> None:
- # TODO: we only have a few tests that get here, the only EA
- # that gets passed is DatetimeArray, and we never have
- # both self._filters and EA
-
- value = extract_array(obj, extract_numpy=True)
-
- if key in self.group:
- self._handle.remove_node(self.group, key)
-
- # Transform needed to interface with pytables row/col notation
- empty_array = value.size == 0
- transposed = False
-
- if is_categorical_dtype(value.dtype):
- raise NotImplementedError(
- "Cannot store a category dtype in a HDF5 dataset that uses format="
- '"fixed". Use format="table".'
- )
- if not empty_array:
- if hasattr(value, "T"):
- # ExtensionArrays (1d) may not have transpose.
- value = value.T
- transposed = True
-
- atom = None
- if self._filters is not None:
- with suppress(ValueError):
- # get the atom for this datatype
- atom = _tables().Atom.from_dtype(value.dtype)
-
- if atom is not None:
- # We only get here if self._filters is non-None and
- # the Atom.from_dtype call succeeded
-
- # create an empty chunked array and fill it from value
- if not empty_array:
- ca = self._handle.create_carray(
- self.group, key, atom, value.shape, filters=self._filters
- )
- ca[:] = value
-
- else:
- self.write_array_empty(key, value)
-
- elif value.dtype.type == np.object_:
- # infer the type, warn if we have a non-string type here (for
- # performance)
- inferred_type = lib.infer_dtype(value, skipna=False)
- if empty_array:
- pass
- elif inferred_type == "string":
- pass
- else:
- ws = performance_doc % (inferred_type, key, items)
- warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
-
- vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
- vlarr.append(value)
-
- elif is_datetime64_dtype(value.dtype):
- self._handle.create_array(self.group, key, value.view("i8"))
- getattr(self.group, key)._v_attrs.value_type = "datetime64"
- elif is_datetime64tz_dtype(value.dtype):
- # store as UTC
- # with a zone
-
- # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
- # attribute "asi8"
- self._handle.create_array(
- self.group, key, value.asi8 # type: ignore[union-attr]
- )
-
- node = getattr(self.group, key)
- # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
- # attribute "tz"
- node._v_attrs.tz = _get_tz(value.tz) # type: ignore[union-attr]
- node._v_attrs.value_type = "datetime64"
- elif is_timedelta64_dtype(value.dtype):
- self._handle.create_array(self.group, key, value.view("i8"))
- getattr(self.group, key)._v_attrs.value_type = "timedelta64"
- elif empty_array:
- self.write_array_empty(key, value)
- else:
- self._handle.create_array(self.group, key, value)
-
- getattr(self.group, key)._v_attrs.transposed = transposed
-
-
-class SeriesFixed(GenericFixed):
- pandas_kind = "series"
- attributes = ["name"]
-
- name: Hashable
-
- @property
- def shape(self):
- try:
- return (len(self.group.values),)
- except (TypeError, AttributeError):
- return None
-
- def read(
- self,
- where=None,
- columns=None,
- start: int | None = None,
- stop: int | None = None,
- ) -> Series:
- self.validate_read(columns, where)
- index = self.read_index("index", start=start, stop=stop)
- values = self.read_array("values", start=start, stop=stop)
- return Series(values, index=index, name=self.name, copy=False)
-
- # error: Signature of "write" incompatible with supertype "Fixed"
- def write(self, obj, **kwargs) -> None: # type: ignore[override]
- super().write(obj, **kwargs)
- self.write_index("index", obj.index)
- self.write_array("values", obj)
- self.attrs.name = obj.name
-
-
-class BlockManagerFixed(GenericFixed):
- attributes = ["ndim", "nblocks"]
-
- nblocks: int
-
- @property
- def shape(self) -> Shape | None:
- try:
- ndim = self.ndim
-
- # items
- items = 0
- for i in range(self.nblocks):
- node = getattr(self.group, f"block{i}_items")
- shape = getattr(node, "shape", None)
- if shape is not None:
- items += shape[0]
-
- # data shape
- node = self.group.block0_values
- shape = getattr(node, "shape", None)
- if shape is not None:
- shape = list(shape[0 : (ndim - 1)])
- else:
- shape = []
-
- shape.append(items)
-
- return shape
- except AttributeError:
- return None
-
- def read(
- self,
- where=None,
- columns=None,
- start: int | None = None,
- stop: int | None = None,
- ) -> DataFrame:
- # start, stop applied to rows, so 0th axis only
- self.validate_read(columns, where)
- select_axis = self.obj_type()._get_block_manager_axis(0)
-
- axes = []
- for i in range(self.ndim):
- _start, _stop = (start, stop) if i == select_axis else (None, None)
- ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
- axes.append(ax)
-
- items = axes[0]
- dfs = []
-
- for i in range(self.nblocks):
- blk_items = self.read_index(f"block{i}_items")
- values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
-
- columns = items[items.get_indexer(blk_items)]
- df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
- dfs.append(df)
-
- if len(dfs) > 0:
- out = concat(dfs, axis=1, copy=True)
- out = out.reindex(columns=items, copy=False)
- return out
-
- return DataFrame(columns=axes[0], index=axes[1])
-
- # error: Signature of "write" incompatible with supertype "Fixed"
- def write(self, obj, **kwargs) -> None: # type: ignore[override]
- super().write(obj, **kwargs)
-
- # TODO(ArrayManager) HDFStore relies on accessing the blocks
- if isinstance(obj._mgr, ArrayManager):
- obj = obj._as_manager("block")
-
- data = obj._mgr
- if not data.is_consolidated():
- data = data.consolidate()
-
- self.attrs.ndim = data.ndim
- for i, ax in enumerate(data.axes):
- if i == 0 and (not ax.is_unique):
- raise ValueError("Columns index has to be unique for fixed format")
- self.write_index(f"axis{i}", ax)
-
- # Supporting mixed-type DataFrame objects...nontrivial
- self.attrs.nblocks = len(data.blocks)
- for i, blk in enumerate(data.blocks):
- # I have no idea why, but writing values before items fixed #2299
- blk_items = data.items.take(blk.mgr_locs)
- self.write_array(f"block{i}_values", blk.values, items=blk_items)
- self.write_index(f"block{i}_items", blk_items)
-
-
-class FrameFixed(BlockManagerFixed):
- pandas_kind = "frame"
- obj_type = DataFrame
-
-
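# The Fixed storers above back HDFStore's default format="fixed" path. A minimal
# round-trip sketch using only the public pandas API (the file name and key are
# illustrative; PyTables must be installed):
#
#     import pandas as pd
#
#     df = pd.DataFrame({"a": [1, 2, 3]}, index=pd.date_range("2020", periods=3))
#     df.to_hdf("store.h5", key="df", format="fixed")   # dispatched to FrameFixed
#     assert pd.read_hdf("store.h5", "df").equals(df)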
-class Table(Fixed):
- """
- represent a table:
- facilitate read/write of various types of tables
-
- Attrs in Table Node
- -------------------
-    These are attributes that are stored in the main table node; they are
-    necessary to recreate these tables when read back in.
-
- index_axes : a list of tuples of the (original indexing axis and
- index column)
- non_index_axes: a list of tuples of the (original index axis and
- columns on a non-indexing axis)
- values_axes : a list of the columns which comprise the data of this
- table
- data_columns : a list of the columns that we are allowing indexing
- (these become single columns in values_axes)
- nan_rep : the string to use for nan representations for string
- objects
- levels : the names of levels
- metadata : the names of the metadata columns
- """
-
- pandas_kind = "wide_table"
- format_type: str = "table" # GH#30962 needed by dask
- table_type: str
- levels: int | list[Hashable] = 1
- is_table = True
-
- metadata: list
-
- def __init__(
- self,
- parent: HDFStore,
- group: Node,
- encoding: str | None = None,
- errors: str = "strict",
- index_axes: list[IndexCol] | None = None,
- non_index_axes: list[tuple[AxisInt, Any]] | None = None,
- values_axes: list[DataCol] | None = None,
- data_columns: list | None = None,
- info: dict | None = None,
- nan_rep=None,
- ) -> None:
- super().__init__(parent, group, encoding=encoding, errors=errors)
- self.index_axes = index_axes or []
- self.non_index_axes = non_index_axes or []
- self.values_axes = values_axes or []
- self.data_columns = data_columns or []
- self.info = info or {}
- self.nan_rep = nan_rep
-
- @property
- def table_type_short(self) -> str:
- return self.table_type.split("_")[0]
-
- def __repr__(self) -> str:
- """return a pretty representation of myself"""
- self.infer_axes()
- jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
- dc = f",dc->[{jdc}]"
-
- ver = ""
- if self.is_old_version:
- jver = ".".join([str(x) for x in self.version])
- ver = f"[{jver}]"
-
- jindex_axes = ",".join([a.name for a in self.index_axes])
- return (
- f"{self.pandas_type:12.12}{ver} "
- f"(typ->{self.table_type_short},nrows->{self.nrows},"
- f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
- )
-
- def __getitem__(self, c: str):
- """return the axis for c"""
- for a in self.axes:
- if c == a.name:
- return a
- return None
-
- def validate(self, other) -> None:
- """validate against an existing table"""
- if other is None:
- return
-
- if other.table_type != self.table_type:
- raise TypeError(
- "incompatible table_type with existing "
- f"[{other.table_type} - {self.table_type}]"
- )
-
- for c in ["index_axes", "non_index_axes", "values_axes"]:
- sv = getattr(self, c, None)
- ov = getattr(other, c, None)
- if sv != ov:
- # show the error for the specific axes
- # Argument 1 to "enumerate" has incompatible type
- # "Optional[Any]"; expected "Iterable[Any]" [arg-type]
- for i, sax in enumerate(sv): # type: ignore[arg-type]
- # Value of type "Optional[Any]" is not indexable [index]
- oax = ov[i] # type: ignore[index]
- if sax != oax:
- raise ValueError(
- f"invalid combination of [{c}] on appending data "
- f"[{sax}] vs current table [{oax}]"
- )
-
- # should never get here
- raise Exception(
- f"invalid combination of [{c}] on appending data [{sv}] vs "
- f"current table [{ov}]"
- )
-
- @property
- def is_multi_index(self) -> bool:
- """the levels attribute is 1 or a list in the case of a multi-index"""
- return isinstance(self.levels, list)
-
- def validate_multiindex(
- self, obj: DataFrame | Series
- ) -> tuple[DataFrame, list[Hashable]]:
- """
- validate that we can store the multi-index; reset and return the
- new object
- """
- levels = com.fill_missing_names(obj.index.names)
- try:
- reset_obj = obj.reset_index()
- except ValueError as err:
- raise ValueError(
- "duplicate names/columns in the multi-index when storing as a table"
- ) from err
- assert isinstance(reset_obj, DataFrame) # for mypy
- return reset_obj, levels
-
- @property
- def nrows_expected(self) -> int:
- """based on our axes, compute the expected nrows"""
- return np.prod([i.cvalues.shape[0] for i in self.index_axes])
-
- @property
- def is_exists(self) -> bool:
- """has this table been created"""
- return "table" in self.group
-
- @property
- def storable(self):
- return getattr(self.group, "table", None)
-
- @property
- def table(self):
- """return the table group (this is my storable)"""
- return self.storable
-
- @property
- def dtype(self):
- return self.table.dtype
-
- @property
- def description(self):
- return self.table.description
-
- @property
- def axes(self):
- return itertools.chain(self.index_axes, self.values_axes)
-
- @property
- def ncols(self) -> int:
- """the number of total columns in the values axes"""
- return sum(len(a.values) for a in self.values_axes)
-
- @property
- def is_transposed(self) -> bool:
- return False
-
- @property
- def data_orientation(self) -> tuple[int, ...]:
-        """return a tuple of my permuted axes, non_indexable at the front"""
- return tuple(
- itertools.chain(
- [int(a[0]) for a in self.non_index_axes],
- [int(a.axis) for a in self.index_axes],
- )
- )
-
- def queryables(self) -> dict[str, Any]:
- """return a dict of the kinds allowable columns for this object"""
- # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
- axis_names = {0: "index", 1: "columns"}
-
- # compute the values_axes queryables
- d1 = [(a.cname, a) for a in self.index_axes]
- d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
- d3 = [
- (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
- ]
-
- return dict(d1 + d2 + d3)
-
- def index_cols(self):
- """return a list of my index cols"""
- # Note: each `i.cname` below is assured to be a str.
- return [(i.axis, i.cname) for i in self.index_axes]
-
- def values_cols(self) -> list[str]:
- """return a list of my values cols"""
- return [i.cname for i in self.values_axes]
-
- def _get_metadata_path(self, key: str) -> str:
- """return the metadata pathname for this key"""
- group = self.group._v_pathname
- return f"{group}/meta/{key}/meta"
-
- def write_metadata(self, key: str, values: np.ndarray) -> None:
- """
- Write out a metadata array to the key as a fixed-format Series.
-
- Parameters
- ----------
- key : str
- values : ndarray
- """
- self.parent.put(
- self._get_metadata_path(key),
- Series(values, copy=False),
- format="table",
- encoding=self.encoding,
- errors=self.errors,
- nan_rep=self.nan_rep,
- )
-
- def read_metadata(self, key: str):
- """return the meta data array for this key"""
- if getattr(getattr(self.group, "meta", None), key, None) is not None:
- return self.parent.select(self._get_metadata_path(key))
- return None
-
- def set_attrs(self) -> None:
- """set our table type & indexables"""
- self.attrs.table_type = str(self.table_type)
- self.attrs.index_cols = self.index_cols()
- self.attrs.values_cols = self.values_cols()
- self.attrs.non_index_axes = self.non_index_axes
- self.attrs.data_columns = self.data_columns
- self.attrs.nan_rep = self.nan_rep
- self.attrs.encoding = self.encoding
- self.attrs.errors = self.errors
- self.attrs.levels = self.levels
- self.attrs.info = self.info
-
- def get_attrs(self) -> None:
- """retrieve our attributes"""
- self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
- self.data_columns = getattr(self.attrs, "data_columns", None) or []
- self.info = getattr(self.attrs, "info", None) or {}
- self.nan_rep = getattr(self.attrs, "nan_rep", None)
- self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
- self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
- self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
- self.index_axes = [a for a in self.indexables if a.is_an_indexable]
- self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
-
- def validate_version(self, where=None) -> None:
- """are we trying to operate on an old version?"""
- if where is not None:
- if self.is_old_version:
- ws = incompatibility_doc % ".".join([str(x) for x in self.version])
- warnings.warn(
- ws,
- IncompatibilityWarning,
- stacklevel=find_stack_level(),
- )
-
- def validate_min_itemsize(self, min_itemsize) -> None:
- """
-        validate that min_itemsize doesn't contain items that are not in the
-        axes; this needs data_columns to be defined
- """
- if min_itemsize is None:
- return
- if not isinstance(min_itemsize, dict):
- return
-
- q = self.queryables()
- for k in min_itemsize:
- # ok, apply generally
- if k == "values":
- continue
- if k not in q:
- raise ValueError(
- f"min_itemsize has the key [{k}] which is not an axis or "
- "data_column"
- )
-
- @cache_readonly
- def indexables(self):
- """create/cache the indexables if they don't exist"""
- _indexables = []
-
- desc = self.description
- table_attrs = self.table.attrs
-
- # Note: each of the `name` kwargs below are str, ensured
- # by the definition in index_cols.
- # index columns
- for i, (axis, name) in enumerate(self.attrs.index_cols):
- atom = getattr(desc, name)
- md = self.read_metadata(name)
- meta = "category" if md is not None else None
-
- kind_attr = f"{name}_kind"
- kind = getattr(table_attrs, kind_attr, None)
-
- index_col = IndexCol(
- name=name,
- axis=axis,
- pos=i,
- kind=kind,
- typ=atom,
- table=self.table,
- meta=meta,
- metadata=md,
- )
- _indexables.append(index_col)
-
- # values columns
- dc = set(self.data_columns)
- base_pos = len(_indexables)
-
- def f(i, c):
- assert isinstance(c, str)
- klass = DataCol
- if c in dc:
- klass = DataIndexableCol
-
- atom = getattr(desc, c)
- adj_name = _maybe_adjust_name(c, self.version)
-
- # TODO: why kind_attr here?
- values = getattr(table_attrs, f"{adj_name}_kind", None)
- dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
- # Argument 1 to "_dtype_to_kind" has incompatible type
- # "Optional[Any]"; expected "str" [arg-type]
- kind = _dtype_to_kind(dtype) # type: ignore[arg-type]
-
- md = self.read_metadata(c)
-            # TODO: figure out why these two versions of `meta` don't always match.
- # meta = "category" if md is not None else None
- meta = getattr(table_attrs, f"{adj_name}_meta", None)
-
- obj = klass(
- name=adj_name,
- cname=c,
- values=values,
- kind=kind,
- pos=base_pos + i,
- typ=atom,
- table=self.table,
- meta=meta,
- metadata=md,
- dtype=dtype,
- )
- return obj
-
- # Note: the definition of `values_cols` ensures that each
- # `c` below is a str.
- _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
-
- return _indexables
-
- def create_index(
- self, columns=None, optlevel=None, kind: str | None = None
- ) -> None:
- """
- Create a pytables index on the specified columns.
-
- Parameters
- ----------
- columns : None, bool, or listlike[str]
- Indicate which columns to create an index on.
-
- * False : Do not create any indexes.
- * True : Create indexes on all columns.
- * None : Create indexes on all columns.
- * listlike : Create indexes on the given columns.
-
- optlevel : int or None, default None
- Optimization level, if None, pytables defaults to 6.
- kind : str or None, default None
- Kind of index, if None, pytables defaults to "medium".
-
- Raises
- ------
- TypeError if trying to create an index on a complex-type column.
-
- Notes
- -----
- Cannot index Time64Col or ComplexCol.
- Pytables must be >= 3.0.
- """
- if not self.infer_axes():
- return
- if columns is False:
- return
-
- # index all indexables and data_columns
- if columns is None or columns is True:
- columns = [a.cname for a in self.axes if a.is_data_indexable]
- if not isinstance(columns, (tuple, list)):
- columns = [columns]
-
- kw = {}
- if optlevel is not None:
- kw["optlevel"] = optlevel
- if kind is not None:
- kw["kind"] = kind
-
- table = self.table
- for c in columns:
- v = getattr(table.cols, c, None)
- if v is not None:
- # remove the index if the kind/optlevel have changed
- if v.is_indexed:
- index = v.index
- cur_optlevel = index.optlevel
- cur_kind = index.kind
-
- if kind is not None and cur_kind != kind:
- v.remove_index()
- else:
- kw["kind"] = cur_kind
-
- if optlevel is not None and cur_optlevel != optlevel:
- v.remove_index()
- else:
- kw["optlevel"] = cur_optlevel
-
- # create the index
- if not v.is_indexed:
- if v.type.startswith("complex"):
- raise TypeError(
- "Columns containing complex values can be stored but "
- "cannot be indexed when using table format. Either use "
- "fixed format, set index=False, or do not include "
- "the columns containing complex values to "
- "data_columns when initializing the table."
- )
- v.create_index(**kw)
- elif c in self.non_index_axes[0][1]:
- # GH 28156
- raise AttributeError(
- f"column {c} is not a data_column.\n"
- f"In order to read column {c} you must reload the dataframe \n"
- f"into HDFStore and include {c} with the data_columns argument."
- )
-
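# Table.create_index above is what HDFStore.create_table_index calls into. A
# hedged sketch of the public call path (file, key, and column names are
# illustrative; requires a table-format key with data_columns):
#
#     import pandas as pd
#
#     df = pd.DataFrame({"x": range(5), "y": list("abcde")})
#     with pd.HDFStore("store.h5") as store:
#         store.put("df", df, format="table", data_columns=["y"], index=False)
#         store.create_table_index("df", columns=["y"], optlevel=9, kind="full")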
- def _read_axes(
- self, where, start: int | None = None, stop: int | None = None
- ) -> list[tuple[ArrayLike, ArrayLike]]:
- """
- Create the axes sniffed from the table.
-
- Parameters
- ----------
- where : ???
- start : int or None, default None
- stop : int or None, default None
-
- Returns
- -------
- List[Tuple[index_values, column_values]]
- """
- # create the selection
- selection = Selection(self, where=where, start=start, stop=stop)
- values = selection.select()
-
- results = []
- # convert the data
- for a in self.axes:
- a.set_info(self.info)
- res = a.convert(
- values,
- nan_rep=self.nan_rep,
- encoding=self.encoding,
- errors=self.errors,
- )
- results.append(res)
-
- return results
-
- @classmethod
- def get_object(cls, obj, transposed: bool):
- """return the data for this obj"""
- return obj
-
- def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
- """
-        take the input data_columns and min_itemsize and create a data
-        columns spec
- """
- if not len(non_index_axes):
- return []
-
- axis, axis_labels = non_index_axes[0]
- info = self.info.get(axis, {})
- if info.get("type") == "MultiIndex" and data_columns:
- raise ValueError(
- f"cannot use a multi-index on axis [{axis}] with "
- f"data_columns {data_columns}"
- )
-
- # evaluate the passed data_columns, True == use all columns
- # take only valid axis labels
- if data_columns is True:
- data_columns = list(axis_labels)
- elif data_columns is None:
- data_columns = []
-
- # if min_itemsize is a dict, add the keys (exclude 'values')
- if isinstance(min_itemsize, dict):
- existing_data_columns = set(data_columns)
- data_columns = list(data_columns) # ensure we do not modify
- data_columns.extend(
- [
- k
- for k in min_itemsize.keys()
- if k != "values" and k not in existing_data_columns
- ]
- )
-
- # return valid columns in the order of our axis
- return [c for c in data_columns if c in axis_labels]
-
- def _create_axes(
- self,
- axes,
- obj: DataFrame,
- validate: bool = True,
- nan_rep=None,
- data_columns=None,
- min_itemsize=None,
- ):
- """
- Create and return the axes.
-
- Parameters
- ----------
- axes: list or None
- The names or numbers of the axes to create.
- obj : DataFrame
- The object to create axes on.
- validate: bool, default True
- Whether to validate the obj against an existing object already written.
- nan_rep :
- A value to use for string column nan_rep.
- data_columns : List[str], True, or None, default None
- Specify the columns that we want to create to allow indexing on.
-
- * True : Use all available columns.
- * None : Use no columns.
- * List[str] : Use the specified columns.
-
- min_itemsize: Dict[str, int] or None, default None
- The min itemsize for a column in bytes.
- """
- if not isinstance(obj, DataFrame):
- group = self.group._v_name
- raise TypeError(
- f"cannot properly create the storer for: [group->{group},"
- f"value->{type(obj)}]"
- )
-
- # set the default axes if needed
- if axes is None:
- axes = [0]
-
- # map axes to numbers
- axes = [obj._get_axis_number(a) for a in axes]
-
- # do we have an existing table (if so, use its axes & data_columns)
- if self.infer_axes():
- table_exists = True
- axes = [a.axis for a in self.index_axes]
- data_columns = list(self.data_columns)
- nan_rep = self.nan_rep
- # TODO: do we always have validate=True here?
- else:
- table_exists = False
-
- new_info = self.info
-
- assert self.ndim == 2 # with next check, we must have len(axes) == 1
-        # currently only support ndim-1 axes
- if len(axes) != self.ndim - 1:
- raise ValueError(
- "currently only support ndim-1 indexers in an AppendableTable"
- )
-
- # create according to the new data
- new_non_index_axes: list = []
-
- # nan_representation
- if nan_rep is None:
- nan_rep = "nan"
-
- # We construct the non-index-axis first, since that alters new_info
- idx = [x for x in [0, 1] if x not in axes][0]
-
- a = obj.axes[idx]
- # we might be able to change the axes on the appending data if necessary
- append_axis = list(a)
- if table_exists:
- indexer = len(new_non_index_axes) # i.e. 0
- exist_axis = self.non_index_axes[indexer][1]
- if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
- # ahah! -> reindex
- if array_equivalent(
- np.array(sorted(append_axis)), np.array(sorted(exist_axis))
- ):
- append_axis = exist_axis
-
- # the non_index_axes info
- info = new_info.setdefault(idx, {})
- info["names"] = list(a.names)
- info["type"] = type(a).__name__
-
- new_non_index_axes.append((idx, append_axis))
-
- # Now we can construct our new index axis
- idx = axes[0]
- a = obj.axes[idx]
- axis_name = obj._get_axis_name(idx)
- new_index = _convert_index(axis_name, a, self.encoding, self.errors)
- new_index.axis = idx
-
- # Because we are always 2D, there is only one new_index, so
- # we know it will have pos=0
- new_index.set_pos(0)
- new_index.update_info(new_info)
- new_index.maybe_set_size(min_itemsize) # check for column conflicts
-
- new_index_axes = [new_index]
- j = len(new_index_axes) # i.e. 1
- assert j == 1
-
- # reindex by our non_index_axes & compute data_columns
- assert len(new_non_index_axes) == 1
- for a in new_non_index_axes:
- obj = _reindex_axis(obj, a[0], a[1])
-
- transposed = new_index.axis == 1
-
- # figure out data_columns and get out blocks
- data_columns = self.validate_data_columns(
- data_columns, min_itemsize, new_non_index_axes
- )
-
- frame = self.get_object(obj, transposed)._consolidate()
-
- blocks, blk_items = self._get_blocks_and_items(
- frame, table_exists, new_non_index_axes, self.values_axes, data_columns
- )
-
- # add my values
- vaxes = []
- for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
- # shape of the data column are the indexable axes
- klass = DataCol
- name = None
-
- # we have a data_column
- if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
- klass = DataIndexableCol
- name = b_items[0]
- if not (name is None or isinstance(name, str)):
- # TODO: should the message here be more specifically non-str?
- raise ValueError("cannot have non-object label DataIndexableCol")
-
- # make sure that we match up the existing columns
- # if we have an existing table
- existing_col: DataCol | None
-
- if table_exists and validate:
- try:
- existing_col = self.values_axes[i]
- except (IndexError, KeyError) as err:
- raise ValueError(
- f"Incompatible appended table [{blocks}]"
- f"with existing table [{self.values_axes}]"
- ) from err
- else:
- existing_col = None
-
- new_name = name or f"values_block_{i}"
- data_converted = _maybe_convert_for_string_atom(
- new_name,
- blk.values,
- existing_col=existing_col,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- encoding=self.encoding,
- errors=self.errors,
- columns=b_items,
- )
- adj_name = _maybe_adjust_name(new_name, self.version)
-
- typ = klass._get_atom(data_converted)
- kind = _dtype_to_kind(data_converted.dtype.name)
- tz = None
- if getattr(data_converted, "tz", None) is not None:
- tz = _get_tz(data_converted.tz)
-
- meta = metadata = ordered = None
- if is_categorical_dtype(data_converted.dtype):
- ordered = data_converted.ordered
- meta = "category"
- metadata = np.array(data_converted.categories, copy=False).ravel()
-
- data, dtype_name = _get_data_and_dtype_name(data_converted)
-
- col = klass(
- name=adj_name,
- cname=new_name,
- values=list(b_items),
- typ=typ,
- pos=j,
- kind=kind,
- tz=tz,
- ordered=ordered,
- meta=meta,
- metadata=metadata,
- dtype=dtype_name,
- data=data,
- )
- col.update_info(new_info)
-
- vaxes.append(col)
-
- j += 1
-
- dcs = [col.name for col in vaxes if col.is_data_indexable]
-
- new_table = type(self)(
- parent=self.parent,
- group=self.group,
- encoding=self.encoding,
- errors=self.errors,
- index_axes=new_index_axes,
- non_index_axes=new_non_index_axes,
- values_axes=vaxes,
- data_columns=dcs,
- info=new_info,
- nan_rep=nan_rep,
- )
- if hasattr(self, "levels"):
- # TODO: get this into constructor, only for appropriate subclass
- new_table.levels = self.levels
-
- new_table.validate_min_itemsize(min_itemsize)
-
- if validate and table_exists:
- new_table.validate(self)
-
- return new_table
-
- @staticmethod
- def _get_blocks_and_items(
- frame: DataFrame,
- table_exists: bool,
- new_non_index_axes,
- values_axes,
- data_columns,
- ):
- # Helper to clarify non-state-altering parts of _create_axes
-
- # TODO(ArrayManager) HDFStore relies on accessing the blocks
- if isinstance(frame._mgr, ArrayManager):
- frame = frame._as_manager("block")
-
- def get_blk_items(mgr):
- return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
-
- mgr = frame._mgr
- mgr = cast(BlockManager, mgr)
- blocks: list[Block] = list(mgr.blocks)
- blk_items: list[Index] = get_blk_items(mgr)
-
- if len(data_columns):
- # TODO: prove that we only get here with axis == 1?
- # It is the case in all extant tests, but NOT the case
- # outside this `if len(data_columns)` check.
-
- axis, axis_labels = new_non_index_axes[0]
- new_labels = Index(axis_labels).difference(Index(data_columns))
- mgr = frame.reindex(new_labels, axis=axis)._mgr
- mgr = cast(BlockManager, mgr)
-
- blocks = list(mgr.blocks)
- blk_items = get_blk_items(mgr)
- for c in data_columns:
- # This reindex would raise ValueError if we had a duplicate
- # index, so we can infer that (as long as axis==1) we
- # get a single column back, so a single block.
- mgr = frame.reindex([c], axis=axis)._mgr
- mgr = cast(BlockManager, mgr)
- blocks.extend(mgr.blocks)
- blk_items.extend(get_blk_items(mgr))
-
- # reorder the blocks in the same order as the existing table if we can
- if table_exists:
- by_items = {
- tuple(b_items.tolist()): (b, b_items)
- for b, b_items in zip(blocks, blk_items)
- }
- new_blocks: list[Block] = []
- new_blk_items = []
- for ea in values_axes:
- items = tuple(ea.values)
- try:
- b, b_items = by_items.pop(items)
- new_blocks.append(b)
- new_blk_items.append(b_items)
- except (IndexError, KeyError) as err:
- jitems = ",".join([pprint_thing(item) for item in items])
- raise ValueError(
- f"cannot match existing table structure for [{jitems}] "
- "on appending data"
- ) from err
- blocks = new_blocks
- blk_items = new_blk_items
-
- return blocks, blk_items
-
- def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
- """process axes filters"""
- # make a copy to avoid side effects
- if columns is not None:
- columns = list(columns)
-
- # make sure to include levels if we have them
- if columns is not None and self.is_multi_index:
- assert isinstance(self.levels, list) # assured by is_multi_index
- for n in self.levels:
- if n not in columns:
- columns.insert(0, n)
-
- # reorder by any non_index_axes & limit to the select columns
- for axis, labels in self.non_index_axes:
- obj = _reindex_axis(obj, axis, labels, columns)
-
- def process_filter(field, filt, op):
- for axis_name in obj._AXIS_ORDERS:
- axis_number = obj._get_axis_number(axis_name)
- axis_values = obj._get_axis(axis_name)
- assert axis_number is not None
-
- # see if the field is the name of an axis
- if field == axis_name:
-                    # if we have a multi-index, then we need to include
-                    # the levels
- if self.is_multi_index:
- filt = filt.union(Index(self.levels))
-
- takers = op(axis_values, filt)
- return obj.loc(axis=axis_number)[takers]
-
-                # this might be the name of a field IN an axis
- elif field in axis_values:
- # we need to filter on this dimension
- values = ensure_index(getattr(obj, field).values)
- filt = ensure_index(filt)
-
- # hack until we support reversed dim flags
- if isinstance(obj, DataFrame):
- axis_number = 1 - axis_number
-
- takers = op(values, filt)
- return obj.loc(axis=axis_number)[takers]
-
- raise ValueError(f"cannot find the field [{field}] for filtering!")
-
- # apply the selection filters (but keep in the same order)
- if selection.filter is not None:
- for field, op, filt in selection.filter.format():
- obj = process_filter(field, filt, op)
-
- return obj
-
- def create_description(
- self,
- complib,
- complevel: int | None,
- fletcher32: bool,
- expectedrows: int | None,
- ) -> dict[str, Any]:
- """create the description of the table from the axes & values"""
-        # provide the expected rows if it's passed
- if expectedrows is None:
- expectedrows = max(self.nrows_expected, 10000)
-
- d = {"name": "table", "expectedrows": expectedrows}
-
- # description from the axes & values
- d["description"] = {a.cname: a.typ for a in self.axes}
-
- if complib:
- if complevel is None:
- complevel = self._complevel or 9
- filters = _tables().Filters(
- complevel=complevel,
- complib=complib,
- fletcher32=fletcher32 or self._fletcher32,
- )
- d["filters"] = filters
- elif self._filters is not None:
- d["filters"] = self._filters
-
- return d
-
- def read_coordinates(
- self, where=None, start: int | None = None, stop: int | None = None
- ):
- """
- select coordinates (row numbers) from a table; return the
- coordinates object
- """
- # validate the version
- self.validate_version(where)
-
- # infer the data kind
- if not self.infer_axes():
- return False
-
- # create the selection
- selection = Selection(self, where=where, start=start, stop=stop)
- coords = selection.select_coords()
- if selection.filter is not None:
- for field, op, filt in selection.filter.format():
- data = self.read_column(
- field, start=coords.min(), stop=coords.max() + 1
- )
- coords = coords[op(data.iloc[coords - coords.min()], filt).values]
-
- return Index(coords)
-
- def read_column(
- self,
- column: str,
- where=None,
- start: int | None = None,
- stop: int | None = None,
- ):
- """
-        return a single column from the table; generally only indexables
-        are interesting
- """
- # validate the version
- self.validate_version()
-
- # infer the data kind
- if not self.infer_axes():
- return False
-
- if where is not None:
- raise TypeError("read_column does not currently accept a where clause")
-
- # find the axes
- for a in self.axes:
- if column == a.name:
- if not a.is_data_indexable:
- raise ValueError(
- f"column [{column}] can not be extracted individually; "
- "it is not data indexable"
- )
-
- # column must be an indexable or a data column
- c = getattr(self.table.cols, column)
- a.set_info(self.info)
- col_values = a.convert(
- c[start:stop],
- nan_rep=self.nan_rep,
- encoding=self.encoding,
- errors=self.errors,
- )
- return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)
-
- raise KeyError(f"column [{column}] not found in the table")
-
-
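# Table.read_column above backs HDFStore.select_column, which only works for the
# stored index and for data_columns. A sketch (names are illustrative):
#
#     import pandas as pd
#
#     df = pd.DataFrame({"x": range(5), "y": list("abcde")})
#     with pd.HDFStore("store.h5") as store:
#         store.append("df", df, data_columns=["y"])
#         idx = store.select_column("df", "index")  # the serialized index
#         y = store.select_column("df", "y")        # a data_column, as a Series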
-class WORMTable(Table):
- """
-    a write-once read-many table: this format DOES NOT ALLOW appending to a
-    table. Writing is a one-time operation; the data are stored in a format
-    that allows for searching the data on disk
- """
-
- table_type = "worm"
-
- def read(
- self,
- where=None,
- columns=None,
- start: int | None = None,
- stop: int | None = None,
- ):
- """
- read the indices and the indexing array, calculate offset rows and return
- """
- raise NotImplementedError("WORMTable needs to implement read")
-
- def write(self, **kwargs) -> None:
- """
- write in a format that we can search later on (but cannot append
- to): write out the indices and the values using _write_array
-        (e.g. a CArray), and create an indexing table so that we can search
- """
- raise NotImplementedError("WORMTable needs to implement write")
-
-
-class AppendableTable(Table):
- """support the new appendable table formats"""
-
- table_type = "appendable"
-
- # error: Signature of "write" incompatible with supertype "Fixed"
- def write( # type: ignore[override]
- self,
- obj,
- axes=None,
- append: bool = False,
- complib=None,
- complevel=None,
- fletcher32=None,
- min_itemsize=None,
- chunksize=None,
- expectedrows=None,
- dropna: bool = False,
- nan_rep=None,
- data_columns=None,
- track_times: bool = True,
- ) -> None:
- if not append and self.is_exists:
- self._handle.remove_node(self.group, "table")
-
- # create the axes
- table = self._create_axes(
- axes=axes,
- obj=obj,
- validate=append,
- min_itemsize=min_itemsize,
- nan_rep=nan_rep,
- data_columns=data_columns,
- )
-
- for a in table.axes:
- a.validate_names()
-
- if not table.is_exists:
- # create the table
- options = table.create_description(
- complib=complib,
- complevel=complevel,
- fletcher32=fletcher32,
- expectedrows=expectedrows,
- )
-
- # set the table attributes
- table.set_attrs()
-
- options["track_times"] = track_times
-
- # create the table
- table._handle.create_table(table.group, **options)
-
- # update my info
- table.attrs.info = table.info
-
- # validate the axes and set the kinds
- for a in table.axes:
- a.validate_and_set(table, append)
-
- # add the rows
- table.write_data(chunksize, dropna=dropna)
-
- def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
- """
-        we form the data into a 2-d structure of indexes, values, and mask, and write it chunk-by-chunk
- """
- names = self.dtype.names
- nrows = self.nrows_expected
-
- # if dropna==True, then drop ALL nan rows
- masks = []
- if dropna:
- for a in self.values_axes:
- # figure the mask: only do if we can successfully process this
- # column, otherwise ignore the mask
- mask = isna(a.data).all(axis=0)
- if isinstance(mask, np.ndarray):
- masks.append(mask.astype("u1", copy=False))
-
- # consolidate masks
- if len(masks):
- mask = masks[0]
- for m in masks[1:]:
- mask = mask & m
- mask = mask.ravel()
- else:
- mask = None
-
- # broadcast the indexes if needed
- indexes = [a.cvalues for a in self.index_axes]
- nindexes = len(indexes)
-        assert nindexes == 1, nindexes  # ensures we don't need to broadcast
-
- # transpose the values so first dimension is last
- # reshape the values if needed
- values = [a.take_data() for a in self.values_axes]
- values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
- bvalues = []
- for i, v in enumerate(values):
- new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
- bvalues.append(v.reshape(new_shape))
-
- # write the chunks
- if chunksize is None:
- chunksize = 100000
-
- rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
- chunks = nrows // chunksize + 1
- for i in range(chunks):
- start_i = i * chunksize
- end_i = min((i + 1) * chunksize, nrows)
- if start_i >= end_i:
- break
-
- self.write_data_chunk(
- rows,
- indexes=[a[start_i:end_i] for a in indexes],
- mask=mask[start_i:end_i] if mask is not None else None,
- values=[v[start_i:end_i] for v in bvalues],
- )
-
- def write_data_chunk(
- self,
- rows: np.ndarray,
- indexes: list[np.ndarray],
- mask: npt.NDArray[np.bool_] | None,
- values: list[np.ndarray],
- ) -> None:
- """
- Parameters
- ----------
- rows : an empty memory space where we are putting the chunk
- indexes : an array of the indexes
- mask : an array of the masks
- values : an array of the values
- """
- # 0 len
- for v in values:
- if not np.prod(v.shape):
- return
-
- nrows = indexes[0].shape[0]
- if nrows != len(rows):
- rows = np.empty(nrows, dtype=self.dtype)
- names = self.dtype.names
- nindexes = len(indexes)
-
- # indexes
- for i, idx in enumerate(indexes):
- rows[names[i]] = idx
-
- # values
- for i, v in enumerate(values):
- rows[names[i + nindexes]] = v
-
- # mask
- if mask is not None:
- m = ~mask.ravel().astype(bool, copy=False)
- if not m.all():
- rows = rows[m]
-
- if len(rows):
- self.table.append(rows)
- self.table.flush()
-
- def delete(self, where=None, start: int | None = None, stop: int | None = None):
- # delete all rows (and return the nrows)
- if where is None or not len(where):
- if start is None and stop is None:
- nrows = self.nrows
- self._handle.remove_node(self.group, recursive=True)
- else:
- # pytables<3.0 would remove a single row with stop=None
- if stop is None:
- stop = self.nrows
- nrows = self.table.remove_rows(start=start, stop=stop)
- self.table.flush()
- return nrows
-
- # infer the data kind
- if not self.infer_axes():
- return None
-
- # create the selection
- table = self.table
- selection = Selection(self, where, start=start, stop=stop)
- values = selection.select_coords()
-
- # delete the rows in reverse order
- sorted_series = Series(values, copy=False).sort_values()
- ln = len(sorted_series)
-
- if ln:
- # construct groups of consecutive rows
- diff = sorted_series.diff()
- groups = list(diff[diff > 1].index)
-
- # 1 group
- if not len(groups):
- groups = [0]
-
- # final element
- if groups[-1] != ln:
- groups.append(ln)
-
- # initial element
- if groups[0] != 0:
- groups.insert(0, 0)
-
- # we must remove in reverse order!
- pg = groups.pop()
- for g in reversed(groups):
- rows = sorted_series.take(range(g, pg))
- table.remove_rows(
- start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
- )
- pg = g
-
- self.table.flush()
-
- # return the number of rows removed
- return ln
-
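# AppendableTable.delete above backs HDFStore.remove, which can drop a whole key
# or, for table format, only the rows matching a where clause. A sketch (file,
# key, and column names are illustrative):
#
#     import pandas as pd
#
#     with pd.HDFStore("store.h5") as store:
#         n = store.remove("df", where="x > 5")  # number of rows removed
#         store.remove("df")                     # drop the key entirely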
-
-class AppendableFrameTable(AppendableTable):
- """support the new appendable table formats"""
-
- pandas_kind = "frame_table"
- table_type = "appendable_frame"
- ndim = 2
- obj_type: type[DataFrame | Series] = DataFrame
-
- @property
- def is_transposed(self) -> bool:
- return self.index_axes[0].axis == 1
-
- @classmethod
- def get_object(cls, obj, transposed: bool):
- """these are written transposed"""
- if transposed:
- obj = obj.T
- return obj
-
- def read(
- self,
- where=None,
- columns=None,
- start: int | None = None,
- stop: int | None = None,
- ):
- # validate the version
- self.validate_version(where)
-
- # infer the data kind
- if not self.infer_axes():
- return None
-
- result = self._read_axes(where=where, start=start, stop=stop)
-
- info = (
- self.info.get(self.non_index_axes[0][0], {})
- if len(self.non_index_axes)
- else {}
- )
-
- inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
- assert len(inds) == 1
- ind = inds[0]
-
- index = result[ind][0]
-
- frames = []
- for i, a in enumerate(self.axes):
- if a not in self.values_axes:
- continue
- index_vals, cvalues = result[i]
-
- # we could have a multi-index constructor here
-            # ensure_index doesn't recognize our list-of-tuples here
- if info.get("type") != "MultiIndex":
- cols = Index(index_vals)
- else:
- cols = MultiIndex.from_tuples(index_vals)
-
- names = info.get("names")
- if names is not None:
- cols.set_names(names, inplace=True)
-
- if self.is_transposed:
- values = cvalues
- index_ = cols
- cols_ = Index(index, name=getattr(index, "name", None))
- else:
- values = cvalues.T
- index_ = Index(index, name=getattr(index, "name", None))
- cols_ = cols
-
- # if we have a DataIndexableCol, its shape will only be 1 dim
- if values.ndim == 1 and isinstance(values, np.ndarray):
- values = values.reshape((1, values.shape[0]))
-
- if isinstance(values, np.ndarray):
- df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
- elif isinstance(values, Index):
- df = DataFrame(values, columns=cols_, index=index_)
- else:
- # Categorical
- df = DataFrame._from_arrays([values], columns=cols_, index=index_)
- assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
- frames.append(df)
-
- if len(frames) == 1:
- df = frames[0]
- else:
- df = concat(frames, axis=1)
-
- selection = Selection(self, where=where, start=start, stop=stop)
- # apply the selection filters & axis orderings
- df = self.process_axes(df, selection=selection, columns=columns)
-
- return df
-
-
-class AppendableSeriesTable(AppendableFrameTable):
- """support the new appendable table formats"""
-
- pandas_kind = "series_table"
- table_type = "appendable_series"
- ndim = 2
- obj_type = Series
-
- @property
- def is_transposed(self) -> bool:
- return False
-
- @classmethod
- def get_object(cls, obj, transposed: bool):
- return obj
-
- def write(self, obj, data_columns=None, **kwargs):
- """we are going to write this as a frame table"""
- if not isinstance(obj, DataFrame):
- name = obj.name or "values"
- obj = obj.to_frame(name)
- return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
-
- def read(
- self,
- where=None,
- columns=None,
- start: int | None = None,
- stop: int | None = None,
- ) -> Series:
- is_multi_index = self.is_multi_index
- if columns is not None and is_multi_index:
- assert isinstance(self.levels, list) # needed for mypy
- for n in self.levels:
- if n not in columns:
- columns.insert(0, n)
- s = super().read(where=where, columns=columns, start=start, stop=stop)
- if is_multi_index:
- s.set_index(self.levels, inplace=True)
-
- s = s.iloc[:, 0]
-
- # remove the default name
- if s.name == "values":
- s.name = None
- return s
-
-
-class AppendableMultiSeriesTable(AppendableSeriesTable):
- """support the new appendable table formats"""
-
- pandas_kind = "series_table"
- table_type = "appendable_multiseries"
-
- def write(self, obj, **kwargs):
- """we are going to write this as a frame table"""
- name = obj.name or "values"
- newobj, self.levels = self.validate_multiindex(obj)
- assert isinstance(self.levels, list) # for mypy
- cols = list(self.levels)
- cols.append(name)
- newobj.columns = Index(cols)
- return super().write(obj=newobj, **kwargs)
-
-
-class GenericTable(AppendableFrameTable):
-    """a table that reads/writes the generic pytables table format"""
-
- pandas_kind = "frame_table"
- table_type = "generic_table"
- ndim = 2
- obj_type = DataFrame
- levels: list[Hashable]
-
- @property
- def pandas_type(self) -> str:
- return self.pandas_kind
-
- @property
- def storable(self):
- return getattr(self.group, "table", None) or self.group
-
- def get_attrs(self) -> None:
- """retrieve our attributes"""
- self.non_index_axes = []
- self.nan_rep = None
- self.levels = []
-
- self.index_axes = [a for a in self.indexables if a.is_an_indexable]
- self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
- self.data_columns = [a.name for a in self.values_axes]
-
- @cache_readonly
- def indexables(self):
- """create the indexables from the table description"""
- d = self.description
-
- # TODO: can we get a typ for this? AFAICT it is the only place
- # where we aren't passing one
-        # the index column is just a simple index
- md = self.read_metadata("index")
- meta = "category" if md is not None else None
- index_col = GenericIndexCol(
- name="index", axis=0, table=self.table, meta=meta, metadata=md
- )
-
- _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
-
- for i, n in enumerate(d._v_names):
- assert isinstance(n, str)
-
- atom = getattr(d, n)
- md = self.read_metadata(n)
- meta = "category" if md is not None else None
- dc = GenericDataIndexableCol(
- name=n,
- pos=i,
- values=[n],
- typ=atom,
- table=self.table,
- meta=meta,
- metadata=md,
- )
- _indexables.append(dc)
-
- return _indexables
-
- def write(self, **kwargs):
-        raise NotImplementedError("cannot write on a generic table")
-
-
-class AppendableMultiFrameTable(AppendableFrameTable):
- """a frame with a multi-index"""
-
- table_type = "appendable_multiframe"
- obj_type = DataFrame
- ndim = 2
- _re_levels = re.compile(r"^level_\d+$")
-
- @property
- def table_type_short(self) -> str:
- return "appendable_multi"
-
- def write(self, obj, data_columns=None, **kwargs):
- if data_columns is None:
- data_columns = []
- elif data_columns is True:
- data_columns = obj.columns.tolist()
- obj, self.levels = self.validate_multiindex(obj)
- assert isinstance(self.levels, list) # for mypy
- for n in self.levels:
- if n not in data_columns:
- data_columns.insert(0, n)
- return super().write(obj=obj, data_columns=data_columns, **kwargs)
-
- def read(
- self,
- where=None,
- columns=None,
- start: int | None = None,
- stop: int | None = None,
- ):
- df = super().read(where=where, columns=columns, start=start, stop=stop)
- df = df.set_index(self.levels)
-
- # remove names for 'level_%d'
- df.index = df.index.set_names(
- [None if self._re_levels.search(name) else name for name in df.index.names]
- )
-
- return df
-
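# AppendableMultiFrameTable above stores a MultiIndex DataFrame by resetting the
# index into level_* / named columns and restoring it on read. A sketch with the
# public API (path and key are illustrative):
#
#     import pandas as pd
#
#     mi = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["k1", "k2"])
#     df = pd.DataFrame({"v": range(4)}, index=mi)
#     df.to_hdf("store.h5", key="mi_df", format="table")
#     back = pd.read_hdf("store.h5", "mi_df")
#     assert list(back.index.names) == ["k1", "k2"]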
-
-def _reindex_axis(
- obj: DataFrame, axis: AxisInt, labels: Index, other=None
-) -> DataFrame:
- ax = obj._get_axis(axis)
- labels = ensure_index(labels)
-
- # try not to reindex even if other is provided
- # if it equals our current index
- if other is not None:
- other = ensure_index(other)
- if (other is None or labels.equals(other)) and labels.equals(ax):
- return obj
-
- labels = ensure_index(labels.unique())
- if other is not None:
- labels = ensure_index(other.unique()).intersection(labels, sort=False)
- if not labels.equals(ax):
- slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
- slicer[axis] = labels
- obj = obj.loc[tuple(slicer)]
- return obj
-
-
-# tz to/from coercion
-
-
-def _get_tz(tz: tzinfo) -> str | tzinfo:
- """for a tz-aware type, return an encoded zone"""
- zone = timezones.get_timezone(tz)
- return zone
-
-
-@overload
-def _set_tz(
- values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
-) -> DatetimeIndex:
- ...
-
-
-@overload
-def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
- ...
-
-
-def _set_tz(
- values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
-) -> np.ndarray | DatetimeIndex:
- """
- coerce the values to a DatetimeIndex if tz is set
- preserve the input shape if possible
-
- Parameters
- ----------
- values : ndarray or Index
- tz : str or tzinfo
- coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
- """
- if isinstance(values, DatetimeIndex):
- # If values is tzaware, the tz gets dropped in the values.ravel()
- # call below (which returns an ndarray). So we are only non-lossy
- # if `tz` matches `values.tz`.
- assert values.tz is None or values.tz == tz
-
- if tz is not None:
- if isinstance(values, DatetimeIndex):
- name = values.name
- values = values.asi8
- else:
- name = None
- values = values.ravel()
-
- tz = _ensure_decoded(tz)
- values = DatetimeIndex(values, name=name)
- values = values.tz_localize("UTC").tz_convert(tz)
- elif coerce:
- values = np.asarray(values, dtype="M8[ns]")
-
- # error: Incompatible return value type (got "Union[ndarray, Index]",
- # expected "Union[ndarray, DatetimeIndex]")
- return values # type: ignore[return-value]
-
-
-def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
- assert isinstance(name, str)
-
- index_name = index.name
- # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
- # expected "Union[ExtensionArray, ndarray]"
- converted, dtype_name = _get_data_and_dtype_name(index) # type: ignore[arg-type]
- kind = _dtype_to_kind(dtype_name)
- atom = DataIndexableCol._get_atom(converted)
-
- if (
- (isinstance(index.dtype, np.dtype) and is_integer_dtype(index))
- or needs_i8_conversion(index.dtype)
- or is_bool_dtype(index.dtype)
- ):
- # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
- # in which case "kind" is "integer", "integer", "datetime64",
- # "timedelta64", and "integer", respectively.
- return IndexCol(
- name,
- values=converted,
- kind=kind,
- typ=atom,
- freq=getattr(index, "freq", None),
- tz=getattr(index, "tz", None),
- index_name=index_name,
- )
-
- if isinstance(index, MultiIndex):
- raise TypeError("MultiIndex not supported here!")
-
- inferred_type = lib.infer_dtype(index, skipna=False)
- # we won't get inferred_type of "datetime64" or "timedelta64" as these
- # would go through the DatetimeIndex/TimedeltaIndex paths above
-
- values = np.asarray(index)
-
- if inferred_type == "date":
- converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
- return IndexCol(
- name, converted, "date", _tables().Time32Col(), index_name=index_name
- )
- elif inferred_type == "string":
- converted = _convert_string_array(values, encoding, errors)
- itemsize = converted.dtype.itemsize
- return IndexCol(
- name,
- converted,
- "string",
- _tables().StringCol(itemsize),
- index_name=index_name,
- )
-
- elif inferred_type in ["integer", "floating"]:
- return IndexCol(
- name, values=converted, kind=kind, typ=atom, index_name=index_name
- )
- else:
- assert isinstance(converted, np.ndarray) and converted.dtype == object
- assert kind == "object", kind
- atom = _tables().ObjectAtom()
- return IndexCol(name, converted, kind, atom, index_name=index_name)
-
-
-def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
- index: Index | np.ndarray
-
- if kind == "datetime64":
- index = DatetimeIndex(data)
- elif kind == "timedelta64":
- index = TimedeltaIndex(data)
- elif kind == "date":
- try:
- index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
- except ValueError:
- index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
- elif kind in ("integer", "float", "bool"):
- index = np.asarray(data)
- elif kind in ("string"):
- index = _unconvert_string_array(
- data, nan_rep=None, encoding=encoding, errors=errors
- )
- elif kind == "object":
- index = np.asarray(data[0])
- else: # pragma: no cover
- raise ValueError(f"unrecognized index type {kind}")
- return index
-
-
-def _maybe_convert_for_string_atom(
- name: str,
- bvalues: ArrayLike,
- existing_col,
- min_itemsize,
- nan_rep,
- encoding,
- errors,
- columns: list[str],
-):
- if bvalues.dtype != object:
- return bvalues
-
- bvalues = cast(np.ndarray, bvalues)
-
- dtype_name = bvalues.dtype.name
- inferred_type = lib.infer_dtype(bvalues, skipna=False)
-
- if inferred_type == "date":
- raise TypeError("[date] is not implemented as a table column")
- if inferred_type == "datetime":
- # after GH#8260
- # this only would be hit for a multi-timezone dtype which is an error
- raise TypeError(
- "too many timezones in this block, create separate data columns"
- )
-
- if not (inferred_type == "string" or dtype_name == "object"):
- return bvalues
-
- mask = isna(bvalues)
- data = bvalues.copy()
- data[mask] = nan_rep
-
- # see if we have a valid string type
- inferred_type = lib.infer_dtype(data, skipna=False)
- if inferred_type != "string":
- # we cannot serialize this data, so report an exception on a column
- # by column basis
-
- # expected behaviour:
- # search block for a non-string object column by column
- for i in range(data.shape[0]):
- col = data[i]
- inferred_type = lib.infer_dtype(col, skipna=False)
- if inferred_type != "string":
- error_column_label = columns[i] if len(columns) > i else f"No.{i}"
- raise TypeError(
- f"Cannot serialize the column [{error_column_label}]\n"
- f"because its data contents are not [string] but "
- f"[{inferred_type}] object dtype"
- )
-
- # itemsize is the maximum length of a string (along any dimension)
-
- data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
- itemsize = data_converted.itemsize
-
- # specified min_itemsize?
- if isinstance(min_itemsize, dict):
- min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
- itemsize = max(min_itemsize or 0, itemsize)
-
- # check for column in the values conflicts
- if existing_col is not None:
- eci = existing_col.validate_col(itemsize)
- if eci is not None and eci > itemsize:
- itemsize = eci
-
- data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
- return data_converted
-
-
-def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
- """
- Take a string-like that is object dtype and coerce to a fixed size string type.
-
- Parameters
- ----------
- data : np.ndarray[object]
- encoding : str
- errors : str
- Handler for encoding errors.
-
- Returns
- -------
- np.ndarray[fixed-length-string]
- """
- # encode if needed
- if len(data):
- data = (
- Series(data.ravel(), copy=False)
- .str.encode(encoding, errors)
- ._values.reshape(data.shape)
- )
-
- # create the sized dtype
- ensured = ensure_object(data.ravel())
- itemsize = max(1, libwriters.max_len_string_array(ensured))
-
- data = np.asarray(data, dtype=f"S{itemsize}")
- return data
-
-
-def _unconvert_string_array(
- data: np.ndarray, nan_rep, encoding: str, errors: str
-) -> np.ndarray:
- """
- Inverse of _convert_string_array.
-
- Parameters
- ----------
- data : np.ndarray[fixed-length-string]
- nan_rep : the storage repr of NaN
- encoding : str
- errors : str
- Handler for encoding errors.
-
- Returns
- -------
- np.ndarray[object]
- Decoded data.
- """
- shape = data.shape
- data = np.asarray(data.ravel(), dtype=object)
-
- if len(data):
- itemsize = libwriters.max_len_string_array(ensure_object(data))
- dtype = f"U{itemsize}"
-
- if isinstance(data[0], bytes):
- data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
- else:
- data = data.astype(dtype, copy=False).astype(object, copy=False)
-
- if nan_rep is None:
- nan_rep = "nan"
-
- libwriters.string_array_replace_from_nan_rep(data, nan_rep)
- return data.reshape(shape)
-
-
-def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
- assert isinstance(val_kind, str), type(val_kind)
- if _need_convert(val_kind):
- conv = _get_converter(val_kind, encoding, errors)
- values = conv(values)
- return values
-
-
-def _get_converter(kind: str, encoding: str, errors: str):
- if kind == "datetime64":
- return lambda x: np.asarray(x, dtype="M8[ns]")
- elif kind == "string":
- return lambda x: _unconvert_string_array(
- x, nan_rep=None, encoding=encoding, errors=errors
- )
- else: # pragma: no cover
- raise ValueError(f"invalid kind {kind}")
-
-
-def _need_convert(kind: str) -> bool:
- if kind in ("datetime64", "string"):
- return True
- return False
-
-
-def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
- """
-    Prior to 0.10.1, values blocks named like values_block_0 were instead
-    named values_0; adjust the given name if necessary.
-
- Parameters
- ----------
- name : str
- version : Tuple[int, int, int]
-
- Returns
- -------
- str
- """
- if isinstance(version, str) or len(version) < 3:
- raise ValueError("Version is incorrect, expected sequence of 3 integers.")
-
- if version[0] == 0 and version[1] <= 10 and version[2] == 0:
- m = re.search(r"values_block_(\d+)", name)
- if m:
- grp = m.groups()[0]
- name = f"values_{grp}"
- return name
-
-
-def _dtype_to_kind(dtype_str: str) -> str:
- """
- Find the "kind" string describing the given dtype name.
- """
- dtype_str = _ensure_decoded(dtype_str)
-
- if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
- kind = "string"
- elif dtype_str.startswith("float"):
- kind = "float"
- elif dtype_str.startswith("complex"):
- kind = "complex"
- elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
- kind = "integer"
- elif dtype_str.startswith("datetime64"):
- kind = "datetime64"
- elif dtype_str.startswith("timedelta"):
- kind = "timedelta64"
- elif dtype_str.startswith("bool"):
- kind = "bool"
- elif dtype_str.startswith("category"):
- kind = "category"
- elif dtype_str.startswith("period"):
- # We store the `freq` attr so we can restore from integers
- kind = "integer"
- elif dtype_str == "object":
- kind = "object"
- else:
- raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
-
- return kind
-
-
-def _get_data_and_dtype_name(data: ArrayLike):
- """
- Convert the passed data into a storable form and a dtype string.
- """
- if isinstance(data, Categorical):
- data = data.codes
-
- # For datetime64tz we need to drop the TZ in tests TODO: why?
- dtype_name = data.dtype.name.split("[")[0]
-
- if data.dtype.kind in ["m", "M"]:
- data = np.asarray(data.view("i8"))
- # TODO: we used to reshape for the dt64tz case, but no longer
- # doing that doesn't seem to break anything. why?
-
- elif isinstance(data, PeriodIndex):
- data = data.asi8
-
- data = np.asarray(data)
- return data, dtype_name
-
-
-class Selection:
- """
- Carries out a selection operation on a tables.Table object.
-
- Parameters
- ----------
- table : a Table object
- where : list of Terms (or convertible to)
- start, stop: indices to start and/or stop selection
-
- """
-
- def __init__(
- self,
- table: Table,
- where=None,
- start: int | None = None,
- stop: int | None = None,
- ) -> None:
- self.table = table
- self.where = where
- self.start = start
- self.stop = stop
- self.condition = None
- self.filter = None
- self.terms = None
- self.coordinates = None
-
- if is_list_like(where):
- # see if we have a passed coordinate like
- with suppress(ValueError):
- inferred = lib.infer_dtype(where, skipna=False)
- if inferred in ("integer", "boolean"):
- where = np.asarray(where)
- if where.dtype == np.bool_:
- start, stop = self.start, self.stop
- if start is None:
- start = 0
- if stop is None:
- stop = self.table.nrows
- self.coordinates = np.arange(start, stop)[where]
- elif issubclass(where.dtype.type, np.integer):
- if (self.start is not None and (where < self.start).any()) or (
- self.stop is not None and (where >= self.stop).any()
- ):
- raise ValueError(
- "where must have index locations >= start and < stop"
- )
- self.coordinates = where
-
- if self.coordinates is None:
- self.terms = self.generate(where)
-
- # create the numexpr & the filter
- if self.terms is not None:
- self.condition, self.filter = self.terms.evaluate()
-
- def generate(self, where):
-        """where can be a dict, list, tuple, or string"""
- if where is None:
- return None
-
- q = self.table.queryables()
- try:
- return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
- except NameError as err:
- # raise a nice message, suggesting that the user should use
- # data_columns
- qkeys = ",".join(q.keys())
- msg = dedent(
- f"""\
- The passed where expression: {where}
- contains an invalid variable reference
- all of the variable references must be a reference to
- an axis (e.g. 'index' or 'columns'), or a data_column
- The currently defined references are: {qkeys}
- """
- )
- raise ValueError(msg) from err
-
- def select(self):
- """
- generate the selection
- """
- if self.condition is not None:
- return self.table.table.read_where(
- self.condition.format(), start=self.start, stop=self.stop
- )
- elif self.coordinates is not None:
- return self.table.table.read_coordinates(self.coordinates)
- return self.table.table.read(start=self.start, stop=self.stop)
-
- def select_coords(self):
- """
- generate the selection
- """
- start, stop = self.start, self.stop
- nrows = self.table.nrows
- if start is None:
- start = 0
- elif start < 0:
- start += nrows
- if stop is None:
- stop = nrows
- elif stop < 0:
- stop += nrows
-
- if self.condition is not None:
- return self.table.table.get_where_list(
- self.condition.format(), start=start, stop=stop, sort=True
- )
- elif self.coordinates is not None:
- return self.coordinates
-
- return np.arange(start, stop)
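The Selection/PyTablesExpr machinery deleted above is what evaluates where=
queries against table-format keys. A minimal sketch using only the public API
(file name, key, and column names are illustrative; PyTables must be installed):

    import pandas as pd

    df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")})
    df.to_hdf("store.h5", key="df", format="table", data_columns=["x", "y"])

    # where expressions may reference the index, 'columns', or any data_column
    subset = pd.read_hdf("store.h5", "df", where="x > 5 & y == 'e'")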
diff --git a/contrib/python/pandas/py3/pandas/io/sas/__init__.py b/contrib/python/pandas/py3/pandas/io/sas/__init__.py
deleted file mode 100644
index 317730745b6..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from pandas.io.sas.sasreader import read_sas
-
-__all__ = ["read_sas"]
diff --git a/contrib/python/pandas/py3/pandas/io/sas/_byteswap.pyi b/contrib/python/pandas/py3/pandas/io/sas/_byteswap.pyi
deleted file mode 100644
index bb0dbfc6a50..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/_byteswap.pyi
+++ /dev/null
@@ -1,5 +0,0 @@
-def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
-def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
-def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
-def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
-def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
diff --git a/contrib/python/pandas/py3/pandas/io/sas/_sas.pyi b/contrib/python/pandas/py3/pandas/io/sas/_sas.pyi
deleted file mode 100644
index 5d65e2b56b5..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/_sas.pyi
+++ /dev/null
@@ -1,7 +0,0 @@
-from pandas.io.sas.sas7bdat import SAS7BDATReader
-
-class Parser:
- def __init__(self, parser: SAS7BDATReader) -> None: ...
- def read(self, nrows: int) -> None: ...
-
-def get_subheader_index(signature: bytes) -> int: ...
diff --git a/contrib/python/pandas/py3/pandas/io/sas/byteswap.pyx b/contrib/python/pandas/py3/pandas/io/sas/byteswap.pyx
deleted file mode 100644
index 511af5140b5..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/byteswap.pyx
+++ /dev/null
@@ -1,93 +0,0 @@
-"""
-The following are faster versions of struct.unpack that avoid the overhead of Python
-function calls.
-
-In the SAS7BDAT parser, they may be called up to (n_rows * n_cols) times.
-"""
-from cython cimport Py_ssize_t
-from libc.stdint cimport (
- uint16_t,
- uint32_t,
- uint64_t,
-)
-
-
-def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
- assert offset + 4 < len(data)
- cdef:
- const char *data_ptr = data
- float res = (<float*>(data_ptr + offset))[0]
- if byteswap:
- res = _byteswap_float(res)
- return res
-
-
-def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
- assert offset + 8 < len(data)
- cdef:
- const char *data_ptr = data
- double res = (<double*>(data_ptr + offset))[0]
- if byteswap:
- res = _byteswap_double(res)
- return res
-
-
-def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
- assert offset + 2 < len(data)
- cdef:
- const char *data_ptr = data
- uint16_t res = (<uint16_t *>(data_ptr + offset))[0]
- if byteswap:
- res = _byteswap2(res)
- return res
-
-
-def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
- assert offset + 4 < len(data)
- cdef:
- const char *data_ptr = data
- uint32_t res = (<uint32_t *>(data_ptr + offset))[0]
- if byteswap:
- res = _byteswap4(res)
- return res
-
-
-def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
- assert offset + 8 < len(data)
- cdef:
- const char *data_ptr = data
- uint64_t res = (<uint64_t *>(data_ptr + offset))[0]
- if byteswap:
- res = _byteswap8(res)
- return res
-
-
-# Byteswapping
-
-cdef extern from *:
- """
- #ifdef _MSC_VER
- #define _byteswap2 _byteswap_ushort
- #define _byteswap4 _byteswap_ulong
- #define _byteswap8 _byteswap_uint64
- #else
- #define _byteswap2 __builtin_bswap16
- #define _byteswap4 __builtin_bswap32
- #define _byteswap8 __builtin_bswap64
- #endif
- """
- uint16_t _byteswap2(uint16_t)
- uint32_t _byteswap4(uint32_t)
- uint64_t _byteswap8(uint64_t)
-
-
-cdef float _byteswap_float(float num):
- cdef uint32_t *intptr = <uint32_t *>&num
- intptr[0] = _byteswap4(intptr[0])
- return num
-
-
-cdef double _byteswap_double(double num):
- cdef uint64_t *intptr = <uint64_t *>&num
- intptr[0] = _byteswap8(intptr[0])
- return num
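For reference, a rough pure-Python equivalent of the readers above (a sketch, not part of the diff); the Cython versions exist to avoid exactly this struct.unpack_from overhead in the per-row hot path:

import struct
import sys

def read_float_with_byteswap_py(data: bytes, offset: int, byteswap: bool) -> float:
    # Interpret 4 bytes at `offset` in native order, byte-swapped when the
    # file's byte order differs from the machine's: same semantics as above.
    native = "<" if sys.byteorder == "little" else ">"
    swapped = ">" if native == "<" else "<"
    fmt = (swapped if byteswap else native) + "f"
    return struct.unpack_from(fmt, data, offset)[0]

print(read_float_with_byteswap_py(struct.pack("f", 2.5), 0, byteswap=False))  # 2.5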
diff --git a/contrib/python/pandas/py3/pandas/io/sas/sas.pyx b/contrib/python/pandas/py3/pandas/io/sas/sas.pyx
deleted file mode 100644
index 6669686d7aa..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/sas.pyx
+++ /dev/null
@@ -1,548 +0,0 @@
-# cython: language_level=3, initializedcheck=False
-# cython: warn.maybe_uninitialized=True, warn.unused=True
-from cython cimport Py_ssize_t
-from libc.stddef cimport size_t
-from libc.stdint cimport (
- int64_t,
- uint8_t,
- uint16_t,
- uint32_t,
- uint64_t,
-)
-from libc.stdlib cimport (
- calloc,
- free,
-)
-
-import numpy as np
-
-import pandas.io.sas.sas_constants as const
-
-
-cdef object np_nan = np.nan
-
-
-cdef struct Buffer:
- # Convenience wrapper for uint8_t data to allow fast and safe reads and writes.
- # We use this as a replacement for np.array(..., dtype=np.uint8) because it's
- # much slower to create NumPy arrays and we create Buffer instances many times
- # when reading a SAS7BDAT file (roughly once per row that is being read).
- uint8_t *data
- size_t length
-
-
-cdef uint8_t buf_get(Buffer buf, size_t offset) except? 255:
- assert offset < buf.length, "Out of bounds read"
- return buf.data[offset]
-
-
-cdef bint buf_set(Buffer buf, size_t offset, uint8_t value) except 0:
- assert offset < buf.length, "Out of bounds write"
- buf.data[offset] = value
- return True
-
-
-cdef bytes buf_as_bytes(Buffer buf, size_t offset, size_t length):
- assert offset + length <= buf.length, "Out of bounds read"
- return buf.data[offset:offset+length]
-
-
-cdef Buffer buf_new(size_t length) except *:
- cdef uint8_t *data = <uint8_t *>calloc(length, sizeof(uint8_t))
- if data == NULL:
- raise MemoryError(f"Failed to allocate {length} bytes")
- return Buffer(data, length)
-
-
-cdef buf_free(Buffer buf):
- if buf.data != NULL:
- free(buf.data)
-
-# rle_decompress decompresses data using a Run Length Encoding
-# algorithm. It is partially documented here:
-#
-# https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
-cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 0:
-
- cdef:
- uint8_t control_byte, x
- int rpos = 0
- int i, nbytes, end_of_first_byte
- size_t ipos = 0
- Py_ssize_t _
-
- while ipos < inbuff.length:
- control_byte = buf_get(inbuff, ipos) & 0xF0
- end_of_first_byte = <int>(buf_get(inbuff, ipos) & 0x0F)
- ipos += 1
-
- if control_byte == 0x00:
- nbytes = <int>(buf_get(inbuff, ipos)) + 64 + end_of_first_byte * 256
- ipos += 1
- for _ in range(nbytes):
- buf_set(outbuff, rpos, buf_get(inbuff, ipos))
- rpos += 1
- ipos += 1
- elif control_byte == 0x40:
- # not documented
- nbytes = <int>(buf_get(inbuff, ipos)) + 18 + end_of_first_byte * 256
- ipos += 1
- for _ in range(nbytes):
- buf_set(outbuff, rpos, buf_get(inbuff, ipos))
- rpos += 1
- ipos += 1
- elif control_byte == 0x60:
- nbytes = end_of_first_byte * 256 + <int>(buf_get(inbuff, ipos)) + 17
- ipos += 1
- for _ in range(nbytes):
- buf_set(outbuff, rpos, 0x20)
- rpos += 1
- elif control_byte == 0x70:
- nbytes = end_of_first_byte * 256 + <int>(buf_get(inbuff, ipos)) + 17
- ipos += 1
- for _ in range(nbytes):
- buf_set(outbuff, rpos, 0x00)
- rpos += 1
- elif control_byte == 0x80:
- nbytes = end_of_first_byte + 1
- for i in range(nbytes):
- buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
- rpos += 1
- ipos += nbytes
- elif control_byte == 0x90:
- nbytes = end_of_first_byte + 17
- for i in range(nbytes):
- buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
- rpos += 1
- ipos += nbytes
- elif control_byte == 0xA0:
- nbytes = end_of_first_byte + 33
- for i in range(nbytes):
- buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
- rpos += 1
- ipos += nbytes
- elif control_byte == 0xB0:
- nbytes = end_of_first_byte + 49
- for i in range(nbytes):
- buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
- rpos += 1
- ipos += nbytes
- elif control_byte == 0xC0:
- nbytes = end_of_first_byte + 3
- x = buf_get(inbuff, ipos)
- ipos += 1
- for _ in range(nbytes):
- buf_set(outbuff, rpos, x)
- rpos += 1
- elif control_byte == 0xD0:
- nbytes = end_of_first_byte + 2
- for _ in range(nbytes):
- buf_set(outbuff, rpos, 0x40)
- rpos += 1
- elif control_byte == 0xE0:
- nbytes = end_of_first_byte + 2
- for _ in range(nbytes):
- buf_set(outbuff, rpos, 0x20)
- rpos += 1
- elif control_byte == 0xF0:
- nbytes = end_of_first_byte + 2
- for _ in range(nbytes):
- buf_set(outbuff, rpos, 0x00)
- rpos += 1
- else:
- raise ValueError(f"unknown control byte: {control_byte}")
-
- return rpos
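# Illustrative only (not part of the deleted file): a tiny pure-Python decoder
# covering three of the control bytes handled above, to show the scheme: the
# high nibble of each control byte selects the command and the low nibble
# feeds into the run length.
def rle_decompress_py(inbuff: bytes) -> bytes:
    out = bytearray()
    ipos = 0
    while ipos < len(inbuff):
        control, low = inbuff[ipos] & 0xF0, inbuff[ipos] & 0x0F
        ipos += 1
        if control == 0x80:              # copy `low + 1` literal bytes
            n = low + 1
            out += inbuff[ipos:ipos + n]
            ipos += n
        elif control == 0xC0:            # repeat the next byte `low + 3` times
            out += bytes([inbuff[ipos]]) * (low + 3)
            ipos += 1
        elif control == 0xF0:            # emit `low + 2` NUL bytes
            out += b"\x00" * (low + 2)
        else:
            raise ValueError(f"control byte 0x{control:02X} not covered by this sketch")
    return bytes(out)

assert rle_decompress_py(b"\xc2A") == b"AAAAA"    # 0xC2: repeat 'A' (2 + 3) times
assert rle_decompress_py(b"\x82abc") == b"abc"    # 0x82: copy 2 + 1 literal bytes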
-
-
-# rdc_decompress decompresses data using the Ross Data Compression algorithm:
-#
-# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
-cdef int rdc_decompress(Buffer inbuff, Buffer outbuff) except? 0:
-
- cdef:
- uint8_t cmd
- uint16_t ctrl_bits = 0, ctrl_mask = 0, ofs, cnt
- int rpos = 0, k, ii
- size_t ipos = 0
-
- ii = -1
-
- while ipos < inbuff.length:
- ii += 1
- ctrl_mask = ctrl_mask >> 1
- if ctrl_mask == 0:
- ctrl_bits = ((<uint16_t>buf_get(inbuff, ipos) << 8) +
- <uint16_t>buf_get(inbuff, ipos + 1))
- ipos += 2
- ctrl_mask = 0x8000
-
- if ctrl_bits & ctrl_mask == 0:
- buf_set(outbuff, rpos, buf_get(inbuff, ipos))
- ipos += 1
- rpos += 1
- continue
-
- cmd = (buf_get(inbuff, ipos) >> 4) & 0x0F
- cnt = <uint16_t>(buf_get(inbuff, ipos) & 0x0F)
- ipos += 1
-
- # short RLE
- if cmd == 0:
- cnt += 3
- for k in range(cnt):
- buf_set(outbuff, rpos + k, buf_get(inbuff, ipos))
- rpos += cnt
- ipos += 1
-
- # long RLE
- elif cmd == 1:
- cnt += <uint16_t>buf_get(inbuff, ipos) << 4
- cnt += 19
- ipos += 1
- for k in range(cnt):
- buf_set(outbuff, rpos + k, buf_get(inbuff, ipos))
- rpos += cnt
- ipos += 1
-
- # long pattern
- elif cmd == 2:
- ofs = cnt + 3
- ofs += <uint16_t>buf_get(inbuff, ipos) << 4
- ipos += 1
- cnt = <uint16_t>buf_get(inbuff, ipos)
- ipos += 1
- cnt += 16
- for k in range(cnt):
- buf_set(outbuff, rpos + k, buf_get(outbuff, rpos - <int>ofs + k))
- rpos += cnt
-
- # short pattern
- else:
- ofs = cnt + 3
- ofs += <uint16_t>buf_get(inbuff, ipos) << 4
- ipos += 1
- for k in range(cmd):
- buf_set(outbuff, rpos + k, buf_get(outbuff, rpos - <int>ofs + k))
- rpos += cmd
-
- return rpos
-
-
-cdef enum ColumnTypes:
- column_type_decimal = 1
- column_type_string = 2
-
-
-# Const aliases
-assert len(const.page_meta_types) == 2
-cdef:
- int page_meta_types_0 = const.page_meta_types[0]
- int page_meta_types_1 = const.page_meta_types[1]
- int page_mix_type = const.page_mix_type
- int page_data_type = const.page_data_type
- int subheader_pointers_offset = const.subheader_pointers_offset
-
- # Copy of subheader_signature_to_index that allows for much faster lookups.
- # Lookups are done in get_subheader_index. The C structures are initialized
- # in _init_subheader_signatures().
- uint32_t subheader_signatures_32bit[13]
- int subheader_indices_32bit[13]
- uint64_t subheader_signatures_64bit[17]
- int subheader_indices_64bit[17]
- int data_subheader_index = const.SASIndex.data_subheader_index
-
-
-def _init_subheader_signatures():
- subheaders_32bit = [
- (sig, idx)
- for sig, idx in const.subheader_signature_to_index.items()
- if len(sig) == 4
- ]
- subheaders_64bit = [
- (sig, idx)
- for sig, idx in const.subheader_signature_to_index.items()
- if len(sig) == 8
- ]
- assert len(subheaders_32bit) == 13
- assert len(subheaders_64bit) == 17
- assert len(const.subheader_signature_to_index) == 13 + 17
- for i, (signature, idx) in enumerate(subheaders_32bit):
- subheader_signatures_32bit[i] = (<uint32_t *><char *>signature)[0]
- subheader_indices_32bit[i] = idx
- for i, (signature, idx) in enumerate(subheaders_64bit):
- subheader_signatures_64bit[i] = (<uint64_t *><char *>signature)[0]
- subheader_indices_64bit[i] = idx
-
-
-_init_subheader_signatures()
-
-
-def get_subheader_index(bytes signature):
- """Fast version of 'subheader_signature_to_index.get(signature)'."""
- cdef:
- uint32_t sig32
- uint64_t sig64
- Py_ssize_t i
- assert len(signature) in (4, 8)
- if len(signature) == 4:
- sig32 = (<uint32_t *><char *>signature)[0]
- for i in range(len(subheader_signatures_32bit)):
- if subheader_signatures_32bit[i] == sig32:
- return subheader_indices_32bit[i]
- else:
- sig64 = (<uint64_t *><char *>signature)[0]
- for i in range(len(subheader_signatures_64bit)):
- if subheader_signatures_64bit[i] == sig64:
- return subheader_indices_64bit[i]
-
- return data_subheader_index
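# Illustrative only (not part of the deleted file): the function above is a
# drop-in replacement for this plain dict lookup; the C arrays just avoid
# bytes hashing in the per-subheader hot path.
import pandas.io.sas.sas_constants as const   # same alias as at the top of this module

def get_subheader_index_py(signature: bytes) -> int:
    return const.subheader_signature_to_index.get(
        signature, const.SASIndex.data_subheader_index
    )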
-
-
-cdef class Parser:
-
- cdef:
- int column_count
- int64_t[:] lengths
- int64_t[:] offsets
- int64_t[:] column_types
- uint8_t[:, :] byte_chunk
- object[:, :] string_chunk
- uint8_t *cached_page
- int cached_page_len
- int current_row_on_page_index
- int current_page_block_count
- int current_page_data_subheader_pointers_len
- int current_page_subheaders_count
- int current_row_in_chunk_index
- int current_row_in_file_index
- bint blank_missing
- int header_length
- int row_length
- int bit_offset
- int subheader_pointer_length
- int current_page_type
- bint is_little_endian
- int (*decompress)(Buffer, Buffer) except? 0
- object parser
-
- def __init__(self, object parser):
- cdef:
- int j
- char[:] column_types
-
- self.parser = parser
- self.blank_missing = parser.blank_missing
- self.header_length = self.parser.header_length
- self.column_count = parser.column_count
- self.lengths = parser.column_data_lengths()
- self.offsets = parser.column_data_offsets()
- self.byte_chunk = parser._byte_chunk
- self.string_chunk = parser._string_chunk
- self.row_length = parser.row_length
- self.bit_offset = self.parser._page_bit_offset
- self.subheader_pointer_length = self.parser._subheader_pointer_length
- self.is_little_endian = parser.byte_order == "<"
- self.column_types = np.empty(self.column_count, dtype="int64")
-
- # page indicators
- self.update_next_page()
-
- column_types = parser.column_types()
-
- # map column types
- for j in range(self.column_count):
- if column_types[j] == b"d":
- self.column_types[j] = column_type_decimal
- elif column_types[j] == b"s":
- self.column_types[j] = column_type_string
- else:
- raise ValueError(f"unknown column type: {self.parser.columns[j].ctype}")
-
- # compression
- if parser.compression == const.rle_compression:
- self.decompress = rle_decompress
- elif parser.compression == const.rdc_compression:
- self.decompress = rdc_decompress
- else:
- self.decompress = NULL
-
- # update to current state of the parser
- self.current_row_in_chunk_index = parser._current_row_in_chunk_index
- self.current_row_in_file_index = parser._current_row_in_file_index
- self.current_row_on_page_index = parser._current_row_on_page_index
-
- def read(self, int nrows):
- cdef:
- bint done
- Py_ssize_t i
-
- for i in range(nrows):
- done = self.readline()
- if done:
- break
-
- # update the parser
- self.parser._current_row_on_page_index = self.current_row_on_page_index
- self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index
- self.parser._current_row_in_file_index = self.current_row_in_file_index
-
- cdef bint read_next_page(self) except? True:
- cdef bint done
-
- done = self.parser._read_next_page()
- if done:
- self.cached_page = NULL
- else:
- self.update_next_page()
- return done
-
- cdef update_next_page(self):
- # update data for the current page
-
- self.cached_page = <uint8_t *>self.parser._cached_page
- self.cached_page_len = len(self.parser._cached_page)
- self.current_row_on_page_index = 0
- self.current_page_type = self.parser._current_page_type
- self.current_page_block_count = self.parser._current_page_block_count
- self.current_page_data_subheader_pointers_len = len(
- self.parser._current_page_data_subheader_pointers
- )
- self.current_page_subheaders_count = self.parser._current_page_subheaders_count
-
- cdef bint readline(self) except? True:
-
- cdef:
- int offset, length, bit_offset, align_correction
- int subheader_pointer_length, mn
- bint done, flag
-
- bit_offset = self.bit_offset
- subheader_pointer_length = self.subheader_pointer_length
-
- # If there is no page, go to the end of the header and read a page.
- if self.cached_page == NULL:
- self.parser._path_or_buf.seek(self.header_length)
- done = self.read_next_page()
- if done:
- return True
-
- # Loop until a data row is read
- while True:
- if self.current_page_type in (page_meta_types_0, page_meta_types_1):
- flag = self.current_row_on_page_index >=\
- self.current_page_data_subheader_pointers_len
- if flag:
- done = self.read_next_page()
- if done:
- return True
- continue
- offset, length = self.parser._current_page_data_subheader_pointers[
- self.current_row_on_page_index
- ]
- self.process_byte_array_with_data(offset, length)
- return False
- elif self.current_page_type == page_mix_type:
- align_correction = (
- bit_offset
- + subheader_pointers_offset
- + self.current_page_subheaders_count * subheader_pointer_length
- )
- align_correction = align_correction % 8
- offset = bit_offset + align_correction
- offset += subheader_pointers_offset
- offset += self.current_page_subheaders_count * subheader_pointer_length
- offset += self.current_row_on_page_index * self.row_length
- self.process_byte_array_with_data(offset, self.row_length)
- mn = min(self.parser.row_count, self.parser._mix_page_row_count)
- if self.current_row_on_page_index == mn:
- done = self.read_next_page()
- if done:
- return True
- return False
- elif self.current_page_type == page_data_type:
- self.process_byte_array_with_data(
- bit_offset
- + subheader_pointers_offset
- + self.current_row_on_page_index * self.row_length,
- self.row_length,
- )
- flag = self.current_row_on_page_index == self.current_page_block_count
- if flag:
- done = self.read_next_page()
- if done:
- return True
- return False
- else:
- raise ValueError(f"unknown page type: {self.current_page_type}")
-
- cdef void process_byte_array_with_data(self, int offset, int length) except *:
-
- cdef:
- Py_ssize_t j
- int s, k, m, jb, js, current_row, rpos
- int64_t lngt, start, ct
- Buffer source, decompressed_source
- int64_t[:] column_types
- int64_t[:] lengths
- int64_t[:] offsets
- uint8_t[:, :] byte_chunk
- object[:, :] string_chunk
- bint compressed
-
- assert offset + length <= self.cached_page_len, "Out of bounds read"
- source = Buffer(&self.cached_page[offset], length)
-
- compressed = self.decompress != NULL and length < self.row_length
- if compressed:
- decompressed_source = buf_new(self.row_length)
- rpos = self.decompress(source, decompressed_source)
- if rpos != self.row_length:
- raise ValueError(
- f"Expected decompressed line of length {self.row_length} bytes "
- f"but decompressed {rpos} bytes"
- )
- source = decompressed_source
-
- current_row = self.current_row_in_chunk_index
- column_types = self.column_types
- lengths = self.lengths
- offsets = self.offsets
- byte_chunk = self.byte_chunk
- string_chunk = self.string_chunk
- s = 8 * self.current_row_in_chunk_index
- js = 0
- jb = 0
- for j in range(self.column_count):
- lngt = lengths[j]
- if lngt == 0:
- break
- start = offsets[j]
- ct = column_types[j]
- if ct == column_type_decimal:
- # decimal
- if self.is_little_endian:
- m = s + 8 - lngt
- else:
- m = s
- for k in range(lngt):
- byte_chunk[jb, m + k] = buf_get(source, start + k)
- jb += 1
- elif column_types[j] == column_type_string:
- # string
- # Skip trailing whitespace. This is equivalent to calling
- # .rstrip(b"\x00 ") but without Python call overhead.
- while lngt > 0 and buf_get(source, start + lngt - 1) in b"\x00 ":
- lngt -= 1
- if lngt == 0 and self.blank_missing:
- string_chunk[js, current_row] = np_nan
- else:
- string_chunk[js, current_row] = buf_as_bytes(source, start, lngt)
- js += 1
-
- self.current_row_on_page_index += 1
- self.current_row_in_chunk_index += 1
- self.current_row_in_file_index += 1
-
- if compressed:
- buf_free(decompressed_source)
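A standalone note on the decimal branch of process_byte_array_with_data above (and the matching float64 view in sas7bdat.py's _chunk_to_dataframe further down): for a numeric column shorter than 8 bytes the reader right-aligns the lngt stored bytes into an 8-byte slot (m = s + 8 - lngt) and later views the slot as float64, i.e. it treats them as the most significant bytes of a little-endian double. A hedged illustration:

import struct
import numpy as np

full = struct.pack("<d", 3.141592653589793)              # 8-byte little-endian double
stored = full[8 - 5:]                                    # a 5-byte column keeps the top 5 bytes
slot = np.zeros(8, dtype=np.uint8)
slot[8 - 5:] = np.frombuffer(stored, dtype=np.uint8)     # right-align, as m = s + 8 - lngt does
print(float(slot.view("<f8")[0]))                        # ~3.1415926, low mantissa bits zeroed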
diff --git a/contrib/python/pandas/py3/pandas/io/sas/sas7bdat.py b/contrib/python/pandas/py3/pandas/io/sas/sas7bdat.py
deleted file mode 100644
index 7fd93e7ed0e..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/sas7bdat.py
+++ /dev/null
@@ -1,747 +0,0 @@
-"""
-Read SAS7BDAT files
-
-Based on code written by Jared Hobbs:
- https://bitbucket.org/jaredhobbs/sas7bdat
-
-See also:
- https://github.com/BioStatMatt/sas7bdat
-
-Partial documentation of the file format:
- https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
-
-Reference for binary data compression:
- http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
-"""
-from __future__ import annotations
-
-from collections import abc
-from datetime import (
- datetime,
- timedelta,
-)
-import sys
-from typing import cast
-
-import numpy as np
-
-from pandas._typing import (
- CompressionOptions,
- FilePath,
- ReadBuffer,
-)
-from pandas.errors import (
- EmptyDataError,
- OutOfBoundsDatetime,
-)
-
-import pandas as pd
-from pandas import (
- DataFrame,
- isna,
-)
-
-from pandas.io.common import get_handle
-from pandas.io.sas._byteswap import (
- read_double_with_byteswap,
- read_float_with_byteswap,
- read_uint16_with_byteswap,
- read_uint32_with_byteswap,
- read_uint64_with_byteswap,
-)
-from pandas.io.sas._sas import (
- Parser,
- get_subheader_index,
-)
-import pandas.io.sas.sas_constants as const
-from pandas.io.sas.sasreader import ReaderBase
-
-
-def _parse_datetime(sas_datetime: float, unit: str):
- if isna(sas_datetime):
- return pd.NaT
-
- if unit == "s":
- return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)
-
- elif unit == "d":
- return datetime(1960, 1, 1) + timedelta(days=sas_datetime)
-
- else:
- raise ValueError("unit must be 'd' or 's'")
-
-
-def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
- """
- Convert to Timestamp if possible, otherwise to datetime.datetime.
- SAS float64 lacks precision for more than ms resolution so the fit
- to datetime.datetime is ok.
-
- Parameters
- ----------
- sas_datetimes : {Series, Sequence[float]}
- Dates or datetimes in SAS
- unit : {str}
- "d" if the floats represent dates, "s" for datetimes
-
- Returns
- -------
- Series
- Series of datetime64 dtype or datetime.datetime.
- """
- try:
- return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
- except OutOfBoundsDatetime:
- s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
- s_series = cast(pd.Series, s_series)
- return s_series
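# Illustrative only (standalone sketch, not part of the deleted file): the fast
# path above maps SAS's 1960-01-01 epoch straight onto pandas.to_datetime; the
# per-element fallback is only needed when a value overflows the Timestamp range.
import pandas as pd

secs = pd.Series([0.0, 86_400.0])                           # seconds since 1960-01-01
print(pd.to_datetime(secs, unit="s", origin="1960-01-01"))  # 1960-01-01, 1960-01-02
days = pd.Series([0.0, 366.0])                              # days since 1960-01-01
print(pd.to_datetime(days, unit="d", origin="1960-01-01"))  # 1960-01-01, 1961-01-01 (1960 is a leap year)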
-
-
-class _Column:
- col_id: int
- name: str | bytes
- label: str | bytes
- format: str | bytes
- ctype: bytes
- length: int
-
- def __init__(
- self,
- col_id: int,
- # These can be bytes when convert_header_text is False
- name: str | bytes,
- label: str | bytes,
- format: str | bytes,
- ctype: bytes,
- length: int,
- ) -> None:
- self.col_id = col_id
- self.name = name
- self.label = label
- self.format = format
- self.ctype = ctype
- self.length = length
-
-
-# SAS7BDAT represents a SAS data file in SAS7BDAT format.
-class SAS7BDATReader(ReaderBase, abc.Iterator):
- """
- Read SAS files in SAS7BDAT format.
-
- Parameters
- ----------
- path_or_buf : path name or buffer
- Name of SAS file or file-like object pointing to SAS file
- contents.
- index : column identifier, defaults to None
- Column to use as index.
- convert_dates : bool, defaults to True
- Attempt to convert dates to Pandas datetime values. Note that
- some rarely used SAS date formats may be unsupported.
- blank_missing : bool, defaults to True
- Convert empty strings to missing values (SAS uses blanks to
- indicate missing character variables).
- chunksize : int, defaults to None
- Return SAS7BDATReader object for iterations, returns chunks
- with given number of lines.
- encoding : str, 'infer', defaults to None
- String encoding acc. to Python standard encodings,
- encoding='infer' tries to detect the encoding from the file header,
- encoding=None will leave the data in binary format.
- convert_text : bool, defaults to True
- If False, text variables are left as raw bytes.
- convert_header_text : bool, defaults to True
- If False, header text, including column names, are left as raw
- bytes.
- """
-
- _int_length: int
- _cached_page: bytes | None
-
- def __init__(
- self,
- path_or_buf: FilePath | ReadBuffer[bytes],
- index=None,
- convert_dates: bool = True,
- blank_missing: bool = True,
- chunksize: int | None = None,
- encoding: str | None = None,
- convert_text: bool = True,
- convert_header_text: bool = True,
- compression: CompressionOptions = "infer",
- ) -> None:
- self.index = index
- self.convert_dates = convert_dates
- self.blank_missing = blank_missing
- self.chunksize = chunksize
- self.encoding = encoding
- self.convert_text = convert_text
- self.convert_header_text = convert_header_text
-
- self.default_encoding = "latin-1"
- self.compression = b""
- self.column_names_raw: list[bytes] = []
- self.column_names: list[str | bytes] = []
- self.column_formats: list[str | bytes] = []
- self.columns: list[_Column] = []
-
- self._current_page_data_subheader_pointers: list[tuple[int, int]] = []
- self._cached_page = None
- self._column_data_lengths: list[int] = []
- self._column_data_offsets: list[int] = []
- self._column_types: list[bytes] = []
-
- self._current_row_in_file_index = 0
- self._current_row_on_page_index = 0
- self._current_row_in_file_index = 0
-
- self.handles = get_handle(
- path_or_buf, "rb", is_text=False, compression=compression
- )
-
- self._path_or_buf = self.handles.handle
-
- # Same order as const.SASIndex
- self._subheader_processors = [
- self._process_rowsize_subheader,
- self._process_columnsize_subheader,
- self._process_subheader_counts,
- self._process_columntext_subheader,
- self._process_columnname_subheader,
- self._process_columnattributes_subheader,
- self._process_format_subheader,
- self._process_columnlist_subheader,
- None, # Data
- ]
-
- try:
- self._get_properties()
- self._parse_metadata()
- except Exception:
- self.close()
- raise
-
- def column_data_lengths(self) -> np.ndarray:
- """Return a numpy int64 array of the column data lengths"""
- return np.asarray(self._column_data_lengths, dtype=np.int64)
-
- def column_data_offsets(self) -> np.ndarray:
- """Return a numpy int64 array of the column offsets"""
- return np.asarray(self._column_data_offsets, dtype=np.int64)
-
- def column_types(self) -> np.ndarray:
- """
- Returns a numpy character array of the column types:
- s (string) or d (double)
- """
- return np.asarray(self._column_types, dtype=np.dtype("S1"))
-
- def close(self) -> None:
- self.handles.close()
-
- def _get_properties(self) -> None:
- # Check magic number
- self._path_or_buf.seek(0)
- self._cached_page = self._path_or_buf.read(288)
- if self._cached_page[0 : len(const.magic)] != const.magic:
- raise ValueError("magic number mismatch (not a SAS file?)")
-
- # Get alignment information
- buf = self._read_bytes(const.align_1_offset, const.align_1_length)
- if buf == const.u64_byte_checker_value:
- self.U64 = True
- self._int_length = 8
- self._page_bit_offset = const.page_bit_offset_x64
- self._subheader_pointer_length = const.subheader_pointer_length_x64
- else:
- self.U64 = False
- self._page_bit_offset = const.page_bit_offset_x86
- self._subheader_pointer_length = const.subheader_pointer_length_x86
- self._int_length = 4
- buf = self._read_bytes(const.align_2_offset, const.align_2_length)
- if buf == const.align_1_checker_value:
- align1 = const.align_2_value
- else:
- align1 = 0
-
- # Get endianness information
- buf = self._read_bytes(const.endianness_offset, const.endianness_length)
- if buf == b"\x01":
- self.byte_order = "<"
- self.need_byteswap = sys.byteorder == "big"
- else:
- self.byte_order = ">"
- self.need_byteswap = sys.byteorder == "little"
-
- # Get encoding information
- buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
- if buf in const.encoding_names:
- self.inferred_encoding = const.encoding_names[buf]
- if self.encoding == "infer":
- self.encoding = self.inferred_encoding
- else:
- self.inferred_encoding = f"unknown (code={buf})"
-
- # Timestamp is epoch 01/01/1960
- epoch = datetime(1960, 1, 1)
- x = self._read_float(
- const.date_created_offset + align1, const.date_created_length
- )
- self.date_created = epoch + pd.to_timedelta(x, unit="s")
- x = self._read_float(
- const.date_modified_offset + align1, const.date_modified_length
- )
- self.date_modified = epoch + pd.to_timedelta(x, unit="s")
-
- self.header_length = self._read_uint(
- const.header_size_offset + align1, const.header_size_length
- )
-
- # Read the rest of the header into cached_page.
- buf = self._path_or_buf.read(self.header_length - 288)
- self._cached_page += buf
- # error: Argument 1 to "len" has incompatible type "Optional[bytes]";
- # expected "Sized"
- if len(self._cached_page) != self.header_length: # type: ignore[arg-type]
- raise ValueError("The SAS7BDAT file appears to be truncated.")
-
- self._page_length = self._read_uint(
- const.page_size_offset + align1, const.page_size_length
- )
-
- def __next__(self) -> DataFrame:
- da = self.read(nrows=self.chunksize or 1)
- if da.empty:
- self.close()
- raise StopIteration
- return da
-
- # Read a single float of the given width (4 or 8).
- def _read_float(self, offset: int, width: int):
- assert self._cached_page is not None
- if width == 4:
- return read_float_with_byteswap(
- self._cached_page, offset, self.need_byteswap
- )
- elif width == 8:
- return read_double_with_byteswap(
- self._cached_page, offset, self.need_byteswap
- )
- else:
- self.close()
- raise ValueError("invalid float width")
-
- # Read a single unsigned integer of the given width (1, 2, 4 or 8).
- def _read_uint(self, offset: int, width: int) -> int:
- assert self._cached_page is not None
- if width == 1:
- return self._read_bytes(offset, 1)[0]
- elif width == 2:
- return read_uint16_with_byteswap(
- self._cached_page, offset, self.need_byteswap
- )
- elif width == 4:
- return read_uint32_with_byteswap(
- self._cached_page, offset, self.need_byteswap
- )
- elif width == 8:
- return read_uint64_with_byteswap(
- self._cached_page, offset, self.need_byteswap
- )
- else:
- self.close()
- raise ValueError("invalid int width")
-
- def _read_bytes(self, offset: int, length: int):
- assert self._cached_page is not None
- if offset + length > len(self._cached_page):
- self.close()
- raise ValueError("The cached page is too small.")
- return self._cached_page[offset : offset + length]
-
- def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
- return self._convert_header_text(
- self._read_bytes(offset, length).rstrip(b"\x00 ")
- )
-
- def _parse_metadata(self) -> None:
- done = False
- while not done:
- self._cached_page = self._path_or_buf.read(self._page_length)
- if len(self._cached_page) <= 0:
- break
- if len(self._cached_page) != self._page_length:
- raise ValueError("Failed to read a meta data page from the SAS file.")
- done = self._process_page_meta()
-
- def _process_page_meta(self) -> bool:
- self._read_page_header()
- pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
- if self._current_page_type in pt:
- self._process_page_metadata()
- is_data_page = self._current_page_type == const.page_data_type
- is_mix_page = self._current_page_type == const.page_mix_type
- return bool(
- is_data_page
- or is_mix_page
- or self._current_page_data_subheader_pointers != []
- )
-
- def _read_page_header(self) -> None:
- bit_offset = self._page_bit_offset
- tx = const.page_type_offset + bit_offset
- self._current_page_type = (
- self._read_uint(tx, const.page_type_length) & const.page_type_mask2
- )
- tx = const.block_count_offset + bit_offset
- self._current_page_block_count = self._read_uint(tx, const.block_count_length)
- tx = const.subheader_count_offset + bit_offset
- self._current_page_subheaders_count = self._read_uint(
- tx, const.subheader_count_length
- )
-
- def _process_page_metadata(self) -> None:
- bit_offset = self._page_bit_offset
-
- for i in range(self._current_page_subheaders_count):
- offset = const.subheader_pointers_offset + bit_offset
- total_offset = offset + self._subheader_pointer_length * i
-
- subheader_offset = self._read_uint(total_offset, self._int_length)
- total_offset += self._int_length
-
- subheader_length = self._read_uint(total_offset, self._int_length)
- total_offset += self._int_length
-
- subheader_compression = self._read_uint(total_offset, 1)
- total_offset += 1
-
- subheader_type = self._read_uint(total_offset, 1)
-
- if (
- subheader_length == 0
- or subheader_compression == const.truncated_subheader_id
- ):
- continue
-
- subheader_signature = self._read_bytes(subheader_offset, self._int_length)
- subheader_index = get_subheader_index(subheader_signature)
- subheader_processor = self._subheader_processors[subheader_index]
-
- if subheader_processor is None:
- f1 = subheader_compression in (const.compressed_subheader_id, 0)
- f2 = subheader_type == const.compressed_subheader_type
- if self.compression and f1 and f2:
- self._current_page_data_subheader_pointers.append(
- (subheader_offset, subheader_length)
- )
- else:
- self.close()
- raise ValueError(
- f"Unknown subheader signature {subheader_signature}"
- )
- else:
- subheader_processor(subheader_offset, subheader_length)
-
- def _process_rowsize_subheader(self, offset: int, length: int) -> None:
- int_len = self._int_length
- lcs_offset = offset
- lcp_offset = offset
- if self.U64:
- lcs_offset += 682
- lcp_offset += 706
- else:
- lcs_offset += 354
- lcp_offset += 378
-
- self.row_length = self._read_uint(
- offset + const.row_length_offset_multiplier * int_len,
- int_len,
- )
- self.row_count = self._read_uint(
- offset + const.row_count_offset_multiplier * int_len,
- int_len,
- )
- self.col_count_p1 = self._read_uint(
- offset + const.col_count_p1_multiplier * int_len, int_len
- )
- self.col_count_p2 = self._read_uint(
- offset + const.col_count_p2_multiplier * int_len, int_len
- )
- mx = const.row_count_on_mix_page_offset_multiplier * int_len
- self._mix_page_row_count = self._read_uint(offset + mx, int_len)
- self._lcs = self._read_uint(lcs_offset, 2)
- self._lcp = self._read_uint(lcp_offset, 2)
-
- def _process_columnsize_subheader(self, offset: int, length: int) -> None:
- int_len = self._int_length
- offset += int_len
- self.column_count = self._read_uint(offset, int_len)
- if self.col_count_p1 + self.col_count_p2 != self.column_count:
- print(
- f"Warning: column count mismatch ({self.col_count_p1} + "
- f"{self.col_count_p2} != {self.column_count})\n"
- )
-
- # Unknown purpose
- def _process_subheader_counts(self, offset: int, length: int) -> None:
- pass
-
- def _process_columntext_subheader(self, offset: int, length: int) -> None:
- offset += self._int_length
- text_block_size = self._read_uint(offset, const.text_block_size_length)
-
- buf = self._read_bytes(offset, text_block_size)
- cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
- self.column_names_raw.append(cname_raw)
-
- if len(self.column_names_raw) == 1:
- compression_literal = b""
- for cl in const.compression_literals:
- if cl in cname_raw:
- compression_literal = cl
- self.compression = compression_literal
- offset -= self._int_length
-
- offset1 = offset + 16
- if self.U64:
- offset1 += 4
-
- buf = self._read_bytes(offset1, self._lcp)
- compression_literal = buf.rstrip(b"\x00")
- if compression_literal == b"":
- self._lcs = 0
- offset1 = offset + 32
- if self.U64:
- offset1 += 4
- buf = self._read_bytes(offset1, self._lcp)
- self.creator_proc = buf[0 : self._lcp]
- elif compression_literal == const.rle_compression:
- offset1 = offset + 40
- if self.U64:
- offset1 += 4
- buf = self._read_bytes(offset1, self._lcp)
- self.creator_proc = buf[0 : self._lcp]
- elif self._lcs > 0:
- self._lcp = 0
- offset1 = offset + 16
- if self.U64:
- offset1 += 4
- buf = self._read_bytes(offset1, self._lcs)
- self.creator_proc = buf[0 : self._lcp]
- if hasattr(self, "creator_proc"):
- self.creator_proc = self._convert_header_text(self.creator_proc)
-
- def _process_columnname_subheader(self, offset: int, length: int) -> None:
- int_len = self._int_length
- offset += int_len
- column_name_pointers_count = (length - 2 * int_len - 12) // 8
- for i in range(column_name_pointers_count):
- text_subheader = (
- offset
- + const.column_name_pointer_length * (i + 1)
- + const.column_name_text_subheader_offset
- )
- col_name_offset = (
- offset
- + const.column_name_pointer_length * (i + 1)
- + const.column_name_offset_offset
- )
- col_name_length = (
- offset
- + const.column_name_pointer_length * (i + 1)
- + const.column_name_length_offset
- )
-
- idx = self._read_uint(
- text_subheader, const.column_name_text_subheader_length
- )
- col_offset = self._read_uint(
- col_name_offset, const.column_name_offset_length
- )
- col_len = self._read_uint(col_name_length, const.column_name_length_length)
-
- name_raw = self.column_names_raw[idx]
- cname = name_raw[col_offset : col_offset + col_len]
- self.column_names.append(self._convert_header_text(cname))
-
- def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
- int_len = self._int_length
- column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
- for i in range(column_attributes_vectors_count):
- col_data_offset = (
- offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
- )
- col_data_len = (
- offset
- + 2 * int_len
- + const.column_data_length_offset
- + i * (int_len + 8)
- )
- col_types = (
- offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
- )
-
- x = self._read_uint(col_data_offset, int_len)
- self._column_data_offsets.append(x)
-
- x = self._read_uint(col_data_len, const.column_data_length_length)
- self._column_data_lengths.append(x)
-
- x = self._read_uint(col_types, const.column_type_length)
- self._column_types.append(b"d" if x == 1 else b"s")
-
- def _process_columnlist_subheader(self, offset: int, length: int) -> None:
- # unknown purpose
- pass
-
- def _process_format_subheader(self, offset: int, length: int) -> None:
- int_len = self._int_length
- text_subheader_format = (
- offset + const.column_format_text_subheader_index_offset + 3 * int_len
- )
- col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
- col_format_len = offset + const.column_format_length_offset + 3 * int_len
- text_subheader_label = (
- offset + const.column_label_text_subheader_index_offset + 3 * int_len
- )
- col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
- col_label_len = offset + const.column_label_length_offset + 3 * int_len
-
- x = self._read_uint(
- text_subheader_format, const.column_format_text_subheader_index_length
- )
- format_idx = min(x, len(self.column_names_raw) - 1)
-
- format_start = self._read_uint(
- col_format_offset, const.column_format_offset_length
- )
- format_len = self._read_uint(col_format_len, const.column_format_length_length)
-
- label_idx = self._read_uint(
- text_subheader_label, const.column_label_text_subheader_index_length
- )
- label_idx = min(label_idx, len(self.column_names_raw) - 1)
-
- label_start = self._read_uint(
- col_label_offset, const.column_label_offset_length
- )
- label_len = self._read_uint(col_label_len, const.column_label_length_length)
-
- label_names = self.column_names_raw[label_idx]
- column_label = self._convert_header_text(
- label_names[label_start : label_start + label_len]
- )
- format_names = self.column_names_raw[format_idx]
- column_format = self._convert_header_text(
- format_names[format_start : format_start + format_len]
- )
- current_column_number = len(self.columns)
-
- col = _Column(
- current_column_number,
- self.column_names[current_column_number],
- column_label,
- column_format,
- self._column_types[current_column_number],
- self._column_data_lengths[current_column_number],
- )
-
- self.column_formats.append(column_format)
- self.columns.append(col)
-
- def read(self, nrows: int | None = None) -> DataFrame:
- if (nrows is None) and (self.chunksize is not None):
- nrows = self.chunksize
- elif nrows is None:
- nrows = self.row_count
-
- if len(self._column_types) == 0:
- self.close()
- raise EmptyDataError("No columns to parse from file")
-
- if nrows > 0 and self._current_row_in_file_index >= self.row_count:
- return DataFrame()
-
- nrows = min(nrows, self.row_count - self._current_row_in_file_index)
-
- nd = self._column_types.count(b"d")
- ns = self._column_types.count(b"s")
-
- self._string_chunk = np.empty((ns, nrows), dtype=object)
- self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
-
- self._current_row_in_chunk_index = 0
- p = Parser(self)
- p.read(nrows)
-
- rslt = self._chunk_to_dataframe()
- if self.index is not None:
- rslt = rslt.set_index(self.index)
-
- return rslt
-
- def _read_next_page(self):
- self._current_page_data_subheader_pointers = []
- self._cached_page = self._path_or_buf.read(self._page_length)
- if len(self._cached_page) <= 0:
- return True
- elif len(self._cached_page) != self._page_length:
- self.close()
- msg = (
- "failed to read complete page from file (read "
- f"{len(self._cached_page):d} of {self._page_length:d} bytes)"
- )
- raise ValueError(msg)
-
- self._read_page_header()
- if self._current_page_type in const.page_meta_types:
- self._process_page_metadata()
-
- if self._current_page_type not in const.page_meta_types + [
- const.page_data_type,
- const.page_mix_type,
- ]:
- return self._read_next_page()
-
- return False
-
- def _chunk_to_dataframe(self) -> DataFrame:
- n = self._current_row_in_chunk_index
- m = self._current_row_in_file_index
- ix = range(m - n, m)
- rslt = {}
-
- js, jb = 0, 0
- for j in range(self.column_count):
- name = self.column_names[j]
-
- if self._column_types[j] == b"d":
- col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
- rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix)
- if self.convert_dates:
- if self.column_formats[j] in const.sas_date_formats:
- rslt[name] = _convert_datetimes(rslt[name], "d")
- elif self.column_formats[j] in const.sas_datetime_formats:
- rslt[name] = _convert_datetimes(rslt[name], "s")
- jb += 1
- elif self._column_types[j] == b"s":
- rslt[name] = pd.Series(self._string_chunk[js, :], index=ix)
- if self.convert_text and (self.encoding is not None):
- rslt[name] = self._decode_string(rslt[name].str)
- js += 1
- else:
- self.close()
- raise ValueError(f"unknown column type {repr(self._column_types[j])}")
-
- df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
- return df
-
- def _decode_string(self, b):
- return b.decode(self.encoding or self.default_encoding)
-
- def _convert_header_text(self, b: bytes) -> str | bytes:
- if self.convert_header_text:
- return self._decode_string(b)
- else:
- return b
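The reader above is normally reached through pandas.read_sas rather than instantiated directly; a minimal usage sketch with a hypothetical file name and handler:

import pandas as pd

# Read the whole file at once; the format is inferred from the extension.
df = pd.read_sas("example.sas7bdat", encoding="infer")

# Or stream it in chunks to bound memory; each chunk is a DataFrame.
reader = pd.read_sas("example.sas7bdat", chunksize=10_000)
try:
    for chunk in reader:
        handle_chunk(chunk)   # hypothetical per-chunk handler
finally:
    reader.close()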
diff --git a/contrib/python/pandas/py3/pandas/io/sas/sas_constants.py b/contrib/python/pandas/py3/pandas/io/sas/sas_constants.py
deleted file mode 100644
index a090b8a1acb..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/sas_constants.py
+++ /dev/null
@@ -1,310 +0,0 @@
-from __future__ import annotations
-
-from typing import Final
-
-magic: Final = (
- b"\x00\x00\x00\x00\x00\x00\x00\x00"
- + b"\x00\x00\x00\x00\xc2\xea\x81\x60"
- + b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
- + b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
-)
-
-align_1_checker_value: Final = b"3"
-align_1_offset: Final = 32
-align_1_length: Final = 1
-align_1_value: Final = 4
-u64_byte_checker_value: Final = b"3"
-align_2_offset: Final = 35
-align_2_length: Final = 1
-align_2_value: Final = 4
-endianness_offset: Final = 37
-endianness_length: Final = 1
-platform_offset: Final = 39
-platform_length: Final = 1
-encoding_offset: Final = 70
-encoding_length: Final = 1
-dataset_offset: Final = 92
-dataset_length: Final = 64
-file_type_offset: Final = 156
-file_type_length: Final = 8
-date_created_offset: Final = 164
-date_created_length: Final = 8
-date_modified_offset: Final = 172
-date_modified_length: Final = 8
-header_size_offset: Final = 196
-header_size_length: Final = 4
-page_size_offset: Final = 200
-page_size_length: Final = 4
-page_count_offset: Final = 204
-page_count_length: Final = 4
-sas_release_offset: Final = 216
-sas_release_length: Final = 8
-sas_server_type_offset: Final = 224
-sas_server_type_length: Final = 16
-os_version_number_offset: Final = 240
-os_version_number_length: Final = 16
-os_maker_offset: Final = 256
-os_maker_length: Final = 16
-os_name_offset: Final = 272
-os_name_length: Final = 16
-page_bit_offset_x86: Final = 16
-page_bit_offset_x64: Final = 32
-subheader_pointer_length_x86: Final = 12
-subheader_pointer_length_x64: Final = 24
-page_type_offset: Final = 0
-page_type_length: Final = 2
-block_count_offset: Final = 2
-block_count_length: Final = 2
-subheader_count_offset: Final = 4
-subheader_count_length: Final = 2
-page_type_mask: Final = 0x0F00
-# Keep "page_comp_type" bits
-page_type_mask2: Final = 0xF000 | page_type_mask
-page_meta_type: Final = 0x0000
-page_data_type: Final = 0x0100
-page_mix_type: Final = 0x0200
-page_amd_type: Final = 0x0400
-page_meta2_type: Final = 0x4000
-page_comp_type: Final = 0x9000
-page_meta_types: Final = [page_meta_type, page_meta2_type]
-subheader_pointers_offset: Final = 8
-truncated_subheader_id: Final = 1
-compressed_subheader_id: Final = 4
-compressed_subheader_type: Final = 1
-text_block_size_length: Final = 2
-row_length_offset_multiplier: Final = 5
-row_count_offset_multiplier: Final = 6
-col_count_p1_multiplier: Final = 9
-col_count_p2_multiplier: Final = 10
-row_count_on_mix_page_offset_multiplier: Final = 15
-column_name_pointer_length: Final = 8
-column_name_text_subheader_offset: Final = 0
-column_name_text_subheader_length: Final = 2
-column_name_offset_offset: Final = 2
-column_name_offset_length: Final = 2
-column_name_length_offset: Final = 4
-column_name_length_length: Final = 2
-column_data_offset_offset: Final = 8
-column_data_length_offset: Final = 8
-column_data_length_length: Final = 4
-column_type_offset: Final = 14
-column_type_length: Final = 1
-column_format_text_subheader_index_offset: Final = 22
-column_format_text_subheader_index_length: Final = 2
-column_format_offset_offset: Final = 24
-column_format_offset_length: Final = 2
-column_format_length_offset: Final = 26
-column_format_length_length: Final = 2
-column_label_text_subheader_index_offset: Final = 28
-column_label_text_subheader_index_length: Final = 2
-column_label_offset_offset: Final = 30
-column_label_offset_length: Final = 2
-column_label_length_offset: Final = 32
-column_label_length_length: Final = 2
-rle_compression: Final = b"SASYZCRL"
-rdc_compression: Final = b"SASYZCR2"
-
-compression_literals: Final = [rle_compression, rdc_compression]
-
-# Incomplete list of encodings, using SAS nomenclature:
-# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
-# corresponding to the Python documentation of standard encodings
-# https://docs.python.org/3/library/codecs.html#standard-encodings
-encoding_names: Final = {
- 20: "utf-8",
- 29: "latin1",
- 30: "latin2",
- 31: "latin3",
- 32: "latin4",
- 33: "cyrillic",
- 34: "arabic",
- 35: "greek",
- 36: "hebrew",
- 37: "latin5",
- 38: "latin6",
- 39: "cp874",
- 40: "latin9",
- 41: "cp437",
- 42: "cp850",
- 43: "cp852",
- 44: "cp857",
- 45: "cp858",
- 46: "cp862",
- 47: "cp864",
- 48: "cp865",
- 49: "cp866",
- 50: "cp869",
- 51: "cp874",
- # 52: "", # not found
- # 53: "", # not found
- # 54: "", # not found
- 55: "cp720",
- 56: "cp737",
- 57: "cp775",
- 58: "cp860",
- 59: "cp863",
- 60: "cp1250",
- 61: "cp1251",
- 62: "cp1252",
- 63: "cp1253",
- 64: "cp1254",
- 65: "cp1255",
- 66: "cp1256",
- 67: "cp1257",
- 68: "cp1258",
- 118: "cp950",
- # 119: "", # not found
- 123: "big5",
- 125: "gb2312",
- 126: "cp936",
- 134: "euc_jp",
- 136: "cp932",
- 138: "shift_jis",
- 140: "euc-kr",
- 141: "cp949",
- 227: "latin8",
- # 228: "", # not found
- # 229: "" # not found
-}
-
-
-class SASIndex:
- row_size_index: Final = 0
- column_size_index: Final = 1
- subheader_counts_index: Final = 2
- column_text_index: Final = 3
- column_name_index: Final = 4
- column_attributes_index: Final = 5
- format_and_label_index: Final = 6
- column_list_index: Final = 7
- data_subheader_index: Final = 8
-
-
-subheader_signature_to_index: Final = {
- b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
- b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
- b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
- b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
- b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
- b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
- b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
- b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
- b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
- b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
- b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
- b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
- b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
- b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
- b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
- b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
- b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
- b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
- b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
- b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
- b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
- b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
- b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
- b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
- b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
- b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
- b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
- b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
- b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
- b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
-}
-
-
-# List of frequently used SAS date and datetime formats
-# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
-# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
-sas_date_formats: Final = (
- "DATE",
- "DAY",
- "DDMMYY",
- "DOWNAME",
- "JULDAY",
- "JULIAN",
- "MMDDYY",
- "MMYY",
- "MMYYC",
- "MMYYD",
- "MMYYP",
- "MMYYS",
- "MMYYN",
- "MONNAME",
- "MONTH",
- "MONYY",
- "QTR",
- "QTRR",
- "NENGO",
- "WEEKDATE",
- "WEEKDATX",
- "WEEKDAY",
- "WEEKV",
- "WORDDATE",
- "WORDDATX",
- "YEAR",
- "YYMM",
- "YYMMC",
- "YYMMD",
- "YYMMP",
- "YYMMS",
- "YYMMN",
- "YYMON",
- "YYMMDD",
- "YYQ",
- "YYQC",
- "YYQD",
- "YYQP",
- "YYQS",
- "YYQN",
- "YYQR",
- "YYQRC",
- "YYQRD",
- "YYQRP",
- "YYQRS",
- "YYQRN",
- "YYMMDDP",
- "YYMMDDC",
- "E8601DA",
- "YYMMDDN",
- "MMDDYYC",
- "MMDDYYS",
- "MMDDYYD",
- "YYMMDDS",
- "B8601DA",
- "DDMMYYN",
- "YYMMDDD",
- "DDMMYYB",
- "DDMMYYP",
- "MMDDYYP",
- "YYMMDDB",
- "MMDDYYN",
- "DDMMYYC",
- "DDMMYYD",
- "DDMMYYS",
- "MINGUO",
-)
-
-sas_datetime_formats: Final = (
- "DATETIME",
- "DTWKDATX",
- "B8601DN",
- "B8601DT",
- "B8601DX",
- "B8601DZ",
- "B8601LX",
- "E8601DN",
- "E8601DT",
- "E8601DX",
- "E8601DZ",
- "E8601LX",
- "DATEAMPM",
- "DTDATE",
- "DTMONYY",
- "DTMONYY",
- "DTWKDATX",
- "DTYEAR",
- "TOD",
- "MDYAMPM",
-)
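A small sketch (not part of the diff) of how a few of these constants are consumed by SAS7BDATReader._get_properties above, with a hypothetical file name:

from pandas.io.sas.sas_constants import encoding_names, encoding_offset, magic

with open("example.sas7bdat", "rb") as fh:
    header = fh.read(288)                    # same initial read as _get_properties

assert header[: len(magic)] == magic, "magic number mismatch (not a SAS file?)"
encoding_byte = header[encoding_offset]      # single byte at offset 70
print(encoding_names.get(encoding_byte, f"unknown (code={encoding_byte})"))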
diff --git a/contrib/python/pandas/py3/pandas/io/sas/sas_xport.py b/contrib/python/pandas/py3/pandas/io/sas/sas_xport.py
deleted file mode 100644
index 6767dec6e45..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/sas_xport.py
+++ /dev/null
@@ -1,506 +0,0 @@
-"""
-Read a SAS XPort format file into a Pandas DataFrame.
-
-Based on code from Jack Cushman (github.com/jcushman/xport).
-
-The file format is defined here:
-
-https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf
-"""
-from __future__ import annotations
-
-from collections import abc
-from datetime import datetime
-import struct
-import warnings
-
-import numpy as np
-
-from pandas._typing import (
- CompressionOptions,
- DatetimeNaTType,
- FilePath,
- ReadBuffer,
-)
-from pandas.util._decorators import Appender
-from pandas.util._exceptions import find_stack_level
-
-import pandas as pd
-
-from pandas.io.common import get_handle
-from pandas.io.sas.sasreader import ReaderBase
-
-_correct_line1 = (
- "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
- "000000000000000000000000000000 "
-)
-_correct_header1 = (
- "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000"
-)
-_correct_header2 = (
- "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
- "000000000000000000000000000000 "
-)
-_correct_obs_header = (
- "HEADER RECORD*******OBS HEADER RECORD!!!!!!!"
- "000000000000000000000000000000 "
-)
-_fieldkeys = [
- "ntype",
- "nhfun",
- "field_length",
- "nvar0",
- "name",
- "label",
- "nform",
- "nfl",
- "num_decimals",
- "nfj",
- "nfill",
- "niform",
- "nifl",
- "nifd",
- "npos",
- "_",
-]
-
-
-_base_params_doc = """\
-Parameters
-----------
-filepath_or_buffer : str or file-like object
- Path to SAS file or object implementing binary read method."""
-
-_params2_doc = """\
-index : identifier of index column
- Identifier of column that should be used as index of the DataFrame.
-encoding : str
- Encoding for text data.
-chunksize : int
- Read file `chunksize` lines at a time, returns iterator."""
-
-_format_params_doc = """\
-format : str
- File format, only `xport` is currently supported."""
-
-_iterator_doc = """\
-iterator : bool, default False
- Return XportReader object for reading file incrementally."""
-
-
-_read_sas_doc = f"""Read a SAS file into a DataFrame.
-
-{_base_params_doc}
-{_format_params_doc}
-{_params2_doc}
-{_iterator_doc}
-
-Returns
--------
-DataFrame or XportReader
-
-Examples
---------
-Read a SAS Xport file:
-
->>> df = pd.read_sas('filename.XPT')
-
-Read a Xport file in 10,000 line chunks:
-
->>> itr = pd.read_sas('filename.XPT', chunksize=10000)
->>> for chunk in itr:
->>> do_something(chunk)
-
-"""
-
-_xport_reader_doc = f"""\
-Class for reading SAS Xport files.
-
-{_base_params_doc}
-{_params2_doc}
-
-Attributes
-----------
-member_info : list
- Contains information about the file
-fields : list
- Contains information about the variables in the file
-"""
-
-_read_method_doc = """\
-Read observations from SAS Xport file, returning as data frame.
-
-Parameters
-----------
-nrows : int
- Number of rows to read from data file; if None, read whole
- file.
-
-Returns
--------
-A DataFrame.
-"""
-
-
-def _parse_date(datestr: str) -> DatetimeNaTType:
- """Given a date in xport format, return Python date."""
- try:
- # e.g. "16FEB11:10:07:55"
- return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
- except ValueError:
- return pd.NaT
-
-
-def _split_line(s: str, parts):
- """
- Parameters
- ----------
- s: str
- Fixed-length string to split
- parts: list of (name, length) pairs
- Used to break up string, name '_' will be filtered from output.
-
- Returns
- -------
- Dict of name:contents of string at given location.
- """
- out = {}
- start = 0
- for name, length in parts:
- out[name] = s[start : start + length].strip()
- start += length
- del out["_"]
- return out
-
-
-def _handle_truncated_float_vec(vec, nbytes):
- # This feature is not well documented, but some SAS XPORT files
- # have 2-7 byte "truncated" floats. To read these truncated
- # floats, pad them with zeros on the right to make 8 byte floats.
- #
- # References:
- # https://github.com/jcushman/xport/pull/3
- # The R "foreign" library
-
- if nbytes != 8:
- vec1 = np.zeros(len(vec), np.dtype("S8"))
- dtype = np.dtype(f"S{nbytes},S{8 - nbytes}")
- vec2 = vec1.view(dtype=dtype)
- vec2["f0"] = vec
- return vec2
-
- return vec
-
-
-def _parse_float_vec(vec):
- """
- Parse a vector of float values representing IBM 8 byte floats into
- native 8 byte floats.
- """
- dtype = np.dtype(">u4,>u4")
- vec1 = vec.view(dtype=dtype)
- xport1 = vec1["f0"]
- xport2 = vec1["f1"]
-
- # Start by setting first half of ieee number to first half of IBM
- # number sans exponent
- ieee1 = xport1 & 0x00FFFFFF
-
- # The fraction bit to the left of the binary point in the ieee
- # format was set and the number was shifted 0, 1, 2, or 3
- # places. This will tell us how to adjust the ibm exponent to be a
- # power of 2 ieee exponent and how to shift the fraction bits to
- # restore the correct magnitude.
- shift = np.zeros(len(vec), dtype=np.uint8)
- shift[np.where(xport1 & 0x00200000)] = 1
- shift[np.where(xport1 & 0x00400000)] = 2
- shift[np.where(xport1 & 0x00800000)] = 3
-
- # shift the ieee number down the correct number of places then
- # set the second half of the ieee number to be the second half
- # of the ibm number shifted appropriately, ored with the bits
- # from the first half that would have been shifted in if we
- # could shift a double. All we are worried about are the low
- # order 3 bits of the first half since we're only shifting by
- # 1, 2, or 3.
- ieee1 >>= shift
- ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
-
- # clear the 1 bit to the left of the binary point
- ieee1 &= 0xFFEFFFFF
-
- # set the exponent of the ieee number to be the actual exponent
- # plus the shift count + 1023. Or this into the first half of the
- # ieee number. The ibm exponent is excess 64 but is adjusted by 65
- # since during conversion to ibm format the exponent is
- # incremented by 1 and the fraction bits left 4 positions to the
- # right of the radix point. (had to add >> 24 because C treats &
- # 0x7f as 0x7f000000 and Python doesn't)
- ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
- xport1 & 0x80000000
- )
-
- ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
- ieee["f0"] = ieee1
- ieee["f1"] = ieee2
- ieee = ieee.view(dtype=">f8")
- ieee = ieee.astype("f8")
-
- return ieee
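# Illustration only (not from the removed file): a minimal scalar sketch of
# the same decoding. An IBM hexadecimal float encodes
# sign * 0.fraction * 16**(exponent - 64); the vectorised bit twiddling above
# rebuilds the equivalent IEEE-754 double directly instead of going through
# Python floats one value at a time.
import struct

def ibm_to_ieee_scalar(b: bytes) -> float:
    hi, lo = struct.unpack(">II", b)           # two big-endian 32-bit halves
    sign = -1.0 if hi & 0x80000000 else 1.0
    exponent = (hi >> 24) & 0x7F               # excess-64, base-16 exponent
    fraction = ((hi & 0x00FFFFFF) << 32) | lo  # 56 fraction bits
    if fraction == 0:
        return 0.0
    return sign * fraction / 2.0**56 * 16.0 ** (exponent - 64)

# ibm_to_ieee_scalar(bytes.fromhex("4110000000000000")) == 1.0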
-
-
-class XportReader(ReaderBase, abc.Iterator):
- __doc__ = _xport_reader_doc
-
- def __init__(
- self,
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- index=None,
- encoding: str | None = "ISO-8859-1",
- chunksize=None,
- compression: CompressionOptions = "infer",
- ) -> None:
- self._encoding = encoding
- self._lines_read = 0
- self._index = index
- self._chunksize = chunksize
-
- self.handles = get_handle(
- filepath_or_buffer,
- "rb",
- encoding=encoding,
- is_text=False,
- compression=compression,
- )
- self.filepath_or_buffer = self.handles.handle
-
- try:
- self._read_header()
- except Exception:
- self.close()
- raise
-
- def close(self) -> None:
- self.handles.close()
-
- def _get_row(self):
- return self.filepath_or_buffer.read(80).decode()
-
- def _read_header(self):
- self.filepath_or_buffer.seek(0)
-
- # read file header
- line1 = self._get_row()
- if line1 != _correct_line1:
- if "**COMPRESSED**" in line1:
- # this was created with the PROC CPORT method and can't be read
- # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/movefile/p1bm6aqp3fw4uin1hucwh718f6kp.htm
- raise ValueError(
- "Header record indicates a CPORT file, which is not readable."
- )
- raise ValueError("Header record is not an XPORT file.")
-
- line2 = self._get_row()
- fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
- file_info = _split_line(line2, fif)
- if file_info["prefix"] != "SAS SAS SASLIB":
- raise ValueError("Header record has invalid prefix.")
- file_info["created"] = _parse_date(file_info["created"])
- self.file_info = file_info
-
- line3 = self._get_row()
- file_info["modified"] = _parse_date(line3[:16])
-
- # read member header
- header1 = self._get_row()
- header2 = self._get_row()
- headflag1 = header1.startswith(_correct_header1)
- headflag2 = header2 == _correct_header2
- if not (headflag1 and headflag2):
- raise ValueError("Member header not found")
- # usually 140, could be 135
- fieldnamelength = int(header1[-5:-2])
-
- # member info
- mem = [
- ["prefix", 8],
- ["set_name", 8],
- ["sasdata", 8],
- ["version", 8],
- ["OS", 8],
- ["_", 24],
- ["created", 16],
- ]
- member_info = _split_line(self._get_row(), mem)
- mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
- member_info.update(_split_line(self._get_row(), mem))
- member_info["modified"] = _parse_date(member_info["modified"])
- member_info["created"] = _parse_date(member_info["created"])
- self.member_info = member_info
-
- # read field names
- types = {1: "numeric", 2: "char"}
- fieldcount = int(self._get_row()[54:58])
- datalength = fieldnamelength * fieldcount
- # round up to nearest 80
- if datalength % 80:
- datalength += 80 - datalength % 80
- fielddata = self.filepath_or_buffer.read(datalength)
- fields = []
- obs_length = 0
- while len(fielddata) >= fieldnamelength:
- # pull data for one field
- fieldbytes, fielddata = (
- fielddata[:fieldnamelength],
- fielddata[fieldnamelength:],
- )
-
- # rest at end gets ignored, so if field is short, pad out
- # to match struct pattern below
- fieldbytes = fieldbytes.ljust(140)
-
- fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes)
- field = dict(zip(_fieldkeys, fieldstruct))
- del field["_"]
- field["ntype"] = types[field["ntype"]]
- fl = field["field_length"]
- if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
- msg = f"Floating field width {fl} is not between 2 and 8."
- raise TypeError(msg)
-
- for k, v in field.items():
- try:
- field[k] = v.strip()
- except AttributeError:
- pass
-
- obs_length += field["field_length"]
- fields += [field]
-
- header = self._get_row()
- if not header == _correct_obs_header:
- raise ValueError("Observation header not found.")
-
- self.fields = fields
- self.record_length = obs_length
- self.record_start = self.filepath_or_buffer.tell()
-
- self.nobs = self._record_count()
- self.columns = [x["name"].decode() for x in self.fields]
-
- # Setup the dtype.
- dtypel = [
- ("s" + str(i), "S" + str(field["field_length"]))
- for i, field in enumerate(self.fields)
- ]
- dtype = np.dtype(dtypel)
- self._dtype = dtype
-
- def __next__(self) -> pd.DataFrame:
- return self.read(nrows=self._chunksize or 1)
-
- def _record_count(self) -> int:
- """
- Get number of records in file.
-
- This may be suboptimal because we have to seek to the end of
- the file.
-
- Side effect: restores the file position to record_start.
- """
- self.filepath_or_buffer.seek(0, 2)
- total_records_length = self.filepath_or_buffer.tell() - self.record_start
-
- if total_records_length % 80 != 0:
- warnings.warn(
- "xport file may be corrupted.",
- stacklevel=find_stack_level(),
- )
-
- if self.record_length > 80:
- self.filepath_or_buffer.seek(self.record_start)
- return total_records_length // self.record_length
-
- self.filepath_or_buffer.seek(-80, 2)
- last_card_bytes = self.filepath_or_buffer.read(80)
- last_card = np.frombuffer(last_card_bytes, dtype=np.uint64)
-
- # 8 byte blank: 0x2020202020202020, i.e. eight ASCII spaces
- ix = np.flatnonzero(last_card == 2314885530818453536)
-
- if len(ix) == 0:
- tail_pad = 0
- else:
- tail_pad = 8 * len(ix)
-
- self.filepath_or_buffer.seek(self.record_start)
-
- return (total_records_length - tail_pad) // self.record_length
-
- def get_chunk(self, size=None) -> pd.DataFrame:
- """
- Read lines from the Xport file and return them as a DataFrame.
-
- Parameters
- ----------
- size : int, defaults to None
- Number of lines to read. If None, defaults to ``chunksize``; if that
- is also None, reads the whole file.
-
- Returns
- -------
- DataFrame
- """
- if size is None:
- size = self._chunksize
- return self.read(nrows=size)
-
- def _missing_double(self, vec):
- v = vec.view(dtype="u1,u1,u2,u4")
- miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
- miss1 = (
- ((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
- | (v["f0"] == 0x5F)
- | (v["f0"] == 0x2E)
- )
- miss &= miss1
- return miss
-
- @Appender(_read_method_doc)
- def read(self, nrows: int | None = None) -> pd.DataFrame:
- if nrows is None:
- nrows = self.nobs
-
- read_lines = min(nrows, self.nobs - self._lines_read)
- read_len = read_lines * self.record_length
- if read_len <= 0:
- self.close()
- raise StopIteration
- raw = self.filepath_or_buffer.read(read_len)
- data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
-
- df_data = {}
- for j, x in enumerate(self.columns):
- vec = data["s" + str(j)]
- ntype = self.fields[j]["ntype"]
- if ntype == "numeric":
- vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
- miss = self._missing_double(vec)
- v = _parse_float_vec(vec)
- v[miss] = np.nan
- elif self.fields[j]["ntype"] == "char":
- v = [y.rstrip() for y in vec]
-
- if self._encoding is not None:
- v = [y.decode(self._encoding) for y in v]
-
- df_data.update({x: v})
- df = pd.DataFrame(df_data)
-
- if self._index is None:
- df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
- else:
- df = df.set_index(self._index)
-
- self._lines_read += read_lines
-
- return df
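For reference, the reader removed above backs the public ``pandas.read_sas`` entry point. A usage sketch with a hypothetical file name, reading an XPT file in 10,000-row chunks through the context-manager protocol provided by ``ReaderBase``:

import pandas as pd

with pd.read_sas("survey.xpt", format="xport", chunksize=10_000) as reader:
    for chunk in reader:   # each chunk is a DataFrame of at most 10,000 rows
        print(chunk.shape)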
diff --git a/contrib/python/pandas/py3/pandas/io/sas/sasreader.py b/contrib/python/pandas/py3/pandas/io/sas/sasreader.py
deleted file mode 100644
index 22910484876..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sas/sasreader.py
+++ /dev/null
@@ -1,180 +0,0 @@
-"""
-Read SAS sas7bdat or xport files.
-"""
-from __future__ import annotations
-
-from abc import (
- ABCMeta,
- abstractmethod,
-)
-from types import TracebackType
-from typing import (
- TYPE_CHECKING,
- Hashable,
- overload,
-)
-
-from pandas._typing import (
- CompressionOptions,
- FilePath,
- ReadBuffer,
-)
-from pandas.util._decorators import doc
-
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import stringify_path
-
-if TYPE_CHECKING:
- from pandas import DataFrame
-
-
-# TODO(PY38): replace with Protocol in Python 3.8
-class ReaderBase(metaclass=ABCMeta):
- """
- Protocol for XportReader and SAS7BDATReader classes.
- """
-
- @abstractmethod
- def read(self, nrows: int | None = None) -> DataFrame:
- pass
-
- @abstractmethod
- def close(self) -> None:
- pass
-
- def __enter__(self) -> ReaderBase:
- return self
-
- def __exit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- traceback: TracebackType | None,
- ) -> None:
- self.close()
-
-
-@overload
-def read_sas(
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- *,
- format: str | None = ...,
- index: Hashable | None = ...,
- encoding: str | None = ...,
- chunksize: int = ...,
- iterator: bool = ...,
- compression: CompressionOptions = ...,
-) -> ReaderBase:
- ...
-
-
-@overload
-def read_sas(
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- *,
- format: str | None = ...,
- index: Hashable | None = ...,
- encoding: str | None = ...,
- chunksize: None = ...,
- iterator: bool = ...,
- compression: CompressionOptions = ...,
-) -> DataFrame | ReaderBase:
- ...
-
-
-@doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer")
-def read_sas(
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- *,
- format: str | None = None,
- index: Hashable | None = None,
- encoding: str | None = None,
- chunksize: int | None = None,
- iterator: bool = False,
- compression: CompressionOptions = "infer",
-) -> DataFrame | ReaderBase:
- """
- Read SAS files stored as either XPORT or SAS7BDAT format files.
-
- Parameters
- ----------
- filepath_or_buffer : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``read()`` function. The string could be a URL.
- Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be:
- ``file://localhost/path/to/table.sas7bdat``.
- format : str {{'xport', 'sas7bdat'}} or None
- If None, file format is inferred from file extension. If 'xport' or
- 'sas7bdat', uses the corresponding format.
- index : identifier of index column, defaults to None
- Identifier of column that should be used as index of the DataFrame.
- encoding : str, default is None
- Encoding for text data. If None, text data are stored as raw bytes.
- chunksize : int
- Read file `chunksize` lines at a time, returns iterator.
-
- .. versionchanged:: 1.2
-
- ``XportReader`` and ``SAS7BDATReader`` are context managers.
- iterator : bool, defaults to False
- If True, returns an iterator for reading the file incrementally.
-
- .. versionchanged:: 1.2
-
- ``XportReader`` and ``SAS7BDATReader`` are context managers.
- {decompression_options}
-
- Returns
- -------
- DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
- or XportReader
- """
- if format is None:
- buffer_error_msg = (
- "If this is a buffer object rather "
- "than a string name, you must specify a format string"
- )
- filepath_or_buffer = stringify_path(filepath_or_buffer)
- if not isinstance(filepath_or_buffer, str):
- raise ValueError(buffer_error_msg)
- fname = filepath_or_buffer.lower()
- if ".xpt" in fname:
- format = "xport"
- elif ".sas7bdat" in fname:
- format = "sas7bdat"
- else:
- raise ValueError(
- f"unable to infer format of SAS file from filename: {repr(fname)}"
- )
-
- reader: ReaderBase
- if format.lower() == "xport":
- from pandas.io.sas.sas_xport import XportReader
-
- reader = XportReader(
- filepath_or_buffer,
- index=index,
- encoding=encoding,
- chunksize=chunksize,
- compression=compression,
- )
- elif format.lower() == "sas7bdat":
- from pandas.io.sas.sas7bdat import SAS7BDATReader
-
- reader = SAS7BDATReader(
- filepath_or_buffer,
- index=index,
- encoding=encoding,
- chunksize=chunksize,
- compression=compression,
- )
- else:
- raise ValueError("unknown SAS format")
-
- if iterator or chunksize:
- return reader
-
- with reader:
- return reader.read()
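A sketch of the dispatch implemented above, with hypothetical paths: the reader class is chosen from the file extension unless ``format`` is given explicitly, and a raw buffer always requires an explicit ``format``:

import io

import pandas as pd

df1 = pd.read_sas("study.sas7bdat")         # extension -> SAS7BDATReader
df2 = pd.read_sas("study.xpt")              # extension -> XportReader
with open("study.xpt", "rb") as fh:
    df3 = pd.read_sas(io.BytesIO(fh.read()), format="xport")  # buffer needs format=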
diff --git a/contrib/python/pandas/py3/pandas/io/spss.py b/contrib/python/pandas/py3/pandas/io/spss.py
deleted file mode 100644
index 4dee5964961..00000000000
--- a/contrib/python/pandas/py3/pandas/io/spss.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from __future__ import annotations
-
-from pathlib import Path
-from typing import (
- TYPE_CHECKING,
- Sequence,
-)
-
-from pandas._libs import lib
-from pandas.compat._optional import import_optional_dependency
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.inference import is_list_like
-
-from pandas.io.common import stringify_path
-
-if TYPE_CHECKING:
- from pandas._typing import DtypeBackend
-
- from pandas import DataFrame
-
-
-def read_spss(
- path: str | Path,
- usecols: Sequence[str] | None = None,
- convert_categoricals: bool = True,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame:
- """
- Load an SPSS file from the file path, returning a DataFrame.
-
- Parameters
- ----------
- path : str or Path
- File path.
- usecols : list-like, optional
- Return a subset of the columns. If None, return all columns.
- convert_categoricals : bool, default is True
- Convert categorical columns into pd.Categorical.
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays, nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, pyarrow is used for all
- dtypes if "pyarrow" is set.
-
- The dtype_backends are still experimential.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- DataFrame
- """
- pyreadstat = import_optional_dependency("pyreadstat")
- check_dtype_backend(dtype_backend)
-
- if usecols is not None:
- if not is_list_like(usecols):
- raise TypeError("usecols must be list-like.")
- usecols = list(usecols) # pyreadstat requires a list
-
- df, _ = pyreadstat.read_sav(
- stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
- )
- if dtype_backend is not lib.no_default:
- df = df.convert_dtypes(dtype_backend=dtype_backend)
- return df
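A usage sketch for the function removed above, with a hypothetical file and column names; it requires the optional ``pyreadstat`` dependency:

import pandas as pd

# Keep only two columns and convert labelled values to pandas Categoricals.
df = pd.read_spss(
    "survey.sav",
    usecols=["age", "income"],
    convert_categoricals=True,
)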
diff --git a/contrib/python/pandas/py3/pandas/io/sql.py b/contrib/python/pandas/py3/pandas/io/sql.py
deleted file mode 100644
index a627a60ef06..00000000000
--- a/contrib/python/pandas/py3/pandas/io/sql.py
+++ /dev/null
@@ -1,2447 +0,0 @@
-"""
-Collection of query wrappers / abstractions to both facilitate data
-retrieval and to reduce dependency on DB-specific API.
-"""
-
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-from contextlib import (
- ExitStack,
- contextmanager,
-)
-from datetime import (
- date,
- datetime,
- time,
-)
-from functools import partial
-import re
-from typing import (
- TYPE_CHECKING,
- Any,
- Iterator,
- Literal,
- cast,
- overload,
-)
-import warnings
-
-import numpy as np
-
-from pandas._libs import lib
-from pandas._typing import (
- DateTimeErrorChoices,
- DtypeArg,
- DtypeBackend,
- IndexLabel,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import (
- AbstractMethodError,
- DatabaseError,
-)
-from pandas.util._exceptions import find_stack_level
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.common import (
- is_datetime64tz_dtype,
- is_dict_like,
- is_integer,
- is_list_like,
-)
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
-from pandas.core.dtypes.missing import isna
-
-from pandas import get_option
-from pandas.core.api import (
- DataFrame,
- Series,
-)
-from pandas.core.arrays import ArrowExtensionArray
-from pandas.core.base import PandasObject
-import pandas.core.common as com
-from pandas.core.internals.construction import convert_object_array
-from pandas.core.tools.datetimes import to_datetime
-
-if TYPE_CHECKING:
- from sqlalchemy import Table
- from sqlalchemy.sql.expression import (
- Select,
- TextClause,
- )
-
-
-# -----------------------------------------------------------------------------
-# -- Helper functions
-
-
-def _process_parse_dates_argument(parse_dates):
- """Process parse_dates argument for read_sql functions"""
- # handle non-list entries for parse_dates gracefully
- if parse_dates is True or parse_dates is None or parse_dates is False:
- parse_dates = []
-
- elif not hasattr(parse_dates, "__iter__"):
- parse_dates = [parse_dates]
- return parse_dates
-
-
-def _handle_date_column(
- col, utc: bool = False, format: str | dict[str, Any] | None = None
-):
- if isinstance(format, dict):
- # GH35185 Allow custom error values in parse_dates argument of
- # read_sql like functions.
- # Format can take on custom to_datetime argument values such as
- # {"errors": "coerce"} or {"dayfirst": True}
- error: DateTimeErrorChoices = format.pop("errors", None) or "ignore"
- return to_datetime(col, errors=error, **format)
- else:
- # Allow passing of formatting string for integers
- # GH17855
- if format is None and (
- issubclass(col.dtype.type, np.floating)
- or issubclass(col.dtype.type, np.integer)
- ):
- format = "s"
- if format in ["D", "d", "h", "m", "s", "ms", "us", "ns"]:
- return to_datetime(col, errors="coerce", unit=format, utc=utc)
- elif is_datetime64tz_dtype(col.dtype):
- # coerce to UTC timezone
- # GH11216
- return to_datetime(col, utc=True)
- else:
- return to_datetime(col, errors="coerce", format=format, utc=utc)
-
-
-def _parse_date_columns(data_frame, parse_dates):
- """
- Force the columns designated in `parse_dates` to be parsed as datetimes.
- Supports both string formatted and integer timestamp columns.
- """
- parse_dates = _process_parse_dates_argument(parse_dates)
-
- # we want to coerce datetime64_tz dtypes for now to UTC
- # we could in theory do a 'nice' conversion from a FixedOffset tz
- # GH11216
- for col_name, df_col in data_frame.items():
- if is_datetime64tz_dtype(df_col.dtype) or col_name in parse_dates:
- try:
- fmt = parse_dates[col_name]
- except TypeError:
- fmt = None
- data_frame[col_name] = _handle_date_column(df_col, format=fmt)
-
- return data_frame
-
-
-def _convert_arrays_to_dataframe(
- data,
- columns,
- coerce_float: bool = True,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
-) -> DataFrame:
- content = lib.to_object_array_tuples(data)
- arrays = convert_object_array(
- list(content.T),
- dtype=None,
- coerce_float=coerce_float,
- dtype_backend=dtype_backend,
- )
- if dtype_backend == "pyarrow":
- pa = import_optional_dependency("pyarrow")
- arrays = [
- ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
- ]
- if arrays:
- df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
- df.columns = columns
- return df
- else:
- return DataFrame(columns=columns)
-
-
-def _wrap_result(
- data,
- columns,
- index_col=None,
- coerce_float: bool = True,
- parse_dates=None,
- dtype: DtypeArg | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
-):
- """Wrap result set of query in a DataFrame."""
- frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend)
-
- if dtype:
- frame = frame.astype(dtype)
-
- frame = _parse_date_columns(frame, parse_dates)
-
- if index_col is not None:
- frame = frame.set_index(index_col)
-
- return frame
-
-
-def execute(sql, con, params=None):
- """
- Execute the given SQL query using the provided connection object.
-
- Parameters
- ----------
- sql : string
- SQL query to be executed.
- con : SQLAlchemy connection or sqlite3 connection
- If a DBAPI2 object, only sqlite3 is supported.
- params : list or tuple, optional, default: None
- List of parameters to pass to execute method.
-
- Returns
- -------
- Results Iterable
- """
- warnings.warn(
- "`pandas.io.sql.execute` is deprecated and "
- "will be removed in the future version.",
- FutureWarning,
- stacklevel=find_stack_level(),
- ) # GH50185
- sqlalchemy = import_optional_dependency("sqlalchemy", errors="ignore")
-
- if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Engine)):
- raise TypeError("pandas.io.sql.execute requires a connection") # GH50185
- with pandasSQL_builder(con, need_transaction=True) as pandas_sql:
- return pandas_sql.execute(sql, params)
-
-
-# -----------------------------------------------------------------------------
-# -- Read and write to DataFrames
-
-
-@overload
-def read_sql_table(
- table_name,
- con,
- schema=...,
- index_col: str | list[str] | None = ...,
- coerce_float=...,
- parse_dates: list[str] | dict[str, str] | None = ...,
- columns: list[str] | None = ...,
- chunksize: None = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> DataFrame:
- ...
-
-
-@overload
-def read_sql_table(
- table_name,
- con,
- schema=...,
- index_col: str | list[str] | None = ...,
- coerce_float=...,
- parse_dates: list[str] | dict[str, str] | None = ...,
- columns: list[str] | None = ...,
- chunksize: int = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> Iterator[DataFrame]:
- ...
-
-
-def read_sql_table(
- table_name: str,
- con,
- schema: str | None = None,
- index_col: str | list[str] | None = None,
- coerce_float: bool = True,
- parse_dates: list[str] | dict[str, str] | None = None,
- columns: list[str] | None = None,
- chunksize: int | None = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame | Iterator[DataFrame]:
- """
- Read SQL database table into a DataFrame.
-
- Given a table name and a SQLAlchemy connectable, returns a DataFrame.
- This function does not support DBAPI connections.
-
- Parameters
- ----------
- table_name : str
- Name of SQL table in database.
- con : SQLAlchemy connectable or str
- A database URI could be provided as str.
- SQLite DBAPI connection mode not supported.
- schema : str, default None
- Name of SQL schema in database to query (if database flavor
- supports this). Uses default schema if None (default).
- index_col : str or list of str, optional, default: None
- Column(s) to set as index (MultiIndex).
- coerce_float : bool, default True
- Attempts to convert values of non-string, non-numeric objects (like
- decimal.Decimal) to floating point. Can result in loss of precision.
- parse_dates : list or dict, default None
- - List of column names to parse as dates.
- - Dict of ``{column_name: format string}`` where format string is
- strftime compatible in case of parsing string times or is one of
- (D, s, ns, ms, us) in case of parsing integer timestamps.
- - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
- to the keyword arguments of :func:`pandas.to_datetime`
- Especially useful with databases without native Datetime support,
- such as SQLite.
- columns : list, default None
- List of column names to select from SQL table.
- chunksize : int, default None
- If specified, returns an iterator where `chunksize` is the number of
- rows to include in each chunk.
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays, nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, pyarrow is used for all
- dtypes if "pyarrow" is set.
-
- The dtype_backends are still experimential.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- DataFrame or Iterator[DataFrame]
- A SQL table is returned as a two-dimensional data structure with labeled
- axes.
-
- See Also
- --------
- read_sql_query : Read SQL query into a DataFrame.
- read_sql : Read SQL query or database table into a DataFrame.
-
- Notes
- -----
- Any datetime values with time zone information will be converted to UTC.
-
- Examples
- --------
- >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP
- """
-
- check_dtype_backend(dtype_backend)
- if dtype_backend is lib.no_default:
- dtype_backend = "numpy" # type: ignore[assignment]
-
- with pandasSQL_builder(con, schema=schema, need_transaction=True) as pandas_sql:
- if not pandas_sql.has_table(table_name):
- raise ValueError(f"Table {table_name} not found")
-
- table = pandas_sql.read_table(
- table_name,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- columns=columns,
- chunksize=chunksize,
- dtype_backend=dtype_backend,
- )
-
- if table is not None:
- return table
- else:
- raise ValueError(f"Table {table_name} not found", con)
-
-
-@overload
-def read_sql_query(
- sql,
- con,
- index_col: str | list[str] | None = ...,
- coerce_float=...,
- params: list[str] | dict[str, str] | None = ...,
- parse_dates: list[str] | dict[str, str] | None = ...,
- chunksize: None = ...,
- dtype: DtypeArg | None = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> DataFrame:
- ...
-
-
-@overload
-def read_sql_query(
- sql,
- con,
- index_col: str | list[str] | None = ...,
- coerce_float=...,
- params: list[str] | dict[str, str] | None = ...,
- parse_dates: list[str] | dict[str, str] | None = ...,
- chunksize: int = ...,
- dtype: DtypeArg | None = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
-) -> Iterator[DataFrame]:
- ...
-
-
-def read_sql_query(
- sql,
- con,
- index_col: str | list[str] | None = None,
- coerce_float: bool = True,
- params: list[str] | dict[str, str] | None = None,
- parse_dates: list[str] | dict[str, str] | None = None,
- chunksize: int | None = None,
- dtype: DtypeArg | None = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame | Iterator[DataFrame]:
- """
- Read SQL query into a DataFrame.
-
- Returns a DataFrame corresponding to the result set of the query
- string. Optionally provide an `index_col` parameter to use one of the
- columns as the index; otherwise the default integer index will be used.
-
- Parameters
- ----------
- sql : str SQL query or SQLAlchemy Selectable (select or text object)
- SQL query to be executed.
- con : SQLAlchemy connectable, str, or sqlite3 connection
- Using SQLAlchemy makes it possible to use any DB supported by that
- library. If a DBAPI2 object, only sqlite3 is supported.
- index_col : str or list of str, optional, default: None
- Column(s) to set as index (MultiIndex).
- coerce_float : bool, default True
- Attempts to convert values of non-string, non-numeric objects (like
- decimal.Decimal) to floating point. Useful for SQL result sets.
- params : list, tuple or dict, optional, default: None
- List of parameters to pass to execute method. The syntax used
- to pass parameters is database driver dependent. Check your
- database driver documentation for which of the five syntax styles,
- described in PEP 249's paramstyle, is supported.
- E.g. for psycopg2, which uses %(name)s, pass params={'name': 'value'}.
- parse_dates : list or dict, default: None
- - List of column names to parse as dates.
- - Dict of ``{column_name: format string}`` where format string is
- strftime compatible in case of parsing string times, or is one of
- (D, s, ns, ms, us) in case of parsing integer timestamps.
- - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
- to the keyword arguments of :func:`pandas.to_datetime`
- Especially useful with databases without native Datetime support,
- such as SQLite.
- chunksize : int, default None
- If specified, return an iterator where `chunksize` is the number of
- rows to include in each chunk.
- dtype : Type name or dict of columns
- Data type for data or columns. E.g. np.float64 or
- {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
-
- .. versionadded:: 1.3.0
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays, nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, pyarrow is used for all
- dtypes if "pyarrow" is set.
-
- The dtype_backends are still experimential.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- DataFrame or Iterator[DataFrame]
-
- See Also
- --------
- read_sql_table : Read SQL database table into a DataFrame.
- read_sql : Read SQL query or database table into a DataFrame.
-
- Notes
- -----
- Any datetime values with time zone information parsed via the `parse_dates`
- parameter will be converted to UTC.
- """
-
- check_dtype_backend(dtype_backend)
- if dtype_backend is lib.no_default:
- dtype_backend = "numpy" # type: ignore[assignment]
-
- with pandasSQL_builder(con) as pandas_sql:
- return pandas_sql.read_query(
- sql,
- index_col=index_col,
- params=params,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- chunksize=chunksize,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
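# Illustration only (not from the removed file): parameterised use with the
# sqlite3 fallback. The placeholder style follows the DBAPI driver
# (qmark for sqlite3, %(name)s for psycopg2, and so on).
import sqlite3

import pandas as pd

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE t (x INTEGER, label TEXT)")
conn.executemany("INSERT INTO t VALUES (?, ?)", [(1, "a"), (2, "b")])
result = pd.read_sql_query("SELECT * FROM t WHERE x > ?", conn, params=[1])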
-
-
-@overload
-def read_sql(
- sql,
- con,
- index_col: str | list[str] | None = ...,
- coerce_float=...,
- params=...,
- parse_dates=...,
- columns: list[str] = ...,
- chunksize: None = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
- dtype: DtypeArg | None = None,
-) -> DataFrame:
- ...
-
-
-@overload
-def read_sql(
- sql,
- con,
- index_col: str | list[str] | None = ...,
- coerce_float=...,
- params=...,
- parse_dates=...,
- columns: list[str] = ...,
- chunksize: int = ...,
- dtype_backend: DtypeBackend | lib.NoDefault = ...,
- dtype: DtypeArg | None = None,
-) -> Iterator[DataFrame]:
- ...
-
-
-def read_sql(
- sql,
- con,
- index_col: str | list[str] | None = None,
- coerce_float: bool = True,
- params=None,
- parse_dates=None,
- columns: list[str] | None = None,
- chunksize: int | None = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- dtype: DtypeArg | None = None,
-) -> DataFrame | Iterator[DataFrame]:
- """
- Read SQL query or database table into a DataFrame.
-
- This function is a convenience wrapper around ``read_sql_table`` and
- ``read_sql_query`` (for backward compatibility). It will delegate
- to the specific function depending on the provided input. A SQL query
- will be routed to ``read_sql_query``, while a database table name will
- be routed to ``read_sql_table``. Note that the delegated function might
- have more specific notes about their functionality not listed here.
-
- Parameters
- ----------
- sql : str or SQLAlchemy Selectable (select or text object)
- SQL query to be executed or a table name.
- con : SQLAlchemy connectable, str, or sqlite3 connection
- Using SQLAlchemy makes it possible to use any DB supported by that
- library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
- for engine disposal and connection closure for the SQLAlchemy connectable; str
- connections are closed automatically. See
- `here <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
- index_col : str or list of str, optional, default: None
- Column(s) to set as index (MultiIndex).
- coerce_float : bool, default True
- Attempts to convert values of non-string, non-numeric objects (like
- decimal.Decimal) to floating point, useful for SQL result sets.
- params : list, tuple or dict, optional, default: None
- List of parameters to pass to execute method. The syntax used
- to pass parameters is database driver dependent. Check your
- database driver documentation for which of the five syntax styles,
- described in PEP 249's paramstyle, is supported.
- E.g. for psycopg2, which uses %(name)s, pass params={'name': 'value'}.
- parse_dates : list or dict, default: None
- - List of column names to parse as dates.
- - Dict of ``{column_name: format string}`` where format string is
- strftime compatible in case of parsing string times, or is one of
- (D, s, ns, ms, us) in case of parsing integer timestamps.
- - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
- to the keyword arguments of :func:`pandas.to_datetime`
- Especially useful with databases without native Datetime support,
- such as SQLite.
- columns : list, default: None
- List of column names to select from SQL table (only used when reading
- a table).
- chunksize : int, default None
- If specified, return an iterator where `chunksize` is the
- number of rows to include in each chunk.
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays, nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, pyarrow is used for all
- dtypes if "pyarrow" is set.
-
- The dtype_backends are still experimential.
-
- .. versionadded:: 2.0
- dtype : Type name or dict of columns
- Data type for data or columns. E.g. np.float64 or
- {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
- The argument is ignored if a table is passed instead of a query.
-
- .. versionadded:: 2.0.0
-
- Returns
- -------
- DataFrame or Iterator[DataFrame]
-
- See Also
- --------
- read_sql_table : Read SQL database table into a DataFrame.
- read_sql_query : Read SQL query into a DataFrame.
-
- Examples
- --------
- Read data from SQL via either a SQL query or a SQL tablename.
- When using a SQLite database only SQL queries are accepted,
- providing only the SQL tablename will result in an error.
-
- >>> from sqlite3 import connect
- >>> conn = connect(':memory:')
- >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']],
- ... columns=['int_column', 'date_column'])
- >>> df.to_sql('test_data', conn)
- 2
-
- >>> pd.read_sql('SELECT int_column, date_column FROM test_data', conn)
- int_column date_column
- 0 0 10/11/12
- 1 1 12/11/10
-
- >>> pd.read_sql('test_data', 'postgres:///db_name') # doctest:+SKIP
-
- Apply date parsing to columns through the ``parse_dates`` argument
- The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns.
- Custom argument values for applying ``pd.to_datetime`` on a column are specified
- via a dictionary format:
-
- >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
- ... conn,
- ... parse_dates={"date_column": {"format": "%d/%m/%y"}})
- int_column date_column
- 0 0 2012-11-10
- 1 1 2010-11-12
- """
-
- check_dtype_backend(dtype_backend)
- if dtype_backend is lib.no_default:
- dtype_backend = "numpy" # type: ignore[assignment]
-
- with pandasSQL_builder(con) as pandas_sql:
- if isinstance(pandas_sql, SQLiteDatabase):
- return pandas_sql.read_query(
- sql,
- index_col=index_col,
- params=params,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- chunksize=chunksize,
- dtype_backend=dtype_backend, # type: ignore[arg-type]
- dtype=dtype,
- )
-
- try:
- _is_table_name = pandas_sql.has_table(sql)
- except Exception:
- # using generic exception to catch errors from sql drivers (GH24988)
- _is_table_name = False
-
- if _is_table_name:
- return pandas_sql.read_table(
- sql,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- columns=columns,
- chunksize=chunksize,
- dtype_backend=dtype_backend,
- )
- else:
- return pandas_sql.read_query(
- sql,
- index_col=index_col,
- params=params,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- chunksize=chunksize,
- dtype_backend=dtype_backend,
- dtype=dtype,
- )
-
-
-def to_sql(
- frame,
- name: str,
- con,
- schema: str | None = None,
- if_exists: Literal["fail", "replace", "append"] = "fail",
- index: bool = True,
- index_label: IndexLabel = None,
- chunksize: int | None = None,
- dtype: DtypeArg | None = None,
- method: str | None = None,
- engine: str = "auto",
- **engine_kwargs,
-) -> int | None:
- """
- Write records stored in a DataFrame to a SQL database.
-
- Parameters
- ----------
- frame : DataFrame, Series
- name : str
- Name of SQL table.
- con : SQLAlchemy connectable(engine/connection) or database string URI
- or sqlite3 DBAPI2 connection
- Using SQLAlchemy makes it possible to use any DB supported by that
- library.
- If a DBAPI2 object, only sqlite3 is supported.
- schema : str, optional
- Name of SQL schema in database to write to (if database flavor
- supports this). If None, use default schema (default).
- if_exists : {'fail', 'replace', 'append'}, default 'fail'
- - fail: If table exists, do nothing.
- - replace: If table exists, drop it, recreate it, and insert data.
- - append: If table exists, insert data. Create if does not exist.
- index : bool, default True
- Write DataFrame index as a column.
- index_label : str or sequence, optional
- Column label for index column(s). If None is given (default) and
- `index` is True, then the index names are used.
- A sequence should be given if the DataFrame uses MultiIndex.
- chunksize : int, optional
- Specify the number of rows in each batch to be written at a time.
- By default, all rows will be written at once.
- dtype : dict or scalar, optional
- Specifying the datatype for columns. If a dictionary is used, the
- keys should be the column names and the values should be the
- SQLAlchemy types or strings for the sqlite3 fallback mode. If a
- scalar is provided, it will be applied to all columns.
- method : {None, 'multi', callable}, optional
- Controls the SQL insertion clause used:
-
- - None : Uses standard SQL ``INSERT`` clause (one per row).
- - ``'multi'``: Pass multiple values in a single ``INSERT`` clause.
- - callable with signature ``(pd_table, conn, keys, data_iter) -> int | None``.
-
- Details and a sample callable implementation can be found in the
- section :ref:`insert method <io.sql.method>`.
- engine : {'auto', 'sqlalchemy'}, default 'auto'
- SQL engine library to use. If 'auto', then the option
- ``io.sql.engine`` is used. The default ``io.sql.engine``
- behavior is 'sqlalchemy'.
-
- .. versionadded:: 1.3.0
-
- **engine_kwargs
- Any additional kwargs are passed to the engine.
-
- Returns
- -------
- None or int
- Number of rows affected by to_sql. None is returned if the callable
- passed into ``method`` does not return an integer number of rows.
-
- .. versionadded:: 1.4.0
-
- Notes
- -----
- The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor``
- or SQLAlchemy connectable. The returned value may not reflect the exact number of written
- rows as stipulated in the
- `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
- `SQLAlchemy <https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.BaseCursorResult.rowcount>`__ documentation.
- """ # noqa:E501
- if if_exists not in ("fail", "replace", "append"):
- raise ValueError(f"'{if_exists}' is not valid for if_exists")
-
- if isinstance(frame, Series):
- frame = frame.to_frame()
- elif not isinstance(frame, DataFrame):
- raise NotImplementedError(
- "'frame' argument should be either a Series or a DataFrame"
- )
-
- with pandasSQL_builder(con, schema=schema, need_transaction=True) as pandas_sql:
- return pandas_sql.to_sql(
- frame,
- name,
- if_exists=if_exists,
- index=index,
- index_label=index_label,
- schema=schema,
- chunksize=chunksize,
- dtype=dtype,
- method=method,
- engine=engine,
- **engine_kwargs,
- )
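# Illustration only (not from the removed file): a custom `method` callable
# with the documented signature (pd_table, conn, keys, data_iter) -> int | None.
# It mirrors the built-in 'multi' strategy: one multi-row INSERT per chunk.
def insert_multirow(pd_table, conn, keys, data_iter):
    from sqlalchemy import insert

    rows = [dict(zip(keys, row)) for row in data_iter]
    result = conn.execute(insert(pd_table.table).values(rows))
    return result.rowcount

# df.to_sql("events", engine, if_exists="append", method=insert_multirow)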
-
-
-def has_table(table_name: str, con, schema: str | None = None) -> bool:
- """
- Check if DataBase has named table.
-
- Parameters
- ----------
- table_name: string
- Name of SQL table.
- con: SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection
- Using SQLAlchemy makes it possible to use any DB supported by that
- library.
- If a DBAPI2 object, only sqlite3 is supported.
- schema : string, default None
- Name of SQL schema in database to write to (if database flavor supports
- this). If None, use default schema (default).
-
- Returns
- -------
- boolean
- """
- with pandasSQL_builder(con, schema=schema) as pandas_sql:
- return pandas_sql.has_table(table_name)
-
-
-table_exists = has_table
-
-
-def pandasSQL_builder(
- con,
- schema: str | None = None,
- need_transaction: bool = False,
-) -> PandasSQL:
- """
- Convenience function to return the correct PandasSQL subclass based on the
- provided parameters. Also creates a sqlalchemy connection and transaction
- if necessary.
- """
- import sqlite3
-
- if isinstance(con, sqlite3.Connection) or con is None:
- return SQLiteDatabase(con)
-
- sqlalchemy = import_optional_dependency("sqlalchemy", errors="ignore")
-
- if isinstance(con, str) and sqlalchemy is None:
- raise ImportError("Using URI string without sqlalchemy installed.")
-
- if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)):
- return SQLDatabase(con, schema, need_transaction)
-
- warnings.warn(
- "pandas only supports SQLAlchemy connectable (engine/connection) or "
- "database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 "
- "objects are not tested. Please consider using SQLAlchemy.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- return SQLiteDatabase(con)
-
-
-class SQLTable(PandasObject):
- """
- For mapping Pandas tables to SQL tables.
- Uses fact that table is reflected by SQLAlchemy to
- do better type conversions.
- Also holds various flags needed to avoid having to
- pass them between functions all the time.
- """
-
- # TODO: support for multiIndex
-
- def __init__(
- self,
- name: str,
- pandas_sql_engine,
- frame=None,
- index: bool | str | list[str] | None = True,
- if_exists: Literal["fail", "replace", "append"] = "fail",
- prefix: str = "pandas",
- index_label=None,
- schema=None,
- keys=None,
- dtype: DtypeArg | None = None,
- ) -> None:
- self.name = name
- self.pd_sql = pandas_sql_engine
- self.prefix = prefix
- self.frame = frame
- self.index = self._index_name(index, index_label)
- self.schema = schema
- self.if_exists = if_exists
- self.keys = keys
- self.dtype = dtype
-
- if frame is not None:
- # We want to initialize based on a dataframe
- self.table = self._create_table_setup()
- else:
- # no data provided, read-only mode
- self.table = self.pd_sql.get_table(self.name, self.schema)
-
- if self.table is None:
- raise ValueError(f"Could not init table '{name}'")
-
- def exists(self):
- return self.pd_sql.has_table(self.name, self.schema)
-
- def sql_schema(self) -> str:
- from sqlalchemy.schema import CreateTable
-
- return str(CreateTable(self.table).compile(self.pd_sql.con))
-
- def _execute_create(self) -> None:
- # Inserting table into database, add to MetaData object
- self.table = self.table.to_metadata(self.pd_sql.meta)
- with self.pd_sql.run_transaction():
- self.table.create(bind=self.pd_sql.con)
-
- def create(self) -> None:
- if self.exists():
- if self.if_exists == "fail":
- raise ValueError(f"Table '{self.name}' already exists.")
- if self.if_exists == "replace":
- self.pd_sql.drop_table(self.name, self.schema)
- self._execute_create()
- elif self.if_exists == "append":
- pass
- else:
- raise ValueError(f"'{self.if_exists}' is not valid for if_exists")
- else:
- self._execute_create()
-
- def _execute_insert(self, conn, keys: list[str], data_iter) -> int:
- """
- Execute SQL statement inserting data
-
- Parameters
- ----------
- conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
- keys : list of str
- Column names
- data_iter : generator of list
- Each item contains a list of values to be inserted
- """
- data = [dict(zip(keys, row)) for row in data_iter]
- result = conn.execute(self.table.insert(), data)
- return result.rowcount
-
- def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int:
- """
- Alternative to _execute_insert for DBs that support multivalue INSERT.
-
- Note: multi-value insert is usually faster for analytics DBs
- and tables containing a few columns
- but performance degrades quickly as the number of columns increases.
- """
-
- from sqlalchemy import insert
-
- data = [dict(zip(keys, row)) for row in data_iter]
- stmt = insert(self.table).values(data)
- result = conn.execute(stmt)
- return result.rowcount
-
- def insert_data(self) -> tuple[list[str], list[np.ndarray]]:
- if self.index is not None:
- temp = self.frame.copy()
- temp.index.names = self.index
- try:
- temp.reset_index(inplace=True)
- except ValueError as err:
- raise ValueError(f"duplicate name in index/columns: {err}") from err
- else:
- temp = self.frame
-
- column_names = list(map(str, temp.columns))
- ncols = len(column_names)
- # this just pre-allocates the list: None's will be replaced with ndarrays
- # error: List item 0 has incompatible type "None"; expected "ndarray"
- data_list: list[np.ndarray] = [None] * ncols # type: ignore[list-item]
-
- for i, (_, ser) in enumerate(temp.items()):
- if ser.dtype.kind == "M":
- d = ser.dt.to_pydatetime()
- elif ser.dtype.kind == "m":
- vals = ser._values
- if isinstance(vals, ArrowExtensionArray):
- vals = vals.to_numpy(dtype=np.dtype("m8[ns]"))
- # store as integers, see GH#6921, GH#7076
- d = vals.view("i8").astype(object)
- else:
- d = ser._values.astype(object)
-
- assert isinstance(d, np.ndarray), type(d)
-
- if ser._can_hold_na:
- # Note: this will miss timedeltas since they are converted to int
- mask = isna(d)
- d[mask] = None
-
- data_list[i] = d
-
- return column_names, data_list
-
- def insert(
- self, chunksize: int | None = None, method: str | None = None
- ) -> int | None:
- # set insert method
- if method is None:
- exec_insert = self._execute_insert
- elif method == "multi":
- exec_insert = self._execute_insert_multi
- elif callable(method):
- exec_insert = partial(method, self)
- else:
- raise ValueError(f"Invalid parameter `method`: {method}")
-
- keys, data_list = self.insert_data()
-
- nrows = len(self.frame)
-
- if nrows == 0:
- return 0
-
- if chunksize is None:
- chunksize = nrows
- elif chunksize == 0:
- raise ValueError("chunksize argument should be non-zero")
-
- chunks = (nrows // chunksize) + 1
- total_inserted = None
- with self.pd_sql.run_transaction() as conn:
- for i in range(chunks):
- start_i = i * chunksize
- end_i = min((i + 1) * chunksize, nrows)
- if start_i >= end_i:
- break
-
- chunk_iter = zip(*(arr[start_i:end_i] for arr in data_list))
- num_inserted = exec_insert(conn, keys, chunk_iter)
- # GH 46891
- if is_integer(num_inserted):
- if total_inserted is None:
- total_inserted = num_inserted
- else:
- total_inserted += num_inserted
- return total_inserted
-
- def _query_iterator(
- self,
- result,
- exit_stack: ExitStack,
- chunksize: int | None,
- columns,
- coerce_float: bool = True,
- parse_dates=None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ):
- """Return generator through chunked result set."""
- has_read_data = False
- with exit_stack:
- while True:
- data = result.fetchmany(chunksize)
- if not data:
- if not has_read_data:
- yield DataFrame.from_records(
- [], columns=columns, coerce_float=coerce_float
- )
- break
-
- has_read_data = True
- self.frame = _convert_arrays_to_dataframe(
- data, columns, coerce_float, dtype_backend
- )
-
- self._harmonize_columns(
- parse_dates=parse_dates, dtype_backend=dtype_backend
- )
-
- if self.index is not None:
- self.frame.set_index(self.index, inplace=True)
-
- yield self.frame
-
- def read(
- self,
- exit_stack: ExitStack,
- coerce_float: bool = True,
- parse_dates=None,
- columns=None,
- chunksize=None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ) -> DataFrame | Iterator[DataFrame]:
- from sqlalchemy import select
-
- if columns is not None and len(columns) > 0:
- cols = [self.table.c[n] for n in columns]
- if self.index is not None:
- for idx in self.index[::-1]:
- cols.insert(0, self.table.c[idx])
- sql_select = select(*cols)
- else:
- sql_select = select(self.table)
- result = self.pd_sql.execute(sql_select)
- column_names = result.keys()
-
- if chunksize is not None:
- return self._query_iterator(
- result,
- exit_stack,
- chunksize,
- column_names,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype_backend=dtype_backend,
- )
- else:
- data = result.fetchall()
- self.frame = _convert_arrays_to_dataframe(
- data, column_names, coerce_float, dtype_backend
- )
-
- self._harmonize_columns(
- parse_dates=parse_dates, dtype_backend=dtype_backend
- )
-
- if self.index is not None:
- self.frame.set_index(self.index, inplace=True)
-
- return self.frame
-
- def _index_name(self, index, index_label):
- # for writing: index=True to include index in sql table
- if index is True:
- nlevels = self.frame.index.nlevels
- # if index_label is specified, set this as index name(s)
- if index_label is not None:
- if not isinstance(index_label, list):
- index_label = [index_label]
- if len(index_label) != nlevels:
- raise ValueError(
- "Length of 'index_label' should match number of "
- f"levels, which is {nlevels}"
- )
- return index_label
- # return the used column labels for the index columns
- if (
- nlevels == 1
- and "index" not in self.frame.columns
- and self.frame.index.name is None
- ):
- return ["index"]
- else:
- return com.fill_missing_names(self.frame.index.names)
-
- # for reading: index=(list of) string to specify column to set as index
- elif isinstance(index, str):
- return [index]
- elif isinstance(index, list):
- return index
- else:
- return None
-
- def _get_column_names_and_types(self, dtype_mapper):
- column_names_and_types = []
- if self.index is not None:
- for i, idx_label in enumerate(self.index):
- idx_type = dtype_mapper(self.frame.index._get_level_values(i))
- column_names_and_types.append((str(idx_label), idx_type, True))
-
- column_names_and_types += [
- (str(self.frame.columns[i]), dtype_mapper(self.frame.iloc[:, i]), False)
- for i in range(len(self.frame.columns))
- ]
-
- return column_names_and_types
-
- def _create_table_setup(self):
- from sqlalchemy import (
- Column,
- PrimaryKeyConstraint,
- Table,
- )
- from sqlalchemy.schema import MetaData
-
- column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type)
-
- columns: list[Any] = [
- Column(name, typ, index=is_index)
- for name, typ, is_index in column_names_and_types
- ]
-
- if self.keys is not None:
- if not is_list_like(self.keys):
- keys = [self.keys]
- else:
- keys = self.keys
- pkc = PrimaryKeyConstraint(*keys, name=self.name + "_pk")
- columns.append(pkc)
-
- schema = self.schema or self.pd_sql.meta.schema
-
- # At this point, attach to new metadata, only attach to self.meta
- # once table is created.
- meta = MetaData()
- return Table(self.name, meta, *columns, schema=schema)
-
- def _harmonize_columns(
- self,
- parse_dates=None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ) -> None:
- """
- Make the DataFrame's column types align with the SQL table
- column types.
- Need to work around limited NA value support. Floats are always
- fine; ints must become floats if there are null values.
- Booleans are hard because converting a bool column with None replaces
- all Nones with False. Therefore only convert bool if there are no
- NA values.
- Datetimes should already be converted to np.datetime64 if supported,
- but here we also force conversion if required.
- """
- parse_dates = _process_parse_dates_argument(parse_dates)
-
- for sql_col in self.table.columns:
- col_name = sql_col.name
- try:
- df_col = self.frame[col_name]
-
- # Handle date parsing upfront; don't try to convert columns
- # twice
- if col_name in parse_dates:
- try:
- fmt = parse_dates[col_name]
- except TypeError:
- fmt = None
- self.frame[col_name] = _handle_date_column(df_col, format=fmt)
- continue
-
- # the type the dataframe column should have
- col_type = self._get_dtype(sql_col.type)
-
- if (
- col_type is datetime
- or col_type is date
- or col_type is DatetimeTZDtype
- ):
- # Convert tz-aware Datetime SQL columns to UTC
- utc = col_type is DatetimeTZDtype
- self.frame[col_name] = _handle_date_column(df_col, utc=utc)
- elif dtype_backend == "numpy" and col_type is float:
- # floats support NA, can always convert!
- self.frame[col_name] = df_col.astype(col_type, copy=False)
-
- elif dtype_backend == "numpy" and len(df_col) == df_col.count():
- # No NA values, can convert ints and bools
- if col_type is np.dtype("int64") or col_type is bool:
- self.frame[col_name] = df_col.astype(col_type, copy=False)
- except KeyError:
- pass # this column not in results
-
- def _sqlalchemy_type(self, col):
- dtype: DtypeArg = self.dtype or {}
- if is_dict_like(dtype):
- dtype = cast(dict, dtype)
- if col.name in dtype:
- return dtype[col.name]
-
- # Infer type of column, while ignoring missing values.
- # Needed for inserting typed data containing NULLs, GH 8778.
- col_type = lib.infer_dtype(col, skipna=True)
-
- from sqlalchemy.types import (
- TIMESTAMP,
- BigInteger,
- Boolean,
- Date,
- DateTime,
- Float,
- Integer,
- SmallInteger,
- Text,
- Time,
- )
-
- if col_type in ("datetime64", "datetime"):
- # GH 9086: TIMESTAMP is the suggested type if the column contains
- # timezone information
- try:
- if col.dt.tz is not None:
- return TIMESTAMP(timezone=True)
- except AttributeError:
- # The column is actually a DatetimeIndex
- # GH 26761 or an Index with date-like data e.g. 9999-01-01
- if getattr(col, "tz", None) is not None:
- return TIMESTAMP(timezone=True)
- return DateTime
- if col_type == "timedelta64":
- warnings.warn(
- "the 'timedelta' type is not supported, and will be "
- "written as integer values (ns frequency) to the database.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- return BigInteger
- elif col_type == "floating":
- if col.dtype == "float32":
- return Float(precision=23)
- else:
- return Float(precision=53)
- elif col_type == "integer":
- # GH35076 Map pandas integer to optimal SQLAlchemy integer type
- if col.dtype.name.lower() in ("int8", "uint8", "int16"):
- return SmallInteger
- elif col.dtype.name.lower() in ("uint16", "int32"):
- return Integer
- elif col.dtype.name.lower() == "uint64":
- raise ValueError("Unsigned 64 bit integer datatype is not supported")
- else:
- return BigInteger
- elif col_type == "boolean":
- return Boolean
- elif col_type == "date":
- return Date
- elif col_type == "time":
- return Time
- elif col_type == "complex":
- raise ValueError("Complex datatypes not supported")
-
- return Text
-
- def _get_dtype(self, sqltype):
- from sqlalchemy.types import (
- TIMESTAMP,
- Boolean,
- Date,
- DateTime,
- Float,
- Integer,
- )
-
- if isinstance(sqltype, Float):
- return float
- elif isinstance(sqltype, Integer):
- # TODO: Refine integer size.
- return np.dtype("int64")
- elif isinstance(sqltype, TIMESTAMP):
- # we have a timezone capable type
- if not sqltype.timezone:
- return datetime
- return DatetimeTZDtype
- elif isinstance(sqltype, DateTime):
- # Caution: np.datetime64 is also a subclass of np.number.
- return datetime
- elif isinstance(sqltype, Date):
- return date
- elif isinstance(sqltype, Boolean):
- return bool
- return object
-
-
-class PandasSQL(PandasObject, ABC):
- """
- Subclasses should define read_query and to_sql.
- """
-
- def __enter__(self):
- return self
-
- def __exit__(self, *args) -> None:
- pass
-
- def read_table(
- self,
- table_name: str,
- index_col: str | list[str] | None = None,
- coerce_float: bool = True,
- parse_dates=None,
- columns=None,
- schema: str | None = None,
- chunksize: int | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ) -> DataFrame | Iterator[DataFrame]:
- raise NotImplementedError
-
- @abstractmethod
- def read_query(
- self,
- sql: str,
- index_col: str | list[str] | None = None,
- coerce_float: bool = True,
- parse_dates=None,
- params=None,
- chunksize: int | None = None,
- dtype: DtypeArg | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ) -> DataFrame | Iterator[DataFrame]:
- pass
-
- @abstractmethod
- def to_sql(
- self,
- frame,
- name,
- if_exists: Literal["fail", "replace", "append"] = "fail",
- index: bool = True,
- index_label=None,
- schema=None,
- chunksize=None,
- dtype: DtypeArg | None = None,
- method=None,
- engine: str = "auto",
- **engine_kwargs,
- ) -> int | None:
- pass
-
- @abstractmethod
- def execute(self, sql: str | Select | TextClause, params=None):
- pass
-
- @abstractmethod
- def has_table(self, name: str, schema: str | None = None) -> bool:
- pass
-
- @abstractmethod
- def _create_sql_schema(
- self,
- frame: DataFrame,
- table_name: str,
- keys: list[str] | None = None,
- dtype: DtypeArg | None = None,
- schema: str | None = None,
- ):
- pass
-
-
-class BaseEngine:
- def insert_records(
- self,
- table: SQLTable,
- con,
- frame,
- name,
- index: bool | str | list[str] | None = True,
- schema=None,
- chunksize=None,
- method=None,
- **engine_kwargs,
- ) -> int | None:
- """
- Inserts data into already-prepared table
- """
- raise AbstractMethodError(self)
-
-
-class SQLAlchemyEngine(BaseEngine):
- def __init__(self) -> None:
- import_optional_dependency(
- "sqlalchemy", extra="sqlalchemy is required for SQL support."
- )
-
- def insert_records(
- self,
- table: SQLTable,
- con,
- frame,
- name,
- index: bool | str | list[str] | None = True,
- schema=None,
- chunksize=None,
- method=None,
- **engine_kwargs,
- ) -> int | None:
- from sqlalchemy import exc
-
- try:
- return table.insert(chunksize=chunksize, method=method)
- except exc.StatementError as err:
- # GH34431
- # https://stackoverflow.com/a/67358288/6067848
- msg = r"""(\(1054, "Unknown column 'inf(e0)?' in 'field list'"\))(?#
- )|inf can not be used with MySQL"""
- err_text = str(err.orig)
- if re.search(msg, err_text):
- raise ValueError("inf cannot be used with MySQL") from err
- raise err
-
-
-def get_engine(engine: str) -> BaseEngine:
- """return our implementation"""
- if engine == "auto":
- engine = get_option("io.sql.engine")
-
- if engine == "auto":
- # try engines in this order
- engine_classes = [SQLAlchemyEngine]
-
- error_msgs = ""
- for engine_class in engine_classes:
- try:
- return engine_class()
- except ImportError as err:
- error_msgs += "\n - " + str(err)
-
- raise ImportError(
- "Unable to find a usable engine; "
- "tried using: 'sqlalchemy'.\n"
- "A suitable version of "
- "sqlalchemy is required for sql I/O "
- "support.\n"
- "Trying to import the above resulted in these errors:"
- f"{error_msgs}"
- )
-
- if engine == "sqlalchemy":
- return SQLAlchemyEngine()
-
- raise ValueError("engine must be one of 'auto', 'sqlalchemy'")
-
-
-class SQLDatabase(PandasSQL):
- """
- This class enables conversion between DataFrame and SQL databases
- using SQLAlchemy to handle DataBase abstraction.
-
- Parameters
- ----------
- con : SQLAlchemy Connectable or URI string.
- Connectable to connect with the database. Using SQLAlchemy makes it
- possible to use any DB supported by that library.
- schema : string, default None
- Name of SQL schema in database to write to (if database flavor
- supports this). If None, use default schema (default).
- need_transaction : bool, default False
- If True, SQLDatabase will create a transaction.
-
- """
-
- def __init__(
- self, con, schema: str | None = None, need_transaction: bool = False
- ) -> None:
- from sqlalchemy import create_engine
- from sqlalchemy.engine import Engine
- from sqlalchemy.schema import MetaData
-
- # self.exit_stack cleans up the Engine and Connection and commits the
- # transaction if any of those objects was created below.
- # Cleanup happens either in self.__exit__ or at the end of the iterator
- # returned by read_sql when chunksize is not None.
- self.exit_stack = ExitStack()
- if isinstance(con, str):
- con = create_engine(con)
- self.exit_stack.callback(con.dispose)
- if isinstance(con, Engine):
- con = self.exit_stack.enter_context(con.connect())
- if need_transaction and not con.in_transaction():
- self.exit_stack.enter_context(con.begin())
- self.con = con
- self.meta = MetaData(schema=schema)
- self.returns_generator = False
-
- def __exit__(self, *args) -> None:
- if not self.returns_generator:
- self.exit_stack.close()
-
- @contextmanager
- def run_transaction(self):
- if not self.con.in_transaction():
- with self.con.begin():
- yield self.con
- else:
- yield self.con
-
- def execute(self, sql: str | Select | TextClause, params=None):
- """Simple passthrough to SQLAlchemy connectable"""
- args = [] if params is None else [params]
- if isinstance(sql, str):
- return self.con.exec_driver_sql(sql, *args)
- return self.con.execute(sql, *args)
-
- def read_table(
- self,
- table_name: str,
- index_col: str | list[str] | None = None,
- coerce_float: bool = True,
- parse_dates=None,
- columns=None,
- schema: str | None = None,
- chunksize: int | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ) -> DataFrame | Iterator[DataFrame]:
- """
- Read SQL database table into a DataFrame.
-
- Parameters
- ----------
- table_name : str
- Name of SQL table in database.
- index_col : string, optional, default: None
- Column to set as index.
- coerce_float : bool, default True
- Attempts to convert values of non-string, non-numeric objects
- (like decimal.Decimal) to floating point. This can result in
- loss of precision.
- parse_dates : list or dict, default: None
- - List of column names to parse as dates.
- - Dict of ``{column_name: format string}`` where format string is
- strftime compatible in case of parsing string times, or is one of
- (D, s, ns, ms, us) in case of parsing integer timestamps.
- - Dict of ``{column_name: arg}``, where the arg corresponds
- to the keyword arguments of :func:`pandas.to_datetime`.
- Especially useful with databases without native Datetime support,
- such as SQLite.
- columns : list, default: None
- List of column names to select from SQL table.
- schema : string, default None
- Name of SQL schema in database to query (if database flavor
- supports this). If specified, this overwrites the default
- schema of the SQL database object.
- chunksize : int, default None
- If specified, return an iterator where `chunksize` is the number
- of rows to include in each chunk.
- dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy dtypes
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays, nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, pyarrow is used for all
- dtypes if "pyarrow" is set.
-
- The dtype_backends are still experimential.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- pandas.read_sql_table
- SQLDatabase.read_query
-
- """
- self.meta.reflect(bind=self.con, only=[table_name])
- table = SQLTable(table_name, self, index=index_col, schema=schema)
- if chunksize is not None:
- self.returns_generator = True
- return table.read(
- self.exit_stack,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- columns=columns,
- chunksize=chunksize,
- dtype_backend=dtype_backend,
- )
-
- @staticmethod
- def _query_iterator(
- result,
- exit_stack: ExitStack,
- chunksize: int,
- columns,
- index_col=None,
- coerce_float: bool = True,
- parse_dates=None,
- dtype: DtypeArg | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ):
- """Return generator through chunked result set"""
- has_read_data = False
- with exit_stack:
- while True:
- data = result.fetchmany(chunksize)
- if not data:
- if not has_read_data:
- yield _wrap_result(
- [],
- columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
- break
-
- has_read_data = True
- yield _wrap_result(
- data,
- columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
-
- def read_query(
- self,
- sql: str,
- index_col: str | list[str] | None = None,
- coerce_float: bool = True,
- parse_dates=None,
- params=None,
- chunksize: int | None = None,
- dtype: DtypeArg | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ) -> DataFrame | Iterator[DataFrame]:
- """
- Read SQL query into a DataFrame.
-
- Parameters
- ----------
- sql : str
- SQL query to be executed.
- index_col : string, optional, default: None
- Column name to use as index for the returned DataFrame object.
- coerce_float : bool, default True
- Attempt to convert values of non-string, non-numeric objects (like
- decimal.Decimal) to floating point, useful for SQL result sets.
- params : list, tuple or dict, optional, default: None
- List of parameters to pass to execute method. The syntax used
- to pass parameters is database driver dependent. Check your
- database driver documentation for which of the five syntax styles,
- described in PEP 249's paramstyle, is supported.
- E.g. for psycopg2, it uses %(name)s, so use params={'name': 'value'}.
- parse_dates : list or dict, default: None
- - List of column names to parse as dates.
- - Dict of ``{column_name: format string}`` where format string is
- strftime compatible in case of parsing string times, or is one of
- (D, s, ns, ms, us) in case of parsing integer timestamps.
- - Dict of ``{column_name: arg dict}``, where the arg dict
- corresponds to the keyword arguments of
- :func:`pandas.to_datetime` Especially useful with databases
- without native Datetime support, such as SQLite.
- chunksize : int, default None
- If specified, return an iterator where `chunksize` is the number
- of rows to include in each chunk.
- dtype : Type name or dict of columns
- Data type for data or columns. E.g. np.float64 or
- {'a': np.float64, 'b': np.int32, 'c': 'Int64'}
-
- .. versionadded:: 1.3.0
-
- Returns
- -------
- DataFrame
-
- See Also
- --------
- read_sql_table : Read SQL database table into a DataFrame.
- read_sql
-
- """
- result = self.execute(sql, params)
- columns = result.keys()
-
- if chunksize is not None:
- self.returns_generator = True
- return self._query_iterator(
- result,
- self.exit_stack,
- chunksize,
- columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
- else:
- data = result.fetchall()
- frame = _wrap_result(
- data,
- columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
- return frame
-
- read_sql = read_query
-
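# A minimal sketch of the same chunksize contract as read_query above, shown
# through the public pd.read_sql with the sqlite3 fallback so no SQLAlchemy is
# required: with ``chunksize`` set, a generator of DataFrames is returned and
# rows are pulled with fetchmany() under the hood.
import sqlite3
import pandas as pd

con = sqlite3.connect(":memory:")
con.executescript(
    "CREATE TABLE t (a INTEGER, b TEXT);"
    "INSERT INTO t VALUES (1, 'x'), (2, 'y'), (3, 'z');"
)
for chunk in pd.read_sql("SELECT * FROM t", con, chunksize=2):
    print(len(chunk))  # 2, then 1
con.close()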
- def prep_table(
- self,
- frame,
- name,
- if_exists: Literal["fail", "replace", "append"] = "fail",
- index: bool | str | list[str] | None = True,
- index_label=None,
- schema=None,
- dtype: DtypeArg | None = None,
- ) -> SQLTable:
- """
- Prepares table in the database for data insertion. Creates it if needed, etc.
- """
- if dtype:
- if not is_dict_like(dtype):
- # error: Value expression in dictionary comprehension has incompatible
- # type "Union[ExtensionDtype, str, dtype[Any], Type[object],
- # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
- # Type[str], Type[float], Type[int], Type[complex], Type[bool],
- # Type[object]]]]"; expected type "Union[ExtensionDtype, str,
- # dtype[Any], Type[object]]"
- dtype = {col_name: dtype for col_name in frame} # type: ignore[misc]
- else:
- dtype = cast(dict, dtype)
-
- from sqlalchemy.types import TypeEngine
-
- for col, my_type in dtype.items():
- if isinstance(my_type, type) and issubclass(my_type, TypeEngine):
- pass
- elif isinstance(my_type, TypeEngine):
- pass
- else:
- raise ValueError(f"The type of {col} is not a SQLAlchemy type")
-
- table = SQLTable(
- name,
- self,
- frame=frame,
- index=index,
- if_exists=if_exists,
- index_label=index_label,
- schema=schema,
- dtype=dtype,
- )
- table.create()
- return table
-
- def check_case_sensitive(
- self,
- name: str,
- schema: str | None,
- ) -> None:
- """
- Checks table name for issues with case-sensitivity.
- Method is called after data is inserted.
- """
- if not name.isdigit() and not name.islower():
- # check for potentially case sensitivity issues (GH7815)
- # Only check when name is not a number and name is not lower case
- from sqlalchemy import inspect as sqlalchemy_inspect
-
- insp = sqlalchemy_inspect(self.con)
- table_names = insp.get_table_names(schema=schema or self.meta.schema)
- if name not in table_names:
- msg = (
- f"The provided table name '{name}' is not found exactly as "
- "such in the database after writing the table, possibly "
- "due to case sensitivity issues. Consider using lower "
- "case table names."
- )
- warnings.warn(
- msg,
- UserWarning,
- stacklevel=find_stack_level(),
- )
-
- def to_sql(
- self,
- frame,
- name: str,
- if_exists: Literal["fail", "replace", "append"] = "fail",
- index: bool = True,
- index_label=None,
- schema: str | None = None,
- chunksize=None,
- dtype: DtypeArg | None = None,
- method=None,
- engine: str = "auto",
- **engine_kwargs,
- ) -> int | None:
- """
- Write records stored in a DataFrame to a SQL database.
-
- Parameters
- ----------
- frame : DataFrame
- name : string
- Name of SQL table.
- if_exists : {'fail', 'replace', 'append'}, default 'fail'
- - fail: If table exists, raise a ValueError.
- - replace: If table exists, drop it, recreate it, and insert data.
- - append: If table exists, insert data. Create if it does not exist.
- index : boolean, default True
- Write DataFrame index as a column.
- index_label : string or sequence, default None
- Column label for index column(s). If None is given (default) and
- `index` is True, then the index names are used.
- A sequence should be given if the DataFrame uses MultiIndex.
- schema : string, default None
- Name of SQL schema in database to write to (if database flavor
- supports this). If specified, this overwrites the default
- schema of the SQLDatabase object.
- chunksize : int, default None
- If not None, then rows will be written in batches of this size at a
- time. If None, all rows will be written at once.
- dtype : single type or dict of column name to SQL type, default None
- Optionally specify the datatype for columns. The SQL type should
- be a SQLAlchemy type. If all columns are of the same type, one
- single value can be used.
- method : {None, 'multi', callable}, default None
- Controls the SQL insertion clause used:
-
- * None : Uses standard SQL ``INSERT`` clause (one per row).
- * 'multi': Pass multiple values in a single ``INSERT`` clause.
- * callable with signature ``(pd_table, conn, keys, data_iter)``.
-
- Details and a sample callable implementation can be found in the
- section :ref:`insert method <io.sql.method>`.
- engine : {'auto', 'sqlalchemy'}, default 'auto'
- SQL engine library to use. If 'auto', then the option
- ``io.sql.engine`` is used. The default ``io.sql.engine``
- behavior is 'sqlalchemy'.
-
- .. versionadded:: 1.3.0
-
- **engine_kwargs
- Any additional kwargs are passed to the engine.
- """
- sql_engine = get_engine(engine)
-
- table = self.prep_table(
- frame=frame,
- name=name,
- if_exists=if_exists,
- index=index,
- index_label=index_label,
- schema=schema,
- dtype=dtype,
- )
-
- total_inserted = sql_engine.insert_records(
- table=table,
- con=self.con,
- frame=frame,
- name=name,
- index=index,
- schema=schema,
- chunksize=chunksize,
- method=method,
- **engine_kwargs,
- )
-
- self.check_case_sensitive(name=name, schema=schema)
- return total_inserted
-
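# A minimal sketch of a custom ``method`` callable with the documented
# ``(pd_table, conn, keys, data_iter)`` signature; ``df`` and ``engine`` in the
# commented call are assumed to exist (a DataFrame and a SQLAlchemy engine),
# and the helper name is only illustrative.
def insert_rows(pd_table, conn, keys, data_iter):
    # pd_table.table is the underlying SQLAlchemy Table object; issue one
    # ordinary multi-row INSERT for the whole chunk.
    rows = [dict(zip(keys, row)) for row in data_iter]
    conn.execute(pd_table.table.insert(), rows)

# df.to_sql("my_table", engine, method=insert_rows, if_exists="append")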
- @property
- def tables(self):
- return self.meta.tables
-
- def has_table(self, name: str, schema: str | None = None) -> bool:
- from sqlalchemy import inspect as sqlalchemy_inspect
-
- insp = sqlalchemy_inspect(self.con)
- return insp.has_table(name, schema or self.meta.schema)
-
- def get_table(self, table_name: str, schema: str | None = None) -> Table:
- from sqlalchemy import (
- Numeric,
- Table,
- )
-
- schema = schema or self.meta.schema
- tbl = Table(table_name, self.meta, autoload_with=self.con, schema=schema)
- for column in tbl.columns:
- if isinstance(column.type, Numeric):
- column.type.asdecimal = False
- return tbl
-
- def drop_table(self, table_name: str, schema: str | None = None) -> None:
- schema = schema or self.meta.schema
- if self.has_table(table_name, schema):
- self.meta.reflect(bind=self.con, only=[table_name], schema=schema)
- with self.run_transaction():
- self.get_table(table_name, schema).drop(bind=self.con)
- self.meta.clear()
-
- def _create_sql_schema(
- self,
- frame: DataFrame,
- table_name: str,
- keys: list[str] | None = None,
- dtype: DtypeArg | None = None,
- schema: str | None = None,
- ):
- table = SQLTable(
- table_name,
- self,
- frame=frame,
- index=False,
- keys=keys,
- dtype=dtype,
- schema=schema,
- )
- return str(table.sql_schema())
-
-
-# ---- SQL without SQLAlchemy ---
-# sqlite-specific sql strings and handler class
-# dictionary used for readability purposes
-_SQL_TYPES = {
- "string": "TEXT",
- "floating": "REAL",
- "integer": "INTEGER",
- "datetime": "TIMESTAMP",
- "date": "DATE",
- "time": "TIME",
- "boolean": "INTEGER",
-}
-
-
-def _get_unicode_name(name):
- try:
- uname = str(name).encode("utf-8", "strict").decode("utf-8")
- except UnicodeError as err:
- raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") from err
- return uname
-
-
-def _get_valid_sqlite_name(name):
- # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\
- # -for-sqlite-table-column-names-in-python
- # Ensure the string can be encoded as UTF-8.
- # Ensure the string does not include any NUL characters.
- # Replace all " with "".
- # Wrap the entire thing in double quotes.
-
- uname = _get_unicode_name(name)
- if not len(uname):
- raise ValueError("Empty table or column name specified")
-
- nul_index = uname.find("\x00")
- if nul_index >= 0:
- raise ValueError("SQLite identifier cannot contain NULs")
- return '"' + uname.replace('"', '""') + '"'
-
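# A minimal sketch of the escaping rule implemented above, restated standalone
# (the helper name is illustrative) so it can be run without this module.
def quote_sqlite_identifier(name: str) -> str:
    uname = str(name).encode("utf-8", "strict").decode("utf-8")
    if not uname:
        raise ValueError("Empty table or column name specified")
    if "\x00" in uname:
        raise ValueError("SQLite identifier cannot contain NULs")
    # Double any embedded double quotes, then wrap the whole name in quotes.
    return '"' + uname.replace('"', '""') + '"'

print(quote_sqlite_identifier('weird "name"'))  # -> "weird ""name"""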
-
-class SQLiteTable(SQLTable):
- """
- Patch the SQLTable for fallback support.
- Instead of a table variable, just use the CREATE TABLE statement.
- """
-
- def __init__(self, *args, **kwargs) -> None:
- # GH 8341
- # register an adapter callable for datetime.time object
- import sqlite3
-
- # this will transform time(12,34,56,789) into '12:34:56.000789'
- # (this is what sqlalchemy does)
- def _adapt_time(t) -> str:
- # This is faster than strftime
- return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}.{t.microsecond:06d}"
-
- sqlite3.register_adapter(time, _adapt_time)
- super().__init__(*args, **kwargs)
-
- def sql_schema(self) -> str:
- return str(";\n".join(self.table))
-
- def _execute_create(self) -> None:
- with self.pd_sql.run_transaction() as conn:
- for stmt in self.table:
- conn.execute(stmt)
-
- def insert_statement(self, *, num_rows: int) -> str:
- names = list(map(str, self.frame.columns))
- wld = "?" # wildcard char
- escape = _get_valid_sqlite_name
-
- if self.index is not None:
- for idx in self.index[::-1]:
- names.insert(0, idx)
-
- bracketed_names = [escape(column) for column in names]
- col_names = ",".join(bracketed_names)
-
- row_wildcards = ",".join([wld] * len(names))
- wildcards = ",".join([f"({row_wildcards})" for _ in range(num_rows)])
- insert_statement = (
- f"INSERT INTO {escape(self.name)} ({col_names}) VALUES {wildcards}"
- )
- return insert_statement
-
- def _execute_insert(self, conn, keys, data_iter) -> int:
- data_list = list(data_iter)
- conn.executemany(self.insert_statement(num_rows=1), data_list)
- return conn.rowcount
-
- def _execute_insert_multi(self, conn, keys, data_iter) -> int:
- data_list = list(data_iter)
- flattened_data = [x for row in data_list for x in row]
- conn.execute(self.insert_statement(num_rows=len(data_list)), flattened_data)
- return conn.rowcount
-
- def _create_table_setup(self):
- """
- Return a list of SQL statements that creates a table reflecting the
- structure of a DataFrame. The first entry will be a CREATE TABLE
- statement while the rest will be CREATE INDEX statements.
- """
- column_names_and_types = self._get_column_names_and_types(self._sql_type_name)
- escape = _get_valid_sqlite_name
-
- create_tbl_stmts = [
- escape(cname) + " " + ctype for cname, ctype, _ in column_names_and_types
- ]
-
- if self.keys is not None and len(self.keys):
- if not is_list_like(self.keys):
- keys = [self.keys]
- else:
- keys = self.keys
- cnames_br = ", ".join([escape(c) for c in keys])
- create_tbl_stmts.append(
- f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})"
- )
- if self.schema:
- schema_name = self.schema + "."
- else:
- schema_name = ""
- create_stmts = [
- "CREATE TABLE "
- + schema_name
- + escape(self.name)
- + " (\n"
- + ",\n ".join(create_tbl_stmts)
- + "\n)"
- ]
-
- ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index]
- if len(ix_cols):
- cnames = "_".join(ix_cols)
- cnames_br = ",".join([escape(c) for c in ix_cols])
- create_stmts.append(
- "CREATE INDEX "
- + escape("ix_" + self.name + "_" + cnames)
- + "ON "
- + escape(self.name)
- + " ("
- + cnames_br
- + ")"
- )
-
- return create_stmts
-
- def _sql_type_name(self, col):
- dtype: DtypeArg = self.dtype or {}
- if is_dict_like(dtype):
- dtype = cast(dict, dtype)
- if col.name in dtype:
- return dtype[col.name]
-
- # Infer type of column, while ignoring missing values.
- # Needed for inserting typed data containing NULLs, GH 8778.
- col_type = lib.infer_dtype(col, skipna=True)
-
- if col_type == "timedelta64":
- warnings.warn(
- "the 'timedelta' type is not supported, and will be "
- "written as integer values (ns frequency) to the database.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- col_type = "integer"
-
- elif col_type == "datetime64":
- col_type = "datetime"
-
- elif col_type == "empty":
- col_type = "string"
-
- elif col_type == "complex":
- raise ValueError("Complex datatypes not supported")
-
- if col_type not in _SQL_TYPES:
- col_type = "string"
-
- return _SQL_TYPES[col_type]
-
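# A minimal sketch of the fallback type mapping above, which keys the
# _SQL_TYPES table by the inferred dtype; demonstrated here with the public
# pandas.api.types.infer_dtype on a few columns.
import pandas as pd
from pandas.api.types import infer_dtype

sql_types = {"string": "TEXT", "floating": "REAL", "integer": "INTEGER",
             "datetime": "TIMESTAMP", "date": "DATE", "time": "TIME",
             "boolean": "INTEGER"}
df = pd.DataFrame({"a": [1, 2], "b": [1.5, None], "c": ["x", "y"]})
for name, col in df.items():
    kind = infer_dtype(col, skipna=True)
    print(name, kind, "->", sql_types.get(kind, "TEXT"))
# a integer -> INTEGER ; b floating -> REAL ; c string -> TEXT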
-
-class SQLiteDatabase(PandasSQL):
- """
- Version of SQLDatabase to support SQLite connections (fallback without
- SQLAlchemy). This should only be used internally.
-
- Parameters
- ----------
- con : sqlite connection object
-
- """
-
- def __init__(self, con) -> None:
- self.con = con
-
- @contextmanager
- def run_transaction(self):
- cur = self.con.cursor()
- try:
- yield cur
- self.con.commit()
- except Exception:
- self.con.rollback()
- raise
- finally:
- cur.close()
-
- def execute(self, sql: str | Select | TextClause, params=None):
- if not isinstance(sql, str):
- raise TypeError("Query must be a string unless using sqlalchemy.")
- args = [] if params is None else [params]
- cur = self.con.cursor()
- try:
- cur.execute(sql, *args)
- return cur
- except Exception as exc:
- try:
- self.con.rollback()
- except Exception as inner_exc: # pragma: no cover
- ex = DatabaseError(
- f"Execution failed on sql: {sql}\n{exc}\nunable to rollback"
- )
- raise ex from inner_exc
-
- ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
- raise ex from exc
-
- @staticmethod
- def _query_iterator(
- cursor,
- chunksize: int,
- columns,
- index_col=None,
- coerce_float: bool = True,
- parse_dates=None,
- dtype: DtypeArg | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ):
- """Return generator through chunked result set"""
- has_read_data = False
- while True:
- data = cursor.fetchmany(chunksize)
- if type(data) == tuple:
- data = list(data)
- if not data:
- cursor.close()
- if not has_read_data:
- result = DataFrame.from_records(
- [], columns=columns, coerce_float=coerce_float
- )
- if dtype:
- result = result.astype(dtype)
- yield result
- break
-
- has_read_data = True
- yield _wrap_result(
- data,
- columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
-
- def read_query(
- self,
- sql,
- index_col=None,
- coerce_float: bool = True,
- parse_dates=None,
- params=None,
- chunksize: int | None = None,
- dtype: DtypeArg | None = None,
- dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
- ) -> DataFrame | Iterator[DataFrame]:
- cursor = self.execute(sql, params)
- columns = [col_desc[0] for col_desc in cursor.description]
-
- if chunksize is not None:
- return self._query_iterator(
- cursor,
- chunksize,
- columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
- else:
- data = self._fetchall_as_list(cursor)
- cursor.close()
-
- frame = _wrap_result(
- data,
- columns,
- index_col=index_col,
- coerce_float=coerce_float,
- parse_dates=parse_dates,
- dtype=dtype,
- dtype_backend=dtype_backend,
- )
- return frame
-
- def _fetchall_as_list(self, cur):
- result = cur.fetchall()
- if not isinstance(result, list):
- result = list(result)
- return result
-
- def to_sql(
- self,
- frame,
- name,
- if_exists: str = "fail",
- index: bool = True,
- index_label=None,
- schema=None,
- chunksize=None,
- dtype: DtypeArg | None = None,
- method=None,
- engine: str = "auto",
- **engine_kwargs,
- ) -> int | None:
- """
- Write records stored in a DataFrame to a SQL database.
-
- Parameters
- ----------
- frame : DataFrame
- name : string
- Name of SQL table.
- if_exists : {'fail', 'replace', 'append'}, default 'fail'
- fail: If table exists, raise a ValueError.
- replace: If table exists, drop it, recreate it, and insert data.
- append: If table exists, insert data. Create if it does not exist.
- index : bool, default True
- Write DataFrame index as a column.
- index_label : string or sequence, default None
- Column label for index column(s). If None is given (default) and
- `index` is True, then the index names are used.
- A sequence should be given if the DataFrame uses MultiIndex.
- schema : string, default None
- Ignored parameter included for compatibility with SQLAlchemy
- version of ``to_sql``.
- chunksize : int, default None
- If not None, then rows will be written in batches of this
- size at a time. If None, all rows will be written at once.
- dtype : single type or dict of column name to SQL type, default None
- Optionally specify the datatype for columns. The SQL type should
- be a string. If all columns are of the same type, one single value
- can be used.
- method : {None, 'multi', callable}, default None
- Controls the SQL insertion clause used:
-
- * None : Uses standard SQL ``INSERT`` clause (one per row).
- * 'multi': Pass multiple values in a single ``INSERT`` clause.
- * callable with signature ``(pd_table, conn, keys, data_iter)``.
-
- Details and a sample callable implementation can be found in the
- section :ref:`insert method <io.sql.method>`.
- """
- if dtype:
- if not is_dict_like(dtype):
- # error: Value expression in dictionary comprehension has incompatible
- # type "Union[ExtensionDtype, str, dtype[Any], Type[object],
- # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
- # Type[str], Type[float], Type[int], Type[complex], Type[bool],
- # Type[object]]]]"; expected type "Union[ExtensionDtype, str,
- # dtype[Any], Type[object]]"
- dtype = {col_name: dtype for col_name in frame} # type: ignore[misc]
- else:
- dtype = cast(dict, dtype)
-
- for col, my_type in dtype.items():
- if not isinstance(my_type, str):
- raise ValueError(f"{col} ({my_type}) not a string")
-
- table = SQLiteTable(
- name,
- self,
- frame=frame,
- index=index,
- if_exists=if_exists,
- index_label=index_label,
- dtype=dtype,
- )
- table.create()
- return table.insert(chunksize, method)
-
- def has_table(self, name: str, schema: str | None = None) -> bool:
- wld = "?"
- query = f"SELECT name FROM sqlite_master WHERE type='table' AND name={wld};"
-
- return len(self.execute(query, [name]).fetchall()) > 0
-
- def get_table(self, table_name: str, schema: str | None = None) -> None:
- return None # not supported in fallback mode
-
- def drop_table(self, name: str, schema: str | None = None) -> None:
- drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}"
- self.execute(drop_sql)
-
- def _create_sql_schema(
- self,
- frame,
- table_name: str,
- keys=None,
- dtype: DtypeArg | None = None,
- schema: str | None = None,
- ):
- table = SQLiteTable(
- table_name,
- self,
- frame=frame,
- index=False,
- keys=keys,
- dtype=dtype,
- schema=schema,
- )
- return str(table.sql_schema())
-
-
-def get_schema(
- frame,
- name: str,
- keys=None,
- con=None,
- dtype: DtypeArg | None = None,
- schema: str | None = None,
-) -> str:
- """
- Get the SQL db table schema for the given frame.
-
- Parameters
- ----------
- frame : DataFrame
- name : str
- name of SQL table
- keys : string or sequence, default: None
- Columns to use as a primary key.
- con : an open SQL database connection object or a SQLAlchemy connectable
- Using SQLAlchemy makes it possible to use any DB supported by that
- library, default: None.
- If a DBAPI2 object, only sqlite3 is supported.
- dtype : dict of column name to SQL type, default None
- Optionally specify the datatype for columns. The SQL type should
- be a SQLAlchemy type, or a string for a sqlite3 fallback connection.
- schema : str, default: None
- Optionally specify the schema to be used in creating the table.
-
- .. versionadded:: 1.2.0
- """
- with pandasSQL_builder(con=con) as pandas_sql:
- return pandas_sql._create_sql_schema(
- frame, name, keys=keys, dtype=dtype, schema=schema
- )
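# A minimal sketch of get_schema through its public location,
# pandas.io.sql.get_schema, using an in-memory sqlite3 connection so the
# sqlite fallback schema above is produced without SQLAlchemy.
import sqlite3
import pandas as pd
from pandas.io.sql import get_schema

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
con = sqlite3.connect(":memory:")
print(get_schema(df, "users", keys=["id"], con=con))
# Prints a CREATE TABLE "users" statement with INTEGER/TEXT columns and a
# users_pk PRIMARY KEY constraint on "id".
con.close()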
diff --git a/contrib/python/pandas/py3/pandas/io/stata.py b/contrib/python/pandas/py3/pandas/io/stata.py
deleted file mode 100644
index 860b347b834..00000000000
--- a/contrib/python/pandas/py3/pandas/io/stata.py
+++ /dev/null
@@ -1,3721 +0,0 @@
-"""
-Module contains tools for processing Stata files into DataFrames
-
-The StataReader below was originally written by Joe Presbrey as part of PyDTA.
-It has been extended and improved by Skipper Seabold from the Statsmodels
-project who also developed the StataWriter and was finally added to pandas in
-a once again improved version.
-
-You can find more information on http://presbrey.mit.edu/PyDTA and
-https://www.statsmodels.org/devel/
-"""
-from __future__ import annotations
-
-from collections import abc
-import datetime
-from io import BytesIO
-import os
-import struct
-import sys
-from types import TracebackType
-from typing import (
- IO,
- TYPE_CHECKING,
- Any,
- AnyStr,
- Callable,
- Final,
- Hashable,
- Sequence,
- cast,
-)
-import warnings
-
-from dateutil.relativedelta import relativedelta
-import numpy as np
-
-from pandas._libs.lib import infer_dtype
-from pandas._libs.writers import max_len_string_array
-from pandas._typing import (
- CompressionOptions,
- FilePath,
- ReadBuffer,
- StorageOptions,
- WriteBuffer,
-)
-from pandas.errors import (
- CategoricalConversionWarning,
- InvalidColumnName,
- PossiblePrecisionLoss,
- ValueLabelTypeMismatch,
-)
-from pandas.util._decorators import (
- Appender,
- doc,
-)
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- ensure_object,
- is_categorical_dtype,
- is_datetime64_dtype,
- is_numeric_dtype,
-)
-
-from pandas import (
- Categorical,
- DatetimeIndex,
- NaT,
- Timestamp,
- isna,
- to_datetime,
- to_timedelta,
-)
-from pandas.core.arrays.boolean import BooleanDtype
-from pandas.core.arrays.integer import IntegerDtype
-from pandas.core.frame import DataFrame
-from pandas.core.indexes.base import Index
-from pandas.core.series import Series
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import get_handle
-
-if TYPE_CHECKING:
- from typing import Literal
-
-_version_error = (
- "Version of given Stata file is {version}. pandas supports importing "
- "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
- "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
- "and 119 (Stata 15/16, over 32,767 variables)."
-)
-
-_statafile_processing_params1 = """\
-convert_dates : bool, default True
- Convert date variables to DataFrame time values.
-convert_categoricals : bool, default True
- Read value labels and convert columns to Categorical/Factor variables."""
-
-_statafile_processing_params2 = """\
-index_col : str, optional
- Column to set as index.
-convert_missing : bool, default False
- Flag indicating whether to convert missing values to their Stata
- representations. If False, missing values are replaced with nan.
- If True, columns containing missing values are returned with
- object data types and missing values are represented by
- StataMissingValue objects.
-preserve_dtypes : bool, default True
- Preserve Stata datatypes. If False, numeric data are upcast to pandas
- default types for foreign data (float64 or int64).
-columns : list or None
- Columns to retain. Columns will be returned in the given order. None
- returns all columns.
-order_categoricals : bool, default True
- Flag indicating whether converted categorical data are ordered."""
-
-_chunksize_params = """\
-chunksize : int, default None
- Return StataReader object for iterations, returns chunks with
- given number of lines."""
-
-_iterator_params = """\
-iterator : bool, default False
- Return StataReader object."""
-
-_reader_notes = """\
-Notes
------
-Categorical variables read through an iterator may not have the same
-categories and dtype. This occurs when a variable stored in a DTA
- file is associated with an incomplete set of value labels that only
-label a strict subset of the values."""
-
-_read_stata_doc = f"""
-Read Stata file into DataFrame.
-
-Parameters
-----------
-filepath_or_buffer : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be: ``file://localhost/path/to/table.dta``.
-
- If you want to pass in a path object, pandas accepts any ``os.PathLike``.
-
- By file-like object, we refer to objects with a ``read()`` method,
- such as a file handle (e.g. via builtin ``open`` function)
- or ``StringIO``.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-{_chunksize_params}
-{_iterator_params}
-{_shared_docs["decompression_options"] % "filepath_or_buffer"}
-{_shared_docs["storage_options"]}
-
-Returns
--------
-DataFrame or StataReader
-
-See Also
---------
-io.stata.StataReader : Low-level reader for Stata data files.
-DataFrame.to_stata: Export Stata data files.
-
-{_reader_notes}
-
-Examples
---------
-
-Creating a dummy Stata file for this example:
-
->>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 'parrot'],
-... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP
->>> df.to_stata('animals.dta') # doctest: +SKIP
-
-Read a Stata dta file:
-
->>> df = pd.read_stata('animals.dta') # doctest: +SKIP
-
-Read a Stata dta file in 10,000 line chunks:
-
->>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP
->>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP
->>> df.to_stata('filename.dta') # doctest: +SKIP
-
->>> with pd.read_stata('filename.dta', chunksize=10000) as itr: # doctest: +SKIP
-... for chunk in itr:
-... # Operate on a single chunk, e.g., chunk.mean()
-... pass # doctest: +SKIP
-"""
-
-_read_method_doc = f"""\
-Read observations from a Stata file, converting them into a DataFrame.
-
-Parameters
-----------
-nrows : int
- Number of lines to read from data file, if None read whole file.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-
-Returns
--------
-DataFrame
-"""
-
-_stata_reader_doc = f"""\
-Class for reading Stata dta files.
-
-Parameters
-----------
-path_or_buf : path (string), buffer or path object
- string, path object (pathlib.Path or py._path.local.LocalPath) or object
- implementing a binary read() function.
-{_statafile_processing_params1}
-{_statafile_processing_params2}
-{_chunksize_params}
-{_shared_docs["decompression_options"]}
-{_shared_docs["storage_options"]}
-
-{_reader_notes}
-"""
-
-
-_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
-
-
-stata_epoch: Final = datetime.datetime(1960, 1, 1)
-
-
-# TODO: Add typing. As of January 2020 it is not possible to type this function since
-# mypy doesn't understand that a Series and an int can be combined using mathematical
-# operations. (+, -).
-def _stata_elapsed_date_to_datetime_vec(dates, fmt) -> Series:
- """
- Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime
-
- Parameters
- ----------
- dates : Series
- The Stata Internal Format date to convert to datetime according to fmt
- fmt : str
- The format to convert to. Can be tc, td, tw, tm, tq, th, or ty.
-
- Returns
- -------
- converted : Series
- The converted dates
-
- Examples
- --------
- >>> dates = pd.Series([52])
- >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
- 0 1961-01-01
- dtype: datetime64[ns]
-
- Notes
- -----
- datetime/c - tc
- milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
- datetime/C - tC - NOT IMPLEMENTED
- milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
- date - td
- days since 01jan1960 (01jan1960 = 0)
- weekly date - tw
- weeks since 1960w1
- This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
- The datetime value is the start of the week in terms of days in the
- year, not ISO calendar weeks.
- monthly date - tm
- months since 1960m1
- quarterly date - tq
- quarters since 1960q1
- half-yearly date - th
- half-years since 1960h1
- yearly date - ty
- years since 0000
- """
- MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
- MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
- MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
- MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
- MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
-
- def convert_year_month_safe(year, month) -> Series:
- """
- Convert year and month to datetimes, using pandas vectorized versions
- when the date range falls within the range supported by pandas.
- Otherwise it falls back to a slower but more robust method
- using datetime.
- """
- if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
- return to_datetime(100 * year + month, format="%Y%m")
- else:
- index = getattr(year, "index", None)
- return Series(
- [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index
- )
-
- def convert_year_days_safe(year, days) -> Series:
- """
- Converts year (e.g. 1999) and days since the start of the year to a
- datetime or datetime64 Series
- """
- if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
- return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
- else:
- index = getattr(year, "index", None)
- value = [
- datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
- for y, d in zip(year, days)
- ]
- return Series(value, index=index)
-
- def convert_delta_safe(base, deltas, unit) -> Series:
- """
- Convert base dates and deltas to datetimes, using pandas vectorized
- versions if the deltas satisfy restrictions required to be expressed
- as dates in pandas.
- """
- index = getattr(deltas, "index", None)
- if unit == "d":
- if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
- values = [base + relativedelta(days=int(d)) for d in deltas]
- return Series(values, index=index)
- elif unit == "ms":
- if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
- values = [
- base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas
- ]
- return Series(values, index=index)
- else:
- raise ValueError("format not understood")
- base = to_datetime(base)
- deltas = to_timedelta(deltas, unit=unit)
- return base + deltas
-
- # TODO(non-nano): If/when pandas supports more than datetime64[ns], this
- # should be improved to use correct range, e.g. datetime[Y] for yearly
- bad_locs = np.isnan(dates)
- has_bad_values = False
- if bad_locs.any():
- has_bad_values = True
- # reset cache to avoid SettingWithCopy checks (we own the DataFrame and the
- # `dates` Series is used to overwrite itself in the DataFrame)
- dates._reset_cacher()
- dates[bad_locs] = 1.0 # Replace with NaT
- dates = dates.astype(np.int64)
-
- if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
- base = stata_epoch
- ms = dates
- conv_dates = convert_delta_safe(base, ms, "ms")
- elif fmt.startswith(("%tC", "tC")):
- warnings.warn(
- "Encountered %tC format. Leaving in Stata Internal Format.",
- stacklevel=find_stack_level(),
- )
- conv_dates = Series(dates, dtype=object)
- if has_bad_values:
- conv_dates[bad_locs] = NaT
- return conv_dates
- # Delta days relative to base
- elif fmt.startswith(("%td", "td", "%d", "d")):
- base = stata_epoch
- days = dates
- conv_dates = convert_delta_safe(base, days, "d")
- # does not count leap days - 7 days is a week.
- # 52nd week may have more than 7 days
- elif fmt.startswith(("%tw", "tw")):
- year = stata_epoch.year + dates // 52
- days = (dates % 52) * 7
- conv_dates = convert_year_days_safe(year, days)
- elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
- year = stata_epoch.year + dates // 12
- month = (dates % 12) + 1
- conv_dates = convert_year_month_safe(year, month)
- elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
- year = stata_epoch.year + dates // 4
- quarter_month = (dates % 4) * 3 + 1
- conv_dates = convert_year_month_safe(year, quarter_month)
- elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
- year = stata_epoch.year + dates // 2
- month = (dates % 2) * 6 + 1
- conv_dates = convert_year_month_safe(year, month)
- elif fmt.startswith(("%ty", "ty")): # Years -- not delta
- year = dates
- first_month = np.ones_like(dates)
- conv_dates = convert_year_month_safe(year, first_month)
- else:
- raise ValueError(f"Date fmt {fmt} not understood")
-
- if has_bad_values: # Restore NaT for bad values
- conv_dates[bad_locs] = NaT
-
- return conv_dates
-
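# A minimal sketch of the elapsed-date arithmetic used above for monthly (%tm)
# and quarterly (%tq) values, worked by hand for a couple of inputs; the helper
# names are illustrative only.
stata_epoch_year = 1960

def tm_to_year_month(value: int) -> tuple[int, int]:
    # months since 1960m1
    return stata_epoch_year + value // 12, value % 12 + 1

def tq_to_year_month(value: int) -> tuple[int, int]:
    # quarters since 1960q1; map to the first month of the quarter
    return stata_epoch_year + value // 4, (value % 4) * 3 + 1

print(tm_to_year_month(600))  # (2010, 1)  -> 2010-01
print(tq_to_year_month(142))  # (1995, 7)  -> 1995q3, starting 1995-07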
-
-def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
- """
- Convert from datetime to SIF. https://www.stata.com/help.cgi?datetime
-
- Parameters
- ----------
- dates : Series
- Series or array containing datetime.datetime or datetime64[ns] to
- convert to the Stata Internal Format given by fmt
- fmt : str
- The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
- """
- index = dates.index
- NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
- US_PER_DAY = NS_PER_DAY / 1000
-
- def parse_dates_safe(
- dates, delta: bool = False, year: bool = False, days: bool = False
- ):
- d = {}
- if is_datetime64_dtype(dates.dtype):
- if delta:
- time_delta = dates - Timestamp(stata_epoch).as_unit("ns")
- d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds
- if days or year:
- date_index = DatetimeIndex(dates)
- d["year"] = date_index._data.year
- d["month"] = date_index._data.month
- if days:
- days_in_ns = dates.view(np.int64) - to_datetime(
- d["year"], format="%Y"
- ).view(np.int64)
- d["days"] = days_in_ns // NS_PER_DAY
-
- elif infer_dtype(dates, skipna=False) == "datetime":
- if delta:
- delta = dates._values - stata_epoch
-
- def f(x: datetime.timedelta) -> float:
- return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
-
- v = np.vectorize(f)
- d["delta"] = v(delta)
- if year:
- year_month = dates.apply(lambda x: 100 * x.year + x.month)
- d["year"] = year_month._values // 100
- d["month"] = year_month._values - d["year"] * 100
- if days:
-
- def g(x: datetime.datetime) -> int:
- return (x - datetime.datetime(x.year, 1, 1)).days
-
- v = np.vectorize(g)
- d["days"] = v(dates)
- else:
- raise ValueError(
- "Columns containing dates must contain either "
- "datetime64, datetime.datetime or null values."
- )
-
- return DataFrame(d, index=index)
-
- bad_loc = isna(dates)
- index = dates.index
- if bad_loc.any():
- dates = Series(dates)
- if is_datetime64_dtype(dates):
- dates[bad_loc] = to_datetime(stata_epoch)
- else:
- dates[bad_loc] = stata_epoch
-
- if fmt in ["%tc", "tc"]:
- d = parse_dates_safe(dates, delta=True)
- conv_dates = d.delta / 1000
- elif fmt in ["%tC", "tC"]:
- warnings.warn(
- "Stata Internal Format tC not supported.",
- stacklevel=find_stack_level(),
- )
- conv_dates = dates
- elif fmt in ["%td", "td"]:
- d = parse_dates_safe(dates, delta=True)
- conv_dates = d.delta // US_PER_DAY
- elif fmt in ["%tw", "tw"]:
- d = parse_dates_safe(dates, year=True, days=True)
- conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
- elif fmt in ["%tm", "tm"]:
- d = parse_dates_safe(dates, year=True)
- conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1
- elif fmt in ["%tq", "tq"]:
- d = parse_dates_safe(dates, year=True)
- conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
- elif fmt in ["%th", "th"]:
- d = parse_dates_safe(dates, year=True)
- conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int)
- elif fmt in ["%ty", "ty"]:
- d = parse_dates_safe(dates, year=True)
- conv_dates = d.year
- else:
- raise ValueError(f"Format {fmt} is not a known Stata date format")
-
- conv_dates = Series(conv_dates, dtype=np.float64)
- missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
- conv_dates[bad_loc] = missing_value
-
- return Series(conv_dates, index=index)
-
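# A minimal sketch of the reverse %tq conversion as computed above, checked
# against the values worked out in the previous sketch.
import pandas as pd

dates = pd.Series(pd.to_datetime(["1960-01-15", "1995-07-01"]))
tq = 4 * (dates.dt.year - 1960) + (dates.dt.month - 1) // 3
print(list(tq))  # [0, 142]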
-
-excessive_string_length_error: Final = """
-Fixed width strings in Stata .dta files are limited to 244 (or fewer)
-characters. Column '{0}' does not satisfy this restriction. Use the
-'version=117' parameter to write the newer (Stata 13 and later) format.
-"""
-
-
-precision_loss_doc: Final = """
-Column converted from {0} to {1}, and some data are outside of the lossless
-conversion range. This may result in a loss of precision in the saved data.
-"""
-
-
-value_label_mismatch_doc: Final = """
-Stata value labels (pandas categories) must be strings. Column {0} contains
-non-string labels which will be converted to strings. Please check that the
-Stata data file created has not lost information due to duplicate labels.
-"""
-
-
-invalid_name_doc: Final = """
-Not all pandas column names were valid Stata variable names.
-The following replacements have been made:
-
- {0}
-
-If this is not what you expect, please make sure you have Stata-compliant
-column names in your DataFrame (strings only, max 32 characters, only
-alphanumerics and underscores, no Stata reserved words)
-"""
-
-
-categorical_conversion_warning: Final = """
-One or more series with value labels are not fully labeled. Reading this
-dataset with an iterator results in categorical variables with different
-categories. This occurs since it is not possible to know all possible values
-until the entire dataset has been read. To avoid this warning, you can either
-read the dataset without an iterator, or manually convert the categorical data
-by setting ``convert_categoricals`` to False and then accessing the variable
-labels through the value_labels method of the reader.
-"""
-
-
-def _cast_to_stata_types(data: DataFrame) -> DataFrame:
- """
- Checks the dtypes of the columns of a pandas DataFrame for
- compatibility with the data types and ranges supported by Stata, and
- converts if necessary.
-
- Parameters
- ----------
- data : DataFrame
- The DataFrame to check and convert
-
- Notes
- -----
- Numeric columns in Stata must be one of int8, int16, int32, float32 or
- float64, with some additional value restrictions. int8 and int16 columns
- are checked for violations of the value restrictions and upcast if needed.
- int64 data is not usable in Stata, and so it is downcast to int32 whenever
- the values are in the int32 range, and cast to float64 when larger than
- this range. If the int64 values are outside of the range of those
- perfectly representable as float64 values, a warning is raised.
-
- bool columns are cast to int8. uint columns are converted to int of the
- same size if there is no loss in precision, otherwise are upcast to a
- larger type. uint64 is currently not supported since it is converted to
- object in a DataFrame.
- """
- ws = ""
- # original, if small, if large
- conversion_data: tuple[
- tuple[type, type, type],
- tuple[type, type, type],
- tuple[type, type, type],
- tuple[type, type, type],
- tuple[type, type, type],
- ] = (
- (np.bool_, np.int8, np.int8),
- (np.uint8, np.int8, np.int16),
- (np.uint16, np.int16, np.int32),
- (np.uint32, np.int32, np.int64),
- (np.uint64, np.int64, np.float64),
- )
-
- float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]
- float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]
-
- for col in data:
- # Cast from unsupported types to supported types
- is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype))
- orig = data[col]
- # We need to find orig_missing before altering data below
- orig_missing = orig.isna()
- if is_nullable_int:
- missing_loc = data[col].isna()
- if missing_loc.any():
- # Replace with always safe value
- fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False
- data.loc[missing_loc, col] = fv
- # Replace with NumPy-compatible column
- data[col] = data[col].astype(data[col].dtype.numpy_dtype)
- dtype = data[col].dtype
- for c_data in conversion_data:
- if dtype == c_data[0]:
- if data[col].max() <= np.iinfo(c_data[1]).max:
- dtype = c_data[1]
- else:
- dtype = c_data[2]
- if c_data[2] == np.int64: # Warn if necessary
- if data[col].max() >= 2**53:
- ws = precision_loss_doc.format("uint64", "float64")
-
- data[col] = data[col].astype(dtype)
-
- # Check values and upcast if necessary
- if dtype == np.int8:
- if data[col].max() > 100 or data[col].min() < -127:
- data[col] = data[col].astype(np.int16)
- elif dtype == np.int16:
- if data[col].max() > 32740 or data[col].min() < -32767:
- data[col] = data[col].astype(np.int32)
- elif dtype == np.int64:
- if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
- data[col] = data[col].astype(np.int32)
- else:
- data[col] = data[col].astype(np.float64)
- if data[col].max() >= 2**53 or data[col].min() <= -(2**53):
- ws = precision_loss_doc.format("int64", "float64")
- elif dtype in (np.float32, np.float64):
- if np.isinf(data[col]).any():
- raise ValueError(
- f"Column {col} contains infinity or -infinity"
- "which is outside the range supported by Stata."
- )
- value = data[col].max()
- if dtype == np.float32 and value > float32_max:
- data[col] = data[col].astype(np.float64)
- elif dtype == np.float64:
- if value > float64_max:
- raise ValueError(
- f"Column {col} has a maximum value ({value}) outside the range "
- f"supported by Stata ({float64_max})"
- )
- if is_nullable_int:
- if orig_missing.any():
- # Replace missing by Stata sentinel value
- sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]
- data.loc[orig_missing, col] = sentinel
- if ws:
- warnings.warn(
- ws,
- PossiblePrecisionLoss,
- stacklevel=find_stack_level(),
- )
-
- return data
-
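# A minimal sketch of the int64 rule described above (downcast to int32 when
# in range, otherwise cast to float64 and warn when values exceed 2**53),
# restated standalone with an illustrative helper name.
import numpy as np

def cast_int64_for_stata(values: np.ndarray) -> np.ndarray:
    if values.max() <= 2147483620 and values.min() >= -2147483647:
        return values.astype(np.int32)
    out = values.astype(np.float64)
    if values.max() >= 2**53 or values.min() <= -(2**53):
        print("warning: some values cannot be represented losslessly as float64")
    return out

print(cast_int64_for_stata(np.array([1, 2, 3], dtype=np.int64)).dtype)  # int32
print(cast_int64_for_stata(np.array([2**40], dtype=np.int64)).dtype)    # float64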
-
-class StataValueLabel:
- """
- Parse a categorical column and prepare formatted output
-
- Parameters
- ----------
- catarray : Series
- Categorical Series to encode
- encoding : {"latin-1", "utf-8"}
- Encoding to use for value labels.
- """
-
- def __init__(
- self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1"
- ) -> None:
- if encoding not in ("latin-1", "utf-8"):
- raise ValueError("Only latin-1 and utf-8 are supported.")
- self.labname = catarray.name
- self._encoding = encoding
- categories = catarray.cat.categories
- self.value_labels: list[tuple[float, str]] = list(
- zip(np.arange(len(categories)), categories)
- )
- self.value_labels.sort(key=lambda x: x[0])
-
- self._prepare_value_labels()
-
- def _prepare_value_labels(self):
- """Encode value labels."""
-
- self.text_len = 0
- self.txt: list[bytes] = []
- self.n = 0
- # Offsets (length of categories), converted to int32
- self.off = np.array([], dtype=np.int32)
- # Values, converted to int32
- self.val = np.array([], dtype=np.int32)
- self.len = 0
-
- # Compute lengths and setup lists of offsets and labels
- offsets: list[int] = []
- values: list[float] = []
- for vl in self.value_labels:
- category: str | bytes = vl[1]
- if not isinstance(category, str):
- category = str(category)
- warnings.warn(
- value_label_mismatch_doc.format(self.labname),
- ValueLabelTypeMismatch,
- stacklevel=find_stack_level(),
- )
- category = category.encode(self._encoding)
- offsets.append(self.text_len)
- self.text_len += len(category) + 1 # +1 for the padding
- values.append(vl[0])
- self.txt.append(category)
- self.n += 1
-
- if self.text_len > 32000:
- raise ValueError(
- "Stata value labels for a single variable must "
- "have a combined length less than 32,000 characters."
- )
-
- # Ensure int32
- self.off = np.array(offsets, dtype=np.int32)
- self.val = np.array(values, dtype=np.int32)
-
- # Total length
- self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
-
- def generate_value_label(self, byteorder: str) -> bytes:
- """
- Generate the binary representation of the value labels.
-
- Parameters
- ----------
- byteorder : str
- Byte order of the output
-
- Returns
- -------
- value_label : bytes
- Bytes containing the formatted value label
- """
- encoding = self._encoding
- bio = BytesIO()
- null_byte = b"\x00"
-
- # len
- bio.write(struct.pack(byteorder + "i", self.len))
-
- # labname
- labname = str(self.labname)[:32].encode(encoding)
- lab_len = 32 if encoding not in ("utf-8", "utf8") else 128
- labname = _pad_bytes(labname, lab_len + 1)
- bio.write(labname)
-
- # padding - 3 bytes
- for i in range(3):
- bio.write(struct.pack("c", null_byte))
-
- # value_label_table
- # n - int32
- bio.write(struct.pack(byteorder + "i", self.n))
-
- # textlen - int32
- bio.write(struct.pack(byteorder + "i", self.text_len))
-
- # off - int32 array (n elements)
- for offset in self.off:
- bio.write(struct.pack(byteorder + "i", offset))
-
- # val - int32 array (n elements)
- for value in self.val:
- bio.write(struct.pack(byteorder + "i", value))
-
- # txt - Text labels, null terminated
- for text in self.txt:
- bio.write(text + null_byte)
-
- return bio.getvalue()
-
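# A minimal sketch of the length bookkeeping used above for a value-label
# table with two labels: len = 4 + 4 + 4*n + 4*n + text_len, i.e. the n and
# textlen fields, the off[] and val[] int32 arrays, and the null-terminated
# label text.
labels = [(0, "low"), (1, "high")]
text = [s.encode("latin-1") for _, s in labels]
text_len = sum(len(t) + 1 for t in text)  # +1 per trailing null byte
n = len(labels)
total = 4 + 4 + 4 * n + 4 * n + text_len
print(n, text_len, total)  # 2 9 33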
-
-class StataNonCatValueLabel(StataValueLabel):
- """
- Prepare formatted version of value labels
-
- Parameters
- ----------
- labname : str
- Value label name
- value_labels: Dictionary
- Mapping of values to labels
- encoding : {"latin-1", "utf-8"}
- Encoding to use for value labels.
- """
-
- def __init__(
- self,
- labname: str,
- value_labels: dict[float, str],
- encoding: Literal["latin-1", "utf-8"] = "latin-1",
- ) -> None:
- if encoding not in ("latin-1", "utf-8"):
- raise ValueError("Only latin-1 and utf-8 are supported.")
-
- self.labname = labname
- self._encoding = encoding
- self.value_labels: list[tuple[float, str]] = sorted(
- value_labels.items(), key=lambda x: x[0]
- )
- self._prepare_value_labels()
-
-
-class StataMissingValue:
- """
- An observation's missing value.
-
- Parameters
- ----------
- value : {int, float}
- The Stata missing value code
-
- Notes
- -----
- More information: <https://www.stata.com/help.cgi?missing>
-
- Integer missing values map the codes '.', '.a', ..., '.z' to the ranges
- 101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...
- 2147483647 (for int32). Missing values for floating point data types are
- more complex but the pattern is simple to discern from the following table.
-
- np.float32 missing values (float in Stata)
- 0000007f .
- 0008007f .a
- 0010007f .b
- ...
- 00c0007f .x
- 00c8007f .y
- 00d0007f .z
-
- np.float64 missing values (double in Stata)
- 000000000000e07f .
- 000000000001e07f .a
- 000000000002e07f .b
- ...
- 000000000018e07f .x
- 000000000019e07f .y
- 00000000001ae07f .z
- """
-
- # Construct a dictionary of missing values
- MISSING_VALUES: dict[float, str] = {}
- bases: Final = (101, 32741, 2147483621)
- for b in bases:
- # Conversion to long to avoid hash issues on 32 bit platforms #8968
- MISSING_VALUES[b] = "."
- for i in range(1, 27):
- MISSING_VALUES[i + b] = "." + chr(96 + i)
-
- float32_base: bytes = b"\x00\x00\x00\x7f"
- increment: int = struct.unpack("<i", b"\x00\x08\x00\x00")[0]
- for i in range(27):
- key = struct.unpack("<f", float32_base)[0]
- MISSING_VALUES[key] = "."
- if i > 0:
- MISSING_VALUES[key] += chr(96 + i)
- int_value = struct.unpack("<i", struct.pack("<f", key))[0] + increment
- float32_base = struct.pack("<i", int_value)
-
- float64_base: bytes = b"\x00\x00\x00\x00\x00\x00\xe0\x7f"
- increment = struct.unpack("q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0]
- for i in range(27):
- key = struct.unpack("<d", float64_base)[0]
- MISSING_VALUES[key] = "."
- if i > 0:
- MISSING_VALUES[key] += chr(96 + i)
- int_value = struct.unpack("q", struct.pack("<d", key))[0] + increment
- float64_base = struct.pack("q", int_value)
-
- BASE_MISSING_VALUES: Final = {
- "int8": 101,
- "int16": 32741,
- "int32": 2147483621,
- "float32": struct.unpack("<f", float32_base)[0],
- "float64": struct.unpack("<d", float64_base)[0],
- }
-
- def __init__(self, value: float) -> None:
- self._value = value
- # Conversion to int to avoid hash issues on 32 bit platforms #8968
- value = int(value) if value < 2147483648 else float(value)
- self._str = self.MISSING_VALUES[value]
-
- @property
- def string(self) -> str:
- """
- The Stata representation of the missing value: '.', '.a'..'.z'
-
- Returns
- -------
- str
- The representation of the missing value.
- """
- return self._str
-
- @property
- def value(self) -> float:
- """
- The binary representation of the missing value.
-
- Returns
- -------
- {int, float}
- The binary representation of the missing value.
- """
- return self._value
-
- def __str__(self) -> str:
- return self.string
-
- def __repr__(self) -> str:
- return f"{type(self)}({self})"
-
- def __eq__(self, other: Any) -> bool:
- return (
- isinstance(other, type(self))
- and self.string == other.string
- and self.value == other.value
- )
-
- @classmethod
- def get_base_missing_value(cls, dtype: np.dtype) -> float:
- if dtype.type is np.int8:
- value = cls.BASE_MISSING_VALUES["int8"]
- elif dtype.type is np.int16:
- value = cls.BASE_MISSING_VALUES["int16"]
- elif dtype.type is np.int32:
- value = cls.BASE_MISSING_VALUES["int32"]
- elif dtype.type is np.float32:
- value = cls.BASE_MISSING_VALUES["float32"]
- elif dtype.type is np.float64:
- value = cls.BASE_MISSING_VALUES["float64"]
- else:
- raise ValueError("Unsupported dtype")
- return value
-
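- # A quick doctest-style sketch of how the table above plays out (the
- # values follow directly from the constants defined in this class):
- #
- # >>> StataMissingValue(101).string
- # '.'
- # >>> StataMissingValue(102).string
- # '.a'
- # >>> StataMissingValue.get_base_missing_value(np.dtype(np.int8))
- # 101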
-
-class StataParser:
- def __init__(self) -> None:
- # type code.
- # --------------------
- # str1 1 = 0x01
- # str2 2 = 0x02
- # ...
- # str244 244 = 0xf4
- # byte 251 = 0xfb (sic)
- # int 252 = 0xfc
- # long 253 = 0xfd
- # float 254 = 0xfe
- # double 255 = 0xff
- # --------------------
- # NOTE: the byte type seems to be reserved for categorical variables
- # with a label, but the underlying variable is -127 to 100, so
- # we're going to drop the label and cast to int
- self.DTYPE_MAP = dict(
- list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
- + [
- (251, np.dtype(np.int8)),
- (252, np.dtype(np.int16)),
- (253, np.dtype(np.int32)),
- (254, np.dtype(np.float32)),
- (255, np.dtype(np.float64)),
- ]
- )
- self.DTYPE_MAP_XML: dict[int, np.dtype] = {
- 32768: np.dtype(np.uint8), # Keys to GSO
- 65526: np.dtype(np.float64),
- 65527: np.dtype(np.float32),
- 65528: np.dtype(np.int32),
- 65529: np.dtype(np.int16),
- 65530: np.dtype(np.int8),
- }
- self.TYPE_MAP = list(tuple(range(251)) + tuple("bhlfd"))
- self.TYPE_MAP_XML = {
- # Not really a Q, unclear how to handle byteswap
- 32768: "Q",
- 65526: "d",
- 65527: "f",
- 65528: "l",
- 65529: "h",
- 65530: "b",
- }
- # NOTE: technically, some of these are wrong. there are more numbers
- # that can be represented. it's the 27 ABOVE and BELOW the max listed
- # numeric data type in [U] 12.2.2 of the 11.2 manual
- float32_min = b"\xff\xff\xff\xfe"
- float32_max = b"\xff\xff\xff\x7e"
- float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff"
- float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f"
- self.VALID_RANGE = {
- "b": (-127, 100),
- "h": (-32767, 32740),
- "l": (-2147483647, 2147483620),
- "f": (
- np.float32(struct.unpack("<f", float32_min)[0]),
- np.float32(struct.unpack("<f", float32_max)[0]),
- ),
- "d": (
- np.float64(struct.unpack("<d", float64_min)[0]),
- np.float64(struct.unpack("<d", float64_max)[0]),
- ),
- }
-
- self.OLD_TYPE_MAPPING = {
- 98: 251, # byte
- 105: 252, # int
- 108: 253, # long
- 102: 254, # float
- 100: 255, # double
- }
-
- # These missing values are the generic '.' in Stata, and are used
- # to replace nans
- self.MISSING_VALUES = {
- "b": 101,
- "h": 32741,
- "l": 2147483621,
- "f": np.float32(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]),
- "d": np.float64(
- struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
- ),
- }
- self.NUMPY_TYPE_MAP = {
- "b": "i1",
- "h": "i2",
- "l": "i4",
- "f": "f4",
- "d": "f8",
- "Q": "u8",
- }
-
- # Reserved words cannot be used as variable names
- self.RESERVED_WORDS = (
- "aggregate",
- "array",
- "boolean",
- "break",
- "byte",
- "case",
- "catch",
- "class",
- "colvector",
- "complex",
- "const",
- "continue",
- "default",
- "delegate",
- "delete",
- "do",
- "double",
- "else",
- "eltypedef",
- "end",
- "enum",
- "explicit",
- "export",
- "external",
- "float",
- "for",
- "friend",
- "function",
- "global",
- "goto",
- "if",
- "inline",
- "int",
- "local",
- "long",
- "NULL",
- "pragma",
- "protected",
- "quad",
- "rowvector",
- "short",
- "typedef",
- "typename",
- "virtual",
- "_all",
- "_N",
- "_skip",
- "_b",
- "_pi",
- "str#",
- "in",
- "_pred",
- "strL",
- "_coef",
- "_rc",
- "using",
- "_cons",
- "_se",
- "with",
- "_n",
- )
-
-
-class StataReader(StataParser, abc.Iterator):
- __doc__ = _stata_reader_doc
-
- _path_or_buf: IO[bytes] # Will be assigned by `_open_file`.
-
- def __init__(
- self,
- path_or_buf: FilePath | ReadBuffer[bytes],
- convert_dates: bool = True,
- convert_categoricals: bool = True,
- index_col: str | None = None,
- convert_missing: bool = False,
- preserve_dtypes: bool = True,
- columns: Sequence[str] | None = None,
- order_categoricals: bool = True,
- chunksize: int | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- ) -> None:
- super().__init__()
- self._col_sizes: list[int] = []
-
- # Arguments to the reader (can be temporarily overridden in
- # calls to read).
- self._convert_dates = convert_dates
- self._convert_categoricals = convert_categoricals
- self._index_col = index_col
- self._convert_missing = convert_missing
- self._preserve_dtypes = preserve_dtypes
- self._columns = columns
- self._order_categoricals = order_categoricals
- self._original_path_or_buf = path_or_buf
- self._compression = compression
- self._storage_options = storage_options
- self._encoding = ""
- self._chunksize = chunksize
- self._using_iterator = False
- self._entered = False
- if self._chunksize is None:
- self._chunksize = 1
- elif not isinstance(chunksize, int) or chunksize <= 0:
- raise ValueError("chunksize must be a positive integer when set.")
-
- # State variables for the file
- self._close_file: Callable[[], None] | None = None
- self._has_string_data = False
- self._missing_values = False
- self._can_read_value_labels = False
- self._column_selector_set = False
- self._value_labels_read = False
- self._data_read = False
- self._dtype: np.dtype | None = None
- self._lines_read = 0
-
- self._native_byteorder = _set_endianness(sys.byteorder)
-
- def _ensure_open(self) -> None:
- """
- Ensure the file has been opened and its header data read.
- """
- if not hasattr(self, "_path_or_buf"):
- self._open_file()
-
- def _open_file(self) -> None:
- """
- Open the file (with compression options, etc.), and read header information.
- """
- if not self._entered:
- warnings.warn(
- "StataReader is being used without using a context manager. "
- "Using StataReader as a context manager is the only supported method.",
- ResourceWarning,
- stacklevel=find_stack_level(),
- )
- handles = get_handle(
- self._original_path_or_buf,
- "rb",
- storage_options=self._storage_options,
- is_text=False,
- compression=self._compression,
- )
- if hasattr(handles.handle, "seekable") and handles.handle.seekable():
- # If the handle is directly seekable, use it without an extra copy.
- self._path_or_buf = handles.handle
- self._close_file = handles.close
- else:
- # Copy to memory, and ensure no encoding.
- with handles:
- self._path_or_buf = BytesIO(handles.handle.read())
- self._close_file = self._path_or_buf.close
-
- self._read_header()
- self._setup_dtype()
-
- def __enter__(self) -> StataReader:
- """enter context manager"""
- self._entered = True
- return self
-
- def __exit__(
- self,
- exc_type: type[BaseException] | None,
- exc_value: BaseException | None,
- traceback: TracebackType | None,
- ) -> None:
- if self._close_file:
- self._close_file()
-
- def close(self) -> None:
- """Close the handle if its open.
-
- .. deprecated: 2.0.0
-
- The close method is not part of the public API.
- The only supported way to use StataReader is to use it as a context manager.
- """
- warnings.warn(
- "The StataReader.close() method is not part of the public API and "
- "will be removed in a future version without notice. "
- "Using StataReader as a context manager is the only supported method.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- if self._close_file:
- self._close_file()
-
- def _set_encoding(self) -> None:
- """
- Set string encoding which depends on file version
- """
- if self._format_version < 118:
- self._encoding = "latin-1"
- else:
- self._encoding = "utf-8"
-
- def _read_int8(self) -> int:
- return struct.unpack("b", self._path_or_buf.read(1))[0]
-
- def _read_uint8(self) -> int:
- return struct.unpack("B", self._path_or_buf.read(1))[0]
-
- def _read_uint16(self) -> int:
- return struct.unpack(f"{self._byteorder}H", self._path_or_buf.read(2))[0]
-
- def _read_uint32(self) -> int:
- return struct.unpack(f"{self._byteorder}I", self._path_or_buf.read(4))[0]
-
- def _read_uint64(self) -> int:
- return struct.unpack(f"{self._byteorder}Q", self._path_or_buf.read(8))[0]
-
- def _read_int16(self) -> int:
- return struct.unpack(f"{self._byteorder}h", self._path_or_buf.read(2))[0]
-
- def _read_int32(self) -> int:
- return struct.unpack(f"{self._byteorder}i", self._path_or_buf.read(4))[0]
-
- def _read_int64(self) -> int:
- return struct.unpack(f"{self._byteorder}q", self._path_or_buf.read(8))[0]
-
- def _read_char8(self) -> bytes:
- return struct.unpack("c", self._path_or_buf.read(1))[0]
-
- def _read_int16_count(self, count: int) -> tuple[int, ...]:
- return struct.unpack(
- f"{self._byteorder}{'h' * count}",
- self._path_or_buf.read(2 * count),
- )
-
- def _read_header(self) -> None:
- first_char = self._read_char8()
- if first_char == b"<":
- self._read_new_header()
- else:
- self._read_old_header(first_char)
-
- self._has_string_data = len([x for x in self._typlist if type(x) is int]) > 0
-
- # calculate size of a data record
- self._col_sizes = [self._calcsize(typ) for typ in self._typlist]
-
- def _read_new_header(self) -> None:
- # The first part of the header is common to 117 - 119.
- self._path_or_buf.read(27) # stata_dta><header><release>
- self._format_version = int(self._path_or_buf.read(3))
- if self._format_version not in [117, 118, 119]:
- raise ValueError(_version_error.format(version=self._format_version))
- self._set_encoding()
- self._path_or_buf.read(21) # </release><byteorder>
- self._byteorder = ">" if self._path_or_buf.read(3) == b"MSF" else "<"
- self._path_or_buf.read(15) # </byteorder><K>
- self._nvar = (
- self._read_uint16() if self._format_version <= 118 else self._read_uint32()
- )
- self._path_or_buf.read(7) # </K><N>
-
- self._nobs = self._get_nobs()
- self._path_or_buf.read(11) # </N><label>
- self._data_label = self._get_data_label()
- self._path_or_buf.read(19) # </label><timestamp>
- self._time_stamp = self._get_time_stamp()
- self._path_or_buf.read(26) # </timestamp></header><map>
- self._path_or_buf.read(8) # 0x0000000000000000
- self._path_or_buf.read(8) # position of <map>
-
- self._seek_vartypes = self._read_int64() + 16
- self._seek_varnames = self._read_int64() + 10
- self._seek_sortlist = self._read_int64() + 10
- self._seek_formats = self._read_int64() + 9
- self._seek_value_label_names = self._read_int64() + 19
-
- # Requires version-specific treatment
- self._seek_variable_labels = self._get_seek_variable_labels()
-
- self._path_or_buf.read(8) # <characteristics>
- self._data_location = self._read_int64() + 6
- self._seek_strls = self._read_int64() + 7
- self._seek_value_labels = self._read_int64() + 14
-
- self._typlist, self._dtyplist = self._get_dtypes(self._seek_vartypes)
-
- self._path_or_buf.seek(self._seek_varnames)
- self._varlist = self._get_varlist()
-
- self._path_or_buf.seek(self._seek_sortlist)
- self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]
-
- self._path_or_buf.seek(self._seek_formats)
- self._fmtlist = self._get_fmtlist()
-
- self._path_or_buf.seek(self._seek_value_label_names)
- self._lbllist = self._get_lbllist()
-
- self._path_or_buf.seek(self._seek_variable_labels)
- self._variable_labels = self._get_variable_labels()
-
- # Get data type information, works for versions 117-119.
- def _get_dtypes(
- self, seek_vartypes: int
- ) -> tuple[list[int | str], list[str | np.dtype]]:
- self._path_or_buf.seek(seek_vartypes)
- raw_typlist = [self._read_uint16() for _ in range(self._nvar)]
-
- def f(typ: int) -> int | str:
- if typ <= 2045:
- return typ
- try:
- return self.TYPE_MAP_XML[typ]
- except KeyError as err:
- raise ValueError(f"cannot convert stata types [{typ}]") from err
-
- typlist = [f(x) for x in raw_typlist]
-
- def g(typ: int) -> str | np.dtype:
- if typ <= 2045:
- return str(typ)
- try:
- return self.DTYPE_MAP_XML[typ]
- except KeyError as err:
- raise ValueError(f"cannot convert stata dtype [{typ}]") from err
-
- dtyplist = [g(x) for x in raw_typlist]
-
- return typlist, dtyplist
-
- def _get_varlist(self) -> list[str]:
- # 33 in older formats, 129 in formats 118 and 119
- b = 33 if self._format_version < 118 else 129
- return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
-
- # Returns the format list
- def _get_fmtlist(self) -> list[str]:
- if self._format_version >= 118:
- b = 57
- elif self._format_version > 113:
- b = 49
- elif self._format_version > 104:
- b = 12
- else:
- b = 7
-
- return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
-
- # Returns the label list
- def _get_lbllist(self) -> list[str]:
- if self._format_version >= 118:
- b = 129
- elif self._format_version > 108:
- b = 33
- else:
- b = 9
- return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
-
- def _get_variable_labels(self) -> list[str]:
- if self._format_version >= 118:
- vlblist = [
- self._decode(self._path_or_buf.read(321)) for _ in range(self._nvar)
- ]
- elif self._format_version > 105:
- vlblist = [
- self._decode(self._path_or_buf.read(81)) for _ in range(self._nvar)
- ]
- else:
- vlblist = [
- self._decode(self._path_or_buf.read(32)) for _ in range(self._nvar)
- ]
- return vlblist
-
- def _get_nobs(self) -> int:
- if self._format_version >= 118:
- return self._read_uint64()
- else:
- return self._read_uint32()
-
- def _get_data_label(self) -> str:
- if self._format_version >= 118:
- strlen = self._read_uint16()
- return self._decode(self._path_or_buf.read(strlen))
- elif self._format_version == 117:
- strlen = self._read_int8()
- return self._decode(self._path_or_buf.read(strlen))
- elif self._format_version > 105:
- return self._decode(self._path_or_buf.read(81))
- else:
- return self._decode(self._path_or_buf.read(32))
-
- def _get_time_stamp(self) -> str:
- if self._format_version >= 118:
- strlen = self._read_int8()
- return self._path_or_buf.read(strlen).decode("utf-8")
- elif self._format_version == 117:
- strlen = self._read_int8()
- return self._decode(self._path_or_buf.read(strlen))
- elif self._format_version > 104:
- return self._decode(self._path_or_buf.read(18))
- else:
- raise ValueError()
-
- def _get_seek_variable_labels(self) -> int:
- if self._format_version == 117:
- self._path_or_buf.read(8) # <variable_labels>, throw away
- # Stata 117 data files do not follow the described format. This is
- # a workaround that uses the previous label, 33 bytes for each
- # variable, 20 for the closing tag and 17 for the opening tag
- return self._seek_value_label_names + (33 * self._nvar) + 20 + 17
- elif self._format_version >= 118:
- return self._read_int64() + 17
- else:
- raise ValueError()
-
- def _read_old_header(self, first_char: bytes) -> None:
- self._format_version = int(first_char[0])
- if self._format_version not in [104, 105, 108, 111, 113, 114, 115]:
- raise ValueError(_version_error.format(version=self._format_version))
- self._set_encoding()
- self._byteorder = ">" if self._read_int8() == 0x1 else "<"
- self._filetype = self._read_int8()
- self._path_or_buf.read(1) # unused
-
- self._nvar = self._read_uint16()
- self._nobs = self._get_nobs()
-
- self._data_label = self._get_data_label()
-
- self._time_stamp = self._get_time_stamp()
-
- # descriptors
- if self._format_version > 108:
- typlist = [int(c) for c in self._path_or_buf.read(self._nvar)]
- else:
- buf = self._path_or_buf.read(self._nvar)
- typlistb = np.frombuffer(buf, dtype=np.uint8)
- typlist = []
- for tp in typlistb:
- if tp in self.OLD_TYPE_MAPPING:
- typlist.append(self.OLD_TYPE_MAPPING[tp])
- else:
- typlist.append(tp - 127) # bytes
-
- try:
- self._typlist = [self.TYPE_MAP[typ] for typ in typlist]
- except ValueError as err:
- invalid_types = ",".join([str(x) for x in typlist])
- raise ValueError(f"cannot convert stata types [{invalid_types}]") from err
- try:
- self._dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
- except ValueError as err:
- invalid_dtypes = ",".join([str(x) for x in typlist])
- raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err
-
- if self._format_version > 108:
- self._varlist = [
- self._decode(self._path_or_buf.read(33)) for _ in range(self._nvar)
- ]
- else:
- self._varlist = [
- self._decode(self._path_or_buf.read(9)) for _ in range(self._nvar)
- ]
- self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]
-
- self._fmtlist = self._get_fmtlist()
-
- self._lbllist = self._get_lbllist()
-
- self._variable_labels = self._get_variable_labels()
-
- # ignore expansion fields (Format 105 and later)
- # When reading, read five bytes; the last four bytes now tell you
- # the size of the next read, which you discard. You then continue
- # like this until you read 5 bytes of zeros.
-
- if self._format_version > 104:
- while True:
- data_type = self._read_int8()
- if self._format_version > 108:
- data_len = self._read_int32()
- else:
- data_len = self._read_int16()
- if data_type == 0:
- break
- self._path_or_buf.read(data_len)
-
- # necessary data to continue parsing
- self._data_location = self._path_or_buf.tell()
-
- def _setup_dtype(self) -> np.dtype:
- """Map between numpy and state dtypes"""
- if self._dtype is not None:
- return self._dtype
-
- dtypes = [] # Convert struct data types to numpy data type
- for i, typ in enumerate(self._typlist):
- if typ in self.NUMPY_TYPE_MAP:
- typ = cast(str, typ) # only strs in NUMPY_TYPE_MAP
- dtypes.append((f"s{i}", f"{self._byteorder}{self.NUMPY_TYPE_MAP[typ]}"))
- else:
- dtypes.append((f"s{i}", f"S{typ}"))
- self._dtype = np.dtype(dtypes)
-
- return self._dtype
-
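- # For example, a little-endian file whose typlist holds one "f" (float)
- # column followed by one str8 column maps to the record dtype
- # np.dtype([("s0", "<f4"), ("s1", "S8")]).
-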
- def _calcsize(self, fmt: int | str) -> int:
- if isinstance(fmt, int):
- return fmt
- return struct.calcsize(self._byteorder + fmt)
-
- def _decode(self, s: bytes) -> str:
- # have bytes not strings, so must decode
- s = s.partition(b"\0")[0]
- try:
- return s.decode(self._encoding)
- except UnicodeDecodeError:
- # GH 25960, fallback to handle incorrect format produced when 117
- # files are converted to 118 files in Stata
- encoding = self._encoding
- msg = f"""
-One or more strings in the dta file could not be decoded using {encoding}, and
-so the fallback encoding of latin-1 is being used. This can happen when a file
-has been incorrectly encoded by Stata or some other software. You should verify
-the string values returned are correct."""
- warnings.warn(
- msg,
- UnicodeWarning,
- stacklevel=find_stack_level(),
- )
- return s.decode("latin-1")
-
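- # For example, self._decode(b"price\x00\x00junk") returns "price":
- # everything after the first null byte is discarded before decoding.
-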
- def _read_value_labels(self) -> None:
- self._ensure_open()
- if self._value_labels_read:
- # Don't read twice
- return
- if self._format_version <= 108:
- # Value labels are not supported in version 108 and earlier.
- self._value_labels_read = True
- self._value_label_dict: dict[str, dict[float, str]] = {}
- return
-
- if self._format_version >= 117:
- self._path_or_buf.seek(self._seek_value_labels)
- else:
- assert self._dtype is not None
- offset = self._nobs * self._dtype.itemsize
- self._path_or_buf.seek(self._data_location + offset)
-
- self._value_labels_read = True
- self._value_label_dict = {}
-
- while True:
- if self._format_version >= 117:
- if self._path_or_buf.read(5) == b"</val": # <lbl>
- break # end of value label table
-
- slength = self._path_or_buf.read(4)
- if not slength:
- break # end of value label table (format < 117)
- if self._format_version <= 117:
- labname = self._decode(self._path_or_buf.read(33))
- else:
- labname = self._decode(self._path_or_buf.read(129))
- self._path_or_buf.read(3) # padding
-
- n = self._read_uint32()
- txtlen = self._read_uint32()
- off = np.frombuffer(
- self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n
- )
- val = np.frombuffer(
- self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n
- )
- ii = np.argsort(off)
- off = off[ii]
- val = val[ii]
- txt = self._path_or_buf.read(txtlen)
- self._value_label_dict[labname] = {}
- for i in range(n):
- end = off[i + 1] if i < n - 1 else txtlen
- self._value_label_dict[labname][val[i]] = self._decode(
- txt[off[i] : end]
- )
- if self._format_version >= 117:
- self._path_or_buf.read(6) # </lbl>
- self._value_labels_read = True
-
- def _read_strls(self) -> None:
- self._path_or_buf.seek(self._seek_strls)
- # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
- self.GSO = {"0": ""}
- while True:
- if self._path_or_buf.read(3) != b"GSO":
- break
-
- if self._format_version == 117:
- v_o = self._read_uint64()
- else:
- buf = self._path_or_buf.read(12)
- # Only tested on little endian file on little endian machine.
- v_size = 2 if self._format_version == 118 else 3
- if self._byteorder == "<":
- buf = buf[0:v_size] + buf[4 : (12 - v_size)]
- else:
- # This path may not be correct, impossible to test
- buf = buf[0:v_size] + buf[(4 + v_size) :]
- v_o = struct.unpack("Q", buf)[0]
- typ = self._read_uint8()
- length = self._read_uint32()
- va = self._path_or_buf.read(length)
- if typ == 130:
- decoded_va = va[0:-1].decode(self._encoding)
- else:
- # Stata says typ 129 can be binary, so use str
- decoded_va = str(va)
- # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
- self.GSO[str(v_o)] = decoded_va
-
- def __next__(self) -> DataFrame:
- self._using_iterator = True
- return self.read(nrows=self._chunksize)
-
- def get_chunk(self, size: int | None = None) -> DataFrame:
- """
- Reads lines from Stata file and returns them as a DataFrame
-
- Parameters
- ----------
- size : int, defaults to None
- Number of lines to read. If None, reads the reader's ``chunksize`` (1 if not set).
-
- Returns
- -------
- DataFrame
- """
- if size is None:
- size = self._chunksize
- return self.read(nrows=size)
-
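- # Sketch of incremental reading with the method above (the file name is
- # illustrative):
- #
- # >>> with StataReader("example.dta", chunksize=500) as reader:
- # ...     first = reader.get_chunk()     # 500 rows
- # ...     rest = reader.get_chunk(100)   # the next 100 rows
-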
- @Appender(_read_method_doc)
- def read(
- self,
- nrows: int | None = None,
- convert_dates: bool | None = None,
- convert_categoricals: bool | None = None,
- index_col: str | None = None,
- convert_missing: bool | None = None,
- preserve_dtypes: bool | None = None,
- columns: Sequence[str] | None = None,
- order_categoricals: bool | None = None,
- ) -> DataFrame:
- self._ensure_open()
- # Handle empty file or chunk. If reading incrementally raise
- # StopIteration. If reading the whole thing return an empty
- # data frame.
- if (self._nobs == 0) and (nrows is None):
- self._can_read_value_labels = True
- self._data_read = True
- return DataFrame(columns=self._varlist)
-
- # Handle options
- if convert_dates is None:
- convert_dates = self._convert_dates
- if convert_categoricals is None:
- convert_categoricals = self._convert_categoricals
- if convert_missing is None:
- convert_missing = self._convert_missing
- if preserve_dtypes is None:
- preserve_dtypes = self._preserve_dtypes
- if columns is None:
- columns = self._columns
- if order_categoricals is None:
- order_categoricals = self._order_categoricals
- if index_col is None:
- index_col = self._index_col
-
- if nrows is None:
- nrows = self._nobs
-
- if (self._format_version >= 117) and (not self._value_labels_read):
- self._can_read_value_labels = True
- self._read_strls()
-
- # Read data
- assert self._dtype is not None
- dtype = self._dtype
- max_read_len = (self._nobs - self._lines_read) * dtype.itemsize
- read_len = nrows * dtype.itemsize
- read_len = min(read_len, max_read_len)
- if read_len <= 0:
- # Iterator has finished, should never be here unless
- # we are reading the file incrementally
- if convert_categoricals:
- self._read_value_labels()
- raise StopIteration
- offset = self._lines_read * dtype.itemsize
- self._path_or_buf.seek(self._data_location + offset)
- read_lines = min(nrows, self._nobs - self._lines_read)
- raw_data = np.frombuffer(
- self._path_or_buf.read(read_len), dtype=dtype, count=read_lines
- )
-
- self._lines_read += read_lines
- if self._lines_read == self._nobs:
- self._can_read_value_labels = True
- self._data_read = True
- # if necessary, swap the byte order to native here
- if self._byteorder != self._native_byteorder:
- raw_data = raw_data.byteswap().newbyteorder()
-
- if convert_categoricals:
- self._read_value_labels()
-
- if len(raw_data) == 0:
- data = DataFrame(columns=self._varlist)
- else:
- data = DataFrame.from_records(raw_data)
- data.columns = Index(self._varlist)
-
- # If index is not specified, use actual row number rather than
- # restarting at 0 for each chunk.
- if index_col is None:
- rng = range(self._lines_read - read_lines, self._lines_read)
- data.index = Index(rng) # set attr instead of set_index to avoid copy
-
- if columns is not None:
- data = self._do_select_columns(data, columns)
-
- # Decode strings
- for col, typ in zip(data, self._typlist):
- if type(typ) is int:
- data[col] = data[col].apply(self._decode, convert_dtype=True)
-
- data = self._insert_strls(data)
-
- cols_ = np.where([dtyp is not None for dtyp in self._dtyplist])[0]
- # Convert columns (if needed) to match input type
- ix = data.index
- requires_type_conversion = False
- data_formatted = []
- for i in cols_:
- if self._dtyplist[i] is not None:
- col = data.columns[i]
- dtype = data[col].dtype
- if dtype != np.dtype(object) and dtype != self._dtyplist[i]:
- requires_type_conversion = True
- data_formatted.append(
- (col, Series(data[col], ix, self._dtyplist[i]))
- )
- else:
- data_formatted.append((col, data[col]))
- if requires_type_conversion:
- data = DataFrame.from_dict(dict(data_formatted))
- del data_formatted
-
- data = self._do_convert_missing(data, convert_missing)
-
- if convert_dates:
-
- def any_startswith(x: str) -> bool:
- return any(x.startswith(fmt) for fmt in _date_formats)
-
- cols = np.where([any_startswith(x) for x in self._fmtlist])[0]
- for i in cols:
- col = data.columns[i]
- data[col] = _stata_elapsed_date_to_datetime_vec(
- data[col], self._fmtlist[i]
- )
-
- if convert_categoricals and self._format_version > 108:
- data = self._do_convert_categoricals(
- data, self._value_label_dict, self._lbllist, order_categoricals
- )
-
- if not preserve_dtypes:
- retyped_data = []
- convert = False
- for col in data:
- dtype = data[col].dtype
- if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
- dtype = np.dtype(np.float64)
- convert = True
- elif dtype in (
- np.dtype(np.int8),
- np.dtype(np.int16),
- np.dtype(np.int32),
- ):
- dtype = np.dtype(np.int64)
- convert = True
- retyped_data.append((col, data[col].astype(dtype)))
- if convert:
- data = DataFrame.from_dict(dict(retyped_data))
-
- if index_col is not None:
- data = data.set_index(data.pop(index_col))
-
- return data
-
- def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
- # Check for missing values, and replace if found
- replacements = {}
- for i, colname in enumerate(data):
- fmt = self._typlist[i]
- if fmt not in self.VALID_RANGE:
- continue
-
- fmt = cast(str, fmt) # only strs in VALID_RANGE
- nmin, nmax = self.VALID_RANGE[fmt]
- series = data[colname]
-
- # appreciably faster to do this with ndarray instead of Series
- svals = series._values
- missing = (svals < nmin) | (svals > nmax)
-
- if not missing.any():
- continue
-
- if convert_missing: # Replacement follows Stata notation
- missing_loc = np.nonzero(np.asarray(missing))[0]
- umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
- replacement = Series(series, dtype=object)
- for j, um in enumerate(umissing):
- missing_value = StataMissingValue(um)
-
- loc = missing_loc[umissing_loc == j]
- replacement.iloc[loc] = missing_value
- else: # All replacements are identical
- dtype = series.dtype
- if dtype not in (np.float32, np.float64):
- dtype = np.float64
- replacement = Series(series, dtype=dtype)
- if not replacement._values.flags["WRITEABLE"]:
- # only relevant for ArrayManager; construction
- # path for BlockManager ensures writeability
- replacement = replacement.copy()
- # Note: operating on ._values is much faster than directly
- # TODO: can we fix that?
- replacement._values[missing] = np.nan
- replacements[colname] = replacement
-
- if replacements:
- for col, value in replacements.items():
- data[col] = value
- return data
-
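- # In short: with convert_missing=False (the default) every Stata missing
- # code becomes np.nan, while convert_missing=True preserves the original
- # codes as StataMissingValue objects in an object-dtype column.
-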
- def _insert_strls(self, data: DataFrame) -> DataFrame:
- if not hasattr(self, "GSO") or len(self.GSO) == 0:
- return data
- for i, typ in enumerate(self._typlist):
- if typ != "Q":
- continue
- # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
- data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]]
- return data
-
- def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame:
- if not self._column_selector_set:
- column_set = set(columns)
- if len(column_set) != len(columns):
- raise ValueError("columns contains duplicate entries")
- unmatched = column_set.difference(data.columns)
- if unmatched:
- joined = ", ".join(list(unmatched))
- raise ValueError(
- "The following columns were not "
- f"found in the Stata data set: {joined}"
- )
- # Copy information for retained columns for later processing
- dtyplist = []
- typlist = []
- fmtlist = []
- lbllist = []
- for col in columns:
- i = data.columns.get_loc(col)
- dtyplist.append(self._dtyplist[i])
- typlist.append(self._typlist[i])
- fmtlist.append(self._fmtlist[i])
- lbllist.append(self._lbllist[i])
-
- self._dtyplist = dtyplist
- self._typlist = typlist
- self._fmtlist = fmtlist
- self._lbllist = lbllist
- self._column_selector_set = True
-
- return data[columns]
-
- def _do_convert_categoricals(
- self,
- data: DataFrame,
- value_label_dict: dict[str, dict[float, str]],
- lbllist: Sequence[str],
- order_categoricals: bool,
- ) -> DataFrame:
- """
- Converts categorical columns to Categorical type.
- """
- value_labels = list(value_label_dict.keys())
- cat_converted_data = []
- for col, label in zip(data, lbllist):
- if label in value_labels:
- # Explicit call with ordered=True
- vl = value_label_dict[label]
- keys = np.array(list(vl.keys()))
- column = data[col]
- key_matches = column.isin(keys)
- if self._using_iterator and key_matches.all():
- initial_categories: np.ndarray | None = keys
- # If all categories are in the keys and we are iterating,
- # use the same keys for all chunks. If some are missing
- # value labels, then we will fall back to the categories
- # varying across chunks.
- else:
- if self._using_iterator:
- # warn if using an iterator
- warnings.warn(
- categorical_conversion_warning,
- CategoricalConversionWarning,
- stacklevel=find_stack_level(),
- )
- initial_categories = None
- cat_data = Categorical(
- column, categories=initial_categories, ordered=order_categoricals
- )
- if initial_categories is None:
- # If None here, then we need to match the cats in the Categorical
- categories = []
- for category in cat_data.categories:
- if category in vl:
- categories.append(vl[category])
- else:
- categories.append(category)
- else:
- # If all cats are matched, we can use the values
- categories = list(vl.values())
- try:
- # Try to catch duplicate categories
- # TODO: if we get a non-copying rename_categories, use that
- cat_data = cat_data.rename_categories(categories)
- except ValueError as err:
- vc = Series(categories, copy=False).value_counts()
- repeated_cats = list(vc.index[vc > 1])
- repeats = "-" * 80 + "\n" + "\n".join(repeated_cats)
- # GH 25772
- msg = f"""
-Value labels for column {col} are not unique. These cannot be converted to
-pandas categoricals.
-
-Either read the file with `convert_categoricals` set to False or use the
-low level interface in `StataReader` to separately read the values and the
-value_labels.
-
-The repeated labels are:
-{repeats}
-"""
- raise ValueError(msg) from err
- # TODO: is the next line needed above in the data(...) method?
- cat_series = Series(cat_data, index=data.index, copy=False)
- cat_converted_data.append((col, cat_series))
- else:
- cat_converted_data.append((col, data[col]))
- data = DataFrame(dict(cat_converted_data), copy=False)
- return data
-
- @property
- def data_label(self) -> str:
- """
- Return data label of Stata file.
- """
- self._ensure_open()
- return self._data_label
-
- @property
- def time_stamp(self) -> str:
- """
- Return time stamp of Stata file.
- """
- self._ensure_open()
- return self._time_stamp
-
- def variable_labels(self) -> dict[str, str]:
- """
- Return a dict associating each variable name with corresponding label.
-
- Returns
- -------
- dict
- """
- self._ensure_open()
- return dict(zip(self._varlist, self._variable_labels))
-
- def value_labels(self) -> dict[str, dict[float, str]]:
- """
- Return a nested dict associating each variable name to its value and label.
-
- Returns
- -------
- dict
- """
- if not self._value_labels_read:
- self._read_value_labels()
-
- return self._value_label_dict
-
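- # Metadata access sketch (the file name is illustrative); the reader is
- # expected to be used as a context manager:
- #
- # >>> with StataReader("example.dta") as reader:
- # ...     df = reader.read()
- # ...     labels = reader.variable_labels()
- # ...     values = reader.value_labels()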
-
-@Appender(_read_stata_doc)
-def read_stata(
- filepath_or_buffer: FilePath | ReadBuffer[bytes],
- *,
- convert_dates: bool = True,
- convert_categoricals: bool = True,
- index_col: str | None = None,
- convert_missing: bool = False,
- preserve_dtypes: bool = True,
- columns: Sequence[str] | None = None,
- order_categoricals: bool = True,
- chunksize: int | None = None,
- iterator: bool = False,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
-) -> DataFrame | StataReader:
- reader = StataReader(
- filepath_or_buffer,
- convert_dates=convert_dates,
- convert_categoricals=convert_categoricals,
- index_col=index_col,
- convert_missing=convert_missing,
- preserve_dtypes=preserve_dtypes,
- columns=columns,
- order_categoricals=order_categoricals,
- chunksize=chunksize,
- storage_options=storage_options,
- compression=compression,
- )
-
- if iterator or chunksize:
- return reader
-
- with reader:
- return reader.read()
-
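- # Usage sketch for the two return types above (the file name is
- # illustrative):
- #
- # >>> df = read_stata("example.dta")  # whole file as a DataFrame
- # >>> with read_stata("example.dta", chunksize=10_000) as reader:
- # ...     for chunk in reader:  # StataReader used as an iterator
- # ...         print(chunk.shape)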
-
-def _set_endianness(endianness: str) -> str:
- if endianness.lower() in ["<", "little"]:
- return "<"
- elif endianness.lower() in [">", "big"]:
- return ">"
- else: # pragma : no cover
- raise ValueError(f"Endianness {endianness} not understood")
-
-
-def _pad_bytes(name: AnyStr, length: int) -> AnyStr:
- """
- Take a char string and pad it with null bytes until it is `length` chars long.
- """
- if isinstance(name, bytes):
- return name + b"\x00" * (length - len(name))
- return name + "\x00" * (length - len(name))
-
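- # For example, _pad_bytes("ab", 4) returns "ab\x00\x00" and
- # _pad_bytes(b"ab", 4) returns b"ab\x00\x00".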
-
-def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
- """
- Convert from one of the stata date formats to a type in TYPE_MAP.
- """
- if fmt in [
- "tc",
- "%tc",
- "td",
- "%td",
- "tw",
- "%tw",
- "tm",
- "%tm",
- "tq",
- "%tq",
- "th",
- "%th",
- "ty",
- "%ty",
- ]:
- return np.dtype(np.float64) # Stata expects doubles for SIFs
- else:
- raise NotImplementedError(f"Format {fmt} not implemented")
-
-
-def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict:
- new_dict = {}
- for key in convert_dates:
- if not convert_dates[key].startswith("%"): # make sure proper fmts
- convert_dates[key] = "%" + convert_dates[key]
- if key in varlist:
- new_dict.update({varlist.index(key): convert_dates[key]})
- else:
- if not isinstance(key, int):
- raise ValueError("convert_dates key must be a column or an integer")
- new_dict.update({key: convert_dates[key]})
- return new_dict
-
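- # For example, _maybe_convert_to_int_keys({"date": "tc"}, ["x", "date"])
- # returns {1: "%tc"}: the format gains a leading "%" and the column name
- # is replaced by its position in varlist.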
-
-def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int:
- """
- Convert dtype types to stata types. Returns the byte of the given ordinal.
- See TYPE_MAP and comments for an explanation. This is also explained in
- the dta spec.
- 1 - 244 are strings of this length
- Pandas Stata
- 251 - for int8 byte
- 252 - for int16 int
- 253 - for int32 long
- 254 - for float32 float
- 255 - for float64 double
-
- If there are dates to convert, then dtype will already have the correct
- type inserted.
- """
- # TODO: expand to handle datetime to integer conversion
- if dtype.type is np.object_: # try to coerce it to the biggest string
- # not memory efficient, what else could we
- # do?
- itemsize = max_len_string_array(ensure_object(column._values))
- return max(itemsize, 1)
- elif dtype.type is np.float64:
- return 255
- elif dtype.type is np.float32:
- return 254
- elif dtype.type is np.int32:
- return 253
- elif dtype.type is np.int16:
- return 252
- elif dtype.type is np.int8:
- return 251
- else: # pragma : no cover
- raise NotImplementedError(f"Data type {dtype} not supported.")
-
-
-def _dtype_to_default_stata_fmt(
- dtype, column: Series, dta_version: int = 114, force_strl: bool = False
-) -> str:
- """
- Map numpy dtype to stata's default format for this type. Not terribly
- important since users can change this in Stata. Semantics are
-
- object -> "%DDs" where DD is the length of the string. If not a string,
- raise ValueError
- float64 -> "%10.0g"
- float32 -> "%9.0g"
- int64 -> "%9.0g"
- int32 -> "%12.0g"
- int16 -> "%8.0g"
- int8 -> "%8.0g"
- strl -> "%9s"
- """
- # TODO: Refactor to combine type with format
- # TODO: expand this to handle a default datetime format?
- if dta_version < 117:
- max_str_len = 244
- else:
- max_str_len = 2045
- if force_strl:
- return "%9s"
- if dtype.type is np.object_:
- itemsize = max_len_string_array(ensure_object(column._values))
- if itemsize > max_str_len:
- if dta_version >= 117:
- return "%9s"
- else:
- raise ValueError(excessive_string_length_error.format(column.name))
- return "%" + str(max(itemsize, 1)) + "s"
- elif dtype == np.float64:
- return "%10.0g"
- elif dtype == np.float32:
- return "%9.0g"
- elif dtype == np.int32:
- return "%12.0g"
- elif dtype in (np.int8, np.int16):
- return "%8.0g"
- else: # pragma : no cover
- raise NotImplementedError(f"Data type {dtype} not supported.")
-
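- # For example, np.dtype(np.float64) maps to "%10.0g", while an object
- # column whose longest string is 12 characters maps to "%12s".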
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "fname",
-)
-class StataWriter(StataParser):
- """
- A class for writing Stata binary dta files
-
- Parameters
- ----------
- fname : path (string), buffer or path object
- string, path object (pathlib.Path or py._path.local.LocalPath) or
- object implementing a binary write() functions. If using a buffer
- then the buffer will not be automatically closed after the file
- is written.
- data : DataFrame
- Input to save
- convert_dates : dict
- Dictionary mapping columns containing datetime types to stata internal
- format to use when writing the dates. Options are 'tc', 'td', 'tm',
- 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
- Datetime columns that do not have a conversion type specified will be
- converted to 'tc'. Raises NotImplementedError if a datetime column has
- timezone information
- write_index : bool
- Write the index to Stata dataset.
- byteorder : str
- Can be ">", "<", "little", or "big". default is `sys.byteorder`
- time_stamp : datetime
- A datetime to use as file creation date. Default is the current time
- data_label : str
- A label for the data set. Must be 80 characters or smaller.
- variable_labels : dict
- Dictionary containing columns as keys and variable labels as values.
- Each label must be 80 characters or smaller.
- {compression_options}
-
- .. versionadded:: 1.1.0
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- {storage_options}
-
- .. versionadded:: 1.2.0
-
- value_labels : dict of dicts
- Dictionary containing columns as keys and dictionaries of column value
- to labels as values. The combined length of all labels for a single
- variable must be 32,000 characters or smaller.
-
- .. versionadded:: 1.4.0
-
- Returns
- -------
- writer : StataWriter instance
- The StataWriter instance has a write_file method, which will
- write the file to the given `fname`.
-
- Raises
- ------
- NotImplementedError
- * If datetimes contain timezone information
- ValueError
- * Columns listed in convert_dates are neither datetime64[ns]
- nor datetime.datetime
- * Column dtype is not representable in Stata
- * Column listed in convert_dates is not in DataFrame
- * Categorical label contains more than 32,000 characters
-
- Examples
- --------
- >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
- >>> writer = StataWriter('./data_file.dta', data)
- >>> writer.write_file()
-
- Directly write a zip file
- >>> compression = {{"method": "zip", "archive_name": "data_file.dta"}}
- >>> writer = StataWriter('./data_file.zip', data, compression=compression)
- >>> writer.write_file()
-
- Save a DataFrame with dates
- >>> from datetime import datetime
- >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
- >>> writer = StataWriter('./date_data_file.dta', data, {{'date' : 'tw'}})
- >>> writer.write_file()
- """
-
- _max_string_length = 244
- _encoding: Literal["latin-1", "utf-8"] = "latin-1"
-
- def __init__(
- self,
- fname: FilePath | WriteBuffer[bytes],
- data: DataFrame,
- convert_dates: dict[Hashable, str] | None = None,
- write_index: bool = True,
- byteorder: str | None = None,
- time_stamp: datetime.datetime | None = None,
- data_label: str | None = None,
- variable_labels: dict[Hashable, str] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- *,
- value_labels: dict[Hashable, dict[float, str]] | None = None,
- ) -> None:
- super().__init__()
- self.data = data
- self._convert_dates = {} if convert_dates is None else convert_dates
- self._write_index = write_index
- self._time_stamp = time_stamp
- self._data_label = data_label
- self._variable_labels = variable_labels
- self._non_cat_value_labels = value_labels
- self._value_labels: list[StataValueLabel] = []
- self._has_value_labels = np.array([], dtype=bool)
- self._compression = compression
- self._output_file: IO[bytes] | None = None
- self._converted_names: dict[Hashable, str] = {}
- # attach nobs, nvars, data, varlist, typlist
- self._prepare_pandas(data)
- self.storage_options = storage_options
-
- if byteorder is None:
- byteorder = sys.byteorder
- self._byteorder = _set_endianness(byteorder)
- self._fname = fname
- self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
-
- def _write(self, to_write: str) -> None:
- """
- Helper to call encode before writing to file for Python 3 compat.
- """
- self.handles.handle.write(to_write.encode(self._encoding))
-
- def _write_bytes(self, value: bytes) -> None:
- """
- Helper to assert file is open before writing.
- """
- self.handles.handle.write(value)
-
- def _prepare_non_cat_value_labels(
- self, data: DataFrame
- ) -> list[StataNonCatValueLabel]:
- """
- Check for value labels provided for non-categorical columns.
- """
- non_cat_value_labels: list[StataNonCatValueLabel] = []
- if self._non_cat_value_labels is None:
- return non_cat_value_labels
-
- for labname, labels in self._non_cat_value_labels.items():
- if labname in self._converted_names:
- colname = self._converted_names[labname]
- elif labname in data.columns:
- colname = str(labname)
- else:
- raise KeyError(
- f"Can't create value labels for {labname}, it wasn't "
- "found in the dataset."
- )
-
- if not is_numeric_dtype(data[colname].dtype):
- # Labels should not be passed explicitly for categorical
- # columns that will be converted to int
- raise ValueError(
- f"Can't create value labels for {labname}, value labels "
- "can only be applied to numeric columns."
- )
- svl = StataNonCatValueLabel(colname, labels, self._encoding)
- non_cat_value_labels.append(svl)
- return non_cat_value_labels
-
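- # Expected input shape (column name illustrative): the writer-level
- # value_labels argument maps column names to {value: label} dicts, e.g.
- # {"gender": {0: "male", 1: "female"}}.
-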
- def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
- """
- Check for categorical columns, retain categorical information for
- Stata file and convert categorical data to int
- """
- is_cat = [is_categorical_dtype(data[col].dtype) for col in data]
- if not any(is_cat):
- return data
-
- self._has_value_labels |= np.array(is_cat)
-
- get_base_missing_value = StataMissingValue.get_base_missing_value
- data_formatted = []
- for col, col_is_cat in zip(data, is_cat):
- if col_is_cat:
- svl = StataValueLabel(data[col], encoding=self._encoding)
- self._value_labels.append(svl)
- dtype = data[col].cat.codes.dtype
- if dtype == np.int64:
- raise ValueError(
- "It is not possible to export "
- "int64-based categorical data to Stata."
- )
- values = data[col].cat.codes._values.copy()
-
- # Upcast if needed so that correct missing values can be set
- if values.max() >= get_base_missing_value(dtype):
- if dtype == np.int8:
- dtype = np.dtype(np.int16)
- elif dtype == np.int16:
- dtype = np.dtype(np.int32)
- else:
- dtype = np.dtype(np.float64)
- values = np.array(values, dtype=dtype)
-
- # Replace missing values with Stata missing value for type
- values[values == -1] = get_base_missing_value(dtype)
- data_formatted.append((col, values))
- else:
- data_formatted.append((col, data[col]))
- return DataFrame.from_dict(dict(data_formatted))
-
- def _replace_nans(self, data: DataFrame) -> DataFrame:
- """
- Checks floating point data columns for nans, and replaces these with
- the generic Stata missing value (.)
- """
- for c in data:
- dtype = data[c].dtype
- if dtype in (np.float32, np.float64):
- if dtype == np.float32:
- replacement = self.MISSING_VALUES["f"]
- else:
- replacement = self.MISSING_VALUES["d"]
- data[c] = data[c].fillna(replacement)
-
- return data
-
- def _update_strl_names(self) -> None:
- """No-op, forward compatibility"""
-
- def _validate_variable_name(self, name: str) -> str:
- """
- Validate variable names for Stata export.
-
- Parameters
- ----------
- name : str
- Variable name
-
- Returns
- -------
- str
- The validated name with invalid characters replaced with
- underscores.
-
- Notes
- -----
- Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9
- and _.
- """
- for c in name:
- if (
- (c < "A" or c > "Z")
- and (c < "a" or c > "z")
- and (c < "0" or c > "9")
- and c != "_"
- ):
- name = name.replace(c, "_")
- return name
-
- def _check_column_names(self, data: DataFrame) -> DataFrame:
- """
- Checks column names to ensure that they are valid Stata column names.
- This includes checks for:
- * Non-string names
- * Stata keywords
- * Variables that start with numbers
- * Variables with names that are too long
-
- When an illegal variable name is detected, it is converted, and if
- dates are exported, the variable name is propagated to the date
- conversion dictionary
- """
- converted_names: dict[Hashable, str] = {}
- columns = list(data.columns)
- original_columns = columns[:]
-
- duplicate_var_id = 0
- for j, name in enumerate(columns):
- orig_name = name
- if not isinstance(name, str):
- name = str(name)
-
- name = self._validate_variable_name(name)
-
- # Variable name must not be a reserved word
- if name in self.RESERVED_WORDS:
- name = "_" + name
-
- # Variable name may not start with a number
- if "0" <= name[0] <= "9":
- name = "_" + name
-
- name = name[: min(len(name), 32)]
-
- if not name == orig_name:
- # check for duplicates
- while columns.count(name) > 0:
- # prepend ascending number to avoid duplicates
- name = "_" + str(duplicate_var_id) + name
- name = name[: min(len(name), 32)]
- duplicate_var_id += 1
- converted_names[orig_name] = name
-
- columns[j] = name
-
- data.columns = Index(columns)
-
- # Check date conversion, and fix key if needed
- if self._convert_dates:
- for c, o in zip(columns, original_columns):
- if c != o:
- self._convert_dates[c] = self._convert_dates[o]
- del self._convert_dates[o]
-
- if converted_names:
- conversion_warning = []
- for orig_name, name in converted_names.items():
- msg = f"{orig_name} -> {name}"
- conversion_warning.append(msg)
-
- ws = invalid_name_doc.format("\n ".join(conversion_warning))
- warnings.warn(
- ws,
- InvalidColumnName,
- stacklevel=find_stack_level(),
- )
-
- self._converted_names = converted_names
- self._update_strl_names()
-
- return data
-
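- # For example (column names illustrative): "if" is a reserved word and
- # becomes "_if", "1st" becomes "_1st", "my col" becomes "my_col", and
- # each rename is reported through an InvalidColumnName warning.
-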
- def _set_formats_and_types(self, dtypes: Series) -> None:
- self.fmtlist: list[str] = []
- self.typlist: list[int] = []
- for col, dtype in dtypes.items():
- self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col]))
- self.typlist.append(_dtype_to_stata_type(dtype, self.data[col]))
-
- def _prepare_pandas(self, data: DataFrame) -> None:
- # NOTE: we might need a different API / class for pandas objects so
- # we can set different semantics - handle this with a PR to pandas.io
-
- data = data.copy()
-
- if self._write_index:
- temp = data.reset_index()
- if isinstance(temp, DataFrame):
- data = temp
-
- # Ensure column names are strings
- data = self._check_column_names(data)
-
- # Check columns for compatibility with stata, upcast if necessary
- # Raise if outside the supported range
- data = _cast_to_stata_types(data)
-
- # Replace NaNs with Stata missing values
- data = self._replace_nans(data)
-
- # Set all columns to initially unlabelled
- self._has_value_labels = np.repeat(False, data.shape[1])
-
- # Create value labels for non-categorical data
- non_cat_value_labels = self._prepare_non_cat_value_labels(data)
-
- non_cat_columns = [svl.labname for svl in non_cat_value_labels]
- has_non_cat_val_labels = data.columns.isin(non_cat_columns)
- self._has_value_labels |= has_non_cat_val_labels
- self._value_labels.extend(non_cat_value_labels)
-
- # Convert categoricals to int data, and strip labels
- data = self._prepare_categoricals(data)
-
- self.nobs, self.nvar = data.shape
- self.data = data
- self.varlist = data.columns.tolist()
-
- dtypes = data.dtypes
-
- # Ensure all date columns are converted
- for col in data:
- if col in self._convert_dates:
- continue
- if is_datetime64_dtype(data[col]):
- self._convert_dates[col] = "tc"
-
- self._convert_dates = _maybe_convert_to_int_keys(
- self._convert_dates, self.varlist
- )
- for key in self._convert_dates:
- new_type = _convert_datetime_to_stata_type(self._convert_dates[key])
- dtypes[key] = np.dtype(new_type)
-
- # Verify object arrays are strings and encode to bytes
- self._encode_strings()
-
- self._set_formats_and_types(dtypes)
-
- # set the given format for the datetime cols
- if self._convert_dates is not None:
- for key in self._convert_dates:
- if isinstance(key, int):
- self.fmtlist[key] = self._convert_dates[key]
-
- def _encode_strings(self) -> None:
- """
- Encode strings in dta-specific encoding
-
- Do not encode columns marked for date conversion or for strL
- conversion. The strL converter independently handles conversion and
- also accepts empty string arrays.
- """
- convert_dates = self._convert_dates
- # _convert_strl is not available in dta 114
- convert_strl = getattr(self, "_convert_strl", [])
- for i, col in enumerate(self.data):
- # Skip columns marked for date conversion or strl conversion
- if i in convert_dates or col in convert_strl:
- continue
- column = self.data[col]
- dtype = column.dtype
- if dtype.type is np.object_:
- inferred_dtype = infer_dtype(column, skipna=True)
- if not ((inferred_dtype == "string") or len(column) == 0):
- col = column.name
- raise ValueError(
- f"""\
-Column `{col}` cannot be exported.\n\nOnly string-like object arrays
-containing all strings or a mix of strings and None can be exported.
-Object arrays containing only null values are prohibited. Other object
-types cannot be exported and must first be converted to one of the
-supported types."""
- )
- encoded = self.data[col].str.encode(self._encoding)
- # If larger than _max_string_length do nothing
- if (
- max_len_string_array(ensure_object(encoded._values))
- <= self._max_string_length
- ):
- self.data[col] = encoded
-
- def write_file(self) -> None:
- """
- Export DataFrame object to Stata dta format.
- """
- with get_handle(
- self._fname,
- "wb",
- compression=self._compression,
- is_text=False,
- storage_options=self.storage_options,
- ) as self.handles:
- if self.handles.compression["method"] is not None:
- # ZipFile creates a file (with the same name) for each write call.
- # Write it first into a buffer and then write the buffer to the ZipFile.
- self._output_file, self.handles.handle = self.handles.handle, BytesIO()
- self.handles.created_handles.append(self.handles.handle)
-
- try:
- self._write_header(
- data_label=self._data_label, time_stamp=self._time_stamp
- )
- self._write_map()
- self._write_variable_types()
- self._write_varnames()
- self._write_sortlist()
- self._write_formats()
- self._write_value_label_names()
- self._write_variable_labels()
- self._write_expansion_fields()
- self._write_characteristics()
- records = self._prepare_data()
- self._write_data(records)
- self._write_strls()
- self._write_value_labels()
- self._write_file_close_tag()
- self._write_map()
- self._close()
- except Exception as exc:
- self.handles.close()
- if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile(
- self._fname
- ):
- try:
- os.unlink(self._fname)
- except OSError:
- warnings.warn(
- f"This save was not successful but {self._fname} could not "
- "be deleted. This file is not valid.",
- ResourceWarning,
- stacklevel=find_stack_level(),
- )
- raise exc
-
- def _close(self) -> None:
- """
- Close the file if it was created by the writer.
-
- If a buffer or file-like object was passed in, for example a GzipFile,
- then leave this file open for the caller to close.
- """
- # write compression
- if self._output_file is not None:
- assert isinstance(self.handles.handle, BytesIO)
- bio, self.handles.handle = self.handles.handle, self._output_file
- self.handles.handle.write(bio.getvalue())
-
- def _write_map(self) -> None:
- """No-op, future compatibility"""
-
- def _write_file_close_tag(self) -> None:
- """No-op, future compatibility"""
-
- def _write_characteristics(self) -> None:
- """No-op, future compatibility"""
-
- def _write_strls(self) -> None:
- """No-op, future compatibility"""
-
- def _write_expansion_fields(self) -> None:
- """Write 5 zeros for expansion fields"""
- self._write(_pad_bytes("", 5))
-
- def _write_value_labels(self) -> None:
- for vl in self._value_labels:
- self._write_bytes(vl.generate_value_label(self._byteorder))
-
- def _write_header(
- self,
- data_label: str | None = None,
- time_stamp: datetime.datetime | None = None,
- ) -> None:
- byteorder = self._byteorder
- # ds_format - just use 114
- self._write_bytes(struct.pack("b", 114))
- # byteorder
- self._write(byteorder == ">" and "\x01" or "\x02")
- # filetype
- self._write("\x01")
- # unused
- self._write("\x00")
- # number of vars, 2 bytes
- self._write_bytes(struct.pack(byteorder + "h", self.nvar)[:2])
- # number of obs, 4 bytes
- self._write_bytes(struct.pack(byteorder + "i", self.nobs)[:4])
- # data label 81 bytes, char, null terminated
- if data_label is None:
- self._write_bytes(self._null_terminate_bytes(_pad_bytes("", 80)))
- else:
- self._write_bytes(
- self._null_terminate_bytes(_pad_bytes(data_label[:80], 80))
- )
- # time stamp, 18 bytes, char, null terminated
- # format dd Mon yyyy hh:mm
- if time_stamp is None:
- time_stamp = datetime.datetime.now()
- elif not isinstance(time_stamp, datetime.datetime):
- raise ValueError("time_stamp should be datetime type")
- # GH #13856
- # Avoid locale-specific month conversion
- months = [
- "Jan",
- "Feb",
- "Mar",
- "Apr",
- "May",
- "Jun",
- "Jul",
- "Aug",
- "Sep",
- "Oct",
- "Nov",
- "Dec",
- ]
- month_lookup = {i + 1: month for i, month in enumerate(months)}
- ts = (
- time_stamp.strftime("%d ")
- + month_lookup[time_stamp.month]
- + time_stamp.strftime(" %Y %H:%M")
- )
- self._write_bytes(self._null_terminate_bytes(ts))
-
- def _write_variable_types(self) -> None:
- for typ in self.typlist:
- self._write_bytes(struct.pack("B", typ))
-
- def _write_varnames(self) -> None:
- # varlist names are checked by _check_column_names
- # varlist, requires null terminated
- for name in self.varlist:
- name = self._null_terminate_str(name)
- name = _pad_bytes(name[:32], 33)
- self._write(name)
-
- def _write_sortlist(self) -> None:
- # srtlist, 2*(nvar+1), int array, encoded by byteorder
- srtlist = _pad_bytes("", 2 * (self.nvar + 1))
- self._write(srtlist)
-
- def _write_formats(self) -> None:
- # fmtlist, 49*nvar, char array
- for fmt in self.fmtlist:
- self._write(_pad_bytes(fmt, 49))
-
- def _write_value_label_names(self) -> None:
- # lbllist, 33*nvar, char array
- for i in range(self.nvar):
- # Use variable name when categorical
- if self._has_value_labels[i]:
- name = self.varlist[i]
- name = self._null_terminate_str(name)
- name = _pad_bytes(name[:32], 33)
- self._write(name)
- else: # Default is empty label
- self._write(_pad_bytes("", 33))
-
- def _write_variable_labels(self) -> None:
- # Missing labels are 80 blank characters plus null termination
- blank = _pad_bytes("", 81)
-
- if self._variable_labels is None:
- for i in range(self.nvar):
- self._write(blank)
- return
-
- for col in self.data:
- if col in self._variable_labels:
- label = self._variable_labels[col]
- if len(label) > 80:
- raise ValueError("Variable labels must be 80 characters or fewer")
- is_latin1 = all(ord(c) < 256 for c in label)
- if not is_latin1:
- raise ValueError(
- "Variable labels must contain only characters that "
- "can be encoded in Latin-1"
- )
- self._write(_pad_bytes(label, 81))
- else:
- self._write(blank)
-
- def _convert_strls(self, data: DataFrame) -> DataFrame:
- """No-op, future compatibility"""
- return data
-
- def _prepare_data(self) -> np.recarray:
- data = self.data
- typlist = self.typlist
- convert_dates = self._convert_dates
- # 1. Convert dates
- if self._convert_dates is not None:
- for i, col in enumerate(data):
- if i in convert_dates:
- data[col] = _datetime_to_stata_elapsed_vec(
- data[col], self.fmtlist[i]
- )
- # 2. Convert strls
- data = self._convert_strls(data)
-
- # 3. Convert bad string data to '' and pad to correct length
- dtypes = {}
- native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
- for i, col in enumerate(data):
- typ = typlist[i]
- if typ <= self._max_string_length:
- data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,))
- stype = f"S{typ}"
- dtypes[col] = stype
- data[col] = data[col].astype(stype)
- else:
- dtype = data[col].dtype
- if not native_byteorder:
- dtype = dtype.newbyteorder(self._byteorder)
- dtypes[col] = dtype
-
- return data.to_records(index=False, column_dtypes=dtypes)
-
- def _write_data(self, records: np.recarray) -> None:
- self._write_bytes(records.tobytes())
-
- @staticmethod
- def _null_terminate_str(s: str) -> str:
- s += "\x00"
- return s
-
- def _null_terminate_bytes(self, s: str) -> bytes:
- return self._null_terminate_str(s).encode(self._encoding)
-
-
-def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) -> int:
- """
- Converts dtype types to stata types. Returns the byte of the given ordinal.
- See TYPE_MAP and comments for an explanation. This is also explained in
- the dta spec.
-    1 - 2045 are strings of this length
-                Pandas    Stata
-    32768 - for object    strL
-    65526 - for float64   double
-    65527 - for float32   float
-    65528 - for int32     long
-    65529 - for int16     int
-    65530 - for int8      byte
-
- If there are dates to convert, then dtype will already have the correct
- type inserted.
- """
- # TODO: expand to handle datetime to integer conversion
- if force_strl:
- return 32768
- if dtype.type is np.object_: # try to coerce it to the biggest string
- # not memory efficient, what else could we
- # do?
- itemsize = max_len_string_array(ensure_object(column._values))
- itemsize = max(itemsize, 1)
- if itemsize <= 2045:
- return itemsize
- return 32768
- elif dtype.type is np.float64:
- return 65526
- elif dtype.type is np.float32:
- return 65527
- elif dtype.type is np.int32:
- return 65528
- elif dtype.type is np.int16:
- return 65529
- elif dtype.type is np.int8:
- return 65530
- else: # pragma : no cover
- raise NotImplementedError(f"Data type {dtype} not supported.")
-
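
As a standalone reference for the numeric branch above (a sketch, not part of the module; the dict name is illustrative), the dta-117 codes can be tabulated directly; fixed-width strings keep their length (1-2045) and anything longer becomes strL (32768):

import numpy as np

STATA_117_NUMERIC_TYPES = {
    np.dtype(np.float64): 65526,   # Stata double
    np.dtype(np.float32): 65527,   # Stata float
    np.dtype(np.int32): 65528,     # Stata long
    np.dtype(np.int16): 65529,     # Stata int
    np.dtype(np.int8): 65530,      # Stata byte
}

print(STATA_117_NUMERIC_TYPES[np.dtype("float64")])   # 65526
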
-
-def _pad_bytes_new(name: str | bytes, length: int) -> bytes:
- """
-    Takes a str or bytes instance and pads it with null bytes until it is
-    ``length`` bytes long.
- """
- if isinstance(name, str):
- name = bytes(name, "utf-8")
- return name + b"\x00" * (length - len(name))
-
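
A quick usage sketch for the helper above (assuming it is in scope): strings are utf-8 encoded and then right-padded with NUL bytes to the requested length.

padded = _pad_bytes_new("abc", 6)
print(padded)              # b'abc\x00\x00\x00'
assert len(padded) == 6
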
-
-class StataStrLWriter:
- """
- Converter for Stata StrLs
-
- Stata StrLs map 8 byte values to strings which are stored using a
- dictionary-like format where strings are keyed to two values.
-
- Parameters
- ----------
- df : DataFrame
- DataFrame to convert
- columns : Sequence[str]
-        List of column names to convert to StrL
- version : int, optional
- dta version. Currently supports 117, 118 and 119
- byteorder : str, optional
- Can be ">", "<", "little", or "big". default is `sys.byteorder`
-
- Notes
- -----
- Supports creation of the StrL block of a dta file for dta versions
- 117, 118 and 119. These differ in how the GSO is stored. 118 and
- 119 store the GSO lookup value as a uint32 and a uint64, while 117
- uses two uint32s. 118 and 119 also encode all strings as unicode
-    which is required by the format. 117 uses 'latin-1', a fixed-width
-    encoding that extends the 7-bit ASCII table with an additional 128
-    characters.
- """
-
- def __init__(
- self,
- df: DataFrame,
- columns: Sequence[str],
- version: int = 117,
- byteorder: str | None = None,
- ) -> None:
- if version not in (117, 118, 119):
- raise ValueError("Only dta versions 117, 118 and 119 supported")
- self._dta_ver = version
-
- self.df = df
- self.columns = columns
- self._gso_table = {"": (0, 0)}
- if byteorder is None:
- byteorder = sys.byteorder
- self._byteorder = _set_endianness(byteorder)
-
- gso_v_type = "I" # uint32
- gso_o_type = "Q" # uint64
- self._encoding = "utf-8"
- if version == 117:
- o_size = 4
- gso_o_type = "I" # 117 used uint32
- self._encoding = "latin-1"
- elif version == 118:
- o_size = 6
- else: # version == 119
- o_size = 5
-        self._o_offset = 2 ** (8 * (8 - o_size))
- self._gso_o_type = gso_o_type
- self._gso_v_type = gso_v_type
-
- def _convert_key(self, key: tuple[int, int]) -> int:
- v, o = key
-        return v + self._o_offset * o
-
- def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
- """
- Generates the GSO lookup table for the DataFrame
-
- Returns
- -------
- gso_table : dict
- Ordered dictionary using the string found as keys
- and their lookup position (v,o) as values
- gso_df : DataFrame
- DataFrame where strl columns have been converted to
- (v,o) values
-
- Notes
- -----
- Modifies the DataFrame in-place.
-
- The DataFrame returned encodes the (v,o) values as uint64s. The
- encoding depends on the dta version, and can be expressed as
-
-        enc = v + o * 2 ** (8 * (8 - o_size))
-
-        so that v is stored in the lower (8 - o_size) bytes and o in the
-        upper o_size bytes. o_size is
-
- * 117: 4
- * 118: 6
- * 119: 5
- """
- gso_table = self._gso_table
- gso_df = self.df
- columns = list(gso_df.columns)
- selected = gso_df[self.columns]
- col_index = [(col, columns.index(col)) for col in self.columns]
- keys = np.empty(selected.shape, dtype=np.uint64)
- for o, (idx, row) in enumerate(selected.iterrows()):
- for j, (col, v) in enumerate(col_index):
- val = row[col]
- # Allow columns with mixed str and None (GH 23633)
- val = "" if val is None else val
- key = gso_table.get(val, None)
- if key is None:
-                    # Stata uses 1-based (v, o) indices
- key = (v + 1, o + 1)
- gso_table[val] = key
- keys[o, j] = self._convert_key(key)
- for i, col in enumerate(self.columns):
- gso_df[col] = keys[:, i]
-
- return gso_table, gso_df
-
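
To make the (v, o) packing concrete, here is a standalone sketch for dta 117, where v occupies the low 4 bytes and o the high 4 bytes (values below are illustrative):

o_size = 4                        # dta 117
offset = 2 ** (8 * (8 - o_size))  # 2 ** 32, as in _convert_key above
v, o = 3, 7                       # 1-based variable and observation numbers
enc = v + offset * o
assert enc & 0xFFFFFFFF == v and enc >> 32 == o
print(enc)                        # 30064771075
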
- def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes:
- """
- Generates the binary blob of GSOs that is written to the dta file.
-
- Parameters
- ----------
- gso_table : dict
- Ordered dictionary (str, vo)
-
- Returns
- -------
- gso : bytes
- Binary content of dta file to be placed between strl tags
-
- Notes
- -----
- Output format depends on dta version. 117 uses two uint32s to
- express v and o while 118+ uses a uint32 for v and a uint64 for o.
- """
- # Format information
- # Length includes null term
- # 117
- # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
- # 3 u4 u4 u1 u4 string + null term
- #
- # 118, 119
- # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
- # 3 u4 u8 u1 u4 string + null term
-
- bio = BytesIO()
- gso = bytes("GSO", "ascii")
- gso_type = struct.pack(self._byteorder + "B", 130)
- null = struct.pack(self._byteorder + "B", 0)
- v_type = self._byteorder + self._gso_v_type
- o_type = self._byteorder + self._gso_o_type
- len_type = self._byteorder + "I"
- for strl, vo in gso_table.items():
- if vo == (0, 0):
- continue
- v, o = vo
-
- # GSO
- bio.write(gso)
-
- # vvvv
- bio.write(struct.pack(v_type, v))
-
- # oooo / oooooooo
- bio.write(struct.pack(o_type, o))
-
- # t
- bio.write(gso_type)
-
- # llll
- utf8_string = bytes(strl, "utf-8")
- bio.write(struct.pack(len_type, len(utf8_string) + 1))
-
- # xxx...xxx
- bio.write(utf8_string)
- bio.write(null)
-
- return bio.getvalue()
-
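
A standalone sketch of a single GSO record in the dta-117 layout documented in generate_blob above ("GSO" + uint32 v + uint32 o + uint8 type + uint32 length + text + NUL); the helper name is illustrative:

import struct

def pack_gso_117(v: int, o: int, text: str, byteorder: str = "<") -> bytes:
    payload = text.encode("utf-8") + b"\x00"          # string plus null terminator
    return (
        b"GSO"
        + struct.pack(byteorder + "I", v)             # vvvv
        + struct.pack(byteorder + "I", o)             # oooo
        + struct.pack(byteorder + "B", 130)           # t, type byte emitted by the writer
        + struct.pack(byteorder + "I", len(payload))  # llll, length includes the NUL
        + payload                                     # xxx...x
    )

print(pack_gso_117(1, 1, "spam"))
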
-
-class StataWriter117(StataWriter):
- """
- A class for writing Stata binary dta files in Stata 13 format (117)
-
- Parameters
- ----------
- fname : path (string), buffer or path object
- string, path object (pathlib.Path or py._path.local.LocalPath) or
-        object implementing a binary write() function. If using a buffer
- then the buffer will not be automatically closed after the file
- is written.
- data : DataFrame
- Input to save
- convert_dates : dict
- Dictionary mapping columns containing datetime types to stata internal
- format to use when writing the dates. Options are 'tc', 'td', 'tm',
- 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
- Datetime columns that do not have a conversion type specified will be
- converted to 'tc'. Raises NotImplementedError if a datetime column has
- timezone information
- write_index : bool
- Write the index to Stata dataset.
- byteorder : str
- Can be ">", "<", "little", or "big". default is `sys.byteorder`
- time_stamp : datetime
- A datetime to use as file creation date. Default is the current time
- data_label : str
- A label for the data set. Must be 80 characters or smaller.
- variable_labels : dict
- Dictionary containing columns as keys and variable labels as values.
- Each label must be 80 characters or smaller.
- convert_strl : list
-        List of column names to convert to Stata StrL format. Columns with
- more than 2045 characters are automatically written as StrL.
- Smaller columns can be converted by including the column name. Using
- StrLs can reduce output file size when strings are longer than 8
- characters, and either frequently repeated or sparse.
- {compression_options}
-
- .. versionadded:: 1.1.0
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- value_labels : dict of dicts
- Dictionary containing columns as keys and dictionaries of column value
- to labels as values. The combined length of all labels for a single
- variable must be 32,000 characters or smaller.
-
- .. versionadded:: 1.4.0
-
- Returns
- -------
- writer : StataWriter117 instance
- The StataWriter117 instance has a write_file method, which will
- write the file to the given `fname`.
-
- Raises
- ------
- NotImplementedError
- * If datetimes contain timezone information
- ValueError
-        * Columns listed in convert_dates are neither datetime64[ns]
-          nor datetime.datetime
- * Column dtype is not representable in Stata
- * Column listed in convert_dates is not in DataFrame
- * Categorical label contains more than 32,000 characters
-
- Examples
- --------
- >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
- >>> writer = pd.io.stata.StataWriter117('./data_file.dta', data)
- >>> writer.write_file()
-
- Directly write a zip file
- >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
- >>> writer = pd.io.stata.StataWriter117(
- ... './data_file.zip', data, compression=compression
- ... )
- >>> writer.write_file()
-
- Or with long strings stored in strl format
- >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
- ... columns=['strls'])
- >>> writer = pd.io.stata.StataWriter117(
- ... './data_file_with_long_strings.dta', data, convert_strl=['strls'])
- >>> writer.write_file()
- """
-
- _max_string_length = 2045
- _dta_version = 117
-
- def __init__(
- self,
- fname: FilePath | WriteBuffer[bytes],
- data: DataFrame,
- convert_dates: dict[Hashable, str] | None = None,
- write_index: bool = True,
- byteorder: str | None = None,
- time_stamp: datetime.datetime | None = None,
- data_label: str | None = None,
- variable_labels: dict[Hashable, str] | None = None,
- convert_strl: Sequence[Hashable] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- *,
- value_labels: dict[Hashable, dict[float, str]] | None = None,
- ) -> None:
- # Copy to new list since convert_strl might be modified later
- self._convert_strl: list[Hashable] = []
- if convert_strl is not None:
- self._convert_strl.extend(convert_strl)
-
- super().__init__(
- fname,
- data,
- convert_dates,
- write_index,
- byteorder=byteorder,
- time_stamp=time_stamp,
- data_label=data_label,
- variable_labels=variable_labels,
- value_labels=value_labels,
- compression=compression,
- storage_options=storage_options,
- )
- self._map: dict[str, int] = {}
- self._strl_blob = b""
-
- @staticmethod
- def _tag(val: str | bytes, tag: str) -> bytes:
- """Surround val with <tag></tag>"""
- if isinstance(val, str):
- val = bytes(val, "utf-8")
- return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8")
-
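
A usage sketch for the helper above (assuming the class is importable): values are wrapped in byte-string tags that delimit every section of a dta 117+ file.

print(StataWriter117._tag(b"117", "release"))   # b'<release>117</release>'
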
- def _update_map(self, tag: str) -> None:
- """Update map location for tag with file position"""
- assert self.handles.handle is not None
- self._map[tag] = self.handles.handle.tell()
-
- def _write_header(
- self,
- data_label: str | None = None,
- time_stamp: datetime.datetime | None = None,
- ) -> None:
- """Write the file header"""
- byteorder = self._byteorder
- self._write_bytes(bytes("<stata_dta>", "utf-8"))
- bio = BytesIO()
- # ds_format - 117
- bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
- # byteorder
- bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder"))
-        # number of vars, 2 bytes in 117 and 118, 4 bytes in 119
- nvar_type = "H" if self._dta_version <= 118 else "I"
- bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
- # 117 uses 4 bytes, 118 uses 8
- nobs_size = "I" if self._dta_version == 117 else "Q"
- bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
- # data label 81 bytes, char, null terminated
- label = data_label[:80] if data_label is not None else ""
- encoded_label = label.encode(self._encoding)
- label_size = "B" if self._dta_version == 117 else "H"
- label_len = struct.pack(byteorder + label_size, len(encoded_label))
- encoded_label = label_len + encoded_label
- bio.write(self._tag(encoded_label, "label"))
- # time stamp, 18 bytes, char, null terminated
- # format dd Mon yyyy hh:mm
- if time_stamp is None:
- time_stamp = datetime.datetime.now()
- elif not isinstance(time_stamp, datetime.datetime):
- raise ValueError("time_stamp should be datetime type")
- # Avoid locale-specific month conversion
- months = [
- "Jan",
- "Feb",
- "Mar",
- "Apr",
- "May",
- "Jun",
- "Jul",
- "Aug",
- "Sep",
- "Oct",
- "Nov",
- "Dec",
- ]
- month_lookup = {i + 1: month for i, month in enumerate(months)}
- ts = (
- time_stamp.strftime("%d ")
- + month_lookup[time_stamp.month]
- + time_stamp.strftime(" %Y %H:%M")
- )
- # '\x11' added due to inspection of Stata file
- stata_ts = b"\x11" + bytes(ts, "utf-8")
- bio.write(self._tag(stata_ts, "timestamp"))
- self._write_bytes(self._tag(bio.getvalue(), "header"))
-
- def _write_map(self) -> None:
- """
- Called twice during file write. The first populates the values in
- the map with 0s. The second call writes the final map locations when
- all blocks have been written.
- """
- if not self._map:
- self._map = {
- "stata_data": 0,
- "map": self.handles.handle.tell(),
- "variable_types": 0,
- "varnames": 0,
- "sortlist": 0,
- "formats": 0,
- "value_label_names": 0,
- "variable_labels": 0,
- "characteristics": 0,
- "data": 0,
- "strls": 0,
- "value_labels": 0,
- "stata_data_close": 0,
- "end-of-file": 0,
- }
- # Move to start of map
- self.handles.handle.seek(self._map["map"])
- bio = BytesIO()
- for val in self._map.values():
- bio.write(struct.pack(self._byteorder + "Q", val))
- self._write_bytes(self._tag(bio.getvalue(), "map"))
-
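
The map block is written twice: first as zero placeholders, then patched with the real section offsets once everything else has been written. A standalone sketch of that pattern (names and section contents are illustrative):

import struct
from io import BytesIO

handle = BytesIO()
offsets = {"header": 0, "data": 0}

map_pos = handle.tell()
handle.write(struct.pack("<2Q", 0, 0))        # pass 1: zero placeholders

offsets["header"] = handle.tell()
handle.write(b"<header>...</header>")         # write a section, record its offset
offsets["data"] = handle.tell()
handle.write(b"<data>...</data>")

handle.seek(map_pos)                          # pass 2: patch the real offsets
handle.write(struct.pack("<2Q", offsets["header"], offsets["data"]))
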
- def _write_variable_types(self) -> None:
- self._update_map("variable_types")
- bio = BytesIO()
- for typ in self.typlist:
- bio.write(struct.pack(self._byteorder + "H", typ))
- self._write_bytes(self._tag(bio.getvalue(), "variable_types"))
-
- def _write_varnames(self) -> None:
- self._update_map("varnames")
- bio = BytesIO()
- # 118 scales by 4 to accommodate utf-8 data worst case encoding
- vn_len = 32 if self._dta_version == 117 else 128
- for name in self.varlist:
- name = self._null_terminate_str(name)
- name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1)
- bio.write(name)
- self._write_bytes(self._tag(bio.getvalue(), "varnames"))
-
- def _write_sortlist(self) -> None:
- self._update_map("sortlist")
- sort_size = 2 if self._dta_version < 119 else 4
- self._write_bytes(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
-
- def _write_formats(self) -> None:
- self._update_map("formats")
- bio = BytesIO()
- fmt_len = 49 if self._dta_version == 117 else 57
- for fmt in self.fmtlist:
- bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len))
- self._write_bytes(self._tag(bio.getvalue(), "formats"))
-
- def _write_value_label_names(self) -> None:
- self._update_map("value_label_names")
- bio = BytesIO()
- # 118 scales by 4 to accommodate utf-8 data worst case encoding
- vl_len = 32 if self._dta_version == 117 else 128
- for i in range(self.nvar):
- # Use variable name when categorical
- name = "" # default name
- if self._has_value_labels[i]:
- name = self.varlist[i]
- name = self._null_terminate_str(name)
- encoded_name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1)
- bio.write(encoded_name)
- self._write_bytes(self._tag(bio.getvalue(), "value_label_names"))
-
- def _write_variable_labels(self) -> None:
- # Missing labels are 80 blank characters plus null termination
- self._update_map("variable_labels")
- bio = BytesIO()
- # 118 scales by 4 to accommodate utf-8 data worst case encoding
- vl_len = 80 if self._dta_version == 117 else 320
- blank = _pad_bytes_new("", vl_len + 1)
-
- if self._variable_labels is None:
- for _ in range(self.nvar):
- bio.write(blank)
- self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))
- return
-
- for col in self.data:
- if col in self._variable_labels:
- label = self._variable_labels[col]
- if len(label) > 80:
- raise ValueError("Variable labels must be 80 characters or fewer")
- try:
- encoded = label.encode(self._encoding)
- except UnicodeEncodeError as err:
- raise ValueError(
- "Variable labels must contain only characters that "
- f"can be encoded in {self._encoding}"
- ) from err
-
- bio.write(_pad_bytes_new(encoded, vl_len + 1))
- else:
- bio.write(blank)
- self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))
-
- def _write_characteristics(self) -> None:
- self._update_map("characteristics")
- self._write_bytes(self._tag(b"", "characteristics"))
-
- def _write_data(self, records) -> None:
- self._update_map("data")
- self._write_bytes(b"<data>")
- self._write_bytes(records.tobytes())
- self._write_bytes(b"</data>")
-
- def _write_strls(self) -> None:
- self._update_map("strls")
- self._write_bytes(self._tag(self._strl_blob, "strls"))
-
- def _write_expansion_fields(self) -> None:
- """No-op in dta 117+"""
-
- def _write_value_labels(self) -> None:
- self._update_map("value_labels")
- bio = BytesIO()
- for vl in self._value_labels:
- lab = vl.generate_value_label(self._byteorder)
- lab = self._tag(lab, "lbl")
- bio.write(lab)
- self._write_bytes(self._tag(bio.getvalue(), "value_labels"))
-
- def _write_file_close_tag(self) -> None:
- self._update_map("stata_data_close")
- self._write_bytes(bytes("</stata_dta>", "utf-8"))
- self._update_map("end-of-file")
-
- def _update_strl_names(self) -> None:
- """
- Update column names for conversion to strl if they might have been
- changed to comply with Stata naming rules
- """
- # Update convert_strl if names changed
- for orig, new in self._converted_names.items():
- if orig in self._convert_strl:
- idx = self._convert_strl.index(orig)
- self._convert_strl[idx] = new
-
- def _convert_strls(self, data: DataFrame) -> DataFrame:
- """
- Convert columns to StrLs if either very large or in the
- convert_strl variable
- """
- convert_cols = [
- col
- for i, col in enumerate(data)
- if self.typlist[i] == 32768 or col in self._convert_strl
- ]
-
- if convert_cols:
- ssw = StataStrLWriter(data, convert_cols, version=self._dta_version)
- tab, new_data = ssw.generate_table()
- data = new_data
- self._strl_blob = ssw.generate_blob(tab)
- return data
-
- def _set_formats_and_types(self, dtypes: Series) -> None:
- self.typlist = []
- self.fmtlist = []
- for col, dtype in dtypes.items():
- force_strl = col in self._convert_strl
- fmt = _dtype_to_default_stata_fmt(
- dtype,
- self.data[col],
- dta_version=self._dta_version,
- force_strl=force_strl,
- )
- self.fmtlist.append(fmt)
- self.typlist.append(
- _dtype_to_stata_type_117(dtype, self.data[col], force_strl)
- )
-
-
-class StataWriterUTF8(StataWriter117):
- """
- Stata binary dta file writing in Stata 15 (118) and 16 (119) formats
-
-    DTA 118 and 119 format files support Unicode string data (both
-    fixed-width and strL). Unicode is also supported in value labels, variable
- labels and the dataset label. Format 119 is automatically used if the
- file contains more than 32,767 variables.
-
- Parameters
- ----------
- fname : path (string), buffer or path object
- string, path object (pathlib.Path or py._path.local.LocalPath) or
-        object implementing a binary write() function. If using a buffer
- then the buffer will not be automatically closed after the file
- is written.
- data : DataFrame
- Input to save
- convert_dates : dict, default None
- Dictionary mapping columns containing datetime types to stata internal
- format to use when writing the dates. Options are 'tc', 'td', 'tm',
- 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
- Datetime columns that do not have a conversion type specified will be
- converted to 'tc'. Raises NotImplementedError if a datetime column has
- timezone information
- write_index : bool, default True
- Write the index to Stata dataset.
- byteorder : str, default None
- Can be ">", "<", "little", or "big". default is `sys.byteorder`
- time_stamp : datetime, default None
- A datetime to use as file creation date. Default is the current time
- data_label : str, default None
- A label for the data set. Must be 80 characters or smaller.
- variable_labels : dict, default None
- Dictionary containing columns as keys and variable labels as values.
- Each label must be 80 characters or smaller.
- convert_strl : list, default None
-        List of column names to convert to Stata StrL format. Columns with
- more than 2045 characters are automatically written as StrL.
- Smaller columns can be converted by including the column name. Using
- StrLs can reduce output file size when strings are longer than 8
- characters, and either frequently repeated or sparse.
- version : int, default None
- The dta version to use. By default, uses the size of data to determine
- the version. 118 is used if data.shape[1] <= 32767, and 119 is used
- for storing larger DataFrames.
- {compression_options}
-
- .. versionadded:: 1.1.0
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- value_labels : dict of dicts
- Dictionary containing columns as keys and dictionaries of column value
- to labels as values. The combined length of all labels for a single
- variable must be 32,000 characters or smaller.
-
- .. versionadded:: 1.4.0
-
- Returns
- -------
- StataWriterUTF8
- The instance has a write_file method, which will write the file to the
- given `fname`.
-
- Raises
- ------
- NotImplementedError
- * If datetimes contain timezone information
- ValueError
-        * Columns listed in convert_dates are neither datetime64[ns]
-          nor datetime.datetime
- * Column dtype is not representable in Stata
- * Column listed in convert_dates is not in DataFrame
- * Categorical label contains more than 32,000 characters
-
- Examples
- --------
- Using Unicode data and column names
-
- >>> from pandas.io.stata import StataWriterUTF8
- >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
- >>> writer = StataWriterUTF8('./data_file.dta', data)
- >>> writer.write_file()
-
- Directly write a zip file
- >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
- >>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression)
- >>> writer.write_file()
-
- Or with long strings stored in strl format
-
- >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
- ... columns=['strls'])
- >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
- ... convert_strl=['strls'])
- >>> writer.write_file()
- """
-
- _encoding: Literal["utf-8"] = "utf-8"
-
- def __init__(
- self,
- fname: FilePath | WriteBuffer[bytes],
- data: DataFrame,
- convert_dates: dict[Hashable, str] | None = None,
- write_index: bool = True,
- byteorder: str | None = None,
- time_stamp: datetime.datetime | None = None,
- data_label: str | None = None,
- variable_labels: dict[Hashable, str] | None = None,
- convert_strl: Sequence[Hashable] | None = None,
- version: int | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- *,
- value_labels: dict[Hashable, dict[float, str]] | None = None,
- ) -> None:
- if version is None:
- version = 118 if data.shape[1] <= 32767 else 119
- elif version not in (118, 119):
- raise ValueError("version must be either 118 or 119.")
- elif version == 118 and data.shape[1] > 32767:
- raise ValueError(
-                "You must use version 119 for data sets containing more than "
- "32,767 variables"
- )
-
- super().__init__(
- fname,
- data,
- convert_dates=convert_dates,
- write_index=write_index,
- byteorder=byteorder,
- time_stamp=time_stamp,
- data_label=data_label,
- variable_labels=variable_labels,
- value_labels=value_labels,
- convert_strl=convert_strl,
- compression=compression,
- storage_options=storage_options,
- )
- # Override version set in StataWriter117 init
- self._dta_version = version
-
- def _validate_variable_name(self, name: str) -> str:
- """
- Validate variable names for Stata export.
-
- Parameters
- ----------
- name : str
- Variable name
-
- Returns
- -------
- str
- The validated name with invalid characters replaced with
- underscores.
-
- Notes
- -----
-        Stata 118+ supports most Unicode characters. The only limitation is in
-        the ASCII range, where the supported characters are a-z, A-Z, 0-9 and _.
- """
- # High code points appear to be acceptable
- for c in name:
- if (
- (
- ord(c) < 128
- and (c < "A" or c > "Z")
- and (c < "a" or c > "z")
- and (c < "0" or c > "9")
- and c != "_"
- )
- or 128 <= ord(c) < 192
- or c in {"×", "÷"}
- ):
- name = name.replace(c, "_")
-
- return name
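
A usage sketch for the validation above (bypassing __init__ purely for illustration): ASCII characters outside a-z, A-Z, 0-9 and "_" are replaced with underscores, while most non-ASCII code points pass through.

w = StataWriterUTF8.__new__(StataWriterUTF8)      # skip __init__; the method is stateless
print(w._validate_variable_name("weight (kg)"))   # weight__kg_
print(w._validate_variable_name("βeta"))          # βeta (non-ASCII kept)
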
diff --git a/contrib/python/pandas/py3/pandas/io/xml.py b/contrib/python/pandas/py3/pandas/io/xml.py
deleted file mode 100644
index 55817eca1b6..00000000000
--- a/contrib/python/pandas/py3/pandas/io/xml.py
+++ /dev/null
@@ -1,1135 +0,0 @@
-"""
-:mod:`pandas.io.xml` is a module for reading XML.
-"""
-
-from __future__ import annotations
-
-import io
-from typing import (
- Any,
- Callable,
- Sequence,
-)
-
-from pandas._libs import lib
-from pandas._typing import (
- TYPE_CHECKING,
- CompressionOptions,
- ConvertersArg,
- DtypeArg,
- DtypeBackend,
- FilePath,
- ParseDatesArg,
- ReadBuffer,
- StorageOptions,
- XMLParsers,
-)
-from pandas.compat._optional import import_optional_dependency
-from pandas.errors import (
- AbstractMethodError,
- ParserError,
-)
-from pandas.util._decorators import doc
-from pandas.util._validators import check_dtype_backend
-
-from pandas.core.dtypes.common import is_list_like
-
-from pandas.core.shared_docs import _shared_docs
-
-from pandas.io.common import (
- file_exists,
- get_handle,
- infer_compression,
- is_fsspec_url,
- is_url,
- stringify_path,
-)
-from pandas.io.parsers import TextParser
-
-if TYPE_CHECKING:
- from xml.etree.ElementTree import Element
-
- from lxml import etree
-
- from pandas import DataFrame
-
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
-)
-class _XMLFrameParser:
- """
- Internal subclass to parse XML into DataFrames.
-
- Parameters
- ----------
-    path_or_buffer : a valid XML str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, and file.
-
-    xpath : str
-        The XPath expression to parse the required set of nodes for
-        migration to `DataFrame`. `etree` supports limited XPath.
-
- namespaces : dict
-        The namespaces defined in the XML document (`xmlns:namespace='URI'`)
-        as dicts with key being the namespace prefix and value the URI.
-
- elems_only : bool
- Parse only the child elements at the specified `xpath`.
-
- attrs_only : bool
- Parse only the attributes at the specified `xpath`.
-
- names : list
- Column names for Data Frame of parsed XML data.
-
- dtype : dict
- Data type for data or columns. E.g. {{'a': np.float64,
- 'b': np.int32, 'c': 'Int64'}}
-
- .. versionadded:: 1.5.0
-
- converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can
- either be integers or column labels.
-
- .. versionadded:: 1.5.0
-
- parse_dates : bool or list of int or names or list of lists or dict
- Converts either index or select columns to datetimes
-
- .. versionadded:: 1.5.0
-
- encoding : str
- Encoding of xml object or document.
-
- stylesheet : str or file-like
-        URL, file, file-like object, or a raw string containing XSLT.
-        `etree` does not support XSLT but the argument is retained for
-        consistency.
-
- iterparse : dict, optional
- Dict with row element as key and list of descendant elements
- and/or attributes as value to be retrieved in iterparsing of
- XML document.
-
- .. versionadded:: 1.5.0
-
- {decompression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- {storage_options}
-
- See also
- --------
- pandas.io.xml._EtreeFrameParser
- pandas.io.xml._LxmlFrameParser
-
- Notes
- -----
-    To subclass this class effectively, you must override the following methods:
- * :func:`parse_data`
- * :func:`_parse_nodes`
- * :func:`_iterparse_nodes`
- * :func:`_parse_doc`
- * :func:`_validate_names`
- * :func:`_validate_path`
-
-
- See each method's respective documentation for details on their
- functionality.
- """
-
- def __init__(
- self,
- path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
- xpath: str,
- namespaces: dict[str, str] | None,
- elems_only: bool,
- attrs_only: bool,
- names: Sequence[str] | None,
- dtype: DtypeArg | None,
- converters: ConvertersArg | None,
- parse_dates: ParseDatesArg | None,
- encoding: str | None,
- stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
- iterparse: dict[str, list[str]] | None,
- compression: CompressionOptions,
- storage_options: StorageOptions,
- ) -> None:
- self.path_or_buffer = path_or_buffer
- self.xpath = xpath
- self.namespaces = namespaces
- self.elems_only = elems_only
- self.attrs_only = attrs_only
- self.names = names
- self.dtype = dtype
- self.converters = converters
- self.parse_dates = parse_dates
- self.encoding = encoding
- self.stylesheet = stylesheet
- self.iterparse = iterparse
- self.is_style = None
- self.compression = compression
- self.storage_options = storage_options
-
- def parse_data(self) -> list[dict[str, str | None]]:
- """
- Parse xml data.
-
- This method will call the other internal methods to
- validate xpath, names, parse and return specific nodes.
- """
-
- raise AbstractMethodError(self)
-
- def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
- """
- Parse xml nodes.
-
- This method will parse the children and attributes of elements
- in xpath, conditionally for only elements, only attributes
- or both while optionally renaming node names.
-
- Raises
- ------
- ValueError
- * If only elements and only attributes are specified.
-
- Notes
- -----
- Namespace URIs will be removed from return node values. Also,
- elements with missing children or attributes compared to siblings
- will have optional keys filled with None values.
- """
-
- dicts: list[dict[str, str | None]]
-
- if self.elems_only and self.attrs_only:
-            raise ValueError("Either elements or attributes can be parsed, not both.")
- if self.elems_only:
- if self.names:
- dicts = [
- {
- **(
- {el.tag: el.text.strip()}
- if el.text and not el.text.isspace()
- else {}
- ),
- **{
- nm: ch.text.strip() if ch.text else None
- for nm, ch in zip(self.names, el.findall("*"))
- },
- }
- for el in elems
- ]
- else:
- dicts = [
- {
- ch.tag: ch.text.strip() if ch.text else None
- for ch in el.findall("*")
- }
- for el in elems
- ]
-
- elif self.attrs_only:
- dicts = [
- {k: v.strip() if v else None for k, v in el.attrib.items()}
- for el in elems
- ]
-
- else:
- if self.names:
- dicts = [
- {
- **el.attrib,
- **(
- {el.tag: el.text.strip()}
- if el.text and not el.text.isspace()
- else {}
- ),
- **{
- nm: ch.text.strip() if ch.text else None
- for nm, ch in zip(self.names, el.findall("*"))
- },
- }
- for el in elems
- ]
-
- else:
- dicts = [
- {
- **el.attrib,
- **(
- {el.tag: el.text.strip()}
- if el.text and not el.text.isspace()
- else {}
- ),
- **{
- ch.tag: ch.text.strip() if ch.text else None
- for ch in el.findall("*")
- },
- }
- for el in elems
- ]
-
- dicts = [
- {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
- ]
-
- keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
- dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
-
- if self.names:
- dicts = [dict(zip(self.names, d.values())) for d in dicts]
-
- return dicts
-
- def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
- """
- Iterparse xml nodes.
-
-        This method will read decompressed XML files from local disk for
-        elements and underlying descendants using iterparse, a method to
-        iterate through an XML tree without holding the entire tree in memory.
-
- Raises
- ------
- TypeError
- * If `iterparse` is not a dict or its dict value is not list-like.
- ParserError
- * If `path_or_buffer` is not a physical file on disk or file-like object.
- * If no data is returned from selected items in `iterparse`.
-
- Notes
- -----
- Namespace URIs will be removed from return node values. Also,
- elements with missing children or attributes in submitted list
- will have optional keys filled with None values.
- """
-
- dicts: list[dict[str, str | None]] = []
- row: dict[str, str | None] | None = None
-
- if not isinstance(self.iterparse, dict):
- raise TypeError(
- f"{type(self.iterparse).__name__} is not a valid type for iterparse"
- )
-
- row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
- if not is_list_like(self.iterparse[row_node]):
- raise TypeError(
- f"{type(self.iterparse[row_node])} is not a valid type "
- "for value in iterparse"
- )
-
- if (not hasattr(self.path_or_buffer, "read")) and (
- not isinstance(self.path_or_buffer, str)
- or is_url(self.path_or_buffer)
- or is_fsspec_url(self.path_or_buffer)
- or self.path_or_buffer.startswith(("<?xml", "<"))
- or infer_compression(self.path_or_buffer, "infer") is not None
- ):
- raise ParserError(
- "iterparse is designed for large XML files that are fully extracted on "
- "local disk and not as compressed files or online sources."
- )
-
- iterparse_repeats = len(self.iterparse[row_node]) != len(
- set(self.iterparse[row_node])
- )
-
- for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
- curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
-
- if event == "start":
- if curr_elem == row_node:
- row = {}
-
- if row is not None:
- if self.names and iterparse_repeats:
- for col, nm in zip(self.iterparse[row_node], self.names):
- if curr_elem == col:
- elem_val = elem.text.strip() if elem.text else None
- if elem_val not in row.values() and nm not in row:
- row[nm] = elem_val
-
- if col in elem.attrib:
- if elem.attrib[col] not in row.values() and nm not in row:
- row[nm] = elem.attrib[col]
- else:
- for col in self.iterparse[row_node]:
- if curr_elem == col:
- row[col] = elem.text.strip() if elem.text else None
- if col in elem.attrib:
- row[col] = elem.attrib[col]
-
- if event == "end":
- if curr_elem == row_node and row is not None:
- dicts.append(row)
- row = None
-
- elem.clear()
- if hasattr(elem, "getprevious"):
- while (
- elem.getprevious() is not None and elem.getparent() is not None
- ):
- del elem.getparent()[0]
-
- if dicts == []:
- raise ParserError("No result from selected items in iterparse.")
-
- keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
- dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
-
- if self.names:
- dicts = [dict(zip(self.names, d.values())) for d in dicts]
-
- return dicts
-
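
The public entry point for this code path is pandas.read_xml with the iterparse keyword (added in 1.5.0). A usage sketch, assuming the XML has been written to local disk as the parser requires:

import tempfile

import pandas as pd

xml = b"""<data>
  <row><shape>square</shape><sides>4</sides></row>
  <row><shape>circle</shape><sides>0</sides></row>
</data>"""

with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as fh:
    fh.write(xml)
    path = fh.name

df = pd.read_xml(path, iterparse={"row": ["shape", "sides"]}, parser="etree")
print(df)
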
- def _validate_path(self) -> list[Any]:
- """
- Validate xpath.
-
- This method checks for syntax, evaluation, or empty nodes return.
-
- Raises
- ------
-        SyntaxError
-            * If xpath is not supported or there are issues with namespaces.
-
-        ValueError
-            * If xpath does not return any nodes.
- """
-
- raise AbstractMethodError(self)
-
- def _validate_names(self) -> None:
- """
- Validate names.
-
- This method will check if names is a list-like and aligns
- with length of parse nodes.
-
- Raises
- ------
- ValueError
-            * If names is list-like but shorter than the number of child nodes.
- """
- raise AbstractMethodError(self)
-
- def _parse_doc(
- self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
- ) -> Element | etree._Element:
- """
- Build tree from path_or_buffer.
-
- This method will parse XML object into tree
- either from string/bytes or file location.
- """
- raise AbstractMethodError(self)
-
-
-class _EtreeFrameParser(_XMLFrameParser):
- """
- Internal class to parse XML into DataFrames with the Python
- standard library XML module: `xml.etree.ElementTree`.
- """
-
- def parse_data(self) -> list[dict[str, str | None]]:
- from xml.etree.ElementTree import iterparse
-
- if self.stylesheet is not None:
- raise ValueError(
- "To use stylesheet, you need lxml installed and selected as parser."
- )
-
- if self.iterparse is None:
- self.xml_doc = self._parse_doc(self.path_or_buffer)
- elems = self._validate_path()
-
- self._validate_names()
-
- xml_dicts: list[dict[str, str | None]] = (
- self._parse_nodes(elems)
- if self.iterparse is None
- else self._iterparse_nodes(iterparse)
- )
-
- return xml_dicts
-
- def _validate_path(self) -> list[Any]:
- """
- Notes
- -----
-        `etree` supports limited XPath. If a user attempts a more complex
-        expression, a SyntaxError will be raised.
- """
-
- msg = (
- "xpath does not return any nodes or attributes. "
- "Be sure to specify in `xpath` the parent nodes of "
- "children and attributes to parse. "
- "If document uses namespaces denoted with "
- "xmlns, be sure to define namespaces and "
- "use them in xpath."
- )
- try:
- elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
- children = [ch for el in elems for ch in el.findall("*")]
- attrs = {k: v for el in elems for k, v in el.attrib.items()}
-
- if elems is None:
- raise ValueError(msg)
-
- if elems is not None:
- if self.elems_only and children == []:
- raise ValueError(msg)
- if self.attrs_only and attrs == {}:
- raise ValueError(msg)
- if children == [] and attrs == {}:
- raise ValueError(msg)
-
- except (KeyError, SyntaxError):
- raise SyntaxError(
- "You have used an incorrect or unsupported XPath "
- "expression for etree library or you used an "
- "undeclared namespace prefix."
- )
-
- return elems
-
- def _validate_names(self) -> None:
- children: list[Any]
-
- if self.names:
- if self.iterparse:
- children = self.iterparse[next(iter(self.iterparse))]
- else:
- parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
- children = parent.findall("*") if parent else []
-
- if is_list_like(self.names):
- if len(self.names) < len(children):
- raise ValueError(
- "names does not match length of child elements in xpath."
- )
- else:
- raise TypeError(
- f"{type(self.names).__name__} is not a valid type for names"
- )
-
- def _parse_doc(
- self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
- ) -> Element:
- from xml.etree.ElementTree import (
- XMLParser,
- parse,
- )
-
- handle_data = get_data_from_filepath(
- filepath_or_buffer=raw_doc,
- encoding=self.encoding,
- compression=self.compression,
- storage_options=self.storage_options,
- )
-
- with preprocess_data(handle_data) as xml_data:
- curr_parser = XMLParser(encoding=self.encoding)
- document = parse(xml_data, parser=curr_parser)
-
- return document.getroot()
-
-
-class _LxmlFrameParser(_XMLFrameParser):
- """
- Internal class to parse XML into DataFrames with third-party
- full-featured XML library, `lxml`, that supports
- XPath 1.0 and XSLT 1.0.
- """
-
- def parse_data(self) -> list[dict[str, str | None]]:
- """
- Parse xml data.
-
- This method will call the other internal methods to
- validate xpath, names, optionally parse and run XSLT,
- and parse original or transformed XML and return specific nodes.
- """
- from lxml.etree import iterparse
-
- if self.iterparse is None:
- self.xml_doc = self._parse_doc(self.path_or_buffer)
-
- if self.stylesheet:
- self.xsl_doc = self._parse_doc(self.stylesheet)
- self.xml_doc = self._transform_doc()
-
- elems = self._validate_path()
-
- self._validate_names()
-
- xml_dicts: list[dict[str, str | None]] = (
- self._parse_nodes(elems)
- if self.iterparse is None
- else self._iterparse_nodes(iterparse)
- )
-
- return xml_dicts
-
- def _validate_path(self) -> list[Any]:
- msg = (
- "xpath does not return any nodes or attributes. "
- "Be sure to specify in `xpath` the parent nodes of "
- "children and attributes to parse. "
- "If document uses namespaces denoted with "
- "xmlns, be sure to define namespaces and "
- "use them in xpath."
- )
-
- elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
- children = [ch for el in elems for ch in el.xpath("*")]
- attrs = {k: v for el in elems for k, v in el.attrib.items()}
-
- if elems == []:
- raise ValueError(msg)
-
- if elems != []:
- if self.elems_only and children == []:
- raise ValueError(msg)
- if self.attrs_only and attrs == {}:
- raise ValueError(msg)
- if children == [] and attrs == {}:
- raise ValueError(msg)
-
- return elems
-
- def _validate_names(self) -> None:
- children: list[Any]
-
- if self.names:
- if self.iterparse:
- children = self.iterparse[next(iter(self.iterparse))]
- else:
- children = self.xml_doc.xpath(
- self.xpath + "[1]/*", namespaces=self.namespaces
- )
-
- if is_list_like(self.names):
- if len(self.names) < len(children):
- raise ValueError(
- "names does not match length of child elements in xpath."
- )
- else:
- raise TypeError(
- f"{type(self.names).__name__} is not a valid type for names"
- )
-
- def _parse_doc(
- self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
- ) -> etree._Element:
- from lxml.etree import (
- XMLParser,
- fromstring,
- parse,
- )
-
- handle_data = get_data_from_filepath(
- filepath_or_buffer=raw_doc,
- encoding=self.encoding,
- compression=self.compression,
- storage_options=self.storage_options,
- )
-
- with preprocess_data(handle_data) as xml_data:
- curr_parser = XMLParser(encoding=self.encoding)
-
- if isinstance(xml_data, io.StringIO):
- if self.encoding is None:
- raise TypeError(
- "Can not pass encoding None when input is StringIO."
- )
-
- document = fromstring(
- xml_data.getvalue().encode(self.encoding), parser=curr_parser
- )
- else:
- document = parse(xml_data, parser=curr_parser)
-
- return document
-
- def _transform_doc(self) -> etree._XSLTResultTree:
- """
- Transform original tree using stylesheet.
-
-        This method will transform the original xml using the XSLT script into
-        an ideally flatter xml document for easier parsing and migration
-        to a DataFrame.
- """
- from lxml.etree import XSLT
-
- transformer = XSLT(self.xsl_doc)
- new_doc = transformer(self.xml_doc)
-
- return new_doc
-
-
-def get_data_from_filepath(
- filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
- encoding: str | None,
- compression: CompressionOptions,
- storage_options: StorageOptions,
-) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
- """
- Extract raw XML data.
-
- The method accepts three input types:
- 1. filepath (string-like)
- 2. file-like object (e.g. open file object, StringIO)
- 3. XML string or bytes
-
- This method turns (1) into (2) to simplify the rest of the processing.
- It returns input types (2) and (3) unchanged.
- """
- if not isinstance(filepath_or_buffer, bytes):
- filepath_or_buffer = stringify_path(filepath_or_buffer)
-
- if (
- isinstance(filepath_or_buffer, str)
- and not filepath_or_buffer.startswith(("<?xml", "<"))
- ) and (
- not isinstance(filepath_or_buffer, str)
- or is_url(filepath_or_buffer)
- or is_fsspec_url(filepath_or_buffer)
- or file_exists(filepath_or_buffer)
- ):
- with get_handle(
- filepath_or_buffer,
- "r",
- encoding=encoding,
- compression=compression,
- storage_options=storage_options,
- ) as handle_obj:
- filepath_or_buffer = (
- handle_obj.handle.read()
- if hasattr(handle_obj.handle, "read")
- else handle_obj.handle
- )
-
- return filepath_or_buffer
-
-
-def preprocess_data(data) -> io.StringIO | io.BytesIO:
- """
- Convert extracted raw data.
-
- This method will return underlying data of extracted XML content.
- The data either has a `read` attribute (e.g. a file object or a
- StringIO/BytesIO) or is a string or bytes that is an XML document.
- """
-
- if isinstance(data, str):
- data = io.StringIO(data)
-
- elif isinstance(data, bytes):
- data = io.BytesIO(data)
-
- return data
-
-
-def _data_to_frame(data, **kwargs) -> DataFrame:
- """
- Convert parsed data to Data Frame.
-
-    This method will bind xml dictionary data of keys and values
-    into named columns of a DataFrame using the built-in TextParser
-    class that builds a DataFrame and infers specific dtypes.
- """
-
- tags = next(iter(data))
- nodes = [list(d.values()) for d in data]
-
- try:
- with TextParser(nodes, names=tags, **kwargs) as tp:
- return tp.read()
- except ParserError:
- raise ParserError(
- "XML document may be too complex for import. "
- "Try to flatten document and use distinct "
- "element and attribute names."
- )
-
-
-def _parse(
- path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
- xpath: str,
- namespaces: dict[str, str] | None,
- elems_only: bool,
- attrs_only: bool,
- names: Sequence[str] | None,
- dtype: DtypeArg | None,
- converters: ConvertersArg | None,
- parse_dates: ParseDatesArg | None,
- encoding: str | None,
- parser: XMLParsers,
- stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
- iterparse: dict[str, list[str]] | None,
- compression: CompressionOptions,
- storage_options: StorageOptions,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwargs,
-) -> DataFrame:
- """
- Call internal parsers.
-
-    This method will conditionally call the internal parsers:
-    _LxmlFrameParser or _EtreeFrameParser.
-
- Raises
- ------
- ImportError
-        * If lxml is selected as the parser but is not installed.
-
- ValueError
- * If parser is not lxml or etree.
- """
-
- p: _EtreeFrameParser | _LxmlFrameParser
-
- if parser == "lxml":
- lxml = import_optional_dependency("lxml.etree", errors="ignore")
-
- if lxml is not None:
- p = _LxmlFrameParser(
- path_or_buffer,
- xpath,
- namespaces,
- elems_only,
- attrs_only,
- names,
- dtype,
- converters,
- parse_dates,
- encoding,
- stylesheet,
- iterparse,
- compression,
- storage_options,
- )
- else:
- raise ImportError("lxml not found, please install or use the etree parser.")
-
- elif parser == "etree":
- p = _EtreeFrameParser(
- path_or_buffer,
- xpath,
- namespaces,
- elems_only,
- attrs_only,
- names,
- dtype,
- converters,
- parse_dates,
- encoding,
- stylesheet,
- iterparse,
- compression,
- storage_options,
- )
- else:
- raise ValueError("Values for parser can only be lxml or etree.")
-
- data_dicts = p.parse_data()
-
- return _data_to_frame(
- data=data_dicts,
- dtype=dtype,
- converters=converters,
- parse_dates=parse_dates,
- dtype_backend=dtype_backend,
- **kwargs,
- )
-
-
-@doc(
- storage_options=_shared_docs["storage_options"],
- decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
-)
-def read_xml(
- path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
- *,
- xpath: str = "./*",
- namespaces: dict[str, str] | None = None,
- elems_only: bool = False,
- attrs_only: bool = False,
- names: Sequence[str] | None = None,
- dtype: DtypeArg | None = None,
- converters: ConvertersArg | None = None,
- parse_dates: ParseDatesArg | None = None,
- # encoding can not be None for lxml and StringIO input
- encoding: str | None = "utf-8",
- parser: XMLParsers = "lxml",
- stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
- iterparse: dict[str, list[str]] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
-) -> DataFrame:
- r"""
- Read XML document into a ``DataFrame`` object.
-
- .. versionadded:: 1.3.0
-
- Parameters
- ----------
- path_or_buffer : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a ``read()`` function. The string can be any valid XML
- string or a path. The string can further be a URL. Valid URL schemes
- include http, ftp, s3, and file.
-
- xpath : str, optional, default './\*'
- The XPath to parse required set of nodes for migration to DataFrame.
- XPath should return a collection of elements and not a single
- element. Note: The ``etree`` parser supports limited XPath
- expressions. For more complex XPath, use ``lxml`` which requires
- installation.
-
- namespaces : dict, optional
- The namespaces defined in XML document as dicts with key being
- namespace prefix and value the URI. There is no need to include all
- namespaces in XML, only the ones used in ``xpath`` expression.
- Note: if XML document uses default namespace denoted as
- `xmlns='<URI>'` without a prefix, you must assign any temporary
- namespace prefix such as 'doc' to the URI in order to parse
- underlying nodes and/or attributes. For example, ::
-
- namespaces = {{"doc": "https://example.com"}}
-
- elems_only : bool, optional, default False
- Parse only the child elements at the specified ``xpath``. By default,
- all child elements and non-empty text nodes are returned.
-
- attrs_only : bool, optional, default False
- Parse only the attributes at the specified ``xpath``.
- By default, all attributes are returned.
-
- names : list-like, optional
- Column names for DataFrame of parsed XML data. Use this parameter to
- rename original element names and distinguish same named elements and
- attributes.
-
- dtype : Type name or dict of column -> type, optional
- Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
- 'c': 'Int64'}}
- Use `str` or `object` together with suitable `na_values` settings
- to preserve and not interpret dtype.
- If converters are specified, they will be applied INSTEAD
- of dtype conversion.
-
- .. versionadded:: 1.5.0
-
- converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can either
- be integers or column labels.
-
- .. versionadded:: 1.5.0
-
- parse_dates : bool or list of int or names or list of lists or dict, default False
- Identifiers to parse index or columns to datetime. The behavior is as follows:
-
- * boolean. If True -> try parsing the index.
- * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
- each as a separate date column.
- * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
- a single date column.
- * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
- result 'foo'
-
- .. versionadded:: 1.5.0
-
- encoding : str, optional, default 'utf-8'
- Encoding of XML document.
-
- parser : {{'lxml','etree'}}, default 'lxml'
- Parser module to use for retrieval of data. Only 'lxml' and
- 'etree' are supported. With 'lxml' more complex XPath searches
- and ability to use XSLT stylesheet are supported.
-
- stylesheet : str, path object or file-like object
- A URL, file-like object, or a raw string containing an XSLT script.
- This stylesheet should flatten complex, deeply nested XML documents
- for easier parsing. To use this feature you must have ``lxml`` module
- installed and specify 'lxml' as ``parser``. The ``xpath`` must
- reference nodes of transformed XML document generated after XSLT
- transformation and not the original XML document. Only XSLT 1.0
-        scripts, and not later versions, are currently supported.
-
- iterparse : dict, optional
- The nodes or attributes to retrieve in iterparsing of XML document
- as a dict with key being the name of repeating element and value being
- list of elements or attribute names that are descendants of the repeated
- element. Note: If this option is used, it will replace ``xpath`` parsing
- and unlike xpath, descendants do not need to relate to each other but can
-        exist anywhere in the document under the repeating element. This
-        memory-efficient method should be used for very large XML files
-        (500MB, 1GB, or 5GB+).
- For example, ::
-
- iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}
-
- .. versionadded:: 1.5.0
-
- {decompression_options}
-
- .. versionchanged:: 1.4.0 Zstandard support.
-
- {storage_options}
-
- dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
-        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
-        arrays. Nullable dtypes are used for all dtypes that have a nullable
-        implementation when "numpy_nullable" is set, and pyarrow is used for all
-        dtypes if "pyarrow" is set.
-
-        The dtype_backends are still experimental.
-
- .. versionadded:: 2.0
-
- Returns
- -------
- df
- A DataFrame.
-
- See Also
- --------
- read_json : Convert a JSON string to pandas object.
- read_html : Read HTML tables into a list of DataFrame objects.
-
- Notes
- -----
-    This method is best designed to import shallow XML documents in the
-    following format, which is the ideal fit for the two dimensions of a
-    ``DataFrame`` (row by column). ::
-
- <root>
- <row>
- <column1>data</column1>
- <column2>data</column2>
- <column3>data</column3>
- ...
- </row>
- <row>
- ...
- </row>
- ...
- </root>
-
-    As a file format, XML documents can be designed in any way, including
-    the layout of elements and attributes, as long as they conform to W3C
-    specifications. Therefore, this method is a convenience handler for
-    a specific flatter design and not all possible XML structures.
-
-    However, for more complex XML documents, ``stylesheet`` allows you to
-    temporarily redesign the original document with XSLT (a special-purpose
-    language) for a flatter version for migration to a DataFrame.
-
- This function will *always* return a single :class:`DataFrame` or raise
- exceptions due to issues with XML document, ``xpath``, or other
- parameters.
-
- See the :ref:`read_xml documentation in the IO section of the docs
-    <io.read_xml>` for more information on using this method to parse XML
- files to DataFrames.
-
- Examples
- --------
- >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
- ... <data xmlns="http://example.com">
- ... <row>
- ... <shape>square</shape>
- ... <degrees>360</degrees>
- ... <sides>4.0</sides>
- ... </row>
- ... <row>
- ... <shape>circle</shape>
- ... <degrees>360</degrees>
- ... <sides/>
- ... </row>
- ... <row>
- ... <shape>triangle</shape>
- ... <degrees>180</degrees>
- ... <sides>3.0</sides>
- ... </row>
- ... </data>'''
-
- >>> df = pd.read_xml(xml)
- >>> df
- shape degrees sides
- 0 square 360 4.0
- 1 circle 360 NaN
- 2 triangle 180 3.0
-
- >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
- ... <data>
- ... <row shape="square" degrees="360" sides="4.0"/>
- ... <row shape="circle" degrees="360"/>
- ... <row shape="triangle" degrees="180" sides="3.0"/>
- ... </data>'''
-
- >>> df = pd.read_xml(xml, xpath=".//row")
- >>> df
- shape degrees sides
- 0 square 360 4.0
- 1 circle 360 NaN
- 2 triangle 180 3.0
-
- >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
- ... <doc:data xmlns:doc="https://example.com">
- ... <doc:row>
- ... <doc:shape>square</doc:shape>
- ... <doc:degrees>360</doc:degrees>
- ... <doc:sides>4.0</doc:sides>
- ... </doc:row>
- ... <doc:row>
- ... <doc:shape>circle</doc:shape>
- ... <doc:degrees>360</doc:degrees>
- ... <doc:sides/>
- ... </doc:row>
- ... <doc:row>
- ... <doc:shape>triangle</doc:shape>
- ... <doc:degrees>180</doc:degrees>
- ... <doc:sides>3.0</doc:sides>
- ... </doc:row>
- ... </doc:data>'''
-
- >>> df = pd.read_xml(xml,
- ... xpath="//doc:row",
- ... namespaces={{"doc": "https://example.com"}})
- >>> df
- shape degrees sides
- 0 square 360 4.0
- 1 circle 360 NaN
- 2 triangle 180 3.0
- """
- check_dtype_backend(dtype_backend)
-
- return _parse(
- path_or_buffer=path_or_buffer,
- xpath=xpath,
- namespaces=namespaces,
- elems_only=elems_only,
- attrs_only=attrs_only,
- names=names,
- dtype=dtype,
- converters=converters,
- parse_dates=parse_dates,
- encoding=encoding,
- parser=parser,
- stylesheet=stylesheet,
- iterparse=iterparse,
- compression=compression,
- storage_options=storage_options,
- dtype_backend=dtype_backend,
- )
diff --git a/contrib/python/pandas/py3/pandas/plotting/__init__.py b/contrib/python/pandas/py3/pandas/plotting/__init__.py
deleted file mode 100644
index 55c861e384d..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/__init__.py
+++ /dev/null
@@ -1,98 +0,0 @@
-"""
-Plotting public API.
-
-Authors of third-party plotting backends should implement a module with a
-public ``plot(data, kind, **kwargs)``. The parameter `data` will contain
-the data structure and can be a `Series` or a `DataFrame`. For example,
-for ``df.plot()`` the parameter `data` will contain the DataFrame `df`.
-In some cases, the data structure is transformed before being sent to
-the backend (see PlotAccessor.__call__ in pandas/plotting/_core.py for
-the exact transformations).
-
-The parameter `kind` will be one of:
-
-- line
-- bar
-- barh
-- box
-- hist
-- kde
-- area
-- pie
-- scatter
-- hexbin
-
-See the pandas API reference for documentation on each kind of plot.
-
-Any other keyword argument is currently assumed to be backend-specific,
-but some parameters may be unified and added to the signature in the
-future (e.g. `title`, which should be useful for any backend).
-
-Currently, all the Matplotlib functions in pandas are dispatched through
-the selected backend. For example, `pandas.plotting.boxplot` (equivalent
-to `DataFrame.boxplot`) is also dispatched to the selected backend. This
-is expected to change, and the exact API is under discussion. But with
-the current version, backends are expected to implement the following
-functions:
-
-- plot (described above, used for `Series.plot` and `DataFrame.plot`)
-- hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`)
-- boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`)
-- boxplot_frame and boxplot_frame_groupby
-- register and deregister (register converters for the tick formats)
-- Plots not called as `Series` and `DataFrame` methods:
- - table
- - andrews_curves
- - autocorrelation_plot
- - bootstrap_plot
- - lag_plot
- - parallel_coordinates
- - radviz
- - scatter_matrix
-
-Use the code in pandas/plotting/_matplotlib/ and
-https://github.com/pyviz/hvplot as a reference on how to write a backend;
-a minimal sketch of the expected interface also follows this module.
-
-For the discussion about the API see
-https://github.com/pandas-dev/pandas/issues/26747.
-"""
-from pandas.plotting._core import (
- PlotAccessor,
- boxplot,
- boxplot_frame,
- boxplot_frame_groupby,
- hist_frame,
- hist_series,
-)
-from pandas.plotting._misc import (
- andrews_curves,
- autocorrelation_plot,
- bootstrap_plot,
- deregister as deregister_matplotlib_converters,
- lag_plot,
- parallel_coordinates,
- plot_params,
- radviz,
- register as register_matplotlib_converters,
- scatter_matrix,
- table,
-)
-
-__all__ = [
- "PlotAccessor",
- "boxplot",
- "boxplot_frame",
- "boxplot_frame_groupby",
- "hist_frame",
- "hist_series",
- "scatter_matrix",
- "radviz",
- "andrews_curves",
- "bootstrap_plot",
- "parallel_coordinates",
- "lag_plot",
- "autocorrelation_plot",
- "table",
- "plot_params",
- "register_matplotlib_converters",
- "deregister_matplotlib_converters",
-]
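A minimal sketch of the backend contract described in the module docstring above; the module name and all internals are hypothetical, and only the top-level ``plot`` callable is strictly required for ``Series.plot``/``DataFrame.plot``.

    # my_backend.py -- hypothetical module, either registered in the
    # "pandas_plotting_backends" entry-point group or selected by module name.

    def plot(data, kind, **kwargs):
        """Called for Series.plot() and DataFrame.plot()."""
        if kind != "line":
            raise NotImplementedError(f"kind={kind!r} is not supported")
        # Render ``data`` however the backend likes and return its own object.
        columns = list(getattr(data, "columns", [data.name]))
        return {"kind": kind, "columns": columns, "options": kwargs}

    # Optional hooks for the remaining pandas entry points listed above:
    def hist_series(series, **kwargs): ...
    def hist_frame(frame, **kwargs): ...
    def boxplot(frame, **kwargs): ...
    def boxplot_frame(frame, **kwargs): ...
    def boxplot_frame_groupby(grouped, **kwargs): ...
    def register(): ...
    def deregister(): ...

With such a module importable, ``pd.set_option("plotting.backend", "my_backend")`` routes subsequent ``.plot()`` calls to it.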
diff --git a/contrib/python/pandas/py3/pandas/plotting/_core.py b/contrib/python/pandas/py3/pandas/plotting/_core.py
deleted file mode 100644
index 51c95808506..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_core.py
+++ /dev/null
@@ -1,1864 +0,0 @@
-from __future__ import annotations
-
-import importlib
-import types
-from typing import (
- TYPE_CHECKING,
- Sequence,
-)
-
-from pandas._config import get_option
-
-from pandas._typing import IndexLabel
-from pandas.util._decorators import (
- Appender,
- Substitution,
-)
-
-from pandas.core.dtypes.common import (
- is_integer,
- is_list_like,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
-)
-
-from pandas.core.base import PandasObject
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
-
- from pandas import DataFrame
-
-
-def hist_series(
- self,
- by=None,
- ax=None,
- grid: bool = True,
- xlabelsize: int | None = None,
- xrot: float | None = None,
- ylabelsize: int | None = None,
- yrot: float | None = None,
- figsize: tuple[int, int] | None = None,
- bins: int | Sequence[int] = 10,
- backend: str | None = None,
- legend: bool = False,
- **kwargs,
-):
- """
- Draw histogram of the input series using matplotlib.
-
- Parameters
- ----------
- by : object, optional
- If passed, then used to form histograms for separate groups.
- ax : matplotlib axis object
- If not passed, uses gca().
- grid : bool, default True
- Whether to show axis grid lines.
- xlabelsize : int, default None
- If specified changes the x-axis label size.
- xrot : float, default None
- Rotation of x axis labels.
- ylabelsize : int, default None
- If specified changes the y-axis label size.
- yrot : float, default None
- Rotation of y axis labels.
- figsize : tuple, default None
- The size in inches of the figure to create.
- bins : int or sequence, default 10
- Number of histogram bins to be used. If an integer is given, bins + 1
- bin edges are calculated and returned. If bins is a sequence, gives
- bin edges, including left edge of first bin and right edge of last
- bin. In this case, bins is returned unmodified.
- backend : str, default None
- Backend to use instead of the backend specified in the option
- ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
- specify the ``plotting.backend`` for the whole session, set
- ``pd.options.plotting.backend``.
- legend : bool, default False
- Whether to show the legend.
-
- .. versionadded:: 1.1.0
-
- **kwargs
- To be passed to the actual plotting function.
-
- Returns
- -------
- matplotlib.AxesSubplot
- A histogram plot.
-
- See Also
- --------
- matplotlib.axes.Axes.hist : Plot a histogram using matplotlib.
- """
- plot_backend = _get_plot_backend(backend)
- return plot_backend.hist_series(
- self,
- by=by,
- ax=ax,
- grid=grid,
- xlabelsize=xlabelsize,
- xrot=xrot,
- ylabelsize=ylabelsize,
- yrot=yrot,
- figsize=figsize,
- bins=bins,
- legend=legend,
- **kwargs,
- )
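The docstring above ships no example, so here is a short usage sketch; the matplotlib backend is assumed to be installed and the data is synthetic.

    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.default_rng(0).normal(size=200))
    # Series.hist() forwards to hist_series() above.
    ax = s.hist(bins=20, grid=False)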
-
-
-def hist_frame(
- data: DataFrame,
- column: IndexLabel = None,
- by=None,
- grid: bool = True,
- xlabelsize: int | None = None,
- xrot: float | None = None,
- ylabelsize: int | None = None,
- yrot: float | None = None,
- ax=None,
- sharex: bool = False,
- sharey: bool = False,
- figsize: tuple[int, int] | None = None,
- layout: tuple[int, int] | None = None,
- bins: int | Sequence[int] = 10,
- backend: str | None = None,
- legend: bool = False,
- **kwargs,
-):
- """
- Make a histogram of the DataFrame's columns.
-
- A `histogram`_ is a representation of the distribution of data.
- This function calls :meth:`matplotlib.pyplot.hist`, on each series in
- the DataFrame, resulting in one histogram per column.
-
- .. _histogram: https://en.wikipedia.org/wiki/Histogram
-
- Parameters
- ----------
- data : DataFrame
- The pandas object holding the data.
- column : str or sequence, optional
- If passed, will be used to limit data to a subset of columns.
- by : object, optional
- If passed, then used to form histograms for separate groups.
- grid : bool, default True
- Whether to show axis grid lines.
- xlabelsize : int, default None
- If specified changes the x-axis label size.
- xrot : float, default None
- Rotation of x axis labels. For example, a value of 90 displays the
- x labels rotated 90 degrees clockwise.
- ylabelsize : int, default None
- If specified changes the y-axis label size.
- yrot : float, default None
- Rotation of y axis labels. For example, a value of 90 displays the
- y labels rotated 90 degrees clockwise.
- ax : Matplotlib axes object, default None
- The axes to plot the histogram on.
- sharex : bool, default True if ax is None else False
- In case subplots=True, share x axis and set some x axis labels to
- invisible; defaults to True if ax is None otherwise False if an ax
- is passed in.
- Note that passing in both an ax and sharex=True will alter all x axis
- labels for all subplots in a figure.
- sharey : bool, default False
- In case subplots=True, share y axis and set some y axis labels to
- invisible.
- figsize : tuple, optional
- The size in inches of the figure to create. Uses the value in
- `matplotlib.rcParams` by default.
- layout : tuple, optional
- Tuple of (rows, columns) for the layout of the histograms.
- bins : int or sequence, default 10
- Number of histogram bins to be used. If an integer is given, bins + 1
- bin edges are calculated and returned. If bins is a sequence, gives
- bin edges, including left edge of first bin and right edge of last
- bin. In this case, bins is returned unmodified.
-
- backend : str, default None
- Backend to use instead of the backend specified in the option
- ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
- specify the ``plotting.backend`` for the whole session, set
- ``pd.options.plotting.backend``.
-
- legend : bool, default False
- Whether to show the legend.
-
- .. versionadded:: 1.1.0
-
- **kwargs
- All other plotting keyword arguments to be passed to
- :meth:`matplotlib.pyplot.hist`.
-
- Returns
- -------
- matplotlib.AxesSubplot or numpy.ndarray of them
-
- See Also
- --------
- matplotlib.pyplot.hist : Plot a histogram using matplotlib.
-
- Examples
- --------
- This example draws a histogram based on the length and width of
- some animals, displayed in three bins
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame({
- ... 'length': [1.5, 0.5, 1.2, 0.9, 3],
- ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]
- ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse'])
- >>> hist = df.hist(bins=3)
- """
- plot_backend = _get_plot_backend(backend)
- return plot_backend.hist_frame(
- data,
- column=column,
- by=by,
- grid=grid,
- xlabelsize=xlabelsize,
- xrot=xrot,
- ylabelsize=ylabelsize,
- yrot=yrot,
- ax=ax,
- sharex=sharex,
- sharey=sharey,
- figsize=figsize,
- layout=layout,
- legend=legend,
- bins=bins,
- **kwargs,
- )
-
-
-_boxplot_doc = """
-Make a box plot from DataFrame columns.
-
-Make a box-and-whisker plot from DataFrame columns, optionally grouped
-by some other columns. A box plot is a method for graphically depicting
-groups of numerical data through their quartiles.
-The box extends from the Q1 to Q3 quartile values of the data,
-with a line at the median (Q2). The whiskers extend from the edges
-of the box to show the range of the data. By default, they extend no more than
-`1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box, ending at the farthest
-data point within that interval. Outliers are plotted as separate dots.
-
-For further details see
-Wikipedia's entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`_.
-
-Parameters
-----------
-%(data)s\
-column : str or list of str, optional
- Column name or list of names, or vector.
- Can be any valid input to :meth:`pandas.DataFrame.groupby`.
-by : str or array-like, optional
- Column in the DataFrame to :meth:`pandas.DataFrame.groupby`.
- One box plot will be drawn per value of the column(s) in `by`.
-ax : object of class matplotlib.axes.Axes, optional
- The matplotlib axes to be used by boxplot.
-fontsize : float or str
- Tick label font size in points or as a string (e.g., `large`).
-rot : float, default 0
- The rotation angle of labels (in degrees)
- with respect to the screen coordinate system.
-grid : bool, default True
- Setting this to True will show the grid.
-figsize : A tuple (width, height) in inches
- The size of the figure to create in matplotlib.
-layout : tuple (rows, columns), optional
- For example, (3, 5) will display the subplots
- using 3 rows and 5 columns, starting from the top-left.
-return_type : {'axes', 'dict', 'both'} or None, default 'axes'
- The kind of object to return. The default is ``axes``.
-
- * 'axes' returns the matplotlib axes the boxplot is drawn on.
- * 'dict' returns a dictionary whose values are the matplotlib
- Lines of the boxplot.
- * 'both' returns a namedtuple with the axes and dict.
- * when grouping with ``by``, a Series mapping columns to
- ``return_type`` is returned.
-
- If ``return_type`` is `None`, a NumPy array
- of axes with the same shape as ``layout`` is returned.
-%(backend)s\
-
-**kwargs
- All other plotting keyword arguments to be passed to
- :func:`matplotlib.pyplot.boxplot`.
-
-Returns
--------
-result
- See Notes.
-
-See Also
---------
-pandas.Series.plot.hist: Make a histogram.
-matplotlib.pyplot.boxplot : Matplotlib equivalent plot.
-
-Notes
------
-The return type depends on the `return_type` parameter:
-
-* 'axes' : object of class matplotlib.axes.Axes
-* 'dict' : dict of matplotlib.lines.Line2D objects
-* 'both' : a namedtuple with structure (ax, lines)
-
-For data grouped with ``by``, return a Series of the above or a numpy
-array:
-
-* :class:`~pandas.Series`
-* :class:`~numpy.array` (for ``return_type = None``)
-
-Use ``return_type='dict'`` when you want to tweak the appearance
-of the lines after plotting. In this case a dict containing the Lines
-making up the boxes, caps, fliers, medians, and whiskers is returned.
-
-Examples
---------
-
-Boxplots can be created for every column in the dataframe
-by ``df.boxplot()`` or indicating the columns to be used:
-
-.. plot::
- :context: close-figs
-
- >>> np.random.seed(1234)
- >>> df = pd.DataFrame(np.random.randn(10, 4),
- ... columns=['Col1', 'Col2', 'Col3', 'Col4'])
- >>> boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3']) # doctest: +SKIP
-
-Boxplots of variables distributions grouped by the values of a third
-variable can be created using the option ``by``. For instance:
-
-.. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame(np.random.randn(10, 2),
- ... columns=['Col1', 'Col2'])
- >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A',
- ... 'B', 'B', 'B', 'B', 'B'])
- >>> boxplot = df.boxplot(by='X')
-
-A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot
-in order to group the data by combination of the variables in the x-axis:
-
-.. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame(np.random.randn(10, 3),
- ... columns=['Col1', 'Col2', 'Col3'])
- >>> df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A',
- ... 'B', 'B', 'B', 'B', 'B'])
- >>> df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A',
- ... 'B', 'A', 'B', 'A', 'B'])
- >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y'])
-
-The layout of boxplot can be adjusted giving a tuple to ``layout``:
-
-.. plot::
- :context: close-figs
-
- >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
- ... layout=(2, 1))
-
-Additional formatting can be done to the boxplot, like suppressing the grid
-(``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``)
-or changing the fontsize (i.e. ``fontsize=15``):
-
-.. plot::
- :context: close-figs
-
- >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15) # doctest: +SKIP
-
-The parameter ``return_type`` can be used to select the type of element
-returned by `boxplot`. When ``return_type='axes'`` is selected,
-the matplotlib axes on which the boxplot is drawn are returned:
-
- >>> boxplot = df.boxplot(column=['Col1', 'Col2'], return_type='axes')
- >>> type(boxplot)
- <class 'matplotlib.axes._subplots.AxesSubplot'>
-
-When grouping with ``by``, a Series mapping columns to ``return_type``
-is returned:
-
- >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
- ... return_type='axes')
- >>> type(boxplot)
- <class 'pandas.core.series.Series'>
-
-If ``return_type`` is `None`, a NumPy array of axes with the same shape
-as ``layout`` is returned:
-
- >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X',
- ... return_type=None)
- >>> type(boxplot)
- <class 'numpy.ndarray'>
-"""
-
-_backend_doc = """\
-backend : str, default None
- Backend to use instead of the backend specified in the option
- ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
- specify the ``plotting.backend`` for the whole session, set
- ``pd.options.plotting.backend``.
-"""
-
-
-_bar_or_line_doc = """
- Parameters
- ----------
- x : label or position, optional
- Allows plotting of one column versus another. If not specified,
- the index of the DataFrame is used.
- y : label or position, optional
- Allows plotting of one column versus another. If not specified,
- all numerical columns are used.
- color : str, array-like, or dict, optional
- The color for each of the DataFrame's columns. Possible values are:
-
- - A single color string referred to by name, RGB or RGBA code,
- for instance 'red' or '#a98d19'.
-
- - A sequence of color strings referred to by name, RGB or RGBA
- code, which will be applied to the columns cyclically. For
- instance, with ['green', 'yellow'] the %(kind)ss for the columns will
- alternate between green and yellow. If there is only a single column
- to be plotted, then only the first color from the color list will be
- used.
-
- - A dict of the form {column name : color}, so that each column will be
- colored accordingly. For example, if your columns are called `a` and
- `b`, then passing {'a': 'green', 'b': 'red'} will color %(kind)ss for
- column `a` in green and %(kind)ss for column `b` in red.
-
- .. versionadded:: 1.1.0
-
- **kwargs
- Additional keyword arguments are documented in
- :meth:`DataFrame.plot`.
-
- Returns
- -------
- matplotlib.axes.Axes or np.ndarray of them
- An ndarray is returned with one :class:`matplotlib.axes.Axes`
- per column when ``subplots=True``.
-"""
-
-
-@Substitution(data="data : DataFrame\n The data to visualize.\n", backend="")
-@Appender(_boxplot_doc)
-def boxplot(
- data: DataFrame,
- column: str | list[str] | None = None,
- by: str | list[str] | None = None,
- ax: Axes | None = None,
- fontsize: float | str | None = None,
- rot: int = 0,
- grid: bool = True,
- figsize: tuple[float, float] | None = None,
- layout: tuple[int, int] | None = None,
- return_type: str | None = None,
- **kwargs,
-):
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.boxplot(
- data,
- column=column,
- by=by,
- ax=ax,
- fontsize=fontsize,
- rot=rot,
- grid=grid,
- figsize=figsize,
- layout=layout,
- return_type=return_type,
- **kwargs,
- )
-
-
-@Substitution(data="", backend=_backend_doc)
-@Appender(_boxplot_doc)
-def boxplot_frame(
- self,
- column=None,
- by=None,
- ax=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- figsize=None,
- layout=None,
- return_type=None,
- backend=None,
- **kwargs,
-):
- plot_backend = _get_plot_backend(backend)
- return plot_backend.boxplot_frame(
- self,
- column=column,
- by=by,
- ax=ax,
- fontsize=fontsize,
- rot=rot,
- grid=grid,
- figsize=figsize,
- layout=layout,
- return_type=return_type,
- **kwargs,
- )
-
-
-def boxplot_frame_groupby(
- grouped,
- subplots: bool = True,
- column=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- ax=None,
- figsize=None,
- layout=None,
- sharex: bool = False,
- sharey: bool = True,
- backend=None,
- **kwargs,
-):
- """
- Make box plots from DataFrameGroupBy data.
-
- Parameters
- ----------
- grouped : Grouped DataFrame
- subplots : bool
- * ``False`` - no subplots will be used
- * ``True`` - create a subplot for each group.
-
- column : column name or list of names, or vector
- Can be any valid input to groupby.
- fontsize : float or str
- Tick label font size in points or as a string (e.g., ``large``).
- rot : float, default 0
- The rotation angle of labels (in degrees).
- grid : bool, default True
- Setting this to True will show the grid.
- ax : Matplotlib axis object, default None
- The matplotlib axes to be used by boxplot.
- figsize : tuple (width, height) in inches
- The size of the figure to create in matplotlib.
- layout : tuple, optional
- The layout of the plot: (rows, columns).
- sharex : bool, default False
- Whether x-axes will be shared among subplots.
- sharey : bool, default True
- Whether y-axes will be shared among subplots.
- backend : str, default None
- Backend to use instead of the backend specified in the option
- ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
- specify the ``plotting.backend`` for the whole session, set
- ``pd.options.plotting.backend``.
- **kwargs
- All other plotting keyword arguments to be passed to
- matplotlib's boxplot function.
-
- Returns
- -------
- dict of key/value = group key/DataFrame.boxplot return value
- or DataFrame.boxplot return value in case ``subplots=False``
-
- Examples
- --------
- You can create boxplots for grouped data and show them as separate subplots:
-
- .. plot::
- :context: close-figs
-
- >>> import itertools
- >>> tuples = [t for t in itertools.product(range(1000), range(4))]
- >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
- >>> data = np.random.randn(len(index),4)
- >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)
- >>> grouped = df.groupby(level='lvl1')
- >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10)) # doctest: +SKIP
-
- The ``subplots=False`` option shows the boxplots in a single figure.
-
- .. plot::
- :context: close-figs
-
- >>> grouped.boxplot(subplots=False, rot=45, fontsize=12) # doctest: +SKIP
- """
- plot_backend = _get_plot_backend(backend)
- return plot_backend.boxplot_frame_groupby(
- grouped,
- subplots=subplots,
- column=column,
- fontsize=fontsize,
- rot=rot,
- grid=grid,
- ax=ax,
- figsize=figsize,
- layout=layout,
- sharex=sharex,
- sharey=sharey,
- **kwargs,
- )
-
-
-class PlotAccessor(PandasObject):
- """
- Make plots of Series or DataFrame.
-
- Uses the backend specified by the
- option ``plotting.backend``. By default, matplotlib is used.
-
- Parameters
- ----------
- data : Series or DataFrame
- The object for which the method is called.
- x : label or position, default None
- Only used if data is a DataFrame.
- y : label, position or list of label, positions, default None
- Allows plotting of one column versus another. Only used if data is a
- DataFrame.
- kind : str
- The kind of plot to produce:
-
- - 'line' : line plot (default)
- - 'bar' : vertical bar plot
- - 'barh' : horizontal bar plot
- - 'hist' : histogram
- - 'box' : boxplot
- - 'kde' : Kernel Density Estimation plot
- - 'density' : same as 'kde'
- - 'area' : area plot
- - 'pie' : pie plot
- - 'scatter' : scatter plot (DataFrame only)
- - 'hexbin' : hexbin plot (DataFrame only)
- ax : matplotlib axes object, default None
- An axes of the current figure.
- subplots : bool or sequence of iterables, default False
- Whether to group columns into subplots:
-
- - ``False`` : No subplots will be used
- - ``True`` : Make separate subplots for each column.
- - sequence of iterables of column labels: Create a subplot for each
- group of columns. For example `[('a', 'c'), ('b', 'd')]` will
- create 2 subplots: one with columns 'a' and 'c', and one
- with columns 'b' and 'd'. Remaining columns that aren't specified
- will be plotted in additional subplots (one per column).
-
- .. versionadded:: 1.5.0
-
- sharex : bool, default True if ax is None else False
- In case ``subplots=True``, share x axis and set some x axis labels
- to invisible; defaults to True if ax is None otherwise False if
- an ax is passed in. Be aware that passing in both an ax and
- ``sharex=True`` will alter all x axis labels for all axes in a figure.
- sharey : bool, default False
- In case ``subplots=True``, share y axis and set some y axis labels to invisible.
- layout : tuple, optional
- (rows, columns) for the layout of subplots.
- figsize : a tuple (width, height) in inches
- Size of a figure object.
- use_index : bool, default True
- Use index as ticks for x axis.
- title : str or list
- Title to use for the plot. If a string is passed, print the string
- at the top of the figure. If a list is passed and `subplots` is
- True, print each item in the list above the corresponding subplot.
- grid : bool, default None (matlab style default)
- Axis grid lines.
- legend : bool or {'reverse'}
- Place legend on axis subplots.
- style : list or dict
- The matplotlib line style per column.
- logx : bool or 'sym', default False
- Use log scaling or symlog scaling on x axis.
-
- logy : bool or 'sym', default False
- Use log scaling or symlog scaling on y axis.
-
- loglog : bool or 'sym', default False
- Use log scaling or symlog scaling on both x and y axes.
-
- xticks : sequence
- Values to use for the xticks.
- yticks : sequence
- Values to use for the yticks.
- xlim : 2-tuple/list
- Set the x limits of the current axes.
- ylim : 2-tuple/list
- Set the y limits of the current axes.
- xlabel : label, optional
- Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the
- x-column name for planar plots.
-
- .. versionadded:: 1.1.0
-
- .. versionchanged:: 1.2.0
-
- Now applicable to planar plots (`scatter`, `hexbin`).
-
- .. versionchanged:: 2.0.0
-
- Now applicable to histograms.
-
- ylabel : label, optional
- Name to use for the ylabel on y-axis. Default will show no ylabel, or the
- y-column name for planar plots.
-
- .. versionadded:: 1.1.0
-
- .. versionchanged:: 1.2.0
-
- Now applicable to planar plots (`scatter`, `hexbin`).
-
- .. versionchanged:: 2.0.0
-
- Now applicable to histograms.
-
- rot : float, default None
- Rotation for ticks (xticks for vertical, yticks for horizontal
- plots).
- fontsize : float, default None
- Font size for xticks and yticks.
- colormap : str or matplotlib colormap object, default None
- Colormap to select colors from. If string, load colormap with that
- name from matplotlib.
- colorbar : bool, optional
- If True, plot colorbar (only relevant for 'scatter' and 'hexbin'
- plots).
- position : float
- Specify relative alignments for bar plot layout.
- From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
- (center).
- table : bool, Series or DataFrame, default False
- If True, draw a table using the data in the DataFrame and the data
- will be transposed to meet matplotlib's default layout.
- If a Series or DataFrame is passed, use passed data to draw a
- table.
- yerr : DataFrame, Series, array-like, dict and str
- See :ref:`Plotting with Error Bars <visualization.errorbars>` for
- detail.
- xerr : DataFrame, Series, array-like, dict and str
- Equivalent to yerr.
- stacked : bool, default False in line and bar plots, and True in area plot
- If True, create stacked plot.
- secondary_y : bool or sequence, default False
- Whether to plot on the secondary y-axis if a list/tuple, which
- columns to plot on secondary y-axis.
- mark_right : bool, default True
- When using a secondary_y axis, automatically mark the column
- labels with "(right)" in the legend.
- include_bool : bool, default False
- If True, boolean values can be plotted.
- backend : str, default None
- Backend to use instead of the backend specified in the option
- ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
- specify the ``plotting.backend`` for the whole session, set
- ``pd.options.plotting.backend``.
- **kwargs
- Options to pass to matplotlib plotting method.
-
- Returns
- -------
- :class:`matplotlib.axes.Axes` or numpy.ndarray of them
- If the backend is not the default matplotlib one, the return value
- will be the object returned by the backend.
-
- Notes
- -----
- - See matplotlib documentation online for more on this subject
- - If `kind` = 'bar' or 'barh', you can specify relative alignments
- for bar plot layout by `position` keyword.
- From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
- (center)
- """
-
- _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box")
- _series_kinds = ("pie",)
- _dataframe_kinds = ("scatter", "hexbin")
- _kind_aliases = {"density": "kde"}
- _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds
-
- def __init__(self, data) -> None:
- self._parent = data
-
- @staticmethod
- def _get_call_args(backend_name, data, args, kwargs):
- """
- This function makes calls to this accessor `__call__` method compatible
- with the previous `SeriesPlotMethods.__call__` and
- `DataFramePlotMethods.__call__`. Those had slightly different
- signatures, since `DataFramePlotMethods` accepted `x` and `y`
- parameters.
- """
- if isinstance(data, ABCSeries):
- arg_def = [
- ("kind", "line"),
- ("ax", None),
- ("figsize", None),
- ("use_index", True),
- ("title", None),
- ("grid", None),
- ("legend", False),
- ("style", None),
- ("logx", False),
- ("logy", False),
- ("loglog", False),
- ("xticks", None),
- ("yticks", None),
- ("xlim", None),
- ("ylim", None),
- ("rot", None),
- ("fontsize", None),
- ("colormap", None),
- ("table", False),
- ("yerr", None),
- ("xerr", None),
- ("label", None),
- ("secondary_y", False),
- ("xlabel", None),
- ("ylabel", None),
- ]
- elif isinstance(data, ABCDataFrame):
- arg_def = [
- ("x", None),
- ("y", None),
- ("kind", "line"),
- ("ax", None),
- ("subplots", False),
- ("sharex", None),
- ("sharey", False),
- ("layout", None),
- ("figsize", None),
- ("use_index", True),
- ("title", None),
- ("grid", None),
- ("legend", True),
- ("style", None),
- ("logx", False),
- ("logy", False),
- ("loglog", False),
- ("xticks", None),
- ("yticks", None),
- ("xlim", None),
- ("ylim", None),
- ("rot", None),
- ("fontsize", None),
- ("colormap", None),
- ("table", False),
- ("yerr", None),
- ("xerr", None),
- ("secondary_y", False),
- ("xlabel", None),
- ("ylabel", None),
- ]
- else:
- raise TypeError(
- f"Called plot accessor for type {type(data).__name__}, "
- "expected Series or DataFrame"
- )
-
- if args and isinstance(data, ABCSeries):
- positional_args = str(args)[1:-1]
- keyword_args = ", ".join(
- [f"{name}={repr(value)}" for (name, _), value in zip(arg_def, args)]
- )
- msg = (
- "`Series.plot()` should not be called with positional "
- "arguments, only keyword arguments. The order of "
- "positional arguments will change in the future. "
- f"Use `Series.plot({keyword_args})` instead of "
- f"`Series.plot({positional_args})`."
- )
- raise TypeError(msg)
-
- pos_args = {name: value for (name, _), value in zip(arg_def, args)}
- if backend_name == "pandas.plotting._matplotlib":
- kwargs = dict(arg_def, **pos_args, **kwargs)
- else:
- kwargs = dict(pos_args, **kwargs)
-
- x = kwargs.pop("x", None)
- y = kwargs.pop("y", None)
- kind = kwargs.pop("kind", "line")
- return x, y, kind, kwargs
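To make the positional-to-keyword mapping above concrete (a sketch; the column names are hypothetical and the matplotlib backend is assumed installed): for a DataFrame, positional arguments fill ``arg_def`` in order, so the two calls below are equivalent, while ``Series.plot()`` rejects positional arguments with the ``TypeError`` shown above.

    import pandas as pd

    df = pd.DataFrame({"day": [1, 2, 3], "sales": [3, 2, 5]})

    df.plot("day", "sales", "bar")             # positional: x, y, kind
    df.plot(x="day", y="sales", kind="bar")    # equivalent keyword form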
-
- def __call__(self, *args, **kwargs):
- plot_backend = _get_plot_backend(kwargs.pop("backend", None))
-
- x, y, kind, kwargs = self._get_call_args(
- plot_backend.__name__, self._parent, args, kwargs
- )
-
- kind = self._kind_aliases.get(kind, kind)
-
- # when using another backend, get out of the way
- if plot_backend.__name__ != "pandas.plotting._matplotlib":
- return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs)
-
- if kind not in self._all_kinds:
- raise ValueError(f"{kind} is not a valid plot kind")
-
- # The original data structure can be transformed before being passed to
- # the backend. For example, for a DataFrame it is common to set the index
- # as the `x` parameter, and return a Series with the parameter `y` as
- # values.
- data = self._parent.copy()
-
- if isinstance(data, ABCSeries):
- kwargs["reuse_plot"] = True
-
- if kind in self._dataframe_kinds:
- if isinstance(data, ABCDataFrame):
- return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs)
- else:
- raise ValueError(f"plot kind {kind} can only be used for data frames")
- elif kind in self._series_kinds:
- if isinstance(data, ABCDataFrame):
- if y is None and kwargs.get("subplots") is False:
- raise ValueError(
- f"{kind} requires either y column or 'subplots=True'"
- )
- if y is not None:
- if is_integer(y) and not data.columns._holds_integer():
- y = data.columns[y]
- # converted to series actually. copy to not modify
- data = data[y].copy()
- data.index.name = y
- elif isinstance(data, ABCDataFrame):
- data_cols = data.columns
- if x is not None:
- if is_integer(x) and not data.columns._holds_integer():
- x = data_cols[x]
- elif not isinstance(data[x], ABCSeries):
- raise ValueError("x must be a label or position")
- data = data.set_index(x)
- if y is not None:
- # check if we have y as int or list of ints
- int_ylist = is_list_like(y) and all(is_integer(c) for c in y)
- int_y_arg = is_integer(y) or int_ylist
- if int_y_arg and not data.columns._holds_integer():
- y = data_cols[y]
-
- label_kw = kwargs["label"] if "label" in kwargs else False
- for kw in ["xerr", "yerr"]:
- if kw in kwargs and (
- isinstance(kwargs[kw], str) or is_integer(kwargs[kw])
- ):
- try:
- kwargs[kw] = data[kwargs[kw]]
- except (IndexError, KeyError, TypeError):
- pass
-
- # don't overwrite
- data = data[y].copy()
-
- if isinstance(data, ABCSeries):
- label_name = label_kw or y
- data.name = label_name
- else:
- match = is_list_like(label_kw) and len(label_kw) == len(y)
- if label_kw and not match:
- raise ValueError(
- "label should be list-like and same length as y"
- )
- label_name = label_kw or data.columns
- data.columns = label_name
-
- return plot_backend.plot(data, kind=kind, **kwargs)
-
- __call__.__doc__ = __doc__
-
- @Appender(
- """
- See Also
- --------
- matplotlib.pyplot.plot : Plot y versus x as lines and/or markers.
-
- Examples
- --------
-
- .. plot::
- :context: close-figs
-
- >>> s = pd.Series([1, 3, 2])
- >>> s.plot.line()
- <AxesSubplot: >
-
- .. plot::
- :context: close-figs
-
- The following example shows the populations for some animals
- over the years.
-
- >>> df = pd.DataFrame({
- ... 'pig': [20, 18, 489, 675, 1776],
- ... 'horse': [4, 25, 281, 600, 1900]
- ... }, index=[1990, 1997, 2003, 2009, 2014])
- >>> lines = df.plot.line()
-
- .. plot::
- :context: close-figs
-
- An example with subplots, so an array of axes is returned.
-
- >>> axes = df.plot.line(subplots=True)
- >>> type(axes)
- <class 'numpy.ndarray'>
-
- .. plot::
- :context: close-figs
-
- Let's repeat the same example, but specifying colors for
- each column (in this case, for each animal).
-
- >>> axes = df.plot.line(
- ... subplots=True, color={"pig": "pink", "horse": "#742802"}
- ... )
-
- .. plot::
- :context: close-figs
-
- The following example shows the relationship between both
- populations.
-
- >>> lines = df.plot.line(x='pig', y='horse')
- """
- )
- @Substitution(kind="line")
- @Appender(_bar_or_line_doc)
- def line(self, x=None, y=None, **kwargs) -> PlotAccessor:
- """
- Plot Series or DataFrame as lines.
-
- This function is useful to plot lines using DataFrame's values
- as coordinates.
- """
- return self(kind="line", x=x, y=y, **kwargs)
-
- @Appender(
- """
- See Also
- --------
- DataFrame.plot.barh : Horizontal bar plot.
- DataFrame.plot : Make plots of a DataFrame.
- matplotlib.pyplot.bar : Make a bar plot with matplotlib.
-
- Examples
- --------
- Basic plot.
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]})
- >>> ax = df.plot.bar(x='lab', y='val', rot=0)
-
- Plot a whole dataframe to a bar plot. Each column is assigned a
- distinct color, and each row is nested in a group along the
- horizontal axis.
-
- .. plot::
- :context: close-figs
-
- >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
- >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
- >>> index = ['snail', 'pig', 'elephant',
- ... 'rabbit', 'giraffe', 'coyote', 'horse']
- >>> df = pd.DataFrame({'speed': speed,
- ... 'lifespan': lifespan}, index=index)
- >>> ax = df.plot.bar(rot=0)
-
- Plot stacked bar charts for the DataFrame
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.bar(stacked=True)
-
- Instead of nesting, the figure can be split by column with
- ``subplots=True``. In this case, a :class:`numpy.ndarray` of
- :class:`matplotlib.axes.Axes` are returned.
-
- .. plot::
- :context: close-figs
-
- >>> axes = df.plot.bar(rot=0, subplots=True)
- >>> axes[1].legend(loc=2) # doctest: +SKIP
-
- If you don't like the default colours, you can specify how you'd
- like each column to be colored.
-
- .. plot::
- :context: close-figs
-
- >>> axes = df.plot.bar(
- ... rot=0, subplots=True, color={"speed": "red", "lifespan": "green"}
- ... )
- >>> axes[1].legend(loc=2) # doctest: +SKIP
-
- Plot a single column.
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.bar(y='speed', rot=0)
-
- Plot only selected categories for the DataFrame.
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.bar(x='lifespan', rot=0)
- """
- )
- @Substitution(kind="bar")
- @Appender(_bar_or_line_doc)
- def bar( # pylint: disable=disallowed-name
- self, x=None, y=None, **kwargs
- ) -> PlotAccessor:
- """
- Vertical bar plot.
-
- A bar plot is a plot that presents categorical data with
- rectangular bars with lengths proportional to the values that they
- represent. A bar plot shows comparisons among discrete categories. One
- axis of the plot shows the specific categories being compared, and the
- other axis represents a measured value.
- """
- return self(kind="bar", x=x, y=y, **kwargs)
-
- @Appender(
- """
- See Also
- --------
- DataFrame.plot.bar: Vertical bar plot.
- DataFrame.plot : Make plots of DataFrame using matplotlib.
- matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.
-
- Examples
- --------
- Basic example
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
- >>> ax = df.plot.barh(x='lab', y='val')
-
- Plot a whole DataFrame to a horizontal bar plot
-
- .. plot::
- :context: close-figs
-
- >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
- >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
- >>> index = ['snail', 'pig', 'elephant',
- ... 'rabbit', 'giraffe', 'coyote', 'horse']
- >>> df = pd.DataFrame({'speed': speed,
- ... 'lifespan': lifespan}, index=index)
- >>> ax = df.plot.barh()
-
- Plot stacked barh charts for the DataFrame
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.barh(stacked=True)
-
- We can specify colors for each column
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.barh(color={"speed": "red", "lifespan": "green"})
-
- Plot a column of the DataFrame to a horizontal bar plot
-
- .. plot::
- :context: close-figs
-
- >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
- >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
- >>> index = ['snail', 'pig', 'elephant',
- ... 'rabbit', 'giraffe', 'coyote', 'horse']
- >>> df = pd.DataFrame({'speed': speed,
- ... 'lifespan': lifespan}, index=index)
- >>> ax = df.plot.barh(y='speed')
-
- Plot DataFrame versus the desired column
-
- .. plot::
- :context: close-figs
-
- >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
- >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
- >>> index = ['snail', 'pig', 'elephant',
- ... 'rabbit', 'giraffe', 'coyote', 'horse']
- >>> df = pd.DataFrame({'speed': speed,
- ... 'lifespan': lifespan}, index=index)
- >>> ax = df.plot.barh(x='lifespan')
- """
- )
- @Substitution(kind="bar")
- @Appender(_bar_or_line_doc)
- def barh(self, x=None, y=None, **kwargs) -> PlotAccessor:
- """
- Make a horizontal bar plot.
-
- A horizontal bar plot is a plot that presents quantitative data with
- rectangular bars with lengths proportional to the values that they
- represent. A bar plot shows comparisons among discrete categories. One
- axis of the plot shows the specific categories being compared, and the
- other axis represents a measured value.
- """
- return self(kind="barh", x=x, y=y, **kwargs)
-
- def box(self, by=None, **kwargs) -> PlotAccessor:
- r"""
- Make a box plot of the DataFrame columns.
-
- A box plot is a method for graphically depicting groups of numerical
- data through their quartiles.
- The box extends from the Q1 to Q3 quartile values of the data,
- with a line at the median (Q2). The whiskers extend from the edges
- of the box to show the range of the data. The position of the whiskers
- is set by default to 1.5*IQR (IQR = Q3 - Q1) from the edges of the
- box. Outlier points are those past the end of the whiskers.
-
- For further details see Wikipedia's
- entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`__.
-
- A consideration when using this chart is that the box and the whiskers
- can overlap, which is very common when plotting small sets of data.
-
- Parameters
- ----------
- by : str or sequence
- Column in the DataFrame to group by.
-
- .. versionchanged:: 1.4.0
-
- Previously, `by` was silently ignored and made no groupings.
-
- **kwargs
- Additional keywords are documented in
- :meth:`DataFrame.plot`.
-
- Returns
- -------
- :class:`matplotlib.axes.Axes` or numpy.ndarray of them
-
- See Also
- --------
- DataFrame.boxplot: Another method to draw a box plot.
- Series.plot.box: Draw a box plot from a Series object.
- matplotlib.pyplot.boxplot: Draw a box plot in matplotlib.
-
- Examples
- --------
- Draw a box plot from a DataFrame with four columns of randomly
- generated data.
-
- .. plot::
- :context: close-figs
-
- >>> data = np.random.randn(25, 4)
- >>> df = pd.DataFrame(data, columns=list('ABCD'))
- >>> ax = df.plot.box()
-
- You can also generate groupings if you specify the `by` parameter (which
- can take a column name, or a list or tuple of column names):
-
- .. versionchanged:: 1.4.0
-
- .. plot::
- :context: close-figs
-
- >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
- >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
- >>> ax = df.plot.box(column="age", by="gender", figsize=(10, 8))
- """
- return self(kind="box", by=by, **kwargs)
-
- def hist(self, by=None, bins: int = 10, **kwargs) -> PlotAccessor:
- """
- Draw one histogram of the DataFrame's columns.
-
- A histogram is a representation of the distribution of data.
- This function groups the values of all given Series in the DataFrame
- into bins and draws all bins in one :class:`matplotlib.axes.Axes`.
- This is useful when the DataFrame's Series are in a similar scale.
-
- Parameters
- ----------
- by : str or sequence, optional
- Column in the DataFrame to group by.
-
- .. versionchanged:: 1.4.0
-
- Previously, `by` was silently ignored and made no groupings.
-
- bins : int, default 10
- Number of histogram bins to be used.
- **kwargs
- Additional keyword arguments are documented in
- :meth:`DataFrame.plot`.
-
- Returns
- -------
- :class:`matplotlib.AxesSubplot`
- Return a histogram plot.
-
- See Also
- --------
- DataFrame.hist : Draw histograms per DataFrame's Series.
- Series.hist : Draw a histogram with Series' data.
-
- Examples
- --------
- When we roll a die 6000 times, we expect to get each value around 1000
- times. But when we roll two dice and sum the result, the distribution
- is going to be quite different. A histogram illustrates those
- distributions.
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame(
- ... np.random.randint(1, 7, 6000),
- ... columns = ['one'])
- >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)
- >>> ax = df.plot.hist(bins=12, alpha=0.5)
-
- A grouped histogram can be generated by providing the parameter `by` (which
- can be a column name, or a list of column names):
-
- .. plot::
- :context: close-figs
-
- >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
- >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
- >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8))
- """
- return self(kind="hist", by=by, bins=bins, **kwargs)
-
- def kde(self, bw_method=None, ind=None, **kwargs) -> PlotAccessor:
- """
- Generate Kernel Density Estimate plot using Gaussian kernels.
-
- In statistics, `kernel density estimation`_ (KDE) is a non-parametric
- way to estimate the probability density function (PDF) of a random
- variable. This function uses Gaussian kernels and includes automatic
- bandwidth determination.
-
- .. _kernel density estimation:
- https://en.wikipedia.org/wiki/Kernel_density_estimation
-
- Parameters
- ----------
- bw_method : str, scalar or callable, optional
- The method used to calculate the estimator bandwidth. This can be
- 'scott', 'silverman', a scalar constant or a callable.
- If None (default), 'scott' is used.
- See :class:`scipy.stats.gaussian_kde` for more information.
- ind : NumPy array or int, optional
- Evaluation points for the estimated PDF. If None (default),
- 1000 equally spaced points are used. If `ind` is a NumPy array, the
- KDE is evaluated at the points passed. If `ind` is an integer,
- `ind` number of equally spaced points are used.
- **kwargs
- Additional keyword arguments are documented in
- :meth:`DataFrame.plot`.
-
- Returns
- -------
- matplotlib.axes.Axes or numpy.ndarray of them
-
- See Also
- --------
- scipy.stats.gaussian_kde : Representation of a kernel-density
- estimate using Gaussian kernels. This is the function used
- internally to estimate the PDF.
-
- Examples
- --------
- Given a Series of points randomly sampled from an unknown
- distribution, estimate its PDF using KDE with automatic
- bandwidth determination and plot the results, evaluating them at
- 1000 equally spaced points (default):
-
- .. plot::
- :context: close-figs
-
- >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5])
- >>> ax = s.plot.kde()
-
- A scalar bandwidth can be specified. Using a small bandwidth value can
- lead to over-fitting, while using a large bandwidth value may result
- in under-fitting:
-
- .. plot::
- :context: close-figs
-
- >>> ax = s.plot.kde(bw_method=0.3)
-
- .. plot::
- :context: close-figs
-
- >>> ax = s.plot.kde(bw_method=3)
-
- Finally, the `ind` parameter determines the evaluation points for the
- plot of the estimated PDF:
-
- .. plot::
- :context: close-figs
-
- >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5])
-
- For DataFrame, it works in the same way:
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame({
- ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5],
- ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6],
- ... })
- >>> ax = df.plot.kde()
-
- A scalar bandwidth can be specified. Using a small bandwidth value can
- lead to over-fitting, while using a large bandwidth value may result
- in under-fitting:
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.kde(bw_method=0.3)
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.kde(bw_method=3)
-
- Finally, the `ind` parameter determines the evaluation points for the
- plot of the estimated PDF:
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])
- """
- return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)
-
- density = kde
-
- def area(self, x=None, y=None, stacked: bool = True, **kwargs) -> PlotAccessor:
- """
- Draw a stacked area plot.
-
- An area plot displays quantitative data visually.
- This function wraps the matplotlib area function.
-
- Parameters
- ----------
- x : label or position, optional
- Coordinates for the X axis. By default uses the index.
- y : label or position, optional
- Column to plot. By default uses all columns.
- stacked : bool, default True
- Area plots are stacked by default. Set to False to create an
- unstacked plot.
- **kwargs
- Additional keyword arguments are documented in
- :meth:`DataFrame.plot`.
-
- Returns
- -------
- matplotlib.axes.Axes or numpy.ndarray
- Area plot, or array of area plots if subplots is True.
-
- See Also
- --------
- DataFrame.plot : Make plots of DataFrame using matplotlib / pylab.
-
- Examples
- --------
- Draw an area plot based on basic business metrics:
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame({
- ... 'sales': [3, 2, 3, 9, 10, 6],
- ... 'signups': [5, 5, 6, 12, 14, 13],
- ... 'visits': [20, 42, 28, 62, 81, 50],
- ... }, index=pd.date_range(start='2018/01/01', end='2018/07/01',
- ... freq='M'))
- >>> ax = df.plot.area()
-
- Area plots are stacked by default. To produce an unstacked plot,
- pass ``stacked=False``:
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.area(stacked=False)
-
- Draw an area plot for a single column:
-
- .. plot::
- :context: close-figs
-
- >>> ax = df.plot.area(y='sales')
-
- Draw with a different `x`:
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame({
- ... 'sales': [3, 2, 3],
- ... 'visits': [20, 42, 28],
- ... 'day': [1, 2, 3],
- ... })
- >>> ax = df.plot.area(x='day')
- """
- return self(kind="area", x=x, y=y, stacked=stacked, **kwargs)
-
- def pie(self, **kwargs) -> PlotAccessor:
- """
- Generate a pie plot.
-
- A pie plot is a proportional representation of the numerical data in a
- column. This function wraps :meth:`matplotlib.pyplot.pie` for the
- specified column. If no column reference is passed and
- ``subplots=True``, a pie plot is drawn for each numerical column
- independently.
-
- Parameters
- ----------
- y : int or label, optional
- Label or position of the column to plot.
- If not provided, ``subplots=True`` argument must be passed.
- **kwargs
- Keyword arguments to pass on to :meth:`DataFrame.plot`.
-
- Returns
- -------
- matplotlib.axes.Axes or np.ndarray of them
- A NumPy array is returned when `subplots` is True.
-
- See Also
- --------
- Series.plot.pie : Generate a pie plot for a Series.
- DataFrame.plot : Make plots of a DataFrame.
-
- Examples
- --------
- In the example below we have a DataFrame with information about the
- planets' mass and radius. We pass the 'mass' column to the
- pie function to get a pie plot.
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97],
- ... 'radius': [2439.7, 6051.8, 6378.1]},
- ... index=['Mercury', 'Venus', 'Earth'])
- >>> plot = df.plot.pie(y='mass', figsize=(5, 5))
-
- .. plot::
- :context: close-figs
-
- >>> plot = df.plot.pie(subplots=True, figsize=(11, 6))
- """
- if (
- isinstance(self._parent, ABCDataFrame)
- and kwargs.get("y", None) is None
- and not kwargs.get("subplots", False)
- ):
- raise ValueError("pie requires either y column or 'subplots=True'")
- return self(kind="pie", **kwargs)
-
- def scatter(self, x, y, s=None, c=None, **kwargs) -> PlotAccessor:
- """
- Create a scatter plot with varying marker point size and color.
-
- The coordinates of each point are defined by two dataframe columns and
- filled circles are used to represent each point. This kind of plot is
- useful to see complex correlations between two variables. Points could
- be for instance natural 2D coordinates like longitude and latitude in
- a map or, in general, any pair of metrics that can be plotted against
- each other.
-
- Parameters
- ----------
- x : int or str
- The column name or column position to be used as horizontal
- coordinates for each point.
- y : int or str
- The column name or column position to be used as vertical
- coordinates for each point.
- s : str, scalar or array-like, optional
- The size of each point. Possible values are:
-
- - A string with the name of the column to be used for marker's size.
-
- - A single scalar so all points have the same size.
-
- - A sequence of scalars, which will be applied to the point sizes
- cyclically. For instance, when passing [2, 14], point sizes will
- alternate between 2 and 14.
-
- .. versionchanged:: 1.1.0
-
- c : str, int or array-like, optional
- The color of each point. Possible values are:
-
- - A single color string referred to by name, RGB or RGBA code,
- for instance 'red' or '#a98d19'.
-
- - A sequence of color strings referred to by name, RGB or RGBA
- code, which will be applied to the points cyclically. For
- instance, with ['green', 'yellow'] the points will alternate between
- green and yellow.
-
- - A column name or position whose values will be used to color the
- marker points according to a colormap.
-
- **kwargs
- Keyword arguments to pass on to :meth:`DataFrame.plot`.
-
- Returns
- -------
- :class:`matplotlib.axes.Axes` or numpy.ndarray of them
-
- See Also
- --------
- matplotlib.pyplot.scatter : Scatter plot using multiple input data
- formats.
-
- Examples
- --------
- Let's see how to draw a scatter plot using coordinates from the values
- in a DataFrame's columns.
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
- ... [6.4, 3.2, 1], [5.9, 3.0, 2]],
- ... columns=['length', 'width', 'species'])
- >>> ax1 = df.plot.scatter(x='length',
- ... y='width',
- ... c='DarkBlue')
-
- And now with the color determined by a column as well.
-
- .. plot::
- :context: close-figs
-
- >>> ax2 = df.plot.scatter(x='length',
- ... y='width',
- ... c='species',
- ... colormap='viridis')
- """
- return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs)
-
- def hexbin(
- self, x, y, C=None, reduce_C_function=None, gridsize=None, **kwargs
- ) -> PlotAccessor:
- """
- Generate a hexagonal binning plot.
-
- Generate a hexagonal binning plot of `x` versus `y`. If `C` is `None`
- (the default), this is a histogram of the number of occurrences
- of the observations at ``(x[i], y[i])``.
-
- If `C` is specified, it specifies values at the given coordinates
- ``(x[i], y[i])``. These values are accumulated for each hexagonal
- bin and then reduced according to `reduce_C_function`, which
- defaults to NumPy's mean function (:meth:`numpy.mean`).
- (If `C` is specified, it must also be a 1-D sequence
- of the same length as `x` and `y`, or a column label.)
-
- Parameters
- ----------
- x : int or str
- The column label or position for x points.
- y : int or str
- The column label or position for y points.
- C : int or str, optional
- The column label or position for the value of `(x, y)` point.
- reduce_C_function : callable, default `np.mean`
- Function of one argument that reduces all the values in a bin to
- a single number (e.g. `np.mean`, `np.max`, `np.sum`, `np.std`).
- gridsize : int or tuple of (int, int), default 100
- The number of hexagons in the x-direction.
- The corresponding number of hexagons in the y-direction is
- chosen in a way that the hexagons are approximately regular.
- Alternatively, gridsize can be a tuple with two elements
- specifying the number of hexagons in the x-direction and the
- y-direction.
- **kwargs
- Additional keyword arguments are documented in
- :meth:`DataFrame.plot`.
-
- Returns
- -------
- matplotlib.AxesSubplot
- The matplotlib ``Axes`` on which the hexbin is plotted.
-
- See Also
- --------
- DataFrame.plot : Make plots of a DataFrame.
- matplotlib.pyplot.hexbin : Hexagonal binning plot using matplotlib,
- the matplotlib function that is used under the hood.
-
- Examples
- --------
- The following examples are generated with random data from
- a normal distribution.
-
- .. plot::
- :context: close-figs
-
- >>> n = 10000
- >>> df = pd.DataFrame({'x': np.random.randn(n),
- ... 'y': np.random.randn(n)})
- >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20)
-
- The next example uses `C` and `np.sum` as `reduce_C_function`.
- Note that `'observations'` values range from 1 to 5 but the resulting
- plot shows values above 25. This is because of the
- `reduce_C_function`.
-
- .. plot::
- :context: close-figs
-
- >>> n = 500
- >>> df = pd.DataFrame({
- ... 'coord_x': np.random.uniform(-3, 3, size=n),
- ... 'coord_y': np.random.uniform(30, 50, size=n),
- ... 'observations': np.random.randint(1,5, size=n)
- ... })
- >>> ax = df.plot.hexbin(x='coord_x',
- ... y='coord_y',
- ... C='observations',
- ... reduce_C_function=np.sum,
- ... gridsize=10,
- ... cmap="viridis")
- """
- if reduce_C_function is not None:
- kwargs["reduce_C_function"] = reduce_C_function
- if gridsize is not None:
- kwargs["gridsize"] = gridsize
-
- return self(kind="hexbin", x=x, y=y, C=C, **kwargs)
-
-
-_backends: dict[str, types.ModuleType] = {}
-
-
-def _load_backend(backend: str) -> types.ModuleType:
- """
- Load a pandas plotting backend.
-
- Parameters
- ----------
- backend : str
- The identifier for the backend. Either an entrypoint item registered
- with importlib.metadata, "matplotlib", or a module name.
-
- Returns
- -------
- types.ModuleType
- The imported backend.
- """
- from importlib.metadata import entry_points
-
- if backend == "matplotlib":
- # Because matplotlib is an optional dependency and first-party backend,
- # we need to attempt an import here to raise an ImportError if needed.
- try:
- module = importlib.import_module("pandas.plotting._matplotlib")
- except ImportError:
- raise ImportError(
- "matplotlib is required for plotting when the "
- 'default backend "matplotlib" is selected.'
- ) from None
- return module
-
- found_backend = False
-
- eps = entry_points()
- key = "pandas_plotting_backends"
- # entry_points lost dict API ~ PY 3.10
- # https://github.com/python/importlib_metadata/issues/298
- if hasattr(eps, "select"):
- entry = eps.select(group=key) # pyright: ignore[reportGeneralTypeIssues]
- else:
- entry = eps.get(key, ())
- for entry_point in entry:
- found_backend = entry_point.name == backend
- if found_backend:
- module = entry_point.load()
- break
-
- if not found_backend:
- # Fall back to unregistered, module name approach.
- try:
- module = importlib.import_module(backend)
- found_backend = True
- except ImportError:
- # We re-raise later on.
- pass
-
- if found_backend:
- if hasattr(module, "plot"):
- # Validate that the interface is implemented when the option is set,
- # rather than at plot time.
- return module
-
- raise ValueError(
- f"Could not find plotting backend '{backend}'. Ensure that you've "
- f"installed the package providing the '{backend}' entrypoint, or that "
- "the package has a top-level `.plot` method."
- )
-
-
-def _get_plot_backend(backend: str | None = None):
- """
- Return the plotting backend to use (e.g. `pandas.plotting._matplotlib`).
-
- The plotting system of pandas uses matplotlib by default, but the idea here
- is that it can also work with other third-party backends. This function
- returns the module which provides a top-level `.plot` method that will
- actually do the plotting. The backend is specified from a string, which
- either comes from the keyword argument `backend`, or, if not specified, from
- the option `pandas.options.plotting.backend`. All the rest of the code in
- this file uses the backend specified there for the plotting.
-
- The backend is imported lazily, as matplotlib is a soft dependency, and
- pandas can be used without it being installed.
-
- Notes
- -----
- Modifies `_backends` with imported backend as a side effect.
- """
- backend_str: str = backend or get_option("plotting.backend")
-
- if backend_str in _backends:
- return _backends[backend_str]
-
- module = _load_backend(backend_str)
- _backends[backend_str] = module
- return module
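For reference, a minimal sketch of how the resolution above is reached from user code (assuming matplotlib is installed; "my_backend" is a hypothetical third-party package, not part of pandas):

import pandas as pd

pd.set_option("plotting.backend", "matplotlib")   # the first-party default
df = pd.DataFrame({"x": [1, 2, 3], "y": [3, 1, 2]})
ax = df.plot(x="x", y="y")   # dispatches through _get_plot_backend()/_load_backend()

# A third-party backend is found either via the "pandas_plotting_backends"
# entry-point group or by importing the given module name directly:
# pd.set_option("plotting.backend", "my_backend")   # hypothetical package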
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/__init__.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/__init__.py
deleted file mode 100644
index 75c61da0379..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/__init__.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-from pandas.plotting._matplotlib.boxplot import (
- BoxPlot,
- boxplot,
- boxplot_frame,
- boxplot_frame_groupby,
-)
-from pandas.plotting._matplotlib.converter import (
- deregister,
- register,
-)
-from pandas.plotting._matplotlib.core import (
- AreaPlot,
- BarhPlot,
- BarPlot,
- HexBinPlot,
- LinePlot,
- PiePlot,
- ScatterPlot,
-)
-from pandas.plotting._matplotlib.hist import (
- HistPlot,
- KdePlot,
- hist_frame,
- hist_series,
-)
-from pandas.plotting._matplotlib.misc import (
- andrews_curves,
- autocorrelation_plot,
- bootstrap_plot,
- lag_plot,
- parallel_coordinates,
- radviz,
- scatter_matrix,
-)
-from pandas.plotting._matplotlib.tools import table
-
-if TYPE_CHECKING:
- from pandas.plotting._matplotlib.core import MPLPlot
-
-PLOT_CLASSES: dict[str, type[MPLPlot]] = {
- "line": LinePlot,
- "bar": BarPlot,
- "barh": BarhPlot,
- "box": BoxPlot,
- "hist": HistPlot,
- "kde": KdePlot,
- "area": AreaPlot,
- "pie": PiePlot,
- "scatter": ScatterPlot,
- "hexbin": HexBinPlot,
-}
-
-
-def plot(data, kind, **kwargs):
- # Importing pyplot at the top of the file (before the converters are
- # registered) causes problems in matplotlib 2 (converters seem to not
- # work)
- import matplotlib.pyplot as plt
-
- if kwargs.pop("reuse_plot", False):
- ax = kwargs.get("ax")
- if ax is None and len(plt.get_fignums()) > 0:
- with plt.rc_context():
- ax = plt.gca()
- kwargs["ax"] = getattr(ax, "left_ax", ax)
- plot_obj = PLOT_CLASSES[kind](data, **kwargs)
- plot_obj.generate()
- plot_obj.draw()
- return plot_obj.result
-
-
-__all__ = [
- "plot",
- "hist_series",
- "hist_frame",
- "boxplot",
- "boxplot_frame",
- "boxplot_frame_groupby",
- "table",
- "andrews_curves",
- "autocorrelation_plot",
- "bootstrap_plot",
- "lag_plot",
- "parallel_coordinates",
- "radviz",
- "scatter_matrix",
- "register",
- "deregister",
-]
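The plot() dispatcher above maps the kind string onto one of the PLOT_CLASSES entries, then generates and draws the figure. A minimal sketch of that call path from the public API (assuming matplotlib is installed):

import pandas as pd

df = pd.DataFrame({"a": [1, 3, 2], "b": [2, 2, 4]})
ax = df.plot(kind="bar")   # resolves to PLOT_CLASSES["bar"], i.e. BarPlot, internally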
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/boxplot.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/boxplot.py
deleted file mode 100644
index 41817694580..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/boxplot.py
+++ /dev/null
@@ -1,550 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Collection,
- Literal,
- NamedTuple,
-)
-import warnings
-
-from matplotlib.artist import setp
-import numpy as np
-
-from pandas._typing import MatplotlibColor
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import is_dict_like
-from pandas.core.dtypes.missing import remove_na_arraylike
-
-import pandas as pd
-import pandas.core.common as com
-
-from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib.core import (
- LinePlot,
- MPLPlot,
-)
-from pandas.plotting._matplotlib.groupby import create_iter_data_given_by
-from pandas.plotting._matplotlib.style import get_standard_colors
-from pandas.plotting._matplotlib.tools import (
- create_subplots,
- flatten_axes,
- maybe_adjust_figure,
-)
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
- from matplotlib.lines import Line2D
-
-
-class BoxPlot(LinePlot):
- @property
- def _kind(self) -> Literal["box"]:
- return "box"
-
- _layout_type = "horizontal"
-
- _valid_return_types = (None, "axes", "dict", "both")
-
- class BP(NamedTuple):
- # namedtuple to hold results
- ax: Axes
- lines: dict[str, list[Line2D]]
-
- def __init__(self, data, return_type: str = "axes", **kwargs) -> None:
- if return_type not in self._valid_return_types:
- raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}")
-
- self.return_type = return_type
- # Do not call LinePlot.__init__ which may fill nan
- MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called
-
- def _args_adjust(self) -> None:
- if self.subplots:
- # Disable label ax sharing. Otherwise, all subplots shows last
- # column label
- if self.orientation == "vertical":
- self.sharex = False
- else:
- self.sharey = False
-
- # error: Signature of "_plot" incompatible with supertype "MPLPlot"
- @classmethod
- def _plot( # type: ignore[override]
- cls, ax, y, column_num=None, return_type: str = "axes", **kwds
- ):
- if y.ndim == 2:
- y = [remove_na_arraylike(v) for v in y]
- # Boxplot fails with empty arrays, so need to add a NaN
- # if any cols are empty
- # GH 8181
- y = [v if v.size > 0 else np.array([np.nan]) for v in y]
- else:
- y = remove_na_arraylike(y)
- bp = ax.boxplot(y, **kwds)
-
- if return_type == "dict":
- return bp, bp
- elif return_type == "both":
- return cls.BP(ax=ax, lines=bp), bp
- else:
- return ax, bp
-
- def _validate_color_args(self):
- if "color" in self.kwds:
- if self.colormap is not None:
- warnings.warn(
- "'color' and 'colormap' cannot be used "
- "simultaneously. Using 'color'",
- stacklevel=find_stack_level(),
- )
- self.color = self.kwds.pop("color")
-
- if isinstance(self.color, dict):
- valid_keys = ["boxes", "whiskers", "medians", "caps"]
- for key in self.color:
- if key not in valid_keys:
- raise ValueError(
- f"color dict contains invalid key '{key}'. "
- f"The key must be either {valid_keys}"
- )
- else:
- self.color = None
-
- # get standard colors for default
- colors = get_standard_colors(num_colors=3, colormap=self.colormap, color=None)
- # use 2 colors by default, for box/whisker and median
- # flier colors isn't needed here
- # because it can be specified by ``sym`` kw
- self._boxes_c = colors[0]
- self._whiskers_c = colors[0]
- self._medians_c = colors[2]
- self._caps_c = colors[0]
-
- def _get_colors(
- self,
- num_colors=None,
- color_kwds: dict[str, MatplotlibColor]
- | MatplotlibColor
- | Collection[MatplotlibColor]
- | None = "color",
- ) -> None:
- pass
-
- def maybe_color_bp(self, bp) -> None:
- if isinstance(self.color, dict):
- boxes = self.color.get("boxes", self._boxes_c)
- whiskers = self.color.get("whiskers", self._whiskers_c)
- medians = self.color.get("medians", self._medians_c)
- caps = self.color.get("caps", self._caps_c)
- else:
- # Other types are forwarded to matplotlib
- # If None, use default colors
- boxes = self.color or self._boxes_c
- whiskers = self.color or self._whiskers_c
- medians = self.color or self._medians_c
- caps = self.color or self._caps_c
-
-        # GH 30346: when users specify these props explicitly, their values
-        # override our defaults for the four kwargs; if not, use pandas settings
- if not self.kwds.get("boxprops"):
- setp(bp["boxes"], color=boxes, alpha=1)
- if not self.kwds.get("whiskerprops"):
- setp(bp["whiskers"], color=whiskers, alpha=1)
- if not self.kwds.get("medianprops"):
- setp(bp["medians"], color=medians, alpha=1)
- if not self.kwds.get("capprops"):
- setp(bp["caps"], color=caps, alpha=1)
-
- def _make_plot(self) -> None:
- if self.subplots:
- self._return_obj = pd.Series(dtype=object)
-
- # Re-create iterated data if `by` is assigned by users
- data = (
- create_iter_data_given_by(self.data, self._kind)
- if self.by is not None
- else self.data
- )
-
- for i, (label, y) in enumerate(self._iter_data(data=data)):
- ax = self._get_ax(i)
- kwds = self.kwds.copy()
-
-            # When `by` is applied, give each subplot a title so the group is
-            # identifiable (as df.boxplot does), and transpose y to provide
-            # the right input
- if self.by is not None:
- y = y.T
- ax.set_title(pprint_thing(label))
-
- # When `by` is assigned, the ticklabels will become unique grouped
- # values, instead of label which is used as subtitle in this case.
- ticklabels = [
- pprint_thing(col) for col in self.data.columns.levels[0]
- ]
- else:
- ticklabels = [pprint_thing(label)]
-
- ret, bp = self._plot(
- ax, y, column_num=i, return_type=self.return_type, **kwds
- )
- self.maybe_color_bp(bp)
- self._return_obj[label] = ret
- self._set_ticklabels(ax, ticklabels)
- else:
- y = self.data.values.T
- ax = self._get_ax(0)
- kwds = self.kwds.copy()
-
- ret, bp = self._plot(
- ax, y, column_num=0, return_type=self.return_type, **kwds
- )
- self.maybe_color_bp(bp)
- self._return_obj = ret
-
- labels = [left for left, _ in self._iter_data()]
- labels = [pprint_thing(left) for left in labels]
- if not self.use_index:
- labels = [pprint_thing(key) for key in range(len(labels))]
- self._set_ticklabels(ax, labels)
-
- def _set_ticklabels(self, ax: Axes, labels) -> None:
- if self.orientation == "vertical":
- ax.set_xticklabels(labels)
- else:
- ax.set_yticklabels(labels)
-
- def _make_legend(self) -> None:
- pass
-
- def _post_plot_logic(self, ax, data) -> None:
- # GH 45465: make sure that the boxplot doesn't ignore xlabel/ylabel
- if self.xlabel:
- ax.set_xlabel(pprint_thing(self.xlabel))
- if self.ylabel:
- ax.set_ylabel(pprint_thing(self.ylabel))
-
- @property
- def orientation(self) -> Literal["horizontal", "vertical"]:
- if self.kwds.get("vert", True):
- return "vertical"
- else:
- return "horizontal"
-
- @property
- def result(self):
- if self.return_type is None:
- return super().result
- else:
- return self._return_obj
-
-
-def _grouped_plot_by_column(
- plotf,
- data,
- columns=None,
- by=None,
- numeric_only: bool = True,
- grid: bool = False,
- figsize=None,
- ax=None,
- layout=None,
- return_type=None,
- **kwargs,
-):
- grouped = data.groupby(by)
- if columns is None:
- if not isinstance(by, (list, tuple)):
- by = [by]
- columns = data._get_numeric_data().columns.difference(by)
- naxes = len(columns)
- fig, axes = create_subplots(
- naxes=naxes,
- sharex=kwargs.pop("sharex", True),
- sharey=kwargs.pop("sharey", True),
- figsize=figsize,
- ax=ax,
- layout=layout,
- )
-
- _axes = flatten_axes(axes)
-
- # GH 45465: move the "by" label based on "vert"
- xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None)
- if kwargs.get("vert", True):
- xlabel = xlabel or by
- else:
- ylabel = ylabel or by
-
- ax_values = []
-
- for i, col in enumerate(columns):
- ax = _axes[i]
- gp_col = grouped[col]
- keys, values = zip(*gp_col)
- re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs)
- ax.set_title(col)
- ax_values.append(re_plotf)
- ax.grid(grid)
-
- result = pd.Series(ax_values, index=columns, copy=False)
-
- # Return axes in multiplot case, maybe revisit later # 985
- if return_type is None:
- result = axes
-
- byline = by[0] if len(by) == 1 else by
- fig.suptitle(f"Boxplot grouped by {byline}")
- maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
-
- return result
-
-
-def boxplot(
- data,
- column=None,
- by=None,
- ax=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- figsize=None,
- layout=None,
- return_type=None,
- **kwds,
-):
- import matplotlib.pyplot as plt
-
- # validate return_type:
- if return_type not in BoxPlot._valid_return_types:
-        raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}")
-
- if isinstance(data, pd.Series):
- data = data.to_frame("x")
- column = "x"
-
- def _get_colors():
- # num_colors=3 is required as method maybe_color_bp takes the colors
- # in positions 0 and 2.
- # if colors not provided, use same defaults as DataFrame.plot.box
- result = get_standard_colors(num_colors=3)
- result = np.take(result, [0, 0, 2])
- result = np.append(result, "k")
-
- colors = kwds.pop("color", None)
- if colors:
- if is_dict_like(colors):
- # replace colors in result array with user-specified colors
- # taken from the colors dict parameter
- # "boxes" value placed in position 0, "whiskers" in 1, etc.
- valid_keys = ["boxes", "whiskers", "medians", "caps"]
- key_to_index = dict(zip(valid_keys, range(4)))
- for key, value in colors.items():
- if key in valid_keys:
- result[key_to_index[key]] = value
- else:
- raise ValueError(
- f"color dict contains invalid key '{key}'. "
- f"The key must be either {valid_keys}"
- )
- else:
- result.fill(colors)
-
- return result
-
- def maybe_color_bp(bp, **kwds) -> None:
-        # GH 30346: when users specify these props explicitly, their values
-        # override our defaults for the four kwargs; if not, use pandas settings
- if not kwds.get("boxprops"):
- setp(bp["boxes"], color=colors[0], alpha=1)
- if not kwds.get("whiskerprops"):
- setp(bp["whiskers"], color=colors[1], alpha=1)
- if not kwds.get("medianprops"):
- setp(bp["medians"], color=colors[2], alpha=1)
- if not kwds.get("capprops"):
- setp(bp["caps"], color=colors[3], alpha=1)
-
- def plot_group(keys, values, ax: Axes, **kwds):
- # GH 45465: xlabel/ylabel need to be popped out before plotting happens
- xlabel, ylabel = kwds.pop("xlabel", None), kwds.pop("ylabel", None)
- if xlabel:
- ax.set_xlabel(pprint_thing(xlabel))
- if ylabel:
- ax.set_ylabel(pprint_thing(ylabel))
-
- keys = [pprint_thing(x) for x in keys]
- values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values]
- bp = ax.boxplot(values, **kwds)
- if fontsize is not None:
- ax.tick_params(axis="both", labelsize=fontsize)
-
- # GH 45465: x/y are flipped when "vert" changes
- is_vertical = kwds.get("vert", True)
- ticks = ax.get_xticks() if is_vertical else ax.get_yticks()
- if len(ticks) != len(keys):
- i, remainder = divmod(len(ticks), len(keys))
- assert remainder == 0, remainder
- keys *= i
- if is_vertical:
- ax.set_xticklabels(keys, rotation=rot)
- else:
- ax.set_yticklabels(keys, rotation=rot)
- maybe_color_bp(bp, **kwds)
-
- # Return axes in multiplot case, maybe revisit later # 985
- if return_type == "dict":
- return bp
- elif return_type == "both":
- return BoxPlot.BP(ax=ax, lines=bp)
- else:
- return ax
-
- colors = _get_colors()
- if column is None:
- columns = None
- else:
- if isinstance(column, (list, tuple)):
- columns = column
- else:
- columns = [column]
-
- if by is not None:
- # Prefer array return type for 2-D plots to match the subplot layout
- # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580
- result = _grouped_plot_by_column(
- plot_group,
- data,
- columns=columns,
- by=by,
- grid=grid,
- figsize=figsize,
- ax=ax,
- layout=layout,
- return_type=return_type,
- **kwds,
- )
- else:
- if return_type is None:
- return_type = "axes"
- if layout is not None:
- raise ValueError("The 'layout' keyword is not supported when 'by' is None")
-
- if ax is None:
- rc = {"figure.figsize": figsize} if figsize is not None else {}
- with plt.rc_context(rc):
- ax = plt.gca()
- data = data._get_numeric_data()
- naxes = len(data.columns)
- if naxes == 0:
- raise ValueError(
- "boxplot method requires numerical columns, nothing to plot."
- )
- if columns is None:
- columns = data.columns
- else:
- data = data[columns]
-
- result = plot_group(columns, data.values.T, ax, **kwds)
- ax.grid(grid)
-
- return result
-
-
-def boxplot_frame(
- self,
- column=None,
- by=None,
- ax=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- figsize=None,
- layout=None,
- return_type=None,
- **kwds,
-):
- import matplotlib.pyplot as plt
-
- ax = boxplot(
- self,
- column=column,
- by=by,
- ax=ax,
- fontsize=fontsize,
- grid=grid,
- rot=rot,
- figsize=figsize,
- layout=layout,
- return_type=return_type,
- **kwds,
- )
- plt.draw_if_interactive()
- return ax
-
-
-def boxplot_frame_groupby(
- grouped,
- subplots: bool = True,
- column=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- ax=None,
- figsize=None,
- layout=None,
- sharex: bool = False,
- sharey: bool = True,
- **kwds,
-):
- if subplots is True:
- naxes = len(grouped)
- fig, axes = create_subplots(
- naxes=naxes,
- squeeze=False,
- ax=ax,
- sharex=sharex,
- sharey=sharey,
- figsize=figsize,
- layout=layout,
- )
- axes = flatten_axes(axes)
-
- ret = pd.Series(dtype=object)
-
- for (key, group), ax in zip(grouped, axes):
- d = group.boxplot(
- ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds
- )
- ax.set_title(pprint_thing(key))
- ret.loc[key] = d
- maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
- else:
- keys, frames = zip(*grouped)
- if grouped.axis == 0:
- df = pd.concat(frames, keys=keys, axis=1)
- else:
- if len(frames) > 1:
- df = frames[0].join(frames[1::])
- else:
- df = frames[0]
-
-        # GH 16748: DataFrameGroupBy fails when subplots=False and the `column`
-        # argument is assigned. Since `df` has MultiIndex columns after the
-        # concat above, couple the keys (grouped values) with the original df
-        # columns to select the subset to plot
- if column is not None:
- column = com.convert_to_list_like(column)
- multi_key = pd.MultiIndex.from_product([keys, column])
- column = list(multi_key.values)
- ret = df.boxplot(
- column=column,
- fontsize=fontsize,
- rot=rot,
- grid=grid,
- ax=ax,
- figsize=figsize,
- layout=layout,
- **kwds,
- )
- return ret
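A short sketch of the two code paths above from the caller's side (assuming matplotlib is installed; the column names are purely illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"g": list("aabb"), "x": np.arange(4.0), "y": np.arange(4.0) * 2})
grouped = df.groupby("g")
per_group_axes = grouped.boxplot(subplots=True)   # one subplot per group key
combined_ax = grouped.boxplot(subplots=False)     # all groups on a single Axes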
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/converter.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/converter.py
deleted file mode 100644
index 9b0fe99e2d6..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/converter.py
+++ /dev/null
@@ -1,1109 +0,0 @@
-from __future__ import annotations
-
-import contextlib
-import datetime as pydt
-from datetime import (
- datetime,
- timedelta,
- tzinfo,
-)
-import functools
-from typing import (
- TYPE_CHECKING,
- Any,
- Final,
- Generator,
- cast,
-)
-
-from dateutil.relativedelta import relativedelta
-import matplotlib.dates as mdates
-from matplotlib.ticker import (
- AutoLocator,
- Formatter,
- Locator,
-)
-from matplotlib.transforms import nonsingular
-import matplotlib.units as munits
-import numpy as np
-
-from pandas._libs import lib
-from pandas._libs.tslibs import (
- Timestamp,
- to_offset,
-)
-from pandas._libs.tslibs.dtypes import FreqGroup
-from pandas._typing import F
-
-from pandas.core.dtypes.common import (
- is_float,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_nested_list_like,
-)
-
-from pandas import (
- Index,
- Series,
- get_option,
-)
-import pandas.core.common as com
-from pandas.core.indexes.datetimes import date_range
-from pandas.core.indexes.period import (
- Period,
- PeriodIndex,
- period_range,
-)
-import pandas.core.tools.datetimes as tools
-
-if TYPE_CHECKING:
- from pandas._libs.tslibs.offsets import BaseOffset
-
-# constants
-HOURS_PER_DAY: Final = 24.0
-MIN_PER_HOUR: Final = 60.0
-SEC_PER_MIN: Final = 60.0
-
-SEC_PER_HOUR: Final = SEC_PER_MIN * MIN_PER_HOUR
-SEC_PER_DAY: Final = SEC_PER_HOUR * HOURS_PER_DAY
-
-MUSEC_PER_DAY: Final = 10**6 * SEC_PER_DAY
-
-_mpl_units = {} # Cache for units overwritten by us
-
-
-def get_pairs():
- pairs = [
- (Timestamp, DatetimeConverter),
- (Period, PeriodConverter),
- (pydt.datetime, DatetimeConverter),
- (pydt.date, DatetimeConverter),
- (pydt.time, TimeConverter),
- (np.datetime64, DatetimeConverter),
- ]
- return pairs
-
-
-def register_pandas_matplotlib_converters(func: F) -> F:
- """
- Decorator applying pandas_converters.
- """
-
- @functools.wraps(func)
- def wrapper(*args, **kwargs):
- with pandas_converters():
- return func(*args, **kwargs)
-
- return cast(F, wrapper)
-
-
-@contextlib.contextmanager
-def pandas_converters() -> Generator[None, None, None]:
- """
- Context manager registering pandas' converters for a plot.
-
- See Also
- --------
- register_pandas_matplotlib_converters : Decorator that applies this.
- """
- value = get_option("plotting.matplotlib.register_converters")
-
- if value:
- # register for True or "auto"
- register()
- try:
- yield
- finally:
- if value == "auto":
- # only deregister for "auto"
- deregister()
-
-
-def register() -> None:
- pairs = get_pairs()
- for type_, cls in pairs:
- # Cache previous converter if present
- if type_ in munits.registry and not isinstance(munits.registry[type_], cls):
- previous = munits.registry[type_]
- _mpl_units[type_] = previous
- # Replace with pandas converter
- munits.registry[type_] = cls()
-
-
-def deregister() -> None:
- # Renamed in pandas.plotting.__init__
- for type_, cls in get_pairs():
- # We use type to catch our classes directly, no inheritance
- if type(munits.registry.get(type_)) is cls:
- munits.registry.pop(type_)
-
- # restore the old keys
- for unit, formatter in _mpl_units.items():
- if type(formatter) not in {DatetimeConverter, PeriodConverter, TimeConverter}:
- # make it idempotent by excluding ours.
- munits.registry[unit] = formatter
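register() and deregister() above cache any converters they replace in _mpl_units and restore them afterwards; they are exposed publicly as pandas.plotting.register_matplotlib_converters / deregister_matplotlib_converters. A short usage sketch (assuming matplotlib is installed):

import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import (
    deregister_matplotlib_converters,
    register_matplotlib_converters,
)

register_matplotlib_converters()    # install the pandas unit converters
s = pd.Series([1, 2, 3], index=pd.date_range("2020-01-01", periods=3))
plt.plot(s.index, s.to_numpy())     # Timestamp ticks handled by DatetimeConverter
deregister_matplotlib_converters()  # restore whatever matplotlib had before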
-
-
-def _to_ordinalf(tm: pydt.time) -> float:
- tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + tm.microsecond / 10**6
- return tot_sec
-
-
-def time2num(d):
- if isinstance(d, str):
- parsed = Timestamp(d)
- return _to_ordinalf(parsed.time())
- if isinstance(d, pydt.time):
- return _to_ordinalf(d)
- return d
-
-
-class TimeConverter(munits.ConversionInterface):
- @staticmethod
- def convert(value, unit, axis):
- valid_types = (str, pydt.time)
- if isinstance(value, valid_types) or is_integer(value) or is_float(value):
- return time2num(value)
- if isinstance(value, Index):
- return value.map(time2num)
- if isinstance(value, (list, tuple, np.ndarray, Index)):
- return [time2num(x) for x in value]
- return value
-
- @staticmethod
- def axisinfo(unit, axis) -> munits.AxisInfo | None:
- if unit != "time":
- return None
-
- majloc = AutoLocator()
- majfmt = TimeFormatter(majloc)
- return munits.AxisInfo(majloc=majloc, majfmt=majfmt, label="time")
-
- @staticmethod
- def default_units(x, axis) -> str:
- return "time"
-
-
-# time formatter
-class TimeFormatter(Formatter):
- def __init__(self, locs) -> None:
- self.locs = locs
-
- def __call__(self, x, pos: int = 0) -> str:
- """
- Return the time of day as a formatted string.
-
- Parameters
- ----------
- x : float
- The time of day specified as seconds since 00:00 (midnight),
- with up to microsecond precision.
- pos
- Unused
-
- Returns
- -------
- str
- A string in HH:MM:SS.mmmuuu format. Microseconds,
- milliseconds and seconds are only displayed if non-zero.
- """
- fmt = "%H:%M:%S.%f"
- s = int(x)
- msus = round((x - s) * 10**6)
- ms = msus // 1000
- us = msus % 1000
- m, s = divmod(s, 60)
- h, m = divmod(m, 60)
- _, h = divmod(h, 24)
- if us != 0:
- return pydt.time(h, m, s, msus).strftime(fmt)
- elif ms != 0:
- return pydt.time(h, m, s, msus).strftime(fmt)[:-3]
- elif s != 0:
- return pydt.time(h, m, s).strftime("%H:%M:%S")
-
- return pydt.time(h, m).strftime("%H:%M")
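TimeFormatter.__call__ formats a float of seconds since midnight and drops trailing zero components. A small illustration with hand-picked values (the expected results are shown in comments):

from pandas.plotting._matplotlib.converter import TimeFormatter  # the class above

fmt = TimeFormatter(locs=[])
fmt(13 * 3600 + 30 * 60)        # -> '13:30'         (no seconds component)
fmt(13 * 3600 + 30 * 60 + 5)    # -> '13:30:05'      (whole seconds only)
fmt(45.25)                      # -> '00:00:45.250'  (milliseconds, microseconds zero)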
-
-
-# Period Conversion
-
-
-class PeriodConverter(mdates.DateConverter):
- @staticmethod
- def convert(values, units, axis):
- if is_nested_list_like(values):
- values = [PeriodConverter._convert_1d(v, units, axis) for v in values]
- else:
- values = PeriodConverter._convert_1d(values, units, axis)
- return values
-
- @staticmethod
- def _convert_1d(values, units, axis):
- if not hasattr(axis, "freq"):
- raise TypeError("Axis must have `freq` set to convert to Periods")
- valid_types = (str, datetime, Period, pydt.date, pydt.time, np.datetime64)
- if isinstance(values, valid_types) or is_integer(values) or is_float(values):
- return get_datevalue(values, axis.freq)
- elif isinstance(values, PeriodIndex):
- return values.asfreq(axis.freq).asi8
- elif isinstance(values, Index):
- return values.map(lambda x: get_datevalue(x, axis.freq))
- elif lib.infer_dtype(values, skipna=False) == "period":
- # https://github.com/pandas-dev/pandas/issues/24304
- # convert ndarray[period] -> PeriodIndex
- return PeriodIndex(values, freq=axis.freq).asi8
- elif isinstance(values, (list, tuple, np.ndarray, Index)):
- return [get_datevalue(x, axis.freq) for x in values]
- return values
-
-
-def get_datevalue(date, freq):
- if isinstance(date, Period):
- return date.asfreq(freq).ordinal
- elif isinstance(date, (str, datetime, pydt.date, pydt.time, np.datetime64)):
- return Period(date, freq).ordinal
- elif (
- is_integer(date)
- or is_float(date)
- or (isinstance(date, (np.ndarray, Index)) and (date.size == 1))
- ):
- return date
- elif date is None:
- return None
- raise ValueError(f"Unrecognizable date '{date}'")
-
-
-# Datetime Conversion
-class DatetimeConverter(mdates.DateConverter):
- @staticmethod
- def convert(values, unit, axis):
- # values might be a 1-d array, or a list-like of arrays.
- if is_nested_list_like(values):
- values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values]
- else:
- values = DatetimeConverter._convert_1d(values, unit, axis)
- return values
-
- @staticmethod
- def _convert_1d(values, unit, axis):
- def try_parse(values):
- try:
- return mdates.date2num(tools.to_datetime(values))
- except Exception:
- return values
-
- if isinstance(values, (datetime, pydt.date, np.datetime64, pydt.time)):
- return mdates.date2num(values)
- elif is_integer(values) or is_float(values):
- return values
- elif isinstance(values, str):
- return try_parse(values)
- elif isinstance(values, (list, tuple, np.ndarray, Index, Series)):
- if isinstance(values, Series):
- # https://github.com/matplotlib/matplotlib/issues/11391
- # Series was skipped. Convert to DatetimeIndex to get asi8
- values = Index(values)
- if isinstance(values, Index):
- values = values.values
- if not isinstance(values, np.ndarray):
- values = com.asarray_tuplesafe(values)
-
- if is_integer_dtype(values) or is_float_dtype(values):
- return values
-
- try:
- values = tools.to_datetime(values)
- except Exception:
- pass
-
- values = mdates.date2num(values)
-
- return values
-
- @staticmethod
- def axisinfo(unit: tzinfo | None, axis) -> munits.AxisInfo:
- """
- Return the :class:`~matplotlib.units.AxisInfo` for *unit*.
-
- *unit* is a tzinfo instance or None.
- The *axis* argument is required but not used.
- """
- tz = unit
-
- majloc = PandasAutoDateLocator(tz=tz)
- majfmt = PandasAutoDateFormatter(majloc, tz=tz)
- datemin = pydt.date(2000, 1, 1)
- datemax = pydt.date(2010, 1, 1)
-
- return munits.AxisInfo(
- majloc=majloc, majfmt=majfmt, label="", default_limits=(datemin, datemax)
- )
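DatetimeConverter.convert turns datetime-like scalars and array-likes into matplotlib date numbers, i.e. days since matplotlib's epoch (1970-01-01 by default). A small illustration, assuming that default epoch:

import pandas as pd
from pandas.plotting._matplotlib.converter import DatetimeConverter  # class above

DatetimeConverter.convert(pd.Timestamp("2023-01-01"), unit=None, axis=None)
# -> 19358.0 (days since 1970-01-01)
DatetimeConverter.convert(["2023-01-01", "2023-01-02"], unit=None, axis=None)
# -> array([19358., 19359.])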
-
-
-class PandasAutoDateFormatter(mdates.AutoDateFormatter):
- def __init__(self, locator, tz=None, defaultfmt: str = "%Y-%m-%d") -> None:
- mdates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt)
-
-
-class PandasAutoDateLocator(mdates.AutoDateLocator):
- def get_locator(self, dmin, dmax):
- """Pick the best locator based on a distance."""
- delta = relativedelta(dmax, dmin)
-
- num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days
- num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds
- tot_sec = num_days * 86400.0 + num_sec
-
- if abs(tot_sec) < self.minticks:
- self._freq = -1
- locator = MilliSecondLocator(self.tz)
- locator.set_axis(self.axis)
-
- locator.axis.set_view_interval(*self.axis.get_view_interval())
- locator.axis.set_data_interval(*self.axis.get_data_interval())
- return locator
-
- return mdates.AutoDateLocator.get_locator(self, dmin, dmax)
-
- def _get_unit(self):
- return MilliSecondLocator.get_unit_generic(self._freq)
-
-
-class MilliSecondLocator(mdates.DateLocator):
- UNIT = 1.0 / (24 * 3600 * 1000)
-
- def __init__(self, tz) -> None:
- mdates.DateLocator.__init__(self, tz)
- self._interval = 1.0
-
- def _get_unit(self):
- return self.get_unit_generic(-1)
-
- @staticmethod
- def get_unit_generic(freq):
- unit = mdates.RRuleLocator.get_unit_generic(freq)
- if unit < 0:
- return MilliSecondLocator.UNIT
- return unit
-
- def __call__(self):
- # if no data have been set, this will tank with a ValueError
- try:
- dmin, dmax = self.viewlim_to_dt()
- except ValueError:
- return []
-
- # We need to cap at the endpoints of valid datetime
- nmax, nmin = mdates.date2num((dmax, dmin))
-
- num = (nmax - nmin) * 86400 * 1000
- max_millis_ticks = 6
- for interval in [1, 10, 50, 100, 200, 500]:
- if num <= interval * (max_millis_ticks - 1):
- self._interval = interval
- break
-        else:
-            # The loop completed without breaking; default to 1000ms
-            self._interval = 1000.0
-
- estimate = (nmax - nmin) / (self._get_unit() * self._get_interval())
-
- if estimate > self.MAXTICKS * 2:
-            raise RuntimeError(
-                "MillisecondLocator estimated to generate "
-                f"{estimate:.0f} ticks from {dmin} to {dmax}: "
-                f"exceeds Locator.MAXTICKS * 2 ({self.MAXTICKS * 2:d})"
-            )
-
- interval = self._get_interval()
- freq = f"{interval}L"
- tz = self.tz.tzname(None)
- st = dmin.replace(tzinfo=None)
-        ed = dmax.replace(tzinfo=None)
- all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object)
-
- try:
- if len(all_dates) > 0:
- locs = self.raise_if_exceeds(mdates.date2num(all_dates))
- return locs
- except Exception: # pragma: no cover
- pass
-
- lims = mdates.date2num([dmin, dmax])
- return lims
-
- def _get_interval(self):
- return self._interval
-
- def autoscale(self):
- """
- Set the view limits to include the data range.
- """
- # We need to cap at the endpoints of valid datetime
- dmin, dmax = self.datalim_to_dt()
-
- vmin = mdates.date2num(dmin)
- vmax = mdates.date2num(dmax)
-
- return self.nonsingular(vmin, vmax)
-
-
-def _from_ordinal(x, tz: tzinfo | None = None) -> datetime:
- ix = int(x)
- dt = datetime.fromordinal(ix)
- remainder = float(x) - ix
- hour, remainder = divmod(24 * remainder, 1)
- minute, remainder = divmod(60 * remainder, 1)
- second, remainder = divmod(60 * remainder, 1)
- microsecond = int(1_000_000 * remainder)
- if microsecond < 10:
- microsecond = 0 # compensate for rounding errors
- dt = datetime(
- dt.year, dt.month, dt.day, int(hour), int(minute), int(second), microsecond
- )
- if tz is not None:
- dt = dt.astimezone(tz)
-
- if microsecond > 999990: # compensate for rounding errors
- dt += timedelta(microseconds=1_000_000 - microsecond)
-
- return dt
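_from_ordinal goes the other way for date numbers whose integer part is a proleptic Gregorian ordinal and whose fraction is the time of day; a quick round-trip illustration:

from datetime import datetime

from pandas.plotting._matplotlib.converter import _from_ordinal  # helper above

x = datetime(2023, 5, 4).toordinal() + 0.75
_from_ordinal(x)   # -> datetime.datetime(2023, 5, 4, 18, 0)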
-
-
-# Fixed frequency dynamic tick locators and formatters
-
-# -------------------------------------------------------------------------
-# --- Locators ---
-# -------------------------------------------------------------------------
-
-
-def _get_default_annual_spacing(nyears) -> tuple[int, int]:
- """
- Returns a default spacing between consecutive ticks for annual data.
- """
- if nyears < 11:
- (min_spacing, maj_spacing) = (1, 1)
- elif nyears < 20:
- (min_spacing, maj_spacing) = (1, 2)
- elif nyears < 50:
- (min_spacing, maj_spacing) = (1, 5)
- elif nyears < 100:
- (min_spacing, maj_spacing) = (5, 10)
- elif nyears < 200:
- (min_spacing, maj_spacing) = (5, 25)
- elif nyears < 600:
- (min_spacing, maj_spacing) = (10, 50)
- else:
- factor = nyears // 1000 + 1
- (min_spacing, maj_spacing) = (factor * 20, factor * 100)
- return (min_spacing, maj_spacing)
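For a sense of the (minor, major) year spacings the helper above picks for a few span lengths:

from pandas.plotting._matplotlib.converter import _get_default_annual_spacing

_get_default_annual_spacing(8)      # -> (1, 1)
_get_default_annual_spacing(30)     # -> (1, 5)
_get_default_annual_spacing(150)    # -> (5, 25)
_get_default_annual_spacing(2500)   # -> (60, 300)   factor = 2500 // 1000 + 1 == 3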
-
-
-def period_break(dates: PeriodIndex, period: str) -> np.ndarray:
- """
- Returns the indices where the given period changes.
-
- Parameters
- ----------
- dates : PeriodIndex
- Array of intervals to monitor.
- period : str
- Name of the period to monitor.
- """
- current = getattr(dates, period)
- previous = getattr(dates - 1 * dates.freq, period)
- return np.nonzero(current - previous)[0]
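period_break is the workhorse the finders below use to decide where tick labels change level; for example, the positions where the month rolls over in a daily PeriodIndex:

import pandas as pd

from pandas.plotting._matplotlib.converter import period_break  # helper above

dates = pd.period_range("2023-01-29", "2023-02-02", freq="D")
period_break(dates, "month")   # -> array([3]): 2023-02-01 sits at position 3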
-
-
-def has_level_label(label_flags: np.ndarray, vmin: float) -> bool:
- """
- Returns true if the ``label_flags`` indicate there is at least one label
- for this level.
-
- if the minimum view limit is not an exact integer, then the first tick
- label won't be shown, so we must adjust for that.
- """
- if label_flags.size == 0 or (
- label_flags.size == 1 and label_flags[0] == 0 and vmin % 1 > 0.0
- ):
- return False
- else:
- return True
-
-
-def _daily_finder(vmin, vmax, freq: BaseOffset):
- # error: "BaseOffset" has no attribute "_period_dtype_code"
- dtype_code = freq._period_dtype_code # type: ignore[attr-defined]
- freq_group = FreqGroup.from_period_dtype_code(dtype_code)
-
- periodsperday = -1
-
- if dtype_code >= FreqGroup.FR_HR.value:
- if freq_group == FreqGroup.FR_NS:
- periodsperday = 24 * 60 * 60 * 1000000000
- elif freq_group == FreqGroup.FR_US:
- periodsperday = 24 * 60 * 60 * 1000000
- elif freq_group == FreqGroup.FR_MS:
- periodsperday = 24 * 60 * 60 * 1000
- elif freq_group == FreqGroup.FR_SEC:
- periodsperday = 24 * 60 * 60
- elif freq_group == FreqGroup.FR_MIN:
- periodsperday = 24 * 60
- elif freq_group == FreqGroup.FR_HR:
- periodsperday = 24
- else: # pragma: no cover
- raise ValueError(f"unexpected frequency: {dtype_code}")
- periodsperyear = 365 * periodsperday
- periodspermonth = 28 * periodsperday
-
- elif freq_group == FreqGroup.FR_BUS:
- periodsperyear = 261
- periodspermonth = 19
- elif freq_group == FreqGroup.FR_DAY:
- periodsperyear = 365
- periodspermonth = 28
- elif freq_group == FreqGroup.FR_WK:
- periodsperyear = 52
- periodspermonth = 3
- else: # pragma: no cover
- raise ValueError("unexpected frequency")
-
- # save this for later usage
- vmin_orig = vmin
-
- (vmin, vmax) = (
- Period(ordinal=int(vmin), freq=freq),
- Period(ordinal=int(vmax), freq=freq),
- )
- assert isinstance(vmin, Period)
- assert isinstance(vmax, Period)
- span = vmax.ordinal - vmin.ordinal + 1
- dates_ = period_range(start=vmin, end=vmax, freq=freq)
- # Initialize the output
- info = np.zeros(
- span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")]
- )
- info["val"][:] = dates_.asi8
- info["fmt"][:] = ""
- info["maj"][[0, -1]] = True
- # .. and set some shortcuts
- info_maj = info["maj"]
- info_min = info["min"]
- info_fmt = info["fmt"]
-
- def first_label(label_flags):
- if (label_flags[0] == 0) and (label_flags.size > 1) and ((vmin_orig % 1) > 0.0):
- return label_flags[1]
- else:
- return label_flags[0]
-
- # Case 1. Less than a month
- if span <= periodspermonth:
- day_start = period_break(dates_, "day")
- month_start = period_break(dates_, "month")
-
- def _hour_finder(label_interval, force_year_start) -> None:
- _hour = dates_.hour
- _prev_hour = (dates_ - 1 * dates_.freq).hour
- hour_start = (_hour - _prev_hour) != 0
- info_maj[day_start] = True
- info_min[hour_start & (_hour % label_interval == 0)] = True
- year_start = period_break(dates_, "year")
- info_fmt[hour_start & (_hour % label_interval == 0)] = "%H:%M"
- info_fmt[day_start] = "%H:%M\n%d-%b"
- info_fmt[year_start] = "%H:%M\n%d-%b\n%Y"
- if force_year_start and not has_level_label(year_start, vmin_orig):
- info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y"
-
- def _minute_finder(label_interval) -> None:
- hour_start = period_break(dates_, "hour")
- _minute = dates_.minute
- _prev_minute = (dates_ - 1 * dates_.freq).minute
- minute_start = (_minute - _prev_minute) != 0
- info_maj[hour_start] = True
- info_min[minute_start & (_minute % label_interval == 0)] = True
- year_start = period_break(dates_, "year")
- info_fmt = info["fmt"]
- info_fmt[minute_start & (_minute % label_interval == 0)] = "%H:%M"
- info_fmt[day_start] = "%H:%M\n%d-%b"
- info_fmt[year_start] = "%H:%M\n%d-%b\n%Y"
-
- def _second_finder(label_interval) -> None:
- minute_start = period_break(dates_, "minute")
- _second = dates_.second
- _prev_second = (dates_ - 1 * dates_.freq).second
- second_start = (_second - _prev_second) != 0
- info["maj"][minute_start] = True
- info["min"][second_start & (_second % label_interval == 0)] = True
- year_start = period_break(dates_, "year")
- info_fmt = info["fmt"]
- info_fmt[second_start & (_second % label_interval == 0)] = "%H:%M:%S"
- info_fmt[day_start] = "%H:%M:%S\n%d-%b"
- info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y"
-
- if span < periodsperday / 12000:
- _second_finder(1)
- elif span < periodsperday / 6000:
- _second_finder(2)
- elif span < periodsperday / 2400:
- _second_finder(5)
- elif span < periodsperday / 1200:
- _second_finder(10)
- elif span < periodsperday / 800:
- _second_finder(15)
- elif span < periodsperday / 400:
- _second_finder(30)
- elif span < periodsperday / 150:
- _minute_finder(1)
- elif span < periodsperday / 70:
- _minute_finder(2)
- elif span < periodsperday / 24:
- _minute_finder(5)
- elif span < periodsperday / 12:
- _minute_finder(15)
- elif span < periodsperday / 6:
- _minute_finder(30)
- elif span < periodsperday / 2.5:
- _hour_finder(1, False)
- elif span < periodsperday / 1.5:
- _hour_finder(2, False)
- elif span < periodsperday * 1.25:
- _hour_finder(3, False)
- elif span < periodsperday * 2.5:
- _hour_finder(6, True)
- elif span < periodsperday * 4:
- _hour_finder(12, True)
- else:
- info_maj[month_start] = True
- info_min[day_start] = True
- year_start = period_break(dates_, "year")
- info_fmt = info["fmt"]
- info_fmt[day_start] = "%d"
- info_fmt[month_start] = "%d\n%b"
- info_fmt[year_start] = "%d\n%b\n%Y"
- if not has_level_label(year_start, vmin_orig):
- if not has_level_label(month_start, vmin_orig):
- info_fmt[first_label(day_start)] = "%d\n%b\n%Y"
- else:
- info_fmt[first_label(month_start)] = "%d\n%b\n%Y"
-
- # Case 2. Less than three months
- elif span <= periodsperyear // 4:
- month_start = period_break(dates_, "month")
- info_maj[month_start] = True
- if dtype_code < FreqGroup.FR_HR.value:
- info["min"] = True
- else:
- day_start = period_break(dates_, "day")
- info["min"][day_start] = True
- week_start = period_break(dates_, "week")
- year_start = period_break(dates_, "year")
- info_fmt[week_start] = "%d"
- info_fmt[month_start] = "\n\n%b"
- info_fmt[year_start] = "\n\n%b\n%Y"
- if not has_level_label(year_start, vmin_orig):
- if not has_level_label(month_start, vmin_orig):
- info_fmt[first_label(week_start)] = "\n\n%b\n%Y"
- else:
- info_fmt[first_label(month_start)] = "\n\n%b\n%Y"
- # Case 3. Less than 14 months ...............
- elif span <= 1.15 * periodsperyear:
- year_start = period_break(dates_, "year")
- month_start = period_break(dates_, "month")
- week_start = period_break(dates_, "week")
- info_maj[month_start] = True
- info_min[week_start] = True
- info_min[year_start] = False
- info_min[month_start] = False
- info_fmt[month_start] = "%b"
- info_fmt[year_start] = "%b\n%Y"
- if not has_level_label(year_start, vmin_orig):
- info_fmt[first_label(month_start)] = "%b\n%Y"
- # Case 4. Less than 2.5 years ...............
- elif span <= 2.5 * periodsperyear:
- year_start = period_break(dates_, "year")
- quarter_start = period_break(dates_, "quarter")
- month_start = period_break(dates_, "month")
- info_maj[quarter_start] = True
- info_min[month_start] = True
- info_fmt[quarter_start] = "%b"
- info_fmt[year_start] = "%b\n%Y"
-    # Case 5. Less than 4 years .................
- elif span <= 4 * periodsperyear:
- year_start = period_break(dates_, "year")
- month_start = period_break(dates_, "month")
- info_maj[year_start] = True
- info_min[month_start] = True
- info_min[year_start] = False
-
- month_break = dates_[month_start].month
- jan_or_jul = month_start[(month_break == 1) | (month_break == 7)]
- info_fmt[jan_or_jul] = "%b"
- info_fmt[year_start] = "%b\n%Y"
-    # Case 6. Less than 11 years ................
- elif span <= 11 * periodsperyear:
- year_start = period_break(dates_, "year")
- quarter_start = period_break(dates_, "quarter")
- info_maj[year_start] = True
- info_min[quarter_start] = True
- info_min[year_start] = False
- info_fmt[year_start] = "%Y"
-    # Case 7. More than 11 years ................
- else:
- year_start = period_break(dates_, "year")
- year_break = dates_[year_start].year
- nyears = span / periodsperyear
- (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
- major_idx = year_start[(year_break % maj_anndef == 0)]
- info_maj[major_idx] = True
- minor_idx = year_start[(year_break % min_anndef == 0)]
- info_min[minor_idx] = True
- info_fmt[major_idx] = "%Y"
-
- return info
-
-
-def _monthly_finder(vmin, vmax, freq):
- periodsperyear = 12
-
- vmin_orig = vmin
- (vmin, vmax) = (int(vmin), int(vmax))
- span = vmax - vmin + 1
-
- # Initialize the output
- info = np.zeros(
- span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")]
- )
- info["val"] = np.arange(vmin, vmax + 1)
- dates_ = info["val"]
- info["fmt"] = ""
- year_start = (dates_ % 12 == 0).nonzero()[0]
- info_maj = info["maj"]
- info_fmt = info["fmt"]
-
- if span <= 1.15 * periodsperyear:
- info_maj[year_start] = True
- info["min"] = True
-
- info_fmt[:] = "%b"
- info_fmt[year_start] = "%b\n%Y"
-
- if not has_level_label(year_start, vmin_orig):
- if dates_.size > 1:
- idx = 1
- else:
- idx = 0
- info_fmt[idx] = "%b\n%Y"
-
- elif span <= 2.5 * periodsperyear:
- quarter_start = (dates_ % 3 == 0).nonzero()
- info_maj[year_start] = True
- # TODO: Check the following : is it really info['fmt'] ?
- info["fmt"][quarter_start] = True
- info["min"] = True
-
- info_fmt[quarter_start] = "%b"
- info_fmt[year_start] = "%b\n%Y"
-
- elif span <= 4 * periodsperyear:
- info_maj[year_start] = True
- info["min"] = True
-
- jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6)
- info_fmt[jan_or_jul] = "%b"
- info_fmt[year_start] = "%b\n%Y"
-
- elif span <= 11 * periodsperyear:
- quarter_start = (dates_ % 3 == 0).nonzero()
- info_maj[year_start] = True
- info["min"][quarter_start] = True
-
- info_fmt[year_start] = "%Y"
-
- else:
- nyears = span / periodsperyear
- (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
- years = dates_[year_start] // 12 + 1
- major_idx = year_start[(years % maj_anndef == 0)]
- info_maj[major_idx] = True
- info["min"][year_start[(years % min_anndef == 0)]] = True
-
- info_fmt[major_idx] = "%Y"
-
- return info
-
-
-def _quarterly_finder(vmin, vmax, freq):
- periodsperyear = 4
- vmin_orig = vmin
- (vmin, vmax) = (int(vmin), int(vmax))
- span = vmax - vmin + 1
-
- info = np.zeros(
- span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")]
- )
- info["val"] = np.arange(vmin, vmax + 1)
- info["fmt"] = ""
- dates_ = info["val"]
- info_maj = info["maj"]
- info_fmt = info["fmt"]
- year_start = (dates_ % 4 == 0).nonzero()[0]
-
- if span <= 3.5 * periodsperyear:
- info_maj[year_start] = True
- info["min"] = True
-
- info_fmt[:] = "Q%q"
- info_fmt[year_start] = "Q%q\n%F"
- if not has_level_label(year_start, vmin_orig):
- if dates_.size > 1:
- idx = 1
- else:
- idx = 0
- info_fmt[idx] = "Q%q\n%F"
-
- elif span <= 11 * periodsperyear:
- info_maj[year_start] = True
- info["min"] = True
- info_fmt[year_start] = "%F"
-
- else:
- # https://github.com/pandas-dev/pandas/pull/47602
- years = dates_[year_start] // 4 + 1970
- nyears = span / periodsperyear
- (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
- major_idx = year_start[(years % maj_anndef == 0)]
- info_maj[major_idx] = True
- info["min"][year_start[(years % min_anndef == 0)]] = True
- info_fmt[major_idx] = "%F"
-
- return info
-
-
-def _annual_finder(vmin, vmax, freq):
- (vmin, vmax) = (int(vmin), int(vmax + 1))
- span = vmax - vmin + 1
-
- info = np.zeros(
- span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")]
- )
- info["val"] = np.arange(vmin, vmax + 1)
- info["fmt"] = ""
- dates_ = info["val"]
-
- (min_anndef, maj_anndef) = _get_default_annual_spacing(span)
- major_idx = dates_ % maj_anndef == 0
- info["maj"][major_idx] = True
- info["min"][(dates_ % min_anndef == 0)] = True
- info["fmt"][major_idx] = "%Y"
-
- return info
-
-
-def get_finder(freq: BaseOffset):
- # error: "BaseOffset" has no attribute "_period_dtype_code"
- dtype_code = freq._period_dtype_code # type: ignore[attr-defined]
- fgroup = FreqGroup.from_period_dtype_code(dtype_code)
-
- if fgroup == FreqGroup.FR_ANN:
- return _annual_finder
- elif fgroup == FreqGroup.FR_QTR:
- return _quarterly_finder
- elif fgroup == FreqGroup.FR_MTH:
- return _monthly_finder
- elif (dtype_code >= FreqGroup.FR_BUS.value) or fgroup == FreqGroup.FR_WK:
- return _daily_finder
- else: # pragma: no cover
- raise NotImplementedError(f"Unsupported frequency: {dtype_code}")
-
-
-class TimeSeries_DateLocator(Locator):
- """
- Locates the ticks along an axis controlled by a :class:`Series`.
-
- Parameters
- ----------
- freq : BaseOffset
- Valid frequency specifier.
- minor_locator : {False, True}, optional
- Whether the locator is for minor ticks (True) or not.
- dynamic_mode : {True, False}, optional
- Whether the locator should work in dynamic mode.
- base : {int}, optional
- quarter : {int}, optional
- month : {int}, optional
- day : {int}, optional
- """
-
- def __init__(
- self,
- freq: BaseOffset,
- minor_locator: bool = False,
- dynamic_mode: bool = True,
- base: int = 1,
- quarter: int = 1,
- month: int = 1,
- day: int = 1,
- plot_obj=None,
- ) -> None:
- freq = to_offset(freq)
- self.freq = freq
- self.base = base
- (self.quarter, self.month, self.day) = (quarter, month, day)
- self.isminor = minor_locator
- self.isdynamic = dynamic_mode
- self.offset = 0
- self.plot_obj = plot_obj
- self.finder = get_finder(freq)
-
- def _get_default_locs(self, vmin, vmax):
- """Returns the default locations of ticks."""
- if self.plot_obj.date_axis_info is None:
- self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq)
-
- locator = self.plot_obj.date_axis_info
-
- if self.isminor:
- return np.compress(locator["min"], locator["val"])
- return np.compress(locator["maj"], locator["val"])
-
- def __call__(self):
- """Return the locations of the ticks."""
- # axis calls Locator.set_axis inside set_m<xxxx>_formatter
-
- vi = tuple(self.axis.get_view_interval())
- if vi != self.plot_obj.view_interval:
- self.plot_obj.date_axis_info = None
- self.plot_obj.view_interval = vi
- vmin, vmax = vi
- if vmax < vmin:
- vmin, vmax = vmax, vmin
- if self.isdynamic:
- locs = self._get_default_locs(vmin, vmax)
- else: # pragma: no cover
- base = self.base
- (d, m) = divmod(vmin, base)
- vmin = (d + 1) * base
- locs = list(range(vmin, vmax + 1, base))
- return locs
-
- def autoscale(self):
- """
- Sets the view limits to the nearest multiples of base that contain the
- data.
- """
- # requires matplotlib >= 0.98.0
- (vmin, vmax) = self.axis.get_data_interval()
-
- locs = self._get_default_locs(vmin, vmax)
- (vmin, vmax) = locs[[0, -1]]
- if vmin == vmax:
- vmin -= 1
- vmax += 1
- return nonsingular(vmin, vmax)
-
-
-# -------------------------------------------------------------------------
-# --- Formatter ---
-# -------------------------------------------------------------------------
-
-
-class TimeSeries_DateFormatter(Formatter):
- """
- Formats the ticks along an axis controlled by a :class:`PeriodIndex`.
-
- Parameters
- ----------
- freq : BaseOffset
- Valid frequency specifier.
- minor_locator : bool, default False
- Whether the current formatter should apply to minor ticks (True) or
- major ticks (False).
- dynamic_mode : bool, default True
- Whether the formatter works in dynamic mode or not.
- """
-
- def __init__(
- self,
- freq: BaseOffset,
- minor_locator: bool = False,
- dynamic_mode: bool = True,
- plot_obj=None,
- ) -> None:
- freq = to_offset(freq)
- self.format = None
- self.freq = freq
- self.locs: list[Any] = [] # unused, for matplotlib compat
- self.formatdict: dict[Any, Any] | None = None
- self.isminor = minor_locator
- self.isdynamic = dynamic_mode
- self.offset = 0
- self.plot_obj = plot_obj
- self.finder = get_finder(freq)
-
- def _set_default_format(self, vmin, vmax):
- """Returns the default ticks spacing."""
- if self.plot_obj.date_axis_info is None:
- self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq)
- info = self.plot_obj.date_axis_info
-
- if self.isminor:
- format = np.compress(info["min"] & np.logical_not(info["maj"]), info)
- else:
- format = np.compress(info["maj"], info)
- self.formatdict = {x: f for (x, _, _, f) in format}
- return self.formatdict
-
- def set_locs(self, locs) -> None:
- """Sets the locations of the ticks"""
- # don't actually use the locs. This is just needed to work with
- # matplotlib. Force to use vmin, vmax
-
- self.locs = locs
-
- (vmin, vmax) = vi = tuple(self.axis.get_view_interval())
- if vi != self.plot_obj.view_interval:
- self.plot_obj.date_axis_info = None
- self.plot_obj.view_interval = vi
- if vmax < vmin:
- (vmin, vmax) = (vmax, vmin)
- self._set_default_format(vmin, vmax)
-
- def __call__(self, x, pos: int = 0) -> str:
- if self.formatdict is None:
- return ""
- else:
- fmt = self.formatdict.pop(x, "")
- if isinstance(fmt, np.bytes_):
- fmt = fmt.decode("utf-8")
- period = Period(ordinal=int(x), freq=self.freq)
- assert isinstance(period, Period)
- return period.strftime(fmt)
-
-
-class TimeSeries_TimedeltaFormatter(Formatter):
- """
- Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`.
- """
-
- @staticmethod
- def format_timedelta_ticks(x, pos, n_decimals: int) -> str:
- """
-        Convert a tick value in nanoseconds to 'D days HH:MM:SS.F'
- """
- s, ns = divmod(x, 10**9)
- m, s = divmod(s, 60)
- h, m = divmod(m, 60)
- d, h = divmod(h, 24)
- decimals = int(ns * 10 ** (n_decimals - 9))
- s = f"{int(h):02d}:{int(m):02d}:{int(s):02d}"
- if n_decimals > 0:
- s += f".{decimals:0{n_decimals}d}"
- if d != 0:
- s = f"{int(d):d} days {s}"
- return s
-
- def __call__(self, x, pos: int = 0) -> str:
- (vmin, vmax) = tuple(self.axis.get_view_interval())
- n_decimals = min(int(np.ceil(np.log10(100 * 10**9 / abs(vmax - vmin)))), 9)
- return self.format_timedelta_ticks(x, pos, n_decimals)
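The tick values reaching this formatter are the underlying timedeltas in nanoseconds; an illustration of the static helper with example values:

from pandas.plotting._matplotlib.converter import TimeSeries_TimedeltaFormatter

x = (26 * 3600 + 5) * 10**9 + 123_000_000   # 1 day 02:00:05.123 in nanoseconds
TimeSeries_TimedeltaFormatter.format_timedelta_ticks(x, pos=0, n_decimals=3)
# -> '1 days 02:00:05.123'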
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/core.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/core.py
deleted file mode 100644
index 88244018079..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/core.py
+++ /dev/null
@@ -1,1877 +0,0 @@
-from __future__ import annotations
-
-from abc import (
- ABC,
- abstractmethod,
-)
-from typing import (
- TYPE_CHECKING,
- Hashable,
- Iterable,
- Literal,
- Sequence,
-)
-import warnings
-
-import matplotlib as mpl
-from matplotlib.artist import Artist
-import numpy as np
-
-from pandas._typing import (
- IndexLabel,
- PlottingOrientation,
- npt,
-)
-from pandas.errors import AbstractMethodError
-from pandas.util._decorators import cache_readonly
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import (
- is_any_real_numeric_dtype,
- is_categorical_dtype,
- is_extension_array_dtype,
- is_float,
- is_float_dtype,
- is_hashable,
- is_integer,
- is_integer_dtype,
- is_iterator,
- is_list_like,
- is_number,
- is_numeric_dtype,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCMultiIndex,
- ABCPeriodIndex,
- ABCSeries,
-)
-from pandas.core.dtypes.missing import (
- isna,
- notna,
-)
-
-import pandas.core.common as com
-from pandas.core.frame import DataFrame
-from pandas.util.version import Version
-
-from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib import tools
-from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters
-from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by
-from pandas.plotting._matplotlib.misc import unpack_single_str_list
-from pandas.plotting._matplotlib.style import get_standard_colors
-from pandas.plotting._matplotlib.timeseries import (
- decorate_axes,
- format_dateaxis,
- maybe_convert_index,
- maybe_resample,
- use_dynamic_x,
-)
-from pandas.plotting._matplotlib.tools import (
- create_subplots,
- flatten_axes,
- format_date_labels,
- get_all_lines,
- get_xlim,
- handle_shared_axes,
-)
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
- from matplotlib.axis import Axis
-
-
-def _color_in_style(style: str) -> bool:
- """
- Check if there is a color letter in the style string.
- """
- from matplotlib.colors import BASE_COLORS
-
- return not set(BASE_COLORS).isdisjoint(style)
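_color_in_style detects a single-letter matplotlib base color embedded in a style string, which is what triggers the color/style conflict check further down:

from pandas.plotting._matplotlib.core import _color_in_style  # helper above

_color_in_style("g--")   # -> True  ('g' is a base color)
_color_in_style("--o")   # -> False (only linestyle/marker characters)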
-
-
-class MPLPlot(ABC):
- """
- Base class for assembling a pandas plot using matplotlib
-
- Parameters
- ----------
-    data : Series or DataFrame
-        The data to be plotted.
-
- """
-
- @property
- @abstractmethod
- def _kind(self) -> str:
- """Specify kind str. Must be overridden in child class"""
- raise NotImplementedError
-
- _layout_type = "vertical"
- _default_rot = 0
-
- @property
- def orientation(self) -> str | None:
- return None
-
- axes: np.ndarray # of Axes objects
-
- def __init__(
- self,
- data,
- kind=None,
- by: IndexLabel | None = None,
- subplots: bool | Sequence[Sequence[str]] = False,
- sharex=None,
- sharey: bool = False,
- use_index: bool = True,
- figsize=None,
- grid=None,
- legend: bool | str = True,
- rot=None,
- ax=None,
- fig=None,
- title=None,
- xlim=None,
- ylim=None,
- xticks=None,
- yticks=None,
- xlabel: Hashable | None = None,
- ylabel: Hashable | None = None,
- fontsize=None,
- secondary_y: bool | tuple | list | np.ndarray = False,
- colormap=None,
- table: bool = False,
- layout=None,
- include_bool: bool = False,
- column: IndexLabel | None = None,
- **kwds,
- ) -> None:
- import matplotlib.pyplot as plt
-
- self.data = data
-
- # if users assign an empty list or tuple, raise `ValueError`
- # similar to current `df.box` and `df.hist` APIs.
- if by in ([], ()):
- raise ValueError("No group keys passed!")
- self.by = com.maybe_make_list(by)
-
- # Assign the rest of columns into self.columns if by is explicitly defined
- # while column is not, only need `columns` in hist/box plot when it's DF
- # TODO: Might deprecate `column` argument in future PR (#28373)
- if isinstance(data, DataFrame):
- if column:
- self.columns = com.maybe_make_list(column)
- else:
- if self.by is None:
- self.columns = [
- col for col in data.columns if is_numeric_dtype(data[col])
- ]
- else:
- self.columns = [
- col
- for col in data.columns
- if col not in self.by and is_numeric_dtype(data[col])
- ]
-
- # For `hist` plot, need to get grouped original data before `self.data` is
- # updated later
- if self.by is not None and self._kind == "hist":
- self._grouped = data.groupby(unpack_single_str_list(self.by))
-
- self.kind = kind
-
- self.subplots = self._validate_subplots_kwarg(subplots)
-
- if sharex is None:
- # if by is defined, subplots are used and sharex should be False
- if ax is None and by is None:
- self.sharex = True
- else:
- # if we get an axis, the users should do the visibility
- # setting...
- self.sharex = False
- else:
- self.sharex = sharex
-
- self.sharey = sharey
- self.figsize = figsize
- self.layout = layout
-
- self.xticks = xticks
- self.yticks = yticks
- self.xlim = xlim
- self.ylim = ylim
- self.title = title
- self.use_index = use_index
- self.xlabel = xlabel
- self.ylabel = ylabel
-
- self.fontsize = fontsize
-
- if rot is not None:
- self.rot = rot
- # need to know for format_date_labels since it's rotated to 30 by
- # default
- self._rot_set = True
- else:
- self._rot_set = False
- self.rot = self._default_rot
-
- if grid is None:
- grid = False if secondary_y else plt.rcParams["axes.grid"]
-
- self.grid = grid
- self.legend = legend
- self.legend_handles: list[Artist] = []
- self.legend_labels: list[Hashable] = []
-
- self.logx = kwds.pop("logx", False)
- self.logy = kwds.pop("logy", False)
- self.loglog = kwds.pop("loglog", False)
- self.label = kwds.pop("label", None)
- self.style = kwds.pop("style", None)
- self.mark_right = kwds.pop("mark_right", True)
- self.stacked = kwds.pop("stacked", False)
-
- self.ax = ax
- self.fig = fig
- self.axes = np.array([], dtype=object) # "real" version get set in `generate`
-
- # parse errorbar input if given
- xerr = kwds.pop("xerr", None)
- yerr = kwds.pop("yerr", None)
- self.errors = {
- kw: self._parse_errorbars(kw, err)
- for kw, err in zip(["xerr", "yerr"], [xerr, yerr])
- }
-
- if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndex)):
- secondary_y = [secondary_y]
- self.secondary_y = secondary_y
-
- # ugly TypeError if user passes matplotlib's `cmap` name.
- # Probably better to accept either.
- if "cmap" in kwds and colormap:
- raise TypeError("Only specify one of `cmap` and `colormap`.")
- if "cmap" in kwds:
- self.colormap = kwds.pop("cmap")
- else:
- self.colormap = colormap
-
- self.table = table
- self.include_bool = include_bool
-
- self.kwds = kwds
-
- self._validate_color_args()
-
- def _validate_subplots_kwarg(
- self, subplots: bool | Sequence[Sequence[str]]
- ) -> bool | list[tuple[int, ...]]:
- """
- Validate the subplots parameter
-
- - check type and content
- - check for duplicate columns
- - check for invalid column names
- - convert column names into indices
- - add missing columns in a group of their own
- See comments in code below for more details.
-
- Parameters
- ----------
- subplots : subplots parameters as passed to PlotAccessor
-
- Returns
- -------
- validated subplots : a bool or a list of tuples of column indices. Columns
- in the same tuple will be grouped together in the resulting plot.
- """
-
- if isinstance(subplots, bool):
- return subplots
- elif not isinstance(subplots, Iterable):
- raise ValueError("subplots should be a bool or an iterable")
-
- supported_kinds = (
- "line",
- "bar",
- "barh",
- "hist",
- "kde",
- "density",
- "area",
- "pie",
- )
- if self._kind not in supported_kinds:
- raise ValueError(
- "When subplots is an iterable, kind must be "
- f"one of {', '.join(supported_kinds)}. Got {self._kind}."
- )
-
- if isinstance(self.data, ABCSeries):
- raise NotImplementedError(
- "An iterable subplots for a Series is not supported."
- )
-
- columns = self.data.columns
- if isinstance(columns, ABCMultiIndex):
- raise NotImplementedError(
- "An iterable subplots for a DataFrame with a MultiIndex column "
- "is not supported."
- )
-
- if columns.nunique() != len(columns):
- raise NotImplementedError(
- "An iterable subplots for a DataFrame with non-unique column "
- "labels is not supported."
- )
-
- # subplots is a list of tuples where each tuple is a group of
- # columns to be grouped together (one ax per group).
- # we consolidate the subplots list such that:
- # - the tuples contain indices instead of column names
- # - the columns that aren't yet in the list are added in a group
- # of their own.
- # For example with columns from a to g, and
- # subplots = [(a, c), (b, f, e)],
- # we end up with [(ai, ci), (bi, fi, ei), (di,), (gi,)]
- # This way, we can handle self.subplots in a homogeneous manner
- # later.
- # TODO: also accept indices instead of just names?
-
- out = []
- seen_columns: set[Hashable] = set()
- for group in subplots:
- if not is_list_like(group):
- raise ValueError(
- "When subplots is an iterable, each entry "
- "should be a list/tuple of column names."
- )
- idx_locs = columns.get_indexer_for(group)
- if (idx_locs == -1).any():
- bad_labels = np.extract(idx_locs == -1, group)
- raise ValueError(
- f"Column label(s) {list(bad_labels)} not found in the DataFrame."
- )
- unique_columns = set(group)
- duplicates = seen_columns.intersection(unique_columns)
- if duplicates:
- raise ValueError(
- "Each column should be in only one subplot. "
- f"Columns {duplicates} were found in multiple subplots."
- )
- seen_columns = seen_columns.union(unique_columns)
- out.append(tuple(idx_locs))
-
- unseen_columns = columns.difference(seen_columns)
- for column in unseen_columns:
- idx_loc = columns.get_loc(column)
- out.append((idx_loc,))
- return out
-
- def _validate_color_args(self):
- if (
- "color" in self.kwds
- and self.nseries == 1
- and not is_list_like(self.kwds["color"])
- ):
- # support series.plot(color='green')
- self.kwds["color"] = [self.kwds["color"]]
-
- if (
- "color" in self.kwds
- and isinstance(self.kwds["color"], tuple)
- and self.nseries == 1
- and len(self.kwds["color"]) in (3, 4)
- ):
- # support RGB and RGBA tuples in series plot
- self.kwds["color"] = [self.kwds["color"]]
-
- if (
- "color" in self.kwds or "colors" in self.kwds
- ) and self.colormap is not None:
- warnings.warn(
- "'color' and 'colormap' cannot be used simultaneously. Using 'color'",
- stacklevel=find_stack_level(),
- )
-
- if "color" in self.kwds and self.style is not None:
- if is_list_like(self.style):
- styles = self.style
- else:
- styles = [self.style]
- # need only a single match
- for s in styles:
- if _color_in_style(s):
- raise ValueError(
- "Cannot pass 'style' string with a color symbol and "
- "'color' keyword argument. Please use one or the "
- "other or pass 'style' without a color symbol"
- )
-
- def _iter_data(self, data=None, keep_index: bool = False, fillna=None):
- if data is None:
- data = self.data
- if fillna is not None:
- data = data.fillna(fillna)
-
- for col, values in data.items():
- if keep_index is True:
- yield col, values
- else:
- yield col, values.values
-
- @property
- def nseries(self) -> int:
- # When `by` is explicitly assigned, grouped data size will be defined, and
- # this will determine number of subplots to have, aka `self.nseries`
- if self.data.ndim == 1:
- return 1
- elif self.by is not None and self._kind == "hist":
- return len(self._grouped)
- elif self.by is not None and self._kind == "box":
- return len(self.columns)
- else:
- return self.data.shape[1]
-
- def draw(self) -> None:
- self.plt.draw_if_interactive()
-
- def generate(self) -> None:
- self._args_adjust()
- self._compute_plot_data()
- self._setup_subplots()
- self._make_plot()
- self._add_table()
- self._make_legend()
- self._adorn_subplots()
-
- for ax in self.axes:
- self._post_plot_logic_common(ax, self.data)
- self._post_plot_logic(ax, self.data)
-
- @abstractmethod
- def _args_adjust(self) -> None:
- pass
-
- def _has_plotted_object(self, ax: Axes) -> bool:
- """check whether ax has data"""
- return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0
-
- def _maybe_right_yaxis(self, ax: Axes, axes_num):
- if not self.on_right(axes_num):
- # secondary axes may be passed via ax kw
- return self._get_ax_layer(ax)
-
- if hasattr(ax, "right_ax"):
- # if it has right_ax property, ``ax`` must be left axes
- return ax.right_ax
- elif hasattr(ax, "left_ax"):
- # if it has left_ax property, ``ax`` must be right axes
- return ax
- else:
- # otherwise, create twin axes
- orig_ax, new_ax = ax, ax.twinx()
- # TODO: use Matplotlib public API when available
- new_ax._get_lines = orig_ax._get_lines
- new_ax._get_patches_for_fill = orig_ax._get_patches_for_fill
- orig_ax.right_ax, new_ax.left_ax = new_ax, orig_ax
-
- if not self._has_plotted_object(orig_ax): # no data on left y
- orig_ax.get_yaxis().set_visible(False)
-
- if self.logy is True or self.loglog is True:
- new_ax.set_yscale("log")
- elif self.logy == "sym" or self.loglog == "sym":
- new_ax.set_yscale("symlog")
- return new_ax
-
- def _setup_subplots(self):
- if self.subplots:
- naxes = (
- self.nseries if isinstance(self.subplots, bool) else len(self.subplots)
- )
- fig, axes = create_subplots(
- naxes=naxes,
- sharex=self.sharex,
- sharey=self.sharey,
- figsize=self.figsize,
- ax=self.ax,
- layout=self.layout,
- layout_type=self._layout_type,
- )
- else:
- if self.ax is None:
- fig = self.plt.figure(figsize=self.figsize)
- axes = fig.add_subplot(111)
- else:
- fig = self.ax.get_figure()
- if self.figsize is not None:
- fig.set_size_inches(self.figsize)
- axes = self.ax
-
- axes = flatten_axes(axes)
-
- valid_log = {False, True, "sym", None}
- input_log = {self.logx, self.logy, self.loglog}
- if input_log - valid_log:
- invalid_log = next(iter(input_log - valid_log))
- raise ValueError(
- f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given."
- )
-
- if self.logx is True or self.loglog is True:
- [a.set_xscale("log") for a in axes]
- elif self.logx == "sym" or self.loglog == "sym":
- [a.set_xscale("symlog") for a in axes]
-
- if self.logy is True or self.loglog is True:
- [a.set_yscale("log") for a in axes]
- elif self.logy == "sym" or self.loglog == "sym":
- [a.set_yscale("symlog") for a in axes]
-
- self.fig = fig
- self.axes = axes
-
- @property
- def result(self):
- """
- Return result axes
- """
- if self.subplots:
- if self.layout is not None and not is_list_like(self.ax):
- return self.axes.reshape(*self.layout)
- else:
- return self.axes
- else:
- sec_true = isinstance(self.secondary_y, bool) and self.secondary_y
- # error: Argument 1 to "len" has incompatible type "Union[bool,
- # Tuple[Any, ...], List[Any], ndarray[Any, Any]]"; expected "Sized"
- all_sec = (
- is_list_like(self.secondary_y)
- and len(self.secondary_y) == self.nseries # type: ignore[arg-type]
- )
- if sec_true or all_sec:
- # if all data is plotted on secondary, return right axes
- return self._get_ax_layer(self.axes[0], primary=False)
- else:
- return self.axes[0]
-
- def _convert_to_ndarray(self, data):
- # GH31357: categorical columns are processed separately
- if is_categorical_dtype(data):
- return data
-
- # GH32073: cast to float if values contain nulled integers
- if (
- is_integer_dtype(data.dtype) or is_float_dtype(data.dtype)
- ) and is_extension_array_dtype(data.dtype):
- return data.to_numpy(dtype="float", na_value=np.nan)
-
- # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to
- # np.ndarray before plot.
- if len(data) > 0:
- return np.asarray(data)
-
- return data
-
- def _compute_plot_data(self):
- data = self.data
-
- if isinstance(data, ABCSeries):
- label = self.label
- if label is None and data.name is None:
- label = ""
- if label is None:
- # We'll end up with columns of [0] instead of [None]
- data = data.to_frame()
- else:
- data = data.to_frame(name=label)
- elif self._kind in ("hist", "box"):
- cols = self.columns if self.by is None else self.columns + self.by
- data = data.loc[:, cols]
-
- # GH15079 reconstruct data if by is defined
- if self.by is not None:
- self.subplots = True
- data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns)
-
- # GH16953, infer_objects is needed as fallback, for ``Series``
- # with ``dtype == object``
- data = data.infer_objects(copy=False)
- include_type = [np.number, "datetime", "datetimetz", "timedelta"]
-
- # GH23719, allow plotting boolean
- if self.include_bool is True:
- include_type.append(np.bool_)
-
- # GH22799, exclude datetime-like type for boxplot
- exclude_type = None
- if self._kind == "box":
- # TODO: change after solving issue 27881
- include_type = [np.number]
- exclude_type = ["timedelta"]
-
- # GH 18755, include object and category type for scatter plot
- if self._kind == "scatter":
- include_type.extend(["object", "category"])
-
- numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type)
-
- try:
- is_empty = numeric_data.columns.empty
- except AttributeError:
- is_empty = not len(numeric_data)
-
- # no non-numeric frames or series allowed
- if is_empty:
- raise TypeError("no numeric data to plot")
-
- self.data = numeric_data.apply(self._convert_to_ndarray)
-
- def _make_plot(self):
- raise AbstractMethodError(self)
-
- def _add_table(self) -> None:
- if self.table is False:
- return
- elif self.table is True:
- data = self.data.transpose()
- else:
- data = self.table
- ax = self._get_ax(0)
- tools.table(ax, data)
-
- def _post_plot_logic_common(self, ax, data):
- """Common post process for each axes"""
- if self.orientation == "vertical" or self.orientation is None:
- self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize)
- self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize)
-
- if hasattr(ax, "right_ax"):
- self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize)
-
- elif self.orientation == "horizontal":
- self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize)
- self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize)
-
- if hasattr(ax, "right_ax"):
- self._apply_axis_properties(ax.right_ax.yaxis, fontsize=self.fontsize)
- else: # pragma no cover
- raise ValueError
-
- @abstractmethod
- def _post_plot_logic(self, ax, data) -> None:
- """Post process for each axes. Overridden in child classes"""
-
- def _adorn_subplots(self):
- """Common post process unrelated to data"""
- if len(self.axes) > 0:
- all_axes = self._get_subplots()
- nrows, ncols = self._get_axes_layout()
- handle_shared_axes(
- axarr=all_axes,
- nplots=len(all_axes),
- naxes=nrows * ncols,
- nrows=nrows,
- ncols=ncols,
- sharex=self.sharex,
- sharey=self.sharey,
- )
-
- for ax in self.axes:
- ax = getattr(ax, "right_ax", ax)
- if self.yticks is not None:
- ax.set_yticks(self.yticks)
-
- if self.xticks is not None:
- ax.set_xticks(self.xticks)
-
- if self.ylim is not None:
- ax.set_ylim(self.ylim)
-
- if self.xlim is not None:
- ax.set_xlim(self.xlim)
-
- # GH9093, currently Pandas does not show ylabel, so if users provide
- # ylabel will set it as ylabel in the plot.
- if self.ylabel is not None:
- ax.set_ylabel(pprint_thing(self.ylabel))
-
- ax.grid(self.grid)
-
- if self.title:
- if self.subplots:
- if is_list_like(self.title):
- if len(self.title) != self.nseries:
- raise ValueError(
- "The length of `title` must equal the number "
- "of columns if using `title` of type `list` "
- "and `subplots=True`.\n"
- f"length of title = {len(self.title)}\n"
- f"number of columns = {self.nseries}"
- )
-
- for ax, title in zip(self.axes, self.title):
- ax.set_title(title)
- else:
- self.fig.suptitle(self.title)
- else:
- if is_list_like(self.title):
- msg = (
- "Using `title` of type `list` is not supported "
- "unless `subplots=True` is passed"
- )
- raise ValueError(msg)
- self.axes[0].set_title(self.title)
-
- def _apply_axis_properties(self, axis: Axis, rot=None, fontsize=None) -> None:
- """
- Tick creation within matplotlib is reasonably expensive and is
- internally deferred until accessed as Ticks are created/destroyed
- multiple times per draw. It's therefore beneficial for us to avoid
- accessing unless we will act on the Tick.
- """
- if rot is not None or fontsize is not None:
- # rot=0 is a valid setting, hence the explicit None check
- labels = axis.get_majorticklabels() + axis.get_minorticklabels()
- for label in labels:
- if rot is not None:
- label.set_rotation(rot)
- if fontsize is not None:
- label.set_fontsize(fontsize)
-
- @property
- def legend_title(self) -> str | None:
- if not isinstance(self.data.columns, ABCMultiIndex):
- name = self.data.columns.name
- if name is not None:
- name = pprint_thing(name)
- return name
- else:
- stringified = map(pprint_thing, self.data.columns.names)
- return ",".join(stringified)
-
- def _mark_right_label(self, label: str, index: int) -> str:
- """
- Append ``(right)`` to the label of a line if it's plotted on the right axis.
-
- Note that ``(right)`` is only appended when ``subplots=False``.
- """
- if not self.subplots and self.mark_right and self.on_right(index):
- label += " (right)"
- return label
-
- def _append_legend_handles_labels(self, handle: Artist, label: str) -> None:
- """
- Append current handle and label to ``legend_handles`` and ``legend_labels``.
-
- These will be used to make the legend.
- """
- self.legend_handles.append(handle)
- self.legend_labels.append(label)
-
- def _make_legend(self) -> None:
- ax, leg = self._get_ax_legend(self.axes[0])
-
- handles = []
- labels = []
- title = ""
-
- if not self.subplots:
- if leg is not None:
- title = leg.get_title().get_text()
- # Replace leg.legend_handles because it misses marker info
- if Version(mpl.__version__) < Version("3.7"):
- handles = leg.legendHandles
- else:
- handles = leg.legend_handles
- labels = [x.get_text() for x in leg.get_texts()]
-
- if self.legend:
- if self.legend == "reverse":
- handles += reversed(self.legend_handles)
- labels += reversed(self.legend_labels)
- else:
- handles += self.legend_handles
- labels += self.legend_labels
-
- if self.legend_title is not None:
- title = self.legend_title
-
- if len(handles) > 0:
- ax.legend(handles, labels, loc="best", title=title)
-
- elif self.subplots and self.legend:
- for ax in self.axes:
- if ax.get_visible():
- ax.legend(loc="best")
-
- def _get_ax_legend(self, ax: Axes):
- """
- Take in axes and return ax and legend under different scenarios
- """
- leg = ax.get_legend()
-
- other_ax = getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None)
- other_leg = None
- if other_ax is not None:
- other_leg = other_ax.get_legend()
- if leg is None and other_leg is not None:
- leg = other_leg
- ax = other_ax
- return ax, leg
-
- @cache_readonly
- def plt(self):
- import matplotlib.pyplot as plt
-
- return plt
-
- _need_to_set_index = False
-
- def _get_xticks(self, convert_period: bool = False):
- index = self.data.index
- is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time")
-
- if self.use_index:
- if convert_period and isinstance(index, ABCPeriodIndex):
- self.data = self.data.reindex(index=index.sort_values())
- x = self.data.index.to_timestamp()._mpl_repr()
- elif is_any_real_numeric_dtype(index):
- # Matplotlib supports numeric values or datetime objects as
- # xaxis values. Taking LBYL approach here, by the time
- # matplotlib raises exception when using non numeric/datetime
- # values for xaxis, several actions are already taken by plt.
- x = index._mpl_repr()
- elif is_datetype:
- self.data = self.data[notna(self.data.index)]
- self.data = self.data.sort_index()
- x = self.data.index._mpl_repr()
- else:
- self._need_to_set_index = True
- x = list(range(len(index)))
- else:
- x = list(range(len(index)))
-
- return x
-
- @classmethod
- @register_pandas_matplotlib_converters
- def _plot(
- cls, ax: Axes, x, y: np.ndarray, style=None, is_errorbar: bool = False, **kwds
- ):
- mask = isna(y)
- if mask.any():
- y = np.ma.array(y)
- y = np.ma.masked_where(mask, y)
-
- if isinstance(x, ABCIndex):
- x = x._mpl_repr()
-
- if is_errorbar:
- if "xerr" in kwds:
- kwds["xerr"] = np.array(kwds.get("xerr"))
- if "yerr" in kwds:
- kwds["yerr"] = np.array(kwds.get("yerr"))
- return ax.errorbar(x, y, **kwds)
- else:
- # prevent style kwarg from going to errorbar, where it is unsupported
- args = (x, y, style) if style is not None else (x, y)
- return ax.plot(*args, **kwds)
-
- def _get_custom_index_name(self):
- """Specify whether xlabel/ylabel should be used to override index name"""
- return self.xlabel
-
- def _get_index_name(self) -> str | None:
- if isinstance(self.data.index, ABCMultiIndex):
- name = self.data.index.names
- if com.any_not_none(*name):
- name = ",".join([pprint_thing(x) for x in name])
- else:
- name = None
- else:
- name = self.data.index.name
- if name is not None:
- name = pprint_thing(name)
-
- # GH 45145, override the default axis label if one is provided.
- index_name = self._get_custom_index_name()
- if index_name is not None:
- name = pprint_thing(index_name)
-
- return name
-
- @classmethod
- def _get_ax_layer(cls, ax, primary: bool = True):
- """get left (primary) or right (secondary) axes"""
- if primary:
- return getattr(ax, "left_ax", ax)
- else:
- return getattr(ax, "right_ax", ax)
-
- def _col_idx_to_axis_idx(self, col_idx: int) -> int:
- """Return the index of the axis where the column at col_idx should be plotted"""
- if isinstance(self.subplots, list):
- # Subplots is a list: some columns will be grouped together in the same ax
- return next(
- group_idx
- for (group_idx, group) in enumerate(self.subplots)
- if col_idx in group
- )
- else:
- # subplots is True: one ax per column
- return col_idx
-
- def _get_ax(self, i: int):
- # get the twinx ax if appropriate
- if self.subplots:
- i = self._col_idx_to_axis_idx(i)
- ax = self.axes[i]
- ax = self._maybe_right_yaxis(ax, i)
- self.axes[i] = ax
- else:
- ax = self.axes[0]
- ax = self._maybe_right_yaxis(ax, i)
-
- ax.get_yaxis().set_visible(True)
- return ax
-
- @classmethod
- def get_default_ax(cls, ax) -> None:
- import matplotlib.pyplot as plt
-
- if ax is None and len(plt.get_fignums()) > 0:
- with plt.rc_context():
- ax = plt.gca()
- ax = cls._get_ax_layer(ax)
-
- def on_right(self, i):
- if isinstance(self.secondary_y, bool):
- return self.secondary_y
-
- if isinstance(self.secondary_y, (tuple, list, np.ndarray, ABCIndex)):
- return self.data.columns[i] in self.secondary_y
-
- def _apply_style_colors(self, colors, kwds, col_num, label):
- """
- Manage style and color based on column number and its label.
- Returns tuple of appropriate style and kwds which "color" may be added.
- """
- style = None
- if self.style is not None:
- if isinstance(self.style, list):
- try:
- style = self.style[col_num]
- except IndexError:
- pass
- elif isinstance(self.style, dict):
- style = self.style.get(label, style)
- else:
- style = self.style
-
- has_color = "color" in kwds or self.colormap is not None
- nocolor_style = style is None or not _color_in_style(style)
- if (has_color or self.subplots) and nocolor_style:
- if isinstance(colors, dict):
- kwds["color"] = colors[label]
- else:
- kwds["color"] = colors[col_num % len(colors)]
- return style, kwds
-
- def _get_colors(
- self,
- num_colors: int | None = None,
- color_kwds: str = "color",
- ):
- if num_colors is None:
- num_colors = self.nseries
-
- return get_standard_colors(
- num_colors=num_colors,
- colormap=self.colormap,
- color=self.kwds.get(color_kwds),
- )
-
- def _parse_errorbars(self, label, err):
- """
- Look for error keyword arguments and return the actual errorbar data
- or return the error DataFrame/dict
-
- Error bars can be specified in several ways:
- Series: the user provides a pandas.Series object of the same
- length as the data
- ndarray: provides a np.ndarray of the same length as the data
- DataFrame/dict: error values are paired with keys matching the
- key in the plotted DataFrame
- str: the name of the column within the plotted DataFrame
-
- Asymmetrical error bars are also supported, however raw error values
- must be provided in this case. For a ``N`` length :class:`Series`, a
- ``2xN`` array should be provided indicating lower and upper (or left
- and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors
- should be in a ``Mx2xN`` array.
- """
- if err is None:
- return None
-
- def match_labels(data, e):
- e = e.reindex(data.index)
- return e
-
- # key-matched DataFrame
- if isinstance(err, ABCDataFrame):
- err = match_labels(self.data, err)
- # key-matched dict
- elif isinstance(err, dict):
- pass
-
- # Series of error values
- elif isinstance(err, ABCSeries):
- # broadcast error series across data
- err = match_labels(self.data, err)
- err = np.atleast_2d(err)
- err = np.tile(err, (self.nseries, 1))
-
- # errors are a column in the dataframe
- elif isinstance(err, str):
- evalues = self.data[err].values
- self.data = self.data[self.data.columns.drop(err)]
- err = np.atleast_2d(evalues)
- err = np.tile(err, (self.nseries, 1))
-
- elif is_list_like(err):
- if is_iterator(err):
- err = np.atleast_2d(list(err))
- else:
- # raw error values
- err = np.atleast_2d(err)
-
- err_shape = err.shape
-
- # asymmetrical error bars
- if isinstance(self.data, ABCSeries) and err_shape[0] == 2:
- err = np.expand_dims(err, 0)
- err_shape = err.shape
- if err_shape[2] != len(self.data):
- raise ValueError(
- "Asymmetrical error bars should be provided "
- f"with the shape (2, {len(self.data)})"
- )
- elif isinstance(self.data, ABCDataFrame) and err.ndim == 3:
- if (
- (err_shape[0] != self.nseries)
- or (err_shape[1] != 2)
- or (err_shape[2] != len(self.data))
- ):
- raise ValueError(
- "Asymmetrical error bars should be provided "
- f"with the shape ({self.nseries}, 2, {len(self.data)})"
- )
-
- # broadcast errors to each data series
- if len(err) == 1:
- err = np.tile(err, (self.nseries, 1))
-
- elif is_number(err):
- err = np.tile([err], (self.nseries, len(self.data)))
-
- else:
- msg = f"No valid {label} detected"
- raise ValueError(msg)
-
- return err
-
- def _get_errorbars(
- self, label=None, index=None, xerr: bool = True, yerr: bool = True
- ):
- errors = {}
-
- for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]):
- if flag:
- err = self.errors[kw]
- # user provided label-matched dataframe of errors
- if isinstance(err, (ABCDataFrame, dict)):
- if label is not None and label in err.keys():
- err = err[label]
- else:
- err = None
- elif index is not None and err is not None:
- err = err[index]
-
- if err is not None:
- errors[kw] = err
- return errors
-
- def _get_subplots(self):
- from matplotlib.axes import Subplot
-
- return [
- ax
- for ax in self.fig.get_axes()
- if (isinstance(ax, Subplot) and ax.get_subplotspec() is not None)
- ]
-
- def _get_axes_layout(self) -> tuple[int, int]:
- axes = self._get_subplots()
- x_set = set()
- y_set = set()
- for ax in axes:
- # check axes coordinates to estimate layout
- points = ax.get_position().get_points()
- x_set.add(points[0][0])
- y_set.add(points[0][1])
- return (len(y_set), len(x_set))
-
-
-class PlanePlot(MPLPlot, ABC):
- """
- Abstract class for plotting on plane, currently scatter and hexbin.
- """
-
- _layout_type = "single"
-
- def __init__(self, data, x, y, **kwargs) -> None:
- MPLPlot.__init__(self, data, **kwargs)
- if x is None or y is None:
- raise ValueError(self._kind + " requires an x and y column")
- if is_integer(x) and not self.data.columns._holds_integer():
- x = self.data.columns[x]
- if is_integer(y) and not self.data.columns._holds_integer():
- y = self.data.columns[y]
-
- # Scatter plot allows to plot objects data
- if self._kind == "hexbin":
- if len(self.data[x]._get_numeric_data()) == 0:
- raise ValueError(self._kind + " requires x column to be numeric")
- if len(self.data[y]._get_numeric_data()) == 0:
- raise ValueError(self._kind + " requires y column to be numeric")
-
- self.x = x
- self.y = y
-
- @property
- def nseries(self) -> int:
- return 1
-
- def _post_plot_logic(self, ax: Axes, data) -> None:
- x, y = self.x, self.y
- xlabel = self.xlabel if self.xlabel is not None else pprint_thing(x)
- ylabel = self.ylabel if self.ylabel is not None else pprint_thing(y)
- ax.set_xlabel(xlabel)
- ax.set_ylabel(ylabel)
-
- def _plot_colorbar(self, ax: Axes, **kwds):
- # Addresses issues #10611 and #10678:
- # When plotting scatterplots and hexbinplots in IPython
- # inline backend the colorbar axis height tends not to
- # exactly match the parent axis height.
- # The difference is due to small fractional differences
- # in floating points with similar representation.
- # To deal with this, this method forces the colorbar
- # height to take the height of the parent axes.
- # For a more detailed description of the issue
- # see the following link:
- # https://github.com/ipython/ipython/issues/11215
-
- # GH33389, if ax is used multiple times, we should always
- # use the last one which contains the latest information
- # about the ax
- img = ax.collections[-1]
- return self.fig.colorbar(img, ax=ax, **kwds)
-
-
-class ScatterPlot(PlanePlot):
- @property
- def _kind(self) -> Literal["scatter"]:
- return "scatter"
-
- def __init__(self, data, x, y, s=None, c=None, **kwargs) -> None:
- if s is None:
- # hide the matplotlib default for size, in case we want to change
- # the handling of this argument later
- s = 20
- elif is_hashable(s) and s in data.columns:
- s = data[s]
- super().__init__(data, x, y, s=s, **kwargs)
- if is_integer(c) and not self.data.columns._holds_integer():
- c = self.data.columns[c]
- self.c = c
-
- def _make_plot(self):
- x, y, c, data = self.x, self.y, self.c, self.data
- ax = self.axes[0]
-
- c_is_column = is_hashable(c) and c in self.data.columns
-
- color_by_categorical = c_is_column and is_categorical_dtype(self.data[c])
-
- color = self.kwds.pop("color", None)
- if c is not None and color is not None:
- raise TypeError("Specify exactly one of `c` and `color`")
- if c is None and color is None:
- c_values = self.plt.rcParams["patch.facecolor"]
- elif color is not None:
- c_values = color
- elif color_by_categorical:
- c_values = self.data[c].cat.codes
- elif c_is_column:
- c_values = self.data[c].values
- else:
- c_values = c
-
- if self.colormap is not None:
- cmap = mpl.colormaps.get_cmap(self.colormap)
- else:
- # cmap is only used if c_values are integers, otherwise UserWarning
- if is_integer_dtype(c_values):
- # pandas uses colormap, matplotlib uses cmap.
- cmap = "Greys"
- cmap = mpl.colormaps[cmap]
- else:
- cmap = None
-
- if color_by_categorical:
- from matplotlib import colors
-
- n_cats = len(self.data[c].cat.categories)
- cmap = colors.ListedColormap([cmap(i) for i in range(cmap.N)])
- bounds = np.linspace(0, n_cats, n_cats + 1)
- norm = colors.BoundaryNorm(bounds, cmap.N)
- else:
- norm = self.kwds.pop("norm", None)
- # plot colorbar if
- # 1. colormap is assigned, and
- # 2.`c` is a column containing only numeric values
- plot_colorbar = self.colormap or c_is_column
- cb = self.kwds.pop("colorbar", is_numeric_dtype(c_values) and plot_colorbar)
-
- if self.legend and hasattr(self, "label"):
- label = self.label
- else:
- label = None
- scatter = ax.scatter(
- data[x].values,
- data[y].values,
- c=c_values,
- label=label,
- cmap=cmap,
- norm=norm,
- **self.kwds,
- )
- if cb:
- cbar_label = c if c_is_column else ""
- cbar = self._plot_colorbar(ax, label=cbar_label)
- if color_by_categorical:
- cbar.set_ticks(np.linspace(0.5, n_cats - 0.5, n_cats))
- cbar.ax.set_yticklabels(self.data[c].cat.categories)
-
- if label is not None:
- self._append_legend_handles_labels(scatter, label)
- else:
- self.legend = False
-
- errors_x = self._get_errorbars(label=x, index=0, yerr=False)
- errors_y = self._get_errorbars(label=y, index=0, xerr=False)
- if len(errors_x) > 0 or len(errors_y) > 0:
- err_kwds = dict(errors_x, **errors_y)
- err_kwds["ecolor"] = scatter.get_facecolor()[0]
- ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds)
-
- def _args_adjust(self) -> None:
- pass
-
-
-class HexBinPlot(PlanePlot):
- @property
- def _kind(self) -> Literal["hexbin"]:
- return "hexbin"
-
- def __init__(self, data, x, y, C=None, **kwargs) -> None:
- super().__init__(data, x, y, **kwargs)
- if is_integer(C) and not self.data.columns._holds_integer():
- C = self.data.columns[C]
- self.C = C
-
- def _make_plot(self) -> None:
- x, y, data, C = self.x, self.y, self.data, self.C
- ax = self.axes[0]
- # pandas uses colormap, matplotlib uses cmap.
- cmap = self.colormap or "BuGn"
- cmap = mpl.colormaps.get_cmap(cmap)
- cb = self.kwds.pop("colorbar", True)
-
- if C is None:
- c_values = None
- else:
- c_values = data[C].values
-
- ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, **self.kwds)
- if cb:
- self._plot_colorbar(ax)
-
- def _make_legend(self) -> None:
- pass
-
- def _args_adjust(self) -> None:
- pass
-
-
-class LinePlot(MPLPlot):
- _default_rot = 0
-
- @property
- def orientation(self) -> PlottingOrientation:
- return "vertical"
-
- @property
- def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]:
- return "line"
-
- def __init__(self, data, **kwargs) -> None:
- from pandas.plotting import plot_params
-
- MPLPlot.__init__(self, data, **kwargs)
- if self.stacked:
- self.data = self.data.fillna(value=0)
- self.x_compat = plot_params["x_compat"]
- if "x_compat" in self.kwds:
- self.x_compat = bool(self.kwds.pop("x_compat"))
-
- def _is_ts_plot(self) -> bool:
- # this is slightly deceptive
- return not self.x_compat and self.use_index and self._use_dynamic_x()
-
- def _use_dynamic_x(self):
- return use_dynamic_x(self._get_ax(0), self.data)
-
- def _make_plot(self) -> None:
- if self._is_ts_plot():
- data = maybe_convert_index(self._get_ax(0), self.data)
-
- x = data.index # dummy, not used
- plotf = self._ts_plot
- it = self._iter_data(data=data, keep_index=True)
- else:
- x = self._get_xticks(convert_period=True)
- # error: Incompatible types in assignment (expression has type
- # "Callable[[Any, Any, Any, Any, Any, Any, KwArg(Any)], Any]", variable has
- # type "Callable[[Any, Any, Any, Any, KwArg(Any)], Any]")
- plotf = self._plot # type: ignore[assignment]
- it = self._iter_data()
-
- stacking_id = self._get_stacking_id()
- is_errorbar = com.any_not_none(*self.errors.values())
-
- colors = self._get_colors()
- for i, (label, y) in enumerate(it):
- ax = self._get_ax(i)
- kwds = self.kwds.copy()
- style, kwds = self._apply_style_colors(colors, kwds, i, label)
-
- errors = self._get_errorbars(label=label, index=i)
- kwds = dict(kwds, **errors)
-
- label = pprint_thing(label) # .encode('utf-8')
- label = self._mark_right_label(label, index=i)
- kwds["label"] = label
-
- newlines = plotf(
- ax,
- x,
- y,
- style=style,
- column_num=i,
- stacking_id=stacking_id,
- is_errorbar=is_errorbar,
- **kwds,
- )
- self._append_legend_handles_labels(newlines[0], label)
-
- if self._is_ts_plot():
- # reset of xlim should be used for ts data
- # TODO: GH28021, should find a way to change view limit on xaxis
- lines = get_all_lines(ax)
- left, right = get_xlim(lines)
- ax.set_xlim(left, right)
-
- # error: Signature of "_plot" incompatible with supertype "MPLPlot"
- @classmethod
- def _plot( # type: ignore[override]
- cls, ax: Axes, x, y, style=None, column_num=None, stacking_id=None, **kwds
- ):
- # column_num is used to get the target column from plotf in line and
- # area plots
- if column_num == 0:
- cls._initialize_stacker(ax, stacking_id, len(y))
- y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"])
- lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds)
- cls._update_stacker(ax, stacking_id, y)
- return lines
-
- def _ts_plot(self, ax: Axes, x, data, style=None, **kwds):
- # accept x to be consistent with normal plot func,
- # x is not passed to tsplot as it uses data.index as x coordinate
- # column_num must be in kwds for stacking purpose
- freq, data = maybe_resample(data, ax, kwds)
-
- # Set ax with freq info
- decorate_axes(ax, freq, kwds)
- # digging deeper
- if hasattr(ax, "left_ax"):
- decorate_axes(ax.left_ax, freq, kwds)
- if hasattr(ax, "right_ax"):
- decorate_axes(ax.right_ax, freq, kwds)
- ax._plot_data.append((data, self._kind, kwds))
-
- lines = self._plot(ax, data.index, data.values, style=style, **kwds)
- # set date formatter, locators and rescale limits
- format_dateaxis(ax, ax.freq, data.index)
- return lines
-
- def _get_stacking_id(self):
- if self.stacked:
- return id(self.data)
- else:
- return None
-
- @classmethod
- def _initialize_stacker(cls, ax: Axes, stacking_id, n: int) -> None:
- if stacking_id is None:
- return
- if not hasattr(ax, "_stacker_pos_prior"):
- ax._stacker_pos_prior = {}
- if not hasattr(ax, "_stacker_neg_prior"):
- ax._stacker_neg_prior = {}
- ax._stacker_pos_prior[stacking_id] = np.zeros(n)
- ax._stacker_neg_prior[stacking_id] = np.zeros(n)
-
- @classmethod
- def _get_stacked_values(cls, ax: Axes, stacking_id, values, label):
- if stacking_id is None:
- return values
- if not hasattr(ax, "_stacker_pos_prior"):
- # stacker may not be initialized for subplots
- cls._initialize_stacker(ax, stacking_id, len(values))
-
- if (values >= 0).all():
- return ax._stacker_pos_prior[stacking_id] + values
- elif (values <= 0).all():
- return ax._stacker_neg_prior[stacking_id] + values
-
- raise ValueError(
- "When stacked is True, each column must be either "
- "all positive or all negative. "
- f"Column '{label}' contains both positive and negative values"
- )
-
- @classmethod
- def _update_stacker(cls, ax: Axes, stacking_id, values) -> None:
- if stacking_id is None:
- return
- if (values >= 0).all():
- ax._stacker_pos_prior[stacking_id] += values
- elif (values <= 0).all():
- ax._stacker_neg_prior[stacking_id] += values
-
- def _args_adjust(self) -> None:
- pass
-
- def _post_plot_logic(self, ax: Axes, data) -> None:
- from matplotlib.ticker import FixedLocator
-
- def get_label(i):
- if is_float(i) and i.is_integer():
- i = int(i)
- try:
- return pprint_thing(data.index[i])
- except Exception:
- return ""
-
- if self._need_to_set_index:
- xticks = ax.get_xticks()
- xticklabels = [get_label(x) for x in xticks]
- ax.xaxis.set_major_locator(FixedLocator(xticks))
- ax.set_xticklabels(xticklabels)
-
- # If the index is an irregular time series, then by default
- # we rotate the tick labels. The exception is if there are
- # subplots which don't share their x-axes, in which we case
- # we don't rotate the ticklabels as by default the subplots
- # would be too close together.
- condition = (
- not self._use_dynamic_x()
- and (data.index._is_all_dates and self.use_index)
- and (not self.subplots or (self.subplots and self.sharex))
- )
-
- index_name = self._get_index_name()
-
- if condition:
- # irregular TS rotated 30 deg. by default
- # probably a better place to check / set this.
- if not self._rot_set:
- self.rot = 30
- format_date_labels(ax, rot=self.rot)
-
- if index_name is not None and self.use_index:
- ax.set_xlabel(index_name)
-
-
-class AreaPlot(LinePlot):
- @property
- def _kind(self) -> Literal["area"]:
- return "area"
-
- def __init__(self, data, **kwargs) -> None:
- kwargs.setdefault("stacked", True)
- data = data.fillna(value=0)
- LinePlot.__init__(self, data, **kwargs)
-
- if not self.stacked:
- # use smaller alpha to distinguish overlap
- self.kwds.setdefault("alpha", 0.5)
-
- if self.logy or self.loglog:
- raise ValueError("Log-y scales are not supported in area plot")
-
- # error: Signature of "_plot" incompatible with supertype "MPLPlot"
- @classmethod
- def _plot( # type: ignore[override]
- cls,
- ax: Axes,
- x,
- y,
- style=None,
- column_num=None,
- stacking_id=None,
- is_errorbar: bool = False,
- **kwds,
- ):
- if column_num == 0:
- cls._initialize_stacker(ax, stacking_id, len(y))
- y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"])
-
- # need to remove label, because subplots uses mpl legend as it is
- line_kwds = kwds.copy()
- line_kwds.pop("label")
- lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds)
-
- # get data from the line to get coordinates for fill_between
- xdata, y_values = lines[0].get_data(orig=False)
-
- # unable to use ``_get_stacked_values`` here to get starting point
- if stacking_id is None:
- start = np.zeros(len(y))
- elif (y >= 0).all():
- start = ax._stacker_pos_prior[stacking_id]
- elif (y <= 0).all():
- start = ax._stacker_neg_prior[stacking_id]
- else:
- start = np.zeros(len(y))
-
- if "color" not in kwds:
- kwds["color"] = lines[0].get_color()
-
- rect = ax.fill_between(xdata, start, y_values, **kwds)
- cls._update_stacker(ax, stacking_id, y)
-
- # LinePlot expects list of artists
- res = [rect]
- return res
-
- def _args_adjust(self) -> None:
- pass
-
- def _post_plot_logic(self, ax: Axes, data) -> None:
- LinePlot._post_plot_logic(self, ax, data)
-
- is_shared_y = len(list(ax.get_shared_y_axes())) > 0
- # do not override the default axis behaviour in case of shared y axes
- if self.ylim is None and not is_shared_y:
- if (data >= 0).all().all():
- ax.set_ylim(0, None)
- elif (data <= 0).all().all():
- ax.set_ylim(None, 0)
-
-
-class BarPlot(MPLPlot):
- @property
- def _kind(self) -> Literal["bar", "barh"]:
- return "bar"
-
- _default_rot = 90
-
- @property
- def orientation(self) -> PlottingOrientation:
- return "vertical"
-
- def __init__(self, data, **kwargs) -> None:
- # we have to treat a series differently than a
- # 1-column DataFrame w.r.t. color handling
- self._is_series = isinstance(data, ABCSeries)
- self.bar_width = kwargs.pop("width", 0.5)
- pos = kwargs.pop("position", 0.5)
- kwargs.setdefault("align", "center")
- self.tick_pos = np.arange(len(data))
-
- self.bottom = kwargs.pop("bottom", 0)
- self.left = kwargs.pop("left", 0)
-
- self.log = kwargs.pop("log", False)
- MPLPlot.__init__(self, data, **kwargs)
-
- if self.stacked or self.subplots:
- self.tickoffset = self.bar_width * pos
- if kwargs["align"] == "edge":
- self.lim_offset = self.bar_width / 2
- else:
- self.lim_offset = 0
- else:
- if kwargs["align"] == "edge":
- w = self.bar_width / self.nseries
- self.tickoffset = self.bar_width * (pos - 0.5) + w * 0.5
- self.lim_offset = w * 0.5
- else:
- self.tickoffset = self.bar_width * pos
- self.lim_offset = 0
-
- self.ax_pos = self.tick_pos - self.tickoffset
-
- def _args_adjust(self) -> None:
- if is_list_like(self.bottom):
- self.bottom = np.array(self.bottom)
- if is_list_like(self.left):
- self.left = np.array(self.left)
-
- # error: Signature of "_plot" incompatible with supertype "MPLPlot"
- @classmethod
- def _plot( # type: ignore[override]
- cls,
- ax: Axes,
- x,
- y,
- w,
- start: int | npt.NDArray[np.intp] = 0,
- log: bool = False,
- **kwds,
- ):
- return ax.bar(x, y, w, bottom=start, log=log, **kwds)
-
- @property
- def _start_base(self):
- return self.bottom
-
- def _make_plot(self) -> None:
- colors = self._get_colors()
- ncolors = len(colors)
-
- pos_prior = neg_prior = np.zeros(len(self.data))
- K = self.nseries
-
- for i, (label, y) in enumerate(self._iter_data(fillna=0)):
- ax = self._get_ax(i)
- kwds = self.kwds.copy()
- if self._is_series:
- kwds["color"] = colors
- elif isinstance(colors, dict):
- kwds["color"] = colors[label]
- else:
- kwds["color"] = colors[i % ncolors]
-
- errors = self._get_errorbars(label=label, index=i)
- kwds = dict(kwds, **errors)
-
- label = pprint_thing(label)
- label = self._mark_right_label(label, index=i)
-
- if (("yerr" in kwds) or ("xerr" in kwds)) and (kwds.get("ecolor") is None):
- kwds["ecolor"] = mpl.rcParams["xtick.color"]
-
- start = 0
- if self.log and (y >= 1).all():
- start = 1
- start = start + self._start_base
-
- if self.subplots:
- w = self.bar_width / 2
- rect = self._plot(
- ax,
- self.ax_pos + w,
- y,
- self.bar_width,
- start=start,
- label=label,
- log=self.log,
- **kwds,
- )
- ax.set_title(label)
- elif self.stacked:
- mask = y > 0
- start = np.where(mask, pos_prior, neg_prior) + self._start_base
- w = self.bar_width / 2
- rect = self._plot(
- ax,
- self.ax_pos + w,
- y,
- self.bar_width,
- start=start,
- label=label,
- log=self.log,
- **kwds,
- )
- pos_prior = pos_prior + np.where(mask, y, 0)
- neg_prior = neg_prior + np.where(mask, 0, y)
- else:
- w = self.bar_width / K
- rect = self._plot(
- ax,
- self.ax_pos + (i + 0.5) * w,
- y,
- w,
- start=start,
- label=label,
- log=self.log,
- **kwds,
- )
- self._append_legend_handles_labels(rect, label)
-
- def _post_plot_logic(self, ax: Axes, data) -> None:
- if self.use_index:
- str_index = [pprint_thing(key) for key in data.index]
- else:
- str_index = [pprint_thing(key) for key in range(data.shape[0])]
-
- s_edge = self.ax_pos[0] - 0.25 + self.lim_offset
- e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset
-
- self._decorate_ticks(ax, self._get_index_name(), str_index, s_edge, e_edge)
-
- def _decorate_ticks(self, ax: Axes, name, ticklabels, start_edge, end_edge) -> None:
- ax.set_xlim((start_edge, end_edge))
-
- if self.xticks is not None:
- ax.set_xticks(np.array(self.xticks))
- else:
- ax.set_xticks(self.tick_pos)
- ax.set_xticklabels(ticklabels)
-
- if name is not None and self.use_index:
- ax.set_xlabel(name)
-
-
-class BarhPlot(BarPlot):
- @property
- def _kind(self) -> Literal["barh"]:
- return "barh"
-
- _default_rot = 0
-
- @property
- def orientation(self) -> Literal["horizontal"]:
- return "horizontal"
-
- @property
- def _start_base(self):
- return self.left
-
- # error: Signature of "_plot" incompatible with supertype "MPLPlot"
- @classmethod
- def _plot( # type: ignore[override]
- cls,
- ax: Axes,
- x,
- y,
- w,
- start: int | npt.NDArray[np.intp] = 0,
- log: bool = False,
- **kwds,
- ):
- return ax.barh(x, y, w, left=start, log=log, **kwds)
-
- def _get_custom_index_name(self):
- return self.ylabel
-
- def _decorate_ticks(self, ax: Axes, name, ticklabels, start_edge, end_edge) -> None:
- # horizontal bars
- ax.set_ylim((start_edge, end_edge))
- ax.set_yticks(self.tick_pos)
- ax.set_yticklabels(ticklabels)
- if name is not None and self.use_index:
- ax.set_ylabel(name)
- ax.set_xlabel(self.xlabel)
-
-
-class PiePlot(MPLPlot):
- @property
- def _kind(self) -> Literal["pie"]:
- return "pie"
-
- _layout_type = "horizontal"
-
- def __init__(self, data, kind=None, **kwargs) -> None:
- data = data.fillna(value=0)
- if (data < 0).any().any():
- raise ValueError(f"{self._kind} plot doesn't allow negative values")
- MPLPlot.__init__(self, data, kind=kind, **kwargs)
-
- def _args_adjust(self) -> None:
- self.grid = False
- self.logy = False
- self.logx = False
- self.loglog = False
-
- def _validate_color_args(self) -> None:
- pass
-
- def _make_plot(self) -> None:
- colors = self._get_colors(num_colors=len(self.data), color_kwds="colors")
- self.kwds.setdefault("colors", colors)
-
- for i, (label, y) in enumerate(self._iter_data()):
- ax = self._get_ax(i)
- if label is not None:
- label = pprint_thing(label)
- ax.set_ylabel(label)
-
- kwds = self.kwds.copy()
-
- def blank_labeler(label, value):
- if value == 0:
- return ""
- else:
- return label
-
- idx = [pprint_thing(v) for v in self.data.index]
- labels = kwds.pop("labels", idx)
- # labels is used for each wedge's labels
- # Blank out labels for values of 0 so they don't overlap
- # with nonzero wedges
- if labels is not None:
- blabels = [blank_labeler(left, value) for left, value in zip(labels, y)]
- else:
- blabels = None
- results = ax.pie(y, labels=blabels, **kwds)
-
- if kwds.get("autopct", None) is not None:
- patches, texts, autotexts = results
- else:
- patches, texts = results
- autotexts = []
-
- if self.fontsize is not None:
- for t in texts + autotexts:
- t.set_fontsize(self.fontsize)
-
- # leglabels is used for legend labels
- leglabels = labels if labels is not None else idx
- for _patch, _leglabel in zip(patches, leglabels):
- self._append_legend_handles_labels(_patch, _leglabel)
-
- def _post_plot_logic(self, ax: Axes, data) -> None:
- pass
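
For context before the next file: the core.py deleted above is the backend for the public DataFrame.plot accessor, and the validation helpers shown (for example _validate_subplots_kwarg and _parse_errorbars) correspond directly to plot keywords users pass. A minimal sketch of that public surface, assuming pandas 1.5+ with matplotlib installed; the frame and column names below are invented for illustration only:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        np.random.default_rng(0).standard_normal((20, 4)),
        columns=["a", "b", "c", "d"],
    )

    # `subplots` may be an iterable of column groups, which the
    # _validate_subplots_kwarg logic above converts to axis indices:
    # "a" and "c" share one axis, the remaining columns get their own.
    axes = df.plot(subplots=[("a", "c")], figsize=(8, 6))

    # `secondary_y` and a scalar symmetric error value; the scalar is
    # broadcast per series by the _parse_errorbars branch above.
    ax = df.plot(y=["a", "b"], secondary_y=["b"], yerr=0.1)

The grouped-subplot call returns one axes object per group plus one per leftover column, matching how _col_idx_to_axis_idx maps columns onto axes above.
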
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/groupby.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/groupby.py
deleted file mode 100644
index 17a21429260..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/groupby.py
+++ /dev/null
@@ -1,139 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._typing import (
- Dict,
- IndexLabel,
-)
-
-from pandas.core.dtypes.missing import remove_na_arraylike
-
-from pandas import (
- DataFrame,
- MultiIndex,
- Series,
- concat,
-)
-
-from pandas.plotting._matplotlib.misc import unpack_single_str_list
-
-
-def create_iter_data_given_by(
- data: DataFrame, kind: str = "hist"
-) -> Dict[str, DataFrame | Series]:
- """
- Create data for iteration given `by` is assigned or not, and it is only
- used in both hist and boxplot.
-
- If `by` is assigned, return a dictionary of DataFrames in which the key of
- dictionary is the values in groups.
- If `by` is not assigned, return input as is, and this preserves current
- status of iter_data.
-
- Parameters
- ----------
- data : reformatted grouped data from `_compute_plot_data` method.
- kind : str, plot kind. This function is only used for `hist` and `box` plots.
-
- Returns
- -------
- iter_data : DataFrame or Dictionary of DataFrames
-
- Examples
- --------
- If `by` is assigned:
-
- >>> import numpy as np
- >>> tuples = [('h1', 'a'), ('h1', 'b'), ('h2', 'a'), ('h2', 'b')]
- >>> mi = MultiIndex.from_tuples(tuples)
- >>> value = [[1, 3, np.nan, np.nan],
- ... [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]]
- >>> data = DataFrame(value, columns=mi)
- >>> create_iter_data_given_by(data)
- {'h1': h1
- a b
- 0 1.0 3.0
- 1 3.0 4.0
- 2 NaN NaN, 'h2': h2
- a b
- 0 NaN NaN
- 1 NaN NaN
- 2 5.0 6.0}
- """
-
- # For `hist` plot, before transformation, the values in level 0 are values
- # in groups and subplot titles, and later used for column subselection and
- # iteration; For `box` plot, values in level 1 are column names to show,
- # and are used for iteration and as subplots titles.
- if kind == "hist":
- level = 0
- else:
- level = 1
-
- # Select sub-columns based on the value of level of MI, and if `by` is
- # assigned, data must be a MI DataFrame
- assert isinstance(data.columns, MultiIndex)
- return {
- col: data.loc[:, data.columns.get_level_values(level) == col]
- for col in data.columns.levels[level]
- }
-
-
-def reconstruct_data_with_by(
- data: DataFrame, by: IndexLabel, cols: IndexLabel
-) -> DataFrame:
- """
- Internal function to group data, and reassign multiindex column names onto the
- result in order to let grouped data be used in _compute_plot_data method.
-
- Parameters
- ----------
- data : Original DataFrame to plot
- by : grouped `by` parameter selected by users
- cols : columns of data set (excluding columns used in `by`)
-
- Returns
- -------
- Output is the reconstructed DataFrame with MultiIndex columns. The first level
- of MI is unique values of groups, and second level of MI is the columns
- selected by users.
-
- Examples
- --------
- >>> d = {'h': ['h1', 'h1', 'h2'], 'a': [1, 3, 5], 'b': [3, 4, 6]}
- >>> df = DataFrame(d)
- >>> reconstruct_data_with_by(df, by='h', cols=['a', 'b'])
- h1 h2
- a b a b
- 0 1.0 3.0 NaN NaN
- 1 3.0 4.0 NaN NaN
- 2 NaN NaN 5.0 6.0
- """
- by_modified = unpack_single_str_list(by)
- grouped = data.groupby(by_modified)
-
- data_list = []
- for key, group in grouped:
- # error: List item 1 has incompatible type "Union[Hashable,
- # Sequence[Hashable]]"; expected "Iterable[Hashable]"
- columns = MultiIndex.from_product([[key], cols]) # type: ignore[list-item]
- sub_group = group[cols]
- sub_group.columns = columns
- data_list.append(sub_group)
-
- data = concat(data_list, axis=1)
- return data
-
-
-def reformat_hist_y_given_by(
- y: Series | np.ndarray, by: IndexLabel | None
-) -> Series | np.ndarray:
- """Internal function to reformat y given `by` is applied or not for hist plot.
-
- If by is None, input y is 1-d with NaN removed; and if by is not None, groupby
- will take place and input y is multi-dimensional array.
- """
- if by is not None and len(y.shape) > 1:
- return np.array([remove_na_arraylike(col) for col in y.T]).T
- return remove_na_arraylike(y)
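
For context before the next file: the groupby helpers deleted above back the `by` keyword of the hist and box plot kinds. A minimal usage sketch, assuming pandas 1.5+ (the frame below is invented for illustration):

    import pandas as pd

    df = pd.DataFrame(
        {"h": ["h1", "h1", "h2", "h2"], "a": [1, 3, 5, 7], "b": [3, 4, 6, 8]}
    )

    # One histogram subplot per value of "h"; internally the frame is
    # reshaped into MultiIndex columns by reconstruct_data_with_by and
    # iterated via create_iter_data_given_by before plotting.
    axes = df.plot.hist(by="h", bins=5)

    # The long-standing DataFrame.hist entry point instead routes through
    # _grouped_hist (defined in hist.py, deleted below).
    axes2 = df.hist(column="a", by="h", bins=5)
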
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/hist.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/hist.py
deleted file mode 100644
index bc8e6ed753d..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/hist.py
+++ /dev/null
@@ -1,546 +0,0 @@
-from __future__ import annotations
-
-from typing import (
- TYPE_CHECKING,
- Literal,
-)
-
-import numpy as np
-
-from pandas._typing import PlottingOrientation
-
-from pandas.core.dtypes.common import (
- is_integer,
- is_list_like,
-)
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
-)
-from pandas.core.dtypes.missing import (
- isna,
- remove_na_arraylike,
-)
-
-from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib.core import (
- LinePlot,
- MPLPlot,
-)
-from pandas.plotting._matplotlib.groupby import (
- create_iter_data_given_by,
- reformat_hist_y_given_by,
-)
-from pandas.plotting._matplotlib.misc import unpack_single_str_list
-from pandas.plotting._matplotlib.tools import (
- create_subplots,
- flatten_axes,
- maybe_adjust_figure,
- set_ticks_props,
-)
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
-
- from pandas import DataFrame
-
-
-class HistPlot(LinePlot):
- @property
- def _kind(self) -> Literal["hist", "kde"]:
- return "hist"
-
- def __init__(
- self,
- data,
- bins: int | np.ndarray | list[np.ndarray] = 10,
- bottom: int | np.ndarray = 0,
- **kwargs,
- ) -> None:
- self.bins = bins # use mpl default
- self.bottom = bottom
- self.xlabel = kwargs.get("xlabel")
- self.ylabel = kwargs.get("ylabel")
- # Do not call LinePlot.__init__ which may fill nan
- MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called
-
- def _args_adjust(self) -> None:
- # calculate bin number separately in different subplots
- # where subplots are created based on by argument
- if is_integer(self.bins):
- if self.by is not None:
- by_modified = unpack_single_str_list(self.by)
- grouped = self.data.groupby(by_modified)[self.columns]
- self.bins = [self._calculate_bins(group) for key, group in grouped]
- else:
- self.bins = self._calculate_bins(self.data)
-
- if is_list_like(self.bottom):
- self.bottom = np.array(self.bottom)
-
- def _calculate_bins(self, data: DataFrame) -> np.ndarray:
- """Calculate bins given data"""
- nd_values = data.infer_objects(copy=False)._get_numeric_data()
- values = np.ravel(nd_values)
- values = values[~isna(values)]
-
- hist, bins = np.histogram(
- values, bins=self.bins, range=self.kwds.get("range", None)
- )
- return bins
-
- # error: Signature of "_plot" incompatible with supertype "LinePlot"
- @classmethod
- def _plot( # type: ignore[override]
- cls,
- ax,
- y,
- style=None,
- bottom: int | np.ndarray = 0,
- column_num: int = 0,
- stacking_id=None,
- *,
- bins,
- **kwds,
- ):
- if column_num == 0:
- cls._initialize_stacker(ax, stacking_id, len(bins) - 1)
-
- base = np.zeros(len(bins) - 1)
- bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"])
- # ignore style
- n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds)
- cls._update_stacker(ax, stacking_id, n)
- return patches
-
- def _make_plot(self) -> None:
- colors = self._get_colors()
- stacking_id = self._get_stacking_id()
-
- # Re-create iterated data if `by` is assigned by users
- data = (
- create_iter_data_given_by(self.data, self._kind)
- if self.by is not None
- else self.data
- )
-
- for i, (label, y) in enumerate(self._iter_data(data=data)):
- ax = self._get_ax(i)
-
- kwds = self.kwds.copy()
-
- label = pprint_thing(label)
- label = self._mark_right_label(label, index=i)
- kwds["label"] = label
-
- style, kwds = self._apply_style_colors(colors, kwds, i, label)
- if style is not None:
- kwds["style"] = style
-
- kwds = self._make_plot_keywords(kwds, y)
-
- # the bins is multi-dimension array now and each plot need only 1-d and
- # when by is applied, label should be columns that are grouped
- if self.by is not None:
- kwds["bins"] = kwds["bins"][i]
- kwds["label"] = self.columns
- kwds.pop("color")
-
- # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array,
- # and each sub-array (10,) will be called in each iteration. If users only
- # provide 1D array, we assume the same weights is used for all iterations
- weights = kwds.get("weights", None)
- if weights is not None:
- if np.ndim(weights) != 1 and np.shape(weights)[-1] != 1:
- try:
- weights = weights[:, i]
- except IndexError as err:
- raise ValueError(
- "weights must have the same shape as data, "
- "or be a single column"
- ) from err
- weights = weights[~isna(y)]
- kwds["weights"] = weights
-
- y = reformat_hist_y_given_by(y, self.by)
-
- artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
-
- # when by is applied, show title for subplots to know which group it is
- if self.by is not None:
- ax.set_title(pprint_thing(label))
-
- self._append_legend_handles_labels(artists[0], label)
-
- def _make_plot_keywords(self, kwds, y):
- """merge BoxPlot/KdePlot properties to passed kwds"""
- # y is required for KdePlot
- kwds["bottom"] = self.bottom
- kwds["bins"] = self.bins
- return kwds
-
- def _post_plot_logic(self, ax: Axes, data) -> None:
- if self.orientation == "horizontal":
- ax.set_xlabel("Frequency" if self.xlabel is None else self.xlabel)
- ax.set_ylabel(self.ylabel)
- else:
- ax.set_xlabel(self.xlabel)
- ax.set_ylabel("Frequency" if self.ylabel is None else self.ylabel)
-
- @property
- def orientation(self) -> PlottingOrientation:
- if self.kwds.get("orientation", None) == "horizontal":
- return "horizontal"
- else:
- return "vertical"
-
-
-class KdePlot(HistPlot):
- @property
- def _kind(self) -> Literal["kde"]:
- return "kde"
-
- @property
- def orientation(self) -> Literal["vertical"]:
- return "vertical"
-
- def __init__(self, data, bw_method=None, ind=None, **kwargs) -> None:
- # Do not call LinePlot.__init__ which may fill nan
- MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called
- self.bw_method = bw_method
- self.ind = ind
-
- def _args_adjust(self) -> None:
- pass
-
- def _get_ind(self, y):
- if self.ind is None:
- # np.nanmax() and np.nanmin() ignores the missing values
- sample_range = np.nanmax(y) - np.nanmin(y)
- ind = np.linspace(
- np.nanmin(y) - 0.5 * sample_range,
- np.nanmax(y) + 0.5 * sample_range,
- 1000,
- )
- elif is_integer(self.ind):
- sample_range = np.nanmax(y) - np.nanmin(y)
- ind = np.linspace(
- np.nanmin(y) - 0.5 * sample_range,
- np.nanmax(y) + 0.5 * sample_range,
- self.ind,
- )
- else:
- ind = self.ind
- return ind
-
- @classmethod
- def _plot(
- cls,
- ax,
- y,
- style=None,
- bw_method=None,
- ind=None,
- column_num=None,
- stacking_id=None,
- **kwds,
- ):
- from scipy.stats import gaussian_kde
-
- y = remove_na_arraylike(y)
- gkde = gaussian_kde(y, bw_method=bw_method)
-
- y = gkde.evaluate(ind)
- lines = MPLPlot._plot(ax, ind, y, style=style, **kwds)
- return lines
-
- def _make_plot_keywords(self, kwds, y):
- kwds["bw_method"] = self.bw_method
- kwds["ind"] = self._get_ind(y)
- return kwds
-
- def _post_plot_logic(self, ax, data) -> None:
- ax.set_ylabel("Density")
-
-
-def _grouped_plot(
- plotf,
- data,
- column=None,
- by=None,
- numeric_only: bool = True,
- figsize=None,
- sharex: bool = True,
- sharey: bool = True,
- layout=None,
- rot: float = 0,
- ax=None,
- **kwargs,
-):
- if figsize == "default":
- # allowed to specify mpl default with 'default'
- raise ValueError(
- "figsize='default' is no longer supported. "
- "Specify figure size by tuple instead"
- )
-
- grouped = data.groupby(by)
- if column is not None:
- grouped = grouped[column]
-
- naxes = len(grouped)
- fig, axes = create_subplots(
- naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout
- )
-
- _axes = flatten_axes(axes)
-
- for i, (key, group) in enumerate(grouped):
- ax = _axes[i]
- if numeric_only and isinstance(group, ABCDataFrame):
- group = group._get_numeric_data()
- plotf(group, ax, **kwargs)
- ax.set_title(pprint_thing(key))
-
- return fig, axes
-
-
-def _grouped_hist(
- data,
- column=None,
- by=None,
- ax=None,
- bins: int = 50,
- figsize=None,
- layout=None,
- sharex: bool = False,
- sharey: bool = False,
- rot: float = 90,
- grid: bool = True,
- xlabelsize=None,
- xrot=None,
- ylabelsize=None,
- yrot=None,
- legend: bool = False,
- **kwargs,
-):
- """
- Grouped histogram
-
- Parameters
- ----------
- data : Series/DataFrame
- column : object, optional
- by : object, optional
- ax : axes, optional
- bins : int, default 50
- figsize : tuple, optional
- layout : optional
- sharex : bool, default False
- sharey : bool, default False
- rot : float, default 90
- grid : bool, default True
-    legend : bool, default False
- kwargs : dict, keyword arguments passed to matplotlib.Axes.hist
-
- Returns
- -------
- collection of Matplotlib Axes
- """
- if legend:
- assert "label" not in kwargs
- if data.ndim == 1:
- kwargs["label"] = data.name
- elif column is None:
- kwargs["label"] = data.columns
- else:
- kwargs["label"] = column
-
- def plot_group(group, ax) -> None:
- ax.hist(group.dropna().values, bins=bins, **kwargs)
- if legend:
- ax.legend()
-
- if xrot is None:
- xrot = rot
-
- fig, axes = _grouped_plot(
- plot_group,
- data,
- column=column,
- by=by,
- sharex=sharex,
- sharey=sharey,
- ax=ax,
- figsize=figsize,
- layout=layout,
- rot=rot,
- )
-
- set_ticks_props(
- axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
- )
-
- maybe_adjust_figure(
- fig, bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3
- )
- return axes
-
-
-def hist_series(
- self,
- by=None,
- ax=None,
- grid: bool = True,
- xlabelsize=None,
- xrot=None,
- ylabelsize=None,
- yrot=None,
- figsize=None,
- bins: int = 10,
- legend: bool = False,
- **kwds,
-):
- import matplotlib.pyplot as plt
-
- if legend and "label" in kwds:
- raise ValueError("Cannot use both legend and label")
-
- if by is None:
- if kwds.get("layout", None) is not None:
- raise ValueError("The 'layout' keyword is not supported when 'by' is None")
- # hack until the plotting interface is a bit more unified
- fig = kwds.pop(
- "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize)
- )
- if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()):
- fig.set_size_inches(*figsize, forward=True)
- if ax is None:
- ax = fig.gca()
- elif ax.get_figure() != fig:
- raise AssertionError("passed axis not bound to passed figure")
- values = self.dropna().values
- if legend:
- kwds["label"] = self.name
- ax.hist(values, bins=bins, **kwds)
- if legend:
- ax.legend()
- ax.grid(grid)
- axes = np.array([ax])
-
- set_ticks_props(
- axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
- )
-
- else:
- if "figure" in kwds:
- raise ValueError(
- "Cannot pass 'figure' when using the "
- "'by' argument, since a new 'Figure' instance will be created"
- )
- axes = _grouped_hist(
- self,
- by=by,
- ax=ax,
- grid=grid,
- figsize=figsize,
- bins=bins,
- xlabelsize=xlabelsize,
- xrot=xrot,
- ylabelsize=ylabelsize,
- yrot=yrot,
- legend=legend,
- **kwds,
- )
-
- if hasattr(axes, "ndim"):
- if axes.ndim == 1 and len(axes) == 1:
- return axes[0]
- return axes
-
-
-def hist_frame(
- data,
- column=None,
- by=None,
- grid: bool = True,
- xlabelsize=None,
- xrot=None,
- ylabelsize=None,
- yrot=None,
- ax=None,
- sharex: bool = False,
- sharey: bool = False,
- figsize=None,
- layout=None,
- bins: int = 10,
- legend: bool = False,
- **kwds,
-):
- if legend and "label" in kwds:
- raise ValueError("Cannot use both legend and label")
- if by is not None:
- axes = _grouped_hist(
- data,
- column=column,
- by=by,
- ax=ax,
- grid=grid,
- figsize=figsize,
- sharex=sharex,
- sharey=sharey,
- layout=layout,
- bins=bins,
- xlabelsize=xlabelsize,
- xrot=xrot,
- ylabelsize=ylabelsize,
- yrot=yrot,
- legend=legend,
- **kwds,
- )
- return axes
-
- if column is not None:
- if not isinstance(column, (list, np.ndarray, ABCIndex)):
- column = [column]
- data = data[column]
- # GH32590
- data = data.select_dtypes(
- include=(np.number, "datetime64", "datetimetz"), exclude="timedelta"
- )
- naxes = len(data.columns)
-
- if naxes == 0:
- raise ValueError(
- "hist method requires numerical or datetime columns, nothing to plot."
- )
-
- fig, axes = create_subplots(
- naxes=naxes,
- ax=ax,
- squeeze=False,
- sharex=sharex,
- sharey=sharey,
- figsize=figsize,
- layout=layout,
- )
- _axes = flatten_axes(axes)
-
- can_set_label = "label" not in kwds
-
- for i, col in enumerate(data.columns):
- ax = _axes[i]
- if legend and can_set_label:
- kwds["label"] = col
- ax.hist(data[col].dropna().values, bins=bins, **kwds)
- ax.set_title(col)
- ax.grid(grid)
- if legend:
- ax.legend()
-
- set_ticks_props(
- axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
- )
- maybe_adjust_figure(fig, wspace=0.3, hspace=0.3)
-
- return axes
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/misc.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/misc.py
deleted file mode 100644
index 291a6dff965..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/misc.py
+++ /dev/null
@@ -1,482 +0,0 @@
-from __future__ import annotations
-
-import random
-from typing import (
- TYPE_CHECKING,
- Hashable,
-)
-
-from matplotlib import patches
-import matplotlib.lines as mlines
-import numpy as np
-
-from pandas.core.dtypes.missing import notna
-
-from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib.style import get_standard_colors
-from pandas.plotting._matplotlib.tools import (
- create_subplots,
- do_adjust_figure,
- maybe_adjust_figure,
- set_ticks_props,
-)
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
- from matplotlib.figure import Figure
-
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
-
-
-def scatter_matrix(
- frame: DataFrame,
- alpha: float = 0.5,
- figsize=None,
- ax=None,
- grid: bool = False,
- diagonal: str = "hist",
- marker: str = ".",
- density_kwds=None,
- hist_kwds=None,
- range_padding: float = 0.05,
- **kwds,
-):
- df = frame._get_numeric_data()
- n = df.columns.size
- naxes = n * n
- fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)
-
- # no gaps between subplots
- maybe_adjust_figure(fig, wspace=0, hspace=0)
-
- mask = notna(df)
-
- marker = _get_marker_compat(marker)
-
- hist_kwds = hist_kwds or {}
- density_kwds = density_kwds or {}
-
- # GH 14855
- kwds.setdefault("edgecolors", "none")
-
- boundaries_list = []
- for a in df.columns:
- values = df[a].values[mask[a].values]
- rmin_, rmax_ = np.min(values), np.max(values)
- rdelta_ext = (rmax_ - rmin_) * range_padding / 2
- boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
-
- for i, a in enumerate(df.columns):
- for j, b in enumerate(df.columns):
- ax = axes[i, j]
-
- if i == j:
- values = df[a].values[mask[a].values]
-
- # Deal with the diagonal by drawing a histogram there.
- if diagonal == "hist":
- ax.hist(values, **hist_kwds)
-
- elif diagonal in ("kde", "density"):
- from scipy.stats import gaussian_kde
-
- y = values
- gkde = gaussian_kde(y)
- ind = np.linspace(y.min(), y.max(), 1000)
- ax.plot(ind, gkde.evaluate(ind), **density_kwds)
-
- ax.set_xlim(boundaries_list[i])
-
- else:
- common = (mask[a] & mask[b]).values
-
- ax.scatter(
- df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds
- )
-
- ax.set_xlim(boundaries_list[j])
- ax.set_ylim(boundaries_list[i])
-
- ax.set_xlabel(b)
- ax.set_ylabel(a)
-
- if j != 0:
- ax.yaxis.set_visible(False)
- if i != n - 1:
- ax.xaxis.set_visible(False)
-
- if len(df.columns) > 1:
- lim1 = boundaries_list[0]
- locs = axes[0][1].yaxis.get_majorticklocs()
- locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
- adj = (locs - lim1[0]) / (lim1[1] - lim1[0])
-
- lim0 = axes[0][0].get_ylim()
- adj = adj * (lim0[1] - lim0[0]) + lim0[0]
- axes[0][0].yaxis.set_ticks(adj)
-
- if np.all(locs == locs.astype(int)):
- # if all ticks are int
- locs = locs.astype(int)
- axes[0][0].yaxis.set_ticklabels(locs)
-
- set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
-
- return axes
-
-
-def _get_marker_compat(marker):
- if marker not in mlines.lineMarkers:
- return "o"
- return marker
-
-
-def radviz(
- frame: DataFrame,
- class_column,
- ax: Axes | None = None,
- color=None,
- colormap=None,
- **kwds,
-) -> Axes:
- import matplotlib.pyplot as plt
-
- def normalize(series):
- a = min(series)
- b = max(series)
- return (series - a) / (b - a)
-
- n = len(frame)
- classes = frame[class_column].drop_duplicates()
- class_col = frame[class_column]
- df = frame.drop(class_column, axis=1).apply(normalize)
-
- if ax is None:
- ax = plt.gca()
- ax.set_xlim(-1, 1)
- ax.set_ylim(-1, 1)
-
- to_plot: dict[Hashable, list[list]] = {}
- colors = get_standard_colors(
- num_colors=len(classes), colormap=colormap, color_type="random", color=color
- )
-
- for kls in classes:
- to_plot[kls] = [[], []]
-
- m = len(frame.columns) - 1
- s = np.array(
- [(np.cos(t), np.sin(t)) for t in [2 * np.pi * (i / m) for i in range(m)]]
- )
-
- for i in range(n):
- row = df.iloc[i].values
- row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
- y = (s * row_).sum(axis=0) / row.sum()
- kls = class_col.iat[i]
- to_plot[kls][0].append(y[0])
- to_plot[kls][1].append(y[1])
-
- for i, kls in enumerate(classes):
- ax.scatter(
- to_plot[kls][0],
- to_plot[kls][1],
- color=colors[i],
- label=pprint_thing(kls),
- **kwds,
- )
- ax.legend()
-
- ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none"))
-
- for xy, name in zip(s, df.columns):
- ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray"))
-
- if xy[0] < 0.0 and xy[1] < 0.0:
- ax.text(
- xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small"
- )
- elif xy[0] < 0.0 <= xy[1]:
- ax.text(
- xy[0] - 0.025,
- xy[1] + 0.025,
- name,
- ha="right",
- va="bottom",
- size="small",
- )
- elif xy[1] < 0.0 <= xy[0]:
- ax.text(
- xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small"
- )
- elif xy[0] >= 0.0 and xy[1] >= 0.0:
- ax.text(
- xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small"
- )
-
- ax.axis("equal")
- return ax
-
-
-def andrews_curves(
- frame: DataFrame,
- class_column,
- ax: Axes | None = None,
- samples: int = 200,
- color=None,
- colormap=None,
- **kwds,
-) -> Axes:
- import matplotlib.pyplot as plt
-
- def function(amplitudes):
- def f(t):
- x1 = amplitudes[0]
- result = x1 / np.sqrt(2.0)
-
- # Take the rest of the coefficients and resize them
- # appropriately. Take a copy of amplitudes as otherwise numpy
- # deletes the element from amplitudes itself.
- coeffs = np.delete(np.copy(amplitudes), 0)
- coeffs = np.resize(coeffs, (int((coeffs.size + 1) / 2), 2))
-
- # Generate the harmonics and arguments for the sin and cos
- # functions.
- harmonics = np.arange(0, coeffs.shape[0]) + 1
- trig_args = np.outer(harmonics, t)
-
- result += np.sum(
- coeffs[:, 0, np.newaxis] * np.sin(trig_args)
- + coeffs[:, 1, np.newaxis] * np.cos(trig_args),
- axis=0,
- )
- return result
-
- return f
-
- n = len(frame)
- class_col = frame[class_column]
- classes = frame[class_column].drop_duplicates()
- df = frame.drop(class_column, axis=1)
- t = np.linspace(-np.pi, np.pi, samples)
- used_legends: set[str] = set()
-
- color_values = get_standard_colors(
- num_colors=len(classes), colormap=colormap, color_type="random", color=color
- )
- colors = dict(zip(classes, color_values))
- if ax is None:
- ax = plt.gca()
- ax.set_xlim(-np.pi, np.pi)
- for i in range(n):
- row = df.iloc[i].values
- f = function(row)
- y = f(t)
- kls = class_col.iat[i]
- label = pprint_thing(kls)
- if label not in used_legends:
- used_legends.add(label)
- ax.plot(t, y, color=colors[kls], label=label, **kwds)
- else:
- ax.plot(t, y, color=colors[kls], **kwds)
-
- ax.legend(loc="upper right")
- ax.grid()
- return ax
-
-
-def bootstrap_plot(
- series: Series,
- fig: Figure | None = None,
- size: int = 50,
- samples: int = 500,
- **kwds,
-) -> Figure:
- import matplotlib.pyplot as plt
-
- # TODO: is the failure mentioned below still relevant?
- # random.sample(ndarray, int) fails on python 3.3, sigh
- data = list(series.values)
- samplings = [random.sample(data, size) for _ in range(samples)]
-
- means = np.array([np.mean(sampling) for sampling in samplings])
- medians = np.array([np.median(sampling) for sampling in samplings])
- midranges = np.array(
- [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings]
- )
- if fig is None:
- fig = plt.figure()
- x = list(range(samples))
- axes = []
- ax1 = fig.add_subplot(2, 3, 1)
- ax1.set_xlabel("Sample")
- axes.append(ax1)
- ax1.plot(x, means, **kwds)
- ax2 = fig.add_subplot(2, 3, 2)
- ax2.set_xlabel("Sample")
- axes.append(ax2)
- ax2.plot(x, medians, **kwds)
- ax3 = fig.add_subplot(2, 3, 3)
- ax3.set_xlabel("Sample")
- axes.append(ax3)
- ax3.plot(x, midranges, **kwds)
- ax4 = fig.add_subplot(2, 3, 4)
- ax4.set_xlabel("Mean")
- axes.append(ax4)
- ax4.hist(means, **kwds)
- ax5 = fig.add_subplot(2, 3, 5)
- ax5.set_xlabel("Median")
- axes.append(ax5)
- ax5.hist(medians, **kwds)
- ax6 = fig.add_subplot(2, 3, 6)
- ax6.set_xlabel("Midrange")
- axes.append(ax6)
- ax6.hist(midranges, **kwds)
- for axis in axes:
- plt.setp(axis.get_xticklabels(), fontsize=8)
- plt.setp(axis.get_yticklabels(), fontsize=8)
- if do_adjust_figure(fig):
- plt.tight_layout()
- return fig
-
-
-def parallel_coordinates(
- frame: DataFrame,
- class_column,
- cols=None,
- ax: Axes | None = None,
- color=None,
- use_columns: bool = False,
- xticks=None,
- colormap=None,
- axvlines: bool = True,
- axvlines_kwds=None,
- sort_labels: bool = False,
- **kwds,
-) -> Axes:
- import matplotlib.pyplot as plt
-
- if axvlines_kwds is None:
- axvlines_kwds = {"linewidth": 1, "color": "black"}
-
- n = len(frame)
- classes = frame[class_column].drop_duplicates()
- class_col = frame[class_column]
-
- if cols is None:
- df = frame.drop(class_column, axis=1)
- else:
- df = frame[cols]
-
- used_legends: set[str] = set()
-
- ncols = len(df.columns)
-
- # determine values to use for xticks
- x: list[int] | Index
- if use_columns is True:
- if not np.all(np.isreal(list(df.columns))):
- raise ValueError("Columns must be numeric to be used as xticks")
- x = df.columns
- elif xticks is not None:
- if not np.all(np.isreal(xticks)):
- raise ValueError("xticks specified must be numeric")
- if len(xticks) != ncols:
- raise ValueError("Length of xticks must match number of columns")
- x = xticks
- else:
- x = list(range(ncols))
-
- if ax is None:
- ax = plt.gca()
-
- color_values = get_standard_colors(
- num_colors=len(classes), colormap=colormap, color_type="random", color=color
- )
-
- if sort_labels:
- classes = sorted(classes)
- color_values = sorted(color_values)
- colors = dict(zip(classes, color_values))
-
- for i in range(n):
- y = df.iloc[i].values
- kls = class_col.iat[i]
- label = pprint_thing(kls)
- if label not in used_legends:
- used_legends.add(label)
- ax.plot(x, y, color=colors[kls], label=label, **kwds)
- else:
- ax.plot(x, y, color=colors[kls], **kwds)
-
- if axvlines:
- for i in x:
- ax.axvline(i, **axvlines_kwds)
-
- ax.set_xticks(x)
- ax.set_xticklabels(df.columns)
- ax.set_xlim(x[0], x[-1])
- ax.legend(loc="upper right")
- ax.grid()
- return ax
-
-
-def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes:
- # workaround because `c='b'` is hardcoded in matplotlib's scatter method
- import matplotlib.pyplot as plt
-
- kwds.setdefault("c", plt.rcParams["patch.facecolor"])
-
- data = series.values
- y1 = data[:-lag]
- y2 = data[lag:]
- if ax is None:
- ax = plt.gca()
- ax.set_xlabel("y(t)")
- ax.set_ylabel(f"y(t + {lag})")
- ax.scatter(y1, y2, **kwds)
- return ax
-
-
-def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwds) -> Axes:
- import matplotlib.pyplot as plt
-
- n = len(series)
- data = np.asarray(series)
- if ax is None:
- ax = plt.gca()
- ax.set_xlim(1, n)
- ax.set_ylim(-1.0, 1.0)
- mean = np.mean(data)
- c0 = np.sum((data - mean) ** 2) / n
-
- def r(h):
- return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / n / c0
-
- x = np.arange(n) + 1
- y = [r(loc) for loc in x]
- z95 = 1.959963984540054
- z99 = 2.5758293035489004
- ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey")
- ax.axhline(y=z95 / np.sqrt(n), color="grey")
- ax.axhline(y=0.0, color="black")
- ax.axhline(y=-z95 / np.sqrt(n), color="grey")
- ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey")
- ax.set_xlabel("Lag")
- ax.set_ylabel("Autocorrelation")
- ax.plot(x, y, **kwds)
- if "label" in kwds:
- ax.legend()
- ax.grid()
- return ax
-
-
-def unpack_single_str_list(keys):
- # GH 42795
- if isinstance(keys, list) and len(keys) == 1:
- keys = keys[0]
- return keys
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/style.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/style.py
deleted file mode 100644
index 839da35a8ae..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/style.py
+++ /dev/null
@@ -1,274 +0,0 @@
-from __future__ import annotations
-
-import itertools
-from typing import (
- TYPE_CHECKING,
- Collection,
- Iterator,
- cast,
-)
-import warnings
-
-import matplotlib as mpl
-import matplotlib.colors
-import numpy as np
-
-from pandas._typing import MatplotlibColor as Color
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import is_list_like
-
-import pandas.core.common as com
-
-if TYPE_CHECKING:
- from matplotlib.colors import Colormap
-
-
-def get_standard_colors(
- num_colors: int,
- colormap: Colormap | None = None,
- color_type: str = "default",
- color: dict[str, Color] | Color | Collection[Color] | None = None,
-):
- """
- Get standard colors based on `colormap`, `color_type` or `color` inputs.
-
- Parameters
- ----------
- num_colors : int
- Minimum number of colors to be returned.
- Ignored if `color` is a dictionary.
- colormap : :py:class:`matplotlib.colors.Colormap`, optional
- Matplotlib colormap.
- When provided, the resulting colors will be derived from the colormap.
- color_type : {"default", "random"}, optional
-        Type of colors to derive. Used if the provided `color` and `colormap` are None.
- Ignored if either `color` or `colormap` are not None.
- color : dict or str or sequence, optional
- Color(s) to be used for deriving sequence of colors.
-        Can be either a dictionary, or a single color (single color string,
- or sequence of floats representing a single color),
- or a sequence of colors.
-
- Returns
- -------
- dict or list
- Standard colors. Can either be a mapping if `color` was a dictionary,
- or a list of colors with a length of `num_colors` or more.
-
- Warns
- -----
- UserWarning
- If both `colormap` and `color` are provided.
- Parameter `color` will override.
- """
- if isinstance(color, dict):
- return color
-
- colors = _derive_colors(
- color=color,
- colormap=colormap,
- color_type=color_type,
- num_colors=num_colors,
- )
-
- return list(_cycle_colors(colors, num_colors=num_colors))
-
-
-def _derive_colors(
- *,
- color: Color | Collection[Color] | None,
- colormap: str | Colormap | None,
- color_type: str,
- num_colors: int,
-) -> list[Color]:
- """
- Derive colors from either `colormap`, `color_type` or `color` inputs.
-
- Get a list of colors either from `colormap`, or from `color`,
- or from `color_type` (if both `colormap` and `color` are None).
-
- Parameters
- ----------
- color : str or sequence, optional
- Color(s) to be used for deriving sequence of colors.
-        Can be either a single color (single color string, or sequence of floats
- representing a single color), or a sequence of colors.
- colormap : :py:class:`matplotlib.colors.Colormap`, optional
- Matplotlib colormap.
- When provided, the resulting colors will be derived from the colormap.
- color_type : {"default", "random"}, optional
-        Type of colors to derive. Used if the provided `color` and `colormap` are None.
-        Ignored if either `color` or `colormap` are not None.
- num_colors : int
- Number of colors to be extracted.
-
- Returns
- -------
- list
- List of colors extracted.
-
- Warns
- -----
- UserWarning
- If both `colormap` and `color` are provided.
- Parameter `color` will override.
- """
- if color is None and colormap is not None:
- return _get_colors_from_colormap(colormap, num_colors=num_colors)
- elif color is not None:
- if colormap is not None:
- warnings.warn(
- "'color' and 'colormap' cannot be used simultaneously. Using 'color'",
- stacklevel=find_stack_level(),
- )
- return _get_colors_from_color(color)
- else:
- return _get_colors_from_color_type(color_type, num_colors=num_colors)
-
-
-def _cycle_colors(colors: list[Color], num_colors: int) -> Iterator[Color]:
- """Cycle colors until achieving max of `num_colors` or length of `colors`.
-
- Extra colors will be ignored by matplotlib if there are more colors
- than needed and nothing needs to be done here.
- """
- max_colors = max(num_colors, len(colors))
- yield from itertools.islice(itertools.cycle(colors), max_colors)
-
-
-def _get_colors_from_colormap(
- colormap: str | Colormap,
- num_colors: int,
-) -> list[Color]:
- """Get colors from colormap."""
- cmap = _get_cmap_instance(colormap)
- return [cmap(num) for num in np.linspace(0, 1, num=num_colors)]
-
-
-def _get_cmap_instance(colormap: str | Colormap) -> Colormap:
- """Get instance of matplotlib colormap."""
- if isinstance(colormap, str):
- cmap = colormap
- colormap = mpl.colormaps[colormap]
- if colormap is None:
- raise ValueError(f"Colormap {cmap} is not recognized")
- return colormap
-
-
-def _get_colors_from_color(
- color: Color | Collection[Color],
-) -> list[Color]:
- """Get colors from user input color."""
- if len(color) == 0:
- raise ValueError(f"Invalid color argument: {color}")
-
- if _is_single_color(color):
- color = cast(Color, color)
- return [color]
-
- color = cast(Collection[Color], color)
- return list(_gen_list_of_colors_from_iterable(color))
-
-
-def _is_single_color(color: Color | Collection[Color]) -> bool:
- """Check if `color` is a single color, not a sequence of colors.
-
-    A single color is one of these kinds:
- - Named color "red", "C0", "firebrick"
- - Alias "g"
- - Sequence of floats, such as (0.1, 0.2, 0.3) or (0.1, 0.2, 0.3, 0.4).
-
- See Also
- --------
- _is_single_string_color
- """
- if isinstance(color, str) and _is_single_string_color(color):
- # GH #36972
- return True
-
- if _is_floats_color(color):
- return True
-
- return False
-
-
-def _gen_list_of_colors_from_iterable(color: Collection[Color]) -> Iterator[Color]:
- """
-    Yield colors from a string of several letters or from a collection of colors.
- """
- for x in color:
- if _is_single_color(x):
- yield x
- else:
- raise ValueError(f"Invalid color {x}")
-
-
-def _is_floats_color(color: Color | Collection[Color]) -> bool:
- """Check if color comprises a sequence of floats representing color."""
- return bool(
- is_list_like(color)
- and (len(color) == 3 or len(color) == 4)
- and all(isinstance(x, (int, float)) for x in color)
- )
-
-
-def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color]:
- """Get colors from user input color type."""
- if color_type == "default":
- return _get_default_colors(num_colors)
- elif color_type == "random":
- return _get_random_colors(num_colors)
- else:
- raise ValueError("color_type must be either 'default' or 'random'")
-
-
-def _get_default_colors(num_colors: int) -> list[Color]:
- """Get `num_colors` of default colors from matplotlib rc params."""
- import matplotlib.pyplot as plt
-
- colors = [c["color"] for c in plt.rcParams["axes.prop_cycle"]]
- return colors[0:num_colors]
-
-
-def _get_random_colors(num_colors: int) -> list[Color]:
- """Get `num_colors` of random colors."""
- return [_random_color(num) for num in range(num_colors)]
-
-
-def _random_color(column: int) -> list[float]:
- """Get a random color represented as a list of length 3"""
- # GH17525 use common._random_state to avoid resetting the seed
- rs = com.random_state(column)
- return rs.rand(3).tolist()
-
-
-def _is_single_string_color(color: Color) -> bool:
- """Check if `color` is a single string color.
-
- Examples of single string colors:
- - 'r'
- - 'g'
- - 'red'
- - 'green'
- - 'C3'
- - 'firebrick'
-
- Parameters
- ----------
- color : Color
- Color string or sequence of floats.
-
- Returns
- -------
- bool
- True if `color` looks like a valid color.
- False otherwise.
- """
- conv = matplotlib.colors.ColorConverter()
- try:
- conv.to_rgba(color)
- except ValueError:
- return False
- else:
- return True
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/timeseries.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/timeseries.py
deleted file mode 100644
index 8e21b2c6911..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/timeseries.py
+++ /dev/null
@@ -1,335 +0,0 @@
-# TODO: Use the fact that axis can have units to simplify the process
-
-from __future__ import annotations
-
-from datetime import timedelta
-import functools
-from typing import (
- TYPE_CHECKING,
- cast,
-)
-
-import numpy as np
-
-from pandas._libs.tslibs import (
- BaseOffset,
- Period,
- to_offset,
-)
-from pandas._libs.tslibs.dtypes import FreqGroup
-
-from pandas.core.dtypes.generic import (
- ABCDatetimeIndex,
- ABCPeriodIndex,
- ABCTimedeltaIndex,
-)
-
-from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib.converter import (
- TimeSeries_DateFormatter,
- TimeSeries_DateLocator,
- TimeSeries_TimedeltaFormatter,
-)
-from pandas.tseries.frequencies import (
- get_period_alias,
- is_subperiod,
- is_superperiod,
-)
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
-
- from pandas import (
- DataFrame,
- DatetimeIndex,
- Index,
- Series,
- )
-
-# ---------------------------------------------------------------------
-# Plotting functions and monkey patches
-
-
-def maybe_resample(series: Series, ax: Axes, kwargs):
- # resample against axes freq if necessary
- freq, ax_freq = _get_freq(ax, series)
-
- if freq is None: # pragma: no cover
- raise ValueError("Cannot use dynamic axis without frequency info")
-
- # Convert DatetimeIndex to PeriodIndex
- if isinstance(series.index, ABCDatetimeIndex):
- series = series.to_period(freq=freq)
-
- if ax_freq is not None and freq != ax_freq:
- if is_superperiod(freq, ax_freq): # upsample input
- series = series.copy()
- # error: "Index" has no attribute "asfreq"
- series.index = series.index.asfreq( # type: ignore[attr-defined]
- ax_freq, how="s"
- )
- freq = ax_freq
- elif _is_sup(freq, ax_freq): # one is weekly
- how = kwargs.pop("how", "last")
- series = getattr(series.resample("D"), how)().dropna()
- series = getattr(series.resample(ax_freq), how)().dropna()
- freq = ax_freq
- elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq):
- _upsample_others(ax, freq, kwargs)
- else: # pragma: no cover
- raise ValueError("Incompatible frequency conversion")
- return freq, series
-
-
-def _is_sub(f1: str, f2: str) -> bool:
- return (f1.startswith("W") and is_subperiod("D", f2)) or (
- f2.startswith("W") and is_subperiod(f1, "D")
- )
-
-
-def _is_sup(f1: str, f2: str) -> bool:
- return (f1.startswith("W") and is_superperiod("D", f2)) or (
- f2.startswith("W") and is_superperiod(f1, "D")
- )
-
-
-def _upsample_others(ax: Axes, freq, kwargs) -> None:
- legend = ax.get_legend()
- lines, labels = _replot_ax(ax, freq, kwargs)
- _replot_ax(ax, freq, kwargs)
-
- other_ax = None
- if hasattr(ax, "left_ax"):
- other_ax = ax.left_ax
- if hasattr(ax, "right_ax"):
- other_ax = ax.right_ax
-
- if other_ax is not None:
- rlines, rlabels = _replot_ax(other_ax, freq, kwargs)
- lines.extend(rlines)
- labels.extend(rlabels)
-
- if legend is not None and kwargs.get("legend", True) and len(lines) > 0:
- title = legend.get_title().get_text()
- if title == "None":
- title = None
- ax.legend(lines, labels, loc="best", title=title)
-
-
-def _replot_ax(ax: Axes, freq, kwargs):
- data = getattr(ax, "_plot_data", None)
-
- # clear current axes and data
- ax._plot_data = []
- ax.clear()
-
- decorate_axes(ax, freq, kwargs)
-
- lines = []
- labels = []
- if data is not None:
- for series, plotf, kwds in data:
- series = series.copy()
- idx = series.index.asfreq(freq, how="S")
- series.index = idx
- ax._plot_data.append((series, plotf, kwds))
-
- # for tsplot
- if isinstance(plotf, str):
- from pandas.plotting._matplotlib import PLOT_CLASSES
-
- plotf = PLOT_CLASSES[plotf]._plot
-
- lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0])
- labels.append(pprint_thing(series.name))
-
- return lines, labels
-
-
-def decorate_axes(ax: Axes, freq, kwargs) -> None:
- """Initialize axes for time-series plotting"""
- if not hasattr(ax, "_plot_data"):
- ax._plot_data = []
-
- ax.freq = freq
- xaxis = ax.get_xaxis()
- xaxis.freq = freq
- if not hasattr(ax, "legendlabels"):
- ax.legendlabels = [kwargs.get("label", None)]
- else:
- ax.legendlabels.append(kwargs.get("label", None))
- ax.view_interval = None
- ax.date_axis_info = None
-
-
-def _get_ax_freq(ax: Axes):
- """
- Get the freq attribute of the ax object if set.
- Also checks shared axes (eg when using secondary yaxis, sharex=True
- or twinx)
- """
- ax_freq = getattr(ax, "freq", None)
- if ax_freq is None:
- # check for left/right ax in case of secondary yaxis
- if hasattr(ax, "left_ax"):
- ax_freq = getattr(ax.left_ax, "freq", None)
- elif hasattr(ax, "right_ax"):
- ax_freq = getattr(ax.right_ax, "freq", None)
- if ax_freq is None:
- # check if a shared ax (sharex/twinx) has already freq set
- shared_axes = ax.get_shared_x_axes().get_siblings(ax)
- if len(shared_axes) > 1:
- for shared_ax in shared_axes:
- ax_freq = getattr(shared_ax, "freq", None)
- if ax_freq is not None:
- break
- return ax_freq
-
-
-def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None:
- freqstr = to_offset(freq).rule_code
-
- return get_period_alias(freqstr)
-
-
-def _get_freq(ax: Axes, series: Series):
- # get frequency from data
- freq = getattr(series.index, "freq", None)
- if freq is None:
- freq = getattr(series.index, "inferred_freq", None)
- freq = to_offset(freq)
-
- ax_freq = _get_ax_freq(ax)
-
- # use axes freq if no data freq
- if freq is None:
- freq = ax_freq
-
- # get the period frequency
- freq = _get_period_alias(freq)
- return freq, ax_freq
-
-
-def use_dynamic_x(ax: Axes, data: DataFrame | Series) -> bool:
- freq = _get_index_freq(data.index)
- ax_freq = _get_ax_freq(ax)
-
- if freq is None: # convert irregular if axes has freq info
- freq = ax_freq
- else: # do not use tsplot if irregular was plotted first
- if (ax_freq is None) and (len(ax.get_lines()) > 0):
- return False
-
- if freq is None:
- return False
-
- freq_str = _get_period_alias(freq)
-
- if freq_str is None:
- return False
-
- # FIXME: hack this for 0.10.1, creating more technical debt...sigh
- if isinstance(data.index, ABCDatetimeIndex):
- # error: "BaseOffset" has no attribute "_period_dtype_code"
- base = to_offset(freq_str)._period_dtype_code # type: ignore[attr-defined]
- x = data.index
- if base <= FreqGroup.FR_DAY.value:
- return x[:1].is_normalized
- period = Period(x[0], freq_str)
- assert isinstance(period, Period)
- return period.to_timestamp().tz_localize(x.tz) == x[0]
- return True
-
-
-def _get_index_freq(index: Index) -> BaseOffset | None:
- freq = getattr(index, "freq", None)
- if freq is None:
- freq = getattr(index, "inferred_freq", None)
- if freq == "B":
- # error: "Index" has no attribute "dayofweek"
- weekdays = np.unique(index.dayofweek) # type: ignore[attr-defined]
- if (5 in weekdays) or (6 in weekdays):
- freq = None
-
- freq = to_offset(freq)
- return freq
-
-
-def maybe_convert_index(ax: Axes, data):
- # tsplot converts automatically, but don't want to convert index
- # over and over for DataFrames
- if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)):
- freq: str | BaseOffset | None = data.index.freq
-
- if freq is None:
- # We only get here for DatetimeIndex
- data.index = cast("DatetimeIndex", data.index)
- freq = data.index.inferred_freq
- freq = to_offset(freq)
-
- if freq is None:
- freq = _get_ax_freq(ax)
-
- if freq is None:
- raise ValueError("Could not get frequency alias for plotting")
-
- freq_str = _get_period_alias(freq)
-
- if isinstance(data.index, ABCDatetimeIndex):
- data = data.tz_localize(None).to_period(freq=freq_str)
- elif isinstance(data.index, ABCPeriodIndex):
- data.index = data.index.asfreq(freq=freq_str)
- return data
-
-
-# Patch methods for subplot. Only format_dateaxis is currently used.
-# Do we need the rest for convenience?
-
-
-def _format_coord(freq, t, y) -> str:
- time_period = Period(ordinal=int(t), freq=freq)
- return f"t = {time_period} y = {y:8f}"
-
-
-def format_dateaxis(subplot, freq, index) -> None:
- """
- Pretty-formats the date axis (x-axis).
-
- Major and minor ticks are automatically set for the frequency of the
- current underlying series. As the dynamic mode is activated by
- default, changing the limits of the x axis will intelligently change
- the positions of the ticks.
- """
- from matplotlib import pylab
-
- # handle index specific formatting
- # Note: DatetimeIndex does not use this
- # interface. DatetimeIndex uses matplotlib.date directly
- if isinstance(index, ABCPeriodIndex):
- majlocator = TimeSeries_DateLocator(
- freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot
- )
- minlocator = TimeSeries_DateLocator(
- freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot
- )
- subplot.xaxis.set_major_locator(majlocator)
- subplot.xaxis.set_minor_locator(minlocator)
-
- majformatter = TimeSeries_DateFormatter(
- freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot
- )
- minformatter = TimeSeries_DateFormatter(
- freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot
- )
- subplot.xaxis.set_major_formatter(majformatter)
- subplot.xaxis.set_minor_formatter(minformatter)
-
- # x and y coord info
- subplot.format_coord = functools.partial(_format_coord, freq)
-
- elif isinstance(index, ABCTimedeltaIndex):
- subplot.xaxis.set_major_formatter(TimeSeries_TimedeltaFormatter())
- else:
- raise TypeError("index type not supported")
-
- pylab.draw_if_interactive()
diff --git a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/tools.py b/contrib/python/pandas/py3/pandas/plotting/_matplotlib/tools.py
deleted file mode 100644
index 7d3c857eea2..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_matplotlib/tools.py
+++ /dev/null
@@ -1,483 +0,0 @@
-# being a bit too dynamic
-from __future__ import annotations
-
-from math import ceil
-from typing import (
- TYPE_CHECKING,
- Iterable,
- Sequence,
-)
-import warnings
-
-from matplotlib import ticker
-import matplotlib.table
-import numpy as np
-
-from pandas.util._exceptions import find_stack_level
-
-from pandas.core.dtypes.common import is_list_like
-from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCIndex,
- ABCSeries,
-)
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
- from matplotlib.axis import Axis
- from matplotlib.figure import Figure
- from matplotlib.lines import Line2D
- from matplotlib.table import Table
-
- from pandas import (
- DataFrame,
- Series,
- )
-
-
-def do_adjust_figure(fig: Figure) -> bool:
- """Whether fig has constrained_layout enabled."""
- if not hasattr(fig, "get_constrained_layout"):
- return False
- return not fig.get_constrained_layout()
-
-
-def maybe_adjust_figure(fig: Figure, *args, **kwargs) -> None:
- """Call fig.subplots_adjust unless fig has constrained_layout enabled."""
- if do_adjust_figure(fig):
- fig.subplots_adjust(*args, **kwargs)
-
-
-def format_date_labels(ax: Axes, rot) -> None:
- # mini version of autofmt_xdate
- for label in ax.get_xticklabels():
- label.set_ha("right")
- label.set_rotation(rot)
- fig = ax.get_figure()
- maybe_adjust_figure(fig, bottom=0.2)
-
-
-def table(
- ax, data: DataFrame | Series, rowLabels=None, colLabels=None, **kwargs
-) -> Table:
- if isinstance(data, ABCSeries):
- data = data.to_frame()
- elif isinstance(data, ABCDataFrame):
- pass
- else:
- raise ValueError("Input data must be DataFrame or Series")
-
- if rowLabels is None:
- rowLabels = data.index
-
- if colLabels is None:
- colLabels = data.columns
-
- cellText = data.values
-
- return matplotlib.table.table(
- ax, cellText=cellText, rowLabels=rowLabels, colLabels=colLabels, **kwargs
- )
-
-
-def _get_layout(
- nplots: int,
- layout: tuple[int, int] | None = None,
- layout_type: str = "box",
-) -> tuple[int, int]:
- if layout is not None:
- if not isinstance(layout, (tuple, list)) or len(layout) != 2:
- raise ValueError("Layout must be a tuple of (rows, columns)")
-
- nrows, ncols = layout
-
- if nrows == -1 and ncols > 0:
- layout = nrows, ncols = (ceil(nplots / ncols), ncols)
- elif ncols == -1 and nrows > 0:
- layout = nrows, ncols = (nrows, ceil(nplots / nrows))
- elif ncols <= 0 and nrows <= 0:
- msg = "At least one dimension of layout must be positive"
- raise ValueError(msg)
-
- if nrows * ncols < nplots:
- raise ValueError(
- f"Layout of {nrows}x{ncols} must be larger than required size {nplots}"
- )
-
- return layout
-
- if layout_type == "single":
- return (1, 1)
- elif layout_type == "horizontal":
- return (1, nplots)
- elif layout_type == "vertical":
- return (nplots, 1)
-
- layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)}
- try:
- return layouts[nplots]
- except KeyError:
- k = 1
- while k**2 < nplots:
- k += 1
-
- if (k - 1) * k >= nplots:
- return k, (k - 1)
- else:
- return k, k
-
-
-# copied from matplotlib/pyplot.py and modified for pandas.plotting
-
-
-def create_subplots(
- naxes: int,
- sharex: bool = False,
- sharey: bool = False,
- squeeze: bool = True,
- subplot_kw=None,
- ax=None,
- layout=None,
- layout_type: str = "box",
- **fig_kw,
-):
- """
- Create a figure with a set of subplots already made.
-
- This utility wrapper makes it convenient to create common layouts of
- subplots, including the enclosing figure object, in a single call.
-
- Parameters
- ----------
- naxes : int
-        Number of required axes. Axes in excess of this number are set invisible.
-        Default is nrows * ncols.
-
- sharex : bool
- If True, the X axis will be shared amongst all subplots.
-
- sharey : bool
- If True, the Y axis will be shared amongst all subplots.
-
- squeeze : bool
-
- If True, extra dimensions are squeezed out from the returned axis object:
-        - if only one subplot is constructed (nrows=ncols=1), the resulting
-        single Axis object is returned as a scalar.
-        - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object
-        array of Axis objects.
-        - for NxM subplots with N>1 and M>1, a 2-d array is returned.
-
- If False, no squeezing is done: the returned axis object is always
- a 2-d array containing Axis instances, even if it ends up being 1x1.
-
- subplot_kw : dict
- Dict with keywords passed to the add_subplot() call used to create each
-        subplot.
-
- ax : Matplotlib axis object, optional
-
- layout : tuple
- Number of rows and columns of the subplot grid.
- If not specified, calculated from naxes and layout_type
-
- layout_type : {'box', 'horizontal', 'vertical'}, default 'box'
- Specify how to layout the subplot grid.
-
- fig_kw : Other keyword arguments to be passed to the figure() call.
- Note that all keywords not recognized above will be
- automatically included here.
-
- Returns
- -------
- fig, ax : tuple
- - fig is the Matplotlib Figure object
- - ax can be either a single axis object or an array of axis objects if
- more than one subplot was created. The dimensions of the resulting array
- can be controlled with the squeeze keyword, see above.
-
- Examples
- --------
- x = np.linspace(0, 2*np.pi, 400)
- y = np.sin(x**2)
-
- # Just a figure and one subplot
- f, ax = plt.subplots()
- ax.plot(x, y)
- ax.set_title('Simple plot')
-
- # Two subplots, unpack the output array immediately
- f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
- ax1.plot(x, y)
- ax1.set_title('Sharing Y axis')
- ax2.scatter(x, y)
-
- # Four polar axes
- plt.subplots(2, 2, subplot_kw=dict(polar=True))
- """
- import matplotlib.pyplot as plt
-
- if subplot_kw is None:
- subplot_kw = {}
-
- if ax is None:
- fig = plt.figure(**fig_kw)
- else:
- if is_list_like(ax):
- if squeeze:
- ax = flatten_axes(ax)
- if layout is not None:
- warnings.warn(
- "When passing multiple axes, layout keyword is ignored.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- if sharex or sharey:
- warnings.warn(
- "When passing multiple axes, sharex and sharey "
- "are ignored. These settings must be specified when creating axes.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- if ax.size == naxes:
- fig = ax.flat[0].get_figure()
- return fig, ax
- else:
- raise ValueError(
- f"The number of passed axes must be {naxes}, the "
- "same as the output plot"
- )
-
- fig = ax.get_figure()
-        # if ax is passed and the number of subplots is 1, return ax as it is
- if naxes == 1:
- if squeeze:
- return fig, ax
- else:
- return fig, flatten_axes(ax)
- else:
- warnings.warn(
- "To output multiple subplots, the figure containing "
- "the passed axes is being cleared.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- fig.clear()
-
- nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type)
- nplots = nrows * ncols
-
- # Create empty object array to hold all axes. It's easiest to make it 1-d
-    # so we can just append subplots upon creation, and then reshape at the end.
- axarr = np.empty(nplots, dtype=object)
-
- # Create first subplot separately, so we can share it if requested
- ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw)
-
- if sharex:
- subplot_kw["sharex"] = ax0
- if sharey:
- subplot_kw["sharey"] = ax0
- axarr[0] = ax0
-
- # Note off-by-one counting because add_subplot uses the MATLAB 1-based
- # convention.
- for i in range(1, nplots):
- kwds = subplot_kw.copy()
- # Set sharex and sharey to None for blank/dummy axes, these can
- # interfere with proper axis limits on the visible axes if
- # they share axes e.g. issue #7528
- if i >= naxes:
- kwds["sharex"] = None
- kwds["sharey"] = None
- ax = fig.add_subplot(nrows, ncols, i + 1, **kwds)
- axarr[i] = ax
-
- if naxes != nplots:
- for ax in axarr[naxes:]:
- ax.set_visible(False)
-
- handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)
-
- if squeeze:
- # Reshape the array to have the final desired dimension (nrow,ncol),
- # though discarding unneeded dimensions that equal 1. If we only have
- # one subplot, just return it instead of a 1-element array.
- if nplots == 1:
- axes = axarr[0]
- else:
- axes = axarr.reshape(nrows, ncols).squeeze()
- else:
- # returned axis array will be always 2-d, even if nrows=ncols=1
- axes = axarr.reshape(nrows, ncols)
-
- return fig, axes
-
-
-def _remove_labels_from_axis(axis: Axis) -> None:
- for t in axis.get_majorticklabels():
- t.set_visible(False)
-
- # set_visible will not be effective if
- # minor axis has NullLocator and NullFormatter (default)
- if isinstance(axis.get_minor_locator(), ticker.NullLocator):
- axis.set_minor_locator(ticker.AutoLocator())
- if isinstance(axis.get_minor_formatter(), ticker.NullFormatter):
- axis.set_minor_formatter(ticker.FormatStrFormatter(""))
- for t in axis.get_minorticklabels():
- t.set_visible(False)
-
- axis.get_label().set_visible(False)
-
-
-def _has_externally_shared_axis(ax1: Axes, compare_axis: str) -> bool:
- """
- Return whether an axis is externally shared.
-
- Parameters
- ----------
- ax1 : matplotlib.axes.Axes
- Axis to query.
- compare_axis : str
- `"x"` or `"y"` according to whether the X-axis or Y-axis is being
- compared.
-
- Returns
- -------
- bool
- `True` if the axis is externally shared. Otherwise `False`.
-
- Notes
- -----
- If two axes with different positions are sharing an axis, they can be
- referred to as *externally* sharing the common axis.
-
- If two axes sharing an axis also have the same position, they can be
- referred to as *internally* sharing the common axis (a.k.a twinning).
-
-    handle_shared_axes() is only interested in axes externally sharing an
- axis, regardless of whether either of the axes is also internally sharing
- with a third axis.
- """
- if compare_axis == "x":
- axes = ax1.get_shared_x_axes()
- elif compare_axis == "y":
- axes = ax1.get_shared_y_axes()
- else:
- raise ValueError(
- "_has_externally_shared_axis() needs 'x' or 'y' as a second parameter"
- )
-
- axes = axes.get_siblings(ax1)
-
- # Retain ax1 and any of its siblings which aren't in the same position as it
- ax1_points = ax1.get_position().get_points()
-
- for ax2 in axes:
- if not np.array_equal(ax1_points, ax2.get_position().get_points()):
- return True
-
- return False
-
-
-def handle_shared_axes(
- axarr: Iterable[Axes],
- nplots: int,
- naxes: int,
- nrows: int,
- ncols: int,
- sharex: bool,
- sharey: bool,
-) -> None:
- if nplots > 1:
- row_num = lambda x: x.get_subplotspec().rowspan.start
- col_num = lambda x: x.get_subplotspec().colspan.start
-
- is_first_col = lambda x: x.get_subplotspec().is_first_col()
-
- if nrows > 1:
- try:
- # first find out the ax layout,
-                # so that we can correctly handle "gaps"
- layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool_)
- for ax in axarr:
- layout[row_num(ax), col_num(ax)] = ax.get_visible()
-
- for ax in axarr:
-                    # only the last row of subplots should get x labels -> all
-                    # others off. The layout handles the case that the subplot is
-                    # the last in the column, because below it there is no subplot/gap.
- if not layout[row_num(ax) + 1, col_num(ax)]:
- continue
- if sharex or _has_externally_shared_axis(ax, "x"):
- _remove_labels_from_axis(ax.xaxis)
-
- except IndexError:
-                # if gridspec is used, ax.rowNum and ax.colNum may differ
-                # from the layout shape. In this case, use last_row logic
- is_last_row = lambda x: x.get_subplotspec().is_last_row()
- for ax in axarr:
- if is_last_row(ax):
- continue
- if sharex or _has_externally_shared_axis(ax, "x"):
- _remove_labels_from_axis(ax.xaxis)
-
- if ncols > 1:
- for ax in axarr:
-                # only the first column should get y labels -> set all others to
-                # off. As we only have labels in the first column and we always
-                # have a subplot there, we can skip the layout test
- if is_first_col(ax):
- continue
- if sharey or _has_externally_shared_axis(ax, "y"):
- _remove_labels_from_axis(ax.yaxis)
-
-
-def flatten_axes(axes: Axes | Sequence[Axes]) -> np.ndarray:
- if not is_list_like(axes):
- return np.array([axes])
- elif isinstance(axes, (np.ndarray, ABCIndex)):
- return np.asarray(axes).ravel()
- return np.array(axes)
-
-
-def set_ticks_props(
- axes: Axes | Sequence[Axes],
- xlabelsize=None,
- xrot=None,
- ylabelsize=None,
- yrot=None,
-):
- import matplotlib.pyplot as plt
-
- for ax in flatten_axes(axes):
- if xlabelsize is not None:
- plt.setp(ax.get_xticklabels(), fontsize=xlabelsize)
- if xrot is not None:
- plt.setp(ax.get_xticklabels(), rotation=xrot)
- if ylabelsize is not None:
- plt.setp(ax.get_yticklabels(), fontsize=ylabelsize)
- if yrot is not None:
- plt.setp(ax.get_yticklabels(), rotation=yrot)
- return axes
-
-
-def get_all_lines(ax: Axes) -> list[Line2D]:
- lines = ax.get_lines()
-
- if hasattr(ax, "right_ax"):
- lines += ax.right_ax.get_lines()
-
- if hasattr(ax, "left_ax"):
- lines += ax.left_ax.get_lines()
-
- return lines
-
-
-def get_xlim(lines: Iterable[Line2D]) -> tuple[float, float]:
- left, right = np.inf, -np.inf
- for line in lines:
- x = line.get_xdata(orig=False)
- left = min(np.nanmin(x), left)
- right = max(np.nanmax(x), right)
- return left, right
diff --git a/contrib/python/pandas/py3/pandas/plotting/_misc.py b/contrib/python/pandas/py3/pandas/plotting/_misc.py
deleted file mode 100644
index 0eb6c826e2d..00000000000
--- a/contrib/python/pandas/py3/pandas/plotting/_misc.py
+++ /dev/null
@@ -1,618 +0,0 @@
-from __future__ import annotations
-
-from contextlib import contextmanager
-from typing import (
- TYPE_CHECKING,
- Generator,
-)
-
-from pandas.plotting._core import _get_plot_backend
-
-if TYPE_CHECKING:
- from matplotlib.axes import Axes
- from matplotlib.figure import Figure
- import numpy as np
-
- from pandas import (
- DataFrame,
- Series,
- )
-
-
-def table(ax, data, **kwargs):
- """
- Helper function to convert DataFrame and Series to matplotlib.table.
-
- Parameters
- ----------
- ax : Matplotlib axes object
- data : DataFrame or Series
- Data for table contents.
- **kwargs
- Keyword arguments to be passed to matplotlib.table.table.
- If `rowLabels` or `colLabels` is not specified, data index or column
- name will be used.
-
- Returns
- -------
- matplotlib table object
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.table(
- ax=ax, data=data, rowLabels=None, colLabels=None, **kwargs
- )
-
-
-def register() -> None:
- """
- Register pandas formatters and converters with matplotlib.
-
- This function modifies the global ``matplotlib.units.registry``
- dictionary. pandas adds custom converters for
-
- * pd.Timestamp
- * pd.Period
- * np.datetime64
- * datetime.datetime
- * datetime.date
- * datetime.time
-
- See Also
- --------
- deregister_matplotlib_converters : Remove pandas formatters and converters.
- """
- plot_backend = _get_plot_backend("matplotlib")
- plot_backend.register()
-
-
-def deregister() -> None:
- """
- Remove pandas formatters and converters.
-
- Removes the custom converters added by :func:`register`. This
- attempts to set the state of the registry back to the state before
- pandas registered its own units. Converters for pandas' own types like
- Timestamp and Period are removed completely. Converters for types
- pandas overwrites, like ``datetime.datetime``, are restored to their
- original value.
-
- See Also
- --------
- register_matplotlib_converters : Register pandas formatters and converters
- with matplotlib.
- """
- plot_backend = _get_plot_backend("matplotlib")
- plot_backend.deregister()
-
-
-def scatter_matrix(
- frame: DataFrame,
- alpha: float = 0.5,
- figsize: tuple[float, float] | None = None,
- ax: Axes | None = None,
- grid: bool = False,
- diagonal: str = "hist",
- marker: str = ".",
- density_kwds=None,
- hist_kwds=None,
- range_padding: float = 0.05,
- **kwargs,
-) -> np.ndarray:
- """
- Draw a matrix of scatter plots.
-
- Parameters
- ----------
- frame : DataFrame
- alpha : float, optional
- Amount of transparency applied.
- figsize : (float,float), optional
- A tuple (width, height) in inches.
- ax : Matplotlib axis object, optional
- grid : bool, optional
- Setting this to True will show the grid.
- diagonal : {'hist', 'kde'}
- Pick between 'kde' and 'hist' for either Kernel Density Estimation or
- Histogram plot in the diagonal.
- marker : str, optional
- Matplotlib marker type, default '.'.
- density_kwds : keywords
- Keyword arguments to be passed to kernel density estimate plot.
- hist_kwds : keywords
- Keyword arguments to be passed to hist function.
- range_padding : float, default 0.05
- Relative extension of axis range in x and y with respect to
- (x_max - x_min) or (y_max - y_min).
- **kwargs
- Keyword arguments to be passed to scatter function.
-
- Returns
- -------
- numpy.ndarray
- A matrix of scatter plots.
-
- Examples
- --------
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
- >>> pd.plotting.scatter_matrix(df, alpha=0.2)
- array([[<AxesSubplot: xlabel='A', ylabel='A'>,
- <AxesSubplot: xlabel='B', ylabel='A'>,
- <AxesSubplot: xlabel='C', ylabel='A'>,
- <AxesSubplot: xlabel='D', ylabel='A'>],
- [<AxesSubplot: xlabel='A', ylabel='B'>,
- <AxesSubplot: xlabel='B', ylabel='B'>,
- <AxesSubplot: xlabel='C', ylabel='B'>,
- <AxesSubplot: xlabel='D', ylabel='B'>],
- [<AxesSubplot: xlabel='A', ylabel='C'>,
- <AxesSubplot: xlabel='B', ylabel='C'>,
- <AxesSubplot: xlabel='C', ylabel='C'>,
- <AxesSubplot: xlabel='D', ylabel='C'>],
- [<AxesSubplot: xlabel='A', ylabel='D'>,
- <AxesSubplot: xlabel='B', ylabel='D'>,
- <AxesSubplot: xlabel='C', ylabel='D'>,
- <AxesSubplot: xlabel='D', ylabel='D'>]], dtype=object)
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.scatter_matrix(
- frame=frame,
- alpha=alpha,
- figsize=figsize,
- ax=ax,
- grid=grid,
- diagonal=diagonal,
- marker=marker,
- density_kwds=density_kwds,
- hist_kwds=hist_kwds,
- range_padding=range_padding,
- **kwargs,
- )
-
-
-def radviz(
- frame: DataFrame,
- class_column: str,
- ax: Axes | None = None,
- color: list[str] | tuple[str, ...] | None = None,
- colormap=None,
- **kwds,
-) -> Axes:
- """
- Plot a multidimensional dataset in 2D.
-
-    Each Series in the DataFrame is represented as an evenly distributed
- slice on a circle. Each data point is rendered in the circle according to
- the value on each Series. Highly correlated `Series` in the `DataFrame`
- are placed closer on the unit circle.
-
-    RadViz allows projecting an N-dimensional data set into a 2D space, where the
- influence of each dimension can be interpreted as a balance between the
- influence of all dimensions.
-
- More info available at the `original article
- <https://doi.org/10.1145/331770.331775>`_
- describing RadViz.
-
- Parameters
- ----------
- frame : `DataFrame`
- Object holding the data.
- class_column : str
- Column name containing the name of the data point category.
- ax : :class:`matplotlib.axes.Axes`, optional
- A plot instance to which to add the information.
- color : list[str] or tuple[str], optional
- Assign a color to each category. Example: ['blue', 'green'].
- colormap : str or :class:`matplotlib.colors.Colormap`, default None
- Colormap to select colors from. If string, load colormap with that
- name from matplotlib.
- **kwds
- Options to pass to matplotlib scatter plotting method.
-
- Returns
- -------
- :class:`matplotlib.axes.Axes`
-
- See Also
- --------
- pandas.plotting.andrews_curves : Plot clustering visualization.
-
- Examples
- --------
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.DataFrame(
- ... {
- ... 'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, 6.7, 4.6],
- ... 'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, 3.3, 3.6],
- ... 'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, 5.7, 1.0],
- ... 'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, 2.1, 0.2],
- ... 'Category': [
- ... 'virginica',
- ... 'virginica',
- ... 'setosa',
- ... 'virginica',
- ... 'virginica',
- ... 'versicolor',
- ... 'versicolor',
- ... 'setosa',
- ... 'virginica',
- ... 'setosa'
- ... ]
- ... }
- ... )
- >>> pd.plotting.radviz(df, 'Category')
- <AxesSubplot: xlabel='y(t)', ylabel='y(t + 1)'>
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.radviz(
- frame=frame,
- class_column=class_column,
- ax=ax,
- color=color,
- colormap=colormap,
- **kwds,
- )
-
-
-def andrews_curves(
- frame: DataFrame,
- class_column: str,
- ax: Axes | None = None,
- samples: int = 200,
- color: list[str] | tuple[str, ...] | None = None,
- colormap=None,
- **kwargs,
-) -> Axes:
- """
- Generate a matplotlib plot for visualising clusters of multivariate data.
-
- Andrews curves have the functional form:
-
- .. math::
- f(t) = \\frac{x_1}{\\sqrt{2}} + x_2 \\sin(t) + x_3 \\cos(t) +
- x_4 \\sin(2t) + x_5 \\cos(2t) + \\cdots
-
- Where :math:`x` coefficients correspond to the values of each dimension
- and :math:`t` is linearly spaced between :math:`-\\pi` and :math:`+\\pi`.
- Each row of frame then corresponds to a single curve.
-
- Parameters
- ----------
- frame : DataFrame
- Data to be plotted, preferably normalized to (0.0, 1.0).
- class_column : label
- Name of the column containing class names.
- ax : axes object, default None
- Axes to use.
- samples : int
- Number of points to plot in each curve.
- color : str, list[str] or tuple[str], optional
- Colors to use for the different classes. Colors can be strings
- or 3-element floating point RGB values.
- colormap : str or matplotlib colormap object, default None
- Colormap to select colors from. If a string, load colormap with that
- name from matplotlib.
- **kwargs
- Options to pass to matplotlib plotting method.
-
- Returns
- -------
- :class:`matplotlib.axes.Axes`
-
- Examples
- --------
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.read_csv(
- ... 'https://raw.githubusercontent.com/pandas-dev/'
- ... 'pandas/main/pandas/tests/io/data/csv/iris.csv'
- ... )
- >>> pd.plotting.andrews_curves(df, 'Name')
- <AxesSubplot: title={'center': 'width'}>
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.andrews_curves(
- frame=frame,
- class_column=class_column,
- ax=ax,
- samples=samples,
- color=color,
- colormap=colormap,
- **kwargs,
- )
-
-
-def bootstrap_plot(
- series: Series,
- fig: Figure | None = None,
- size: int = 50,
- samples: int = 500,
- **kwds,
-) -> Figure:
- """
- Bootstrap plot on mean, median and mid-range statistics.
-
- The bootstrap plot is used to estimate the uncertainty of a statistic
- by relying on random sampling with replacement [1]_. This function will
- generate bootstrapping plots for mean, median and mid-range statistics
- for the given number of samples of the given size.
-
- .. [1] "Bootstrapping (statistics)" in \
- https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29
-
- Parameters
- ----------
- series : pandas.Series
- Series from where to get the samplings for the bootstrapping.
- fig : matplotlib.figure.Figure, default None
- If given, it will use the `fig` reference for plotting instead of
- creating a new one with default parameters.
- size : int, default 50
- Number of data points to consider during each sampling. It must be
- less than or equal to the length of the `series`.
- samples : int, default 500
- Number of times the bootstrap procedure is performed.
- **kwds
- Options to pass to matplotlib plotting method.
-
- Returns
- -------
- matplotlib.figure.Figure
- Matplotlib figure.
-
- See Also
- --------
- pandas.DataFrame.plot : Basic plotting for DataFrame objects.
- pandas.Series.plot : Basic plotting for Series objects.
-
- Examples
- --------
- This example draws a basic bootstrap plot for a Series.
-
- .. plot::
- :context: close-figs
-
- >>> s = pd.Series(np.random.uniform(size=100))
- >>> pd.plotting.bootstrap_plot(s)
- <Figure size 640x480 with 6 Axes>
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.bootstrap_plot(
- series=series, fig=fig, size=size, samples=samples, **kwds
- )
-
-
-def parallel_coordinates(
- frame: DataFrame,
- class_column: str,
- cols: list[str] | None = None,
- ax: Axes | None = None,
- color: list[str] | tuple[str, ...] | None = None,
- use_columns: bool = False,
- xticks: list | tuple | None = None,
- colormap=None,
- axvlines: bool = True,
- axvlines_kwds=None,
- sort_labels: bool = False,
- **kwargs,
-) -> Axes:
- """
- Parallel coordinates plotting.
-
- Parameters
- ----------
- frame : DataFrame
- class_column : str
- Column name containing class names.
- cols : list, optional
- A list of column names to use.
- ax : matplotlib.axis, optional
- Matplotlib axis object.
- color : list or tuple, optional
- Colors to use for the different classes.
- use_columns : bool, optional
- If true, columns will be used as xticks.
- xticks : list or tuple, optional
- A list of values to use for xticks.
- colormap : str or matplotlib colormap, default None
- Colormap to use for line colors.
- axvlines : bool, optional
- If true, vertical lines will be added at each xtick.
- axvlines_kwds : keywords, optional
- Options to be passed to axvline method for vertical lines.
- sort_labels : bool, default False
- Sort class_column labels, useful when assigning colors.
- **kwargs
- Options to pass to matplotlib plotting method.
-
- Returns
- -------
- matplotlib.axes.Axes
-
- Examples
- --------
-
- .. plot::
- :context: close-figs
-
- >>> df = pd.read_csv(
- ... 'https://raw.githubusercontent.com/pandas-dev/'
- ... 'pandas/main/pandas/tests/io/data/csv/iris.csv'
- ... )
- >>> pd.plotting.parallel_coordinates(
- ... df, 'Name', color=('#556270', '#4ECDC4', '#C7F464')
- ... )
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.parallel_coordinates(
- frame=frame,
- class_column=class_column,
- cols=cols,
- ax=ax,
- color=color,
- use_columns=use_columns,
- xticks=xticks,
- colormap=colormap,
- axvlines=axvlines,
- axvlines_kwds=axvlines_kwds,
- sort_labels=sort_labels,
- **kwargs,
- )
-
-
-def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes:
- """
- Lag plot for time series.
-
- Parameters
- ----------
- series : Series
- The time series to visualize.
- lag : int, default 1
- Lag length of the scatter plot.
- ax : Matplotlib axis object, optional
- The matplotlib axis object to use.
- **kwds
- Matplotlib scatter method keyword arguments.
-
- Returns
- -------
- matplotlib.axes.Axes
-
- Examples
- --------
- Lag plots are most commonly used to look for patterns in time series data.
-
- Given the following time series
-
- .. plot::
- :context: close-figs
-
- >>> np.random.seed(5)
- >>> x = np.cumsum(np.random.normal(loc=1, scale=5, size=50))
- >>> s = pd.Series(x)
- >>> s.plot()
-
- A lag plot with ``lag=1`` returns
-
- .. plot::
- :context: close-figs
-
- >>> pd.plotting.lag_plot(s, lag=1)
- <AxesSubplot: xlabel='y(t)', ylabel='y(t + 1)'>
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.lag_plot(series=series, lag=lag, ax=ax, **kwds)
-
-
-def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Axes:
- """
- Autocorrelation plot for time series.
-
- Parameters
- ----------
- series : Series
- The time series to visualize.
- ax : Matplotlib axis object, optional
- The matplotlib axis object to use.
- **kwargs
- Options to pass to matplotlib plotting method.
-
- Returns
- -------
- matplotlib.axes.Axes
-
- Examples
- --------
- The horizontal lines in the plot correspond to 95% and 99% confidence bands.
-
- The dashed line is 99% confidence band.
-
- .. plot::
- :context: close-figs
-
- >>> spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000)
- >>> s = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing))
- >>> pd.plotting.autocorrelation_plot(s)
- <AxesSubplot: xlabel='Lag', ylabel='Autocorrelation'>
- """
- plot_backend = _get_plot_backend("matplotlib")
- return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs)
-
-
-class _Options(dict):
- """
- Stores pandas plotting options.
-
- Allows for parameter aliasing so you can just use parameter names that are
- the same as the plot function parameters, but stores them in a canonical
- format that makes it easy to break them down into groups later.
- """
-
- # alias so the names are same as plotting method parameter names
- _ALIASES = {"x_compat": "xaxis.compat"}
- _DEFAULT_KEYS = ["xaxis.compat"]
-
- def __init__(self, deprecated: bool = False) -> None:
- self._deprecated = deprecated
- super().__setitem__("xaxis.compat", False)
-
- def __getitem__(self, key):
- key = self._get_canonical_key(key)
- if key not in self:
- raise ValueError(f"{key} is not a valid pandas plotting option")
- return super().__getitem__(key)
-
- def __setitem__(self, key, value) -> None:
- key = self._get_canonical_key(key)
- super().__setitem__(key, value)
-
- def __delitem__(self, key) -> None:
- key = self._get_canonical_key(key)
- if key in self._DEFAULT_KEYS:
- raise ValueError(f"Cannot remove default parameter {key}")
- super().__delitem__(key)
-
- def __contains__(self, key) -> bool:
- key = self._get_canonical_key(key)
- return super().__contains__(key)
-
- def reset(self) -> None:
- """
- Reset the option store to its initial state
-
- Returns
- -------
- None
- """
- # error: Cannot access "__init__" directly
- self.__init__() # type: ignore[misc]
-
- def _get_canonical_key(self, key):
- return self._ALIASES.get(key, key)
-
- @contextmanager
- def use(self, key, value) -> Generator[_Options, None, None]:
- """
- Temporarily set a parameter value using the with statement.
- Aliasing allowed.
- """
- old_value = self[key]
- try:
- self[key] = value
- yield self
- finally:
- self[key] = old_value
-
-
-plot_params = _Options()
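The `_Options` store above is exposed as ``pd.plotting.plot_params``. A minimal sketch of how the aliasing and the ``use`` context manager behave (outputs follow from the code above, not from the source docstrings):

>>> import pandas as pd
>>> pd.plotting.plot_params["x_compat"]           # alias resolves to "xaxis.compat"
False
>>> with pd.plotting.plot_params.use("x_compat", True):
...     pd.plotting.plot_params["xaxis.compat"]
...
True
>>> pd.plotting.plot_params["x_compat"]           # old value restored on exit
False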
diff --git a/contrib/python/pandas/py3/pandas/testing.py b/contrib/python/pandas/py3/pandas/testing.py
deleted file mode 100644
index 841b55df485..00000000000
--- a/contrib/python/pandas/py3/pandas/testing.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""
-Public testing utility functions.
-"""
-
-
-from pandas._testing import (
- assert_extension_array_equal,
- assert_frame_equal,
- assert_index_equal,
- assert_series_equal,
-)
-
-__all__ = [
- "assert_extension_array_equal",
- "assert_frame_equal",
- "assert_series_equal",
- "assert_index_equal",
-]
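For context, the helpers re-exported here compare pandas objects and raise an AssertionError on mismatch; a small illustrative sketch (``check_exact`` is a standard keyword of these functions):

>>> import pandas as pd
>>> import pandas.testing as tm
>>> tm.assert_series_equal(pd.Series([1, 2]), pd.Series([1, 2]))   # passes silently
>>> tm.assert_frame_equal(
...     pd.DataFrame({"a": [1.0]}),
...     pd.DataFrame({"a": [1.0 + 1e-9]}),
...     check_exact=False,          # compare within the default tolerance
... )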
diff --git a/contrib/python/pandas/py3/pandas/tseries/__init__.py b/contrib/python/pandas/py3/pandas/tseries/__init__.py
deleted file mode 100644
index dd4ce02b194..00000000000
--- a/contrib/python/pandas/py3/pandas/tseries/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- # import modules that have public classes/functions:
- from pandas.tseries import (
- frequencies,
- offsets,
- )
-
- # and mark only those modules as public
- __all__ = ["frequencies", "offsets"]
diff --git a/contrib/python/pandas/py3/pandas/tseries/api.py b/contrib/python/pandas/py3/pandas/tseries/api.py
deleted file mode 100644
index 9fdf95d09fe..00000000000
--- a/contrib/python/pandas/py3/pandas/tseries/api.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""
-Timeseries API
-"""
-
-from pandas.tseries import offsets
-from pandas.tseries.frequencies import infer_freq
-
-__all__ = ["infer_freq", "offsets"]
diff --git a/contrib/python/pandas/py3/pandas/tseries/frequencies.py b/contrib/python/pandas/py3/pandas/tseries/frequencies.py
deleted file mode 100644
index 34e0c111360..00000000000
--- a/contrib/python/pandas/py3/pandas/tseries/frequencies.py
+++ /dev/null
@@ -1,619 +0,0 @@
-from __future__ import annotations
-
-import numpy as np
-
-from pandas._libs.algos import unique_deltas
-from pandas._libs.tslibs import (
- Timestamp,
- get_unit_from_dtype,
- periods_per_day,
- tz_convert_from_utc,
-)
-from pandas._libs.tslibs.ccalendar import (
- DAYS,
- MONTH_ALIASES,
- MONTH_NUMBERS,
- MONTHS,
- int_to_weekday,
-)
-from pandas._libs.tslibs.fields import (
- build_field_sarray,
- month_position_check,
-)
-from pandas._libs.tslibs.offsets import (
- DateOffset,
- Day,
- to_offset,
-)
-from pandas._libs.tslibs.parsing import get_rule_month
-from pandas._typing import npt
-from pandas.util._decorators import cache_readonly
-
-from pandas.core.dtypes.common import (
- is_datetime64_dtype,
- is_numeric_dtype,
- is_period_dtype,
- is_timedelta64_dtype,
-)
-from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
-)
-
-from pandas.core.algorithms import unique
-
-# ---------------------------------------------------------------------
-# Offset names ("time rules") and related functions
-
-_offset_to_period_map = {
- "WEEKDAY": "D",
- "EOM": "M",
- "BM": "M",
- "BQS": "Q",
- "QS": "Q",
- "BQ": "Q",
- "BA": "A",
- "AS": "A",
- "BAS": "A",
- "MS": "M",
- "D": "D",
- "C": "C",
- "B": "B",
- "T": "T",
- "S": "S",
- "L": "L",
- "U": "U",
- "N": "N",
- "H": "H",
- "Q": "Q",
- "A": "A",
- "W": "W",
- "M": "M",
- "Y": "A",
- "BY": "A",
- "YS": "A",
- "BYS": "A",
-}
-
-_need_suffix = ["QS", "BQ", "BQS", "YS", "AS", "BY", "BA", "BYS", "BAS"]
-
-for _prefix in _need_suffix:
- for _m in MONTHS:
- key = f"{_prefix}-{_m}"
- _offset_to_period_map[key] = _offset_to_period_map[_prefix]
-
-for _prefix in ["A", "Q"]:
- for _m in MONTHS:
- _alias = f"{_prefix}-{_m}"
- _offset_to_period_map[_alias] = _alias
-
-for _d in DAYS:
- _offset_to_period_map[f"W-{_d}"] = f"W-{_d}"
-
-
-def get_period_alias(offset_str: str) -> str | None:
- """
- Alias to closest period strings BQ->Q etc.
- """
- return _offset_to_period_map.get(offset_str, None)
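A short sketch of how the mapping above is consulted; the input aliases are chosen for illustration:

>>> from pandas.tseries.frequencies import get_period_alias
>>> get_period_alias("BQ")            # business quarter end -> quarterly period
'Q'
>>> get_period_alias("W-SUN")         # weekly anchors map to themselves
'W-SUN'
>>> get_period_alias("5min") is None  # unknown keys fall through to None
True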
-
-
-# ---------------------------------------------------------------------
-# Period codes
-
-
-def infer_freq(index) -> str | None:
- """
- Infer the most likely frequency given the input index.
-
- Parameters
- ----------
- index : DatetimeIndex or TimedeltaIndex
- If passed a Series, will use the values of the Series (NOT THE INDEX).
-
- Returns
- -------
- str or None
- None if no discernible frequency.
-
- Raises
- ------
- TypeError
- If the index is not datetime-like.
- ValueError
- If there are fewer than three values.
-
- Examples
- --------
- >>> idx = pd.date_range(start='2020/12/01', end='2020/12/30', periods=30)
- >>> pd.infer_freq(idx)
- 'D'
- """
- from pandas.core.api import (
- DatetimeIndex,
- Index,
- )
-
- if isinstance(index, ABCSeries):
- values = index._values
- if not (
- is_datetime64_dtype(values)
- or is_timedelta64_dtype(values)
- or values.dtype == object
- ):
- raise TypeError(
- "cannot infer freq from a non-convertible dtype "
- f"on a Series of {index.dtype}"
- )
- index = values
-
- inferer: _FrequencyInferer
-
- if not hasattr(index, "dtype"):
- pass
- elif is_period_dtype(index.dtype):
- raise TypeError(
- "PeriodIndex given. Check the `freq` attribute "
- "instead of using infer_freq."
- )
- elif is_timedelta64_dtype(index.dtype):
- # Allow TimedeltaIndex and TimedeltaArray
- inferer = _TimedeltaFrequencyInferer(index)
- return inferer.get_freq()
-
- if isinstance(index, Index) and not isinstance(index, DatetimeIndex):
- if is_numeric_dtype(index):
- raise TypeError(
- f"cannot infer freq from a non-convertible index of dtype {index.dtype}"
- )
- index = index._values
-
- if not isinstance(index, DatetimeIndex):
- index = DatetimeIndex(index)
-
- inferer = _FrequencyInferer(index)
- return inferer.get_freq()
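Beyond the daily example in the docstring, the intraday path in ``_FrequencyInferer.get_freq`` below yields count-prefixed aliases. A minimal sketch (outputs follow from the code, not taken from the source):

>>> import pandas as pd
>>> pd.infer_freq(pd.date_range("2021-01-01", periods=5, freq="15T"))
'15T'
>>> pd.infer_freq(pd.DatetimeIndex(["2021-01-04", "2021-01-05", "2021-01-06", "2021-01-07"]))
'D'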
-
-
-class _FrequencyInferer:
- """
- Not sure if I can avoid the state machine here
- """
-
- def __init__(self, index) -> None:
- self.index = index
- self.i8values = index.asi8
-
- # For get_unit_from_dtype we need the dtype of the underlying ndarray,
- # which for tz-aware is not the same as index.dtype
- if isinstance(index, ABCIndex):
- # error: Item "ndarray[Any, Any]" of "Union[ExtensionArray,
- # ndarray[Any, Any]]" has no attribute "_ndarray"
- self._creso = get_unit_from_dtype(
- index._data._ndarray.dtype # type: ignore[union-attr]
- )
- else:
- # otherwise we have DTA/TDA
- self._creso = get_unit_from_dtype(index._ndarray.dtype)
-
- # This moves the values, which are implicitly in UTC, to the
- # timezone so they are in local time
- if hasattr(index, "tz"):
- if index.tz is not None:
- self.i8values = tz_convert_from_utc(
- self.i8values, index.tz, reso=self._creso
- )
-
- if len(index) < 3:
- raise ValueError("Need at least 3 dates to infer frequency")
-
- self.is_monotonic = (
- self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing
- )
-
- @cache_readonly
- def deltas(self) -> npt.NDArray[np.int64]:
- return unique_deltas(self.i8values)
-
- @cache_readonly
- def deltas_asi8(self) -> npt.NDArray[np.int64]:
- # NB: we cannot use self.i8values here because we may have converted
- # the tz in __init__
- return unique_deltas(self.index.asi8)
-
- @cache_readonly
- def is_unique(self) -> bool:
- return len(self.deltas) == 1
-
- @cache_readonly
- def is_unique_asi8(self) -> bool:
- return len(self.deltas_asi8) == 1
-
- def get_freq(self) -> str | None:
- """
- Find the appropriate frequency string to describe the inferred
- frequency of self.i8values
-
- Returns
- -------
- str or None
- """
- if not self.is_monotonic or not self.index._is_unique:
- return None
-
- delta = self.deltas[0]
- ppd = periods_per_day(self._creso)
- if delta and _is_multiple(delta, ppd):
- return self._infer_daily_rule()
-
- # Business hourly, maybe. 17: one day / 65: one weekend
- if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
- return "BH"
-
- # Possibly intraday frequency. Here we use the
- # original .asi8 values as the modified values
- # will not work around DST transitions. See #8772
- if not self.is_unique_asi8:
- return None
-
- delta = self.deltas_asi8[0]
- pph = ppd // 24
- ppm = pph // 60
- pps = ppm // 60
- if _is_multiple(delta, pph):
- # Hours
- return _maybe_add_count("H", delta / pph)
- elif _is_multiple(delta, ppm):
- # Minutes
- return _maybe_add_count("T", delta / ppm)
- elif _is_multiple(delta, pps):
- # Seconds
- return _maybe_add_count("S", delta / pps)
- elif _is_multiple(delta, (pps // 1000)):
- # Milliseconds
- return _maybe_add_count("L", delta / (pps // 1000))
- elif _is_multiple(delta, (pps // 1_000_000)):
- # Microseconds
- return _maybe_add_count("U", delta / (pps // 1_000_000))
- else:
- # Nanoseconds
- return _maybe_add_count("N", delta)
-
- @cache_readonly
- def day_deltas(self) -> list[int]:
- ppd = periods_per_day(self._creso)
- return [x / ppd for x in self.deltas]
-
- @cache_readonly
- def hour_deltas(self) -> list[int]:
- pph = periods_per_day(self._creso) // 24
- return [x / pph for x in self.deltas]
-
- @cache_readonly
- def fields(self) -> np.ndarray: # structured array of fields
- return build_field_sarray(self.i8values, reso=self._creso)
-
- @cache_readonly
- def rep_stamp(self) -> Timestamp:
- return Timestamp(self.i8values[0])
-
- def month_position_check(self) -> str | None:
- return month_position_check(self.fields, self.index.dayofweek)
-
- @cache_readonly
- def mdiffs(self) -> npt.NDArray[np.int64]:
- nmonths = self.fields["Y"] * 12 + self.fields["M"]
- return unique_deltas(nmonths.astype("i8"))
-
- @cache_readonly
- def ydiffs(self) -> npt.NDArray[np.int64]:
- return unique_deltas(self.fields["Y"].astype("i8"))
-
- def _infer_daily_rule(self) -> str | None:
- annual_rule = self._get_annual_rule()
- if annual_rule:
- nyears = self.ydiffs[0]
- month = MONTH_ALIASES[self.rep_stamp.month]
- alias = f"{annual_rule}-{month}"
- return _maybe_add_count(alias, nyears)
-
- quarterly_rule = self._get_quarterly_rule()
- if quarterly_rule:
- nquarters = self.mdiffs[0] / 3
- mod_dict = {0: 12, 2: 11, 1: 10}
- month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
- alias = f"{quarterly_rule}-{month}"
- return _maybe_add_count(alias, nquarters)
-
- monthly_rule = self._get_monthly_rule()
- if monthly_rule:
- return _maybe_add_count(monthly_rule, self.mdiffs[0])
-
- if self.is_unique:
- return self._get_daily_rule()
-
- if self._is_business_daily():
- return "B"
-
- wom_rule = self._get_wom_rule()
- if wom_rule:
- return wom_rule
-
- return None
-
- def _get_daily_rule(self) -> str | None:
- ppd = periods_per_day(self._creso)
- days = self.deltas[0] / ppd
- if days % 7 == 0:
- # Weekly
- wd = int_to_weekday[self.rep_stamp.weekday()]
- alias = f"W-{wd}"
- return _maybe_add_count(alias, days / 7)
- else:
- return _maybe_add_count("D", days)
-
- def _get_annual_rule(self) -> str | None:
- if len(self.ydiffs) > 1:
- return None
-
- if len(unique(self.fields["M"])) > 1:
- return None
-
- pos_check = self.month_position_check()
-
- if pos_check is None:
- return None
- else:
- return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check)
-
- def _get_quarterly_rule(self) -> str | None:
- if len(self.mdiffs) > 1:
- return None
-
- if not self.mdiffs[0] % 3 == 0:
- return None
-
- pos_check = self.month_position_check()
-
- if pos_check is None:
- return None
- else:
- return {"cs": "QS", "bs": "BQS", "ce": "Q", "be": "BQ"}.get(pos_check)
-
- def _get_monthly_rule(self) -> str | None:
- if len(self.mdiffs) > 1:
- return None
- pos_check = self.month_position_check()
-
- if pos_check is None:
- return None
- else:
- return {"cs": "MS", "bs": "BMS", "ce": "M", "be": "BM"}.get(pos_check)
-
- def _is_business_daily(self) -> bool:
- # quick check: cannot be business daily
- if self.day_deltas != [1, 3]:
- return False
-
- # probably business daily, but need to confirm
- first_weekday = self.index[0].weekday()
- shifts = np.diff(self.i8values)
- ppd = periods_per_day(self._creso)
- shifts = np.floor_divide(shifts, ppd)
- weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
-
- return bool(
- np.all(
- ((weekdays == 0) & (shifts == 3))
- | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))
- )
- )
-
- def _get_wom_rule(self) -> str | None:
- weekdays = unique(self.index.weekday)
- if len(weekdays) > 1:
- return None
-
- week_of_months = unique((self.index.day - 1) // 7)
- # Only attempt to infer up to WOM-4. See #9425
- week_of_months = week_of_months[week_of_months < 4]
- if len(week_of_months) == 0 or len(week_of_months) > 1:
- return None
-
- # get which week
- week = week_of_months[0] + 1
- wd = int_to_weekday[weekdays[0]]
-
- return f"WOM-{week}{wd}"
-
-
-class _TimedeltaFrequencyInferer(_FrequencyInferer):
- def _infer_daily_rule(self):
- if self.is_unique:
- return self._get_daily_rule()
-
-
-def _is_multiple(us, mult: int) -> bool:
- return us % mult == 0
-
-
-def _maybe_add_count(base: str, count: float) -> str:
- if count != 1:
- assert count == int(count)
- count = int(count)
- return f"{count}{base}"
- else:
- return base
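These are private helpers of this module; a quick sketch of the count-suffixing behaviour that ``get_freq`` relies on (results follow directly from the code above):

>>> _is_multiple(3600, 60)
True
>>> _maybe_add_count("H", 1)      # a count of 1 is omitted
'H'
>>> _maybe_add_count("T", 15.0)   # integral float counts are cast to int
'15T'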
-
-
-# ----------------------------------------------------------------------
-# Frequency comparison
-
-
-def is_subperiod(source, target) -> bool:
- """
- Returns True if downsampling is possible between source and target
- frequencies
-
- Parameters
- ----------
- source : str or DateOffset
- Frequency converting from
- target : str or DateOffset
- Frequency converting to
-
- Returns
- -------
- bool
- """
-
- if target is None or source is None:
- return False
- source = _maybe_coerce_freq(source)
- target = _maybe_coerce_freq(target)
-
- if _is_annual(target):
- if _is_quarterly(source):
- return _quarter_months_conform(
- get_rule_month(source), get_rule_month(target)
- )
- return source in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"}
- elif _is_quarterly(target):
- return source in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"}
- elif _is_monthly(target):
- return source in {"D", "C", "B", "H", "T", "S", "L", "U", "N"}
- elif _is_weekly(target):
- return source in {target, "D", "C", "B", "H", "T", "S", "L", "U", "N"}
- elif target == "B":
- return source in {"B", "H", "T", "S", "L", "U", "N"}
- elif target == "C":
- return source in {"C", "H", "T", "S", "L", "U", "N"}
- elif target == "D":
- return source in {"D", "H", "T", "S", "L", "U", "N"}
- elif target == "H":
- return source in {"H", "T", "S", "L", "U", "N"}
- elif target == "T":
- return source in {"T", "S", "L", "U", "N"}
- elif target == "S":
- return source in {"S", "L", "U", "N"}
- elif target == "L":
- return source in {"L", "U", "N"}
- elif target == "U":
- return source in {"U", "N"}
- elif target == "N":
- return source in {"N"}
- else:
- return False
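A hedged sketch of typical downsampling checks (aliases chosen for illustration):

>>> from pandas.tseries.frequencies import is_subperiod
>>> is_subperiod("D", "M")          # daily data can be aggregated to monthly
True
>>> is_subperiod("M", "D")          # but not the other way around
False
>>> is_subperiod("Q-DEC", "A-DEC")  # quarterly anchor conforms to the annual anchor
True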
-
-
-def is_superperiod(source, target) -> bool:
- """
- Returns True if upsampling is possible between source and target
- frequencies
-
- Parameters
- ----------
- source : str or DateOffset
- Frequency converting from
- target : str or DateOffset
- Frequency converting to
-
- Returns
- -------
- bool
- """
- if target is None or source is None:
- return False
- source = _maybe_coerce_freq(source)
- target = _maybe_coerce_freq(target)
-
- if _is_annual(source):
- if _is_annual(target):
- return get_rule_month(source) == get_rule_month(target)
-
- if _is_quarterly(target):
- smonth = get_rule_month(source)
- tmonth = get_rule_month(target)
- return _quarter_months_conform(smonth, tmonth)
- return target in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"}
- elif _is_quarterly(source):
- return target in {"D", "C", "B", "M", "H", "T", "S", "L", "U", "N"}
- elif _is_monthly(source):
- return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"}
- elif _is_weekly(source):
- return target in {source, "D", "C", "B", "H", "T", "S", "L", "U", "N"}
- elif source == "B":
- return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"}
- elif source == "C":
- return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"}
- elif source == "D":
- return target in {"D", "C", "B", "H", "T", "S", "L", "U", "N"}
- elif source == "H":
- return target in {"H", "T", "S", "L", "U", "N"}
- elif source == "T":
- return target in {"T", "S", "L", "U", "N"}
- elif source == "S":
- return target in {"S", "L", "U", "N"}
- elif source == "L":
- return target in {"L", "U", "N"}
- elif source == "U":
- return target in {"U", "N"}
- elif source == "N":
- return target in {"N"}
- else:
- return False
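And the mirror-image upsampling checks:

>>> from pandas.tseries.frequencies import is_superperiod
>>> is_superperiod("A-DEC", "Q-DEC")   # annual data can be expanded to quarterly
True
>>> is_superperiod("T", "H")           # minutes cannot be upsampled to hours
False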
-
-
-def _maybe_coerce_freq(code) -> str:
- """we might need to coerce a code to a rule_code
- and uppercase it
-
- Parameters
- ----------
- code : str or DateOffset
- Frequency converting from
-
- Returns
- -------
- str
- """
- assert code is not None
- if isinstance(code, DateOffset):
- code = code.rule_code
- return code.upper()
-
-
-def _quarter_months_conform(source: str, target: str) -> bool:
- snum = MONTH_NUMBERS[source]
- tnum = MONTH_NUMBERS[target]
- return snum % 3 == tnum % 3
-
-
-def _is_annual(rule: str) -> bool:
- rule = rule.upper()
- return rule == "A" or rule.startswith("A-")
-
-
-def _is_quarterly(rule: str) -> bool:
- rule = rule.upper()
- return rule == "Q" or rule.startswith("Q-") or rule.startswith("BQ")
-
-
-def _is_monthly(rule: str) -> bool:
- rule = rule.upper()
- return rule in ("M", "BM")
-
-
-def _is_weekly(rule: str) -> bool:
- rule = rule.upper()
- return rule == "W" or rule.startswith("W-")
-
-
-__all__ = [
- "Day",
- "get_period_alias",
- "infer_freq",
- "is_subperiod",
- "is_superperiod",
- "to_offset",
-]
diff --git a/contrib/python/pandas/py3/pandas/tseries/holiday.py b/contrib/python/pandas/py3/pandas/tseries/holiday.py
deleted file mode 100644
index 9f1e166cd6a..00000000000
--- a/contrib/python/pandas/py3/pandas/tseries/holiday.py
+++ /dev/null
@@ -1,609 +0,0 @@
-from __future__ import annotations
-
-from datetime import (
- datetime,
- timedelta,
-)
-import warnings
-
-from dateutil.relativedelta import (
- FR,
- MO,
- SA,
- SU,
- TH,
- TU,
- WE,
-)
-import numpy as np
-
-from pandas.errors import PerformanceWarning
-
-from pandas import (
- DateOffset,
- DatetimeIndex,
- Series,
- Timestamp,
- concat,
- date_range,
-)
-
-from pandas.tseries.offsets import (
- Day,
- Easter,
-)
-
-
-def next_monday(dt: datetime) -> datetime:
- """
- If holiday falls on Saturday, use following Monday instead;
- if holiday falls on Sunday, use Monday instead
- """
- if dt.weekday() == 5:
- return dt + timedelta(2)
- elif dt.weekday() == 6:
- return dt + timedelta(1)
- return dt
-
-
-def next_monday_or_tuesday(dt: datetime) -> datetime:
- """
- For second holiday of two adjacent ones!
- If holiday falls on Saturday, use following Monday instead;
- if holiday falls on Sunday or Monday, use following Tuesday instead
- (because Monday is already taken by adjacent holiday on the day before)
- """
- dow = dt.weekday()
- if dow in (5, 6):
- return dt + timedelta(2)
- if dow == 0:
- return dt + timedelta(1)
- return dt
-
-
-def previous_friday(dt: datetime) -> datetime:
- """
- If holiday falls on Saturday or Sunday, use previous Friday instead.
- """
- if dt.weekday() == 5:
- return dt - timedelta(1)
- elif dt.weekday() == 6:
- return dt - timedelta(2)
- return dt
-
-
-def sunday_to_monday(dt: datetime) -> datetime:
- """
- If holiday falls on Sunday, use day thereafter (Monday) instead.
- """
- if dt.weekday() == 6:
- return dt + timedelta(1)
- return dt
-
-
-def weekend_to_monday(dt: datetime) -> datetime:
- """
- If holiday falls on Sunday or Saturday,
- use day thereafter (Monday) instead.
- Needed for holidays such as Christmas observation in Europe
- """
- if dt.weekday() == 6:
- return dt + timedelta(1)
- elif dt.weekday() == 5:
- return dt + timedelta(2)
- return dt
-
-
-def nearest_workday(dt: datetime) -> datetime:
- """
- If holiday falls on Saturday, use day before (Friday) instead;
- if holiday falls on Sunday, use day thereafter (Monday) instead.
- """
- if dt.weekday() == 5:
- return dt - timedelta(1)
- elif dt.weekday() == 6:
- return dt + timedelta(1)
- return dt
-
-
-def next_workday(dt: datetime) -> datetime:
- """
- Returns the next weekday; used for observances.
- """
- dt += timedelta(days=1)
- while dt.weekday() > 4:
- # Mon-Fri are 0-4
- dt += timedelta(days=1)
- return dt
-
-
-def previous_workday(dt: datetime) -> datetime:
- """
- Returns the previous weekday; used for observances.
- """
- dt -= timedelta(days=1)
- while dt.weekday() > 4:
- # Mon-Fri are 0-4
- dt -= timedelta(days=1)
- return dt
-
-
-def before_nearest_workday(dt: datetime) -> datetime:
- """
- Returns the workday immediately preceding the nearest workday.
- """
- return previous_workday(nearest_workday(dt))
-
-
-def after_nearest_workday(dt: datetime) -> datetime:
- """
- Returns the next workday after the nearest workday;
- needed for Boxing Day or multiple holidays in a series.
- """
- return next_workday(nearest_workday(dt))
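A short sketch of the observance helpers above; the 2021 dates are picked purely for illustration:

>>> from datetime import datetime
>>> from pandas.tseries.holiday import nearest_workday, next_monday, weekend_to_monday
>>> nearest_workday(datetime(2021, 7, 4))      # Sunday -> following Monday
datetime.datetime(2021, 7, 5, 0, 0)
>>> next_monday(datetime(2021, 12, 25))        # Saturday -> following Monday
datetime.datetime(2021, 12, 27, 0, 0)
>>> weekend_to_monday(datetime(2021, 12, 26))  # Sunday -> following Monday
datetime.datetime(2021, 12, 27, 0, 0)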
-
-
-class Holiday:
- """
- Class that defines a holiday with start/end dates and rules
- for observance.
- """
-
- def __init__(
- self,
- name,
- year=None,
- month=None,
- day=None,
- offset=None,
- observance=None,
- start_date=None,
- end_date=None,
- days_of_week=None,
- ) -> None:
- """
- Parameters
- ----------
- name : str
- Name of the holiday.
- offset : pandas.tseries.offsets object or list of such offsets
- Offset(s) applied to the anchor date to compute the holiday.
- observance : callable
- Function that maps the anchor date (a pandas Timestamp) to the
- observed date.
- days_of_week : tuple, optional
- Tuple of weekdays on which the holiday is valid, e.g. (0, 1, 2, 3)
- for Monday through Thursday (Monday=0, ..., Sunday=6).
-
- Examples
- --------
- >>> from dateutil.relativedelta import MO
-
- >>> USMemorialDay = pd.tseries.holiday.Holiday(
- ... "Memorial Day", month=5, day=31, offset=pd.DateOffset(weekday=MO(-1))
- ... )
- >>> USMemorialDay
- Holiday: Memorial Day (month=5, day=31, offset=<DateOffset: weekday=MO(-1)>)
-
- >>> USLaborDay = pd.tseries.holiday.Holiday(
- ... "Labor Day", month=9, day=1, offset=pd.DateOffset(weekday=MO(1))
- ... )
- >>> USLaborDay
- Holiday: Labor Day (month=9, day=1, offset=<DateOffset: weekday=MO(+1)>)
-
- >>> July3rd = pd.tseries.holiday.Holiday("July 3rd", month=7, day=3)
- >>> July3rd
- Holiday: July 3rd (month=7, day=3, )
-
- >>> NewYears = pd.tseries.holiday.Holiday(
- ... "New Years Day", month=1, day=1,
- ... observance=pd.tseries.holiday.nearest_workday
- ... )
- >>> NewYears # doctest: +SKIP
- Holiday: New Years Day (
- month=1, day=1, observance=<function nearest_workday at 0x66545e9bc440>
- )
-
- >>> July3rd = pd.tseries.holiday.Holiday(
- ... "July 3rd", month=7, day=3,
- ... days_of_week=(0, 1, 2, 3)
- ... )
- >>> July3rd
- Holiday: July 3rd (month=7, day=3, )
- """
- if offset is not None and observance is not None:
- raise NotImplementedError("Cannot use both offset and observance.")
-
- self.name = name
- self.year = year
- self.month = month
- self.day = day
- self.offset = offset
- self.start_date = (
- Timestamp(start_date) if start_date is not None else start_date
- )
- self.end_date = Timestamp(end_date) if end_date is not None else end_date
- self.observance = observance
- assert days_of_week is None or type(days_of_week) == tuple
- self.days_of_week = days_of_week
-
- def __repr__(self) -> str:
- info = ""
- if self.year is not None:
- info += f"year={self.year}, "
- info += f"month={self.month}, day={self.day}, "
-
- if self.offset is not None:
- info += f"offset={self.offset}"
-
- if self.observance is not None:
- info += f"observance={self.observance}"
-
- repr = f"Holiday: {self.name} ({info})"
- return repr
-
- def dates(self, start_date, end_date, return_name: bool = False):
- """
- Calculate holidays observed between start date and end date
-
- Parameters
- ----------
- start_date : starting date, datetime-like, optional
- end_date : ending date, datetime-like, optional
- return_name : bool, optional, default=False
- If True, return a series that has dates and holiday names.
- False will only return dates.
- """
- start_date = Timestamp(start_date)
- end_date = Timestamp(end_date)
-
- filter_start_date = start_date
- filter_end_date = end_date
-
- if self.year is not None:
- dt = Timestamp(datetime(self.year, self.month, self.day))
- if return_name:
- return Series(self.name, index=[dt])
- else:
- return [dt]
-
- dates = self._reference_dates(start_date, end_date)
- holiday_dates = self._apply_rule(dates)
- if self.days_of_week is not None:
- holiday_dates = holiday_dates[
- np.in1d(holiday_dates.dayofweek, self.days_of_week)
- ]
-
- if self.start_date is not None:
- filter_start_date = max(
- self.start_date.tz_localize(filter_start_date.tz), filter_start_date
- )
- if self.end_date is not None:
- filter_end_date = min(
- self.end_date.tz_localize(filter_end_date.tz), filter_end_date
- )
- holiday_dates = holiday_dates[
- (holiday_dates >= filter_start_date) & (holiday_dates <= filter_end_date)
- ]
- if return_name:
- return Series(self.name, index=holiday_dates)
- return holiday_dates
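A hedged sketch of ``Holiday.dates``; the "July 4th" rule is made up for illustration and the exact DatetimeIndex repr may vary by version:

>>> from pandas.tseries.holiday import Holiday, nearest_workday
>>> july4 = Holiday("July 4th", month=7, day=4, observance=nearest_workday)
>>> july4.dates("2020-01-01", "2021-12-31")
DatetimeIndex(['2020-07-03', '2021-07-05'], dtype='datetime64[ns]', freq=None)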
-
- def _reference_dates(self, start_date, end_date):
- """
- Get reference dates for the holiday.
-
- Return reference dates for the holiday, extending from the year
- prior to start_date through the year following end_date. This ensures
- that any offsets to be applied will yield the holidays within
- the passed-in dates.
- """
- if self.start_date is not None:
- start_date = self.start_date.tz_localize(start_date.tz)
-
- if self.end_date is not None:
- end_date = self.end_date.tz_localize(start_date.tz)
-
- year_offset = DateOffset(years=1)
- reference_start_date = Timestamp(
- datetime(start_date.year - 1, self.month, self.day)
- )
-
- reference_end_date = Timestamp(
- datetime(end_date.year + 1, self.month, self.day)
- )
- # Don't process unnecessary holidays
- dates = date_range(
- start=reference_start_date,
- end=reference_end_date,
- freq=year_offset,
- tz=start_date.tz,
- )
-
- return dates
-
- def _apply_rule(self, dates):
- """
- Apply the given offset/observance to a DatetimeIndex of dates.
-
- Parameters
- ----------
- dates : DatetimeIndex
- Dates to apply the given offset/observance rule
-
- Returns
- -------
- Dates with rules applied
- """
- if dates.empty:
- return DatetimeIndex([])
-
- if self.observance is not None:
- return dates.map(lambda d: self.observance(d))
-
- if self.offset is not None:
- if not isinstance(self.offset, list):
- offsets = [self.offset]
- else:
- offsets = self.offset
- for offset in offsets:
- # if we are adding a non-vectorized value
- # ignore the PerformanceWarnings:
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", PerformanceWarning)
- dates += offset
- return dates
-
-
-holiday_calendars = {}
-
-
-def register(cls) -> None:
- try:
- name = cls.name
- except AttributeError:
- name = cls.__name__
- holiday_calendars[name] = cls
-
-
-def get_calendar(name):
- """
- Return an instance of a calendar based on its name.
-
- Parameters
- ----------
- name : str
- Calendar name to return an instance of
- """
- return holiday_calendars[name]()
-
-
-class HolidayCalendarMetaClass(type):
- def __new__(cls, clsname, bases, attrs):
- calendar_class = super().__new__(cls, clsname, bases, attrs)
- register(calendar_class)
- return calendar_class
-
-
-class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass):
- """
- Abstract interface to create holidays following certain rules.
- """
-
- rules: list[Holiday] = []
- start_date = Timestamp(datetime(1970, 1, 1))
- end_date = Timestamp(datetime(2200, 12, 31))
- _cache = None
-
- def __init__(self, name=None, rules=None) -> None:
- """
- Initializes the holiday calendar with a given set of rules. Normally
- classes just have the rules defined within them.
-
- Parameters
- ----------
- name : str
- Name of the holiday calendar, defaults to class name
- rules : array of Holiday objects
- A set of rules used to create the holidays.
- """
- super().__init__()
- if name is None:
- name = type(self).__name__
- self.name = name
-
- if rules is not None:
- self.rules = rules
-
- def rule_from_name(self, name):
- for rule in self.rules:
- if rule.name == name:
- return rule
-
- return None
-
- def holidays(self, start=None, end=None, return_name: bool = False):
- """
- Returns the holidays between start_date and end_date
-
- Parameters
- ----------
- start : starting date, datetime-like, optional
- end : ending date, datetime-like, optional
- return_name : bool, optional
- If True, return a series that has dates and holiday names.
- False will only return a DatetimeIndex of dates.
-
- Returns
- -------
- DatetimeIndex of holidays
- """
- if self.rules is None:
- raise Exception(
- f"Holiday Calendar {self.name} does not have any rules specified"
- )
-
- if start is None:
- start = AbstractHolidayCalendar.start_date
-
- if end is None:
- end = AbstractHolidayCalendar.end_date
-
- start = Timestamp(start)
- end = Timestamp(end)
-
- # If we don't have a cache or the dates are outside the prior cache, we
- # get them again
- if self._cache is None or start < self._cache[0] or end > self._cache[1]:
- pre_holidays = [
- rule.dates(start, end, return_name=True) for rule in self.rules
- ]
- if pre_holidays:
- holidays = concat(pre_holidays)
- else:
- holidays = Series(index=DatetimeIndex([]), dtype=object)
-
- self._cache = (start, end, holidays.sort_index())
-
- holidays = self._cache[2]
- holidays = holidays[start:end]
-
- if return_name:
- return holidays
- else:
- return holidays.index
-
- @staticmethod
- def merge_class(base, other):
- """
- Merge holiday calendars together. The base calendar
- will take precedence over other. The merge will be done
- based on each holiday's name.
-
- Parameters
- ----------
- base : AbstractHolidayCalendar
- instance/subclass or array of Holiday objects
- other : AbstractHolidayCalendar
- instance/subclass or array of Holiday objects
- """
- try:
- other = other.rules
- except AttributeError:
- pass
-
- if not isinstance(other, list):
- other = [other]
- other_holidays = {holiday.name: holiday for holiday in other}
-
- try:
- base = base.rules
- except AttributeError:
- pass
-
- if not isinstance(base, list):
- base = [base]
- base_holidays = {holiday.name: holiday for holiday in base}
-
- other_holidays.update(base_holidays)
- return list(other_holidays.values())
-
- def merge(self, other, inplace: bool = False):
- """
- Merge holiday calendars together. The caller's class
- rules take precedence. The merge will be done
- based on each holiday's name.
-
- Parameters
- ----------
- other : holiday calendar
- inplace : bool (default=False)
- If True, set the calendar's rules to the merged holidays; else return the merged list of Holiday objects.
- """
- holidays = self.merge_class(self, other)
- if inplace:
- self.rules = holidays
- else:
- return holidays
-
-
-USMemorialDay = Holiday(
- "Memorial Day", month=5, day=31, offset=DateOffset(weekday=MO(-1))
-)
-USLaborDay = Holiday("Labor Day", month=9, day=1, offset=DateOffset(weekday=MO(1)))
-USColumbusDay = Holiday(
- "Columbus Day", month=10, day=1, offset=DateOffset(weekday=MO(2))
-)
-USThanksgivingDay = Holiday(
- "Thanksgiving Day", month=11, day=1, offset=DateOffset(weekday=TH(4))
-)
-USMartinLutherKingJr = Holiday(
- "Birthday of Martin Luther King, Jr.",
- start_date=datetime(1986, 1, 1),
- month=1,
- day=1,
- offset=DateOffset(weekday=MO(3)),
-)
-USPresidentsDay = Holiday(
- "Washington’s Birthday", month=2, day=1, offset=DateOffset(weekday=MO(3))
-)
-GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)])
-
-EasterMonday = Holiday("Easter Monday", month=1, day=1, offset=[Easter(), Day(1)])
-
-
-class USFederalHolidayCalendar(AbstractHolidayCalendar):
- """
- US Federal Government Holiday Calendar based on rules specified by:
- https://www.opm.gov/policy-data-oversight/pay-leave/federal-holidays/
- """
-
- rules = [
- Holiday("New Year's Day", month=1, day=1, observance=nearest_workday),
- USMartinLutherKingJr,
- USPresidentsDay,
- USMemorialDay,
- Holiday(
- "Juneteenth National Independence Day",
- month=6,
- day=19,
- start_date="2021-06-18",
- observance=nearest_workday,
- ),
- Holiday("Independence Day", month=7, day=4, observance=nearest_workday),
- USLaborDay,
- USColumbusDay,
- Holiday("Veterans Day", month=11, day=11, observance=nearest_workday),
- USThanksgivingDay,
- Holiday("Christmas Day", month=12, day=25, observance=nearest_workday),
- ]
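A minimal usage sketch of the calendar defined above (the exact repr may vary by version):

>>> from pandas.tseries.holiday import USFederalHolidayCalendar
>>> cal = USFederalHolidayCalendar()
>>> cal.holidays(start="2023-11-01", end="2023-12-31")
DatetimeIndex(['2023-11-10', '2023-11-23', '2023-12-25'], dtype='datetime64[ns]', freq=None)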
-
-
-def HolidayCalendarFactory(name, base, other, base_class=AbstractHolidayCalendar):
- rules = AbstractHolidayCalendar.merge_class(base, other)
- calendar_class = type(name, (base_class,), {"rules": rules, "name": name})
- return calendar_class
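A sketch of combining calendars with the factory; ``ExampleCalendar`` and its "Company Day" rule are hypothetical, and the rule order follows the ``merge_class`` implementation above:

>>> from pandas.tseries.holiday import (
...     AbstractHolidayCalendar, Holiday, HolidayCalendarFactory, USLaborDay
... )
>>> class ExampleCalendar(AbstractHolidayCalendar):
...     rules = [Holiday("Company Day", month=3, day=1)]
...
>>> NewCalendar = HolidayCalendarFactory("NewCalendar", ExampleCalendar, USLaborDay)
>>> [rule.name for rule in NewCalendar.rules]
['Labor Day', 'Company Day']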
-
-
-__all__ = [
- "after_nearest_workday",
- "before_nearest_workday",
- "FR",
- "get_calendar",
- "HolidayCalendarFactory",
- "MO",
- "nearest_workday",
- "next_monday",
- "next_monday_or_tuesday",
- "next_workday",
- "previous_friday",
- "previous_workday",
- "register",
- "SA",
- "SU",
- "sunday_to_monday",
- "TH",
- "TU",
- "WE",
- "weekend_to_monday",
-]
diff --git a/contrib/python/pandas/py3/pandas/tseries/offsets.py b/contrib/python/pandas/py3/pandas/tseries/offsets.py
deleted file mode 100644
index 169c9cc18a7..00000000000
--- a/contrib/python/pandas/py3/pandas/tseries/offsets.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from __future__ import annotations
-
-from pandas._libs.tslibs.offsets import (
- FY5253,
- BaseOffset,
- BDay,
- BMonthBegin,
- BMonthEnd,
- BQuarterBegin,
- BQuarterEnd,
- BusinessDay,
- BusinessHour,
- BusinessMonthBegin,
- BusinessMonthEnd,
- BYearBegin,
- BYearEnd,
- CBMonthBegin,
- CBMonthEnd,
- CDay,
- CustomBusinessDay,
- CustomBusinessHour,
- CustomBusinessMonthBegin,
- CustomBusinessMonthEnd,
- DateOffset,
- Day,
- Easter,
- FY5253Quarter,
- Hour,
- LastWeekOfMonth,
- Micro,
- Milli,
- Minute,
- MonthBegin,
- MonthEnd,
- Nano,
- QuarterBegin,
- QuarterEnd,
- Second,
- SemiMonthBegin,
- SemiMonthEnd,
- Tick,
- Week,
- WeekOfMonth,
- YearBegin,
- YearEnd,
-)
-
-__all__ = [
- "Day",
- "BaseOffset",
- "BusinessDay",
- "BusinessMonthBegin",
- "BusinessMonthEnd",
- "BDay",
- "CustomBusinessDay",
- "CustomBusinessMonthBegin",
- "CustomBusinessMonthEnd",
- "CDay",
- "CBMonthEnd",
- "CBMonthBegin",
- "MonthBegin",
- "BMonthBegin",
- "MonthEnd",
- "BMonthEnd",
- "SemiMonthEnd",
- "SemiMonthBegin",
- "BusinessHour",
- "CustomBusinessHour",
- "YearBegin",
- "BYearBegin",
- "YearEnd",
- "BYearEnd",
- "QuarterBegin",
- "BQuarterBegin",
- "QuarterEnd",
- "BQuarterEnd",
- "LastWeekOfMonth",
- "FY5253Quarter",
- "FY5253",
- "Week",
- "WeekOfMonth",
- "Easter",
- "Tick",
- "Hour",
- "Minute",
- "Second",
- "Milli",
- "Micro",
- "Nano",
- "DateOffset",
-]
diff --git a/contrib/python/pandas/py3/pandas/util/__init__.py b/contrib/python/pandas/py3/pandas/util/__init__.py
deleted file mode 100644
index aa31c024fe3..00000000000
--- a/contrib/python/pandas/py3/pandas/util/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-# pyright: reportUnusedImport = false
-from pandas.util._decorators import ( # noqa:F401
- Appender,
- Substitution,
- cache_readonly,
-)
-
-from pandas.core.util.hashing import ( # noqa:F401
- hash_array,
- hash_pandas_object,
-)
diff --git a/contrib/python/pandas/py3/pandas/util/_decorators.py b/contrib/python/pandas/py3/pandas/util/_decorators.py
deleted file mode 100644
index d0e393e41c6..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_decorators.py
+++ /dev/null
@@ -1,505 +0,0 @@
-from __future__ import annotations
-
-from functools import wraps
-import inspect
-from textwrap import dedent
-from typing import (
- Any,
- Callable,
- Mapping,
- cast,
-)
-import warnings
-
-from pandas._libs.properties import cache_readonly
-from pandas._typing import (
- F,
- T,
-)
-from pandas.util._exceptions import find_stack_level
-
-
-def deprecate(
- name: str,
- alternative: Callable[..., Any],
- version: str,
- alt_name: str | None = None,
- klass: type[Warning] | None = None,
- stacklevel: int = 2,
- msg: str | None = None,
-) -> Callable[[F], F]:
- """
- Return a new function that emits a deprecation warning on use.
-
- To use this method for a deprecated function, another function
- `alternative` with the same signature must exist. The deprecated
- function will emit a deprecation warning, and in the docstring
- it will contain the deprecation directive with the provided version
- so it can be detected for future removal.
-
- Parameters
- ----------
- name : str
- Name of function to deprecate.
- alternative : func
- Function to use instead.
- version : str
- Version of pandas in which the method has been deprecated.
- alt_name : str, optional
- Name to use in preference of alternative.__name__.
- klass : Warning, default FutureWarning
- stacklevel : int, default 2
- msg : str
- The message to display in the warning.
- Default is '{name} is deprecated, use {alt_name} instead.'
- """
- alt_name = alt_name or alternative.__name__
- klass = klass or FutureWarning
- warning_msg = msg or f"{name} is deprecated, use {alt_name} instead."
-
- @wraps(alternative)
- def wrapper(*args, **kwargs) -> Callable[..., Any]:
- warnings.warn(warning_msg, klass, stacklevel=stacklevel)
- return alternative(*args, **kwargs)
-
- # adding deprecated directive to the docstring
- msg = msg or f"Use `{alt_name}` instead."
- doc_error_msg = (
- "deprecate needs a correctly formatted docstring in "
- "the target function (should have a one liner short "
- "summary, and opening quotes should be in their own "
- f"line). Found:\n{alternative.__doc__}"
- )
-
- # when python is running in optimized mode (i.e. `-OO`), docstrings are
- # removed, so we check that a docstring with correct formatting is used
- # but we allow empty docstrings
- if alternative.__doc__:
- if alternative.__doc__.count("\n") < 3:
- raise AssertionError(doc_error_msg)
- empty1, summary, empty2, doc_string = alternative.__doc__.split("\n", 3)
- if empty1 or empty2 and not summary:
- raise AssertionError(doc_error_msg)
- wrapper.__doc__ = dedent(
- f"""
- {summary.strip()}
-
- .. deprecated:: {version}
- {msg}
-
- {dedent(doc_string)}"""
- )
- # error: Incompatible return value type (got "Callable[[VarArg(Any), KwArg(Any)],
- # Callable[...,Any]]", expected "Callable[[F], F]")
- return wrapper # type: ignore[return-value]
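A minimal sketch of ``deprecate`` with a toy ``new_total`` helper (a hypothetical function whose docstring satisfies the formatting check above); the warning is shown schematically:

>>> from pandas.util._decorators import deprecate
>>> def new_total(values):
...     """
...     Sum the values.
...
...     Extended summary goes here.
...     """
...     return sum(values)
...
>>> old_total = deprecate("old_total", new_total, "2.0")
>>> old_total([1, 2, 3])  # doctest: +SKIP
FutureWarning: old_total is deprecated, use new_total instead.
6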
-
-
-def deprecate_kwarg(
- old_arg_name: str,
- new_arg_name: str | None,
- mapping: Mapping[Any, Any] | Callable[[Any], Any] | None = None,
- stacklevel: int = 2,
-) -> Callable[[F], F]:
- """
- Decorator to deprecate a keyword argument of a function.
-
- Parameters
- ----------
- old_arg_name : str
- Name of argument in function to deprecate
- new_arg_name : str or None
- Name of preferred argument in function. Use None to raise warning that
- ``old_arg_name`` keyword is deprecated.
- mapping : dict or callable
- If mapping is present, use it to translate old arguments to
- new arguments. A callable must do its own value checking;
- values not found in a dict will be forwarded unchanged.
-
- Examples
- --------
- The following deprecates 'cols', using 'columns' instead
-
- >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name='columns')
- ... def f(columns=''):
- ... print(columns)
- ...
- >>> f(columns='should work ok')
- should work ok
-
- >>> f(cols='should raise warning') # doctest: +SKIP
- FutureWarning: cols is deprecated, use columns instead
- warnings.warn(msg, FutureWarning)
- should raise warning
-
- >>> f(cols='should error', columns="can\'t pass do both") # doctest: +SKIP
- TypeError: Can only specify 'cols' or 'columns', not both
-
- >>> @deprecate_kwarg('old', 'new', {'yes': True, 'no': False})
- ... def f(new=False):
- ... print('yes!' if new else 'no!')
- ...
- >>> f(old='yes') # doctest: +SKIP
- FutureWarning: old='yes' is deprecated, use new=True instead
- warnings.warn(msg, FutureWarning)
- yes!
-
- To raise a warning that a keyword will be removed entirely in the future
-
- >>> @deprecate_kwarg(old_arg_name='cols', new_arg_name=None)
- ... def f(cols='', another_param=''):
- ... print(cols)
- ...
- >>> f(cols='should raise warning') # doctest: +SKIP
- FutureWarning: the 'cols' keyword is deprecated and will be removed in a
- future version. Please take steps to stop the use of 'cols'
- should raise warning
- >>> f(another_param='should not raise warning') # doctest: +SKIP
- should not raise warning
-
- >>> f(cols='should raise warning', another_param='') # doctest: +SKIP
- FutureWarning: the 'cols' keyword is deprecated and will be removed in a
- future version. Please take steps to stop the use of 'cols'
- should raise warning
- """
- if mapping is not None and not hasattr(mapping, "get") and not callable(mapping):
- raise TypeError(
- "mapping from old to new argument values must be dict or callable!"
- )
-
- def _deprecate_kwarg(func: F) -> F:
- @wraps(func)
- def wrapper(*args, **kwargs) -> Callable[..., Any]:
- old_arg_value = kwargs.pop(old_arg_name, None)
-
- if old_arg_value is not None:
- if new_arg_name is None:
- msg = (
- f"the {repr(old_arg_name)} keyword is deprecated and "
- "will be removed in a future version. Please take "
- f"steps to stop the use of {repr(old_arg_name)}"
- )
- warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
- kwargs[old_arg_name] = old_arg_value
- return func(*args, **kwargs)
-
- elif mapping is not None:
- if callable(mapping):
- new_arg_value = mapping(old_arg_value)
- else:
- new_arg_value = mapping.get(old_arg_value, old_arg_value)
- msg = (
- f"the {old_arg_name}={repr(old_arg_value)} keyword is "
- "deprecated, use "
- f"{new_arg_name}={repr(new_arg_value)} instead."
- )
- else:
- new_arg_value = old_arg_value
- msg = (
- f"the {repr(old_arg_name)} keyword is deprecated, "
- f"use {repr(new_arg_name)} instead."
- )
-
- warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
- if kwargs.get(new_arg_name) is not None:
- msg = (
- f"Can only specify {repr(old_arg_name)} "
- f"or {repr(new_arg_name)}, not both."
- )
- raise TypeError(msg)
- kwargs[new_arg_name] = new_arg_value
- return func(*args, **kwargs)
-
- return cast(F, wrapper)
-
- return _deprecate_kwarg
-
-
-def _format_argument_list(allow_args: list[str]) -> str:
- """
- Convert the allow_args argument (a list of argument names) of the
- `deprecate_nonkeyword_arguments` function to a string describing
- it, to be inserted into the warning message.
-
- Parameters
- ----------
- allow_args : list of str
- The `allow_args` argument for `deprecate_nonkeyword_arguments`;
- a None value is not allowed.
-
- Returns
- -------
- str
- The substring describing the argument list in best way to be
- inserted to the warning message.
-
- Examples
- --------
- `_format_argument_list([])` -> ''
- `_format_argument_list(['a'])` -> " except for the argument 'a'"
- `_format_argument_list(['a', 'b'])` -> " except for the arguments 'a' and 'b'"
- `_format_argument_list(['a', 'b', 'c'])` ->
- " except for the arguments 'a', 'b' and 'c'"
- """
- if "self" in allow_args:
- allow_args.remove("self")
- if not allow_args:
- return ""
- elif len(allow_args) == 1:
- return f" except for the argument '{allow_args[0]}'"
- else:
- last = allow_args[-1]
- args = ", ".join(["'" + x + "'" for x in allow_args[:-1]])
- return f" except for the arguments {args} and '{last}'"
-
-
-def future_version_msg(version: str | None) -> str:
- """Specify which version of pandas the deprecation will take place in."""
- if version is None:
- return "In a future version of pandas"
- else:
- return f"Starting with pandas version {version}"
-
-
-def deprecate_nonkeyword_arguments(
- version: str | None,
- allowed_args: list[str] | None = None,
- name: str | None = None,
-) -> Callable[[F], F]:
- """
- Decorator to deprecate a use of non-keyword arguments of a function.
-
- Parameters
- ----------
- version : str, optional
- The version in which positional arguments will become
- keyword-only. If None, then the warning message won't
- specify any particular version.
-
- allowed_args : list, optional
- If a list, the names of the leading arguments of the decorated
- function that may still be passed positionally. If None, defaults
- to the list of all arguments that do not have a default value.
-
- name : str, optional
- The specific name of the function to show in the warning
- message. If None, then the qualified name of the function
- is used.
- """
-
- def decorate(func):
- old_sig = inspect.signature(func)
-
- if allowed_args is not None:
- allow_args = allowed_args
- else:
- allow_args = [
- p.name
- for p in old_sig.parameters.values()
- if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
- and p.default is p.empty
- ]
-
- new_params = [
- p.replace(kind=p.KEYWORD_ONLY)
- if (
- p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
- and p.name not in allow_args
- )
- else p
- for p in old_sig.parameters.values()
- ]
- new_params.sort(key=lambda p: p.kind)
- new_sig = old_sig.replace(parameters=new_params)
-
- num_allow_args = len(allow_args)
- msg = (
- f"{future_version_msg(version)} all arguments of "
- f"{name or func.__qualname__}{{arguments}} will be keyword-only."
- )
-
- @wraps(func)
- def wrapper(*args, **kwargs):
- if len(args) > num_allow_args:
- warnings.warn(
- msg.format(arguments=_format_argument_list(allow_args)),
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return func(*args, **kwargs)
-
- # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
- # attribute "__signature__"
- wrapper.__signature__ = new_sig # type: ignore[attr-defined]
- return wrapper
-
- return decorate
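A sketch with a hypothetical ``top_n`` function; positional use of anything beyond ``values`` triggers the warning built above:

>>> from pandas.util._decorators import deprecate_nonkeyword_arguments
>>> @deprecate_nonkeyword_arguments(version="2.0", allowed_args=["values"])
... def top_n(values, n=3):
...     return sorted(values, reverse=True)[:n]
...
>>> top_n([5, 1, 9], n=2)   # keyword use: no warning
[9, 5]
>>> top_n([5, 1, 9], 2)     # doctest: +SKIP
FutureWarning: Starting with pandas version 2.0 all arguments of top_n except for the argument 'values' will be keyword-only.
[9, 5]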
-
-
-def doc(*docstrings: None | str | Callable, **params) -> Callable[[F], F]:
- """
- A decorator that takes docstring templates, concatenates them, and performs
- string substitution on the result.
-
- This decorator will add a variable "_docstring_components" to the wrapped
- callable to keep track of the original docstring templates for potential
- later use. If a component should be treated as a template, it is saved as
- a string. Otherwise, it is saved as a callable, and its __doc__ is dedented
- later to obtain the docstring.
-
- Parameters
- ----------
- *docstrings : None, str, or callable
- The strings / docstrings / docstring templates to be appended, in
- order, after the decorated callable's own docstring.
- **params
- The keyword arguments used to format the docstring templates.
- """
-
- def decorator(decorated: F) -> F:
- # collecting docstring and docstring templates
- docstring_components: list[str | Callable] = []
- if decorated.__doc__:
- docstring_components.append(dedent(decorated.__doc__))
-
- for docstring in docstrings:
- if docstring is None:
- continue
- if hasattr(docstring, "_docstring_components"):
- docstring_components.extend(
- docstring._docstring_components # pyright: ignore[reportGeneralTypeIssues] # noqa: E501
- )
- elif isinstance(docstring, str) or docstring.__doc__:
- docstring_components.append(docstring)
-
- params_applied = [
- component.format(**params)
- if isinstance(component, str) and len(params) > 0
- else component
- for component in docstring_components
- ]
-
- decorated.__doc__ = "".join(
- [
- component
- if isinstance(component, str)
- else dedent(component.__doc__ or "")
- for component in params_applied
- ]
- )
-
- # error: "F" has no attribute "_docstring_components"
- decorated._docstring_components = ( # type: ignore[attr-defined]
- docstring_components
- )
- return decorated
-
- return decorator
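A sketch of sharing a docstring template between two hypothetical functions via ``doc``:

>>> from pandas.util._decorators import doc
>>> @doc(klass="Series")
... def to_python_list(obj):
...     """Convert the {klass} to a Python list."""
...     return list(obj)
...
>>> to_python_list.__doc__
'Convert the Series to a Python list.'
>>> @doc(to_python_list, klass="Index")
... def index_to_list(obj):
...     return list(obj)
...
>>> index_to_list.__doc__
'Convert the Index to a Python list.'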
-
-
-# Substitution and Appender are derived from matplotlib.docstring (1.1.0)
-# module https://matplotlib.org/users/license.html
-
-
-class Substitution:
- """
- A decorator to take a function's docstring and perform string
- substitution on it.
-
- This decorator should be robust even if func.__doc__ is None
- (for example, if -OO was passed to the interpreter)
-
- Usage: construct a docstring.Substitution with a sequence or
- dictionary suitable for performing substitution; then
- decorate a suitable function with the constructed object. e.g.
-
- sub_author_name = Substitution(author='Jason')
-
- @sub_author_name
- def some_function(x):
- "%(author)s wrote this function"
-
- # note that some_function.__doc__ is now "Jason wrote this function"
-
- One can also use positional arguments.
-
- sub_first_last_names = Substitution('Edgar Allen', 'Poe')
-
- @sub_first_last_names
- def some_function(x):
- "%s %s wrote the Raven"
- """
-
- def __init__(self, *args, **kwargs) -> None:
- if args and kwargs:
- raise AssertionError("Only positional or keyword args are allowed")
-
- self.params = args or kwargs
-
- def __call__(self, func: F) -> F:
- func.__doc__ = func.__doc__ and func.__doc__ % self.params
- return func
-
- def update(self, *args, **kwargs) -> None:
- """
- Update self.params with supplied args.
- """
- if isinstance(self.params, dict):
- self.params.update(*args, **kwargs)
-
-
-class Appender:
- """
- A function decorator that will append an addendum to the docstring
- of the target function.
-
- This decorator should be robust even if func.__doc__ is None
- (for example, if -OO was passed to the interpreter).
-
- Usage: construct a docstring.Appender with a string to be joined to
- the original docstring. An optional 'join' parameter may be supplied
- which will be used to join the docstring and addendum. e.g.
-
- add_copyright = Appender("Copyright (c) 2009", join='\n')
-
- @add_copyright
- def my_dog(has='fleas'):
- "This docstring will have a copyright below"
- pass
- """
-
- addendum: str | None
-
- def __init__(self, addendum: str | None, join: str = "", indents: int = 0) -> None:
- if indents > 0:
- self.addendum = indent(addendum, indents=indents)
- else:
- self.addendum = addendum
- self.join = join
-
- def __call__(self, func: T) -> T:
- func.__doc__ = func.__doc__ if func.__doc__ else ""
- self.addendum = self.addendum if self.addendum else ""
- docitems = [func.__doc__, self.addendum]
- func.__doc__ = dedent(self.join.join(docitems))
- return func
-
-
-def indent(text: str | None, indents: int = 1) -> str:
- if not text or not isinstance(text, str):
- return ""
- jointext = "".join(["\n"] + [" "] * indents)
- return jointext.join(text.split("\n"))
-
-
-__all__ = [
- "Appender",
- "cache_readonly",
- "deprecate",
- "deprecate_kwarg",
- "deprecate_nonkeyword_arguments",
- "doc",
- "future_version_msg",
- "Substitution",
-]
diff --git a/contrib/python/pandas/py3/pandas/util/_doctools.py b/contrib/python/pandas/py3/pandas/util/_doctools.py
deleted file mode 100644
index 9e3ab80d1d4..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_doctools.py
+++ /dev/null
@@ -1,199 +0,0 @@
-from __future__ import annotations
-
-from typing import Iterable
-
-import numpy as np
-
-import pandas as pd
-
-
-class TablePlotter:
- """
- Lay out some DataFrames in a vertical/horizontal layout for explanation.
- Used in merging.rst
- """
-
- def __init__(
- self,
- cell_width: float = 0.37,
- cell_height: float = 0.25,
- font_size: float = 7.5,
- ) -> None:
- self.cell_width = cell_width
- self.cell_height = cell_height
- self.font_size = font_size
-
- def _shape(self, df: pd.DataFrame) -> tuple[int, int]:
- """
- Calculate table shape considering index levels.
- """
- row, col = df.shape
- return row + df.columns.nlevels, col + df.index.nlevels
-
- def _get_cells(self, left, right, vertical) -> tuple[int, int]:
- """
- Calculate the number of horizontal and vertical cells needed to lay out the left and right data.
- """
- if vertical:
- # calculate required number of cells
- vcells = max(sum(self._shape(df)[0] for df in left), self._shape(right)[0])
- hcells = max(self._shape(df)[1] for df in left) + self._shape(right)[1]
- else:
- vcells = max([self._shape(df)[0] for df in left] + [self._shape(right)[0]])
- hcells = sum([self._shape(df)[1] for df in left] + [self._shape(right)[1]])
- return hcells, vcells
-
- def plot(self, left, right, labels: Iterable[str] = (), vertical: bool = True):
- """
- Plot left / right DataFrames in specified layout.
-
- Parameters
- ----------
- left : list of DataFrames before operation is applied
- right : DataFrame of operation result
- labels : list of str to be drawn as titles of left DataFrames
- vertical : bool, default True
- If True, use vertical layout. If False, use horizontal layout.
- """
- from matplotlib import gridspec
- import matplotlib.pyplot as plt
-
- if not isinstance(left, list):
- left = [left]
- left = [self._conv(df) for df in left]
- right = self._conv(right)
-
- hcells, vcells = self._get_cells(left, right, vertical)
-
- if vertical:
- figsize = self.cell_width * hcells, self.cell_height * vcells
- else:
- # include margin for titles
- figsize = self.cell_width * hcells, self.cell_height * vcells
- fig = plt.figure(figsize=figsize)
-
- if vertical:
- gs = gridspec.GridSpec(len(left), hcells)
- # left
- max_left_cols = max(self._shape(df)[1] for df in left)
- max_left_rows = max(self._shape(df)[0] for df in left)
- for i, (_left, _label) in enumerate(zip(left, labels)):
- ax = fig.add_subplot(gs[i, 0:max_left_cols])
- self._make_table(ax, _left, title=_label, height=1.0 / max_left_rows)
- # right
- ax = plt.subplot(gs[:, max_left_cols:])
- self._make_table(ax, right, title="Result", height=1.05 / vcells)
- fig.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95)
- else:
- max_rows = max(self._shape(df)[0] for df in left + [right])
- height = 1.0 / np.max(max_rows)
- gs = gridspec.GridSpec(1, hcells)
- # left
- i = 0
- for df, _label in zip(left, labels):
- sp = self._shape(df)
- ax = fig.add_subplot(gs[0, i : i + sp[1]])
- self._make_table(ax, df, title=_label, height=height)
- i += sp[1]
- # right
- ax = plt.subplot(gs[0, i:])
- self._make_table(ax, right, title="Result", height=height)
- fig.subplots_adjust(top=0.85, bottom=0.05, left=0.05, right=0.95)
-
- return fig
-
- def _conv(self, data):
- """
- Convert each input to a DataFrame appropriate for table output.
- """
- if isinstance(data, pd.Series):
- if data.name is None:
- data = data.to_frame(name="")
- else:
- data = data.to_frame()
- data = data.fillna("NaN")
- return data
-
- def _insert_index(self, data):
- # insert is destructive
- data = data.copy()
- idx_nlevels = data.index.nlevels
- if idx_nlevels == 1:
- data.insert(0, "Index", data.index)
- else:
- for i in range(idx_nlevels):
- data.insert(i, f"Index{i}", data.index._get_level_values(i))
-
- col_nlevels = data.columns.nlevels
- if col_nlevels > 1:
- col = data.columns._get_level_values(0)
- values = [
- data.columns._get_level_values(i)._values for i in range(1, col_nlevels)
- ]
- col_df = pd.DataFrame(values)
- data.columns = col_df.columns
- data = pd.concat([col_df, data])
- data.columns = col
- return data
-
- def _make_table(self, ax, df, title: str, height: float | None = None) -> None:
- if df is None:
- ax.set_visible(False)
- return
-
- from pandas import plotting
-
- idx_nlevels = df.index.nlevels
- col_nlevels = df.columns.nlevels
- # must be converted here to get index levels for colorization
- df = self._insert_index(df)
- tb = plotting.table(ax, df, loc=9)
- tb.set_fontsize(self.font_size)
-
- if height is None:
- height = 1.0 / (len(df) + 1)
-
- props = tb.properties()
- for (r, c), cell in props["celld"].items():
- if c == -1:
- cell.set_visible(False)
- elif r < col_nlevels and c < idx_nlevels:
- cell.set_visible(False)
- elif r < col_nlevels or c < idx_nlevels:
- cell.set_facecolor("#AAAAAA")
- cell.set_height(height)
-
- ax.set_title(title, size=self.font_size)
- ax.axis("off")
-
-
-def main() -> None:
- import matplotlib.pyplot as plt
-
- p = TablePlotter()
-
- df1 = pd.DataFrame({"A": [10, 11, 12], "B": [20, 21, 22], "C": [30, 31, 32]})
- df2 = pd.DataFrame({"A": [10, 12], "C": [30, 32]})
-
- p.plot([df1, df2], pd.concat([df1, df2]), labels=["df1", "df2"], vertical=True)
- plt.show()
-
- df3 = pd.DataFrame({"X": [10, 12], "Z": [30, 32]})
-
- p.plot(
- [df1, df3], pd.concat([df1, df3], axis=1), labels=["df1", "df2"], vertical=False
- )
- plt.show()
-
- idx = pd.MultiIndex.from_tuples(
- [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")]
- )
- column = pd.MultiIndex.from_tuples([(1, "A"), (1, "B")])
- df3 = pd.DataFrame({"v1": [1, 2, 3, 4, 5, 6], "v2": [5, 6, 7, 8, 9, 10]}, index=idx)
- df3.columns = column
- p.plot(df3, df3, labels=["df3"])
- plt.show()
-
-
-if __name__ == "__main__":
- main()
diff --git a/contrib/python/pandas/py3/pandas/util/_exceptions.py b/contrib/python/pandas/py3/pandas/util/_exceptions.py
deleted file mode 100644
index 1eefd06a133..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_exceptions.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from __future__ import annotations
-
-import contextlib
-import inspect
-import os
-import re
-from typing import Generator
-import warnings
-
-
-@contextlib.contextmanager
-def rewrite_exception(old_name: str, new_name: str) -> Generator[None, None, None]:
- """
- Rewrite the message of an exception.
- """
- try:
- yield
- except Exception as err:
- if not err.args:
- raise
- msg = str(err.args[0])
- msg = msg.replace(old_name, new_name)
- args: tuple[str, ...] = (msg,)
- if len(err.args) > 1:
- args = args + err.args[1:]
- err.args = args
- raise
-
-
-def find_stack_level() -> int:
- """
- Find the first place in the stack that is not inside pandas
- (tests notwithstanding).
- """
-
- import pandas as pd
-
- pkg_dir = os.path.dirname(pd.__file__)
- test_dir = os.path.join(pkg_dir, "tests")
-
- # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
- frame = inspect.currentframe()
- n = 0
- while frame:
- fname = inspect.getfile(frame)
- if fname.startswith(pkg_dir) and not fname.startswith(test_dir):
- frame = frame.f_back
- n += 1
- else:
- break
- return n
-
-
-@contextlib.contextmanager
-def rewrite_warning(
- target_message: str,
- target_category: type[Warning],
- new_message: str,
- new_category: type[Warning] | None = None,
-) -> Generator[None, None, None]:
- """
- Rewrite the message of a warning.
-
- Parameters
- ----------
- target_message : str
- Warning message to match.
- target_category : Warning
- Warning type to match.
- new_message : str
- New warning message to emit.
- new_category : Warning or None, default None
- New warning type to emit. When None, will be the same as target_category.
- """
- if new_category is None:
- new_category = target_category
- with warnings.catch_warnings(record=True) as record:
- yield
- if len(record) > 0:
- match = re.compile(target_message)
- for warning in record:
- if warning.category is target_category and re.search(
- match, str(warning.message)
- ):
- category = new_category
- message: Warning | str = new_message
- else:
- category, message = warning.category, warning.message
- warnings.warn_explicit(
- message=message,
- category=category,
- filename=warning.filename,
- lineno=warning.lineno,
- )
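
An illustrative sketch of the two context managers removed above, assuming pandas.util._exceptions is still importable from an installed pandas.

import warnings

from pandas.util._exceptions import find_stack_level, rewrite_exception

# rewrite_exception: re-raise with "OldName" replaced by "NewName" in the message.
try:
    with rewrite_exception("OldName", "NewName"):
        raise ValueError("OldName does not support this operation")
except ValueError as err:
    print(err)  # NewName does not support this operation

# find_stack_level: pick a stacklevel so the warning points at the first
# frame outside the pandas package rather than at library internals.
warnings.warn("example warning", UserWarning, stacklevel=find_stack_level())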
diff --git a/contrib/python/pandas/py3/pandas/util/_print_versions.py b/contrib/python/pandas/py3/pandas/util/_print_versions.py
deleted file mode 100644
index 2526fafe585..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_print_versions.py
+++ /dev/null
@@ -1,134 +0,0 @@
-from __future__ import annotations
-
-import codecs
-import json
-import locale
-import os
-import platform
-import struct
-import sys
-
-from pandas._typing import JSONSerializable
-from pandas.compat._optional import (
- VERSIONS,
- get_version,
- import_optional_dependency,
-)
-
-
-def _get_commit_hash() -> str | None:
- """
- Use vendored versioneer code to get git hash, which handles
- git worktree correctly.
- """
- from pandas._version import get_versions
-
- versions = get_versions()
- return versions["full-revisionid"]
-
-
-def _get_sys_info() -> dict[str, JSONSerializable]:
- """
- Returns system information as a JSON serializable dictionary.
- """
- uname_result = platform.uname()
- language_code, encoding = locale.getlocale()
- return {
- "commit": _get_commit_hash(),
- "python": ".".join([str(i) for i in sys.version_info]),
- "python-bits": struct.calcsize("P") * 8,
- "OS": uname_result.system,
- "OS-release": uname_result.release,
- "Version": uname_result.version,
- "machine": uname_result.machine,
- "processor": uname_result.processor,
- "byteorder": sys.byteorder,
- "LC_ALL": os.environ.get("LC_ALL"),
- "LANG": os.environ.get("LANG"),
- "LOCALE": {"language-code": language_code, "encoding": encoding},
- }
-
-
-def _get_dependency_info() -> dict[str, JSONSerializable]:
- """
- Returns dependency information as a JSON serializable dictionary.
- """
- deps = [
- "pandas",
- # required
- "numpy",
- "pytz",
- "dateutil",
- # install / build,
- "setuptools",
- "pip",
- "Cython",
- # test
- "pytest",
- "hypothesis",
- # docs
- "sphinx",
- # Other, need a min version
- "blosc",
- "feather",
- "xlsxwriter",
- "lxml.etree",
- "html5lib",
- "pymysql",
- "psycopg2",
- "jinja2",
- # Other, not imported.
- "IPython",
- "pandas_datareader",
- ]
- deps.extend(list(VERSIONS))
-
- result: dict[str, JSONSerializable] = {}
- for modname in deps:
- mod = import_optional_dependency(modname, errors="ignore")
- result[modname] = get_version(mod) if mod else None
- return result
-
-
-def show_versions(as_json: str | bool = False) -> None:
- """
- Provide useful information, important for bug reports.
-
- It comprises info about the host operating system, the pandas version,
- and the versions of other installed related packages.
-
- Parameters
- ----------
- as_json : str or bool, default False
- * If False, outputs info in a human readable form to the console.
- * If str, it will be considered as a path to a file.
- Info will be written to that file in JSON format.
- * If True, outputs info in JSON format to the console.
- """
- sys_info = _get_sys_info()
- deps = _get_dependency_info()
-
- if as_json:
- j = {"system": sys_info, "dependencies": deps}
-
- if as_json is True:
- sys.stdout.writelines(json.dumps(j, indent=2))
- else:
- assert isinstance(as_json, str) # needed for mypy
- with codecs.open(as_json, "wb", encoding="utf8") as f:
- json.dump(j, f, indent=2)
-
- else:
- assert isinstance(sys_info["LOCALE"], dict) # needed for mypy
- language_code = sys_info["LOCALE"]["language-code"]
- encoding = sys_info["LOCALE"]["encoding"]
- sys_info["LOCALE"] = f"{language_code}.{encoding}"
-
- maxlen = max(len(x) for x in deps)
- print("\nINSTALLED VERSIONS")
- print("------------------")
- for k, v in sys_info.items():
- print(f"{k:<{maxlen}}: {v}")
- print("")
- for k, v in deps.items():
- print(f"{k:<{maxlen}}: {v}")
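
For reference, the function above is exposed at the top level of an installed pandas as pandas.show_versions(); a short usage sketch (the output file path is purely illustrative):

import pandas as pd

pd.show_versions()               # human-readable report on stdout
pd.show_versions(as_json=True)   # JSON report on stdout
# pd.show_versions(as_json="/tmp/pandas_versions.json")  # JSON written to a file (hypothetical path)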
diff --git a/contrib/python/pandas/py3/pandas/util/_str_methods.py b/contrib/python/pandas/py3/pandas/util/_str_methods.py
deleted file mode 100644
index 8f7aef80bc1..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_str_methods.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-Python 3.9 introduces removesuffix and removeprefix.
-
-They're reimplemented here for use in Python 3.8.
-
-NOTE: when pyupgrade --py39-plus removes nearly everything in this file,
-this file and the associated tests should be removed.
-"""
-from __future__ import annotations
-
-import sys
-
-if sys.version_info < (3, 9):
-
- def removesuffix(string: str, suffix: str) -> str:
- if string.endswith(suffix):
- return string[: -len(suffix)]
- return string
-
- def removeprefix(string: str, prefix: str) -> str:
- if string.startswith(prefix):
- return string[len(prefix) :]
- return string
-
-else:
- # NOTE: remove this file when pyupgrade --py39-plus removes
- # the above block
- pass
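
A behaviour sketch of the 3.8 backports above; on Python 3.9+ the built-in str methods do the same thing. The sample strings are illustrative only.

def removesuffix(string: str, suffix: str) -> str:
    # drop the suffix only when it is actually present
    if string.endswith(suffix):
        return string[: -len(suffix)]
    return string

def removeprefix(string: str, prefix: str) -> str:
    # drop the prefix only when it is actually present
    if string.startswith(prefix):
        return string[len(prefix):]
    return string

assert removesuffix("series.csv", ".csv") == "series"
assert removeprefix("pd_index", "pd_") == "index"
assert removesuffix("frame", ".csv") == "frame"  # no-op when the suffix is absent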
diff --git a/contrib/python/pandas/py3/pandas/util/_test_decorators.py b/contrib/python/pandas/py3/pandas/util/_test_decorators.py
deleted file mode 100644
index 5f2e3f1a0e7..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_test_decorators.py
+++ /dev/null
@@ -1,264 +0,0 @@
-"""
-This module provides decorator functions which can be applied to test objects
-in order to skip those objects when certain conditions occur. A sample use case
-is to detect if the platform is missing ``matplotlib``. If so, any test objects
-which require ``matplotlib`` and decorated with ``@td.skip_if_no_mpl`` will be
-skipped by ``pytest`` during the execution of the test suite.
-
-To illustrate, after importing this module:
-
-import pandas.util._test_decorators as td
-
-The decorators can be applied to classes:
-
-@td.skip_if_some_reason
-class Foo:
- ...
-
-Or individual functions:
-
-@td.skip_if_some_reason
-def test_foo():
- ...
-
-For more information, refer to the ``pytest`` documentation on ``skipif``.
-"""
-from __future__ import annotations
-
-import locale
-from typing import Callable
-
-import numpy as np
-import pytest
-
-from pandas._config import get_option
-
-from pandas._typing import F
-from pandas.compat import (
- IS64,
- is_platform_windows,
-)
-from pandas.compat._optional import import_optional_dependency
-
-from pandas.core.computation.expressions import (
- NUMEXPR_INSTALLED,
- USE_NUMEXPR,
-)
-from pandas.util.version import Version
-
-
-def safe_import(mod_name: str, min_version: str | None = None):
- """
- Parameters
- ----------
- mod_name : str
- Name of the module to be imported
- min_version : str, default None
- Minimum required version of the specified mod_name
-
- Returns
- -------
- object
- The imported module if successful, or False
- """
- try:
- mod = __import__(mod_name)
- except ImportError:
- return False
- except SystemError:
- # TODO: numba is incompatible with numpy 1.24+.
- # Once that's fixed, this block should be removed.
- if mod_name == "numba":
- return False
- else:
- raise
-
- if not min_version:
- return mod
- else:
- import sys
-
- try:
- version = getattr(sys.modules[mod_name], "__version__")
- except AttributeError:
- # xlrd uses a capitalized attribute name
- version = getattr(sys.modules[mod_name], "__VERSION__")
- if version and Version(version) >= Version(min_version):
- return mod
-
- return False
-
-
-def _skip_if_no_mpl() -> bool:
- mod = safe_import("matplotlib")
- if mod:
- mod.use("Agg")
- return False
- else:
- return True
-
-
-def _skip_if_not_us_locale() -> bool:
- lang, _ = locale.getlocale()
- if lang != "en_US":
- return True
- return False
-
-
-def _skip_if_no_scipy() -> bool:
- return not (
- safe_import("scipy.stats")
- and safe_import("scipy.sparse")
- and safe_import("scipy.interpolate")
- and safe_import("scipy.signal")
- )
-
-
-# TODO(pytest#7469): return type, _pytest.mark.structures.MarkDecorator is not public
-# https://github.com/pytest-dev/pytest/issues/7469
-def skip_if_installed(package: str):
- """
- Skip a test if a package is installed.
-
- Parameters
- ----------
- package : str
- The name of the package.
- """
- return pytest.mark.skipif(
- safe_import(package), reason=f"Skipping because {package} is installed."
- )
-
-
-# TODO(pytest#7469): return type, _pytest.mark.structures.MarkDecorator is not public
-# https://github.com/pytest-dev/pytest/issues/7469
-def skip_if_no(package: str, min_version: str | None = None):
- """
- Generic function to help skip tests when required packages are not
- present on the testing system.
-
- This function returns a pytest mark with a skip condition that will be
- evaluated during test collection. An attempt will be made to import the
- specified ``package`` and optionally ensure it meets the ``min_version``
-
- The mark can be used as either a decorator for a test function or to be
- applied to parameters in pytest.mark.parametrize calls or parametrized
- fixtures.
-
- If the import and version check are unsuccessful, then the test function
- (or test case when used in conjunction with parametrization) will be
- skipped.
-
- Parameters
- ----------
- package: str
- The name of the required package.
- min_version: str or None, default None
- Optional minimum version of the package.
-
- Returns
- -------
- _pytest.mark.structures.MarkDecorator
- a pytest.mark.skipif to use as either a test decorator or a
- parametrization mark.
- """
- msg = f"Could not import '{package}'"
- if min_version:
- msg += f" satisfying a min_version of {min_version}"
- return pytest.mark.skipif(
- not safe_import(package, min_version=min_version), reason=msg
- )
-
-
-skip_if_no_mpl = pytest.mark.skipif(
- _skip_if_no_mpl(), reason="Missing matplotlib dependency"
-)
-skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present")
-skip_if_32bit = pytest.mark.skipif(not IS64, reason="skipping for 32 bit")
-skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows")
-skip_if_not_us_locale = pytest.mark.skipif(
- _skip_if_not_us_locale(),
- reason=f"Specific locale is set {locale.getlocale()[0]}",
-)
-skip_if_no_scipy = pytest.mark.skipif(
- _skip_if_no_scipy(), reason="Missing SciPy requirement"
-)
-skip_if_no_ne = pytest.mark.skipif(
- not USE_NUMEXPR,
- reason=f"numexpr enabled->{USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}",
-)
-
-
-# TODO(pytest#7469): return type, _pytest.mark.structures.MarkDecorator is not public
-# https://github.com/pytest-dev/pytest/issues/7469
-def skip_if_np_lt(ver_str: str, *args, reason: str | None = None):
- if reason is None:
- reason = f"NumPy {ver_str} or greater required"
- return pytest.mark.skipif(
- Version(np.__version__) < Version(ver_str),
- *args,
- reason=reason,
- )
-
-
-def parametrize_fixture_doc(*args) -> Callable[[F], F]:
- """
- Intended for use as a decorator for parametrized fixture,
- this function will wrap the decorated function with a pytest
- ``parametrize_fixture_doc`` mark. That mark will format
- initial fixture docstring by replacing placeholders {0}, {1} etc
- with parameters passed as arguments.
-
- Parameters
- ----------
- args: iterable
- Positional arguments for docstring.
-
- Returns
- -------
- function
- The decorated function wrapped within a pytest
- ``parametrize_fixture_doc`` mark
- """
-
- def documented_fixture(fixture):
- fixture.__doc__ = fixture.__doc__.format(*args)
- return fixture
-
- return documented_fixture
-
-
-def async_mark():
- try:
- import_optional_dependency("pytest_asyncio")
- async_mark = pytest.mark.asyncio
- except ImportError:
- async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio")
-
- return async_mark
-
-
-def mark_array_manager_not_yet_implemented(request) -> None:
- mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager")
- request.node.add_marker(mark)
-
-
-skip_array_manager_not_yet_implemented = pytest.mark.xfail(
- get_option("mode.data_manager") == "array",
- reason="Not yet implemented for ArrayManager",
-)
-
-skip_array_manager_invalid_test = pytest.mark.skipif(
- get_option("mode.data_manager") == "array",
- reason="Test that relies on BlockManager internals or specific behaviour",
-)
-
-skip_copy_on_write_not_yet_implemented = pytest.mark.xfail(
- get_option("mode.copy_on_write"),
- reason="Not yet implemented/adapted for Copy-on-Write mode",
-)
-
-skip_copy_on_write_invalid_test = pytest.mark.skipif(
- get_option("mode.copy_on_write"),
- reason="Test not valid for Copy-on-Write mode",
-)
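
A sketch of how the decorators above are used in the pandas test suite, assuming pytest and pandas.util._test_decorators are importable; the package names and version threshold are chosen for illustration.

import pandas.util._test_decorators as td
import pytest

@td.skip_if_no("scipy")  # skipped when scipy cannot be imported
def test_needs_scipy():
    import scipy  # noqa: F401

@td.skip_if_no("lxml.etree", min_version="4.0.0")  # version-gated skip (threshold is illustrative)
def test_needs_recent_lxml():
    ...

# The same mark can also gate individual parametrize cases, as the docstring notes.
@pytest.mark.parametrize(
    "engine",
    ["python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))],
)
def test_engines(engine):
    assert engine in ("python", "pyarrow")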
diff --git a/contrib/python/pandas/py3/pandas/util/_tester.py b/contrib/python/pandas/py3/pandas/util/_tester.py
deleted file mode 100644
index 3fe7c4d5899..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_tester.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""
-Entrypoint for testing from the top-level namespace.
-"""
-from __future__ import annotations
-
-import os
-import sys
-
-from pandas.compat._optional import import_optional_dependency
-
-PKG = os.path.dirname(os.path.dirname(__file__))
-
-
-def test(extra_args: list[str] | None = None) -> None:
- """
- Run the pandas test suite using pytest.
-
- By default, runs with the marks --skip-slow, --skip-network, --skip-db
-
- Parameters
- ----------
- extra_args : list[str], default None
- Extra command-line arguments to pass to pytest; when given, they replace
- the default skip options.
- """
- pytest = import_optional_dependency("pytest")
- import_optional_dependency("hypothesis")
- cmd = ["--skip-slow", "--skip-network", "--skip-db"]
- if extra_args:
- if not isinstance(extra_args, list):
- extra_args = [extra_args]
- cmd = extra_args
- cmd += [PKG]
- joined = " ".join(cmd)
- print(f"running: pytest {joined}")
- sys.exit(pytest.main(cmd))
-
-
-__all__ = ["test"]
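
A usage sketch for the test() entry point above; it is exposed at the top level as pandas.test() and requires pytest and hypothesis to be installed.

import pandas as pd

# Default run: pytest over the pandas package with --skip-slow --skip-network --skip-db.
# Note that test() finishes with sys.exit(), so it is meant to be the last call in a session.
pd.test()

# Alternative (shown commented out because the call above already exits):
# pd.test(extra_args=["-m", "not slow"])  # custom pytest arguments instead of the defaults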
diff --git a/contrib/python/pandas/py3/pandas/util/_validators.py b/contrib/python/pandas/py3/pandas/util/_validators.py
deleted file mode 100644
index fddd56cc7bc..00000000000
--- a/contrib/python/pandas/py3/pandas/util/_validators.py
+++ /dev/null
@@ -1,451 +0,0 @@
-"""
-Module that contains many useful utilities
-for validating data or function arguments
-"""
-from __future__ import annotations
-
-from typing import (
- Iterable,
- Sequence,
- TypeVar,
- overload,
-)
-
-import numpy as np
-
-from pandas._libs import lib
-
-from pandas.core.dtypes.common import (
- is_bool,
- is_integer,
-)
-
-BoolishT = TypeVar("BoolishT", bool, int)
-BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None)
-
-
-def _check_arg_length(fname, args, max_fname_arg_count, compat_args):
- """
- Checks whether 'args' has at most as many elements as 'compat_args'. Raises
- a TypeError if that is not the case, similar to what Python does when a
- function is called with too many arguments.
- """
- if max_fname_arg_count < 0:
- raise ValueError("'max_fname_arg_count' must be non-negative")
-
- if len(args) > len(compat_args):
- max_arg_count = len(compat_args) + max_fname_arg_count
- actual_arg_count = len(args) + max_fname_arg_count
- argument = "argument" if max_arg_count == 1 else "arguments"
-
- raise TypeError(
- f"{fname}() takes at most {max_arg_count} {argument} "
- f"({actual_arg_count} given)"
- )
-
-
-def _check_for_default_values(fname, arg_val_dict, compat_args):
- """
- Check that the keys in `arg_val_dict` are mapped to their
- default values as specified in `compat_args`.
-
- Note that this function is to be called only when it has been
- checked that arg_val_dict.keys() is a subset of compat_args
- """
- for key in arg_val_dict:
- # try checking equality directly with the '==' operator,
- # as comparison may have been overridden for the left
- # hand object
- try:
- v1 = arg_val_dict[key]
- v2 = compat_args[key]
-
- # check for None-ness otherwise we could end up
- # comparing a numpy array vs None
- if (v1 is not None and v2 is None) or (v1 is None and v2 is not None):
- match = False
- else:
- match = v1 == v2
-
- if not is_bool(match):
- raise ValueError("'match' is not a boolean")
-
- # could not compare them directly, so try comparison
- # using the 'is' operator
- except ValueError:
- match = arg_val_dict[key] is compat_args[key]
-
- if not match:
- raise ValueError(
- f"the '{key}' parameter is not supported in "
- f"the pandas implementation of {fname}()"
- )
-
-
-def validate_args(fname, args, max_fname_arg_count, compat_args) -> None:
- """
- Checks whether the `*args` argument passed into a function has at most
- `len(compat_args)` elements and whether or not all of these elements
- are set to their default values.
-
- Parameters
- ----------
- fname : str
- The name of the function being passed the `*args` parameter
- args : tuple
- The `*args` parameter passed into a function
- max_fname_arg_count : int
- The maximum number of arguments that the function `fname`
- can accept, excluding those in `args`. Used for displaying
- appropriate error messages. Must be non-negative.
- compat_args : dict
- A dictionary of keys and their associated default values.
- In order to accommodate buggy behaviour in some versions of `numpy`,
- where a signature displayed keyword arguments but then passed those
- arguments **positionally** internally when calling downstream
- implementations, a dict ensures that the original
- order of the keyword arguments is enforced.
-
- Raises
- ------
- TypeError
- If `args` contains more values than there are `compat_args`
- ValueError
- If `args` contains values that do not correspond to those
- of the default values specified in `compat_args`
- """
- _check_arg_length(fname, args, max_fname_arg_count, compat_args)
-
- # We do this so that we can provide a more informative
- # error message about the parameters that we are not
- # supporting in the pandas implementation of 'fname'
- kwargs = dict(zip(compat_args, args))
- _check_for_default_values(fname, kwargs, compat_args)
-
-
-def _check_for_invalid_keys(fname, kwargs, compat_args):
- """
- Checks whether 'kwargs' contains any keys that are not
- in 'compat_args' and raises a TypeError if there is one.
- """
- # set(dict) --> set of the dictionary's keys
- diff = set(kwargs) - set(compat_args)
-
- if diff:
- bad_arg = list(diff)[0]
- raise TypeError(f"{fname}() got an unexpected keyword argument '{bad_arg}'")
-
-
-def validate_kwargs(fname, kwargs, compat_args) -> None:
- """
- Checks whether parameters passed to the **kwargs argument in a
- function `fname` are valid parameters as specified in `*compat_args`
- and whether or not they are set to their default values.
-
- Parameters
- ----------
- fname : str
- The name of the function being passed the `**kwargs` parameter
- kwargs : dict
- The `**kwargs` parameter passed into `fname`
- compat_args: dict
- A dictionary of keys that `kwargs` is allowed to have and their
- associated default values
-
- Raises
- ------
- TypeError if `kwargs` contains keys not in `compat_args`
- ValueError if `kwargs` contains keys in `compat_args` that do not
- map to the default values specified in `compat_args`
- """
- kwds = kwargs.copy()
- _check_for_invalid_keys(fname, kwargs, compat_args)
- _check_for_default_values(fname, kwds, compat_args)
-
-
-def validate_args_and_kwargs(
- fname, args, kwargs, max_fname_arg_count, compat_args
-) -> None:
- """
- Checks whether parameters passed to the *args and **kwargs argument in a
- function `fname` are valid parameters as specified in `*compat_args`
- and whether or not they are set to their default values.
-
- Parameters
- ----------
- fname: str
- The name of the function being passed the `**kwargs` parameter
- args: tuple
- The `*args` parameter passed into a function
- kwargs: dict
- The `**kwargs` parameter passed into `fname`
- max_fname_arg_count: int
- The maximum number of arguments that the function `fname`
- can accept, excluding those in `args`. Used for displaying
- appropriate error messages. Must be non-negative.
- compat_args: dict
- A dictionary of keys that `kwargs` is allowed to
- have and their associated default values.
-
- Raises
- ------
- TypeError if `args` contains more values than there are
- `compat_args` OR `kwargs` contains keys not in `compat_args`
- ValueError if `args` contains values not at the default value (`None`) OR
- `kwargs` contains keys in `compat_args` that do not map to the default
- values specified in `compat_args`
-
- See Also
- --------
- validate_args : Purely args validation.
- validate_kwargs : Purely kwargs validation.
-
- """
- # Check that the total number of arguments passed in (i.e.
- # args and kwargs) does not exceed the length of compat_args
- _check_arg_length(
- fname, args + tuple(kwargs.values()), max_fname_arg_count, compat_args
- )
-
- # Check there is no overlap with the positional and keyword
- # arguments, similar to what is done in actual Python functions
- args_dict = dict(zip(compat_args, args))
-
- for key in args_dict:
- if key in kwargs:
- raise TypeError(
- f"{fname}() got multiple values for keyword argument '{key}'"
- )
-
- kwargs.update(args_dict)
- validate_kwargs(fname, kwargs, compat_args)
-
-
-def validate_bool_kwarg(
- value: BoolishNoneT, arg_name, none_allowed: bool = True, int_allowed: bool = False
-) -> BoolishNoneT:
- """
- Ensure that argument passed in arg_name can be interpreted as boolean.
-
- Parameters
- ----------
- value : bool
- Value to be validated.
- arg_name : str
- Name of the argument. To be reflected in the error message.
- none_allowed : bool, default True
- Whether to consider None to be a valid boolean.
- int_allowed : bool, default False
- Whether to consider integer value to be a valid boolean.
-
- Returns
- -------
- value
- The same value as input.
-
- Raises
- ------
- ValueError
- If the value is not a valid boolean.
- """
- good_value = is_bool(value)
- if none_allowed:
- good_value = good_value or value is None
-
- if int_allowed:
- good_value = good_value or isinstance(value, int)
-
- if not good_value:
- raise ValueError(
- f'For argument "{arg_name}" expected type bool, received '
- f"type {type(value).__name__}."
- )
- return value
-
-
-def validate_fillna_kwargs(value, method, validate_scalar_dict_value: bool = True):
- """
- Validate the keyword arguments to 'fillna'.
-
- This checks that exactly one of 'value' and 'method' is specified.
- If 'method' is specified, this validates that it's a valid method.
-
- Parameters
- ----------
- value, method : object
- The 'value' and 'method' keyword arguments for 'fillna'.
- validate_scalar_dict_value : bool, default True
- Whether to validate that 'value' is a scalar or dict. Specifically,
- validate that it is not a list or tuple.
-
- Returns
- -------
- value, method : object
- """
- from pandas.core.missing import clean_fill_method
-
- if value is None and method is None:
- raise ValueError("Must specify a fill 'value' or 'method'.")
- if value is None and method is not None:
- method = clean_fill_method(method)
-
- elif value is not None and method is None:
- if validate_scalar_dict_value and isinstance(value, (list, tuple)):
- raise TypeError(
- '"value" parameter must be a scalar or dict, but '
- f'you passed a "{type(value).__name__}"'
- )
-
- elif value is not None and method is not None:
- raise ValueError("Cannot specify both 'value' and 'method'.")
-
- return value, method
-
-
-def validate_percentile(q: float | Iterable[float]) -> np.ndarray:
- """
- Validate percentiles (used by describe and quantile).
-
- This function checks whether the given float or iterable of floats is a valid
- percentile; otherwise it raises a ValueError.
-
- Parameters
- ----------
- q: float or iterable of floats
- A single percentile or an iterable of percentiles.
-
- Returns
- -------
- ndarray
- An ndarray of the percentiles if valid.
-
- Raises
- ------
- ValueError if percentiles are not in the interval [0, 1].
- """
- q_arr = np.asarray(q)
- # Don't change this to an f-string. The string formatting
- # is too expensive for cases where we don't need it.
- msg = "percentiles should all be in the interval [0, 1]. Try {} instead."
- if q_arr.ndim == 0:
- if not 0 <= q_arr <= 1:
- raise ValueError(msg.format(q_arr / 100.0))
- else:
- if not all(0 <= qs <= 1 for qs in q_arr):
- raise ValueError(msg.format(q_arr / 100.0))
- return q_arr
-
-
-@overload
-def validate_ascending(ascending: BoolishT) -> BoolishT:
- ...
-
-
-@overload
-def validate_ascending(ascending: Sequence[BoolishT]) -> list[BoolishT]:
- ...
-
-
-def validate_ascending(
- ascending: bool | int | Sequence[BoolishT],
-) -> bool | int | list[BoolishT]:
- """Validate ``ascending`` kwargs for ``sort_index`` method."""
- kwargs = {"none_allowed": False, "int_allowed": True}
- if not isinstance(ascending, Sequence):
- return validate_bool_kwarg(ascending, "ascending", **kwargs)
-
- return [validate_bool_kwarg(item, "ascending", **kwargs) for item in ascending]
-
-
-def validate_endpoints(closed: str | None) -> tuple[bool, bool]:
- """
- Check that the `closed` argument is among [None, "left", "right"]
-
- Parameters
- ----------
- closed : {None, "left", "right"}
-
- Returns
- -------
- left_closed : bool
- right_closed : bool
-
- Raises
- ------
- ValueError : if argument is not among valid values
- """
- left_closed = False
- right_closed = False
-
- if closed is None:
- left_closed = True
- right_closed = True
- elif closed == "left":
- left_closed = True
- elif closed == "right":
- right_closed = True
- else:
- raise ValueError("Closed has to be either 'left', 'right' or None")
-
- return left_closed, right_closed
-
-
-def validate_inclusive(inclusive: str | None) -> tuple[bool, bool]:
- """
- Check that the `inclusive` argument is among {"both", "neither", "left", "right"}.
-
- Parameters
- ----------
- inclusive : {"both", "neither", "left", "right"}
-
- Returns
- -------
- left_right_inclusive : tuple[bool, bool]
-
- Raises
- ------
- ValueError : if argument is not among valid values
- """
- left_right_inclusive: tuple[bool, bool] | None = None
-
- if isinstance(inclusive, str):
- left_right_inclusive = {
- "both": (True, True),
- "left": (True, False),
- "right": (False, True),
- "neither": (False, False),
- }.get(inclusive)
-
- if left_right_inclusive is None:
- raise ValueError(
- "Inclusive has to be either 'both', 'neither', 'left' or 'right'"
- )
-
- return left_right_inclusive
-
-
-def validate_insert_loc(loc: int, length: int) -> int:
- """
- Check that we have an integer between -length and length, inclusive.
-
- Standardize negative loc to within [0, length].
-
- The exceptions we raise on failure match np.insert.
- """
- if not is_integer(loc):
- raise TypeError(f"loc must be an integer between -{length} and {length}")
-
- if loc < 0:
- loc += length
- if not 0 <= loc <= length:
- raise IndexError(f"loc must be an integer between -{length} and {length}")
- return loc
-
-
-def check_dtype_backend(dtype_backend) -> None:
- if dtype_backend is not lib.no_default:
- if dtype_backend not in ["numpy_nullable", "pyarrow"]:
- raise ValueError(
- f"dtype_backend {dtype_backend} is invalid, only 'numpy_nullable' and "
- f"'pyarrow' are allowed.",
- )
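
A behaviour sketch for a few of the validators above, assuming pandas.util._validators is importable from an installed pandas; the sample values are illustrative.

from pandas.util._validators import (
    validate_bool_kwarg,
    validate_endpoints,
    validate_inclusive,
    validate_percentile,
)

validate_bool_kwarg(True, "inplace")   # returns True
validate_percentile([0.25, 0.75])      # returns array([0.25, 0.75])
validate_endpoints("left")             # returns (True, False)
validate_inclusive("both")             # returns (True, True)

try:
    validate_percentile([0.25, 75])    # 75 is outside [0, 1]
except ValueError as err:
    print(err)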
diff --git a/contrib/python/pandas/py3/pandas/util/version/__init__.py b/contrib/python/pandas/py3/pandas/util/version/__init__.py
deleted file mode 100644
index 0b5e1d149da..00000000000
--- a/contrib/python/pandas/py3/pandas/util/version/__init__.py
+++ /dev/null
@@ -1,574 +0,0 @@
-# Vendored from https://github.com/pypa/packaging/blob/main/packaging/_structures.py
-# and https://github.com/pypa/packaging/blob/main/packaging/version.py
-# changeset ae891fd74d6dd4c6063bb04f2faeadaac6fc6313
-# 04/30/2021
-
-# This file is dual licensed under the terms of the Apache License, Version
-# 2.0, and the BSD License. See the LICENSE file in the root of this repository
-# for complete details.
-from __future__ import annotations
-
-import collections
-import itertools
-import re
-from typing import (
- Callable,
- Iterator,
- SupportsInt,
- Tuple,
- Union,
-)
-import warnings
-
-__all__ = ["parse", "Version", "LegacyVersion", "InvalidVersion", "VERSION_PATTERN"]
-
-
-class InfinityType:
- def __repr__(self) -> str:
- return "Infinity"
-
- def __hash__(self) -> int:
- return hash(repr(self))
-
- def __lt__(self, other: object) -> bool:
- return False
-
- def __le__(self, other: object) -> bool:
- return False
-
- def __eq__(self, other: object) -> bool:
- return isinstance(other, type(self))
-
- def __ne__(self, other: object) -> bool:
- return not isinstance(other, type(self))
-
- def __gt__(self, other: object) -> bool:
- return True
-
- def __ge__(self, other: object) -> bool:
- return True
-
- def __neg__(self: object) -> NegativeInfinityType:
- return NegativeInfinity
-
-
-Infinity = InfinityType()
-
-
-class NegativeInfinityType:
- def __repr__(self) -> str:
- return "-Infinity"
-
- def __hash__(self) -> int:
- return hash(repr(self))
-
- def __lt__(self, other: object) -> bool:
- return True
-
- def __le__(self, other: object) -> bool:
- return True
-
- def __eq__(self, other: object) -> bool:
- return isinstance(other, type(self))
-
- def __ne__(self, other: object) -> bool:
- return not isinstance(other, type(self))
-
- def __gt__(self, other: object) -> bool:
- return False
-
- def __ge__(self, other: object) -> bool:
- return False
-
- def __neg__(self: object) -> InfinityType:
- return Infinity
-
-
-NegativeInfinity = NegativeInfinityType()
-
-
-InfiniteTypes = Union[InfinityType, NegativeInfinityType]
-PrePostDevType = Union[InfiniteTypes, Tuple[str, int]]
-SubLocalType = Union[InfiniteTypes, int, str]
-LocalType = Union[
- NegativeInfinityType,
- Tuple[
- Union[
- SubLocalType,
- Tuple[SubLocalType, str],
- Tuple[NegativeInfinityType, SubLocalType],
- ],
- ...,
- ],
-]
-CmpKey = Tuple[
- int, Tuple[int, ...], PrePostDevType, PrePostDevType, PrePostDevType, LocalType
-]
-LegacyCmpKey = Tuple[int, Tuple[str, ...]]
-VersionComparisonMethod = Callable[
- [Union[CmpKey, LegacyCmpKey], Union[CmpKey, LegacyCmpKey]], bool
-]
-
-_Version = collections.namedtuple(
- "_Version", ["epoch", "release", "dev", "pre", "post", "local"]
-)
-
-
-def parse(version: str) -> LegacyVersion | Version:
- """
- Parse the given version string and return either a :class:`Version` object
- or a :class:`LegacyVersion` object depending on if the given version is
- a valid PEP 440 version or a legacy version.
- """
- try:
- return Version(version)
- except InvalidVersion:
- return LegacyVersion(version)
-
-
-class InvalidVersion(ValueError):
- """
- An invalid version was found, users should refer to PEP 440.
- """
-
-
-class _BaseVersion:
- _key: CmpKey | LegacyCmpKey
-
- def __hash__(self) -> int:
- return hash(self._key)
-
- # Please keep the duplicated `isinstance` check
- # in the six comparisons hereunder
- # unless you find a way to avoid adding overhead function calls.
- def __lt__(self, other: _BaseVersion) -> bool:
- if not isinstance(other, _BaseVersion):
- return NotImplemented
-
- return self._key < other._key
-
- def __le__(self, other: _BaseVersion) -> bool:
- if not isinstance(other, _BaseVersion):
- return NotImplemented
-
- return self._key <= other._key
-
- def __eq__(self, other: object) -> bool:
- if not isinstance(other, _BaseVersion):
- return NotImplemented
-
- return self._key == other._key
-
- def __ge__(self, other: _BaseVersion) -> bool:
- if not isinstance(other, _BaseVersion):
- return NotImplemented
-
- return self._key >= other._key
-
- def __gt__(self, other: _BaseVersion) -> bool:
- if not isinstance(other, _BaseVersion):
- return NotImplemented
-
- return self._key > other._key
-
- def __ne__(self, other: object) -> bool:
- if not isinstance(other, _BaseVersion):
- return NotImplemented
-
- return self._key != other._key
-
-
-class LegacyVersion(_BaseVersion):
- def __init__(self, version: str) -> None:
- self._version = str(version)
- self._key = _legacy_cmpkey(self._version)
-
- warnings.warn(
- "Creating a LegacyVersion has been deprecated and will be "
- "removed in the next major release.",
- DeprecationWarning,
- )
-
- def __str__(self) -> str:
- return self._version
-
- def __repr__(self) -> str:
- return f"<LegacyVersion('{self}')>"
-
- @property
- def public(self) -> str:
- return self._version
-
- @property
- def base_version(self) -> str:
- return self._version
-
- @property
- def epoch(self) -> int:
- return -1
-
- @property
- def release(self) -> None:
- return None
-
- @property
- def pre(self) -> None:
- return None
-
- @property
- def post(self) -> None:
- return None
-
- @property
- def dev(self) -> None:
- return None
-
- @property
- def local(self) -> None:
- return None
-
- @property
- def is_prerelease(self) -> bool:
- return False
-
- @property
- def is_postrelease(self) -> bool:
- return False
-
- @property
- def is_devrelease(self) -> bool:
- return False
-
-
-_legacy_version_component_re = re.compile(r"(\d+ | [a-z]+ | \.| -)", re.VERBOSE)
-
-_legacy_version_replacement_map = {
- "pre": "c",
- "preview": "c",
- "-": "final-",
- "rc": "c",
- "dev": "@",
-}
-
-
-def _parse_version_parts(s: str) -> Iterator[str]:
- for part in _legacy_version_component_re.split(s):
- part = _legacy_version_replacement_map.get(part, part)
-
- if not part or part == ".":
- continue
-
- if part[:1] in "0123456789":
- # pad for numeric comparison
- yield part.zfill(8)
- else:
- yield "*" + part
-
- # ensure that alpha/beta/candidate are before final
- yield "*final"
-
-
-def _legacy_cmpkey(version: str) -> LegacyCmpKey:
- # We hardcode an epoch of -1 here. A PEP 440 version can only have an epoch
- # greater than or equal to 0. This will effectively put the LegacyVersion,
- # which uses the de facto standard originally implemented by setuptools,
- # before all PEP 440 versions.
- epoch = -1
-
- # This scheme is taken from pkg_resources.parse_version in setuptools, prior to
- # its adoption of the packaging library.
- parts: list[str] = []
- for part in _parse_version_parts(version.lower()):
- if part.startswith("*"):
- # remove "-" before a prerelease tag
- if part < "*final":
- while parts and parts[-1] == "*final-":
- parts.pop()
-
- # remove trailing zeros from each series of numeric parts
- while parts and parts[-1] == "00000000":
- parts.pop()
-
- parts.append(part)
-
- return epoch, tuple(parts)
-
-
-# Deliberately not anchored to the start and end of the string, to make it
-# easier for 3rd party code to reuse
-VERSION_PATTERN = r"""
- v?
- (?:
- (?:(?P<epoch>[0-9]+)!)? # epoch
- (?P<release>[0-9]+(?:\.[0-9]+)*) # release segment
- (?P<pre> # pre-release
- [-_\.]?
- (?P<pre_l>(a|b|c|rc|alpha|beta|pre|preview))
- [-_\.]?
- (?P<pre_n>[0-9]+)?
- )?
- (?P<post> # post release
- (?:-(?P<post_n1>[0-9]+))
- |
- (?:
- [-_\.]?
- (?P<post_l>post|rev|r)
- [-_\.]?
- (?P<post_n2>[0-9]+)?
- )
- )?
- (?P<dev> # dev release
- [-_\.]?
- (?P<dev_l>dev)
- [-_\.]?
- (?P<dev_n>[0-9]+)?
- )?
- )
- (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))? # local version
-"""
-
-
-class Version(_BaseVersion):
- _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
-
- def __init__(self, version: str) -> None:
- # Validate the version and parse it into pieces
- match = self._regex.search(version)
- if not match:
- raise InvalidVersion(f"Invalid version: '{version}'")
-
- # Store the parsed out pieces of the version
- self._version = _Version(
- epoch=int(match.group("epoch")) if match.group("epoch") else 0,
- release=tuple(int(i) for i in match.group("release").split(".")),
- pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
- post=_parse_letter_version(
- match.group("post_l"), match.group("post_n1") or match.group("post_n2")
- ),
- dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
- local=_parse_local_version(match.group("local")),
- )
-
- # Generate a key which will be used for sorting
- self._key = _cmpkey(
- self._version.epoch,
- self._version.release,
- self._version.pre,
- self._version.post,
- self._version.dev,
- self._version.local,
- )
-
- def __repr__(self) -> str:
- return f"<Version('{self}')>"
-
- def __str__(self) -> str:
- parts = []
-
- # Epoch
- if self.epoch != 0:
- parts.append(f"{self.epoch}!")
-
- # Release segment
- parts.append(".".join([str(x) for x in self.release]))
-
- # Pre-release
- if self.pre is not None:
- parts.append("".join([str(x) for x in self.pre]))
-
- # Post-release
- if self.post is not None:
- parts.append(f".post{self.post}")
-
- # Development release
- if self.dev is not None:
- parts.append(f".dev{self.dev}")
-
- # Local version segment
- if self.local is not None:
- parts.append(f"+{self.local}")
-
- return "".join(parts)
-
- @property
- def epoch(self) -> int:
- _epoch: int = self._version.epoch
- return _epoch
-
- @property
- def release(self) -> tuple[int, ...]:
- _release: tuple[int, ...] = self._version.release
- return _release
-
- @property
- def pre(self) -> tuple[str, int] | None:
- _pre: tuple[str, int] | None = self._version.pre
- return _pre
-
- @property
- def post(self) -> int | None:
- return self._version.post[1] if self._version.post else None
-
- @property
- def dev(self) -> int | None:
- return self._version.dev[1] if self._version.dev else None
-
- @property
- def local(self) -> str | None:
- if self._version.local:
- return ".".join([str(x) for x in self._version.local])
- else:
- return None
-
- @property
- def public(self) -> str:
- return str(self).split("+", 1)[0]
-
- @property
- def base_version(self) -> str:
- parts = []
-
- # Epoch
- if self.epoch != 0:
- parts.append(f"{self.epoch}!")
-
- # Release segment
- parts.append(".".join([str(x) for x in self.release]))
-
- return "".join(parts)
-
- @property
- def is_prerelease(self) -> bool:
- return self.dev is not None or self.pre is not None
-
- @property
- def is_postrelease(self) -> bool:
- return self.post is not None
-
- @property
- def is_devrelease(self) -> bool:
- return self.dev is not None
-
- @property
- def major(self) -> int:
- return self.release[0] if len(self.release) >= 1 else 0
-
- @property
- def minor(self) -> int:
- return self.release[1] if len(self.release) >= 2 else 0
-
- @property
- def micro(self) -> int:
- return self.release[2] if len(self.release) >= 3 else 0
-
-
-def _parse_letter_version(
- letter: str, number: str | bytes | SupportsInt
-) -> tuple[str, int] | None:
- if letter:
- # We consider there to be an implicit 0 in a pre-release if there is
- # not a numeral associated with it.
- if number is None:
- number = 0
-
- # We normalize any letters to their lower case form
- letter = letter.lower()
-
- # We consider some words to be alternate spellings of other words and
- # in those cases we want to normalize the spellings to our preferred
- # spelling.
- if letter == "alpha":
- letter = "a"
- elif letter == "beta":
- letter = "b"
- elif letter in ["c", "pre", "preview"]:
- letter = "rc"
- elif letter in ["rev", "r"]:
- letter = "post"
-
- return letter, int(number)
- if not letter and number:
- # We assume if we are given a number, but we are not given a letter
- # then this is using the implicit post release syntax (e.g. 1.0-1)
- letter = "post"
-
- return letter, int(number)
-
- return None
-
-
-_local_version_separators = re.compile(r"[\._-]")
-
-
-def _parse_local_version(local: str) -> LocalType | None:
- """
- Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
- """
- if local is not None:
- return tuple(
- part.lower() if not part.isdigit() else int(part)
- for part in _local_version_separators.split(local)
- )
- return None
-
-
-def _cmpkey(
- epoch: int,
- release: tuple[int, ...],
- pre: tuple[str, int] | None,
- post: tuple[str, int] | None,
- dev: tuple[str, int] | None,
- local: tuple[SubLocalType] | None,
-) -> CmpKey:
- # When we compare a release version, we want to compare it with all of the
- # trailing zeros removed. So we'll reverse the list, drop all the now-leading
- # zeros until we come to something non-zero, then re-reverse the rest back
- # into the correct order, make it a tuple, and use that for our sorting key.
- _release = tuple(
- reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
- )
-
- # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
- # We'll do this by abusing the pre segment, but we _only_ want to do this
- # if there is not a pre or a post segment. If we have one of those then
- # the normal sorting rules will handle this case correctly.
- if pre is None and post is None and dev is not None:
- _pre: PrePostDevType = NegativeInfinity
- # Versions without a pre-release (except as noted above) should sort after
- # those with one.
- elif pre is None:
- _pre = Infinity
- else:
- _pre = pre
-
- # Versions without a post segment should sort before those with one.
- if post is None:
- _post: PrePostDevType = NegativeInfinity
-
- else:
- _post = post
-
- # Versions without a development segment should sort after those with one.
- if dev is None:
- _dev: PrePostDevType = Infinity
-
- else:
- _dev = dev
-
- if local is None:
- # Versions without a local segment should sort before those with one.
- _local: LocalType = NegativeInfinity
- else:
- # Versions with a local segment need that segment parsed to implement
- # the sorting rules in PEP440.
- # - Alpha numeric segments sort before numeric segments
- # - Alpha numeric segments sort lexicographically
- # - Numeric segments sort numerically
- # - Shorter versions sort before longer versions when the prefixes
- # match exactly
- _local = tuple(
- (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
- )
-
- return epoch, _release, _pre, _post, _dev, _local
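
A short sketch of the vendored Version class above; the same API is reachable as pandas.util.version.Version in an installed pandas (and as packaging.version.Version upstream). The version strings are illustrative.

from pandas.util.version import Version

assert Version("2.0.3") >= Version("2.0.0")
assert Version("2.0.0rc1") < Version("2.0.0")      # pre-releases sort before final releases
assert Version("1.4.0.dev0").is_prerelease         # dev releases count as pre-releases
assert Version("2.0.3+local.1").public == "2.0.3"  # local segment is stripped from .public
print(Version("2.0.3").major, Version("2.0.3").minor, Version("2.0.3").micro)  # 2 0 3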
diff --git a/contrib/python/pandas/py3/patches/01-arcadia.patch b/contrib/python/pandas/py3/patches/01-arcadia.patch
deleted file mode 100644
index b228067b37a..00000000000
--- a/contrib/python/pandas/py3/patches/01-arcadia.patch
+++ /dev/null
@@ -1,33 +0,0 @@
---- contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h (index)
-+++ contrib/python/pandas/py3/pandas/_libs/src/klib/khash_python.h (working tree)
-@@ -23,1 +23,1 @@ typedef npy_complex128 khcomplex128_t;
--void *traced_malloc(size_t size){
-+static void *traced_malloc(size_t size){
-@@ -31,1 +31,1 @@ void *traced_malloc(size_t size){
--void *traced_calloc(size_t num, size_t size){
-+static void *traced_calloc(size_t num, size_t size){
-@@ -39,1 +39,1 @@ void *traced_calloc(size_t num, size_t size){
--void *traced_realloc(void* old_ptr, size_t size){
-+static void *traced_realloc(void* old_ptr, size_t size){
-@@ -50,1 +50,1 @@ void *traced_realloc(void* old_ptr, size_t size){
--void traced_free(void* ptr){
-+static void traced_free(void* ptr){
---- contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c (index)
-+++ contrib/python/pandas/py3/pandas/_libs/src/ujson/python/date_conversions.c (working tree)
-@@ -9,2 +9,2 @@ The full license is in the LICENSE file, distributed with this software.
--#include <../../../tslibs/src/datetime/np_datetime.h>
--#include <../../../tslibs/src/datetime/np_datetime_strings.h>
-+#include "../../../tslibs/src/datetime/np_datetime.h"
-+#include "../../../tslibs/src/datetime/np_datetime_strings.h"
---- contrib/python/pandas/py3/pandas/compat/_optional.py (index)
-+++ contrib/python/pandas/py3/pandas/compat/_optional.py (working tree)
-@@ -41,1 +41,1 @@ VERSIONS = {
-- "sqlalchemy": "1.4.16",
-+ "sqlalchemy": "1.2.0",
---- contrib/python/pandas/py3/ya.make (index)
-+++ contrib/python/pandas/py3/ya.make (working tree)
-@@ -44,2 +44,4 @@ CFLAGS(
-
-+INCLUDE(symbols.cmake)
-+
- SRCS(
diff --git a/contrib/python/pandas/py3/symbols.cmake b/contrib/python/pandas/py3/symbols.cmake
deleted file mode 100644
index 16e7d04424c..00000000000
--- a/contrib/python/pandas/py3/symbols.cmake
+++ /dev/null
@@ -1,180 +0,0 @@
-CFLAGS(
- -DBuffer_AppendDoubleUnchecked=_pandas_Buffer_AppendDoubleUnchecked
- -DBuffer_AppendIndentNewlineUnchecked=_pandac_Buffer_AppendIndentNewlineUnchecked
- -DBuffer_AppendIndentUnchecked=_pandas_Buffer_AppendIndentUnchecked
- -DBuffer_AppendIntUnchecked=_pandas_Buffer_AppendIntUnchecked
- -DBuffer_AppendLongUnchecked=_pandas_Buffer_AppendLongUnchecked
- -DBuffer_EscapeStringUnvalidated=_pandas_Buffer_EscapeStringUnvalidated
- -DBuffer_EscapeStringValidated=_pandas_Buffer_EscapeStringValidated
- -DBuffer_Realloc=_pandas_Buffer_Realloc
- -DDataFrame_iterBegin=_pandas_DataFrame_iterBegin
- -DDataFrame_iterEnd=_pandas_DataFrame_iterEnd
- -DDataFrame_iterGetName=_pandas_DataFrame_iterGetName
- -DDataFrame_iterGetValue=_pandas_DataFrame_iterGetValue
- -DDataFrame_iterNext=_pandas_DataFrame_iterNext
- -DDict_iterBegin=_pandas_Dict_iterBegin
- -DDict_iterEnd=_pandas_Dict_iterEnd
- -DDict_iterGetName=_pandas_Dict_iterGetName
- -DDict_iterGetValue=_pandas_Dict_iterGetValue
- -DDict_iterNext=_pandas_Dict_iterNext
- -DDir_iterBegin=_pandas_Dir_iterBegin
- -DDir_iterEnd=_pandas_Dir_iterEnd
- -DDir_iterGetName=_pandas_Dir_iterGetName
- -DDir_iterGetValue=_pandas_Dir_iterGetValue
- -DDir_iterNext=_pandas_Dir_iterNext
- -DIndex_iterBegin=_pandas_Index_iterBegin
- -DIndex_iterEnd=_pandas_Index_iterEnd
- -DIndex_iterGetName=_pandas_Index_iterGetName
- -DIndex_iterGetValue=_pandas_Index_iterGetValue
- -DIndex_iterNext=_pandas_Index_iterNext
- -DIter_iterBegin=_pandas_Iter_iterBegin
- -DIter_iterEnd=_pandas_Iter_iterEnd
- -DIter_iterGetName=_pandas_Iter_iterGetName
- -DIter_iterGetValue=_pandas_Iter_iterGetValue
- -DIter_iterNext=_pandas_Iter_iterNext
- -DJSONFileToObj=_pandas_JSONFileToObj
- -DJSONToObj=_pandas_JSONToObj
- -DJSON_DecodeObject=_pandas_JSON_DecodeObject
- -DJSON_EncodeObject=_pandas_JSON_EncodeObject
- -Dobject_is_decimal_type=_pandas_object_is_decimal_type
- -DList_iterBegin=_pandas_List_iterBegin
- -DList_iterEnd=_pandas_List_iterEnd
- -DList_iterGetName=_pandas_List_iterGetName
- -DList_iterGetValue=_pandas_List_iterGetValue
- -DList_iterNext=_pandas_List_iterNext
- -DNpyArrPassThru_iterBegin=_pandas_NpyArrPassThru_iterBegin
- -DNpyArrPassThru_iterEnd=_pandas_NpyArrPassThru_iterEnd
- -DNpyArr_encodeLabels=_pandas_NpyArr_encodeLabels
- -DNpyArr_freeLabels=_pandas_NpyArr_freeLabels
- -DNpyArr_iterBegin=_pandas_NpyArr_iterBegin
- -DNpyArr_iterEnd=_pandas_NpyArr_iterEnd
- -DNpyArr_iterGetName=_pandas_NpyArr_iterGetName
- -DNpyArr_iterGetValue=_pandas_NpyArr_iterGetValue
- -DNpyArr_iterNext=_pandas_NpyArr_iterNext
- -DNpyArr_iterNextItem=_pandas_NpyArr_iterNextItem
- -DNpyArr_iterNextNone=_pandas_NpyArr_iterNextNone
- -DNpy_releaseContext=_pandas_Npy_releaseContext
- -DNpy_returnLabelled=_pandas_Npy_returnLabelled
- -DObject_arrayAddItem=_pandas_Object_arrayAddItem
- -DObject_beginTypeContext=_pandas_Object_beginTypeContext
- -DObject_endArray=_pandas_Object_endArray
- -DObject_endObject=_pandas_Object_endObject
- -DObject_endTypeContext=_pandas_Object_endTypeContext
- -DObject_getDoubleValue=_pandas_Object_getDoubleValue
- -DObject_getIntValue=_pandas_Object_getIntValue
- -DObject_getLongValue=_pandas_Object_getLongValue
- -DObject_getStringValue=_pandas_Object_getStringValue
- -DObject_invokeDefaultHandler=_pandas_Object_invokeDefaultHandler
- -DObject_iterBegin=_pandas_Object_iterBegin
- -DObject_iterEnd=_pandas_Object_iterEnd
- -DObject_iterGetName=_pandas_Object_iterGetName
- -DObject_iterGetValue=_pandas_Object_iterGetValue
- -DObject_iterNext=_pandas_Object_iterNext
- -DObject_newArray=_pandas_Object_newArray
- -DObject_newDouble=_pandas_Object_newDouble
- -DObject_newFalse=_pandas_Object_newFalse
- -DObject_newInteger=_pandas_Object_newInteger
- -DObject_newLong=_pandas_Object_newLong
- -DObject_newNull=_pandas_Object_newNull
- -DObject_newObject=_pandas_Object_newObject
- -DObject_newString=_pandas_Object_newString
- -DObject_newTrue=_pandas_Object_newTrue
- -DObject_npyArrayAddItem=_pandas_Object_npyArrayAddItem
- -DObject_npyArrayListAddItem=_pandas_Object_npyArrayListAddItem
- -DObject_npyEndArray=_pandas_Object_npyEndArray
- -DObject_npyEndArrayList=_pandas_Object_npyEndArrayList
- -DObject_npyEndObject=_pandas_Object_npyEndObject
- -DObject_npyNewArray=_pandas_Object_npyNewArray
- -DObject_npyNewArrayList=_pandas_Object_npyNewArrayList
- -DObject_npyNewObject=_pandas_Object_npyNewObject
- -DObject_npyObjectAddKey=_pandas_Object_npyObjectAddKey
- -DObject_objectAddKey=_pandas_Object_objectAddKey
- -DPdBlockPassThru_iterBegin=_pandas_PdBlockPassThru_iterBegin
- -DPdBlockPassThru_iterEnd=_pandas_PdBlockPassThru_iterEnd
- -DPdBlock_iterBegin=_pandas_PdBlock_iterBegin
- -DPdBlock_iterEnd=_pandas_PdBlock_iterEnd
- -DPdBlock_iterGetName=_pandas_PdBlock_iterGetName
- -DPdBlock_iterGetName_Transpose=_pandas_PdBlock_iterGetName_Transpose
- -DPdBlock_iterNext=_pandas_PdBlock_iterNext
- -DPdBlock_iterNextItem=_pandas_PdBlock_iterNextItem
- -DSeries_iterBegin=_pandas_Series_iterBegin
- -DSeries_iterEnd=_pandas_Series_iterEnd
- -DSeries_iterGetName=_pandas_Series_iterGetName
- -DSeries_iterGetValue=_pandas_Series_iterGetValue
- -DSeries_iterNext=_pandas_Series_iterNext
- -DSkipWhitespace=_pandas_SkipWhitespace
- -DTuple_iterBegin=_pandas_Tuple_iterBegin
- -DTuple_iterEnd=_pandas_Tuple_iterEnd
- -DTuple_iterGetName=_pandas_Tuple_iterGetName
- -DTuple_iterGetValue=_pandas_Tuple_iterGetValue
- -DTuple_iterNext=_pandas_Tuple_iterNext
- -DUJSON_NUMPY=_pandas_UJSON_NUMPY
- -D_NS_MAX_DTS=_pandas__NS_MAX_DTS
- -D_NS_MIN_DTS=_pandas__NS_MIN_DTS
- -D_tokenize_helper=_pandas__tokenize_helper
- -Dadd_minutes_to_datetimestruct=_pandas_add_minutes_to_datetimestruct
- -Dadd_seconds_to_datetimestruct=_pandas_add_seconds_to_datetimestruct
- -Dbuffer_file_bytes=_pandas_buffer_file_bytes
- -Dbuffer_mmap_bytes=_pandas_buffer_mmap_bytes
- -Dbuffer_rd_bytes=_pandas_buffer_rd_bytes
- -Dcmp_npy_datetimestruct=_pandas_cmp_npy_datetimestruct
- -Dcoliter_new=_pandas_coliter_new
- -Dcoliter_setup=_pandas_coliter_setup
- -Dconvert_pydatetime_to_datetimestruct=_pandas_convert_pydatetime_to_datetimestruct
- -DcreateDouble=_pandas_createDouble
- -Ddays_per_month_table=_pandas_days_per_month_table
- -DdecodePreciseFloat=_pandas_decodePreciseFloat
- -Ddecode_any=_pandas_decode_any
- -Ddecode_array=_pandas_decode_array
- -Ddecode_false=_pandas_decode_false
- -Ddecode_null=_pandas_decode_null
- -Ddecode_numeric=_pandas_decode_numeric
- -Ddecode_object=_pandas_decode_object
- -Ddecode_string=_pandas_decode_string
- -Ddecode_true=_pandas_decode_true
- -Ddel_file_source=_pandas_del_file_source
- -Ddel_mmap=_pandas_del_mmap
- -Ddel_rd_source=_pandas_del_rd_source
- -Dencode=_pandas_encode
- -Dfloatify=_pandas_floatify
- -Dget_datetime_iso_8601_strlen=_pandas_get_datetime_iso_8601_strlen
- -Dget_datetimestruct_days=_pandas_get_datetimestruct_days
- -Dget_nat=_pandas_get_nat
- -Dget_parser_memory_footprint=_pandas_get_parser_memory_footprint
- -DinitObjToJSON=_pandas_initObjToJSON
- -Dis_leapyear=_pandas_is_leapyear
- -Dmake_iso_8601_datetime=_pandas_make_iso_8601_datetime
- -Dnew_file_source=_pandas_new_file_source
- -Dnew_mmap=_pandas_new_mmap
- -Dnew_rd_source=_pandas_new_rd_source
- -Dnpy_datetimestruct_to_datetime=_pandas_npy_datetimestruct_to_datetime
- -DobjToJSON=_pandas_objToJSON
- -DobjToJSONFile=_pandas_objToJSONFile
- -Dparse_iso_8601_datetime=_pandas_parse_iso_8601_datetime
- -Dparser_add_skiprow=_pandas_parser_add_skiprow
- -Dparser_cleanup=_pandas_parser_cleanup
- -Dparser_clear_data_buffers=_pandas_parser_clear_data_buffers
- -Dparser_consume_rows=_pandas_parser_consume_rows
- -Dparser_del=_pandas_parser_del
- -Dparser_free=_pandas_parser_free
- -Dparser_init=_pandas_parser_init
- -Dparser_new=_pandas_parser_new
- -Dparser_set_default_options=_pandas_parser_set_default_options
- -Dparser_set_skipfirstnrows=_pandas_parser_set_skipfirstnrows
- -Dparser_trim_buffers=_pandas_parser_trim_buffers
- -Dprecise_xstrtod=_pandas_precise_xstrtod
- -Dround_trip=_pandas_round_trip
- -Dskip_this_line=_pandas_skip_this_line
- -Dstr_to_int64=_pandas_str_to_int64
- -Dstr_to_uint64=_pandas_str_to_uint64
- -Dto_boolean=_pandas_to_boolean
- -Dto_double=_pandas_to_double
- -Dtokenize_all_rows=_pandas_tokenize_all_rows
- -Dtokenize_bytes=_pandas_tokenize_bytes
- -Dtokenize_nrows=_pandas_tokenize_nrows
- -Duint64_conflict=_pandas_uint64_conflict
- -Duint_state_init=_pandas_uint_state_init
- -Dxstrtod=_pandas_xstrtod
- -DObject_newUnsignedLong=_pandas_Object_newUnsignedLong
- -Dget_datetime_metadata_from_dtype=_pandas_get_datetime_metadata_from_dtype
-)
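
Note on the block of -D flags removed above: each one renames a C identifier at the preprocessor level so that the symbols compiled from pandas' bundled C sources (the ujson encoder/decoder, the CSV tokenizer, the np_datetime helpers) come out with a _pandas_ prefix and cannot collide with identically named symbols from other statically linked libraries in the same binary. A minimal sketch of the mechanism, using the real define -Dxstrtod=_pandas_xstrtod from the list above but an illustrative signature and body:

    /* xstrtod.c -- toy stand-in for a vendored source that defines a
     * commonly used name.  With -Dxstrtod=_pandas_xstrtod on the
     * command line, the preprocessor rewrites every occurrence of the
     * token, so the object file exports _pandas_xstrtod instead. */
    #include <stdlib.h>

    double xstrtod(const char *s)
    {
        return strtod(s, NULL);
    }

Building it as `cc -c -Dxstrtod=_pandas_xstrtod xstrtod.c` avoids a duplicate-symbol error if another library in the link also ships an xstrtod; internal callers are renamed by the same define, so they still resolve to the prefixed symbol.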
diff --git a/contrib/python/pandas/py3/ya.make b/contrib/python/pandas/py3/ya.make
deleted file mode 100644
index 47987ac8c43..00000000000
--- a/contrib/python/pandas/py3/ya.make
+++ /dev/null
@@ -1,465 +0,0 @@
-# Generated by devtools/yamaker (pypi).
-
-PY3_LIBRARY()
-
-VERSION(2.0.3)
-
-LICENSE(BSD-3-Clause)
-
-PEERDIR(
- contrib/python/numpy
- contrib/python/python-dateutil
- contrib/python/pytz
- contrib/python/tzdata
-)
-
-ADDINCL(
- FOR cython contrib/python/pandas/py3
- contrib/python/pandas/py3/pandas/_libs
- contrib/python/pandas/py3/pandas/_libs/src
- contrib/python/pandas/py3/pandas/_libs/src/klib
- contrib/python/pandas/py3/pandas/_libs/src/ujson/lib
- contrib/python/pandas/py3/pandas/_libs/tslibs
-)
-
-NO_COMPILER_WARNINGS()
-
-NO_LINT()
-
-NO_CHECK_IMPORTS(
- pandas._testing._hypothesis
- pandas.core._numba.*
- pandas.core.arrays.arrow.*
- pandas.io.*
- pandas.plotting.*
- pandas.tseries.*
- pandas.util.*
-)
-
-CFLAGS(
- -DNPY_NO_DEPRECATED_API=0
-)
-
-INCLUDE(symbols.cmake)
-
-SRCS(
- pandas/_libs/src/parser/io.c
- pandas/_libs/src/parser/tokenizer.c
- pandas/_libs/src/ujson/lib/ultrajsondec.c
- pandas/_libs/src/ujson/lib/ultrajsonenc.c
- pandas/_libs/src/ujson/python/JSONtoObj.c
- pandas/_libs/src/ujson/python/date_conversions.c
- pandas/_libs/src/ujson/python/objToJSON.c
- pandas/_libs/src/ujson/python/ujson.c
- pandas/_libs/tslibs/src/datetime/np_datetime.c
- pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
-)
-
-PY_REGISTER(
- pandas._libs.json
-)
-
-PY_SRCS(
- TOP_LEVEL
- pandas/__init__.py
- pandas/_config/__init__.py
- pandas/_config/config.py
- pandas/_config/dates.py
- pandas/_config/display.py
- pandas/_config/localization.py
- pandas/_libs/__init__.py
- pandas/_libs/algos.pyi
- pandas/_libs/arrays.pyi
- pandas/_libs/groupby.pyi
- pandas/_libs/hashing.pyi
- pandas/_libs/hashtable.pyi
- pandas/_libs/index.pyi
- pandas/_libs/indexing.pyi
- pandas/_libs/internals.pyi
- pandas/_libs/interval.pyi
- pandas/_libs/join.pyi
- pandas/_libs/json.pyi
- pandas/_libs/lib.pyi
- pandas/_libs/missing.pyi
- pandas/_libs/ops.pyi
- pandas/_libs/ops_dispatch.pyi
- pandas/_libs/parsers.pyi
- pandas/_libs/properties.pyi
- pandas/_libs/reduction.pyi
- pandas/_libs/reshape.pyi
- pandas/_libs/sparse.pyi
- pandas/_libs/testing.pyi
- pandas/_libs/tslib.pyi
- pandas/_libs/tslibs/__init__.py
- pandas/_libs/tslibs/ccalendar.pyi
- pandas/_libs/tslibs/conversion.pyi
- pandas/_libs/tslibs/dtypes.pyi
- pandas/_libs/tslibs/fields.pyi
- pandas/_libs/tslibs/nattype.pyi
- pandas/_libs/tslibs/np_datetime.pyi
- pandas/_libs/tslibs/offsets.pyi
- pandas/_libs/tslibs/parsing.pyi
- pandas/_libs/tslibs/period.pyi
- pandas/_libs/tslibs/strptime.pyi
- pandas/_libs/tslibs/timedeltas.pyi
- pandas/_libs/tslibs/timestamps.pyi
- pandas/_libs/tslibs/timezones.pyi
- pandas/_libs/tslibs/tzconversion.pyi
- pandas/_libs/tslibs/vectorized.pyi
- pandas/_libs/window/__init__.py
- pandas/_libs/window/aggregations.pyi
- pandas/_libs/window/indexers.pyi
- pandas/_libs/writers.pyi
- pandas/_testing/__init__.py
- pandas/_testing/_hypothesis.py
- pandas/_testing/_io.py
- pandas/_testing/_random.py
- pandas/_testing/_warnings.py
- pandas/_testing/asserters.py
- pandas/_testing/compat.py
- pandas/_testing/contexts.py
- pandas/_typing.py
- pandas/_version.py
- pandas/api/__init__.py
- pandas/api/extensions/__init__.py
- pandas/api/indexers/__init__.py
- pandas/api/interchange/__init__.py
- pandas/api/types/__init__.py
- pandas/arrays/__init__.py
- pandas/compat/__init__.py
- pandas/compat/_constants.py
- pandas/compat/_optional.py
- pandas/compat/compressors.py
- pandas/compat/numpy/__init__.py
- pandas/compat/numpy/function.py
- pandas/compat/pickle_compat.py
- pandas/compat/pyarrow.py
- pandas/core/__init__.py
- pandas/core/_numba/__init__.py
- pandas/core/_numba/executor.py
- pandas/core/_numba/kernels/__init__.py
- pandas/core/_numba/kernels/mean_.py
- pandas/core/_numba/kernels/min_max_.py
- pandas/core/_numba/kernels/shared.py
- pandas/core/_numba/kernels/sum_.py
- pandas/core/_numba/kernels/var_.py
- pandas/core/accessor.py
- pandas/core/algorithms.py
- pandas/core/api.py
- pandas/core/apply.py
- pandas/core/array_algos/__init__.py
- pandas/core/array_algos/datetimelike_accumulations.py
- pandas/core/array_algos/masked_accumulations.py
- pandas/core/array_algos/masked_reductions.py
- pandas/core/array_algos/putmask.py
- pandas/core/array_algos/quantile.py
- pandas/core/array_algos/replace.py
- pandas/core/array_algos/take.py
- pandas/core/array_algos/transforms.py
- pandas/core/arraylike.py
- pandas/core/arrays/__init__.py
- pandas/core/arrays/_mixins.py
- pandas/core/arrays/_ranges.py
- pandas/core/arrays/arrow/__init__.py
- pandas/core/arrays/arrow/_arrow_utils.py
- pandas/core/arrays/arrow/array.py
- pandas/core/arrays/arrow/dtype.py
- pandas/core/arrays/arrow/extension_types.py
- pandas/core/arrays/base.py
- pandas/core/arrays/boolean.py
- pandas/core/arrays/categorical.py
- pandas/core/arrays/datetimelike.py
- pandas/core/arrays/datetimes.py
- pandas/core/arrays/floating.py
- pandas/core/arrays/integer.py
- pandas/core/arrays/interval.py
- pandas/core/arrays/masked.py
- pandas/core/arrays/numeric.py
- pandas/core/arrays/numpy_.py
- pandas/core/arrays/period.py
- pandas/core/arrays/sparse/__init__.py
- pandas/core/arrays/sparse/accessor.py
- pandas/core/arrays/sparse/array.py
- pandas/core/arrays/sparse/dtype.py
- pandas/core/arrays/sparse/scipy_sparse.py
- pandas/core/arrays/string_.py
- pandas/core/arrays/string_arrow.py
- pandas/core/arrays/timedeltas.py
- pandas/core/base.py
- pandas/core/common.py
- pandas/core/computation/__init__.py
- pandas/core/computation/align.py
- pandas/core/computation/api.py
- pandas/core/computation/check.py
- pandas/core/computation/common.py
- pandas/core/computation/engines.py
- pandas/core/computation/eval.py
- pandas/core/computation/expr.py
- pandas/core/computation/expressions.py
- pandas/core/computation/ops.py
- pandas/core/computation/parsing.py
- pandas/core/computation/pytables.py
- pandas/core/computation/scope.py
- pandas/core/config_init.py
- pandas/core/construction.py
- pandas/core/dtypes/__init__.py
- pandas/core/dtypes/api.py
- pandas/core/dtypes/astype.py
- pandas/core/dtypes/base.py
- pandas/core/dtypes/cast.py
- pandas/core/dtypes/common.py
- pandas/core/dtypes/concat.py
- pandas/core/dtypes/dtypes.py
- pandas/core/dtypes/generic.py
- pandas/core/dtypes/inference.py
- pandas/core/dtypes/missing.py
- pandas/core/flags.py
- pandas/core/frame.py
- pandas/core/generic.py
- pandas/core/groupby/__init__.py
- pandas/core/groupby/base.py
- pandas/core/groupby/categorical.py
- pandas/core/groupby/generic.py
- pandas/core/groupby/groupby.py
- pandas/core/groupby/grouper.py
- pandas/core/groupby/indexing.py
- pandas/core/groupby/numba_.py
- pandas/core/groupby/ops.py
- pandas/core/indexers/__init__.py
- pandas/core/indexers/objects.py
- pandas/core/indexers/utils.py
- pandas/core/indexes/__init__.py
- pandas/core/indexes/accessors.py
- pandas/core/indexes/api.py
- pandas/core/indexes/base.py
- pandas/core/indexes/category.py
- pandas/core/indexes/datetimelike.py
- pandas/core/indexes/datetimes.py
- pandas/core/indexes/extension.py
- pandas/core/indexes/frozen.py
- pandas/core/indexes/interval.py
- pandas/core/indexes/multi.py
- pandas/core/indexes/period.py
- pandas/core/indexes/range.py
- pandas/core/indexes/timedeltas.py
- pandas/core/indexing.py
- pandas/core/interchange/__init__.py
- pandas/core/interchange/buffer.py
- pandas/core/interchange/column.py
- pandas/core/interchange/dataframe.py
- pandas/core/interchange/dataframe_protocol.py
- pandas/core/interchange/from_dataframe.py
- pandas/core/interchange/utils.py
- pandas/core/internals/__init__.py
- pandas/core/internals/api.py
- pandas/core/internals/array_manager.py
- pandas/core/internals/base.py
- pandas/core/internals/blocks.py
- pandas/core/internals/concat.py
- pandas/core/internals/construction.py
- pandas/core/internals/managers.py
- pandas/core/internals/ops.py
- pandas/core/methods/__init__.py
- pandas/core/methods/describe.py
- pandas/core/methods/selectn.py
- pandas/core/methods/to_dict.py
- pandas/core/missing.py
- pandas/core/nanops.py
- pandas/core/ops/__init__.py
- pandas/core/ops/array_ops.py
- pandas/core/ops/common.py
- pandas/core/ops/dispatch.py
- pandas/core/ops/docstrings.py
- pandas/core/ops/invalid.py
- pandas/core/ops/mask_ops.py
- pandas/core/ops/methods.py
- pandas/core/ops/missing.py
- pandas/core/resample.py
- pandas/core/reshape/__init__.py
- pandas/core/reshape/api.py
- pandas/core/reshape/concat.py
- pandas/core/reshape/encoding.py
- pandas/core/reshape/melt.py
- pandas/core/reshape/merge.py
- pandas/core/reshape/pivot.py
- pandas/core/reshape/reshape.py
- pandas/core/reshape/tile.py
- pandas/core/reshape/util.py
- pandas/core/roperator.py
- pandas/core/sample.py
- pandas/core/series.py
- pandas/core/shared_docs.py
- pandas/core/sorting.py
- pandas/core/sparse/__init__.py
- pandas/core/sparse/api.py
- pandas/core/strings/__init__.py
- pandas/core/strings/accessor.py
- pandas/core/strings/base.py
- pandas/core/strings/object_array.py
- pandas/core/tools/__init__.py
- pandas/core/tools/datetimes.py
- pandas/core/tools/numeric.py
- pandas/core/tools/timedeltas.py
- pandas/core/tools/times.py
- pandas/core/util/__init__.py
- pandas/core/util/hashing.py
- pandas/core/util/numba_.py
- pandas/core/window/__init__.py
- pandas/core/window/common.py
- pandas/core/window/doc.py
- pandas/core/window/ewm.py
- pandas/core/window/expanding.py
- pandas/core/window/numba_.py
- pandas/core/window/online.py
- pandas/core/window/rolling.py
- pandas/errors/__init__.py
- pandas/io/__init__.py
- pandas/io/_util.py
- pandas/io/api.py
- pandas/io/clipboard/__init__.py
- pandas/io/clipboards.py
- pandas/io/common.py
- pandas/io/excel/__init__.py
- pandas/io/excel/_base.py
- pandas/io/excel/_odfreader.py
- pandas/io/excel/_odswriter.py
- pandas/io/excel/_openpyxl.py
- pandas/io/excel/_pyxlsb.py
- pandas/io/excel/_util.py
- pandas/io/excel/_xlrd.py
- pandas/io/excel/_xlsxwriter.py
- pandas/io/feather_format.py
- pandas/io/formats/__init__.py
- pandas/io/formats/_color_data.py
- pandas/io/formats/console.py
- pandas/io/formats/css.py
- pandas/io/formats/csvs.py
- pandas/io/formats/excel.py
- pandas/io/formats/format.py
- pandas/io/formats/html.py
- pandas/io/formats/info.py
- pandas/io/formats/latex.py
- pandas/io/formats/printing.py
- pandas/io/formats/string.py
- pandas/io/formats/style.py
- pandas/io/formats/style_render.py
- pandas/io/formats/xml.py
- pandas/io/gbq.py
- pandas/io/html.py
- pandas/io/json/__init__.py
- pandas/io/json/_json.py
- pandas/io/json/_normalize.py
- pandas/io/json/_table_schema.py
- pandas/io/orc.py
- pandas/io/parquet.py
- pandas/io/parsers/__init__.py
- pandas/io/parsers/arrow_parser_wrapper.py
- pandas/io/parsers/base_parser.py
- pandas/io/parsers/c_parser_wrapper.py
- pandas/io/parsers/python_parser.py
- pandas/io/parsers/readers.py
- pandas/io/pickle.py
- pandas/io/pytables.py
- pandas/io/sas/__init__.py
- pandas/io/sas/_byteswap.pyi
- pandas/io/sas/_sas.pyi
- pandas/io/sas/sas7bdat.py
- pandas/io/sas/sas_constants.py
- pandas/io/sas/sas_xport.py
- pandas/io/sas/sasreader.py
- pandas/io/spss.py
- pandas/io/sql.py
- pandas/io/stata.py
- pandas/io/xml.py
- pandas/plotting/__init__.py
- pandas/plotting/_core.py
- pandas/plotting/_matplotlib/__init__.py
- pandas/plotting/_matplotlib/boxplot.py
- pandas/plotting/_matplotlib/converter.py
- pandas/plotting/_matplotlib/core.py
- pandas/plotting/_matplotlib/groupby.py
- pandas/plotting/_matplotlib/hist.py
- pandas/plotting/_matplotlib/misc.py
- pandas/plotting/_matplotlib/style.py
- pandas/plotting/_matplotlib/timeseries.py
- pandas/plotting/_matplotlib/tools.py
- pandas/plotting/_misc.py
- pandas/testing.py
- pandas/tseries/__init__.py
- pandas/tseries/api.py
- pandas/tseries/frequencies.py
- pandas/tseries/holiday.py
- pandas/tseries/offsets.py
- pandas/util/__init__.py
- pandas/util/_decorators.py
- pandas/util/_doctools.py
- pandas/util/_exceptions.py
- pandas/util/_print_versions.py
- pandas/util/_str_methods.py
- pandas/util/_test_decorators.py
- pandas/util/_tester.py
- pandas/util/_validators.py
- pandas/util/version/__init__.py
- CYTHON_DIRECTIVE
- language_level=3
- CYTHON_C
- pandas/_libs/algos.pyx
- pandas/_libs/arrays.pyx
- pandas/_libs/groupby.pyx
- pandas/_libs/hashing.pyx
- pandas/_libs/hashtable.pyx
- pandas/_libs/index.pyx
- pandas/_libs/indexing.pyx
- pandas/_libs/internals.pyx
- pandas/_libs/interval.pyx
- pandas/_libs/join.pyx
- pandas/_libs/lib.pyx
- pandas/_libs/missing.pyx
- pandas/_libs/ops.pyx
- pandas/_libs/ops_dispatch.pyx
- pandas/_libs/parsers.pyx
- pandas/_libs/properties.pyx
- pandas/_libs/reduction.pyx
- pandas/_libs/reshape.pyx
- pandas/_libs/sparse.pyx
- pandas/_libs/testing.pyx
- pandas/_libs/tslib.pyx
- pandas/_libs/tslibs/base.pyx
- pandas/_libs/tslibs/ccalendar.pyx
- pandas/_libs/tslibs/conversion.pyx
- pandas/_libs/tslibs/dtypes.pyx
- pandas/_libs/tslibs/fields.pyx
- pandas/_libs/tslibs/nattype.pyx
- pandas/_libs/tslibs/np_datetime.pyx
- pandas/_libs/tslibs/offsets.pyx
- pandas/_libs/tslibs/parsing.pyx
- pandas/_libs/tslibs/period.pyx
- pandas/_libs/tslibs/strptime.pyx
- pandas/_libs/tslibs/timedeltas.pyx
- pandas/_libs/tslibs/timestamps.pyx
- pandas/_libs/tslibs/timezones.pyx
- pandas/_libs/tslibs/tzconversion.pyx
- pandas/_libs/tslibs/vectorized.pyx
- pandas/_libs/window/indexers.pyx
- pandas/_libs/writers.pyx
- pandas/io/sas/sas.pyx
- CYTHON_CPP
- pandas/_libs/window/aggregations.pyx
- pandas/io/sas/byteswap.pyx
-)
-
-RESOURCE_FILES(
- PREFIX contrib/python/pandas/py3/
- .dist-info/METADATA
- .dist-info/entry_points.txt
- .dist-info/top_level.txt
- pandas/io/formats/templates/html.tpl
- pandas/io/formats/templates/html_style.tpl
- pandas/io/formats/templates/html_table.tpl
- pandas/io/formats/templates/latex.tpl
- pandas/io/formats/templates/latex_longtable.tpl
- pandas/io/formats/templates/latex_table.tpl
- pandas/io/formats/templates/string.tpl
-)
-
-END()
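
One more note, on the PY_REGISTER(pandas._libs.json) entry in the removed ya.make: it suggests the ujson extension was linked into the binary statically rather than loaded from a shared object, in which case its init function has to be announced to the interpreter explicitly. The sketch below shows that core registration step at the plain CPython level, with a hypothetical stand-in module name _fastjson; the real macro presumably also wires the module into its dotted package path, which this sketch does not attempt.

    /* Minimal embedding sketch: a statically linked extension module
     * registered via the inittab.  _fastjson is a made-up name; in the
     * pandas build the equivalent init function comes from the compiled
     * ujson sources. */
    #include <Python.h>

    static struct PyModuleDef fastjson_def = {
        PyModuleDef_HEAD_INIT, "_fastjson", "stand-in module", -1, NULL
    };

    static PyObject *PyInit__fastjson(void)
    {
        return PyModule_Create(&fastjson_def);
    }

    int main(void)
    {
        /* Registration must precede Py_Initialize(); afterwards the
         * module is importable like a built-in, with no .so on disk. */
        if (PyImport_AppendInittab("_fastjson", PyInit__fastjson) == -1)
            return 1;

        Py_Initialize();
        PyRun_SimpleString("import _fastjson; print(_fastjson)");
        return Py_FinalizeEx() < 0 ? 1 : 0;
    }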